In [1]:
# Importing libraries
import math
import pprint, time
import random
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [2]:
# Materialise every POS-tagged sentence of the NLTK Treebank corpus as a list,
# so the sentences can be sliced and re-iterated in the cells below.
wsj = [sentence for sentence in nltk.corpus.treebank.tagged_sents()]

## I. Lexicon Tagging

In [3]:
# Preview the first two sentences; every sentence is a list of (word, POS tag) tuples.
wsj[0:2]

[[('Pierre', 'NNP'),
  ('Vinken', 'NNP'),
  (',', ','),
  ('61', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  (',', ','),
  ('will', 'MD'),
  ('join', 'VB'),
  ('the', 'DT'),
  ('board', 'NN'),
  ('as', 'IN'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('Nov.', 'NNP'),
  ('29', 'CD'),
  ('.', '.')],
 [('Mr.', 'NNP'),
  ('Vinken', 'NNP'),
  ('is', 'VBZ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Elsevier', 'NNP'),
  ('N.V.', 'NNP'),
  (',', ','),
  ('the', 'DT'),
  ('Dutch', 'NNP'),
  ('publishing', 'VBG'),
  ('group', 'NN'),
  ('.', '.')]]

In [5]:
# Flatten the corpus: list of sentences -> one flat list of (word, POS tag) tuples.
tagged_words = [
    pair
    for sentence in wsj
    for pair in sentence
]
# Total number of tagged tokens, followed by a preview of the first ten.
print(len(tagged_words))
tagged_words[:10]

100676


[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT')]

In [10]:
# question 1: Find the number of unique POS tags in the corpus
# Take the tag (second element) of every (word, tag) pair, then de-duplicate.
tags = [pos for _word, pos in tagged_words]
unique_tags = set(tags)
len(unique_tags)

46

In [12]:
# question 2: Which is the most frequent tag in the corpus
# Tally how many times each POS tag occurs; the result maps tag -> occurrence count.
tag_counts = Counter(tags)
tag_counts

Counter({'#': 16,
         '$': 724,
         "''": 694,
         ',': 4886,
         '-LRB-': 120,
         '-NONE-': 6592,
         '-RRB-': 126,
         '.': 3874,
         ':': 563,
         'CC': 2265,
         'CD': 3546,
         'DT': 8165,
         'EX': 88,
         'FW': 4,
         'IN': 9857,
         'JJ': 5834,
         'JJR': 381,
         'JJS': 182,
         'LS': 13,
         'MD': 927,
         'NN': 13166,
         'NNP': 9410,
         'NNPS': 244,
         'NNS': 6047,
         'PDT': 27,
         'POS': 824,
         'PRP': 1716,
         'PRP$': 766,
         'RB': 2822,
         'RBR': 136,
         'RBS': 35,
         'RP': 216,
         'SYM': 1,
         'TO': 2179,
         'UH': 3,
         'VB': 2554,
         'VBD': 3043,
         'VBG': 1460,
         'VBN': 2134,
         'VBP': 1321,
         'VBZ': 2125,
         'WDT': 445,
         'WP': 241,
         'WP$': 14,
         'WRB': 178,
         '``': 712})

In [15]:
# Counter.most_common() ranks the tags by frequency; show the five most frequent.
tag_counts.most_common(n=5)

[('NN', 13166), ('IN', 9857), ('NNP', 9410), ('DT', 8165), ('-NONE-', 6592)]

In [19]:
# question 3: Which tag is most commonly assigned to the word 'bank'.
# Collect every (word, tag) pair whose word is 'bank', ignoring case
# (so capitalised 'Bank' occurrences are included as well).
bank = [pair for pair in tagged_words if pair[0].lower() == 'bank']
# NOTE(review): the slice starts at index 1, so the first match is not displayed,
# and the tags are only eyeballed here rather than counted.
bank[1:10]

[('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN')]

## II. Rule-Based Tagging

In [22]:
# 4. what fraction of the words tagged 'VBD' (verb, past tense) end with 'ed'
# All (word, tag) pairs tagged as past-tense verbs.
past_tense_verbs = [word for word in tagged_words if word[1]=='VBD']
# The subset of those whose word form ends with the suffix 'ed'.
ed_verbs = [word for word in past_tense_verbs if word[0].endswith('ed')]
# Divide by past_tense_verbs: the previous denominator, past_tense_words, is not
# defined in any visible cell and raises NameError on a fresh kernel.
len(ed_verbs)/len(past_tense_verbs)

0.3881038448899113

In [23]:
# 5. share of the words tagged 'VBG' whose word form ends with 'ing'
# First keep the (word, tag) pairs tagged 'VBG', then the ones ending in 'ing'.
participle_verbs = [pair for pair in tagged_words if pair[1] == 'VBG']
ing_verbs = [pair for pair in participle_verbs if pair[0].endswith('ing')]
# Ratio of 'ing'-suffixed VBG tokens to all VBG tokens.
len(ing_verbs) / len(participle_verbs)

0.9972602739726028