In [2]:
#import imp libraries
import nltk
import numpy as np
import pandas as pd
import pprint, time
import random
from sklearn.model_selection  import train_test_split
from nltk.tokenize import word_tokenize
import math

In [3]:
nltk.download("treebank")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [4]:
# reading the Treebank tagged sentences
corpus = list(nltk.corpus.treebank.tagged_sents())

In [5]:
#samples each sentence is a list of (words pos) tuples
corpus[:3]

[[('Pierre', 'NNP'),
  ('Vinken', 'NNP'),
  (',', ','),
  ('61', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  (',', ','),
  ('will', 'MD'),
  ('join', 'VB'),
  ('the', 'DT'),
  ('board', 'NN'),
  ('as', 'IN'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('Nov.', 'NNP'),
  ('29', 'CD'),
  ('.', '.')],
 [('Mr.', 'NNP'),
  ('Vinken', 'NNP'),
  ('is', 'VBZ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Elsevier', 'NNP'),
  ('N.V.', 'NNP'),
  (',', ','),
  ('the', 'DT'),
  ('Dutch', 'NNP'),
  ('publishing', 'VBG'),
  ('group', 'NN'),
  ('.', '.')],
 [('Rudolph', 'NNP'),
  ('Agnew', 'NNP'),
  (',', ','),
  ('55', 'CD'),
  ('years', 'NNS'),
  ('old', 'JJ'),
  ('and', 'CC'),
  ('former', 'JJ'),
  ('chairman', 'NN'),
  ('of', 'IN'),
  ('Consolidated', 'NNP'),
  ('Gold', 'NNP'),
  ('Fields', 'NNP'),
  ('PLC', 'NNP'),
  (',', ','),
  ('was', 'VBD'),
  ('named', 'VBN'),
  ('*-1', '-NONE-'),
  ('a', 'DT'),
  ('nonexecutive', 'JJ'),
  ('director', 'NN'),
  ('of', 'IN'),
  ('this'

In [6]:
#converting the list of sents to a list of (word, pos tag) tuples
tagged_words = [tup for sent in corpus for tup in sent]
print(len(tagged_words))
tagged_words[:50]

100676


[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.'),
 ('Mr.', 'NNP'),
 ('Vinken', 'NNP'),
 ('is', 'VBZ'),
 ('chairman', 'NN'),
 ('of', 'IN'),
 ('Elsevier', 'NNP'),
 ('N.V.', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Dutch', 'NNP'),
 ('publishing', 'VBG'),
 ('group', 'NN'),
 ('.', '.'),
 ('Rudolph', 'NNP'),
 ('Agnew', 'NNP'),
 (',', ','),
 ('55', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 ('and', 'CC'),
 ('former', 'JJ'),
 ('chairman', 'NN'),
 ('of', 'IN'),
 ('Consolidated', 'NNP'),
 ('Gold', 'NNP'),
 ('Fields', 'NNP'),
 ('PLC', 'NNP'),
 (',', ','),
 ('was', 'VBD'),
 ('named', 'VBN'),
 ('*-1', '-NONE-'),
 ('a', 'DT')]

Exploratory Data Analysis

In [7]:
#finding out the unique POS tags in the corpus
#we can use the set() function on the list of tags to get a unique set of tags
# and compute its length

tags = [pair[1] for pair in tagged_words]
unique_tags = set(tags)
len(unique_tags)

46

In [8]:
#most frequent tag in the corpus
from collections import Counter
tag_counts = Counter(tags)
tag_counts

Counter({'NNP': 9410,
         ',': 4886,
         'CD': 3546,
         'NNS': 6047,
         'JJ': 5834,
         'MD': 927,
         'VB': 2554,
         'DT': 8165,
         'NN': 13166,
         'IN': 9857,
         '.': 3874,
         'VBZ': 2125,
         'VBG': 1460,
         'CC': 2265,
         'VBD': 3043,
         'VBN': 2134,
         '-NONE-': 6592,
         'RB': 2822,
         'TO': 2179,
         'PRP': 1716,
         'RBR': 136,
         'WDT': 445,
         'VBP': 1321,
         'RP': 216,
         'PRP$': 766,
         'JJS': 182,
         'POS': 824,
         '``': 712,
         'EX': 88,
         "''": 694,
         'WP': 241,
         ':': 563,
         'JJR': 381,
         'WRB': 178,
         '$': 724,
         'NNPS': 244,
         'WP$': 14,
         '-LRB-': 120,
         '-RRB-': 126,
         'PDT': 27,
         'RBS': 35,
         'FW': 4,
         'UH': 3,
         'SYM': 1,
         'LS': 13,
         '#': 16})

In [9]:
tag_counts.most_common(5)

[('NN', 13166), ('IN', 9857), ('NNP', 9410), ('DT', 8165), ('-NONE-', 6592)]

In [10]:
#list of POS tags in nltk
nltk.download("tagsets")
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


In [11]:
# which tag is commonly assigned to the word w
bank = [pair for pair in tagged_words if pair[0].lower() == 'bank']
bank

[('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('Bank', 'NNP'),
 ('bank', 'NN'),
 ('bank', 'NN'),
 ('Bank', 'NNP'),
 

In [12]:
# which tag is commonly assigned to the word w
executive = [pair for pair in tagged_words if pair[0].lower() == 'executive']
executive

[('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'NN'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executive', 'JJ'),
 ('executi

In [13]:
# how many words with the tag "VBG" end with "ing"

participle_verbs = [pair for pair in tagged_words if pair[1] == "VBG"]
ing_verbs = [pair for pair in participle_verbs if pair[0].endswith("ing")]
print(len(ing_verbs)/len(participle_verbs))
ing_verbs[:20]

0.9972602739726028


[('publishing', 'VBG'),
 ('causing', 'VBG'),
 ('using', 'VBG'),
 ('talking', 'VBG'),
 ('having', 'VBG'),
 ('making', 'VBG'),
 ('surviving', 'VBG'),
 ('including', 'VBG'),
 ('including', 'VBG'),
 ('according', 'VBG'),
 ('remaining', 'VBG'),
 ('according', 'VBG'),
 ('declining', 'VBG'),
 ('rising', 'VBG'),
 ('yielding', 'VBG'),
 ('waiving', 'VBG'),
 ('holding', 'VBG'),
 ('holding', 'VBG'),
 ('cutting', 'VBG'),
 ('manufacturing', 'VBG')]

In [14]:
# how many words with the tag "VBD" end with "ed"

past_tense_verbs = [pair for pair in tagged_words if pair[1] == "VBD"]
ed_verbs = [pair for pair in past_tense_verbs if pair[0].endswith("ed")]
print(len(ed_verbs)/len(past_tense_verbs))
ed_verbs[:20]

0.3881038448899113


[('reported', 'VBD'),
 ('stopped', 'VBD'),
 ('studied', 'VBD'),
 ('led', 'VBD'),
 ('worked', 'VBD'),
 ('explained', 'VBD'),
 ('imposed', 'VBD'),
 ('dumped', 'VBD'),
 ('poured', 'VBD'),
 ('mixed', 'VBD'),
 ('described', 'VBD'),
 ('ventilated', 'VBD'),
 ('contracted', 'VBD'),
 ('continued', 'VBD'),
 ('eased', 'VBD'),
 ('ended', 'VBD'),
 ('lengthened', 'VBD'),
 ('reached', 'VBD'),
 ('resigned', 'VBD'),
 ('approved', 'VBD')]

In [15]:
# What fraction of adjectives JJ are followed by a noun NN

#create a list of all tags (without the words)
tags = [pair[1] for pair in tagged_words]

#create a list of JJ tags
jj_tags = [t for t in tags if t == 'JJ']

#create a list of (JJ, NN tags)

jj_nn_tags = [(t, tags[index+1]) for index, t  in enumerate(tags) 
              if t == "JJ" and tags[index+1] == "NN"]

print(len(jj_tags))
print(len(jj_nn_tags))
print(len(jj_nn_tags)/len(jj_tags))


5834
2611
0.4475488515598217


In [16]:
# What fraction of adjectives DT are followed by a noun NN

#create a list of DT tags
dt_tags = [t for t in tags if t == 'DT']

#create a list of (JJ, NN tags)

dd_nn_tags = [(t, tags[index+1]) for index, t  in enumerate(tags) 
              if t == "DT" and tags[index+1] == "NN"]

print(len(dt_tags))
print(len(dd_nn_tags))
print(len(dd_nn_tags)/len(dt_tags))


8165
3844
0.470789957134109


In [17]:
# What fraction of adjectives MD are followed by a noun VB

#create a list of DT tags
md_tags = [t for t in tags if t == 'MD']

#create a list of (JJ, NN tags)

md_nn_tags = [(t, tags[index+1]) for index, t  in enumerate(tags) 
              if t == "MD" and tags[index+1] == "VB"]

print(len(md_tags))
print(len(md_nn_tags))
print(len(md_nn_tags)/len(md_tags))


927
756
0.8155339805825242


In [34]:
#splitting into train and test
random.seed(1234)

train_set, test_set = train_test_split(corpus, test_size = .3)

print(len(train_set))
print(len(test_set))
print(train_set[:2])

2739
1175
[[('Bonds', 'NNS'), (':', ':'), ('Shearson', 'NNP'), ('Lehman', 'NNP'), ('Hutton', 'NNP'), ('Treasury', 'NNP'), ('index', 'NN'), ('3436.58', 'CD'), (',', ','), ('up', 'RB')], [('Now', 'RB'), (',', ','), ('on', 'IN'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN'), (',', ','), ('Chicago', 'NNP'), ("'s", 'POS'), ('stock-index', 'NN'), ('traders', 'NNS'), ('trade', 'VBP'), ('more', 'JJR'), ('dollars', 'NNS'), ('worth', 'NN'), ('of', 'IN'), ('stock', 'NN'), ('futures', 'NNS'), ('than', 'IN'), ('the', 'DT'), ('Big', 'NNP'), ('Board', 'NNP'), ('trades', 'VBZ'), ('in', 'IN'), ('stock', 'NN'), ('.', '.')]]


Lexicon (Unigram) Tagger

In [35]:
#Lexicon (or unigram tagger)
unigram_tagger = nltk.UnigramTagger(train_set)
unigram_tagger.evaluate(test_set)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  This is separate from the ipykernel package so we can avoid doing imports until


0.8738003995938555

Rule-Based(Regular Expression Tagger

In [36]:
#specify patterns for tagging
#example from NLTK book
patterns = [
    (r'. *ing$', "VBG" ),
    (r'. *ed$', "VBD" ),
    (r'. *es$', "VBZ" ),
    (r'. *ould$', "MD" ),
    (r'. *\'s$', "NNS" ),
    (r'. *s$', "NNS" ),
    (r'^-?[0-9]+(.[0-9]+)?$', "CD" ),
    (r'.*', "NN" )

]

In [37]:
regexp_tagger = nltk.RegexpTagger(patterns)

#help(regexp_tagger)

In [38]:
regexp_tagger.evaluate(test_set)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  """Entry point for launching an IPython kernel.


0.15875667354492157

Combining Taggers

In [39]:
#rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

#lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff = rule_based_tagger)

lexicon_tagger.evaluate(test_set)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  import sys


0.8960400903999214