In [16]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [12]:
text = 'It is a pleasant day today'

In [22]:
tokenized_text = word_tokenize(text)
tokenized_text

['It', 'is', 'a', 'pleasant', 'day', 'today']

In [23]:
pos_tag(tokenized_text)

[('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('pleasant', 'JJ'),
 ('day', 'NN'),
 ('today', 'NN')]

In [27]:
nltk.download('tagsets')
nltk.help.upenn_tagset('NNS')

NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...


[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [28]:
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [29]:
nltk.help.upenn_tagset('VB.*')

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...
VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...
VBG: verb, present participle or gerund
    telegraphing stirring focusing angering judging stalling lactating
    hankerin' alleging veering capping approaching traveling besieging
    encrypting interrupting erasing wincing ...
VBN: verb, past participle
    multihulled dilapidated aerosolized chaired languished panelized used
    experimented flourished imitated reunifed factored condensed sheared
    unsettled primed dubbed desired ...
VBP: verb, present tense, not 3rd person singular
    predominate wrap resort sue twist spill cure lengthen brush terminate
 

In [30]:
# Word disambiguation "bear". 
# The first "bear" is a verb.
# The second "bear" is a noun.
pos_tag(word_tokenize('I cannot bear the pain of bear'))

[('I', 'PRP'),
 ('can', 'MD'),
 ('not', 'RB'),
 ('bear', 'VB'),
 ('the', 'DT'),
 ('pain', 'NN'),
 ('of', 'IN'),
 ('bear', 'NN')]

## str2tuple and tuple2str

In [31]:
from nltk.tag import str2tuple, tuple2str

In [32]:
str2tuple('bear/NN')

('bear', 'NN')

In [33]:
tuple2str(('bear', 'NN'))

'bear/NN'

## Common tags in Treebank corpus

In [42]:
import nltk
nltk.download('treebank')
nltk.download('universal_tagset')
from nltk import bigrams, FreqDist
from nltk.corpus import treebank

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/alextanhongpin/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [39]:
treebank_tagged = treebank.tagged_words(tagset='universal')
tag = FreqDist(tag for (word, tag) in treebank_tagged)
tag.most_common()

[('NOUN', 28867),
 ('VERB', 13564),
 ('.', 11715),
 ('ADP', 9857),
 ('DET', 8725),
 ('X', 6613),
 ('ADJ', 6397),
 ('NUM', 3546),
 ('PRT', 3219),
 ('ADV', 3171),
 ('PRON', 2737),
 ('CONJ', 2265)]

In [48]:
# Calculate the number of tags occuring before a noun tag.
tagpairs = bigrams(treebank_tagged)

# The result is a bigram tuple, with the type as the second parameter.
# ((first_word, type), (second word, type))
print(next(tagpairs))

(('Pierre', 'NOUN'), ('Vinken', 'NOUN'))


In [None]:
# Find all bigrams where the second word is a noun.
preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN']

In [50]:
freqdist = FreqDist(preceders_noun)

In [51]:
[tag for (tag, _) in freqdist.most_common()]

['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'NUM',
 'PRT',
 'CONJ',
 'PRON',
 'X',
 'ADV']

In [52]:
freqdist.most_common()

[('NOUN', 7627),
 ('DET', 5569),
 ('ADJ', 4474),
 ('ADP', 3175),
 ('.', 2599),
 ('VERB', 1497),
 ('NUM', 1252),
 ('PRT', 796),
 ('CONJ', 792),
 ('PRON', 573),
 ('X', 410),
 ('ADV', 101)]

## Default Tagging

In [53]:
from nltk.tag import DefaultTagger

In [56]:
tagger = DefaultTagger('NN')

In [57]:
tagger.tag(['Beautiful', 'morning'])

[('Beautiful', 'NN'), ('morning', 'NN')]

In [59]:
# Untagging a sentence
from nltk.tag import untag
untag([('Beautiful', 'NN'), ('morning', 'NN')])

['Beautiful', 'morning']

## Selecting a machine learning algorithm

In [76]:
from nltk.tag import UnigramTagger
# DEPRECATED: from nltk.tag import FastBrillTaggerTrainer
from nltk.corpus import treebank
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.brill import Word, fntbl37

In [81]:
sentences = [tagger.tag(word_tokenize('Today is a beautiful morning'))]
sentences = treebank.tagged_sents()
templates = fntbl37()
tagger = UnigramTagger(sentences)
tagger = BrillTaggerTrainer(tagger, templates, trace=3)
tagger = tagger.train(sentences, max_rules=250)

TBL train (fast) (seqs: 3914; tokens: 100676; tpls: 37; min score: 2; min acc: None)
Finding initial useful rules...
    Found 96798 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
 202 208   6   3  | IN->WDT if Word:that@[0] & Pos:-NONE-@[1]
 142 147   5   0  | NN->VB if Pos:TO@[-1] & Pos:-NONE-@[-2]
  89  91   2   0  | VBP->VB if Pos:MD@[-3,-2,-1]
  76  76   0   0  | VBP->VB if Word:to@[-1]
  75  75   0   0  | NN->VB if Pos:MD@[-1]
  64  72   8   0  | VBD->VBN if Pos:VBZ@[-2,-1]
  58  58   0   1  | POS->VBZ if Pos:PRP@[-2,-1]
  58  61   3   0  | VBD->VBN if Pos:VBP@[-2,-1]
  57  64   7   0  | VBD->VBN if Pos:VBD@[-2,-1]
  55  55   0 

   3   3   0   0  | IN->RB if Word:month@[-1] & Word:before@[0] & Word:.@[1]
   3   3   0   0  | JJ->RB if Word:a@[-1] & Word:little@[0] & Word:more@[1]
   3   3   0   0  | NN->JJ if Word:and@[-1] & Word:stock-index@[0] &
                  |   Word:futures@[1]
   3   3   0   0  | NNS->NNPS if Word:'s@[-1] & Word:Investors@[0] &
                  |   Word:Service@[1]
   3   4   1   1  | CD->NN if Word:one@[0] & Word:no@[-1]
   3   3   0   0  | JJ->RB if Word:long@[0] & Word:have@[-1]
   3   3   0   0  | NNP->JJ if Word:Western@[0] & Word:in@[-1]
   3   3   0   0  | NNPS->NNP if Word:Industries@[0] & Word:Heavy@[-1]
   3   3   0   0  | POS->VBZ if Word:'s@[0] & Word:that@[-1]
   5   5   0   1  | IN->DT if Word:that@[0] & Pos:VBZ@[1]
   3   3   0   0  | RB->RP if Word:away@[0] & Word:take@[-1]
   3   3   0   0  | VBN->VBD if Word:proposed@[0] & Word:,@[-1]
   3   3   0   0  | IN->RP if Word:off@[0] & Word:.@[1]
   3   3   0   0  | JJ->IN if Word:outside@[0] & Word:the@[1]
   3   3   0   0

In [83]:
# To save the trainer. ! There's a newer way to save models.
import pickle
with open('demo.pkl', 'wb') as f:
    pickle.dump(tagger, f)

## Statistical modelling involving n-gram approach

In [89]:
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

In [91]:
training = treebank.tagged_sents()[:7000]
# Perform training using the first 7000 sentences of the treebank corpus.
tagger = UnigramTagger(training)
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [92]:
tagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [95]:
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
training = treebank.tagged_sents()[:7000]
testing = treebank.tagged_sents()[2000:]

# Evaluating the tagging.
tagger = UnigramTagger(training)
# We obtain an accuracy of 96%
tagger.evaluate(testing)

0.9619024159944167

In [96]:
# Setting a cutoff frequency of 5.
tagger = UnigramTagger(training, cutoff=5)
tagger.evaluate(testing)

0.7972986842375351

In [97]:
from nltk.tag import DefaultTagger
tag = DefaultTagger('NN')
tagger = UnigramTagger(training, backoff=tag)
tagger.evaluate(testing)

0.9619024159944167

In [98]:
# BigramTagger make use of the previous tag as contextual information.
# TrigramTagger make use of the previous two tags as contextual information.

from nltk.tag import BigramTagger
tagger = BigramTagger(training)
tagger.evaluate(testing)

0.9171131227292321

In [99]:
from nltk.tag import TrigramTagger
tagger = TrigramTagger(training)
tagger.evaluate(testing)

0.9022107272615308

In [100]:
from nltk.tag import NgramTagger
tagger = NgramTagger(4, training)
tagger.evaluate(testing)

0.9304554878173943

In [101]:
# AffixTagger is also a ContextTagger which makes use of a prefix or suffix as the contextual information.
from nltk.tag import AffixTagger
tagger = AffixTagger(training)
tagger.evaluate(testing)

0.2902682841718497

In [103]:
# The following code make use of four character prefix.
tagger = AffixTagger(training, affix_length=4)
tagger.evaluate(testing)

0.2094751318841472

In [104]:
# The following code make use of three character suffix.
tagger = AffixTagger(training, affix_length=-3)
tagger.evaluate(testing)

0.2902682841718497

In [105]:
# Combination of many affix tagger.
tagger1 = AffixTagger(training, affix_length = 4)
tagger2 = AffixTagger(training, affix_length = -3, backoff=tagger1)
tagger2.evaluate(testing)

0.29166410082722666

In [None]:
# TnT is Trigrams n Tags, a statistical-based tagger that is based on the second order Markov models.
from nltk.tag import tnt
from nltk.corpus import treebank
tagger = tnt.TnT()
tagger.train(training)
tagger.evaluate(testing)

In [None]:
# To set a default for unknown tagger.
unknown = DefaultTagger('NN')
tagger = tnt.TnT(unk=unknown, Trained=True)
tagger.train(training)
tagger.evaluate(testing)

## Developing a chunker using pos-tagger corpora

In [2]:
from nltk import pos_tag, word_tokenize
text = 'A wise small girl of village became leader'
sentence = pos_tag(word_tokenize(text))
sentence

[('A', 'DT'),
 ('wise', 'NN'),
 ('small', 'JJ'),
 ('girl', 'NN'),
 ('of', 'IN'),
 ('village', 'NN'),
 ('became', 'VBD'),
 ('leader', 'NN')]

In [4]:
from nltk import RegexpParser
grammar = "NP: {<DT>?<JJ>*<NN><IN>?<NN>*}"
find = RegexpParser(grammar)
res = find.parse(sentence)
res

FileNotFoundError: [Errno 2] No such file or directory: '/var/folders/pt/v3mw_j891dv8yl5k2b5p54wm0000gn/T/tmpzfgz1tpo.png'

Tree('S', [Tree('NP', [('A', 'DT'), ('wise', 'NN')]), Tree('NP', [('small', 'JJ'), ('girl', 'NN'), ('of', 'IN'), ('village', 'NN')]), ('became', 'VBD'), Tree('NP', [('leader', 'NN')])])

In [5]:
res.draw()