In [2]:
# Important: From this chapter onwards, our program samples will assume you begin your interactive session or your program with the 
# following import statements:

from __future__ import division  # Python 2 users only
import nltk, re, pprint # nltk, regular expression, and pretty print?
from nltk import word_tokenize
from __future__ import print_function

# 5. Categorizing and Tagging Words

# 1   Using a Tagger

In [3]:
# A part-of-speech tagger, or POS-tagger, processes a sequence of words, and attaches a part of speech tag to each word 
# (don't forget to import nltk):

text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [4]:
# Let's look at another example, this time including some homonyms:
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [5]:
# Notice that refuse and permit both appear as a present tense verb (VBP) and a noun (NN). E.g. refUSE is a verb meaning "deny," 
# while REFuse is a noun meaning "trash" (i.e. they are not homophones). Thus, we need to know which word is being used in order 
# to pronounce the text correctly. (For this reason, text-to-speech systems usually perform POS-tagging.)

In [None]:
# Lexical categories like "noun" and part-of-speech tags like NN seem to have their uses, but the details will be obscure to many 
# readers. You might wonder what justification there is for introducing this extra level of information. Many of these categories 
# arise from superficial analysis the distribution of words in text. Consider the following analysis involving woman (a noun), 
# bought (a verb), over (a preposition), and the (a determiner). The text.similar() method takes a word w, finds all contexts w1w w2, 
# then finds all words w' that appear in the same context, i.e. w1w'w2.

In [6]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

man time day year car moment world family house country child boy
state job way war girl place word work


In [7]:
text.similar('bought')

made said put done seen had found left given heard brought got been
was set told took in felt that


In [8]:
text.similar('over')

in on to of and for with from at by that into as up out down through
is all about


In [9]:
text.similar('the')

a his this their its her an that our any all one these my in your no
some other and


In [None]:
# Observe that searching for woman finds nouns; searching for bought mostly finds verbs; searching for over generally finds 
# prepositions; searching for the finds several determiners. A tagger can correctly identify the tags on these words in the context 
# of a sentence, e.g. The woman bought over $150,000 worth of clothes.

# A tagger can also model our knowledge of unknown words, e.g. we can guess that scrobbling is probably a verb, with the root 
# scrobble, and likely to occur in contexts like he was scrobbling.

# 2   Tagged Corpora

## 2.1   Representing Tagged Tokens

In [10]:
# By convention in NLTK, a tagged token is represented using a tuple consisting of the token and the tag. We can create one of 
# these special tuples from the standard string representation of a tagged token, using the function str2tuple():

tagged_token = nltk.tag.str2tuple('fly/NN')
tagged_token

('fly', 'NN')

In [11]:
tagged_token[0]

'fly'

In [12]:
tagged_token[1]

'NN'

In [13]:
sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
'''
[nltk.tag.str2tuple(t) for t in sent.split()]

[('The', 'AT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'AT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('other', 'AP'),
 ('topics', 'NNS'),
 (',', ','),
 ('AMONG', 'IN'),
 ('them', 'PPO'),
 ('the', 'AT'),
 ('Atlanta', 'NP'),
 ('and', 'CC'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('purchasing', 'VBG'),
 ('departments', 'NNS'),
 ('which', 'WDT'),
 ('it', 'PPS'),
 ('said', 'VBD'),
 ('``', '``'),
 ('ARE', 'BER'),
 ('well', 'QL'),
 ('operated', 'VBN'),
 ('and', 'CC'),
 ('follow', 'VB'),
 ('generally', 'RB'),
 ('accepted', 'VBN'),
 ('practices', 'NNS'),
 ('which', 'WDT'),
 ('inure', 'VB'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('best', 'JJT'),
 ('interest', 'NN'),
 ('of', 'IN'),
 ('both', 'ABX'),
 ('governments', 'NNS'),
 ("''", "''"),
 ('.', '.')]

## 2.2   Reading Tagged Corpora

In [14]:
 nltk.corpus.brown.tagged_words()

[(u'The', u'AT'), (u'Fulton', u'NP-TL'), ...]

In [15]:
 nltk.corpus.brown.tagged_words(tagset='universal')

[(u'The', u'DET'), (u'Fulton', u'NOUN'), ...]

In [16]:
print(nltk.corpus.nps_chat.tagged_words())

[(u'now', 'RB'), (u'im', 'PRP'), (u'left', 'VBD'), ...]


In [17]:
nltk.corpus.conll2000.tagged_words()

[(u'Confidence', u'NN'), (u'in', u'IN'), ...]

In [18]:
 nltk.corpus.treebank.tagged_words()

[(u'Pierre', u'NNP'), (u'Vinken', u'NNP'), ...]

In [19]:
nltk.corpus.brown.tagged_words(tagset='universal')

[(u'The', u'DET'), (u'Fulton', u'NOUN'), ...]

In [20]:
nltk.corpus.treebank.tagged_words(tagset='universal')

[(u'Pierre', u'NOUN'), (u'Vinken', u'NOUN'), ...]

### 2.3   A Universal Part-of-Speech Tagset

In [21]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[(u'NOUN', 30654),
 (u'VERB', 14399),
 (u'ADP', 12355),
 (u'.', 11928),
 (u'DET', 11389),
 (u'ADJ', 6706),
 (u'ADV', 3349),
 (u'CONJ', 2717),
 (u'PRON', 2535),
 (u'PRT', 2264),
 (u'NUM', 2166),
 (u'X', 92)]

## 2.4   Nouns

In [22]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]

[u'NOUN',
 u'DET',
 u'ADJ',
 u'ADP',
 u'.',
 u'VERB',
 u'CONJ',
 u'NUM',
 u'ADV',
 u'PRT',
 u'PRON',
 u'X']

## 2.5   Verbs

In [23]:
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
word_tag_fd = nltk.FreqDist(wsj)
[wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']

[u'is',
 u'said',
 u'was',
 u'are',
 u'be',
 u'has',
 u'have',
 u'will',
 u'says',
 u'would',
 u'were',
 u'had',
 u'been',
 u'could',
 u"'s",
 u'can',
 u'do',
 u'say',
 u'make',
 u'may',
 u'did',
 u'rose',
 u'made',
 u'does',
 u'expected',
 u'buy',
 u'take',
 u'get',
 u'might',
 u'sell',
 u'added',
 u'sold',
 u'help',
 u'including',
 u'should',
 u'reported',
 u'according',
 u'pay',
 u'being',
 u'compared',
 u'began',
 u'fell',
 u'based',
 u'closed',
 u'used',
 u"'re",
 u'want',
 u'see',
 u'took',
 u'yield',
 u'priced',
 u'offered',
 u'set',
 u'approved',
 u'come',
 u'cut',
 u'noted',
 u'ended',
 u'increased',
 u'found',
 u'think',
 u'become',
 u'declined',
 u'go',
 u'proposed',
 u'growing',
 u'trying',
 u'received',
 u'named',
 u'put',
 u'give',
 u'came',
 u'held',
 u'use',
 u'paid',
 u'going',
 u'called',
 u'raise',
 u'estimated',
 u'continue',
 u'designed',
 u'making',
 u'expects',
 u'seeking',
 u'plans',
 u'wo',
 u'must',
 u'got',
 u'gained',
 u'trading',
 u'owns',
 u'fined',
 u'say

In [24]:
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['yield'].most_common()
# Don't quite understand logic here.

[(u'VERB', 28), (u'NOUN', 20)]

In [25]:
cfd1['cut'].most_common()

[(u'VERB', 25), (u'NOUN', 3)]

In [28]:
wsj = nltk.corpus.treebank.tagged_words()
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
list(cfd2['VBN'])

[u'limited',
 u'reorganized',
 u'managed',
 u'switched',
 u'caused',
 u'founded',
 u'assembled',
 u'concerned',
 u'contained',
 u'Rekindled',
 u'automated',
 u'bribed',
 u'voted',
 u'issued',
 u'cluttered',
 u'disapproved',
 u'sent',
 u'returned',
 u'synchronized',
 u'puzzled',
 u'desired',
 u'engineered',
 u'headlined',
 u'centralized',
 u'advised',
 u'stabbed',
 u'continued',
 u'perceived',
 u'presented',
 u'prolonged',
 u'Related',
 u'solved',
 u'noted',
 u'concluded',
 u'Filmed',
 u'infringed',
 u'construed',
 u'licensed',
 u'knitted',
 u'slowed',
 u'enclosed',
 u'replicated',
 u'estimated',
 u'imported',
 u'risen',
 u'assisted',
 u'beaten',
 u'contributed',
 u'expressed',
 u'enjoyed',
 u'industrialized',
 u'zoomed',
 u'crossed',
 u'learned',
 u'filled',
 u'told',
 u'drafted',
 u'deemed',
 u'kicked',
 u'led',
 u'ranged',
 u'slated',
 u'reported',
 u'focused',
 u'auctioned',
 u'crippled',
 u'represented',
 u'scrapped',
 u'invented',
 u'obtained',
 u'colored',
 u'skyrocketed',
 u'inv

## 2.6   Adjectives and Adverbs

## 2.7   Unsimplified Tags

## 2.8   Exploring Tagged Corpora

In [29]:
# brown_learned_text is the words from the Brown corpus with the category 'learned'
brown_learned_text = brown.words(categories='learned')

# Find all words that follow the word often. Make sure they are unique and then sort them
sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))

[u',',
 u'.',
 u'accomplished',
 u'analytically',
 u'appear',
 u'apt',
 u'associated',
 u'assuming',
 u'became',
 u'become',
 u'been',
 u'began',
 u'call',
 u'called',
 u'carefully',
 u'chose',
 u'classified',
 u'colorful',
 u'composed',
 u'contain',
 u'differed',
 u'difficult',
 u'encountered',
 u'enough',
 u'equate',
 u'extremely',
 u'found',
 u'happens',
 u'have',
 u'ignored',
 u'in',
 u'involved',
 u'more',
 u'needed',
 u'nightly',
 u'observed',
 u'of',
 u'on',
 u'out',
 u'quite',
 u'represent',
 u'responsible',
 u'revamped',
 u'seclude',
 u'set',
 u'shortened',
 u'sing',
 u'sounded',
 u'stated',
 u'still',
 u'sung',
 u'supported',
 u'than',
 u'to',
 u'when',
 u'work']

In [30]:
# Find tagged words in Brown corpus, with learned category, and use Universal Tagset
brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')

# Since we have tagged set, each element is a tuple, consisting of (word, TAG)
# We still look at bigrams (a,b), but to access the tag, we do b[1]

tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']

# Generate frequency distribution 
fd = nltk.FreqDist(tags)

# Not exactly sure what tabulate does...
fd.tabulate()

VERB  ADV  ADP  ADJ    .  PRT 
  37    8    7    6    4    2 


In [31]:
# Example 2.3 (code_three_word_phrase.py): 
# Figure 2.3: Searching for Three-Word Phrases Using POS Tags

from nltk.corpus import brown
def process(sentence):      
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print(w1, w2, w3)

# Import Brown corpus
# Define function process
# NLTK finds all trigrams in the sentence. 
# Each element of the trigram is a 2-tuple: (word, POS TAG)
# If the first word's tag is a verb, the second word's tag is TO ('to'), and the third word's
# tag is a verb, then print the trigram (w1, w2, w3)

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
    
# Here, we turn the Brown corpus into sentences and send each sentence into function process.

combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
designed to provide
get to hear
expects to tell
expected to give
prefer to pay
required to obtain
permitted to teach
designed to reduce
Asked to elaborate
got to go
raised to pay
scheduled to go
cut to meet
needed to meet
hastened to add
found to prevent
continue to insist
compelled to make
made to remove
revamped to give
want to risk
appear to spark
fails to consider
plans to call
going to examine
plans to name
come to pass
voted to accept
happens to hold
authorized to adopt
hesitated to prosecute
try to make
decided to spend
taken to preserve
left to preserve
stand to bring
decided to seek
trying to induce
proposing to make
decided to run
directed to investigate
expected to pass
expected to make
expected to encounter
hopes to pass
came to pay
expected to receive
understood to follow
wanted to vote
decide

# 3   Mapping Words to Properties Using Python Dictionaries

## 3.1   Indexing Lists vs Dictionaries

## 3.2   Dictionaries in Python

In [40]:
pos = {}
pos

{}

In [41]:
pos['colorless'] = 'ADJ' # pos [key] = value
pos

{'colorless': 'ADJ'}

In [42]:
pos['ideas'] = 'N' # 'ideas' is a noun
pos['sleep'] = 'V' # 'sleep' is a verb
pos['furiously'] = 'ADV' # furiously is an adverb
pos # let's look at the dictionary

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

In [43]:
pos['ideas']

'N'

In [44]:
pos['colorless']

'ADJ'

In [45]:
pos['green']

KeyError: 'green'

In [46]:
list(pos)

['furiously', 'sleep', 'ideas', 'colorless']

In [47]:
sorted(pos)

['colorless', 'furiously', 'ideas', 'sleep']

In [48]:
[w for w in pos if w.endswith('s')] # CREATE A list of all words in pos that end with 's'

['ideas', 'colorless']

In [49]:
for word in sorted(pos):
     print(word + ":", pos[word])

colorless: ADJ
furiously: ADV
ideas: N
sleep: V


In [50]:
list(pos.keys())

['furiously', 'sleep', 'ideas', 'colorless']

In [51]:
list(pos.values())

['ADV', 'V', 'N', 'ADJ']

In [52]:
list(pos.items())

[('furiously', 'ADV'), ('sleep', 'V'), ('ideas', 'N'), ('colorless', 'ADJ')]

In [53]:
for key, val in sorted(pos.items()):
    print(key + ":", val)

colorless: ADJ
furiously: ADV
ideas: N
sleep: V


In [54]:
pos['sleep'] = 'V'
pos['sleep']

'V'

In [55]:
pos['sleep'] = 'N'
pos['sleep']

'N'

## 3.3   Defining Dictionaries

In [56]:
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')

In [57]:
pos = {['ideas', 'blogs', 'adventures']: 'N'}

TypeError: unhashable type: 'list'

In [58]:
# For migraine... Here is an option

# Initialize dictionary food

food = {}
food

food['bread'] = ['gluten', 'wheat'] # Bread can have 2 tags

food['bread'].append('allergy') # I can add more labels

food['vitamin C'] = ['supplement', 'vitamin']

food

# Look at the mapping created by Hassan Jannah
# Also look more into this datatype with Lutz's Book Learning Python

{'bread': ['gluten', 'wheat', 'allergy'],
 'vitamin C': ['supplement', 'vitamin']}

## 3.4   Default Dictionaries

In [59]:
from collections import defaultdict
frequency = defaultdict(int)
frequency['colorless'] = 4
frequency['ideas']

0

In [60]:
pos = defaultdict(list)
pos['sleep'] = ['NOUN', 'VERB']
pos['ideas']

[]

In [61]:
pos = defaultdict(lambda: 'NOUN')
pos['colorless'] = 'ADJ'
pos['blog']

'NOUN'

In [62]:
list(pos.items())

[('blog', 'NOUN'), ('colorless', 'ADJ')]

In [63]:
f = lambda: 'NOUN'
f()

'NOUN'

In [64]:
def g():
    return 'NOUN'
g()

'NOUN'

In [65]:
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [word for (word, _) in vocab.most_common(1000)]
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
alice2 = [mapping[v] for v in alice]
alice2[:100]

['UNK',
 u'Alice',
 u"'",
 u's',
 u'Adventures',
 u'in',
 u'Wonderland',
 u'by',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 u'CHAPTER',
 u'I',
 u'.',
 'UNK',
 u'the',
 u'Rabbit',
 u'-',
 'UNK',
 u'Alice',
 u'was',
 u'beginning',
 u'to',
 u'get',
 u'very',
 u'tired',
 u'of',
 u'sitting',
 u'by',
 u'her',
 u'sister',
 u'on',
 u'the',
 u'bank',
 u',',
 u'and',
 u'of',
 u'having',
 u'nothing',
 u'to',
 u'do',
 u':',
 u'once',
 u'or',
 u'twice',
 u'she',
 u'had',
 'UNK',
 u'into',
 u'the',
 u'book',
 u'her',
 u'sister',
 u'was',
 'UNK',
 u',',
 u'but',
 u'it',
 u'had',
 u'no',
 u'pictures',
 u'or',
 'UNK',
 u'in',
 u'it',
 u',',
 u"'",
 u'and',
 u'what',
 u'is',
 u'the',
 u'use',
 u'of',
 u'a',
 u'book',
 u",'",
 u'thought',
 u'Alice',
 u"'",
 u'without',
 u'pictures',
 u'or',
 u'conversation',
 u"?'",
 u'So',
 u'she',
 u'was',
 'UNK',
 u'in',
 u'her',
 u'own',
 u'mind',
 u'(',
 u'as',
 u'well',
 u'as',
 u'she',
 u'could',
 u',']

In [66]:
len(set(alice2))

1001

# 4   Automatic Tagging

In [67]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

## 4.1   The Default Tagger

In [68]:
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()

u'NN'

In [69]:
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [70]:
default_tagger.evaluate(brown_tagged_sents)

0.13089484257215028

## 4.2   The Regular Expression Tagger

In [71]:
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
]

In [72]:
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])

[(u'``', 'NN'),
 (u'Only', 'NN'),
 (u'a', 'NN'),
 (u'relative', 'NN'),
 (u'handful', 'NN'),
 (u'of', 'NN'),
 (u'such', 'NN'),
 (u'reports', 'NNS'),
 (u'was', 'NNS'),
 (u'received', 'VBD'),
 (u"''", 'NN'),
 (u',', 'NN'),
 (u'the', 'NN'),
 (u'jury', 'NN'),
 (u'said', 'NN'),
 (u',', 'NN'),
 (u'``', 'NN'),
 (u'considering', 'VBG'),
 (u'the', 'NN'),
 (u'widespread', 'NN'),
 (u'interest', 'NN'),
 (u'in', 'NN'),
 (u'the', 'NN'),
 (u'election', 'NN'),
 (u',', 'NN'),
 (u'the', 'NN'),
 (u'number', 'NN'),
 (u'of', 'NN'),
 (u'voters', 'NNS'),
 (u'and', 'NN'),
 (u'the', 'NN'),
 (u'size', 'NN'),
 (u'of', 'NN'),
 (u'this', 'NNS'),
 (u'city', 'NN'),
 (u"''", 'NN'),
 (u'.', 'NN')]

In [73]:
regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245

## 4.3   The Lookup Tagger

In [74]:
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)

0.45578495136941344

In [75]:
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)

[(u'``', u'``'),
 (u'Only', None),
 (u'a', u'AT'),
 (u'relative', None),
 (u'handful', None),
 (u'of', u'IN'),
 (u'such', None),
 (u'reports', None),
 (u'was', u'BEDZ'),
 (u'received', None),
 (u"''", u"''"),
 (u',', u','),
 (u'the', u'AT'),
 (u'jury', None),
 (u'said', u'VBD'),
 (u',', u','),
 (u'``', u'``'),
 (u'considering', None),
 (u'the', u'AT'),
 (u'widespread', None),
 (u'interest', None),
 (u'in', u'IN'),
 (u'the', u'AT'),
 (u'election', None),
 (u',', u','),
 (u'the', u'AT'),
 (u'number', None),
 (u'of', u'IN'),
 (u'voters', None),
 (u'and', u'CC'),
 (u'the', u'AT'),
 (u'size', None),
 (u'of', u'IN'),
 (u'this', u'DT'),
 (u'city', None),
 (u"''", u"''"),
 (u'.', u'.')]

In [76]:
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))

In [77]:
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [w for (w, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

In [78]:
%pylab inline
display()
# Skip this for now.

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


## 4.4   Evaluation

# 5   N-Gram Tagging

## 5.1   Unigram Tagging

In [79]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[(u'Various', u'JJ'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'apartments', u'NNS'),
 (u'are', u'BER'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'terrace', u'NN'),
 (u'type', u'NN'),
 (u',', u','),
 (u'being', u'BEG'),
 (u'on', u'IN'),
 (u'the', u'AT'),
 (u'ground', u'NN'),
 (u'floor', u'NN'),
 (u'so', u'QL'),
 (u'that', u'CS'),
 (u'entrance', u'NN'),
 (u'is', u'BEZ'),
 (u'direct', u'JJ'),
 (u'.', u'.')]

In [80]:
unigram_tagger.evaluate(brown_tagged_sents)

0.9349006503968017

## 5.2   Separating the Training and Testing Data

In [81]:
size = int(len(brown_tagged_sents) * 0.9)
size
# calculate size for training and test sets

4160

In [82]:
train_sents = brown_tagged_sents[:size] # training is first 90%
test_sents = brown_tagged_sents[size:] # Test set is last 10%
unigram_tagger = nltk.UnigramTagger(train_sents) # train tagger on first 90%
unigram_tagger.evaluate(test_sents) #Evaluate on test sentences

0.8120203329014253

## 5.3   General N-Gram Tagging

In [83]:
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])

[(u'Various', u'JJ'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'apartments', u'NNS'),
 (u'are', u'BER'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'terrace', u'NN'),
 (u'type', u'NN'),
 (u',', u','),
 (u'being', u'BEG'),
 (u'on', u'IN'),
 (u'the', u'AT'),
 (u'ground', u'NN'),
 (u'floor', u'NN'),
 (u'so', u'CS'),
 (u'that', u'CS'),
 (u'entrance', u'NN'),
 (u'is', u'BEZ'),
 (u'direct', u'JJ'),
 (u'.', u'.')]

In [84]:
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

[(u'The', u'AT'),
 (u'population', u'NN'),
 (u'of', u'IN'),
 (u'the', u'AT'),
 (u'Congo', u'NP'),
 (u'is', u'BEZ'),
 (u'13.5', None),
 (u'million', None),
 (u',', None),
 (u'divided', None),
 (u'into', None),
 (u'at', None),
 (u'least', None),
 (u'seven', None),
 (u'major', None),
 (u'``', None),
 (u'culture', None),
 (u'clusters', None),
 (u"''", None),
 (u'and', None),
 (u'innumerable', None),
 (u'tribes', None),
 (u'speaking', None),
 (u'400', None),
 (u'separate', None),
 (u'dialects', None),
 (u'.', None)]

In [85]:
bigram_tagger.evaluate(test_sents)

0.10276088906608193

## 5.4   Combining Taggers

In [86]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)

0.844911791089405

In [87]:
# Note

# Your Turn: Extend the above example by defining a TrigramTagger called t3, which backs off to t2.

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.NgramTagger(3, train_sents, backoff=t2)
t3.evaluate(test_sents)

0.8424200139539519

In [88]:
# Let's try a 4-Gram...
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.NgramTagger(3, train_sents, backoff=t2)
t4 = nltk.NgramTagger(4, train_sents, backoff=t3)
t4.evaluate(test_sents)

# Interestingly, the bigram was the best n-gram tagger...

0.8411242898435164

## 5.5   Tagging Unknown Words

## 5.6   Storing Taggers

In [None]:
from pickle import dump
output = open('t2.pkl', 'wb')
dump(t2, output, -1)
output.close()

In [None]:
from pickle import load
input = open('t2.pkl', 'rb')
tagger = load(input)
input.close()

In [None]:
text = """The board's action shows what free enterprise
is up against in our complex maze of regulatory laws ."""
tokens = text.split()
tagger.tag(tokens)