NLP and Tagging

In [None]:
import nltk
from nltk.corpus import brown

# Fetch the Brown corpus into the local nltk_data directory (no-op message if
# it is already present), then list the genre categories it is divided into.
nltk.download('brown')
brown.categories()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [None]:
# All word tokens of the 'fiction' category as one flat sequence
# (the notebook only displays a truncated preview).
brown.words(categories='fiction')

['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]

In [None]:
# Same category, but grouped by sentence: each item is a list of tokens.
brown.sents(categories=['fiction'])

[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.'], ...]

In [None]:
# Words of a single corpus file, selected by file id rather than by category.
brown.words(fileids=['ck04'])

['The', 'Bishop', 'looked', 'at', 'him', 'coldly', ...]

In [None]:
# Count every token in the fiction category, then report how often each modal
# verb occurs.  Counting is case-sensitive: only the lowercase spellings listed
# here are looked up.
fiction_text = brown.words(categories='fiction')
fdist = nltk.FreqDist(fiction_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for modal in modals:
    print(f"{modal}:", fdist[modal])

can: 37
could: 166
may: 8
might: 44
must: 55
will: 52


In [None]:
# Frequency of wh-words in the adventure category, printed on a single line.
# The list is named `wh_words` (not `modals`): these are not modal verbs, and
# reusing that name would overwrite the modal list defined for the fiction
# counts.  Lookups are case-sensitive, so sentence-initial "What"/"When" are
# not included in these totals.
adventure_text = brown.words(categories='adventure')
fdist = nltk.FreqDist(adventure_text)
wh_words = ['what', 'when', 'where', 'who']
for wh in wh_words:
    print(wh + ':', fdist[wh], end=" ")

what: 110 when: 126 where: 53 who: 91 

In [None]:
# Rows and columns to show in the table below.
genres = ['news', 'religion', 'hobbies', 'fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# One (genre, word) pair per token across every Brown category; the
# conditional distribution keeps a separate word count for each genre.
genre_word_pairs = (
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

# Print only the selected genres (rows) against the modal verbs (columns).
cfd.tabulate(conditions=genres, samples=modals)

           can could   may might  must  will 
    news    93    86    66    38    50   389 
religion    82    59    78    12    54    71 
 hobbies   268    58   131    22    83   264 
 fiction    37   166     8    44    55    52 
 romance    74   193    11    51    45    43 
   humor    16    30     8     8     9    13 


In [None]:
# Untagged and tagged views of the same 'fiction' sentences.
brown_sents = brown.sents(categories='fiction')
brown_tagged_sents = brown.tagged_sents(categories='fiction')

# Train a unigram tagger on every tagged fiction sentence, then tag the
# second sentence of that same category as a quick sanity check.
unigram_tagger = nltk.UnigramTagger(train=brown_tagged_sents)
unigram_tagger.tag(brown_sents[1])

[('Scotty', 'NP'),
 ('did', 'DOD'),
 ('not', '*'),
 ('go', 'VB'),
 ('back', 'RB'),
 ('to', 'TO'),
 ('school', 'NN'),
 ('.', '.')]

In [None]:
# Show the first two fiction sentences in full.  Sentence 0 is the single
# token 'Thirty-three' — presumably why index 1 was used for the tagging demo.
print(brown_sents[:2])

[['Thirty-three'], ['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.']]


In [None]:
# Accuracy measured on the very sentences the tagger was trained on, so this
# figure is optimistic; a held-out split is needed for a fair estimate.
# NOTE(review): newer NLTK releases deprecate `evaluate` in favour of
# `accuracy` — confirm against the installed version before switching.
unigram_tagger.evaluate(brown_tagged_sents)

0.9415956079897209

In [None]:
# Share of the tagged fiction sentences used for training; the rest is held out.
TRAIN_FRACTION = 0.9
size = int(TRAIN_FRACTION * len(brown_tagged_sents))
size

3824

In [None]:
# The first `size` sentences train the tagger; the remainder is the test set.
train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]

# Retrain the unigram tagger on the training slice only and score it on
# sentences it has not seen.
unigram_tagger = nltk.UnigramTagger(train=train_sents)
unigram_tagger.evaluate(test_sents)

0.824455652601796

In [None]:
# Bigram tagger with no backoff, trained on the training slice.  Sentence 10
# lies inside that slice (10 < size), so this tags a sentence the model has
# already seen.
bigram_tagger = nltk.BigramTagger(train=train_sents)
bigram_tagger.tag(brown_sents[10])

[('They', 'PPSS'),
 ('ate', 'VBD'),
 ('the', 'AT'),
 ('cafeteria', 'NN'),
 ('food', 'NN'),
 ('with', 'IN'),
 ('its', 'PP$'),
 ('orange', 'JJ'),
 ('sauces', 'NNS'),
 ('and', 'CC'),
 ('Scotty', 'NP'),
 ('gazed', 'VBD'),
 ('without', 'IN'),
 ('interest', 'NN'),
 ('at', 'IN'),
 ('his', 'PP$'),
 ('food', 'NN'),
 (',', ','),
 ('the', 'AT'),
 ('teachers', 'NNS'),
 (',', ','),
 ('the', 'AT'),
 ('heroic', 'JJ'),
 ('baronial', 'JJ'),
 ('windows', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'AT'),
 ('bright', 'JJ'),
 ('ranks', 'NNS'),
 ('of', 'IN'),
 ('college', 'NN'),
 ('banners', 'NNS'),
 ('.', '.')]

In [None]:
# Pick a sentence the bigram tagger has genuinely never seen.  The training
# data is brown_tagged_sents[:size], so any index below `size` (such as 500)
# would come from the training slice; index `size` is the first held-out
# sentence.  brown_sents and brown_tagged_sents are both drawn from the
# 'fiction' category, so the same index refers to the same sentence.
unseen_sent = brown_sents[size]

# Without a backoff tagger, a token whose context was not seen in training is
# tagged None, and the tokens after it usually end up None as well.
bigram_tagger.tag(unseen_sent)

[('The', 'AT'),
 ('children', 'NNS'),
 ('had', 'HVD'),
 ('nowhere', 'RB'),
 ('to', 'IN'),
 ('go', None),
 ('and', None),
 ('no', None),
 ('place', None),
 ('to', None),
 ('play', None),
 (',', None),
 ('not', None),
 ('even', None),
 ('sidewalks', None),
 ('.', None)]

In [None]:
# Held-out accuracy of the bigram tagger on its own (no backoff).  Tokens it
# tags as None cannot match the gold tag, which presumably explains the very
# low score compared with the unigram tagger.
bigram_tagger.evaluate(test_sents)

0.16742526756058557

In [None]:
# Backoff chain, most specific model last:
#   t0 tags every token 'NN',
#   t1 is a unigram model that defers to t0,
#   t2 is a bigram model that defers to t1.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train=train_sents, backoff=t0)
t2 = nltk.BigramTagger(train=train_sents, backoff=t1)

# Score the combined tagger on the held-out sentences.
t2.evaluate(test_sents)

0.8657891499569442

In [None]:
# Add a trigram model on top of the existing backoff chain.  `t2` (bigram ->
# unigram -> default 'NN') was already trained on `train_sents` in the previous
# cell, so it is reused here instead of retraining three identical taggers.
t3 = nltk.TrigramTagger(train_sents, backoff=t2)

# Held-out accuracy of the full chain.  On this small training set the trigram
# layer does not improve on the bigram chain's score.
t3.evaluate(test_sents)

0.8639439045393037

In [None]:
from pickle import HIGHEST_PROTOCOL, dump

# Persist the trained trigram tagger so it can be reloaded without retraining.
# The `with` block closes the file even if pickling fails, and
# HIGHEST_PROTOCOL is the named equivalent of passing protocol -1.
with open('t3.pkl', 'wb') as output:
    dump(t3, output, HIGHEST_PROTOCOL)

In [None]:
from pickle import load

# Reload the saved tagger.  The file handle gets its own name so the builtin
# `input` is not shadowed for the rest of the notebook, and the `with` block
# guarantees the file is closed.
# NOTE: unpickling can execute arbitrary code — only load files that this
# notebook (or another trusted source) produced.
with open('t3.pkl', 'rb') as pkl_file:
    tagger = load(pkl_file)

In [None]:
text = """Two roads diverged in a yellow wood,
And sorry I could not travel both. 
And be one traveler, long I stood..."""

# Split words and punctuation into separate tokens before tagging.  A plain
# str.split() leaves punctuation attached ('wood,', 'both.', 'stood...'); such
# strings never occur as Brown tokens, so they fall through to the default
# 'NN' tag.  The regex-based word/punctuation tokenizer needs no extra data
# download.
tokens = nltk.tokenize.wordpunct_tokenize(text)
tagger.tag(tokens)

[('Two', 'CD'),
 ('roads', 'NNS'),
 ('diverged', 'NN'),
 ('in', 'IN'),
 ('a', 'AT'),
 ('yellow', 'JJ'),
 ('wood,', 'NN'),
 ('And', 'CC'),
 ('sorry', 'JJ'),
 ('I', 'PPSS'),
 ('could', 'MD'),
 ('not', '*'),
 ('travel', 'VB'),
 ('both.', 'NN'),
 ('And', 'CC'),
 ('be', 'BE'),
 ('one', 'CD'),
 ('traveler,', 'NN'),
 ('long', 'JJ'),
 ('I', 'PPSS'),
 ('stood...', 'NN')]