# POS Tagging

In [1]:
import antigravity

In [40]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
lemmatizer = WordNetLemmatizer()

In [3]:
lemmatizer.lemmatize("cooking")

'cooking'

In [4]:
lemmatizer.lemmatize("cooking", pos="v")

'cook'

In [5]:
lemmatizer.lemmatize("cookbooks")

'cookbook'

In [6]:
lemmatizer.lemmatize("quickly", pos="r")

'quickly'

In [7]:
syn = wordnet.synsets("cookbook")[0]

In [8]:
print(syn)

Synset('cookbook.n.01')


In [9]:
syn.definition()

'a book of recipes and cooking directions'

# POS Tagging

In [41]:
import requests
import os
import re
from nltk.corpus import wordnet
from nltk.tag import DefaultTagger

In [11]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [12]:
nltk.help.brown_tagset()

(: opening parenthesis
    (
): closing parenthesis
    )
*: negator
    not n't
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ? ; ! :
:: colon
    :
ABL: determiner/pronoun, pre-qualifier
    quite such rather
ABN: determiner/pronoun, pre-quantifier
    all half many nary
ABX: determiner/pronoun, double conjunction or pre-quantifier
    both
AP: determiner/pronoun, post-determiner
    many other next more last former little several enough most least only
    very few fewer past same Last latter less single plenty 'nough lesser
    certain various manye next-to-last particular final previous present
    nuf
AP$: determiner/pronoun, post-determiner, genitive
    other's
AP+AP: determiner/pronoun, post-determiner, hyphenated pair
    many-much
AT: article
    the an no a every th' ever' ye
BE: verb 'to be', infinitive or imperative
    be
BED: verb 'to be', past tense, 2nd person singular or all persons plural
    were
BED*: verb 'to be', past tense, 2nd person singular or 

In [42]:
from nltk.probability import FreqDist
from nltk.corpus import treebank

# Count how often each POS tag occurs in the treebank sample.
# FreqDist tallies the items of any iterable, so no manual loop is needed
# and the unused word half of each (word, tag) pair is simply ignored.
fd = FreqDist(tag for _word, tag in treebank.tagged_words())
fd.items()

dict_items([('NNP', 9410), (',', 4886), ('CD', 3546), ('NNS', 6047), ('JJ', 5834), ('MD', 927), ('VB', 2554), ('DT', 8165), ('NN', 13166), ('IN', 9857), ('.', 3874), ('VBZ', 2125), ('VBG', 1460), ('CC', 2265), ('VBD', 3043), ('VBN', 2134), ('-NONE-', 6592), ('RB', 2822), ('TO', 2179), ('PRP', 1716), ('RBR', 136), ('WDT', 445), ('VBP', 1321), ('RP', 216), ('PRP$', 766), ('JJS', 182), ('POS', 824), ('``', 712), ('EX', 88), ("''", 694), ('WP', 241), (':', 563), ('JJR', 381), ('WRB', 178), ('$', 724), ('NNPS', 244), ('WP$', 14), ('-LRB-', 120), ('-RRB-', 126), ('PDT', 27), ('RBS', 35), ('FW', 4), ('UH', 3), ('SYM', 1), ('LS', 13), ('#', 16)])

In [14]:
tagger = DefaultTagger("NN")

In [15]:
tagger.tag(["hello","world"])

[('hello', 'NN'), ('world', 'NN')]

In [21]:
# NOTE(review): this slice includes the first 1000 sentences, which are the
# same ones used as train_sents in this notebook, so the reported accuracies
# are partly measured on training data. Consider a held-out slice such as
# treebank.tagged_sents()[3000:] and re-running the evaluation cells.
test_sents = treebank.tagged_sents()[:3000]

In [17]:
tagger.evaluate(test_sents)

0.12702713163292953

In [None]:
tagger.tag_sents([
    ["hello","world"],
    ["tagging","is","fun","!"],
])

In [20]:
from nltk.tag import untag
untag([('hello', 'NN'), ('world', 'NN')])

['hello', 'world']

In [43]:
from nltk.tag import UnigramTagger
train_sents = treebank.tagged_sents()[:1000]

In [16]:
tagger = UnigramTagger(train_sents)

In [17]:
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [44]:
tagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [22]:
tagger.evaluate(test_sents)

0.8365909354801254

In [23]:
tagger1 = DefaultTagger("NN")
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
tagger2.evaluate(test_sents)

0.8617486550296087

In [45]:
from nltk.tag import BigramTagger, TrigramTagger

In [25]:
bitagger = BigramTagger(train_sents)
bitagger.evaluate(test_sents)

0.34926655571467274

In [26]:
tritagger = TrigramTagger(train_sents)
tritagger.evaluate(test_sents)

0.3359781192346893

In [27]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers, each backing off to the one before it.

    Every entry of ``tagger_classes`` is called, in order, as
    ``cls(train_sents, backoff=<previous tagger>)``; the first entry receives
    the ``backoff`` argument passed in.

    Returns the last tagger constructed, or ``backoff`` unchanged when
    ``tagger_classes`` is empty.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        # Each new tagger wraps the chain built so far.
        chain = tagger_cls(train_sents, backoff=chain)
    return chain

In [28]:
#%%timeit
backoff = DefaultTagger("NN")
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)

0.8735018255473417

In [None]:
from nltk.tag import tnt
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

In [32]:
text = nltk.word_tokenize("And now for something completely different.")
tag1 = tagger.tag(text)

In [30]:
tagger

<TrigramTagger: size=218>

In [33]:
tag2 = nltk.pos_tag(text)

In [35]:
tag1.append(("TEST","TEST"))

In [36]:
tag1

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ'),
 ('.', '.'),
 ('TEST', 'TEST')]

In [38]:
for tag in tag1:
    if tag not in tag2:
        print(tag)

('TEST', 'TEST')


## How to train a tagger?

1. Hand-tag one thousand sentences.
2. Train the model on them.
3. Test the model on the next thousand sentences.
4. Correct its output by hand.
5. Repeat with the next 2,000 sentences.

In [None]:
help(nltk.tag)

In [46]:
import pickle

# Persist the trained tagger to disk. The context manager closes the file
# even if pickle.dump raises part-way through.
with open("tagger.pickle", "wb") as save_tagger:
    pickle.dump(tagger, save_tagger)

In [47]:
# Load the pickled tagger back from disk. The context manager guarantees the
# file is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("tagger.pickle", "rb") as pickle_tagger_file:
    pickle_tagger = pickle.load(pickle_tagger_file)

In [None]:
pickle_tagger.tag(["hello","world"])

 # Graphing POS

In [48]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [49]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [50]:
def process_content():
    """Print the POS-tagged tokens of every sentence in ``tokenized``."""
    for sentence in tokenized:
        # Tokenize the sentence into words, tag them, and print the result.
        print(nltk.pos_tag(nltk.word_tokenize(sentence)))

In [None]:
process_content()

In [51]:
tokens = nltk.word_tokenize(train_text+sample_text)
tagged = nltk.pos_tag(tokens)
tagged[:10]

[('PRESIDENT', 'NNP'),
 ('GEORGE', 'NNP'),
 ('W.', 'NNP'),
 ('BUSH', 'NNP'),
 ("'S", 'POS'),
 ('ADDRESS', 'NNP'),
 ('BEFORE', 'IN'),
 ('A', 'NNP'),
 ('JOINT', 'NNP'),
 ('SESSION', 'NNP')]

In [52]:
%matplotlib notebook
# Collect every token tagged JJ and plot the frequency distribution,
# passing 20 as the first argument to plot().
adjectives = [token for token, pos in tagged if pos == "JJ"]
adjFreqs = nltk.FreqDist(adjectives)

adjFreqs.plot(20, title="President Bush, State of the Union by JJ")

<IPython.core.display.Javascript object>

In [53]:
%matplotlib notebook
# Collect every token tagged NN and plot the frequency distribution,
# passing 20 as the first argument to plot().
# NOTE: the names `adjectives` / `adjFreqs` are reused from the JJ cell but
# hold NN-tagged tokens here.
adjectives = [token for token, pos in tagged if pos == "NN"]
adjFreqs = nltk.FreqDist(adjectives)

adjFreqs.plot(20, title="President Bush, State of the Union by NN")

<IPython.core.display.Javascript object>

In [None]:
def draw_chunk_tree():
    """Draw a chunk tree for each sentence in ``tokenized[10:20]``.

    Each sentence is word-tokenized and POS-tagged, then parsed with a
    single-rule regular-expression chunk grammar; ``draw()`` is called on
    every resulting tree.
    """
    # The grammar and parser do not depend on the sentence, so build them
    # once instead of on every loop iteration.
    # NOTE(review): ``<VB..?>`` appears to require at least one character
    # after "VB", so a bare ``VB`` tag would not match. Confirm this is
    # intended (``<VB.?>`` would also accept plain ``VB``).
    chunk = r"""Chunk: {<RB.?>*<VB..?>*<NNP.?>}"""
    chunkParser = nltk.RegexpParser(chunk)

    for sentence in tokenized[10:20]:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        chunkParser.parse(tagged).draw()

In [None]:
draw_chunk_tree()

## Named Entity Recognition



In [None]:
def ner_processor():
    """Run ``nltk.ne_chunk`` on every sentence in ``tokenized`` and draw each tree."""
    for sentence in tokenized:
        pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        # Chunk the tagged tokens and call draw() on the resulting tree.
        nltk.ne_chunk(pos_tagged, binary=False).draw()

ner_processor()

In [None]:
ner_out