POS tagging exercises
----

Compare packages: nltk vs. TextBlob

In [1]:
reset -fs

In [12]:
# Sample quote (two lines) used to compare the nltk and TextBlob taggers.
text = (
    "My whole life is about winning. I don't lose often. I almost never lose.\n"
    "- Donald Trump"
)

In [33]:
# POS-tag the sample text with nltk: tokenize first, then tag the tokens.
from nltk import pos_tag, word_tokenize

# The token list shown under this cell appears to come from evaluating
# word_tokenize(text) on its own.
word_tokens = word_tokenize(text)
tags_nltk = pos_tag(word_tokens)

['My',
 'whole',
 'life',
 'is',
 'about',
 'winning',
 '.',
 'I',
 'do',
 "n't",
 'lose',
 'often',
 '.',
 'I',
 'almost',
 'never',
 'lose',
 '.',
 '-',
 'Donald',
 'Trump']

In [27]:
# Sanity check: nltk must return exactly these (token, tag) pairs.
# Punctuation tokens ('.', '-') are part of the expected output.
assert tags_nltk == [('My', 'PRP$'),
 ('whole', 'JJ'),
 ('life', 'NN'),
 ('is', 'VBZ'),
 ('about', 'IN'),
 ('winning', 'VBG'),
 ('.', '.'),
 ('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('lose', 'VB'),
 ('often', 'RB'),
 ('.', '.'),
 ('I', 'PRP'),
 ('almost', 'RB'),
 ('never', 'RB'),
 ('lose', 'VBP'),
 ('.', '.'),
 ('-', ':'),
 ('Donald', 'NNP'),
 ('Trump', 'NNP')]

In [31]:
# POS-tag the same text with TextBlob; the tagged pairs are read from `.tags`.
from textblob import TextBlob

blob = TextBlob(text)
tags_blob = blob.tags

In [32]:
# Sanity check: TextBlob must return exactly these (token, tag) pairs.
# No punctuation tokens are expected in this list.
assert tags_blob == [('My', 'PRP$'),
 ('whole', 'JJ'),
 ('life', 'NN'),
 ('is', 'VBZ'),
 ('about', 'IN'),
 ('winning', 'VBG'),
 ('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('lose', 'VB'),
 ('often', 'RB'),
 ('I', 'PRP'),
 ('almost', 'RB'),
 ('never', 'RB'),
 ('lose', 'VBP'),
 ('Donald', 'NNP'),
 ('Trump', 'NNP')]


# : Compare the results by hand (DO NOT WRITE ANY CODE). Are they the same? Why or why not?

Classifications are the same; however, TextBlob(text).tags does not include punctuation, while nltk.pos_tag(word_tokenize(text)) does.

# : Which API is easier to use?

The TextBlob API is easier to use because the TextBlob class encapsulates tokenization and tagging, so there is no need to chain multiple function calls.

# Why? Try to describe it in computer science / OOP terms

We encapsulate the POS tuples as a member (the `tags` property) of an object, rather than calling a `pos_tag` function on a list of tokens.

<details><summary>
Click here for hints
</summary>
- [Class methods](https://julien.danjou.info/blog/2013/guide-python-static-class-abstract-methods)  
- [Properties](http://www.python-course.eu/python3_properties.php)
</details>

----
Let's define custom tags (fancy).

![](https://media.giphy.com/media/VlchmIoZPjjYQ/giphy.gif)

Default tagger: use the most frequent tag

In [36]:
# Tokenize the sample sentence for the default-tagger exercise.
tokens = word_tokenize("Thieves leave young athletes in the dark")

In [42]:
# Create a default tagger that assigns the tag NN (noun, singular or mass) to
# every token. nltk provides DefaultTagger for this, so no hand-written class
# is needed.
import nltk

default_tagger = nltk.DefaultTagger('NN')
default_tagger

<DefaultTagger: tag=NN>

In [43]:
# Tag every token with the default tagger; the result is a list of (token, tag) pairs.
default_tags = default_tagger.tag(tokens)

In [44]:
# Every token, whatever its real part of speech, must come back tagged 'NN'.
assert default_tags == [('Thieves', 'NN'),
 ('leave', 'NN'),
 ('young', 'NN'),
 ('athletes', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('dark', 'NN')]

In [54]:
# : Is a default tagger good for anything?

It is good for words that are not in the system. For example, when the POS of a word is unknown, such as for words from a foreign language, you can use a default tagger to tag the unrecognized words.

The regular expression tagger

In [92]:
# Sample sentence containing three month names (January, March, August).
text = 'January, He was born in March 1991, August'

tags:

```
('He', 'PRP')
('was', 'VBD')
('born', 'VBN')
('in', 'IN')
('March', 'NNP')
('1991', 'CD')
```

('March', 'NNP') is not a great category.

In [88]:
# Scratch demo of star-unpacking: print(*a) passes each element of `a` as a
# separate positional argument, so the values are printed space-separated.
a = [1, 2, 3, 4, 5]

print(*a)

1 2 3 4 5


In [93]:
# Custom POS tags: a single rule that tags the token "March" as MONTH.
pattern = [(r'(March)$', 'MONTH')]
tagger = nltk.RegexpTagger(pattern)

# Print one (token, tag) pair per line; tokens matching no rule get None.
march_tagged = tagger.tag(nltk.word_tokenize(text))
print('\n'.join(str(pair) for pair in march_tagged))

('January', None)
(',', None)
('He', None)
('was', None)
('born', None)
('in', None)
('March', 'MONTH')
('1991', None)
(',', None)
('August', None)


In [94]:
# List the twelve month names from the standard library.
# `calendar` is imported here because nothing above this cell imports it;
# without it this cell raises NameError when the notebook runs top to bottom.
import calendar

# calendar.month_name[0] is an empty placeholder, so the slice starts at 1.
calendar.month_name[1:13]

['January',
 'February',
 'March',
 'April',
 'May',
 'June',
 'July',
 'August',
 'September',
 'October',
 'November',
 'December']

In [96]:
# Extend the tagger to all months: build one (regex, 'MONTH') rule per month
# name taken from the standard library.
import calendar

months = [calendar.month_name[number] for number in range(1, 13)]

months_pattern = [('({})$'.format(name), 'MONTH') for name in months]

tagger = nltk.RegexpTagger(months_pattern)

# Print one (token, tag) pair per line.
all_months_tagged = tagger.tag(nltk.word_tokenize(text))
print('\n'.join(str(pair) for pair in all_months_tagged))

('January', 'MONTH')
(',', None)
('He', None)
('was', None)
('born', None)
('in', None)
('March', 'MONTH')
('1991', None)
(',', None)
('August', 'MONTH')


<br>
<br>
<details><summary>
Click here for a hint:
</summary>
Load months from Standard Library. Then programmatically write the pattern.
</details>

Test your new pattern against this text:

In [97]:
# Poem used to exercise the month patterns. Note that 'july' and 'september'
# appear in lower case here, and a stray '|' line sits between the May and
# June stanzas (it shows up as its own token in the tagger output).
text = """January brings the snow,
makes our feet and fingers glow.

February brings the rain,
Thaws the frozen lake again.

March brings breezes loud and shrill,
stirs the dancing daffodil.

April brings the primrose sweet,
Scatters daises at our feet.

May brings flocks of pretty lambs,
Skipping by their fleecy damns.
|
June brings tulips, lilies, roses,
Fills the children's hand with posies.

Hot july brings cooling showers,
Apricots and gillyflowers.

August brings the sheaves of corn,
Then the harvest home is borne.

Warm september brings the fruit,
Sportsmen then begin to shoot.

Fresh October brings the pheasents,
Then to gather nuts is pleasent.

Dull November brings the blast,
Then the leaves are whirling fast.

Chill December brings the sleet,
Blazing fire, and Christmas treat. """

The output should be like:

```
('January', 'MONTH')
('brings', None)
('the', None)
('snow', None)
...
```

In [98]:
# Apply the month rules to the poem and print one (token, tag) pair per line.
tagger = nltk.RegexpTagger(months_pattern)

poem_tagged = tagger.tag(nltk.word_tokenize(text))
print('\n'.join(map(str, poem_tagged)))

('January', 'MONTH')
('brings', None)
('the', None)
('snow', None)
(',', None)
('makes', None)
('our', None)
('feet', None)
('and', None)
('fingers', None)
('glow', None)
('.', None)
('February', 'MONTH')
('brings', None)
('the', None)
('rain', None)
(',', None)
('Thaws', None)
('the', None)
('frozen', None)
('lake', None)
('again', None)
('.', None)
('March', 'MONTH')
('brings', None)
('breezes', None)
('loud', None)
('and', None)
('shrill', None)
(',', None)
('stirs', None)
('the', None)
('dancing', None)
('daffodil', None)
('.', None)
('April', 'MONTH')
('brings', None)
('the', None)
('primrose', None)
('sweet', None)
(',', None)
('Scatters', None)
('daises', None)
('at', None)
('our', None)
('feet', None)
('.', None)
('May', 'MONTH')
('brings', None)
('flocks', None)
('of', None)
('pretty', None)
('lambs', None)
(',', None)
('Skipping', None)
('by', None)
('their', None)
('fleecy', None)
('damns', None)
('.', None)
('|', None)
('June', 'MONTH')
('brings', None)
('tulips', None)
(',

In [101]:
# Unit test: every token spelled exactly like a month name must have been
# tagged MONTH by the regex tagger.
# The check runs on the month tagger's own output; the earlier nltk.pos_tag
# result contains no month tokens, so testing it could never fail.
month_tags = tagger.tag(nltk.word_tokenize(text))

# Guard against a vacuous pass: the text must contain at least one month.
assert any(token in months for token, _ in month_tags)

for token, tag in month_tags:
    if token in months:
        # The failing pair is attached to make a mismatch easy to diagnose.
        assert tag == 'MONTH', (token, tag)

In [62]:
# Ordered regex rules for nltk.RegexpTagger: the first pattern that matches a
# token decides its tag, so the specific suffix rules come before the
# catch-all at the end.
patterns = [
    (r'.*ing$', 'VBG'),   # gerund, e.g. 'going'
    (r'.*ed$', 'VBD'),
    (r'.*es$', 'VBZ'),
    (r'.*ould$', 'MD'),
    (r'.*\'s$', 'NN$'),
    (r'.*s$', 'NNS'),
    (r'.*', 'NN'),        # default: every remaining token is tagged NN
]

In [65]:
# Split the sample on whitespace (so "It's" stays one token) and tag it with
# the regex rules.
sample_sentence = "It's going going gone Go Cubs"
text = sample_sentence.split()

regexp_tagger = nltk.RegexpTagger(patterns)
tagged = regexp_tagger.tag(text)

In [66]:
# Expected result: 'going' needs a VBG rule, and 'gone' / 'Go' need an NN
# catch-all rule in `patterns` for this to pass.
assert tagged == [("It's", 'NN$'),
 ('going', 'VBG'),
 ('going', 'VBG'),
 ('gone', 'NN'),
 ('Go', 'NN'),
 ('Cubs', 'NNS')]

<br>
<br> 
<br>

----