# In this chapter, Bya will cover the following recipes:
1. Default tagging
2. Training a unigram part-of-speech tagger 
3. Combining taggers with backoff tagging 
4. Training and combining ngram taggers
5. Creating a model of likely word tags
6. Tagging with regular expressions
7. Affix tagging
8. Training a Brill tagger
9. Training the TnT tagger
10. Using WordNet for tagging
11. Tagging proper names
12. Classifier-based tagging
13. Training a tagger with NLTK-Trainer

**Part-of-speech** tagging is the process of converting a sentence, in the form of a list of words, into a list of tuples, where each tuple is of the form (**word, tag**). The **tag** is a part-of-speech tag, and signi es whether the word is a noun, adjective, verb, and so on.

# 1. Default tagging

In [1]:
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['Hello', 'World'])

[('Hello', 'NN'), ('World', 'NN')]

### Evaluating accuracy

In [3]:
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.14331966328512843

### Tagging sentences

In [8]:
tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

[[('Hello', 'NN'), ('world', 'NN'), ('.', 'NN')],
 [('How', 'NN'), ('are', 'NN'), ('you', 'NN'), ('?', 'NN')]]

### Untagging a tagged sentence

In [1]:
from nltk.tag import untag

untag([('Hello', 'NN'), ('World', 'NN')])

['Hello', 'World']

# 2. Training a unigram part-of-speech tagger

A **unigram** generally refers to a single token. Therefore, a unigram tagger only uses a single word as its context for determining the part-of-speech tag.

### Train a tagger with treebank corpus

In [3]:
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

In [4]:
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [5]:
tagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

### Evaluate test_sents

In [6]:
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]

tagger.evaluate(test_sents)

0.8585365853658536

### Overriding the context model

In [9]:
tagger = UnigramTagger(model={'Pierre': 'NN'})
tagger.tag(treebank.sents()[0])

[('Pierre', 'NN'),
 ('Vinken', None),
 (',', None),
 ('61', None),
 ('years', None),
 ('old', None),
 (',', None),
 ('will', None),
 ('join', None),
 ('the', None),
 ('board', None),
 ('as', None),
 ('a', None),
 ('nonexecutive', None),
 ('director', None),
 ('Nov.', None),
 ('29', None),
 ('.', None)]

### Minimum frequency cutoff

The ContextTagger class uses frequency of occurrence to decide which tag is most likely for a given context. By default, it will do this even if the context word and tag occurs only once. If you'd like to set a minimum frequency threshold, then you can pass a cutoff value to the UnigramTagger class.

In [10]:
tagger = UnigramTagger(train_sents, cutoff=3)
tagger.evaluate(test_sents)

0.7756529246708397

# 3. Combining tagger with backoff tagging

### Backoff tagging

In [11]:
from nltk.tag import DefaultTagger
tagger1 = DefaultTagger('NN')

from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger2 = UnigramTagger(train_sents, backoff=tagger1)

In [12]:
test_sents = treebank.tagged_sents()[3000:]
tagger2.evaluate(test_sents)

0.8755018346643644

In [14]:
tagger2._taggers

[<UnigramTagger: size=8818>, <DefaultTagger: tag=NN>]

### Saving and loading a trained tagger with pickle

In [15]:
# save the tagger
import pickle

with open('tagger.pickle', 'wb') as f:
    pickle.dump(tagger, f)

In [16]:
# load the tagger
import pickle

with open('tagger.pickle', 'rb') as f:
    tagger = pickle.load(f)

# 4. Training and combining ngram taggers

In [17]:
# datas
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

In [18]:
# BigramTagger
from nltk.tag import BigramTagger

bitagger = BigramTagger(train_sents)
bitagger.evaluate(test_sents)

0.11323116770990718

In [19]:
# TrigramTagger
from nltk.tag import TrigramTagger

tritagger = TrigramTagger(train_sents)
tritagger.evaluate(test_sents)

0.06906971724584503

# `tag_util.py`

In [None]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    
    return backoff

In [20]:
from tag_util import backoff_tagger
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

backoff = DefaultTagger('NN')
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
                                     TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)

0.881156917763868

### Quadgram tagger

In [23]:
from nltk.tag import NgramTagger

quadtagger = NgramTagger(4, train_sents)
quadtagger.evaluate(test_sents)

0.05836391107273905

# `taggers.py`

In [24]:
from nltk.tag import NgramTagger

class QuadgramTagger(NgramTagger):
    def __init__(self, *args, **kwargs):
        NgramTagger.__init__(self, 4, *args, **kwargs)

In [26]:
from taggers import QuadgramTagger

quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
                                          TrigramTagger, QuadgramTagger],
                           backoff = backoff)
quadtagger.evaluate(test_sents)

0.8809842434707533

# 5. Creating a model of likely word tags

# `tag_util.py`

we can construct a model of the 200 most frequent words as keys, with the most frequent tag for each word as a value. 

In [3]:
from nltk.probability import FreqDist, ConditionalFreqDist

def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    
    most_freq = (word for word, count in fd.most_common(limit))
    
    return dict((word, cfd[word].max()) for word in most_freq)

In [1]:
from tag_util import word_tag_model
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.5594215411180661

In [5]:
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from tag_util import backoff_tagger

default_tagger = DefaultTagger('NN')
likely_tagger = UnigramTagger(model=model, backoff=default_tagger)

train_sents = treebank.tagged_sents()[:3000]
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger],
                       backoff=likely_tagger)
tagger.evaluate(test_sents)

0.8790848262464925

In [6]:
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger,
TrigramTagger], backoff=default_tagger)

likely_tagger = UnigramTagger(model=model, backoff=tagger)
likely_tagger.evaluate(test_sents)

0.8810274120440319

# 6. Tagging with regular expressions

# `tag_util.py`

In [7]:
patterns = [
     (r'^\d+$', 'CD'), # cardinal numbers i.e 1 2 3
     (r'.*ing$', 'VBG'), # gerunds, i.e. wondering
     (r'.*ment$', 'NN'), # i.e. wonderment
     (r'.*ful$', 'JJ') # i.e. wonderful
]

In [3]:
from tag_util import patterns
from nltk.tag import RegexpTagger
from nltk.corpus import treebank

tagger = RegexpTagger(patterns)

test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)

0.037470321605870924

# 7. Affix tagging

The default arguments for an **AffixTagger** class specify three-character suffixes, and that words must be at least five characters long. If a word is less than five characters, then None is returned as the tag.

In [1]:
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

In [5]:
from nltk.tag import AffixTagger

tagger = AffixTagger(train_sents)
tagger.evaluate(test_sents)

0.27476796891862726

In [6]:
prefix_tagger = AffixTagger(train_sents, affix_length=3)
prefix_tagger.evaluate(test_sents)

0.23621843298078998

In [7]:
suffix_tagger = AffixTagger(train_sents, affix_length=2)
suffix_tagger.evaluate(test_sents)

0.3004101014461472

# 9. Training a TnT tagger

In [3]:
from nltk.tag import tnt
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

0.8756313403842003

In [4]:
from nltk.tag import DefaultTagger
unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

0.8925102525361537

In [5]:
tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(train_sents)
tnt_tagger.evaluate(test_sents)

0.8756313403842003

# 10. Using WordNet for tagging

| WordNet tag        | Treebank tag|
|-------------:| -----:|
| n      | NN | 
| a      | JJ      |   
| s | JJ      |
| r|RB |
| v|VB |

# `taggers.py`

In [6]:
from nltk.tag import SequentialBackoffTagger
from nltk.corpus import wordnet
from nltk.probability import FreqDist

class WordNetTagger(SequentialBackoffTagger):
    '''
     >>> wt = WordNetTagger()
     >>> wt.tag(['food', 'is', 'great'])
     [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')]
    '''
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        self.wordnet_tag_map = {
            'n': 'NN',
            's': 'JJ',
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
            }

    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        fd = FreqDist()
        for synset in wordnet.synsets(word):
            fd[synset.pos()] += 1
        return self.wordnet_tag_map.get(fd.max())
