# POS Taggers 

In [1]:
# Sample sentence that every tagger in this notebook is run against.
sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# NLTK's recommended tagger (based on PTB); tagset='universal' asks for the
# coarse universal tag names in the result.
import nltk

tokens = nltk.word_tokenize(sentence)
tagged_sent = nltk.pos_tag(tokens, tagset='universal')
print(tagged_sent)

[('The', 'DET'), ('brown', 'ADJ'), ('fox', 'NOUN'), ('is', 'VERB'), ('quick', 'ADJ'), ('and', 'CONJ'), ('he', 'PRON'), ('is', 'VERB'), ('jumping', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN')]


In [7]:
# Tag the same sample sentence with the tagger shipped in pattern.en.
from pattern.en import tag

tagged_sent = tag(sentence)
print(tagged_sent)

[('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


 That output gives us tags that purely follow the Penn Treebank format, specifying the
form of adjective, noun, or verb in more detail (for example, VBZ and VBG rather than just VERB).

In [8]:
# Building your own tagger: start with a trivial baseline.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger

# Prepare the data: the first 3500 tagged sentences are used for training,
# everything after them is held out for testing.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]
print(train_data[0])

# Default tagger: constructed with the single tag 'NN'.
dt = DefaultTagger('NN')

print(dt.evaluate(test_data))

print(dt.tag(tokens))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
0.1454158195372253
[('The', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'NN'), ('quick', 'NN'), ('and', 'NN'), ('he', 'NN'), ('is', 'NN'), ('jumping', 'NN'), ('over', 'NN'), ('the', 'NN'), ('lazy', 'NN'), ('dog', 'NN')]


We can see from the preceding output that we have obtained roughly 14.5 percent accuracy in
correctly tagging words from the treebank test dataset—which is not that great. The
output tags on our sample sentence are all nouns, just as we expected, because the
default tagger assigns the same tag (NN) to every token.
 We will now use regular expressions and the RegexpTagger to see if we can build a
better-performing tagger:

In [9]:
# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns as (regex, tag) pairs; the catch-all '.*' entry
# comes last and the '.*es$' / '.*\'s$' rules come before the broader '.*s$'.
# NOTE(review): presumably RegexpTagger uses the first pattern that matches,
# in list order -- confirm against the NLTK docs before reordering entries.
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        # NOTE(review): 'NN$' does not look like a Penn Treebank tag (the
        # treebank sample printed earlier uses tags such as NNP/NNS/POS-style
        # names) -- confirm this tag can ever match the gold data.
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        # NOTE(review): the '.' inside '(.[0-9]+)?' is unescaped, so it matches
        # any single character (',' and '.', but also letters) -- confirm
        # whether that is intended before tightening it; the accuracy printed
        # below was produced with the pattern exactly as written.
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default) ... 
]
rt = RegexpTagger(patterns)

# score the regex tagger on the held-out sentences, then tag the sample tokens
print (rt.evaluate(test_data))
print (rt.tag(tokens))

0.24039113176493368
[('The', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'NNS'), ('quick', 'NN'), ('and', 'NN'), ('he', 'NN'), ('is', 'NNS'), ('jumping', 'VBG'), ('over', 'NN'), ('the', 'NN'), ('lazy', 'NN'), ('dog', 'NN')]


That output shows that the accuracy has now increased to 24 percent. But can we do
better? We will now train some n-gram taggers. n-grams are contiguous sequences of n
items from a sequence of text or speech. These items could consist of words, phonemes,
letters, characters, or syllables. Shingles are n-grams where the items consist only of
words. We will use n-grams of size 1, 2, and 3, which are also known as unigrams, bigrams,
and trigrams, respectively.

In [10]:
## N-gram taggers
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

# Each tagger is trained on the same training split, without any backoff,
# and is scored and applied to the sample tokens right after it is built.

# unigram
ut = UnigramTagger(train_data)
print(ut.evaluate(test_data))
print(ut.tag(tokens))

# bigram
bt = BigramTagger(train_data)
print(bt.evaluate(test_data))
print(bt.tag(tokens))

# trigram
tt = TrigramTagger(train_data)
print(tt.evaluate(test_data))
print(tt.tag(tokens))

0.8607803272340013
[('The', 'DT'), ('brown', None), ('fox', None), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', None), ('dog', None)]
0.13466937748087907
[('The', 'DT'), ('brown', None), ('fox', None), ('is', None), ('quick', None), ('and', None), ('he', None), ('is', None), ('jumping', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]
0.08064672281924679
[('The', 'DT'), ('brown', None), ('fox', None), ('is', None), ('quick', None), ('and', None), ('he', None), ('is', None), ('jumping', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]


 The preceding output clearly shows that we obtain 86 percent accuracy on the test
set using the UnigramTagger alone, which is really good compared to our last tagger.
The None tag indicates the tagger was unable to tag that word, the reason being that it
never saw that token in the training data. Accuracies of the bigram and trigram
models are far lower because it is not always the case that the same bigrams and trigrams
observed in the training data will also be present in the same way in the testing data.
Moreover, once one token is left untagged, the context for the tokens that follow is unseen
as well, so the failure cascades through the rest of the sentence, as the sample outputs show.
 We will now look at an approach to combine all the taggers by creating a combined
tagger with a list of taggers and use a backoff tagger. Essentially we would create a chain of
taggers, and each tagger would fall back on a backoff tagger if it cannot tag the input tokens:

In [11]:
def combined_tagger(train_data, taggers, backoff=None):
    """Build a chain of taggers in which each one backs off to the previous.

    Args:
        train_data: Training data handed unchanged to every tagger class.
        taggers: Iterable of callables (tagger classes) invoked as
            ``tagger(train_data, backoff=...)``, in the given order.
        backoff: Tagger used as the backoff of the first entry in
            ``taggers``; defaults to ``None``.

    Returns:
        The object built from the last entry of ``taggers``, or ``backoff``
        itself when ``taggers`` is empty.
    """
    chain = backoff
    for tagger_cls in taggers:
        # The chain built so far becomes the fallback of the next tagger.
        chain = tagger_cls(train_data, backoff=chain)
    return chain

# Chain unigram -> bigram -> trigram taggers, with the regex tagger as the
# initial backoff.
ct = combined_tagger(
    train_data=train_data,
    taggers=[UnigramTagger, BigramTagger, TrigramTagger],
    backoff=rt,
)

print(ct.evaluate(test_data))
print(ct.tag(tokens))

0.9094781682641108
[('The', 'DT'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'NN'), ('dog', 'NN')]


 We now obtain an accuracy of 91 percent on the test data, which is excellent. Also, we
see that this new tagger is able to successfully tag all the tokens in our sample sentence
(even though a couple of them are not correct; for example, brown should be an adjective).

In [12]:
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Classifier-based tagger trained on the training split, using
# NaiveBayesClassifier.train as the classifier builder.
nbt = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=NaiveBayesClassifier.train,
)

print(nbt.evaluate(test_data))
print(nbt.tag(tokens))


0.9306806079969019
[('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'VBG')]


Using the preceding tagger, we get an accuracy of 93 percent on our test data—the
highest out of all our taggers. Also, if you observe the output tags for our sample sentence,
you will see that almost all of them are correct and make sense: brown and lazy are now
tagged as adjectives, although dog is mis-tagged as VBG. This gives us an idea of how
powerful and effective classifier-based POS taggers can be.

In [16]:
# Same classifier-based tagger, this time built with MaxentClassifier.train.
met = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=MaxentClassifier.train,
)
print(met.evaluate(test_data))

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.82864        0.007
             2          -0.76176        0.957


  exp_nf_delta = 2 ** nf_delta
  sum1 = numpy.sum(exp_nf_delta * A, axis=0)
  sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)


         Final               nan        0.984
0.9270016458514861


In [17]:
# Tag the sample tokens with the MaxEnt-based tagger.
print(met.tag(tokens))

[('The', 'DT'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'NN'), ('dog', 'NN')]
