# Context Tagger

This example demonstrates how to write a POS tagger with context.

In [1]:
import nltk

In [2]:
sentences = ["What is your address when you're in Singapore?",
            "the president's address on the state of the economy",
            "He addressed his remarks to the laywers in the audience.",
            "In order ot address an assembly, we should be ready",
            "He laughed inwardly at the scene.",
            "After all the advance publicity, the prizefight turned out to be a laugh.",
            "we can learn to laugh a little at even our most serious issues"]


In [3]:
def get_words():
    words = []
    for sent in sentences:
        words.append(nltk.pos_tag(nltk.word_tokenize(sent)))
    return words

In [4]:
def no_context_tagger():
    tagger = nltk.UnigramTagger(get_words())
    return tagger.tag('the little remarks towards assembly are laughable'.split())

In [8]:
def with_context_tagger():
    def word_features(words, word_pos_in_sentence):
        # Extract all the ing etc forms.
        end_features = {
            'last(1)': words[word_pos_in_sentence][-1],
            'last(2)': words[word_pos_in_sentence][-2:],
            'last(3)': words[word_pos_in_sentence][-3:]
        }
        # Use previous word to determine if the current word is verb or noun.
        if word_pos_in_sentence > 1:
            end_features['prev'] = words[word_pos_in_sentence - 1]
        else:
            end_features['prev'] = '|NONE|'
        return end_features
    
    all_sentences = get_words()
    featured_data = []
    for sent in all_sentences:
        untagged_sent = nltk.tag.untag(sent)
        featured_sent = [(word_features(untagged_sent, index), tag)
                         for index, (word, tag) in enumerate(sent)]
        featured_data.extend(featured_sent)
    
    breakup = len(featured_data) // 2
    traindata = featured_data[breakup:]
    testdata = featured_data[:breakup]
    classifier = nltk.NaiveBayesClassifier.train(traindata)
    print('Accuracy of the classifier: {}'.format(nltk.classify.accuracy(classifier, testdata)))

In [9]:
no_context_tagger()

[('the', 'DT'),
 ('little', 'JJ'),
 ('remarks', 'NNS'),
 ('towards', None),
 ('assembly', 'NN'),
 ('are', None),
 ('laughable', None)]

In [10]:
with_context_tagger()

Accuracy of the classifier: 0.39473684210526316
