In [1]:
import nltk
from nltk import pos_tag, word_tokenize

In [2]:
annotated_sent = nltk.corpus.treebank.tagged_sents()
annotated_sent[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [18]:
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'prefix-1': sentence[index][0],
        'suffix-1': sentence[index][-1],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1]
    }

def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
            
    return X, y

In [19]:
cutoff = int(0.75 * len(annotated_sent))
training_sentences = annotated_sent[:cutoff]
test_sentences = annotated_sent[cutoff:]
X, y = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

## Implementing a classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

In [21]:
size=10000

In [22]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

clf.fit(X[:size], y[:size])
print('training OK')
score = clf.score(X_test, y_test)
print ("Accuracy: " + str(score))

training OK
Accuracy: 0.8790776152980877


### Without pipeline

In [23]:
# v = DictVectorizer(sparse=False)
# Xdv = v.fit_transform(X[:size])

# v2 = DictVectorizer(sparse=False)
# Xdv_test = v.transform(X_test)

# clf = DecisionTreeClassifier(criterion='entropy')

# clf.fit(Xdv[:size], y[:size])

# clf.score(Xdv_test, y_test)

## Using classifier

In [24]:
def pos_tag(sentence):
    print("checking...")
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)

In [26]:
sentence = word_tokenize('Hello world, lets do something awesome today!')
[features(sentence, index) for index in range(len(sentence))]

[{'is_capitalized': True,
  'next_word': 'world',
  'prefix-1': 'H',
  'prev_word': '',
  'suffix-1': 'o',
  'word': 'Hello'},
 {'is_capitalized': False,
  'next_word': ',',
  'prefix-1': 'w',
  'prev_word': 'Hello',
  'suffix-1': 'd',
  'word': 'world'},
 {'is_capitalized': True,
  'next_word': 'lets',
  'prefix-1': ',',
  'prev_word': 'world',
  'suffix-1': ',',
  'word': ','},
 {'is_capitalized': False,
  'next_word': 'do',
  'prefix-1': 'l',
  'prev_word': ',',
  'suffix-1': 's',
  'word': 'lets'},
 {'is_capitalized': False,
  'next_word': 'something',
  'prefix-1': 'd',
  'prev_word': 'lets',
  'suffix-1': 'o',
  'word': 'do'},
 {'is_capitalized': False,
  'next_word': 'awesome',
  'prefix-1': 's',
  'prev_word': 'do',
  'suffix-1': 'g',
  'word': 'something'},
 {'is_capitalized': False,
  'next_word': 'today',
  'prefix-1': 'a',
  'prev_word': 'something',
  'suffix-1': 'e',
  'word': 'awesome'},
 {'is_capitalized': False,
  'next_word': '!',
  'prefix-1': 't',
  'prev_word': 'aw

In [27]:
print(list(pos_tag(word_tokenize('Hello world, lets do something awesome today!'))))

checking...
[('Hello', 'NN'), ('world', 'NN'), (',', ','), ('lets', 'NNS'), ('do', 'VB'), ('something', 'VBG'), ('awesome', 'NN'), ('today', 'NN'), ('!', 'NNP')]


## Rule based POS taggers

In [28]:
from nltk.corpus import brown
from nltk import DefaultTagger as df
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
from nltk import TrigramTagger as tg

In [47]:
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

size = int(len(brown_tagged_sents) * 0.9)

train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

### Default Tagger

In [41]:
def_model = nltk.DefaultTagger('NN')
print(def_model.evaluate(train_sents))
print(def_model.evaluate(test_sents))

0.131405972094873
0.1262832652247583


### Unigram Tagger

In [42]:
uni_model = nltk.UnigramTagger(train_sents)
print(uni_model.evaluate(train_sents))
print(uni_model.evaluate(test_sents))

0.9353630649241612
0.8121200039868434


### Bigram Tagger

In [43]:
bi_model = nltk.BigramTagger(train_sents)
print(bi_model.evaluate(train_sents))
print(bi_model.evaluate(test_sents))

0.7884137382485832
0.10206319146815508


### TrigramTagger

In [44]:
tri_model = nltk.TrigramTagger(train_sents)
print(tri_model.evaluate(train_sents))
print(tri_model.evaluate(test_sents))

0.8216104550325339
0.0626931127279976


### RegExp Tagger

In [49]:
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'),
            (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]

# patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), 
#             (r'.*es$', 'VBZ'), (r'.*ould$', 'MD'), 
#             (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'), 
#             (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]

regexp_model = nltk.RegexpTagger(patterns)
print(regexp_model.evaluate(train_sents))
print(regexp_model.evaluate(test_sents))

0.20310204261994455
0.2047244094488189
