In [16]:
from nltk.corpus import treebank,brown
import math
import scipy
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

In [2]:
corpus = brown.tagged_sents(tagset='universal')
train_data = corpus[:-100] 
test_data = corpus[-100:]

print(len(train_data))
print(len(test_data))

57240
100


In [3]:
#HMM

class HMM():
    def __init__(self,laplace_constant=0.5):
        self.P = {}
        self.O = {}
        self.S = {}
        self.laplace_constant = 0.5

    def train(self,train_data):
        
        self.tag_dict={}
        self.word_dict={}

        for sent in train_data:
            for elem in sent:
                w = elem[0]
                tag= elem[1]

                if w not in self.word_dict:
                    self.word_dict[w]=0

                if tag not in self.tag_dict:
                    self.tag_dict[tag]=0

                self.word_dict[w]+=1
                self.tag_dict[tag]+=1
   
        for sent in train_data:
            for i in range(len(sent)-1):
                word = sent[i]
                if i == 0:
                    if word[1] not in self.S:
                        self.S[word[1]] = 1
                    else:
                        self.S[word[1]] += 1
                word2 = sent[i+1]
                if (word[1],word2[1]) not in self.P:
                    self.P[(word[1],word2[1])] = 1
                else:
                    self.P[(word[1],word2[1])] += 1
                if (word[1],word[0]) not in self.O:
                    self.O[(word[1],word[0])] = 1
                else:
                    self.O[(word[1],word[0])] += 1
                    
        for (s,s_1) in self.P:
            self.P[(s,s_1)] /= float(self.tag_dict[s])

        for (s,v) in self.O:
            self.O[(s,v)] /= float(self.tag_dict[s])

        for s in self.S:
            self.S[s] /= float(self.tag_dict[s])
            
            
    def transition(self,s1,s2):
        if (s1,s2) not in self.P:
            return self.laplace_constant/(self.laplace_constant*len(self.tag_dict) + self.queryTag(s1))
        else:
            return self.P[(s1,s2)]
    
    def emission(self,s1,v1):
        if (s1,v1) not in self.O:
            return self.laplace_constant/(self.laplace_constant*len(self.tag_dict) + self.queryTag(s1))
        else:
            return self.O[(s1,v1)]
        
    def start(self,s):
        if s not in self.S:
            return 1.0/len(self.tag_dict)
        else:
            return self.S[s]
        
    def getTags(self):
        return self.tag_dict
    
    def getWords(self):
        return self.word_dict
    
    def queryTag(self,s):
        if s not in self.tag_dict:
            return 0
        else:
            return self.tag_dict[s]

In [4]:
model = HMM()
model.train(train_data)

In [5]:
#print(P)

In [6]:
#print(O)

In [7]:
#print(S)
tag_set = model.getTags()
for t in tag_set:
    print(t)

DET
NOUN
ADJ
VERB
ADP
.
ADV
CONJ
PRT
PRON
NUM
X


In [None]:
tag_set = model.getTags()

for sent in test_data:
    V = {}
    V_1 = {}
    
    maxi = 0
    decisions = []

    for t in tag_set:
        V[t] = math.log(model.start(t))

    for wordT in sent:        
        word = wordT[0]
        decision = {}
        for t in tag_set:
            maxi = None
            for t2 in tag_set:
                value = V[t2] + math.log(model.transition(t2,t)) + math.log(model.emission(t2,word))
                if maxi == None:
                    maxi = value
                    decision[t] = t2
                if value > maxi:
                    maxi = value
                    decision[t] = t2
            V_1[t] = maxi
            
        decisions.append(decision)         
        V = V_1.copy()
        #print(V)
    #print( sent)
    
    maxi = None
    dec = None
    
    #print (decisions)
    for t in tag_set:
        if maxi == None:
            maxi = V[t]
            decision[t] = t
            dec = t
        if V[t] > maxi:
            maxi = V[t]
            dec = t

    bestSeq = []
    bestSeq.append(dec)
    
    for i in range(len(decisions)-1,-1,-1):
        dec = decisions[i][dec]
        bestSeq.append(dec)
        
    print(sent)
    bestSeq.reverse()
    print(bestSeq)
    
    

In [9]:

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

def word2features(sent,i):
    word = sent[i][0]
    
    features ={
    'bias': 1.0,
    'word.lower()': word.lower(),
    'word[-3:]': word[-3:], # last 2 letters for checking verb forms and adjectives
    'word[-2:]': word[-2:], # last 2 letters for checking verb forms and adjectives
    'word.isupper()': word.isupper(), # Acronyms
    'word.istitle()': word.istitle(), # Propernouns
    'word.isdigit()': word.isdigit(), # Numbers
    }
                
    return features

def sent2features(sent):
    return [word2features(sent,i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for i,label in sent]



In [10]:
X_train=[sent2features(s) for s in train_data]
y_train=[sent2labels(s) for s in train_data]

X_test=[sent2features(s) for s in test_data]
y_test=[sent2labels(s) for s in test_data]



In [11]:

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 

    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [12]:
y_pred = crf.predict(X_test)
labels=list(crf.classes_)

metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)


0.9337113931601664

In [13]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

          .      1.000     1.000     1.000       334
          X      0.000     0.000     0.000        17
        ADJ      0.820     0.750     0.784       140
        ADP      0.958     0.972     0.965       283
        ADV      0.877     0.806     0.840       124
       VERB      0.947     0.924     0.936       370
        DET      0.993     0.993     0.993       295
       CONJ      1.000     1.000     1.000        84
       NOUN      0.872     0.961     0.914       483
       PRON      0.981     0.956     0.968       160
        PRT      0.901     0.914     0.908        70
        NUM      0.947     0.857     0.900        21

avg / total      0.931     0.937     0.934      2381



  'precision', 'predicted', average, warn_for)


In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))