In [74]:
import pandas as pd
from collections import Counter as ctr
from operator import itemgetter

In [75]:
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'pos', 'other'])
train['word'] = train.word.str.lower()

In [76]:
train[:3]

Unnamed: 0,word,pos,other
0,confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP


In [77]:
train['ppos'] = train.pos.shift(1)
train['ppos'] = train.ppos.fillna('O')

train[:3]

Unnamed: 0,word,pos,other,ppos
0,confidence,NN,B-NP,O
1,in,IN,B-PP,NN
2,the,DT,B-NP,IN


### Write the functions

$$P(T_i|T_{i-1})$$

$$P(W_i|T_i)$$

In [78]:
pos = list(set(train.pos))
smooth = 0.00001

pos

['NN',
 '$',
 ',',
 'VBN',
 '#',
 'VBZ',
 'RBR',
 'UH',
 'RBS',
 'PDT',
 'PRP',
 'CC',
 'RP',
 'CD',
 'NNP',
 ':',
 'VBD',
 '(',
 'JJR',
 'SYM',
 'POS',
 '``',
 'NNS',
 'VBG',
 'IN',
 'WDT',
 'WRB',
 'WP',
 'JJ',
 'JJS',
 'TO',
 'FW',
 'PRP$',
 '.',
 'MD',
 'NNPS',
 'DT',
 'VBP',
 'RB',
 'WP$',
 ')',
 'EX',
 'VB',
 "''"]

In [79]:
pos_ppos = {}
pos_ppos_denom = {}
for col_name in set(train.ppos):
    sub_frame = train[train.ppos == col_name]
    counted = ctr(sub_frame.pos)
    pos_ppos[col_name] = counted
    pos_ppos_denom[col_name] = len(sub_frame)
    

# conditional 
def Ptt(Ti,Tprev):
    if Ti not in pos_ppos[Tprev]: return smooth
    return pos_ppos[Tprev][Ti] / pos_ppos_denom[Tprev]

word_pos = {}
word_pos_denom = {}
for col_name in set(train.pos):
    sub_frame = train[train.pos == col_name]
    counted = ctr(sub_frame.word)
    word_pos[col_name] = counted
    word_pos_denom[col_name] = len(sub_frame)
    

# conditional 
def Pwt(W, T):
    if W not in word_pos[T]: return smooth
    return word_pos[T][W] / word_pos_denom[T]

In [80]:
Pwt('the','DT')

0.583419689119171

### Evaluate: decode

In [81]:
def greedy(words):
    hyp = []
    tag_prev = 'O'
    for word in words:
        t = max([(tag, Ptt(tag, tag_prev)*Pwt(word, tag)) for tag in pos], key=lambda x: x[1])[0]
        hyp.append(t)
        tag_prev = t
    return pd.Series(hyp)

class Node:
    def __init__(self, tag, prob, backptr=None):
        self.tag = tag
        self.prob = prob
        self.backptr = backptr
        
def viterbi(words):
    array = [Node(tag, Pwt(words[0], tag)) for tag in pos]
    for word in words[1:]:
        col = []
        for tag in pos:
            col.append(max([Node(tag, Ptt(tag, prev.tag)*prev.prob, prev) for prev in array], key=lambda x: x.prob))
        for t in col: 
            t.prob *= Pwt(word, t.tag)
        array = col
    last = max(array, key=lambda x: x.prob)
    res = []
    while last: 
        res.append(last.tag)
        last = last.backptr
    res.reverse()
    return pd.Series(res)

### Evaluate: check accuracy

- load test data
- pass all word sequences through your hmm
- decode
- find the most probable sequence
- compare to the "gold"

In [82]:
test = pd.read_csv('test.txt', delimiter=' ', names=['word', 'pos', 'other'])
test['word'] = test.word.str.lower()

In [83]:
test.describe()

Unnamed: 0,word,pos,other
count,47377,47377,47377
unique,7495,43,19
top,the,NN,I-NP
freq,2407,6642,14376


In [84]:
test[:3]

Unnamed: 0,word,pos,other
0,rockwell,NNP,B-NP
1,international,NNP,I-NP
2,corp.,NNP,I-NP


In [85]:
hyp = greedy(test.word)

In [86]:
from sklearn.metrics import accuracy_score

accuracy_score(test.pos, hyp)

0.9055870992253625

In [87]:
import time
t = time.time()
hyp = viterbi(test.word)
time.time() - t

76.91627168655396

In [71]:
from sklearn.metrics import accuracy_score

test['hyp'] = hyp
print(accuracy_score(test.pos[:80], test.hyp[:80]))
test[:80]

0.8375


Unnamed: 0,word,pos,other,hyp
0,rockwell,NNP,B-NP,FW
1,international,NNP,I-NP,NNP
2,corp.,NNP,I-NP,NNP
3,'s,POS,B-NP,POS
4,tulsa,NNP,I-NP,JJ
5,unit,NN,I-NP,NN
6,said,VBD,B-VP,VBD
7,it,PRP,B-NP,PRP
8,signed,VBD,B-VP,VBD
9,a,DT,B-NP,DT


In [90]:
class foo:
    def __init__(self, string, num):
        self.string = string
        self.num = num
        
f = foo("hello", 1)
g = foo("world", 1)
max([g,f], key=lambda x: x.num).string

'world'