In [1]:
import pandas as pd
from collections import Counter as ctr
from operator import itemgetter

In [2]:
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'pos', 'other'])
train['word'] = train.word.str.lower()

In [3]:
train[:10]

Unnamed: 0,word,pos,other
0,confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP
5,widely,RB,I-VP
6,expected,VBN,I-VP
7,to,TO,I-VP
8,take,VB,I-VP
9,another,DT,B-NP


In [4]:
#ppos is previous part of speach
train['ppos'] = train.pos.shift(1)
train['ppos'] = train.ppos.fillna('O')

train[:10]

Unnamed: 0,word,pos,other,ppos
0,confidence,NN,B-NP,O
1,in,IN,B-PP,NN
2,the,DT,B-NP,IN
3,pound,NN,I-NP,DT
4,is,VBZ,B-VP,NN
5,widely,RB,I-VP,VBZ
6,expected,VBN,I-VP,RB
7,to,TO,I-VP,VBN
8,take,VB,I-VP,TO
9,another,DT,B-NP,VB


### Write the functions

$$P(T_i|T_{i-1})$$

$$P(W_i|T_i)$$

In [5]:
pos = list(set(train.pos))
smooth = 0.00001

In [6]:
pos_ppos = {}
pos_ppos_denom = {}
for col_name in set(train.ppos):
    sub_frame = train[train.ppos == col_name]
    counted = ctr(sub_frame.pos)
    pos_ppos[col_name] = counted
    pos_ppos_denom[col_name] = len(sub_frame)

word_pos = {}
word_pos_denom = {}
for col_name in set(train.pos):
    sub_frame = train[train.pos == col_name]
    counted = ctr(sub_frame.word)
    word_pos[col_name] = counted
    word_pos_denom[col_name] = len(sub_frame)
    

# the two equations

# conditional 
def Ptt(Ti,Tprev):
    if Ti not in pos_ppos[Tprev]: return smooth
    return pos_ppos[Tprev][Ti] / pos_ppos_denom[Tprev]
    
# conditional 
def Pwt(W, T):
    if W not in word_pos[T]: return smooth
    return word_pos[T][W] / word_pos_denom[T]

In [7]:
Pwt('in','IN')

0.16627130557019856

### Evaluate: build the trellis

- example trellis with an example sequence

In [8]:
sequence = ['the', 'government']

In [9]:
emissions = []
transitions = []
for word in sequence:
    emission = [(t,Pwt(word, t)) for t in pos]
    transition = [(tprev,Ptt(ti,tprev)) for ti in pos for tprev,prob in emission]
    emissions.append(emission)
    transitions.append(transition)
    

### Evaluate: decode

In [10]:
hyp = []

for e,t in zip(emissions, transitions):
    # argmax of e
    e = sorted(e, key = itemgetter(1))[-1]
    hyp.append(e)
    print(hyp)

[('DT', 0.583419689119171)]
[('DT', 0.583419689119171), ('NN', 0.005738547782532259)]


In [11]:
def emission(word = ''):
    return [(t, Pwt(word,t)) for t in pos]

def transition(ct):
    return [[t, Ptt(t, ct)] for t in pos]


def evaluate(word1, remaining):
    #start
    list = emission(word1)
    max(list)
    sent_tags=[]
    tag = max(list,key=itemgetter(1))[0]    #faster solution
    sent_tags.append(tag)
    i = 1
    #loop this
    while len(remaining) > i:
        tlist = transition(tag)
        next_word = remaining[i]
        i+=1
        elist = emission(next_word)
        clist=[(t[0], t[1]*e[1]) for t,e in zip(tlist, elist)] 
        tag = max(clist, key=itemgetter(1))[0]
        sent_tags.append(tag)
    return sent_tags

### Evaluate: check accuracy

- load test data
- pass all word sequences through your hmm
- decode
- find the most probable sequence
- compare to the "gold"

In [12]:
hyp

[('DT', 0.583419689119171), ('NN', 0.005738547782532259)]

In [13]:
test = pd.read_csv('test.txt', delimiter=' ', names=['word', 'pos', 'other'])
test['word'] = test.word.str.lower()

In [14]:
test.describe()
test

Unnamed: 0,word,pos,other
0,rockwell,NNP,B-NP
1,international,NNP,I-NP
2,corp.,NNP,I-NP
3,'s,POS,B-NP
4,tulsa,NNP,I-NP
5,unit,NN,I-NP
6,said,VBD,B-VP
7,it,PRP,B-NP
8,signed,VBD,B-VP
9,a,DT,B-NP


In [15]:
test[:3]

Unnamed: 0,word,pos,other
0,rockwell,NNP,B-NP
1,international,NNP,I-NP
2,corp.,NNP,I-NP


In [16]:
test['hyp'] = evaluate(test.word[0], test.word,)
test['hyp']

0          ``
1         NNP
2         NNP
3         POS
4         NNP
5          NN
6         VBD
7         PRP
8         VBD
9          DT
10         JJ
11         NN
12         IN
13       PRP$
14         NN
15         IN
16        NNP
17        NNP
18         TO
19         VB
20         JJ
21        NNS
22         IN
23        NNP
24        POS
25         CD
26         NN
27          .
28         DT
29        VBD
         ... 
47347     NNS
47348       .
47349      IN
47350     NNP
47351     NNP
47352       ,
47353      IN
47354      NN
47355       ,
47356     NNP
47357     VBZ
47358     VBN
47359      DT
47360      JJ
47361      NN
47362      NN
47363      IN
47364      DT
47365     VBZ
47366      NN
47367     VBZ
47368     VBN
47369     JJR
47370      IN
47371       ,
47372     VBG
47373      TO
47374     NNP
47375     NNP
47376       .
Name: hyp, Length: 47377, dtype: object

In [17]:
test[:10]

Unnamed: 0,word,pos,other,hyp
0,rockwell,NNP,B-NP,``
1,international,NNP,I-NP,NNP
2,corp.,NNP,I-NP,NNP
3,'s,POS,B-NP,POS
4,tulsa,NNP,I-NP,NNP
5,unit,NN,I-NP,NN
6,said,VBD,B-VP,VBD
7,it,PRP,B-NP,PRP
8,signed,VBD,B-VP,VBD
9,a,DT,B-NP,DT


In [18]:
from sklearn.metrics import accuracy_score

accuracy_score(test.pos, test.hyp)

0.9055659919370158

In [20]:
from sklearn.metrics import confusion_matrix
act = test.word
hyp = test.hyp
confusion_matrix(act, hyp)

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,  11,   0, ...,   0,   0,   0],
       [  0,   0, 381, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])