In [93]:
import pandas as pd
from collections import Counter as ctr
from operator import itemgetter

In [38]:
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'pos', 'other'])
train['word'] = train.word.str.lower()

In [39]:
train[:3]

Unnamed: 0,word,pos,other
0,confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP


In [41]:
train['ppos'] = train.pos.shift(1)
train['ppos'] = train.ppos.fillna('O')

train[:3]

Unnamed: 0,word,pos,other,ppos
0,confidence,NN,B-NP,O
1,in,IN,B-PP,NN
2,the,DT,B-NP,IN


### Write the functions

$$P(T_i|T_{i-1})$$

$$P(W_i|T_i)$$

In [74]:
pos = list(set(train.pos))
smooth = 0.00001

pos

['NN',
 'NNPS',
 'RBR',
 'EX',
 'WP$',
 'MD',
 'JJR',
 'CD',
 ':',
 'PDT',
 'FW',
 'UH',
 "''",
 'TO',
 'JJS',
 'PRP',
 '$',
 'SYM',
 '#',
 ',',
 'DT',
 'RBS',
 'WP',
 'VBN',
 'VBG',
 'VBD',
 'WDT',
 'JJ',
 'IN',
 'VBZ',
 'NNS',
 '(',
 'VB',
 'PRP$',
 'POS',
 '.',
 'NNP',
 ')',
 'RP',
 '``',
 'CC',
 'WRB',
 'VBP',
 'RB']

In [100]:
pos_ppos = {}
pos_ppos_denom = {}
for col_name in set(train.ppos):
    sub_frame = train[train.ppos == col_name]
    counted = ctr(sub_frame.pos)
    pos_ppos[col_name] = counted
    pos_ppos_denom[col_name] = len(sub_frame)
    

# conditional 
def Ptt(Ti,Tprev):
    if Ti not in pos_ppos[Tprev]: return smooth
    return pos_ppos[Tprev][Ti] / pos_ppos_denom[Tprev]

word_pos = {}
word_pos_denom = {}
for col_name in set(train.pos):
    sub_frame = train[train.pos == col_name]
    counted = ctr(sub_frame.word)
    word_pos[col_name] = counted
    word_pos_denom[col_name] = len(sub_frame)
    

# conditional 
def Pwt(W, T):
    if W not in word_pos[T]: return smooth
    return word_pos[T][W] / word_pos_denom[T]

In [78]:
Pwt('the','DT')

0.583419689119171

### Evaluate: build the trellis

- example trellis with an example sequence

In [101]:
sequence = ['the', 'government']

In [102]:
emissions = []
transitions = []
for word in sequence:
    emission = [(t,Pwt(word, t)) for t in pos]
    transition = [(tprev,Ptt(ti,tprev)) for ti in pos for tprev,prob in emission]
    emissions.append(emission)
    transitions.append(transition)

### Evaluate: decode

In [103]:
hyp = []

for e,t in zip(emissions, transitions):
    # argmax of e
    e = sorted(e, key = itemgetter(1))[-1]
    hyp.append(e)

### Evaluate: check accuracy

- load test data
- pass all word sequences through your hmm
- decode
- find the most probable sequence
- compare to the "gold"

In [104]:
hyp

[('DT', 0.583419689119171), ('NN', 0.005738547782532259)]