In [2]:
import pandas as pd
from collections import Counter as ctr
from operator import itemgetter

In [3]:
# Read the training corpus (space-separated columns: word, pos, other),
# lowercase the words, and store each row's preceding tag in `ppos`.
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'pos', 'other'])
train = train.assign(
    word=train['word'].str.lower(),
    # The first row has no predecessor; missing previous tags become 'O'.
    ppos=train['pos'].shift(1).fillna('O'),
)

### Write the functions

$$P(T_i|T_{i-1})$$

$$P(W_i|T_i)$$

In [4]:
# Probability returned by Ptt / Pwt for a (tag, previous tag) or (word, tag)
# pair that never occurred in the training data.
smooth = 0.00001

# Tag inventory in a fixed order. greedy() and viterbi() break ties by position
# in this list, and plain set iteration order for strings can differ between
# interpreter runs, so sort to keep results reproducible after a kernel restart.
# key=str keeps the sort well-defined even if a non-string label slipped in.
pos = sorted(set(train.pos), key=str)

In [5]:
# Transition statistics, keyed by the previous tag:
#   pos_ppos[prev]       -> Counter of the tags observed right after `prev`
#   pos_ppos_denom[prev] -> number of training rows whose previous tag is `prev`
prev_tags = set(train.ppos)
pos_ppos = {
    prev: ctr(train.loc[train.ppos == prev, 'pos'])
    for prev in prev_tags
}
pos_ppos_denom = {
    prev: int((train.ppos == prev).sum())
    for prev in prev_tags
}
    

# conditional 
def Ptt(Ti, Tprev):
    """Transition probability P(Ti | Tprev).

    Returns the relative frequency with which tag `Ti` followed `Tprev` in the
    training data, or `smooth` when that pair was never observed.

    Raises KeyError if `Tprev` never occurred as a previous tag.
    """
    followers = pos_ppos[Tprev]
    if Ti in followers:
        return followers[Ti] / pos_ppos_denom[Tprev]
    return smooth

# Emission statistics, keyed by tag:
#   word_pos[tag]       -> Counter of the words observed with `tag`
#   word_pos_denom[tag] -> number of training rows carrying `tag`
seen_tags = set(train.pos)
word_pos = {
    tag: ctr(train.loc[train.pos == tag, 'word'])
    for tag in seen_tags
}
word_pos_denom = {
    tag: int((train.pos == tag).sum())
    for tag in seen_tags
}
    

# conditional 
def Pwt(W, T):
    """Emission probability P(W | T).

    Returns the relative frequency of word `W` among the training rows tagged
    `T`, or `smooth` when `W` was never observed with that tag.

    Raises KeyError if `T` is not a tag seen in training.
    """
    words_for_tag = word_pos[T]
    if W in words_for_tag:
        return words_for_tag[W] / word_pos_denom[T]
    return smooth

### Evaluate: decode

In [6]:
from math import log2

def greedy(words):
    """Tag `words` left to right, committing to the locally best tag each step.

    For every word, the tag maximizing Ptt(tag, previous tag) * Pwt(word, tag)
    is chosen; the previous tag starts out as 'O'. Ties go to the tag that
    comes first when iterating `pos`.

    Returns a pandas Series with one tag per input word.
    """
    chosen = []
    tag_prev = 'O'
    for word in words:
        def score(tag):
            return Ptt(tag, tag_prev) * Pwt(word, tag)

        tag_prev = max(pos, key=score)
        chosen.append(tag_prev)
    return pd.Series(chosen)

class Node:
    """One cell of the Viterbi trellis.

    tag     -- the tag this cell stands for
    prob    -- score of the best path ending in this cell
    backptr -- the preceding Node on that path, or None for the first column
    """

    def __init__(self, tag, prob, backptr=None):
        self.tag, self.prob, self.backptr = tag, prob, backptr
        
def viterbi(words):
    """Return the highest-scoring tag sequence for `words` under the bigram HMM.

    Scores are sums of log2 probabilities: log2(Pwt(word, tag)) for every word
    plus log2(Ptt(tag, previous tag)) for every word after the first. No
    start-transition term is applied to the first word.

    words -- any iterable of words (list, pandas Series, ...). It is read
             positionally, so a Series with arbitrary index labels is fine.

    Returns a pandas Series with one tag per input word (empty for empty
    input). Ties are resolved in favour of the tag / predecessor that comes
    first when iterating `pos`.
    """
    # Materialize once: `words[0]` on a Series is a label lookup, which breaks
    # (or picks the wrong row) when the index does not start at 0.
    words = list(words)
    if not words:
        return pd.Series([], dtype=object)

    # First trellis column: emission scores only.
    column = [Node(tag, log2(Pwt(words[0], tag))) for tag in pos]

    for word in words[1:]:
        next_column = []
        for tag in pos:
            # Pick the best predecessor first and allocate a single Node for
            # this tag, instead of one Node per (tag, predecessor) pair.
            best_prev = None
            best_score = None
            for prev in column:
                score = log2(Ptt(tag, prev.tag)) + prev.prob
                # Strict '>' keeps the earliest predecessor on ties.
                if best_prev is None or score > best_score:
                    best_prev, best_score = prev, score
            next_column.append(
                Node(tag, best_score + log2(Pwt(word, tag)), best_prev)
            )
        column = next_column

    # Walk the back-pointers from the best final cell to recover the path.
    node = max(column, key=lambda cell: cell.prob)
    tags = []
    while node is not None:
        tags.append(node.tag)
        node = node.backptr
    tags.reverse()
    return pd.Series(tags)

### Evaluate: check accuracy

- load test data
- pass all word sequences through your hmm
- decode
- find the most probable sequence
- compare to the "gold"

In [7]:
# Read the test corpus in the same three-column layout as the training data
# and lowercase the words.
test = pd.read_csv('test.txt', delimiter=' ', names=['word', 'pos', 'other'])
test = test.assign(word=test['word'].str.lower())

In [8]:
# Tag the whole test corpus with the greedy decoder.
hyp = greedy(test['word'])

In [9]:
from sklearn.metrics import accuracy_score

# Share of test tokens whose greedy tag equals the gold tag.
accuracy_score(test['pos'], hyp)

0.9055659919370158

In [10]:
import time

# Decode the test corpus with Viterbi and show the elapsed seconds.
# perf_counter() is the clock meant for measuring intervals; unlike time.time()
# it is not affected by system clock adjustments during the run.
viterbi_start = time.perf_counter()
hyp = viterbi(test.word)
time.perf_counter() - viterbi_start

85.59999513626099

In [11]:
from sklearn.metrics import accuracy_score

# Keep the current predictions next to the gold tags, then print the share of
# tokens tagged correctly.
test['hyp'] = hyp
viterbi_accuracy = accuracy_score(test['pos'], test['hyp'])
print(viterbi_accuracy)

0.9230850412647487


In [19]:
# Error analysis: for every (predicted tag, actual tag) pair with at least one
# mistake, report the share of that predicted tag's guesses it accounts for.
# Rows are actual tags, columns are predicted tags; margins=True adds the
# 'All' totals. rownames/colnames label the axes, so the column Series do not
# need to be renamed.
confusion_mx = pd.crosstab(test['pos'], test['hyp'], rownames=['Actual'], colnames=['Predicted'], margins=True)

# Iterate over the labels actually present in the table rather than a
# hand-edited copy of `pos`: this avoids KeyErrors for tags that were never
# predicted (or never occur in the gold data) and leaves the global `pos`
# untouched for the decoders.
predicted_tags = [tag for tag in confusion_mx.columns if tag != 'All']
actual_tags = [tag for tag in confusion_mx.index if tag != 'All']

# Each entry: (predicted, actual, share of all `predicted` guesses, count).
tuples = [
    (predicted, actual, count / confusion_mx.loc['All', predicted], count)
    for predicted in predicted_tags
    for actual in actual_tags
    if predicted != actual
    for count in [confusion_mx.loc[actual, predicted]]
    if count
]
sorted(tuples, key=lambda x: x[2], reverse=True)

[('RP', 'IN', 0.625, 20),
 ('FW', 'NNP', 0.5, 1),
 ('NNPS', 'NNS', 0.23958333333333334, 23),
 ('PDT', 'DT', 0.18181818181818182, 2),
 ('RP', 'RB', 0.09375, 3),
 ('VBN', 'VBD', 0.07808090310442145, 83),
 ('WDT', 'DT', 0.07009345794392523, 15),
 ('JJ', 'NNP', 0.06651147322913202, 200),
 ('JJS', 'RBS', 0.06493506493506493, 5),
 ('NN', 'NNP', 0.06341181568684097, 439),
 ('VBP', 'VB', 0.06274509803921569, 32),
 ('NNP', 'NN', 0.05535224153705398, 242),
 ('JJR', 'RBR', 0.05365853658536585, 11),
 ('PRP', 'NNP', 0.038680318543799774, 34),
 ('VBD', 'VBN', 0.03726328649969456, 61),
 ('$', 'CD', 0.03631961259079903, 15),
 ('JJ', 'NN', 0.035916195543731294, 108),
 ('WDT', 'IN', 0.03271028037383177, 7),
 ('RBR', 'JJ', 0.03225806451612903, 2),
 ('RBR', 'JJR', 0.03225806451612903, 2),
 ('VB', 'NN', 0.030959752321981424, 40),
 ('NN', 'NNS', 0.027733641484905387, 192),
 ('VB', 'VBP', 0.02631578947368421, 34),
 ('DT', 'JJ', 0.0231935771632471, 104),
 ('DT', 'NNP', 0.0231935771632471, 104),
 ('RB', 'IN', 

The classifier mistook INs (conjunction, subordinating or preposition) for RPs (adverb, particle) 62.5% of the time it guessed RP. This is likely because many of the same words can be either a preposition or a particle (in, on, off, before, etc.). An error rate this high probably means that the test data does not accurately represent the training data for this particular distinction.

Half of the time the classifier guessed FW (foreign word), the word was actually a proper noun. I find the "foreign word" category confusing, because it doesn't communicate how the word is used at all. The classifier is also unable to capture the fact that proper nouns generally start with a capital letter (all the input is lowercased). Since capitalization is the main identifier for proper nouns, it makes sense that they are not classified properly. The classifier only predicted foreign words twice, so there aren't any generalizations I can draw from this.

24% of the words tagged as plural proper nouns (NNPS) are actually plural common nouns (NNS). Given their contextual similarity, this is not surprising.

In [21]:
# Create the OK notebook client from the 'a3.ok' file and authenticate
# (inline=True); the recorded output below shows a successful login.
# NOTE(review): presumably this is the okpy autograder client — confirm.
from client.api.notebook import Notebook
ok = Notebook('a3.ok')
ok.auth(inline=True)

Assignment: A3 HMM
OK, version v1.13.11

Successfully logged in as arjunshukla@u.boisestate.edu


In [None]:
# Submit through the `ok` client created in the authentication cell.
ok.submit()

<IPython.core.display.Javascript object>