In [133]:
import csv
import json

import numpy as np
import pandas as pd

In [22]:
### Read in data:
def _read_tsv(path, columns):
    """Load a headerless, tab-separated token file and name its columns.

    Tokens are read verbatim:
      * ``na_filter=False`` stops pandas from turning tokens such as
        ``null`` or ``NA`` into NaN.
      * ``quoting=csv.QUOTE_NONE`` stops a literal double-quote token from
        being interpreted as the start of a quoted field.

    Args:
        path: location of the tab-separated file.
        columns: column names to assign, one per field.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        na_filter=False,
    )
    # Assigning (rather than passing names=) still raises if the field count
    # does not match the expected schema.
    frame.columns = columns
    return frame


tr_headers = ["index", "word", "pos_tag"]
train_df = _read_tsv("./data/train", tr_headers)

dev_df = _read_tsv("./data/dev", tr_headers)

test_headers = ["index", "word"]
test_df = _read_tsv("./data/test", test_headers)

### 1. Vocabulary Creation

In [41]:
train_df

Unnamed: 0,index,word,pos_tag
0,1,Pierre,NNP
1,2,Vinken,NNP
2,3,",",","
3,4,61,CD
4,5,years,NNS
...,...,...,...
912090,22,to,TO
912091,23,San,NNP
912092,24,Francisco,NNP
912093,25,instead,RB


In [53]:
# Get the count of each word:
#word-type = word
# Walk the "word" column directly instead of `iterrows()`: iterrows builds a
# full Series for every row, which is very slow on a corpus of this size.
# Keys are still inserted in order of first appearance.
cnt_d = {}
for word in train_df["word"].values:
    cnt_d[word] = cnt_d.get(word, 0) + 1

In [111]:
# Fold every rare word into a single unknown key:
threshold = 3
# Words seen fewer than `threshold` times; kept so later cells can map them
# onto the shared "< unk >" token.
unknown_word_lst = [word for word, count in cnt_d.items() if count < threshold]
# Total number of occurrences that the unknown token stands in for.
unknown_cnt = sum(cnt_d[word] for word in unknown_word_lst)
cnt_d["< unk >"] = unknown_cnt

In [55]:
# Order the word counts from most to least frequent. The sort is stable, so
# words with equal counts keep their relative order from `cnt_d`.
cnt_d_sorted = dict(sorted(cnt_d.items(), key=lambda pair: pair[1], reverse=True))

In [56]:
cnt_d_sorted

{',': 46476,
 'the': 39533,
 '.': 37452,
 '< unk >': 32537,
 'of': 22104,
 'to': 21305,
 'a': 18469,
 'and': 15346,
 'in': 14609,
 "'s": 8872,
 'for': 7743,
 'that': 7723,
 '$': 6762,
 'is': 6735,
 '``': 6673,
 'The': 6578,
 "''": 6500,
 'said': 5418,
 'on': 4905,
 '%': 4718,
 'it': 4509,
 'by': 4274,
 'from': 4238,
 'at': 4142,
 'million': 4122,
 'as': 4054,
 'with': 3987,
 'Mr.': 3856,
 'are': 3629,
 'was': 3615,
 'be': 3584,
 'its': 3382,
 'has': 3184,
 "n't": 3132,
 'an': 3015,
 'have': 2993,
 'will': 2982,
 'he': 2496,
 'company': 2429,
 'or': 2429,
 'which': 2164,
 'year': 2162,
 'would': 2108,
 '--': 2005,
 'about': 1991,
 'says': 1948,
 'they': 1904,
 'this': 1844,
 'more': 1842,
 'were': 1776,
 'market': 1760,
 'In': 1706,
 'billion': 1680,
 'But': 1668,
 'their': 1655,
 'up': 1625,
 'had': 1625,
 'than': 1586,
 'U.S.': 1552,
 'but': 1552,
 'his': 1547,
 'who': 1509,
 'been': 1500,
 'also': 1399,
 'new': 1384,
 'share': 1358,
 'one': 1347,
 'other': 1344,
 ':': 1292,
 'not': 1

In [64]:
### Write the vocab to vocab.txt
#punctuation and numbers also count as being part of vocabulary
i = 0
with open('vocab.txt', 'w') as f:
    f.write("< unk >")
    f.write("/t")
    f.write(str(i))
    f.write("/t")
    f.write(str(cnt_d_sorted["< unk >"]))
    f.write("/n")
    i+=1
    for k, v in cnt_d_sorted.items():
        if k == "< unk >":
            continue
        elif v >= threshold:
            f.write(k)
            f.write("/t")
            f.write(str(i))
            f.write("/t")
            f.write(str(v))
            f.write("/n")
            i+=1
        

In [50]:
print("Selected threshold:", threshold)

Selected threshold: 3


In [63]:
print("Total Size of Vocab:", i)

Total Size of Vocab: 16920


In [62]:
print("Total occurences of < unk >:", cnt_d_sorted["< unk >"])

Total occurences of < unk >: 32537


In [89]:
#Read in the vocab file:
vocab_d = {} #want just 0,word1... 90,word90 mapping
vocab_file = open("vocab.txt", "r").read().splitlines()


for line in vocab_file:
    line_split = line.split("/n")
    for actual_line in line_split:
        actual_line_split = actual_line.split("/t")
        if len(actual_line_split) == 1:
            break
        vocab_d[int(actual_line_split[1])] = actual_line_split[0]

In [91]:
len(vocab_d)

16920

### 2. Model Learning

In [95]:
train_df.head()

Unnamed: 0,index,word,pos_tag
0,1,Pierre,NNP
1,2,Vinken,NNP
2,3,",",","
3,4,61,CD
4,5,years,NNS


In [140]:
### Transition counts: pos_tag_d[tag][next_tag] = number of times `next_tag`
### directly follows `tag` in the training tag column.
# NOTE(review): the tag column is one flat array, so the last tag of a sentence
# is also paired with the first tag of the following sentence.
pos_tag_d = {}
all_pos_tags = train_df["pos_tag"].values
# Pair every tag with its successor; the final tag has no successor.
for current_tag, following_tag in zip(all_pos_tags[:-1], all_pos_tags[1:]):
    successors = pos_tag_d.setdefault(current_tag, {})
    successors[following_tag] = successors.get(following_tag, 0) + 1

In [141]:
# Total number of outgoing transitions recorded for each tag:
pos_total_counts_d = {
    tag: sum(successors.values())
    for tag, successors in pos_tag_d.items()
}

In [142]:
# Create transition prob: transition_d[tag][next_tag] = count / total for `tag`
transition_d = {
    tag: {
        following_tag: count / pos_total_counts_d[tag]
        for following_tag, count in successors.items()
    }
    for tag, successors in pos_tag_d.items()
}

In [146]:
# Flatten the nested transition table into "tag, next_tag" -> probability:
transition_d_formatted = {
    f"{tag}, {following_tag}": prob
    for tag, successors in transition_d.items()
    for following_tag, prob in successors.items()
}

In [147]:
transition_d_formatted

{'NNP, NNP': 0.3788695096338234,
 'NNP, ,': 0.13846908958086018,
 'NNP, CD': 0.019176330928682313,
 'NNP, VBZ': 0.0391973335768423,
 'NNP, VBG': 0.0017806593005205004,
 'NNP, NN': 0.057665966578394665,
 'NNP, WDT': 0.000981645511825404,
 'NNP, NNS': 0.024438407451374305,
 'NNP, IN': 0.04104647977353666,
 'NNP, CC': 0.041149210117797465,
 'NNP, POS': 0.05481234590448361,
 'NNP, .': 0.05458405625057072,
 'NNP, VBD': 0.06480001826317232,
 'NNP, MD': 0.01116336407634006,
 'NNP, TO': 0.007533558579125194,
 'NNP, VBP': 0.004303259976257876,
 'NNP, :': 0.007042735823212492,
 'NNP, RB': 0.009257145466167474,
 'NNP, JJ': 0.008549447539037532,
 "NNP, ''": 0.0025568441238243084,
 'NNP, NNPS': 0.0167222171491188,
 'NNP, DT': 0.002545429641128664,
 'NNP, JJR': 0.00011414482695644234,
 'NNP, PRP': 0.0007647703406081636,
 'NNP, VBN': 0.0008104282713907406,
 'NNP, -RRB-': 0.0036298054972148663,
 'NNP, -LRB-': 0.0032987854990411836,
 'NNP, $': 0.0002853620673911058,
 'NNP, WP': 0.0006163820655647886,
 

In [123]:
### Emission counts: pos_to_word_d[tag][word] = number of times `word` was
### tagged `tag`. Rare words are counted under the shared "< unk >" token.
pos_to_word_d = {}
all_pos_tags = train_df["pos_tag"].values
all_words = train_df["word"].values
# Set membership is O(1); testing against the list rescanned it per token.
unknown_words = set(unknown_word_lst)
for pos_tag, word in zip(all_pos_tags, all_words):
    # Map to "< unk >" before counting so the rule also applies to the first
    # token seen for a tag (that case previously stored the raw rare word).
    token = "< unk >" if word in unknown_words else word
    tag_counts = pos_to_word_d.setdefault(pos_tag, {})
    tag_counts[token] = tag_counts.get(token, 0) + 1

In [124]:
pos_to_word_d

{'NNP': {'Pierre': 6,
  '< unk >': 8154,
  'Nov.': 234,
  'Mr.': 3856,
  'N.V.': 13,
  'Dutch': 8,
  'Rudolph': 8,
  'Agnew': 3,
  'Consolidated': 14,
  'Gold': 13,
  'Fields': 3,
  'PLC': 105,
  'Kent': 11,
  'Lorillard': 4,
  'Inc.': 1011,
  'Loews': 3,
  'Corp.': 1157,
  'New': 1152,
  'England': 62,
  'Journal': 100,
  'Medicine': 9,
  'James': 178,
  'A.': 66,
  'Talcott': 4,
  'Boston': 152,
  'Cancer': 13,
  'Institute': 100,
  'Dr.': 99,
  'National': 307,
  'Harvard': 23,
  'University': 136,
  'West': 175,
  'Mass.': 55,
  'Co.': 854,
  'U.S.': 1549,
  'Brooke': 4,
  'T.': 38,
  'Vermont': 14,
  'College': 15,
  'July': 161,
  'Environmental': 18,
  'Protection': 17,
  'Agency': 34,
  'Phillips': 51,
  'Money': 20,
  'Fund': 79,
  'Report': 45,
  'Tuesday': 199,
  'August': 215,
  'Donoghue': 7,
  'Brenda': 3,
  'Malizia': 3,
  'Negus': 4,
  'Treasury': 291,
  'Monday': 204,
  'Dreyfus': 11,
  'Dollar': 8,
  'J.P.': 5,
  'W.R.': 5,
  'Grace': 16,
  'D.': 145,
  'Energy': 46,


In [127]:
# Create emission prob: emission_d[tag][word] = count(tag, word) / count(tag)
# The denominator is the tag's own emission total. The outgoing-transition
# totals in `pos_total_counts_d` leave out the last token of the corpus (it has
# no successor) and would raise KeyError for a tag seen only at that position.
emission_d = {}
for k, v in pos_to_word_d.items():
    tag_total = sum(v.values())
    emission_d[k] = {k_inner: v_inner / tag_total for k_inner, v_inner in v.items()}

In [148]:
# Flatten the nested emission table into "tag, word" -> probability:
emission_d_formatted = {
    f"{tag}, {word}": prob
    for tag, word_probs in emission_d.items()
    for word, prob in word_probs.items()
}

In [150]:
# Bundle both flattened tables under the keys used in the saved model file:
e_t_results_d = {
    "transition": transition_d_formatted,
    "emission": emission_d_formatted,
}

In [151]:
# Serialise the emission/transition tables to hmm.json:
with open('hmm.json', 'w') as out_file:
    out_file.write(json.dumps(e_t_results_d))

In [153]:
print("# of Transition Parameters:", len(transition_d_formatted))

# of Transition Parameters: 1378


In [154]:
print("# of Emission Parameters:", len(emission_d_formatted))

# of Emission Parameters: 23374


In [159]:
# Load in Emission/Transition json:
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open for the rest of the kernel session.
with open("hmm.json", "r") as hmm:
    e_t_model = json.load(hmm)

{'transition': {'NNP, NNP': 0.3788695096338234,
  'NNP, ,': 0.13846908958086018,
  'NNP, CD': 0.019176330928682313,
  'NNP, VBZ': 0.0391973335768423,
  'NNP, VBG': 0.0017806593005205004,
  'NNP, NN': 0.057665966578394665,
  'NNP, WDT': 0.000981645511825404,
  'NNP, NNS': 0.024438407451374305,
  'NNP, IN': 0.04104647977353666,
  'NNP, CC': 0.041149210117797465,
  'NNP, POS': 0.05481234590448361,
  'NNP, .': 0.05458405625057072,
  'NNP, VBD': 0.06480001826317232,
  'NNP, MD': 0.01116336407634006,
  'NNP, TO': 0.007533558579125194,
  'NNP, VBP': 0.004303259976257876,
  'NNP, :': 0.007042735823212492,
  'NNP, RB': 0.009257145466167474,
  'NNP, JJ': 0.008549447539037532,
  "NNP, ''": 0.0025568441238243084,
  'NNP, NNPS': 0.0167222171491188,
  'NNP, DT': 0.002545429641128664,
  'NNP, JJR': 0.00011414482695644234,
  'NNP, PRP': 0.0007647703406081636,
  'NNP, VBN': 0.0008104282713907406,
  'NNP, -RRB-': 0.0036298054972148663,
  'NNP, -LRB-': 0.0032987854990411836,
  'NNP, $': 0.000285362067391

### 3. Greedy Decoding with HMM:

In [155]:
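# Greedy decoding rule (t = transition prob., e = emission prob.):
#   s1* = argmax over s1 of  t(s1) * e(x1 | s1)
#   sj* = argmax over sj of  t(sj | s(j-1)*) * e(xj | sj),   for j >= 2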

In [163]:
train_df[train_df["index"] == 1]


Unnamed: 0,index,word,pos_tag
0,1,Pierre,NNP
18,1,Mr.,NNP
31,1,Rudolph,NNP
57,1,A,DT
93,1,The,DT
...,...,...,...
911951,1,After,IN
911981,1,And,CC
912016,1,Two,CD
912050,1,The,DT


In [171]:
# Count how often each tag opens a sentence (rows whose "index" is 1); these
# counts become t(s1):
pos_tags_start = train_df.loc[train_df["index"] == 1, "pos_tag"]
most_likely_start_d = {}
for start_tag in pos_tags_start:
    most_likely_start_d[start_tag] = most_likely_start_d.get(start_tag, 0) + 1

In [172]:
# Turn the sentence-initial tag counts into relative frequencies.
most_likely_start_prob_d = {
    tag: count / len(pos_tags_start)
    for tag, count in most_likely_start_d.items()
}

In [173]:
most_likely_start_prob_d

{'NNP': 0.19789104610393007,
 'DT': 0.21911141347009264,
 'IN': 0.1288398137003506,
 'PRP': 0.06148935056779528,
 'EX': 0.004238840337013972,
 '``': 0.07472918520069077,
 'CD': 0.011225077188759224,
 'RBR': 0.0020932544874143074,
 'NNS': 0.041237113402061855,
 'NN': 0.0411847820398765,
 'JJ': 0.041708095661730074,
 'JJR': 0.0017007692710241248,
 'RB': 0.05604688890051808,
 'WRB': 0.00609660369459417,
 'CC': 0.05691035637657648,
 'VBG': 0.012010047621539588,
 'WDT': 0.0008111361138730441,
 'VBN': 0.005834946883667382,
 '-LRB-': 0.003427704223140928,
 'VB': 0.0030613846878434245,
 'WP': 0.003113716050028782,
 'PRP$': 0.007797372965618295,
 'TO': 0.0035323669475116437,
 'JJS': 0.00248573970380449,
 'NNPS': 0.0020409231252289496,
 'VBZ': 0.001517609503375373,
 'VBD': 0.0007588047516876865,
 'LS': 0.0009157988382437595,
 "''": 0.0003663195352975038,
 ':': 0.002799727876916636,
 'VBP': 0.0003663195352975038,
 'PDT': 0.0007326390705950076,
 'UH': 0.0006279763462242922,
 'MD': 0.00054947930294

In [160]:
emission_d

{'NNP': {'Pierre': 6.84868961738654e-05,
  '< unk >': 0.09307369190028308,
  'Nov.': 0.0026709889507807506,
  'Mr.': 0.044014245274404167,
  'N.V.': 0.00014838827504337504,
  'Dutch': 9.131586156515387e-05,
  'Rudolph': 9.131586156515387e-05,
  'Agnew': 3.42434480869327e-05,
  'Consolidated': 0.00015980275773901928,
  'Gold': 0.00014838827504337504,
  'Fields': 3.42434480869327e-05,
  'PLC': 0.0011985206830426446,
  'Kent': 0.00012555930965208657,
  'Lorillard': 4.5657930782576936e-05,
  'Inc.': 0.01154004200529632,
  'Loews': 3.42434480869327e-05,
  'Corp.': 0.013206556478860378,
  'New': 0.013149484065382157,
  'England': 0.0007076979271299425,
  'Journal': 0.0011414482695644233,
  'Medicine': 0.0001027303442607981,
  'James': 0.0020317779198246737,
  'A.': 0.0007533558579125194,
  'Talcott': 4.5657930782576936e-05,
  'Boston': 0.0017350013697379236,
  'Cancer': 0.00014838827504337504,
  'Institute': 0.0011414482695644233,
  'Dr.': 0.001130033786868779,
  'National': 0.00350424618756

In [193]:
def _greedy_decode(words, positions, unknown_words, emission, transition, start_prob):
    """Tag a token stream greedily, one token at a time.

    For the first token of each sentence the score is
    ``start_prob[tag] * emission[tag][word]``; for every other token it is
    ``transition[prev_tag][tag] * emission[tag][word]`` where ``prev_tag`` is
    the tag chosen for the previous token. If every such score is zero, the
    tag with the highest emission probability alone is used.

    Args:
        words: token sequence to tag.
        positions: per-token position within its sentence; 1 marks the first
            token of a sentence.
        unknown_words: set of words replaced by "< unk >" before lookup.
        emission: nested dict ``emission[tag][word]``.
        transition: nested dict ``transition[prev_tag][tag]``.
        start_prob: dict ``start_prob[tag]`` for sentence-initial tags.

    Returns:
        A list with one predicted tag per token; the string "None" is used
        when no tag emits the token at all.
    """
    predictions = []
    prev_tag = None
    for position, word in zip(positions, words):
        token = "< unk >" if word in unknown_words else word
        # Tags that can emit this token, in the iteration order of `emission`.
        candidates = [
            (tag, word_probs[token])
            for tag, word_probs in emission.items()
            if token in word_probs
        ]
        # Every sentence start uses the start distribution, not only the very
        # first token of the corpus.
        if position == 1 or prev_tag is None:
            prior = start_prob
        else:
            # .get: the previous tag may have no recorded outgoing transitions.
            prior = transition.get(prev_tag, {})

        best_tag, best_score = "None", 0
        for tag, emit_prob in candidates:
            score = emit_prob * prior.get(tag, 0)
            if score > best_score:
                best_tag, best_score = tag, score

        if best_tag == "None":
            # Edge case: no candidate tag is reachable from the prior, so fall
            # back to the most likely emitter of the token.
            for tag, emit_prob in candidates:
                if emit_prob > best_score:
                    best_tag, best_score = tag, emit_prob

        predictions.append(best_tag)
        prev_tag = best_tag
    return predictions


all_words = train_df["word"].values
# Original tokens, kept so the inspection cells can index into them.
word_lst = list(all_words)
pred_lst = _greedy_decode(
    all_words,
    train_df["index"].values,
    set(unknown_word_lst),  # O(1) membership instead of scanning the list
    emission_d,
    transition_d,
    most_likely_start_prob_d,
)
# Tag chosen for the final token ("None" if nothing was decoded).
prev_pos = pred_lst[-1] if pred_lst else "None"

In [196]:
# Fraction of training tokens whose greedy prediction equals the gold tag.
gold_tags = train_df["pos_tag"].values
# Guard against a stale `pred_lst` from a different run or dataset.
assert len(pred_lst) == len(gold_tags), "pred_lst and gold tags differ in length"
# Vectorised mean; the builtin `sum` iterated the array element by element.
training_acc = (np.array(pred_lst) == gold_tags).mean()
training_acc

In [189]:
word_lst[6826]

'and'

In [190]:
pred_lst[6825]

'PDT'

In [191]:
transition_d["PDT"]

{'DT': 0.918918918918919, 'PRP$': 0.08108108108108109}

In [192]:
emission_d["PDT"]

{'half': 0.16816816816816818,
 'all': 0.4744744744744745,
 'Such': 0.021021021021021023,
 'such': 0.21921921921921922,
 'both': 0.03003003003003003,
 'All': 0.06606606606606606,
 'quite': 0.003003003003003003,
 'Quite': 0.003003003003003003,
 '< unk >': 0.003003003003003003,
 'Many': 0.003003003003003003,
 'many': 0.003003003003003003,
 'Both': 0.003003003003003003,
 'Half': 0.003003003003003003}