In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
import warnings
warnings.simplefilter("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashut\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
corpus = pd.read_csv('data/train', names=["index", "type", "pos"], sep='\t', error_bad_lines=False, warn_bad_lines=False)

In [3]:
# corpus['type'] = corpus['type'].str.lower()
corpus

Unnamed: 0,index,type,pos
0,1,Pierre,NNP
1,2,Vinken,NNP
2,3,",",","
3,4,61,CD
4,5,years,NNS
...,...,...,...
912090,22,to,TO
912091,23,San,NNP
912092,24,Francisco,NNP
912093,25,instead,RB


### 1. Vocabulary Creation

In [4]:
counts = pd.DataFrame(corpus['type'].value_counts())
# counts_df = pd.DataFrame({'type': counts.index, 'idx' :range(1, len(vocab) + 1), 'count': counts['type']})
counts.reset_index(inplace=True)
counts.columns = ['type', 'count']
counts.head()

Unnamed: 0,type,count
0,",",46476
1,the,39533
2,.,37452
3,of,22104
4,to,21305


In [5]:
unknown_count = len(counts[counts['count']<=1])

vocab = {'type': ['<unk>'], 'index': [0], 'count': [unknown_count]}
cnt = 1

for _, row in counts.iterrows():
    if row['count'] > 1:
        vocab['type'].append(row['type'])
        vocab['index'].append(cnt)
        vocab['count'].append(row['count'])
        cnt+=1

vocab = pd.DataFrame(vocab)
vocab
# vocab = vocab[vocab.count!=1]
# drop_indexes = vocab[vocab['count']==1]].index
# vocab.drop(drop_indexes, inplace=True)

Unnamed: 0,type,index,count
0,<unk>,0,20011
1,",",1,46476
2,the,2,39533
3,.,3,37452
4,of,4,22104
...,...,...,...
23178,wrestle,23178,2
23179,Etc.,23179,2
23180,parakeet,23180,2
23181,1990-2002,23181,2


In [6]:
vocab.to_csv('vocab.txt', sep='\t')

### I have selected the threshold as 1 to identify a word as unknown, my vocabulary size is 23183 and total occurences of 'unk' is 20011.

### 2. Model Learning

In [7]:
vocab_set = set(vocab['type'])
vocab_set.remove('<unk>')

In [8]:
emission_prob = {}
transition_prob = {}
pos_count = {}

for _, row in corpus.iterrows():
    print(f'\r{_}', end='')
    if row['type'] in vocab_set:
        tp = row['type']
    else:
        tp = '<unk>'
    pos_count[row['pos']] = pos_count.get(row['pos'], 0) + 1
    if tp not in emission_prob:
        emission_prob[tp] = {}
    emission_prob[tp][row['pos']] = emission_prob[tp].get(row['pos'], 0) + 1
    if row['index'] == 1:
        if '.' not in transition_prob:
            transition_prob['.'] = {}
        transition_prob['.'][row['pos']] = transition_prob['.'].get(row['pos'], 0) + 1
    else:
        if corpus.iloc[_-1]['pos'] not in transition_prob:
            transition_prob[corpus.iloc[_-1]['pos']] = {}
        transition_prob[corpus.iloc[_-1]['pos']][row['pos']] = transition_prob[corpus.iloc[_-1]['pos']].get(row['pos'], 0) + 1

912094

In [9]:
for k1 in emission_prob.keys():
    for k2 in emission_prob[k1].keys():
        emission_prob[k1][k2] /= pos_count[k2]
emission_prob

{'Pierre': {'NNP': 6.84868961738654e-05},
 'Vinken': {'NNP': 2.2828965391288468e-05},
 ',': {',': 0.9999139414802065},
 '61': {'CD': 0.0007168253240050465},
 'years': {'NNS': 0.019530237301024905},
 'old': {'JJ': 0.003613599348534202},
 'will': {'MD': 0.3138709335593939,
  'NN': 0.00014897987987517054,
  'VB': 3.9232610145552985e-05},
 'join': {'VB': 0.0015693044058221193, 'VBP': 8.112932013629726e-05},
 'the': {'DT': 0.5016439225642653,
  'NNP': 6.84868961738654e-05,
  'JJ': 0.00011875678610206297,
  'CD': 2.867301296020186e-05,
  'VBP': 8.112932013629726e-05,
  'NN': 7.841046309219502e-06},
 'board': {'NN': 0.0023287907538381922},
 'as': {'IN': 0.0353954283543342,
  'RB': 0.023598122953310152,
  'JJ': 1.696525515743757e-05},
 'a': {'DT': 0.2341478895588702,
  'JJ': 3.393051031487514e-05,
  'IN': 1.0553198674518247e-05,
  'LS': 0.0425531914893617,
  'FW': 0.03125,
  'SYM': 0.18181818181818182,
  'NNP': 2.2828965391288468e-05},
 'nonexecutive': {'JJ': 0.00010179153094462541},
 'directo

In [10]:
for k1 in transition_prob.keys():
    for k2 in transition_prob[k1].keys():
        transition_prob[k1][k2] /= pos_count[k1]
transition_prob

{'.': {'NNP': 0.2010400443470686,
  'DT': 0.22115460760763403,
  'IN': 0.13005833751286858,
  'PRP': 0.06205949898371301,
  'EX': 0.004276324472718634,
  "''": 0.058891851226143654,
  '``': 0.07591795792307895,
  'CD': 0.011456326056542513,
  'RBR': 0.0021117651717129054,
  'NNS': 0.04165456801203706,
  'NN': 0.041865744529208354,
  'JJ': 0.04218250930496529,
  'JJR': 0.0017158092020167358,
  'RB': 0.05662170366655228,
  'WRB': 0.006176913127260248,
  'CC': 0.05744001267059103,
  'VBG': 0.012116252672702795,
  'WDT': 0.0008183090040387509,
  'VBN': 0.005912942480796136,
  '-LRB-': 0.004936251088878916,
  '-RRB-': 0.005279412929282264,
  'VB': 0.00340522133938706,
  'WP': 0.0031676477575693584,
  'PRP$': 0.007866325264630573,
  'TO': 0.0035900007919119395,
  'JJS': 0.0025077211414090755,
  'NNPS': 0.0021909563656521396,
  'VBZ': 0.0015574268141382679,
  'VBD': 0.0007919119393923396,
  'LS': 0.0009238972626243962,
  ':': 0.003062059498983713,
  'VBP': 0.0003959559696961698,
  'PDT': 0.00

In [11]:
emission = {}
transition = {}

for _, row in corpus.iterrows():
    print(f'\r{_}', end='')
    if row['type'] in vocab_set:
        emission[str((row['type'], row['pos']))] = emission_prob.get(str((row['type'], row['pos'])), 0) + 1
        if row['index'] == 1:
            transition[str(('<start>', row['pos']))] = transition.get(str(('<start>', row['pos'])), 0) + 1
        else:
            transition[str((row['pos'], corpus.iloc[_-1]['pos']))] = transition.get(str((row['pos'], corpus.iloc[_-1]['pos'])), 0) + 1

912094

In [12]:
hmm = {'emission': emission, 'transition': transition}
import json

with open('hmm.json', 'w') as f:
    json.dump(hmm, f)

In [None]:
print(f'The number of emission and transition parameters are {len(hmm['emission'])} and {len(hmm['transition'])} respectively')

### 3. Greedy Decoding with HMM

In [13]:
transition_prob['NNP']['VBZ']

0.0391973335768423

In [14]:
dev_corpus = pd.read_csv('data/dev', names=["index", "type", "pos"], sep='\t', error_bad_lines=False, warn_bad_lines=False)

In [15]:
dev_corpus.head()

Unnamed: 0,index,type,pos
0,1,The,DT
1,2,Arizona,NNP
2,3,Corporations,NNP
3,4,Commission,NNP
4,5,authorized,VBD


In [16]:
len(dev_corpus)

131768

In [17]:
total = 0
correct = 0
prev = '.'

for idx, row in dev_corpus.iterrows():
    print(f'\r{idx}', end='')
    lst = []
    true_pos = row['pos']
    for k, v in transition_prob[prev].items():
        if row['type'] not in vocab_set:
            tp = '<unk>'
        else:
            tp = row['type']
#         if k not in emission_prob[tp]:
#             prob = 1 / (pos_count[k] + 1365)
#             num, den = 0, 0
#             for k1 in emission_prob.keys():
#                 for k2 in emission_prob[k1].keys():
#                     if k2 == k:
#                         num+=emission_prob[k1][k2]
#                         den+=1
#             ep = num / den
#             ep = unk_dist['unknown'][k]
#             lst.append((k, v*ep))
        if tp in emission_prob and k in emission_prob[tp]:
            lst.append((k, v*emission_prob[tp][k]))
        else:
            lst.append((k, 0))
    lst.sort(key=lambda x: x[1], reverse=True)
    if lst[0][0] == 0:
        tags = transition_prob[prev]
        vals = sorted(tags.items(), key=lambda x: x[1], reverse=True)
        tag = vals[0][0]
    else:
        tag = lst[0][0]
    prev = tag
    total+=1
    if tag == true_pos:
        correct+=1

131767

In [18]:
print('Accuracy with Greedy HMM algorithm is {:.2f}'.format(correct/total*100))

Accuracy with Greedy HMM algorithm is 93.50


In [19]:
sum([len(v) for k, v in transition_prob.items()])

1365

<h4> Evaluating on test data </h5>

In [20]:
test_corpus = pd.read_csv('data/test', names=["index", "type", "pos"], sep='\t', error_bad_lines=False, warn_bad_lines=False)

total = 0
correct = 0
prev = '.'

f = open('greedy.out', 'w')

for idx, row in test_corpus.iterrows():
    print(f'\r{idx}', end='')
    lst = []
#     true_pos = row['pos']
    for k, v in transition_prob[prev].items():
        if row['type'] not in vocab_set:
            tp = '<unk>'
        else:
            tp = row['type']
#         if k not in emission_prob[tp]:
#             prob = 1 / (pos_count[k] + 1365)
#             num, den = 0, 0
#             for k1 in emission_prob.keys():
#                 for k2 in emission_prob[k1].keys():
#                     if k2 == k:
#                         num+=emission_prob[k1][k2]
#                         den+=1
#             ep = num / den
#             ep = unk_dist['unknown'][k]
#             lst.append((k, v*ep))
        if tp in emission_prob and k in emission_prob[tp]:
            lst.append((k, v*emission_prob[tp][k]))
        else:
            lst.append((k, 0))
    lst.sort(key=lambda x: x[1], reverse=True)
    if lst[0][0] == 0:
        tags = transition_prob[prev]
        vals = sorted(tags.items(), key=lambda x: x[1], reverse=True)
        tag = vals[0][0]
    else:
        tag = lst[0][0]
    prev = tag
    total+=1
    f.write(str(str(row['index']) + '\t' + row['type'] + '\t' + tag))
    f.write('\n')
f.close()
#     if tag == true_pos:
#         correct+=1

129653

### 4. Viterbi Decoding with HMM

In [21]:
dev_corpus.head()

Unnamed: 0,index,type,pos
0,1,The,DT
1,2,Arizona,NNP
2,3,Corporations,NNP
3,4,Commission,NNP
4,5,authorized,VBD


In [25]:
hmmDecode = {}
prevs = ['.']
true_pos = []
pred_pos = []
total = 0
correct = 0
cntr = 0

for idx, row in dev_corpus.iterrows():
    print(f'\r{idx}', end='')
    hmmDecode[cntr] = {}  
    true_pos.append(row['pos'])  
    curr_prevs = set()
    for prev in prevs:
        for k, v in transition_prob[prev].items():
            if row['type'] not in vocab_set:
                tp = '<unk>'
            else:
                tp = row['type']
            if k not in emission_prob[tp]:
                et = v / (pos_count[k] + 1365)
#                 et = v*unk_dist[k][0]/unk_dist[k][1]
#                 num, den = 0, 0
#                 for key, val in emission_prob.items():
#                     if key == k:
#                         num+=val
#                         den+=1
#                 ep = v * num / den
#                 if true_pos != [] and true_pos[-1] == row['pos']:
#                     true_pos.pop()
            else:
                et = v*emission_prob[tp][k]
            curr_prevs.add(k)
            if cntr == 0:
                hmmDecode[cntr][k] = [et, [k]]
            elif k not in hmmDecode[cntr]:
                prevProb = hmmDecode[cntr - 1][prev][0]
                currProb = et
                path = hmmDecode[cntr - 1][prev][1] + [k]
                hmmDecode[cntr][k] = [prevProb*currProb, path]
            else:
                prevProb = hmmDecode[cntr][k][0]
                currProb = hmmDecode[cntr - 1][prev][0]*et
                if currProb > prevProb:
                    path = hmmDecode[cntr - 1][prev][1] + [k]
                    hmmDecode[cntr][k] = [currProb, path]
                    
#         print(hmmDecode[cntr] == {}, cntr)
#         if hmmDecode[cntr] == {}:
#             if cntr == 0:
#                 tags = transition_prob['<s>']
#                 vals = sorted(tags.items(), key=lambda x: x[1], reverse=True)
#                 tag = vals[0][0]
#                 hmmDecode[cntr][k] = [transition_prob['<s>'][tag], [k]]
#             else:
#                 tags = transition_prob[prev]
#                 vals = sorted(tags.items(), key=lambda x: x[1], reverse=True)
#                 tag = vals[0][0]
#                 prevProb = hmmDecode[cntr - 1][prev][0]
#                 hmmDecode[cntr][tag] = [prevProb*transition_prob[prev][tag], path + [tag]]
    cntr+=1
    prevs = tuple(curr_prevs)
    if idx == len(dev_corpus) - 1:
#     if idx == 1000:
#         pred_pos = list(hmmDecode[cntr-1].values())[0][1]
#         for i in range(len(true_pos)):
#             total+=1
#             if true_pos[i] == pred_pos[i]:
#                 correct+=1
        break
    if idx != 0 and dev_corpus.iloc[idx + 1]['index'] == 1:
        pred_pos = list(hmmDecode[cntr-1].values())[0][1]
#         print(pred_pos, true_pos)
        for i in range(len(true_pos) - 1):
            total+=1
#             if true_pos[i] == '.':
#                 correct+=1
            if true_pos[i] == pred_pos[i]:
                correct+=1
        hmmDecode = {}
        true_pos = []
        pred_pos = []
        prevs = ['.']
        cntr = 0


131767

In [26]:
print('Accuracy with Viterbi HMM algorithm is {:.2f}'.format(correct/total*100))

Accuracy with Viterbi HMM algorithm is 91.23


In [None]:
# pos_list = {}
# for idx, row in dev_corpus.iterrows():
#     print(f'\r{idx}', end='')
#     if row['type'] not in pos_list:
#         pos_list[row['type']] = {row['pos']}
#     else:
#         pos_list[row['type']].add(row['pos'])
# pos_list

In [405]:
test_corpus = pd.read_csv('data/test', names=["index", "type", "pos"], sep='\t', error_bad_lines=False, warn_bad_lines=False)

hmmDecode = {}
prevs = ['.']
pred_pos = []
final_preds = []
cntr = 0

f = open('viterbi.out', 'w')

for idx, row in test_corpus.iterrows():
    print(f'\r{idx}', end='')
    hmmDecode[cntr] = {}  
    curr_prevs = set()
    for prev in prevs:
        if row['type'] not in vocab_set:
            tp = '<unk>'
        else:
            tp = row['type']
        if k not in emission_prob[tp]:
            et = v / (pos_count[k] + 1365)
        else:
            et = v*emission_prob[tp][k]
#             print(idx, prev, k, et)
        curr_prevs.add(k)
        if cntr == 0:
            hmmDecode[cntr][k] = [et, [k]]
        elif k not in hmmDecode[cntr]:
            prevProb = hmmDecode[cntr - 1][prev][0]
            currProb = et
            path = hmmDecode[cntr - 1][prev][1] + [k]
            hmmDecode[cntr][k] = [prevProb*currProb, path]
        else:
            prevProb = hmmDecode[cntr][k][0]
            print(et)
#                 print(hmmDecode[cntr - 1][prev][0])
            currProb = hmmDecode[cntr - 1][prev][0]*et
            if currProb > prevProb:
                path = hmmDecode[cntr - 1][prev][1] + [k]
                hmmDecode[cntr][k] = [currProb, path]
    cntr+=1
    prevs = tuple(curr_prevs)
    if idx == len(test_corpus) - 1:
#     if idx == 100:
        break
    if idx != 0 and test_corpus.iloc[idx + 1]['index'] == 1:
        pred_pos = list(hmmDecode[cntr-1].values())[0][1]
        
        final_preds = final_preds + pred_pos
        print(hmmDecode)
#         for i in range(len(pred_pos) - 1):
            
#             f.write(str(str(row['index']) + '\t' + row['type'] + '\t' + pred_pos[i]))
#             f.write('\n')
        hmmDecode = {}
        pred_pos = []
        prevs = ['.']
        cntr = 0
f.close()


0123456789101112131415161718192021222324252627282930313233343536{0: {'JJS': [7.405847173696822e-07, ['JJS']]}, 1: {'JJS': [7.920679965687348e-15, ['JJS', 'JJS']]}, 2: {'JJS': [8.47130242461161e-23, ['JJS', 'JJS', 'JJS']]}, 3: {'JJS': [9.06020254322989e-31, ['JJS', 'JJS', 'JJS', 'JJS']]}, 4: {'JJS': [9.690041272267868e-39, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 5: {'JJS': [1.0363664543947513e-46, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 6: {'JJS': [1.108411613135859e-54, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 7: {'JJS': [1.1854651401776012e-62, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 8: {'JJS': [1.267875202606748e-70, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 9: {'JJS': [1.356014171065606e-78, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 10: {'JJS': [1.4502803023122679e-86, ['JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS', 'JJS']]}, 11: {

In [23]:
pos_count

{}

In [370]:
# emission_dist = {}

# for k1 in emission_prob.keys():
#     for k2 in emission_prob[k1].keys():
#         if k2 not in emission_dist:
#             emission_dist[k2] = [0, 0]
#         emission_dist[k2][0]+=emission_prob[k1][k2]
#         emission_dist[k2][1]+=1

# # for k1 in unk_dist.keys():
# #     for k2 in unk_dist[k1].keys():
# #         unk_dist[k1][k2] /= pos_count[k2]

# emission_dist

In [371]:
# transition_dist = {}

# for k1 in transition_prob.keys():
#     for k2 in transition_prob[k1].keys():
#         if k1 not in transition_dist:
#             transition_dist[k1] = [0, 0]
#         transition_dist[k1][0]+=transition_prob[k1][k2]
#         transition_dist[k1][1]+=1

# # for k1 in unk_dist.keys():
# #     for k2 in unk_dist[k1].keys():
# #         unk_dist[k1][k2] /= pos_count[k2]

# transition_dist

In [372]:
# emission_dist = {}
# total_tags = 0
# for idx, row in corpus.iterrows():
#     print(f'\r{idx}', end='')
#     total_tags+=1
#     emission_dist[row['pos']] = emission_dist.get(row['pos'], 0) + 1
    
# for k1 in emission_dist.keys():
#     emission_dist[k1] /= total_tags

# emission_dist, total_tags