In [1]:
# from reader import parse_data, list_to_freq_dict, parse_data_test
import xml.etree.ElementTree as ET
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import pickle
import os
import json
import numpy as np
import multiprocessing
import time
from numpy import asarray
from numpy import savetxt
pp = pprint.PrettyPrinter(indent=4)

In [2]:
import operator

def get_column(list_, n):
    return map(operator.itemgetter(n), list_)

In [3]:
def parse_data(file):
    
    tree = ET.parse(file)
    root = tree.getroot()
    
    data = []
    labels = []
    
    for s_tag in root.iter('s'):
        
        sentence = []
        tags = []
        
        for e_tag in s_tag:
            
            if e_tag.tag == 'w':
                word = e_tag.text.replace(" ", "")
                tag = e_tag.attrib['c5']
                
                sentence.append(word)
                tags.append(tag)
                
            elif e_tag.tag == 'c':
                if e_tag.text is not None:
                    tag = e_tag.attrib['c5']
                    word = e_tag.text.replace(" ", "")
                    
                    sentence.append(word)
                    tags.append(tag)
                    
            elif e_tag.tag == 'mw':
                tag = e_tag.attrib['c5']
                word = ""
                for w_tag in e_tag.iterfind('w'):     
                    word += w_tag.text.replace(" ", "")
                
                sentence.append(word)
                tags.append(tag)
                
        data.append(sentence)
        labels.append(tags)

    return data, labels

In [4]:
def load_dataset(path):

    data = []
    labels = []

    for subdir, dirs, files in os.walk(path):
        for file in files:

            fileName = subdir + '/' + str(file)
            file_data, file_labels = parse_data(fileName)
            data.extend(file_data)
            labels.extend(file_labels)

    return data, labels

In [5]:
# Load Dataset

train_path = 'Train-corpus/'
test_path = 'Test-corpus/'

data, labels = load_dataset(train_path)
test_data, test_labels = load_dataset(test_path)

In [6]:
print(len(data))
print(len(labels))

print(data[0])
print(labels[0])

print(len(test_data))
print(len(test_labels))

print(test_data[11])
print(test_labels[11])

483629
483629
['Wonder', 'boy', "'s", 'eyes', 'on', 'Wembley', '.']
['VVB-NN1', 'NN1', 'POS', 'NN2', 'PRP', 'NP0', 'PUN']
200468
200468
['These', '‘', 'communities', '’', 'are', 'of', 'two', 'kinds', '.']
['DT0', 'PUQ', 'NN2', 'PUQ', 'VBB', 'PRF', 'CRD', 'NN2', 'PUN']


In [7]:
# Load JSON Files

with open('words.json') as f:
    word_dict = json.load(f)
with open('tags.json') as f:
    tag_dict = json.load(f)
with open('word_tags.json') as f:
    word_tags_dict = json.load(f)

In [8]:
print(len(tag_dict))
print(len(word_dict))
print(len(word_tags_dict))


61
193511
253488


In [9]:
def compute_word_tag_freq_matrix():
    
    rows = len(word_dict.keys())
    cols = len(tag_dict.keys())
    
    mat = [[0 for i in range(cols)] for j in range(rows)] 
    
    i=0
    for word in word_dict.keys():
        j=0
        for tag in tag_dict.keys():
            case = word + "_" + tag
            if case in word_tags_dict.keys():
                mat[i][j] = word_tags_dict[case]
            j = j + 1
        i = i + 1
        
    return mat

In [10]:
freq_matrix = compute_word_tag_freq_matrix()
savetxt('freq_matrix.csv', freq_matrix, delimiter=',')

In [14]:
tag_dict.keys()

dict_keys(['VVB', 'NN1', 'POS', 'NN2', 'PRP', 'NP0', 'VVZ', 'AT0', 'PRF', 'AJ0', 'ORD', 'DT0', 'VM0', 'VVI', 'TO0', 'VHI', 'PNI', 'VBZ', 'XX0', 'AV0', 'CJC', 'PNP', 'PNQ', 'DPS', 'VHZ', 'VVN', 'NN0', 'CJT', 'CJS', 'AVQ', 'AVP', 'DTQ', 'AJS', 'VHD', 'CRD', 'VVG', 'VVD', 'VBD', 'VBG', 'VBI', 'AJC', 'UNC', 'VHB', 'VBN', 'PNX', 'VHG', 'EX0', 'VBB', 'VDN', 'VDD', 'ITJ', 'ZZ0', 'VHN', 'VDB', 'VDZ', 'VDI', 'VDG', 'PUN', 'PUQ', 'PUL', 'PUR'])

In [11]:
def compute_emission_prob_matrix(freq):
    rows = len(freq_matrix)
    cols = len(freq_matrix[0])
    mat = freq
    
    for j in range(0, cols):
        col_slice = list(get_column(mat, j))
        total = sum(col_slice)
        for i in range(rows):
            mat[i][j] = mat[i][j]/total
    
    return mat

In [12]:
emission_mat = compute_emission_prob_matrix(freq_matrix)
savetxt('emission_matrix.csv', emission_mat, delimiter=',')

In [15]:
tags_index_dict = dict(zip(list(tag_dict.keys()),range(0, len(tag_dict.keys()))))
word_index_dict = dict(zip(list(word_dict.keys()),range(0, len(word_dict.keys()))))
tags_inv_dict = {v: k for k, v in tags_index_dict.items()}
word_inv_dict = {v: k for k, v in word_index_dict.items()}
print(tags_inv_dict)
print(tags_index_dict)

{0: 'VVB', 1: 'NN1', 2: 'POS', 3: 'NN2', 4: 'PRP', 5: 'NP0', 6: 'VVZ', 7: 'AT0', 8: 'PRF', 9: 'AJ0', 10: 'ORD', 11: 'DT0', 12: 'VM0', 13: 'VVI', 14: 'TO0', 15: 'VHI', 16: 'PNI', 17: 'VBZ', 18: 'XX0', 19: 'AV0', 20: 'CJC', 21: 'PNP', 22: 'PNQ', 23: 'DPS', 24: 'VHZ', 25: 'VVN', 26: 'NN0', 27: 'CJT', 28: 'CJS', 29: 'AVQ', 30: 'AVP', 31: 'DTQ', 32: 'AJS', 33: 'VHD', 34: 'CRD', 35: 'VVG', 36: 'VVD', 37: 'VBD', 38: 'VBG', 39: 'VBI', 40: 'AJC', 41: 'UNC', 42: 'VHB', 43: 'VBN', 44: 'PNX', 45: 'VHG', 46: 'EX0', 47: 'VBB', 48: 'VDN', 49: 'VDD', 50: 'ITJ', 51: 'ZZ0', 52: 'VHN', 53: 'VDB', 54: 'VDZ', 55: 'VDI', 56: 'VDG', 57: 'PUN', 58: 'PUQ', 59: 'PUL', 60: 'PUR'}
{'VVB': 0, 'NN1': 1, 'POS': 2, 'NN2': 3, 'PRP': 4, 'NP0': 5, 'VVZ': 6, 'AT0': 7, 'PRF': 8, 'AJ0': 9, 'ORD': 10, 'DT0': 11, 'VM0': 12, 'VVI': 13, 'TO0': 14, 'VHI': 15, 'PNI': 16, 'VBZ': 17, 'XX0': 18, 'AV0': 19, 'CJC': 20, 'PNP': 21, 'PNQ': 22, 'DPS': 23, 'VHZ': 24, 'VVN': 25, 'NN0': 26, 'CJT': 27, 'CJS': 28, 'AVQ': 29, 'AVP': 30, 'DTQ':

In [16]:
def get_index(tag, isPrev=True):
    if tag == "start":
        return [0]
    if tag == "end":
        return [len(tags_index_dict.keys())]
    if "-" in tag:
        a1 = tags_index_dict[tag[:3]]
        a2 = tags_index_dict[tag[4:]]
        return [a1, a2]
    else:
        a = tags_index_dict[tag]
        return [a]

In [18]:
def compute_tag_tag_frequency_matrix(data, labels):
    
    rows = len(tag_dict.keys()) + 1
    cols = len(tag_dict.keys()) + 1
    
    mat = [[0 for i in range(cols)] for j in range(rows)] 
    
    i=0
    for sentence in data:
        prev_tag = "start"
        curr_tag = "start"
        
        j=0
        for word in sentence:
            prev_tag = curr_tag
            curr_tag = labels[i][j]
            
            prev_index = get_index(prev_tag)
            curr_index = get_index(curr_tag)
            
            a1=0
            a2=0
            b1=0
            b2=0
            
            if len(prev_index) == 1:
                a = prev_index[0]
                if prev_tag != "start":
                    a = a + 1
                if len(curr_index) == 1:
                    b = curr_index[0]
                    mat[a][b] = mat[a][b] + 1
#                     print(str(a) + "_" + str(b))
                else:
                    b1 = curr_index[0]
                    b2 = curr_index[1]
                    mat[a][b1] = mat[a][b1] + 1
                    mat[a][b2] = mat[a][b2] + 1
                    
#                     print(str(a) + "_" + str(b1) + "&" +str(b2))
                    
            else:
                a1 = prev_index[0] + 1
                a2 = prev_index[1] + 1
                if len(curr_index) == 1:
                    b = curr_index[0]
                    mat[a1][b] = mat[a1][b] + 1
                    mat[a2][b] = mat[a2][b] + 1
                    
#                     print(str(a1) + "&" + str(a2) + "_" +str(b))
                else:
                    b1 = curr_index[0]
                    b2 = curr_index[1]
                    mat[a1][b1] = mat[a1][b1] + 1
                    mat[a1][b2] = mat[a1][b2] + 1
                    mat[a2][b1] = mat[a2][b1] + 1
                    mat[a2][b2] = mat[a2][b2] + 1
                    
#                     print(str(a1) + "&" + str(a2) + "_" +str(b1) + "&" +str(b2))
            
            j=j+1
            
        curr_index = get_index(curr_tag)
        
        if len(curr_index) == 1:
            b = curr_index[0] + 1
            mat[b][61] = mat[b][61] + 1
            
#             print(str(b)+ "_" +str(61))
        else:
            b1 = curr_index[0] + 1
            b2 = curr_index[1] + 1
            mat[b1][61] = mat[b1][61] + 1
            mat[b2][61] = mat[b2][61] + 1
            
#             print(str(b1) + "&" + str(b2) + "_" +str(61))
        i=i+1
        
    return mat

In [19]:
tag_tag_freq_matrix = compute_tag_tag_frequency_matrix(data, labels)

savetxt('tag_tag_matrix.csv',tag_tag_freq_matrix, delimiter=',')
# tag_tag_freq_matrix

In [24]:
def compute_transition_prob_matrix(freq):
    rows = len(freq)
    cols = len(freq[0])
    
    mat = np.array(freq, dtype=float)
    
    for i in range(0, rows):
        
        total = float(sum(mat[i]))
        if total == 0:
            total = 1
            
        mat[i] = [x/total for x in mat[i]]
            
    return mat

In [25]:
transition_mat = compute_transition_prob_matrix(tag_tag_freq_matrix)
savetxt('transition_matrix.csv', transition_mat, delimiter=',')

503000.0


In [26]:
# Use Laplace Smoothing
def probability_word_given_tag(word, tag):
    
    count_tag = tag_dict[tag]
    
    if word+'_'+tag in word_tags_dict.keys():
        count_word_tag = word_tags_dict[word+'_'+tag]

        return (count_word_tag)/(count_tag)
    
    else:
        return 1/(count_tag + 1)

In [84]:
def Viterbi(sentence):
    state = []
     
    for index, word in enumerate(sentence):
        p = [] 
        for tag in tag_dict.keys():
            if index == 0:
                transition_p = transition_mat[0][tags_index_dict[tag]]
            elif index == len(sentence)-1:
                prev = tags_index_dict[state[-1]]
                curr = tags_index_dict[tag]
                transition_p = transition_mat[prev][curr]*transition_mat[curr][57]
            else:
                prev = tags_index_dict[state[-1]]
                curr = tags_index_dict[tag]
                transition_p = transition_mat[prev][curr]
            
            if word in word_dict.keys():
                word_index = word_index_dict[word]
                tag_index = tags_index_dict[tag]
            
                emission_p = emission_mat[word_index][tag_index]
                
            else:
                emission_p = probability_word_given_tag(word, tag)
            
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
#         pp.pprint(p)
#         print(p.index(pmax))
        
        state_max = tags_inv_dict[p.index(pmax)] 
        state.append(state_max)
    return state

In [85]:
print(Viterbi(['These', '‘', 'communities', '’', 'are', 'of', 'two', 'kinds', '.']))
# ['These', 'communities', 'are', 'of', 'two', 'kinds']
# ['DT0', 'NN2', 'VBB', 'PRF', 'CRD', 'NN2']
# ['These', '‘', 'communities', '’', 'are', 'of', 'two', 'kinds', '.']
# ['DT0', 'PUQ', 'NN2', 'PUQ', 'VBB', 'PRF', 'CRD', 'NN2', 'PUN']

['DT0', 'PUQ', 'NN2', 'PUQ', 'VBB', 'AV0', 'CRD', 'NN2', 'PUN']


In [86]:
def get_predictions(data):
    pred = []
    for index, sentence in enumerate(data):
        pred.append(Viterbi(sentence))
        if (index+1) % 10000==0:
            print("Predicted: %d " % (index+1))  
    return pred

In [None]:
preds = get_predictions(test_data)

Predicted: 10000 
Predicted: 20000 
Predicted: 30000 
Predicted: 40000 
Predicted: 50000 
Predicted: 60000 
Predicted: 70000 
Predicted: 80000 
Predicted: 90000 
Predicted: 100000 
Predicted: 110000 
Predicted: 120000 
Predicted: 130000 
Predicted: 140000 
Predicted: 150000 
Predicted: 160000 
Predicted: 170000 
Predicted: 180000 
Predicted: 190000 


In [None]:
def get_accuracy(test_data, test_labels, preds):
    
    correct = 0
    incorrect = 0
    
    true = []
    pr = []
    
    print("Total: %d" % len(test_data))
    
    t0 = time.process_time()
    
    for index, pred_labels in enumerate(preds):
        true_labels = test_labels[index]
        for i, pred_label in enumerate(pred_labels):
            if pred_label in true_labels[i]:
                correct = correct + 1
            else:
#                 print("True: " + str(true_labels[i]))
                true.append(true_labels[i])
                pr.append(pred_label)
#                 print("Pred: " + str(pred_label))
                incorrect = incorrect + 1
                
    print("Evaluated Words: %d " % (incorrect + correct))   
    print("Correct: %d " % (correct))   
    print("Incorrect: %d " % (incorrect))   
    
    print("Time Taken: %.2f \n " % (time.process_time()-t0))
    
    print("Final Accuracy = %.06f"  % (correct/(correct+incorrect)))
    
    return true, pr

In [None]:
truths, false = get_accuracy(test_data, test_labels, preds)

In [55]:
truths

['NN1',
 'VVI',
 'VM0',
 'VVI',
 'VVI',
 'CJT',
 'AJ0',
 'VVI',
 'VVN',
 'AJ0',
 'VVN',
 'PRF',
 'VVG',
 'AJ0',
 'CJT',
 'VVI',
 'TO0',
 'VVI',
 'NN1',
 'AJ0',
 'CRD',
 'VBN',
 'PRP',
 'PRP',
 'VVN',
 'AV0',
 'PRP',
 'CJS',
 'VBD',
 'NN1',
 'PRP',
 'CJS',
 'DT0',
 'CJS',
 'AV0',
 'AJ0-NN1',
 'VVD',
 'PRP',
 'VHI',
 'VBN',
 'VVN',
 'PRP',
 'EX0',
 'VVI',
 'VVI',
 'VHD',
 'AV0',
 'AJ0',
 'VVZ',
 'AJ0',
 'PRP',
 'VVB',
 'PRF',
 'VVN',
 'PRP',
 'PRP',
 'VVD',
 'PRP',
 'CJS',
 'CJS',
 'DT0',
 'NN1',
 'PRP',
 'VVD',
 'TO0',
 'VVI',
 'PRP',
 'DT0',
 'VVZ',
 'NN1',
 'NN1',
 'PRP',
 'PRP',
 'VBN',
 'PRP',
 'VVD',
 'VVB',
 'NN1',
 'NN2',
 'PRF',
 'CRD',
 'VVN',
 'DT0',
 'PRP',
 'NN2',
 'VVN-VVD',
 'PRP',
 'AV0',
 'TO0',
 'VHI',
 'VVN',
 'AV0',
 'VVN',
 'VVD',
 'AVP',
 'AV0',
 'DT0',
 'AJ0',
 'VBN',
 'CJS-PRP',
 'CJS',
 'VVI',
 'VHI',
 'NN1',
 'NN1',
 'CRD',
 'PRF',
 'AJ0',
 'VHI',
 'VBN',
 'AV0',
 'DT0',
 'CRD',
 'PRP',
 'VVN',
 'VVI',
 'VVD-VVN',
 'VVI',
 'TO0',
 'VVI',
 'NN1',
 'NP0',
 'CJT',


In [56]:
false

['VVN',
 'NN1',
 'NN1',
 'VVB',
 'VVB',
 'DT0',
 'NN1',
 'VVB',
 'VVD',
 'NN1',
 'AJ0',
 'AV0',
 'AJ0',
 'NN1',
 'DT0',
 'VVB',
 'PRP',
 'VVB',
 'VVG',
 'VVB',
 'AVP',
 'VVB',
 'AV0',
 'TO0',
 'AJ0',
 'PRP',
 'TO0',
 'AVQ',
 'VVB',
 'VDN',
 'TO0',
 'AV0',
 'AV0',
 'AV0',
 'PRP',
 'VVB',
 'NN1',
 'TO0',
 'VHB',
 'VVB',
 'AJ0',
 'TO0',
 'AV0',
 'NN1',
 'NN1',
 'VHN',
 'PRP',
 'VVN',
 'NN2',
 'VVB',
 'TO0',
 'NN1',
 'AV0',
 'VVD',
 'TO0',
 'TO0',
 'VVN',
 'TO0',
 'AV0',
 'AV0',
 'AV0',
 'VVG',
 'AV0',
 'AJ0',
 'PRP',
 'VVN',
 'TO0',
 'CJT',
 'NN2',
 'VVI',
 'VVB',
 'TO0',
 'CJS',
 'VVB',
 'TO0',
 'VVN',
 'NN1',
 'AJ0',
 'VVZ',
 'AV0',
 'AJS',
 'AJ0',
 'CJT',
 'TO0',
 'VVZ',
 'AJ0',
 'AV0',
 'PRP',
 'PRP',
 'VHB',
 'VVD',
 'PRP',
 'AJ0',
 'VVN',
 'NN1',
 'PRP',
 'CJT',
 'AV0',
 'VVB',
 'AV0',
 'AV0',
 'VVB',
 'VHB',
 'VVB',
 'VVN',
 'AVP',
 'AV0',
 'NN1',
 'VHB',
 'VVB',
 'PRP',
 'CJT',
 'PNI',
 'TO0',
 'PUQ',
 'VVB',
 'AJ0',
 'VVB',
 'PRP',
 'VVB',
 'VVB',
 'EX0',
 'DT0',
 'VVD',
 'DT0',
