In [40]:
# from reader import parse_data, list_to_freq_dict, parse_data_test
import xml.etree.ElementTree as ET
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import pickle
import os
import json
import numpy as np
import multiprocessing
import time
from numpy import asarray
from numpy import savetxt
pp = pprint.PrettyPrinter(indent=4)

In [2]:
def parse_data(file):
    """Read one corpus XML file into parallel sentence / tag lists.

    Every ``<s>`` element found anywhere in the tree becomes one sentence.
    Only ``<w>`` elements that are direct children of that ``<s>`` are read
    (``iterfind('w')`` does not descend into nested elements).

    Args:
        file: Path or file object accepted by ``ET.parse``.

    Returns:
        ``(data, labels)`` where ``data[i]`` is the list of words of sentence
        ``i`` (all space characters removed) and ``labels[i]`` is the list of
        the matching ``c5`` attribute values. Both lists always have the same
        shape.

    Raises:
        KeyError: If a ``<w>`` element has no ``c5`` attribute.
    """
    root = ET.parse(file).getroot()

    data = []
    labels = []

    for s_tag in root.iter('s'):
        sentence = []
        tags = []

        for w_tag in s_tag.iterfind('w'):
            # An empty element such as <w c5="..."/> has text None; fall back
            # to "" so the word list stays aligned with the tag list instead
            # of crashing on None.replace.
            word = (w_tag.text or "").replace(" ", "")
            sentence.append(word)
            tags.append(w_tag.attrib['c5'])

        data.append(sentence)
        labels.append(tags)

    return data, labels

In [3]:
def _iter_corpus_files(path):
    """Yield the path of every file found under ``path`` (recursively)."""
    for subdir, _dirs, files in os.walk(path):
        for file in files:
            yield os.path.join(subdir, file)


def load_dataset(path, training=True):
    """Parse every file under ``path`` and concatenate the results.

    Args:
        path: Root directory of the corpus; it is walked recursively.
        training: When true, each file is read with ``parse_data``.
            Otherwise each file is read with ``parse_data_test``.

    Returns:
        ``(train_data, train_labels)`` when ``training`` is true, else a flat
        list with everything ``parse_data_test`` returned.

    Note:
        ``parse_data_test`` is not defined in this part of the notebook (the
        ``reader`` import that names it is commented out), so it must be
        brought into scope before calling with ``training=False``.
    """
    if training:
        train_data = []
        train_labels = []

        for file_name in _iter_corpus_files(path):
            data, labels = parse_data(file_name)
            train_data.extend(data)
            train_labels.extend(labels)

        return train_data, train_labels

    all_tuples = []

    for file_name in _iter_corpus_files(path):
        all_tuples.extend(parse_data_test(file_name))

    return all_tuples

In [4]:
# Load Dataset
# Both corpus directories are relative paths, so they resolve against the
# notebook's working directory.

train_path = 'Train-corpus/'
test_path = 'Test-corpus/'  # not referenced again in this part of the notebook

# training=True selects the parse_data branch of load_dataset, which returns
# two parallel lists: the sentences and their tag sequences.
data, labels = load_dataset(train_path, training=True)

In [5]:
# Sanity check: parse_data appends one tag list per sentence, so the two
# lengths printed here are expected to match.
print(len(data))
print(len(labels))

# Show the first sentence next to its tag sequence.
print(data[0])
print(labels[0])

483629
483629
['Wonder', 'boy', "'s", 'eyes', 'on', 'Wembley']
['VVB-NN1', 'NN1', 'POS', 'NN2', 'PRP', 'NP0']


In [6]:
# Load JSON Files
#
# The three tables are used later in this notebook as follows:
#   word_dict      - its keys are the words (one matrix row per key)
#   tag_dict       - its keys are the tags (one matrix column per key)
#   word_tags_dict - maps "<word>_<tag>" to the count stored in the matrix
# What word_dict / tag_dict map their keys TO is not visible here
# (presumably frequencies - confirm against the code that wrote the files).
#
# The encoding is given explicitly so that non-ASCII words are decoded the
# same way on every platform instead of depending on the locale default.

with open('words.json', encoding='utf-8') as f:
    word_dict = json.load(f)
with open('tags.json', encoding='utf-8') as f:
    tag_dict = json.load(f)
with open('word_tags.json', encoding='utf-8') as f:
    word_tags_dict = json.load(f)

In [7]:
# Number of entries in each lookup table, printed in this order:
# tags, words, "<word>_<tag>" pairs.
print(len(tag_dict))
print(len(word_dict))
print(len(word_tags_dict))

57
192634
252564


In [170]:
def compute_word_tag_freq_matrix(words=None, tags=None, word_tag_counts=None):
    """Build the word-by-tag count table as a list of lists.

    Row ``i`` corresponds to the ``i``-th word and column ``j`` to the
    ``j``-th tag, both in iteration order. Cell ``[i][j]`` is the value stored
    under the key ``"<word>_<tag>"`` in ``word_tag_counts``, or 0 when that
    key is absent.

    Args:
        words: Iterable of words (a dict iterates over its keys). Defaults to
            the module-level ``word_dict``.
        tags: Iterable of tags. Defaults to the module-level ``tag_dict``.
        word_tag_counts: Mapping from ``"<word>_<tag>"`` to a count. Defaults
            to the module-level ``word_tags_dict``.

    Returns:
        ``len(words)`` rows, each a list of ``len(tags)`` counts.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep
    # working unchanged.
    if words is None:
        words = word_dict
    if tags is None:
        tags = tag_dict
    if word_tag_counts is None:
        word_tag_counts = word_tags_dict

    # Materialise the tags once: they are re-iterated for every word.
    tag_list = list(tags)

    return [
        [word_tag_counts.get(word + "_" + tag, 0) for tag in tag_list]
        for word in words
    ]

In [171]:
# Word-by-tag count table built from the loaded JSON dictionaries; it is the
# input of compute_emission_prob_matrix further down.
freq_matrix = compute_word_tag_freq_matrix()
# Optional CSV dump of the table, left disabled:
# savetxt('data.csv', freq_matrix, delimiter=',')

In [148]:
def compute_emission_prob_matrix(freq):
    """Normalise a count table column by column.

    Every column is divided by its own total, so each column that contains at
    least one count sums to 1. In this notebook the rows are words and the
    columns are tags, which makes cell ``[i][j]`` the relative frequency of
    word ``i`` among all occurrences of tag ``j``.

    Args:
        freq: 2-D array-like of counts. It is copied, never modified.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``freq``. Columns
        whose total is 0 stay all-zero. Empty input yields an empty array.
    """
    # dtype=float is essential: with an integer array every probability
    # below 1 would be truncated to 0 when written back.
    mat = np.array(freq, dtype=float)
    if mat.size == 0:
        return mat

    totals = mat.sum(axis=0, keepdims=True)
    # Avoid 0/0 for tags that never occur; their column is all zeros anyway.
    totals[totals == 0] = 1.0

    return mat / totals

In [149]:
# Column-wise (per tag) normalisation of the word/tag count table.
emission_mat = compute_emission_prob_matrix(freq_matrix)

In [272]:
# Number the tags 0..N-1 in the order in which tag_dict yields its keys.
tags_index_dict = {tag: position for position, tag in enumerate(tag_dict)}
print(tags_index_dict)

{'VBB': 0, 'DPS': 1, 'VDG': 2, 'VBG': 3, 'VBD': 4, 'VBI': 5, 'VDZ': 6, 'AV0': 7, 'VBN': 8, 'VDB': 9, 'VHI': 10, 'VDD': 11, 'CRD': 12, 'VHD': 13, 'VDI': 14, 'VBZ': 15, 'DTQ': 16, 'VDN': 17, 'NP0': 18, 'VM0': 19, 'VHB': 20, 'PRF': 21, 'VVZ': 22, 'AJ0': 23, 'VHZ': 24, 'POS': 25, 'VVN': 26, 'PRP': 27, 'VVI': 28, 'VVG': 29, 'VVD': 30, 'VVB': 31, 'XX0': 32, 'EX0': 33, 'CJC': 34, 'AJS': 35, 'VHG': 36, 'ZZ0': 37, 'UNC': 38, 'CJS': 39, 'AJC': 40, 'NN2': 41, 'CJT': 42, 'NN0': 43, 'PNX': 44, 'ITJ': 45, 'PNQ': 46, 'PNP': 47, 'TO0': 48, 'AVP': 49, 'AVQ': 50, 'PNI': 51, 'VHN': 52, 'AT0': 53, 'ORD': 54, 'DT0': 55, 'NN1': 56}


In [320]:
def get_index(tag, isPrev=True):
    """Translate a tag label into a list of matrix indices.

    * ``"start"`` -> ``[0]`` (the start row of the transition table).
    * ``"end"``   -> ``[len(tags_index_dict)]`` (the end column).
    * an ordinary tag -> ``[tags_index_dict[tag]]``.
    * an ambiguity tag such as ``"VVB-NN1"`` -> one index per component, in
      order.

    Ordinary and ambiguity tags both yield the plain ``tags_index_dict``
    positions. ``compute_tag_tag_frequency_matrix`` adds the +1 row offset
    itself, so adding it here as well would push the last tag out of range.

    Args:
        tag: ``"start"``, ``"end"``, a tag, or two tags joined by ``"-"``.
        isPrev: Accepted for compatibility with existing calls; it does not
            influence the result.

    Returns:
        A list with one index, or one index per component of an ambiguity tag.

    Raises:
        KeyError: If a tag component is missing from ``tags_index_dict``.
    """
    if tag == "start":
        return [0]
    if tag == "end":
        return [len(tags_index_dict)]
    # split("-") yields a single component for a plain tag and both
    # components for an ambiguity tag.
    return [tags_index_dict[part] for part in tag.split("-")]

In [324]:
def compute_tag_tag_frequency_matrix(tag_sequences=None, tag_index=None):
    """Count tag-to-tag transitions over a collection of tagged sentences.

    With ``n`` tags the table is ``(n + 1) x (n + 1)``:

    * row 0 is the sentence start, row ``k + 1`` is "previous tag has index k";
    * column ``k`` is "current tag has index k", column ``n`` is sentence end.

    An ambiguity tag (components joined by ``"-"``, e.g. ``"VVB-NN1"``) counts
    once for each of its components, so a transition between two ambiguity
    tags increments every (previous component, current component) cell.
    Sentences without any tag are skipped and contribute nothing.

    Args:
        tag_sequences: Iterable of tag lists, one per sentence. Defaults to
            the module-level ``labels``.
        tag_index: Mapping from tag to its 0-based index. Defaults to the
            module-level ``tags_index_dict``.

    Returns:
        The count table as a list of lists of ints.

    Raises:
        KeyError: If a tag component is missing from ``tag_index``.
    """
    if tag_sequences is None:
        tag_sequences = labels
    if tag_index is None:
        tag_index = tags_index_dict

    n_tags = len(tag_index)
    start_row = 0
    end_col = n_tags  # last column, replaces a hard-coded literal
    size = n_tags + 1

    mat = [[0] * size for _ in range(size)]

    for tags in tag_sequences:
        if not tags:
            # No tags means no transition evidence; in particular do not
            # record a bogus "tag -> end" count.
            continue

        prev_rows = [start_row]
        for tag in tags:
            curr_cols = [tag_index[part] for part in tag.split("-")]

            for row in prev_rows:
                for col in curr_cols:
                    mat[row][col] += 1

            # Rows are shifted by one because row 0 is reserved for "start".
            prev_rows = [col + 1 for col in curr_cols]

        for row in prev_rows:
            mat[row][end_col] += 1

    return mat

In [325]:
# Transition counts gathered from the module-level training labels.
tag_tag_freq_matrix = compute_tag_tag_frequency_matrix()

In [328]:
def compute_transition_prob_matrix(freq):
    """Normalise a count table row by row.

    Every row is divided by its own total, so each row that contains at least
    one count sums to 1.

    Args:
        freq: 2-D array-like of counts. It is copied, never modified.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``freq``. Rows whose
        total is 0 stay all-zero. Empty input yields an empty array.
    """
    mat = np.array(freq, dtype=float)
    if mat.size == 0:
        return mat

    row_totals = mat.sum(axis=1, keepdims=True)
    # A row without counts is divided by 1 instead of 0 and so remains zero.
    row_totals[row_totals == 0] = 1.0

    return mat / row_totals

In [329]:
# Row-wise normalisation of the transition counts; all-zero rows stay zero.
transition_mat = compute_transition_prob_matrix(tag_tag_freq_matrix)