In [61]:
import pandas as pd
import numpy as np
import json

Task 1: Vocabulary Creation 

In [62]:
df = pd.read_csv('./data/train', sep='\t', names=['idx', 'name', 'tag'])
df['frequency'] = df['name'].map(df['name'].value_counts())
def replace_unk(entry):
    if entry['frequency']<=1:
        return '<unk>'
    else:
        return entry['name']

df['name'] = df.apply(lambda entry: replace_unk(entry), axis=1)

# Sort by descending freq
df = df.sort_values(by=['frequency'], ascending=False)
# Return a Series containing counts of unique values.
df_counted = df['name'].value_counts().reset_index()
df_counted.columns = [''] * len(df_counted.columns)

df_counted.columns = ['name', 'frequency']
df_unknown = df_counted[df_counted['name']=='<unk>']

unk_idx = df_counted[df_counted['name']=='<unk>'].index

df_counted = df_counted.drop(index=unk_idx)
df_counted = pd.concat([df_unknown, df_counted])
df_counted = df_counted.reset_index()
df_counted['index'] = df_counted.index+1
columns_titles = ["name","index", "frequency"]
df_counted=df_counted.reindex(columns=columns_titles)
df_counted.to_csv("vocab.txt", sep="\t", header=None)

print("What is the selected threshold for unknown words replacement?", 2)
print("What is the total size of your vocabulary", len(df_counted))
print("What is the total occurrences of the special token '<unk>' after replacement?", int(df_unknown['frequency']
                                                                                           [df_unknown['name']=='<unk>']))



What is the selected threshold for unknown words replacement? 2
What is the total size of your vocabulary 23183
What is the total occurrences of the special token '<unk>' after replacement? 20011


In [119]:
# df = pd.read_csv('./data/train', sep='\t', names=['idx', 'name', 'tag'])
# df['frequency'] = df['name'].map(df['name'].value_counts())
# df['name'] = df.apply(lambda entry: replace_unk(entry), axis=1)
# df_pos = pd.DataFrame(df['tag'].value_counts()).reset_index()
# df_pos.columns = [''] * len(df_pos.columns)
# df_pos.columns = ['tag', 'count']
# all_tags = list(df_pos['tag'])

# all_sentences = []
# temp_sentence = []
# for i in range(len(df)):
#     if df.loc[i]['idx']==1 and i!=0:
#         all_sentences.append(temp_sentence)
#         temp_sentence =[]
#     temp_sentence.append((df.loc[i]['name'], df.loc[i]['tag']))
    


# Transition Matrix code
transition_matrix = [[0 for j in range(len(all_tags))] for i in range(len(all_tags))]
tag_freq = {} # format: key = <TAG>, value = <tag_freq>
def generate_transition_matrix():
    # Calculate tag frequency
    for sentence in all_sentences:
        for i in range(len(sentence)):
            curr_tag = sentence[i][1]
            if curr_tag not in tag_freq:
                tag_freq[curr_tag]=1
            else:
                tag_freq[curr_tag]+=1
    
    # 1. Calculate the number of transitions from one tag to another for each sentence
    for sentence in all_sentences:
        for i in range(1, len(sentence)):
            curr_tag_index = all_tags.index(sentence[i][1])
            prev_tag_index = all_tags.index(sentence[i-1][1])
            transition_matrix[prev_tag_index][curr_tag_index]+=1
    
    # 2. Calculate the transition probabilities for each transition
    for i in range(len(transition_matrix)):
        for j in range(len(transition_matrix[0])):
            prev_tag_index = i
            prev_tag_count = tag_freq[all_tags[i]]
            if(transition_matrix[i][j] == 0) : transition_matrix[i][j] = 1e-10
            else: transition_matrix[i][j]/=prev_tag_count
            
# Emmission Matrix code
vocabulary = list(df_counted['name'])
emmission_matrix = [[0 for j in range(len(vocabulary))] for i in range(len(all_tags))]
def generate_emmission_matrix():    
    # 1. Calculate the number of transitions from one tag to another for each sentence
    for sentence in all_sentences:
        for i in range(len(sentence)):
            curr_word_index = vocabulary.index(sentence[i][0])
            prev_tag_index = all_tags.index(sentence[i][1])
            emmission_matrix[prev_tag_index][curr_word_index]+=1
    
    # 2. Calculate the transition probabilities for each transition
    for i in range(len(emmission_matrix)):
        for j in range(len(emmission_matrix[0])):
            prev_tag_index = i
            prev_tag_count = tag_freq[all_tags[i]]
            if(emmission_matrix[i][j] == 0) : emmission_matrix[i][j] = 1e-10
            else: emmission_matrix[i][j]/=prev_tag_count


generate_transition_matrix()
generate_emmission_matrix()
    

In [120]:
# np.array(emmission_matrix)
# np.array(transition_matrix)
# all_tags

array([[2.49580504e-02, 1.00000000e-10, 7.84104631e-06, ...,
        1.56820926e-05, 1.56820926e-05, 1.00000000e-10],
       [1.68854743e-04, 1.00000000e-10, 1.00000000e-10, ...,
        1.00000000e-10, 1.00000000e-10, 1.00000000e-10],
       [5.51585580e-02, 1.00000000e-10, 6.84915870e-05, ...,
        1.00000000e-10, 1.00000000e-10, 1.00000000e-10],
       ...,
       [1.60919540e-01, 1.00000000e-10, 1.00000000e-10, ...,
        1.00000000e-10, 1.00000000e-10, 1.00000000e-10],
       [1.81818182e-02, 1.00000000e-10, 1.00000000e-10, ...,
        1.00000000e-10, 1.00000000e-10, 1.00000000e-10],
       [1.00000000e-10, 1.00000000e-10, 1.00000000e-10, ...,
        1.00000000e-10, 1.00000000e-10, 1.00000000e-10]])

In [121]:
# Code for conversion of transition and emmission matrix to respective dictionary

def calculate_trans_prob():
    trans_prob_dict = {}
    for i in range(len(transition_matrix)):
        for j in range(len(transition_matrix[0])):
            tag_at_i = all_tags[i]
            tag_at_j = all_tags[j]
            trans_prob_dict['(' + tag_at_i + ', ' + tag_at_j + ')'] = transition_matrix[i][j]
    return trans_prob_dict
            
def calculate_emmission_prob():
    emmission_prob_dict = {}
    for i in range(len(emmission_matrix)):
        for j in range(len(vocabulary)):
            tag_at_i = all_tags[i]
            vocab_at_j = vocabulary[j]
            emmission_prob_dict['(' + tag_at_i + ', ' + vocab_at_j + ')'] = emmission_matrix[i][j]
    
    return emmission_prob_dict

start_tags = {} # maintains the initial tag frequency of all tag.
def starting_transition_prob():
    
    start_tags_total = 0
    start_tags_prob = {}
    
    for i in range(len(all_tags)):
        start_tags[all_tags[i]]=0
        
    for i in range(len(df)):
        if df.loc[i]['idx']==1:
            start_tags_total+=1
            start_tags[df.loc[i]['tag']]+=1
    
    for tag in start_tags:
        start_tags_prob[tag] = start_tags[tag]/start_tags_total
    
    return start_tags_prob

# Probability Matrices
trans_prob_dict = calculate_trans_prob()
emmission_prob_dict = calculate_emmission_prob()
start_tags_prob = starting_transition_prob()

total_transition_prob = {}
# Add both transition probs in to the final transition dictionary
for key in start_tags_prob:
    total_transition_prob['(' + '<s>' + ', ' + key + ')'] = start_tags_prob[key]
    
for key in trans_prob_dict:
    total_transition_prob[key] = trans_prob_dict[key]

print("Total transition and emission parameters in the HMM model: ", len(total_transition_prob), ',', len(emmission_prob_dict))
    

Total transition and emission parameters in the HMM model:  2070 , 1043235


In [122]:
# Dump the dictionaries to the json file
with open('hmm.json', 'w') as f:
    json.dump({"transition": total_transition_prob, "emission": emmission_prob_dict}, f, indent = 4)

Greedy Decoding

In [123]:
df_dev = pd.read_csv('./data/dev', sep='\t', names=['idx', 'name', 'tag'])
df_dev['frequency'] = df_dev['name'].map(df_dev['name'].value_counts())


all_sentences_dev = []
temp_sentence_dev = []
for i in range(len(df_dev)):
    if df_dev.loc[i]['idx']==1 and i!=0:
        all_sentences_dev.append(temp_sentence_dev)
        temp_sentence_dev =[]
    temp_sentence_dev.append((df_dev.loc[i]['name'], df_dev.loc[i]['tag']))
    
from_tag = None
tag_sequence = []
sentence_scores = [] # score for each tag to word score for each sentence

for sentence in all_sentences_dev:
    curr_sentence_scores = []
    curr_sequence = []
    for i in range(len(sentence)):
        max_score = float('-inf') # initialize max score
        for j in range(len(all_tags)):
            curr_score = 1
            if i==0:
                curr_score *= start_tags_prob[all_tags[j]]
            else:
#                 if str('(' + from_tag + ', ' + all_tags[j] + ')') in trans_prob_dict: #optional
                curr_score *= trans_prob_dict['(' + from_tag + ', ' + all_tags[j] + ')']
            
            if str('(' + all_tags[j] + ', ' + sentence[i][0] + ')') in emmission_prob_dict:
                curr_score *= emmission_prob_dict['(' + all_tags[j] + ', ' + sentence[i][0] + ')']
            else:
                curr_score *= emmission_prob_dict['(' + all_tags[j] + ', ' + '<unk>' + ')']
            
            if curr_score>max_score:
                max_score = curr_score
                highest_score_tag = all_tags[j]
        
        from_tag = highest_score_tag
        curr_sequence.append(highest_score_tag)
        curr_sentence_scores.append(max_score)
    sentence_scores.append(curr_sentence_scores)
    tag_sequence.append(curr_sequence)

def accuracy_finder():
    frequency = 0
    cur_tag_freq = 0
    
    for i in range(len(all_sentences_dev)):
        for j in range(len(all_sentences_dev[i])):
            if tag_sequence[i][j]==all_sentences_dev[i][j][1]:
                cur_tag_freq+=1
            frequency+=1
#     print(cur_tag_freq, frequency)
    return cur_tag_freq/frequency, cur_tag_freq, frequency

print(accuracy_finder())

(0.9351048568891318, 123201, 131751)
