In [268]:
import os
import sys
import numpy as np

In [269]:
def getFileContents(filename):
    data = None
    with open(filename, 'r') as f:
        data = f.readlines()
    return data

In [270]:
def getFileFromCommandLine():
    filename = sys.argv[1]
    return getFileContents(filename)

In [271]:
def splitWordTag(word_tag_pair):
    splitted = word_tag_pair.split('/')
    tag = splitted[-1]
    word = '/'.join(splitted[:-1])
    return word, tag

In [272]:
def getUniqueTags(tagged_data):
    tags = {}
    for line in tagged_data:
        word_tag_pairs = line.strip().split(' ')
        for word_tag_pair in word_tag_pairs:
            word, tag = splitWordTag(word_tag_pair)
            if tag in tags.keys():
                tags[tag] += 1
            else:
                tags[tag] = 1
    return tags

In [273]:
def getOpenProbabilities(tagged_data, all_tags_dict):
    sentences_count = len(tagged_data)
    open_tag_count_dict = {}
    for line in tagged_data:
        first_word_tag_pairs = line.strip().split(' ')[0]
        word, tag = splitWordTag(first_word_tag_pairs)
        if tag in open_tag_count_dict.keys():
            open_tag_count_dict[tag] += 1
        else:
            open_tag_count_dict[tag] = 1
    
    incrementer = 0.001
    
    #increment all existing tags count to one
    open_tag_count_dict.update((tag, occurances + incrementer) for tag, occurances in open_tag_count_dict.items())
    sentences_count += (sentences_count*incrementer)
    
    #add one two non-opening tags
    for tag in all_tags_dict.keys():
        try:
            val = open_tag_count_dict[tag]
        except KeyError as e:
            open_tag_count_dict[tag] = incrementer
            sentences_count += incrementer
    
    open_tag_count_dict.update((tag, (occurances*1.0)/sentences_count) for tag, occurances in open_tag_count_dict.items())
    return open_tag_count_dict

In [274]:
def getCloseProbabilities(tagged_data, all_tags_dict):
    sentences_count = len(tagged_data)
    close_tag_count_dict = {}
    for line in tagged_data:
        last_word_tag_pairs = line.strip().split(' ')[-1]
        word, tag = splitWordTag(last_word_tag_pairs)
        if tag in close_tag_count_dict.keys():
            close_tag_count_dict[tag] += 1
        else:
            close_tag_count_dict[tag] = 1
            
    incrementer = 0.001

    #increment all existing tags count to one
    close_tag_count_dict.update((tag, occurances + incrementer) for tag, occurances in close_tag_count_dict.items())
    sentences_count += (sentences_count*incrementer)
    
    #add one two non-closing tags
    for tag in all_tags_dict.keys():
        try:
            val = close_tag_count_dict[tag]
        except KeyError as e:
            close_tag_count_dict[tag] = incrementer
            sentences_count += incrementer
            
    close_tag_count_dict.update((tag, (occurances*1.0)/sentences_count) for tag, occurances in close_tag_count_dict.items())
    return close_tag_count_dict

In [275]:
def buildTransitionMatrix(tagged_data, tags_dict):
    tags = tags_dict.keys()
    tags.sort()
    
    tags_index_dict = {}
    for index, tag in enumerate(tags):
        tags_index_dict[tag] = index
    
    tag_count = len(tags)
    
    #Change this line to np.ones for add 1 smoothing
    transition_matrix = np.zeros(shape=(tag_count, tag_count))
    
    for line in tagged_data:
        prev_tag = None
        word_tag_pairs = line.strip().split(' ')
        
        for word_tag_pair in word_tag_pairs:
            word, tag = splitWordTag(word_tag_pair)
            
            if prev_tag is not None:
                transition_matrix[tags_index_dict[prev_tag]][tags_index_dict[tag]] += 1
            
            prev_tag = tag
            
    incrementer = 0.001
            
    transition_matrix = transition_matrix + incrementer
    probability_transition_matrix = transition_matrix/transition_matrix.sum(axis=1, keepdims=True)
    probability_transition_matrix[np.isnan(probability_transition_matrix)] = 0
    return probability_transition_matrix, tags_index_dict
        

In [276]:
def getUniqueWords(tagged_data):
    words = []
    for line in tagged_data:
        word_tag_pairs = line.strip().split(' ')
        
        for word_tag_pair in word_tag_pairs:
            word, tag = splitWordTag(word_tag_pair)
            words.append(word)
    return list(set(words))

In [277]:
def computeEmissionProbabilities(tagged_data, tags_dict):
    tags = tags_dict.keys()
    tags.sort()
    
    words = getUniqueWords(tagged_data)
    words.sort()
    
    tags_index_dict = {}
    for index, tag in enumerate(tags):
        tags_index_dict[tag] = index
        
    words_index_dict = {}
    for index, word in enumerate(words):
        words_index_dict[word] = index
    
    tag_count = len(tags)
    word_count = len(words)
    
    # word_count + 1 => Last column for unseen words
    emission_matrix = np.zeros(shape=(tag_count, word_count + 1))
    
    for line in tagged_data:
        prev_tag = None
        word_tag_pairs = line.strip().split(' ')
        
        for word_tag_pair in word_tag_pairs:
            word, tag = splitWordTag(word_tag_pair)
            
            emission_matrix[tags_index_dict[tag]][words_index_dict[word]] += 1
            
            prev_tag = tag
    #increment 1 in all the elements so that the last col for unseen words have non zero values
    incrementer = 0.001
    emission_matrix = emission_matrix + incrementer
    probability_emission_matrix = emission_matrix/emission_matrix.sum(axis=1, keepdims=True)
    probability_emission_matrix[np.isnan(probability_emission_matrix)] = 0
    return probability_emission_matrix, tags_index_dict, words_index_dict

In [278]:
def printEmissionProbabilities(count):
    counter = 0
    global probability_emission_matrix, tags_index_dict, words_index_dict
    word_count = len(words_index_dict.keys())
    tag_count = len(tags_index_dict.keys())
    for word, word_index in words_index_dict.iteritems():
        for tag, tag_index in tags_index_dict.iteritems():
            if probability_emission_matrix[tag_index][word_index] != 0:
                print tag, " => ", word, ' => ', probability_emission_matrix[tag_index][word_index]
                counter += 1
                if counter > count:
                    return

In [279]:
def getMostProbableTags(sentence):
    global opening_probabilities, closing_probabilities, probability_transition_matrix, probability_emission_matrix, tags_index_dict, words_index_dict
    global tag_count
    
    sentence_words = sentence.strip().split(' ')
    
    sentence_len = len(sentence_words)
    
    viterbi_matrix = np.zeros(shape=(tag_count, sentence_len))
    
    tracing_matrix = [[None for x in range(sentence_len)] for y in range(tag_count)]
    
    for word_index in range(sentence_len):
        word = sentence_words[word_index]
        for model_tag, model_tag_index in tags_index_dict.iteritems():
            
            try:
                word_emission_probability = probability_emission_matrix[model_tag_index][words_index_dict[word]]
            except KeyError as e:
                word_emission_probability = probability_emission_matrix[model_tag_index][-1]
            
            if word_index == 0:
                try:
                    tag_opening_probability = opening_probabilities[model_tag]
                except KeyError as e:
                    print "tag_opening_probability : Keyerror encountered"
                    tag_opening_probability = 1.1754943508222875e-10
                viterbi_matrix[model_tag_index][word_index] = tag_opening_probability * word_emission_probability
            else:
                max_probability = -1
                max_tag = None
                for prev_model_tag, prev_model_tag_index in tags_index_dict.iteritems():
                    tag_transition_probability = probability_transition_matrix[prev_model_tag_index][model_tag_index]
                    if tag_transition_probability == 0.0:
                        print "Transition probability still zero"
                        tag_transition_probability = 1.1754943508222875e-10
                    temp_probability = viterbi_matrix[prev_model_tag_index][word_index-1] * tag_transition_probability * word_emission_probability  
                    if temp_probability > max_probability:
                        max_probability = temp_probability
                        max_tag = prev_model_tag
                        
                viterbi_matrix[model_tag_index][word_index] = max_probability
                tracing_matrix[model_tag_index][word_index] = max_tag
    
    max_probability = -1
    max_probability_tag = None
    for model_tag, model_tag_index in tags_index_dict.iteritems():
        temp_probability = 0.0
        try:
            tag_closing_probabilities = closing_probabilities[model_tag]
        except KeyError as e:
            print "tag_closing_probabilities : Keyerror encountered", 
            tag_closing_probabilities = 1.1754943508222875e-10
        temp_probability =  tag_closing_probabilities * viterbi_matrix[model_tag_index][sentence_len-1]
        if temp_probability > max_probability:
            max_probability = temp_probability
            max_probability_tag = model_tag

    assigned_tags = [max_probability_tag]
    current_best_tag = max_probability_tag
    for col in range(sentence_len-1, 0, -1):
        current_best_tag = tracing_matrix[tags_index_dict[current_best_tag]][col]
        assigned_tags.append(current_best_tag)
    assigned_tags = assigned_tags[::-1]
    
    anotated_sentence = ''
    for index, assigned_tag in enumerate(assigned_tags):
        anotated_sentence += str(sentence_words[index]) + '/' + str(assigned_tag) + ' '
    
    
    return anotated_sentence.strip()
    
    

In [280]:
tagged_data = getFileContents('data/en_train_tagged.txt')
tags_dict = getUniqueTags(tagged_data)

In [281]:
opening_probabilities = getOpenProbabilities(tagged_data, tags_dict)
closing_probabilities = getCloseProbabilities(tagged_data, tags_dict)

In [282]:
probability_transition_matrix, tags_index_dict = buildTransitionMatrix(tagged_data, tags_dict)

In [283]:
probability_emission_matrix, tags_index_dict, words_index_dict = computeEmissionProbabilities(tagged_data, tags_dict)

In [284]:
printEmissionProbabilities(5)

PRP$  =>  Unemployent  =>  3.23868492551e-07
VBG  =>  Unemployent  =>  2.98536603424e-07
FW  =>  Unemployent  =>  8.87524074091e-06
NFP  =>  Unemployent  =>  2.7958498405e-06
``  =>  Unemployent  =>  1.20095163407e-06
VBN  =>  Unemployent  =>  2.50898656262e-07


In [285]:
tag_count = len(tags_index_dict.keys())

In [286]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [287]:

# sentence = "Bush also nominated A. Noel Anketell Kramer for a 15 - year term as associate judge of the District of Columbia Court of Appeals , replacing John Montague Steadman ."
sentence = "The sheikh in wheel - chair has been attacked with a F - 16 - launched bomb ."
# sentence = "the Pakistan military"
getMostProbableTags(sentence)

'The/DT sheikh/JJS in/IN wheel/NN -/HYPH chair/NN has/VBZ been/VBN attacked/VBN with/IN a/DT F/NNP -/HYPH 16/CD -/HYPH launched/VBN bomb/NN ./.'

In [288]:
dev_tagged_data = getFileContents('data/en_dev_tagged.txt')
dev_untagged_data = getFileContents('data/en_dev_raw.txt')

In [289]:
correct = 0
total = 0
for index, test_line in enumerate(dev_untagged_data):
    predicted_tagged_line = getMostProbableTags(test_line)
    expected_tagged_line = dev_tagged_data[index]
    predicted_word_tag_pairs = predicted_tagged_line.strip().split(' ')
    expected_word_tag_pairs = expected_tagged_line.strip().split(' ')
    for index, predicted_word in enumerate(predicted_word_tag_pairs):
        if predicted_word == expected_word_tag_pairs[index]:
            correct += 1
        total += 1
        if total % 100 == 0:
            print correct, total, " => ", (correct*100.0)/total
accuracy = (correct*100.0)/total
print accuracy

85 100  =>  85.0
178 200  =>  89.0
267 300  =>  89.0
357 400  =>  89.25
452 500  =>  90.4
541 600  =>  90.1666666667
631 700  =>  90.1428571429
719 800  =>  89.875
807 900  =>  89.6666666667
899 1000  =>  89.9
988 1100  =>  89.8181818182
1075 1200  =>  89.5833333333
1167 1300  =>  89.7692307692
1244 1400  =>  88.8571428571
1328 1500  =>  88.5333333333
1421 1600  =>  88.8125
1508 1700  =>  88.7058823529
1597 1800  =>  88.7222222222
1687 1900  =>  88.7894736842
1779 2000  =>  88.95
1865 2100  =>  88.8095238095
1953 2200  =>  88.7727272727
2047 2300  =>  89.0
2136 2400  =>  89.0
2233 2500  =>  89.32
2317 2600  =>  89.1153846154
2406 2700  =>  89.1111111111
2493 2800  =>  89.0357142857
2585 2900  =>  89.1379310345
2671 3000  =>  89.0333333333
2761 3100  =>  89.064516129
2848 3200  =>  89.0
2936 3300  =>  88.9696969697
3018 3400  =>  88.7647058824
3109 3500  =>  88.8285714286
3194 3600  =>  88.7222222222
3286 3700  =>  88.8108108108
3372 3800  =>  88.7368421053
3460 3900  =>  88.7179487179
