In [1]:
import os
import sys
import numpy as np

In [2]:
def getFileContents(filename):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to open in text ('r') mode.

    Returns:
        A list with one string per line, line terminators included
        (the result of ``readlines()``).
    """
    with open(filename, 'r') as handle:
        return handle.readlines()

In [3]:
def getFileFromCommandLine():
    """Return the lines of the file named by the first command-line argument.

    The path is taken from ``sys.argv[1]`` and read with ``getFileContents``.
    """
    return getFileContents(sys.argv[1])

In [4]:
def splitWordTag(word_tag_pair):
    """Split a ``word/TAG`` token at its last slash.

    Everything before the final '/' is the word (so words that themselves
    contain slashes stay intact); everything after it is the tag. A token
    without any slash yields an empty word and the whole token as the tag.

    Args:
        word_tag_pair: Token of the form ``word/TAG``.

    Returns:
        A ``(word, tag)`` tuple of strings.
    """
    # rpartition returns ('', '', token) when no '/' is present, which matches
    # the split-then-rejoin behaviour for that case.
    word, _, tag = word_tag_pair.rpartition('/')
    return word, tag

In [5]:
def getUniqueTags(tagged_data):
    """Count how often each tag occurs in the tagged corpus.

    Args:
        tagged_data: Iterable of lines, each a sequence of ``word/TAG``
            tokens separated by single spaces.

    Returns:
        A dict mapping each tag to the number of tokens carrying it.
    """
    tag_counts = {}
    for sentence in tagged_data:
        for token in sentence.strip().split(' '):
            tag = splitWordTag(token)[1]
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return tag_counts

In [6]:
# Load the tagged training corpus; each list element is one line of the file.
tagged_data = getFileContents('data/en_train_tagged.txt')

In [7]:
# Count the occurrences of every tag found in the training corpus.
tags_dict = getUniqueTags(tagged_data)

In [8]:
def getOpenProbabilities(tagged_data):
    """Estimate how likely each tag is to start a sentence.

    For every line, the tag of the first space-separated token is counted;
    each count is then divided by the number of lines in ``tagged_data``.
    Lines whose first tag is 'XX' are printed as they are encountered.

    Args:
        tagged_data: Sequence of lines of space-separated ``word/TAG`` tokens.

    Returns:
        A dict mapping each sentence-initial tag to its relative frequency.
    """
    total_sentences = len(tagged_data)
    first_tag_counts = {}
    for sentence in tagged_data:
        leading_pair = sentence.strip().split(' ')[0]
        tag = splitWordTag(leading_pair)[1]
        if tag == 'XX':
            print(sentence)
        first_tag_counts[tag] = first_tag_counts.get(tag, 0) + 1

    # Turn the raw counts into relative frequencies in place.
    for tag in list(first_tag_counts):
        first_tag_counts[tag] = (first_tag_counts[tag] * 1.0) / total_sentences
    return first_tag_counts

In [20]:
def getCloseProbabilities(tagged_data):
    """Estimate how likely each tag is to end a sentence.

    For every line, the tag of the last space-separated token is counted;
    each count is then divided by the number of lines in ``tagged_data``.
    Lines whose last tag is 'XX' are printed as they are encountered.

    Args:
        tagged_data: Sequence of lines of space-separated ``word/TAG`` tokens.

    Returns:
        A dict mapping each sentence-final tag to its relative frequency.
    """
    total_sentences = len(tagged_data)
    last_tag_counts = {}
    for sentence in tagged_data:
        trailing_pair = sentence.strip().split(' ')[-1]
        tag = splitWordTag(trailing_pair)[1]
        if tag == 'XX':
            print(sentence)
        last_tag_counts[tag] = last_tag_counts.get(tag, 0) + 1

    # Turn the raw counts into relative frequencies in place.
    for tag in list(last_tag_counts):
        last_tag_counts[tag] = (last_tag_counts[tag] * 1.0) / total_sentences
    return last_tag_counts

In [21]:
# Estimate, from the training corpus, how often each tag starts and ends a line.
opening_probabilities = getOpenProbabilities(tagged_data)
closing_probabilities = getCloseProbabilities(tagged_data)

%/XX

%/XX



In [42]:
# Inspect the sentence-opening tag probabilities.
opening_probabilities

{'$': 0.004783544606553456,
 ',': 0.0009567089213106913,
 '-LRB-': 0.010922426851630392,
 '.': 0.0005580802040979033,
 'ADD': 0.008690106035238778,
 'CC': 0.023120465598341706,
 'CD': 0.03236865183767838,
 'DT': 0.11249302399744877,
 'EX': 0.009009009009009009,
 'FW': 0.00023917723032767282,
 'GW': 0.00366738419835765,
 'IN': 0.07813122857370645,
 'JJ': 0.03332536075898908,
 'JJR': 0.000797257434425576,
 'JJS': 0.004624093119668341,
 'LS': 0.006696962449174839,
 'MD': 0.010045443673762257,
 'NFP': 0.013075021924579447,
 'NN': 0.051024475803236866,
 'NNP': 0.11600095670892131,
 'NNPS': 0.0008769831778681336,
 'NNS': 0.015387068484413617,
 'PDT': 0.0011161604081958065,
 'PRP': 0.22458741927768477,
 'PRP$': 0.02064896755162242,
 'RB': 0.06569401259666746,
 'RBR': 0.0007175316909830184,
 'RBS': 0.0001594514868851152,
 'SYM': 7.97257434425576e-05,
 'TO': 0.0017539663557362673,
 'UH': 0.03173084589013792,
 'VB': 0.035398230088495575,
 'VBD': 0.0058997050147492625,
 'VBG': 0.00837120306146854

In [43]:
# Inspect the sentence-closing tag probabilities.
closing_probabilities

{"''": 0.013154747668022004,
 ',': 0.023040739854899146,
 '-RRB-': 0.015147891254085945,
 '.': 0.7904807462329586,
 ':': 0.013553376385234793,
 'ADD': 0.016503228892609422,
 'CC': 0.00023917723032767282,
 'CD': 0.007494219883600415,
 'DT': 7.97257434425576e-05,
 'JJ': 0.003428206968029977,
 'JJR': 0.00023917723032767282,
 'JJS': 0.0003189029737702304,
 'LS': 0.0003189029737702304,
 'MD': 7.97257434425576e-05,
 'NFP': 0.01132105556884318,
 'NN': 0.038268356852427646,
 'NNP': 0.04903133221717292,
 'NNPS': 0.0015147891254085944,
 'NNS': 0.006776688192617396,
 'POS': 0.0001594514868851152,
 'PRP': 0.0008769831778681336,
 'RB': 0.0026309495336044007,
 'RP': 7.97257434425576e-05,
 'SYM': 0.000398628717212788,
 'UH': 0.0013553376385234792,
 'VB': 0.0011161604081958065,
 'VBG': 0.0001594514868851152,
 'VBN': 0.0012756118950809216,
 'VBP': 0.000398628717212788,
 'VBZ': 0.0003189029737702304,
 'WRB': 7.97257434425576e-05,
 'XX': 7.97257434425576e-05,
 '``': 7.97257434425576e-05}

In [10]:
# Number of distinct tags observed at the start of a line.
print(len(opening_probabilities))

42


In [11]:
def buildTransitionMatrix(tagged_data, tags_dict):
    """Estimate tag-to-tag transition probabilities from a tagged corpus.

    Args:
        tagged_data: Iterable of lines of space-separated ``word/TAG`` tokens.
        tags_dict: Mapping whose keys are the tags occurring in
            ``tagged_data``; only the keys are used.

    Returns:
        A ``(probability_transition_matrix, tags_index_dict)`` tuple.
        ``tags_index_dict`` maps each tag to its position in the sorted tag
        list. Entry ``[i][j]`` of the matrix is the fraction of transitions
        out of tag ``i`` that go to tag ``j``; rows for tags that are never
        followed by another tag are all zeros.

    Raises:
        KeyError: If ``tagged_data`` contains a tag missing from ``tags_dict``.
    """
    # sorted() accepts any mapping/iterable, unlike keys().sort(), which only
    # works where keys() returns a list.
    tags = sorted(tags_dict)
    tags_index_dict = {tag: index for index, tag in enumerate(tags)}

    tag_count = len(tags)
    transition_matrix = np.zeros(shape=(tag_count, tag_count))

    for line in tagged_data:
        prev_tag = None
        for word_tag_pair in line.strip().split(' '):
            _, tag = splitWordTag(word_tag_pair)
            # The first token of a line has no predecessor to transition from.
            if prev_tag is not None:
                transition_matrix[tags_index_dict[prev_tag], tags_index_dict[tag]] += 1
            prev_tag = tag

    # Normalise each row, leaving rows without any outgoing transition at zero
    # instead of dividing 0/0 and patching up the resulting NaNs afterwards.
    row_totals = transition_matrix.sum(axis=1, keepdims=True)
    probability_transition_matrix = np.divide(
        transition_matrix,
        row_totals,
        out=np.zeros_like(transition_matrix),
        where=row_totals != 0,
    )
    return probability_transition_matrix, tags_index_dict
        

In [12]:
# Build the tag-to-tag transition probabilities and the tag -> index mapping.
probability_transition_matrix, tags_index_dict = buildTransitionMatrix(tagged_data, tags_dict)



In [13]:
def getUniqueWords(tagged_data):
    """Collect the distinct words of the tagged corpus.

    Args:
        tagged_data: Iterable of lines of space-separated ``word/TAG`` tokens.

    Returns:
        A list containing every distinct word once, in no particular order.
    """
    vocabulary = {
        splitWordTag(token)[0]
        for sentence in tagged_data
        for token in sentence.strip().split(' ')
    }
    return list(vocabulary)

In [14]:
def computeEmissionProbabilities(tagged_data, tags_dict, verbose=True):
    """Estimate word emission probabilities per tag from a tagged corpus.

    Args:
        tagged_data: Iterable of lines of space-separated ``word/TAG`` tokens.
        tags_dict: Mapping whose keys are the tags occurring in
            ``tagged_data``; only the keys are used.
        verbose: When true (the default), print the raw count of the word
            'asked' under tag 'VBD' (only if both exist in the corpus),
            followed by the resulting probability matrix.

    Returns:
        A ``(probability_emission_matrix, tags_index_dict, words_index_dict)``
        tuple. The index dicts map each tag / word to its position in the
        sorted tag / word list. Entry ``[t][w]`` of the matrix is the fraction
        of tokens tagged ``t`` whose word is ``w``; rows for tags without any
        token are all zeros.

    Raises:
        KeyError: If ``tagged_data`` contains a tag missing from ``tags_dict``.
    """
    # sorted() accepts any mapping/iterable, unlike keys().sort(), which only
    # works where keys() returns a list.
    tags_index_dict = {tag: index for index, tag in enumerate(sorted(tags_dict))}
    words_index_dict = {
        word: index for index, word in enumerate(sorted(getUniqueWords(tagged_data)))
    }

    emission_matrix = np.zeros(shape=(len(tags_index_dict), len(words_index_dict)))

    for line in tagged_data:
        for word_tag_pair in line.strip().split(' '):
            word, tag = splitWordTag(word_tag_pair)
            emission_matrix[tags_index_dict[tag], words_index_dict[word]] += 1

    # Spot-check output. Guarded so that a corpus lacking this particular tag
    # or word no longer aborts the whole computation with a KeyError.
    if verbose and 'VBD' in tags_index_dict and 'asked' in words_index_dict:
        print(emission_matrix[tags_index_dict['VBD'], words_index_dict['asked']])

    # Normalise each row, leaving rows without any observation at zero instead
    # of dividing 0/0 and patching up the resulting NaNs afterwards.
    row_totals = emission_matrix.sum(axis=1, keepdims=True)
    probability_emission_matrix = np.divide(
        emission_matrix,
        row_totals,
        out=np.zeros_like(emission_matrix),
        where=row_totals != 0,
    )
    if verbose:
        print(probability_emission_matrix)
    return probability_emission_matrix, tags_index_dict, words_index_dict

In [15]:
# Build the tag-by-word emission probabilities and the word -> index mapping.
# tags_index_dict is reassigned here from this call's return value.
probability_emission_matrix, tags_index_dict, words_index_dict = computeEmissionProbabilities(tagged_data, tags_dict)

53.0
[[ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.03694268  0.          0.        ]
 [ 0.          0.          0.00012412 ...,  0.          0.          0.00012412]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.00123001  0.          0.        ]]


In [40]:
def printEmissionProbabilities(max_entries=21):
    """Print a sample of the non-zero emission probabilities.

    Walks the module-level ``words_index_dict`` and ``tags_index_dict`` and
    prints one ``TAG  =>  word  =>  probability`` line for every non-zero cell
    of ``probability_emission_matrix`` until ``max_entries`` lines were shown.

    Args:
        max_entries: Maximum number of lines to print. Defaults to 21, the
            number of lines the previously hard-coded cut-off produced.
    """
    if max_entries <= 0:
        return

    printed = 0
    for word, word_index in words_index_dict.items():
        for tag, tag_index in tags_index_dict.items():
            probability = probability_emission_matrix[tag_index][word_index]
            if probability != 0:
                # Join the str() of each part with single spaces, which is the
                # layout a comma-separated print statement produces.
                print(' '.join(str(part) for part in (tag, " => ", word, ' => ', probability)))
                printed += 1
                if printed >= max_entries:
                    return

In [41]:
# Show a sample of the non-zero emission probabilities.
printEmissionProbabilities()

NN  =>  Unemployent  =>  3.71581450654e-05
NNS  =>  Andreas  =>  0.000118553645525
NN  =>  rebel  =>  7.43162901308e-05
NNP  =>  CONFIRMIT  =>  8.03277371676e-05
MD  =>  Woud  =>  0.000303582270795
JJ  =>  unpretentious  =>  8.62663906142e-05
ADD  =>  http://www.tecsoc.org/pubs/history/2002/apr26.htm  =>  0.00342465753425
JJ  =>  yellow  =>  0.000517598343685
NNPS  =>  Heights  =>  0.00200803212851
CD  =>  four  =>  0.00800800800801
VBZ  =>  Does  =>  0.00349497597204
VBG  =>  hanging  =>  0.0012012012012
NN  =>  hanging  =>  3.71581450654e-05
NN  =>  flatbread  =>  7.43162901308e-05
JJ  =>  succulent  =>  8.62663906142e-05
IN  =>  Until  =>  0.00019303156066
NN  =>  aggression  =>  3.71581450654e-05
NN  =>  payoff  =>  3.71581450654e-05
VBG  =>  looking  =>  0.0246246246246
JJ  =>  LAST  =>  8.62663906142e-05
VBG  =>  granting  =>  0.0003003003003


In [34]:
def getMostProbableTags(sentence):
    """Tag a sentence with the Viterbi algorithm and print the result.

    Reads the module-level model tables ``opening_probabilities``,
    ``closing_probabilities``, ``probability_transition_matrix``,
    ``probability_emission_matrix``, ``tags_index_dict`` and
    ``words_index_dict``.

    Words that are not in ``words_index_dict`` are given the same emission
    weight for every tag, so their tag is decided by the opening/transition/
    closing probabilities alone. (Giving them probability zero would zero out
    every path through the sentence and make the whole tagging arbitrary.)

    Args:
        sentence: Sentence whose tokens are separated by single spaces.

    Returns:
        None. The tagged sentence is printed as ``word/TAG`` tokens, each
        followed by one space.
    """
    sentence_words = sentence.strip().split(' ')
    sentence_len = len(sentence_words)

    # Materialise the (tag, row index) pairs once; every pass below visits the
    # tags in this same order, which is what breaks ties between equal scores.
    tag_items = list(tags_index_dict.items())
    tag_count = len(tag_items)

    # viterbi_matrix[t][i]: score of the best tag sequence for words 0..i that
    # ends in tag t. tracing_matrix[t][i]: the tag chosen for word i-1 on it.
    viterbi_matrix = np.zeros(shape=(tag_count, sentence_len))
    tracing_matrix = [[None] * sentence_len for _ in range(tag_count)]

    for word_index, word in enumerate(sentence_words):
        # One dictionary lookup per word, instead of scanning the list of all
        # vocabulary keys once for every (word, tag) combination.
        vocabulary_index = words_index_dict.get(word)

        for model_tag, model_tag_index in tag_items:
            if vocabulary_index is None:
                # Unknown word: uniform weight, see the docstring.
                word_emission_probability = 1.0
            else:
                word_emission_probability = probability_emission_matrix[model_tag_index][vocabulary_index]

            if word_index == 0:
                # Tags never seen at the start of a sentence get probability 0.
                viterbi_matrix[model_tag_index][word_index] = (
                    opening_probabilities.get(model_tag, 0.0) * word_emission_probability
                )
                continue

            max_probability = -1
            max_tag = None
            for prev_model_tag, prev_model_tag_index in tag_items:
                temp_probability = (
                    viterbi_matrix[prev_model_tag_index][word_index - 1]
                    * probability_transition_matrix[prev_model_tag_index][model_tag_index]
                    * word_emission_probability
                )
                if temp_probability > max_probability:
                    max_probability = temp_probability
                    max_tag = prev_model_tag

            viterbi_matrix[model_tag_index][word_index] = max_probability
            tracing_matrix[model_tag_index][word_index] = max_tag

    # Pick the best final tag, weighting each path by how likely its last tag
    # is to close a sentence (0 for tags never seen in final position).
    max_probability = -1
    max_probability_tag = None
    for model_tag, model_tag_index in tag_items:
        temp_probability = (
            closing_probabilities.get(model_tag, 0.0)
            * viterbi_matrix[model_tag_index][sentence_len - 1]
        )
        if temp_probability > max_probability:
            max_probability = temp_probability
            max_probability_tag = model_tag

    # Follow the back-pointers from the last word to the first.
    assigned_tags = [max_probability_tag]
    current_best_tag = max_probability_tag
    for col in range(sentence_len - 1, 0, -1):
        current_best_tag = tracing_matrix[tags_index_dict[current_best_tag]][col]
        assigned_tags.append(current_best_tag)
    assigned_tags.reverse()

    anotated_sentence = ''.join(
        str(word) + '/' + str(assigned_tag) + ' '
        for word, assigned_tag in zip(sentence_words, assigned_tags)
    )
    print(anotated_sentence)
    
    

In [35]:
# Tag an example sentence and print the result.
sentence = "One of them was from the Jubur tribe and was deputy commander of the Hawijah garrison ."
getMostProbableTags(sentence)

One/CD of/IN them/PRP was/VBD from/IN the/DT Jubur/NNP tribe/NN and/CC was/VBD deputy/NN commander/NN of/IN the/DT Hawijah/NNP garrison/NN ./. 


In [37]:
"""One/CD of/IN them/PRP was/VBD from/IN the/DT Jubur/NNP tribe/NN and/CC was/VBD deputy/NN commander/NN of/IN the/DT Hawijah/NNP garrison/NN ./.""" 

'One/CD of/IN them/PRP was/VBD from/IN the/DT Jubur/NNP tribe/NN and/CC was/VBD deputy/NN commander/NN of/IN the/DT Hawijah/NNP garrison/NN ./.'