In [None]:
import os
import sys
import numpy as np
import math
import cProfile
%load_ext line_profiler

In [None]:
def readDatafromFile():
    """Read the training corpus and return it as a list of lines.

    The file ``pt-ud-train.conllu`` is looked up in the current working
    directory.

    Returns:
        list of str: every line of the file, trailing newlines included.

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    filename = "pt-ud-train.conllu"
    datadir = os.path.join(os.getcwd(), filename)
    # The context manager closes the handle even if reading fails; the
    # previous version left the file open for the life of the kernel.
    with open(datadir, "r") as fileOpen:
        return fileOpen.readlines()

In [None]:
# Load the training corpus lines once; the cells below reuse `fileData`.
fileData = readDatafromFile()

In [None]:
def getTagWordfromPair(wordTagPair):
    """Split a ``word/TAG`` token into its tag and its word.

    The tag is the text after the last ``/``; everything before that slash
    is the word, so words that themselves contain slashes stay intact. A
    token without any slash is returned as the tag, with an empty word.

    Args:
        wordTagPair (str): a single token.

    Returns:
        tuple: ``(tag, word)`` -- note that the tag comes first.
    """
    word, _, tag = wordTagPair.rpartition("/")
    return tag, word

In [None]:
def getTotalWordTagOccurrences(fileData):
    """Count how often every word and every tag occurs in the corpus.

    Each line is stripped of its newline and split on single spaces; each
    resulting token is separated into tag and word by getTagWordfromPair.

    Args:
        fileData: iterable of corpus lines.

    Returns:
        tuple: ``(tagCounts, wordCounts)`` -- two dicts mapping tag -> count
        and word -> count. The tag dict comes first.
    """
    wordCounts = {}
    tagCounts = {}
    for sentence in fileData:
        for token in sentence.strip("\n").split(" "):
            tag, word = getTagWordfromPair(token)
            wordCounts[word] = wordCounts.get(word, 0) + 1
            tagCounts[tag] = tagCounts.get(tag, 0) + 1
    return tagCounts, wordCounts

def getUniqueTags(totalTagOccurrences):
    """Return the distinct tags as a sorted list.

    Args:
        totalTagOccurrences (dict): tag -> occurrence count.

    Returns:
        list: the dict's keys in ascending order.
    """
    # sorted() builds a new list from any iterable of keys. Calling .sort()
    # on the result of keys() only works where keys() returns a list and
    # raises AttributeError where it returns a view (Python 3).
    return sorted(totalTagOccurrences.keys())


In [None]:
def getTagDictonaries(uniqueTags):
    """Build the two lookup tables between tags and their positions.

    Args:
        uniqueTags: iterable of tags; a tag's index is its position in it.

    Returns:
        tuple: ``(tagToIndex, indexToTag)`` dicts.
    """
    numbered = list(enumerate(uniqueTags))
    tagToIndex = {tag: position for position, tag in numbered}
    indexToTag = dict(numbered)
    return tagToIndex, indexToTag

In [None]:
def getUniqueWords(totalWordOccurrences):
    """Return the distinct words as a sorted list.

    Args:
        totalWordOccurrences (dict): word -> occurrence count.

    Returns:
        list: the dict's keys in ascending order.
    """
    # sorted() works whether keys() yields a list or a view; the previous
    # keys().sort() raised AttributeError on a view (Python 3).
    return sorted(totalWordOccurrences.keys())

def getWordDictionaries(uniqueWords):
    """Build the two lookup tables between words and their positions.

    Args:
        uniqueWords: iterable of words; a word's index is its position in it.

    Returns:
        tuple: ``(wordToIndex, indexToWord)`` dicts.
    """
    indexed = list(enumerate(uniqueWords))
    indexToWord = dict(indexed)
    wordToIndex = {word: slot for slot, word in indexed}
    return wordToIndex, indexToWord

In [None]:
# Count words and tags in the training lines, derive the sorted inventories,
# and build the item -> index and index -> item lookup tables that the
# matrix-building cells below read.
totalTagOccurrences, totalWordOccurrences = getTotalWordTagOccurrences(fileData)
uniqueTags = getUniqueTags(totalTagOccurrences)
uniqueWords = getUniqueWords(totalWordOccurrences)
tagIndexDict, tagIndexDictReverse = getTagDictonaries(uniqueTags)
wordIndexDict, wordIndexDictReverse = getWordDictionaries(uniqueWords)

In [None]:
def getTransitionMatrix(fileData, uniqueTags, tagIndexDict):
    """Estimate tag-to-tag transition probabilities from the corpus.

    Every cell starts at one (add-one smoothing), each adjacent tag pair
    inside a line adds one to ``[previous tag, next tag]``, and finally each
    row is divided by its own sum.

    Args:
        fileData: iterable of corpus lines of space-separated ``word/TAG``
            tokens.
        uniqueTags: the tag inventory; only its length is used here.
        tagIndexDict (dict): tag -> row/column index.

    Returns:
        numpy.ndarray: square matrix whose rows each sum to one.

    Raises:
        KeyError: if a tag in the data is missing from ``tagIndexDict``.
    """
    tagCount = len(uniqueTags)
    counts = np.ones(shape=(tagCount, tagCount))

    for sentence in fileData:
        tags = [token.split("/")[-1] for token in sentence.strip("\n").split(" ")]
        # Pair each tag with its successor; a one-token line yields no pairs.
        for previousTag, nextTag in zip(tags, tags[1:]):
            counts[tagIndexDict[previousTag], tagIndexDict[nextTag]] += 1

    rowTotals = counts.sum(axis=1, keepdims=True)
    return counts / rowTotals


In [None]:
def getInitialProbabilities(fileData, uniqueTags):
    """Estimate the probability of each tag starting a sentence.

    The tag of the first token of every line is counted. Every tag in
    ``uniqueTags`` then receives a tiny additive smoothing term (also added
    to the denominator) so that no listed tag ends up with probability zero.

    Args:
        fileData: sequence of corpus lines; its length is the base
            denominator.
        uniqueTags: iterable of tags to smooth.

    Returns:
        dict: tag -> probability.
    """
    smoothing = 0.000000001

    startCounts = {}
    for sentence in fileData:
        firstToken = sentence.strip("\n").partition(" ")[0]
        firstTag = firstToken.rpartition("/")[2]
        startCounts[firstTag] = startCounts.get(firstTag, 0) + 1

    denominator = len(fileData)
    for tag in uniqueTags:
        startCounts[tag] = startCounts.get(tag, 0) + smoothing
        denominator += smoothing

    return {tag: count * 1.0 / denominator for tag, count in startCounts.items()}

In [None]:
def getEndProbabilities(fileData, uniqueTags):
    """Estimate the probability of each tag ending a sentence.

    The tag of the last token of every line is counted. Every tag in
    ``uniqueTags`` then receives a tiny additive smoothing term (also added
    to the denominator) so that no listed tag ends up with probability zero.

    Args:
        fileData: sequence of corpus lines; its length is the base
            denominator.
        uniqueTags: iterable of tags to smooth.

    Returns:
        dict: tag -> probability.
    """
    epsilon = 0.000000001

    finalCounts = {}
    for sentence in fileData:
        closingToken = sentence.strip("\n").rpartition(" ")[2]
        closingTag = closingToken.rpartition("/")[2]
        finalCounts[closingTag] = finalCounts.get(closingTag, 0) + 1

    normaliser = len(fileData)
    for tag in uniqueTags:
        finalCounts[tag] = finalCounts.get(tag, 0) + epsilon
        normaliser += epsilon

    return dict((tag, tally * 1.0 / normaliser) for tag, tally in finalCounts.items())

In [None]:
def getEmissionMatrix(fileData):
    """Estimate per-tag word emission probabilities from the corpus.

    Counts every (tag, word) occurrence and divides each tag's row by its
    total, with no smoothing. The matrix size and the index lookups come
    from the notebook-level names ``uniqueTags``, ``uniqueWords``,
    ``tagIndexDict`` and ``wordIndexDict``, which must already be defined.

    Args:
        fileData: iterable of corpus lines of space-separated ``word/TAG``
            tokens.

    Returns:
        numpy.ndarray: matrix with one row per tag and one column per word.

    Raises:
        KeyError: if a tag or word in the data is missing from the lookups.
    """
    global uniqueTags, uniqueWords, tagIndexDict, wordIndexDict
    counts = np.zeros(shape=(len(uniqueTags), len(uniqueWords)))

    for sentence in fileData:
        for token in sentence.strip("\n").split(" "):
            tag, word = getTagWordfromPair(token)
            counts[tagIndexDict[tag], wordIndexDict[word]] += 1

    rowTotals = counts.sum(axis=1, keepdims=True)
    return counts / rowTotals

In [None]:
SEPARATOR = '******######******######******######******######******'

def writeModelParameters(transitionMatrix, emissionMatrix, initialProbablities, endProbablities, uniqueTags, uniqueWords, tagIndexDict, wordIndexDict, filename='hmmmodel.txt'):
    """Serialise the HMM parameters to a plain-text file.

    File layout, one item per line, sections divided by ``SEPARATOR``:

    * number of tags, number of words
    * ``Transition Matrix:`` followed by comma-separated rows
    * ``Emission Matrix:`` followed by comma-separated rows
    * ``Initial Probabilities:`` followed by ``tag<TAB>probability``
    * ``End Probabilities:`` followed by ``tag<TAB>probability<TAB>tag index``
    * ``Unique Tags:`` followed by one tab-separated line of tags
    * ``Unique Words:`` followed by ``word<TAB>word index``

    Args:
        transitionMatrix, emissionMatrix: iterables of rows of numbers.
        initialProbablities, endProbablities (dict): tag -> probability.
        uniqueTags, uniqueWords: sequences of tags / words.
        tagIndexDict, wordIndexDict (dict): item -> index.
        filename (str): destination path; overwritten if it exists.
    """
    def matrixRows(matrix):
        # One comma-separated text line per matrix row.
        return [','.join(str(cell) for cell in row) for row in matrix]

    lines = [str(len(uniqueTags)), str(len(uniqueWords)), SEPARATOR]

    lines.append("Transition Matrix:")
    lines.extend(matrixRows(transitionMatrix))
    lines.append(SEPARATOR)

    lines.append("Emission Matrix:")
    lines.extend(matrixRows(emissionMatrix))
    lines.append(SEPARATOR)

    lines.append("Initial Probabilities:")
    for tag in initialProbablities:
        lines.append(tag + "\t" + str(initialProbablities[tag]))
    lines.append(SEPARATOR)

    lines.append("End Probabilities:")
    for tag in endProbablities:
        lines.append(tag + "\t" + str(endProbablities[tag]) + "\t" + str(tagIndexDict[tag]))
    lines.append(SEPARATOR)

    lines.append("Unique Tags:")
    lines.append('\t'.join(uniqueTags))
    lines.append(SEPARATOR)

    lines.append("Unique Words:")
    for word in uniqueWords:
        lines.append(word + "\t" + str(wordIndexDict[word]))

    # Every line, the last one included, is newline-terminated.
    with open(filename, 'w') as f:
        f.write('\n'.join(lines) + '\n')

In [None]:
def readModelParameters(filename="hmmmodel.txt"):
    """Load HMM parameters from the text file written by writeModelParameters.

    The file is a sequence of sections divided by ``SEPARATOR`` lines. The
    lines before the first separator (the tag and word counts) are ignored,
    and the first line after each separator is that section's title.

    Args:
        filename (str): path of the model file. Defaults to the name
            writeModelParameters uses by default.

    Returns:
        tuple: ``(transitionMatrix, emissionMatrix, initialProbablities,
        endProbablities, uniqueTags, tagIndexDict, uniqueWords,
        wordIndexDict)`` where the matrices are lists of lists of floats,
        the probabilities are tag -> float dicts, ``uniqueTags`` and
        ``uniqueWords`` are lists and the index dicts map item -> int.

    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError, IndexError: if a data line is malformed.
    """
    transitionMatrix = []
    emissionMatrix = []
    initialProbablities = {}
    endProbablities = {}
    tagIndexDict = {}
    wordIndexDict = {}
    # Initialised locally so a short file still returns cleanly and so the
    # word list is never appended onto a same-named notebook global.
    uniqueTags = []
    uniqueWords = []

    with open(filename, 'r') as f:
        fileContents = f.readlines()

    section = 0          # number of separators seen so far
    titlePending = False  # True until the section's title line is consumed

    for rawLine in fileContents:
        # Drop only the line ending: a leading tab is significant when the
        # tag or word on the line is the empty string.
        line = rawLine.rstrip("\r\n")

        if line.strip() == SEPARATOR:
            section += 1
            titlePending = True
            continue
        if section == 0:
            continue
        if titlePending:
            titlePending = False
            continue

        if section == 1:
            # Materialise the row; map() alone is lazy on Python 3.
            transitionMatrix.append([float(value) for value in line.split(",")])
        elif section == 2:
            emissionMatrix.append([float(value) for value in line.split(",")])
        elif section == 3:
            dicentry = line.split('\t')
            initialProbablities[dicentry[0]] = float(dicentry[1])
        elif section == 4:
            dicentry = line.split('\t')
            endProbablities[dicentry[0]] = float(dicentry[1])
            tagIndexDict[dicentry[0]] = int(dicentry[2])
        elif section == 5:
            uniqueTags = line.split("\t")
        elif section == 6:
            # Split on the last tab so the index is found even if the word
            # itself contains a tab.
            word, wordIndex = line.rsplit("\t", 1)
            uniqueWords.append(word)
            wordIndexDict[word] = int(wordIndex)

    return transitionMatrix , emissionMatrix, initialProbablities, endProbablities, uniqueTags, tagIndexDict, uniqueWords, wordIndexDict

In [None]:
# Estimate all HMM parameters from the training lines and save them to disk.
transitionMatrix = getTransitionMatrix(fileData, uniqueTags, tagIndexDict)
emissionMatrix = getEmissionMatrix(fileData)
# Nested-list copy of the emission matrix, passed to the writer below.
emissionMatrixNormal = emissionMatrix.tolist()
initialProbablities = getInitialProbabilities(fileData, uniqueTags)
endProbablities = getEndProbabilities(fileData, uniqueTags)
print endProbablities
writeModelParameters(transitionMatrix, emissionMatrixNormal, initialProbablities, endProbablities, uniqueTags, uniqueWords,tagIndexDict, wordIndexDict)
#print emissionMatrix[0]
# NOTE(review): spot check that assumes the training data contains a 'HYPH'
# tag and a '-' word; it raises KeyError otherwise -- confirm against the corpus.
emissionMatrix[tagIndexDict['HYPH']][wordIndexDict['-']]

In [None]:
def getTestDataFromFile():
    """Read the raw and the tagged evaluation files.

    Both files are looked up in the ``coding1-data-corpus`` directory under
    the current working directory.

    Returns:
        tuple: ``(testData, taggedTestData)`` -- the lines of
        ``en_dev_raw.txt`` and of ``en_dev_tagged.txt``, newlines included.

    Raises:
        IOError / OSError: if either file cannot be opened.
    """
    filenameTestTagged = "en_dev_tagged.txt"
    filenameTestRaw = "en_dev_raw.txt"

    corpusDir = os.path.join(os.getcwd(), 'coding1-data-corpus')
    testdir = os.path.join(corpusDir, filenameTestRaw)
    taggedTestDir = os.path.join(corpusDir, filenameTestTagged)

    # Context managers close both handles; the previous version leaked them.
    with open(testdir, "r") as fileOpenTest:
        testData = fileOpenTest.readlines()
    with open(taggedTestDir, "r") as fileOpenTestTagged:
        taggedTestData = fileOpenTestTagged.readlines()
    return testData, taggedTestData

def getPrediction(testData, taggedTestData, transitionMatrix, emissionMatrix):
    """Tag every test sentence and write the result to ``hmmoutput.txt``.

    Each line of ``testData`` is split on single spaces and decoded with
    viterbiMatrix; the tagged sentences are written one per line. The
    elapsed wall-clock time in seconds is printed at the end.

    Args:
        testData: iterable of raw sentences, one per line.
        taggedTestData: not used by this function; kept so existing calls
            keep working.
        transitionMatrix, emissionMatrix: model parameters passed straight
            through to viterbiMatrix.
    """
    import time
    start = time.time()

    taggedSentences = []
    for sentence in testData:
        words = sentence.strip("\n").split(" ")
        taggedSentences.append(viterbiMatrix(words, transitionMatrix, emissionMatrix))

    filename = 'hmmoutput.txt'
    with open(filename, 'w') as f:
        # Every sentence, the last one included, is newline-terminated.
        f.write(''.join(tagged + "\n" for tagged in taggedSentences))

    # Call form prints the same text on Python 2 and is valid on Python 3.
    print(time.time() - start)
        
    


def viterbiMatrix(words, transitionMatrix, emissionMatrix):
    """Tag one sentence with its most probable tag sequence (Viterbi).

    Besides its arguments the function reads these notebook-level names,
    which must already be defined: ``uniqueTags``, ``tagIndexDict``,
    ``tagIndexDictReverse``, ``wordIndexDict``, ``initialProbablities`` and
    ``endProbablities``.

    A word missing from ``wordIndexDict`` is retried in lower case; if it is
    still unknown, every tag gets an emission probability of 1.0 so the
    choice is driven by the transition probabilities alone.

    Probabilities are multiplied directly (no log space).
    NOTE(review): very long sentences may underflow to zero -- confirm
    whether that matters for the data at hand.

    Args:
        words (list of str): the sentence tokens.
        transitionMatrix: indexed ``[previous tag index][tag index]``.
        emissionMatrix: indexed ``[tag index][word index]``.

    Returns:
        str: space-separated ``word/TAG`` tokens; the empty string for an
        empty sentence. If no tag sequence has non-zero probability the
        tags are empty strings.
    """
    uniqueTagsLength = len(uniqueTags)
    sentenceLength = len(words)
    if sentenceLength == 0:
        # Nothing to decode; also avoids indexing column -1 further down.
        return ''

    # trellis[row][col]: probability of the best path that ends in tag `row`
    # at word `col`; zero means "no path".
    trellis = np.zeros(shape=(uniqueTagsLength, sentenceLength))
    # Back-pointers: the previous tag on that best path. Column 0 stores the
    # tag itself and is never followed during back-tracing.
    correspondingTags = [["" for y in range(sentenceLength)] for x in range(uniqueTagsLength)]

    for col in range(sentenceLength):
        word = words[col]
        # The word's column does not depend on the tag, so resolve it once.
        wordIndex = wordIndexDict.get(word)
        if wordIndex is None:
            wordIndex = wordIndexDict.get(word.lower())

        for row in range(uniqueTagsLength):
            tag = tagIndexDictReverse[row]
            if wordIndex is None:
                obsProbability = 1.0
            else:
                obsProbability = emissionMatrix[row][wordIndex]
                if obsProbability == 0.0:
                    # This tag never emits the word: the cell stays at zero.
                    continue

            if col == 0:
                trellis[row][col] = initialProbablities[tag] * obsProbability
                correspondingTags[row][col] = tag
            else:
                for prevColrow in range(uniqueTagsLength):
                    prevVal = trellis[prevColrow][col - 1]
                    if prevVal == 0.0:
                        continue
                    transitionProbability = transitionMatrix[prevColrow][row]
                    probabilityVal = transitionProbability * obsProbability * prevVal
                    if probabilityVal > trellis[row][col]:
                        correspondingTags[row][col] = tagIndexDictReverse[prevColrow]
                        trellis[row][col] = probabilityVal

    # Choose the final tag, weighting the last column by the end probabilities.
    finalStateVal = 0.0
    finalTag = ""
    for rowIndex in range(uniqueTagsLength):
        tag = tagIndexDictReverse[rowIndex]
        finalStateProbability = endProbablities[tag] * trellis[rowIndex][sentenceLength - 1]
        if finalStateProbability > finalStateVal:
            finalTag = tag
            finalStateVal = finalStateProbability

    # Follow the back-pointers from the last word to the first.
    assignedTags = [finalTag]
    currentTag = finalTag
    for colIndex in range(sentenceLength - 1, 0, -1):
        try:
            currentTag = correspondingTags[tagIndexDict[currentTag]][colIndex]
        except KeyError:
            # currentTag is "" because no path had non-zero probability;
            # show the last trellis column and keep the empty tag.
            print(trellis[:, [sentenceLength - 1]])
            print("--------------------------------")
        assignedTags.append(currentTag)

    assignedTags = assignedTags[::-1]

    predictedWordTagPair = []
    for i in range(sentenceLength):
        predictedWordTagPair.append('/'.join([words[i], assignedTags[i]]))
    return ' '.join(predictedWordTagPair)


In [None]:
# Load the evaluation sentences, tag them, and write the result to hmmoutput.txt.
testData, taggedTestData = getTestDataFromFile()
getPrediction(testData, taggedTestData, transitionMatrix, emissionMatrix)
# Disabled profiling scratch: line-profile viterbiMatrix on a single sentence.
# brokenSentence = ["President", "Bush", "on", "Tuesday", "nominated", "two", "individuals", "to", "replace", "retiring", "jurists", "on", "federal", "courts", "in", "the", "Washington", "area", "."]
# %lprun -f viterbiMatrix viterbiMatrix(brokenSentence, transitionMatrix, emissionMatrix)

In [23]:
def getFileContents(filename):
    """Return every line of a text file as a list.

    Args:
        filename (str): path of the file to read.

    Returns:
        list of str: the file's lines, trailing newlines included.
    """
    with open(filename, 'r') as handle:
        return handle.readlines()

def computeAccuracy(expected_path='en-ud-test.conllu', predicted_path='hmmoutput.txt'):
    """Print the token-level accuracy of the predictions against a reference.

    The two files are compared line by line and, within a line, token by
    token: a predicted ``word/TAG`` token is correct when it equals the
    reference token at the same position. A running tally is printed every
    100 tokens and the final percentage on the last line.

    Args:
        expected_path (str): tagged reference file.
        predicted_path (str): file with the predicted tags.

    Returns:
        None -- results are printed, so the notebook cell shows no extra
        output value.

    Raises:
        IndexError: if the prediction file has fewer lines than the
            reference, or a predicted line has more tokens than its
            reference line.
    """
    dev_tagged_data = getFileContents(expected_path)
    predicted_data = getFileContents(predicted_path)
    correct = 0
    total = 0
    for line_index, expected_tagged_line in enumerate(dev_tagged_data):
        predicted_tagged_line = predicted_data[line_index]

        predicted_word_tag_pairs = predicted_tagged_line.strip().split(' ')
        expected_word_tag_pairs = expected_tagged_line.strip().split(' ')
        # A distinct name for the token position: the previous version
        # reused the line loop's variable here.
        for token_index, predicted_word in enumerate(predicted_word_tag_pairs):
            if predicted_word == expected_word_tag_pairs[token_index]:
                correct += 1
            total += 1
            if total % 100 == 0:
                # Same text the Python 2 print statement produced.
                print("%s %s  =>  %s" % (correct, total, (correct * 100.0) / total))

    # An empty reference file has no tokens; report 0.0 instead of dividing
    # by zero.
    accuracy = (correct * 100.0) / total if total else 0.0
    print(accuracy)


# Score the tags in hmmoutput.txt against the tagged reference file.
computeAccuracy()

85 100  =>  85.0
176 200  =>  88.0
267 300  =>  89.0
360 400  =>  90.0
456 500  =>  91.2
550 600  =>  91.6666666667
641 700  =>  91.5714285714
733 800  =>  91.625
826 900  =>  91.7777777778
903 1000  =>  90.3
977 1100  =>  88.8181818182
1069 1200  =>  89.0833333333
1160 1300  =>  89.2307692308
1250 1400  =>  89.2857142857
1342 1500  =>  89.4666666667
1430 1600  =>  89.375
1522 1700  =>  89.5294117647
1610 1800  =>  89.4444444444
1708 1900  =>  89.8947368421
1798 2000  =>  89.9
1888 2100  =>  89.9047619048
1979 2200  =>  89.9545454545
2067 2300  =>  89.8695652174
2159 2400  =>  89.9583333333
2253 2500  =>  90.12
2345 2600  =>  90.1923076923
2430 2700  =>  90.0
2513 2800  =>  89.75
2591 2900  =>  89.3448275862
2672 3000  =>  89.0666666667
2760 3100  =>  89.0322580645
2850 3200  =>  89.0625
2941 3300  =>  89.1212121212
3035 3400  =>  89.2647058824
3119 3500  =>  89.1142857143
3210 3600  =>  89.1666666667
3300 3700  =>  89.1891891892
3394 3800  =>  89.3157894737
3483 3900  =>  89.307692307