In [212]:
import sys 
import math
class hmm_code():
    """Viterbi tagger that decodes sentences with a pre-computed HMM model.

    Each useful line of the model file has the shape::

        ?Emission TAG? word||score word||score ...
        ?Transition TAG? next_tag||score next_tag||score ...

    where ``?`` stands for one delimiter character: the first character of
    the first token and the last character of the second token are dropped.
    Scores are added along a path (never multiplied), so the model is
    presumably stored in log space -- confirm against the training script.
    """

    # A tag is treated as "open class" when it emits more than this fraction
    # of the whole vocabulary.
    OPEN_CLASS_RATIO = 0.0350

    def __init__(self,modelFile,hmmCodeFile):
        """Store the model path and the path of the raw text to tag."""
        self.modelFile = modelFile
        self.hmmCodeFile = hmmCodeFile

    def read_file(self,fileName):
        """Return all lines of ``fileName`` as a list (line endings kept)."""
        with open(fileName) as fn:
            return fn.readlines()

    def write_file(self,output):
        """Write ``output`` to ``./hmmoutput.txt``, replacing its content."""
        with open("./hmmoutput.txt", "w") as fp:
            fp.write(output)

    def getWordSpace(self,emission_probability_dict):
        """Return the set of words that have an emission entry for any tag."""
        wordSpace = set()
        for word_scores in emission_probability_dict.values():
            wordSpace.update(word_scores)
        return wordSpace

    def getTagSpace(self,transition_probability_dict):
        """Return every source tag of the transition table except ``<start>``."""
        tagSpace = set(transition_probability_dict)
        tagSpace.discard("<start>")
        return tagSpace

    def _storeScores(self,tagReturn,paramReturn,table):
        """Parse ``name||score`` tokens into ``table[tagReturn]``; return ``table``."""
        scores = table.setdefault(tagReturn, {})
        for element in paramReturn:
            # rsplit keeps any "||" that is part of the name itself.
            name, prob = element.rsplit("||",1)
            scores[name] = float(prob)
        return table

    def getEmissionDict(self,tagReturn,paramReturn,emission_probability_dict):
        """Add the ``word||score`` tokens of one model line under ``tagReturn``.

        The dictionary is updated in place and also returned.
        """
        return self._storeScores(tagReturn,paramReturn,emission_probability_dict)

    def getTransitionDict(self,tagReturn,paramReturn,transition_probability_dict):
        """Add the ``tag||score`` tokens of one model line under ``tagReturn``.

        The dictionary is updated in place and also returned.
        """
        return self._storeScores(tagReturn,paramReturn,transition_probability_dict)

    def getTransitionEmissionDict(self,modelFile):
        """Build the lookup tables from the lines of a model file.

        Lines with fewer than two whitespace-separated tokens are skipped,
        as are lines whose parameter type is neither ``Emission`` nor
        ``Transition``.

        Returns:
            ``(tagSpace, wordSpace, transition_probability_dict,
            emission_probability_dict, open_class_tags)`` where
            ``open_class_tags`` lists the tags whose number of distinct
            words exceeds ``OPEN_CLASS_RATIO`` times the vocabulary size.
        """
        transition_probability_dict = {}
        emission_probability_dict = {}
        for elements in modelFile:
            textReturn = elements.split()
            if len(textReturn) < 2:
                continue
            paramType = textReturn[0][1:]
            tagReturn = textReturn[1][:-1]
            paramReturn = textReturn[2:]
            if paramType == "Emission":
                self.getEmissionDict(tagReturn,paramReturn,emission_probability_dict)
            elif paramType == "Transition":
                self.getTransitionDict(tagReturn,paramReturn,transition_probability_dict)

        tagSpace = self.getTagSpace(transition_probability_dict)
        wordSpace = self.getWordSpace(emission_probability_dict)

        # wordSpace is exactly the union of all per-tag vocabularies, so it
        # doubles as the "total vocabulary" for the open-class threshold.
        threshold = self.OPEN_CLASS_RATIO * len(wordSpace)
        open_class_tags = [
            tag
            for tag, word_dict in emission_probability_dict.items()
            if len(word_dict) > threshold
        ]

        return tagSpace,wordSpace,transition_probability_dict,emission_probability_dict,open_class_tags

    def getMaximumProbability(self,current_state, path_prob, transition_probability_dict):
        """Pick the best predecessor for ``current_state``.

        Args:
            current_state: tag being entered.
            path_prob: best score of every tag reached at the previous word.
            transition_probability_dict: ``table[from_tag][to_tag]`` scores.

        Returns:
            ``(pointer, probMax)``: the predecessor tag with the highest
            ``path_prob + transition`` score and that score. Ties keep the
            first candidate; with no candidates the result is
            ``("", -sys.maxsize)``.
        """
        pointer = ""
        probMax = -sys.maxsize
        for item, score_so_far in path_prob.items():
            candidate = score_so_far + transition_probability_dict[item][current_state]
            if candidate > probMax:
                pointer, probMax = item, candidate
        return pointer,probMax

    def viterbiAlgorithm(self,sentence,emission_probability_dict,transition_probability_dict,tagSpace,wordSpace,openTagList):
        """Tag one sentence and return it as ``word/TAG word/TAG ...``.

        A word found in ``wordSpace`` may only take tags that emit it. A word
        not in ``wordSpace`` contributes no emission score; from the second
        word on it is restricted to the tags in ``openTagList`` (all tags
        when none of them is an open tag, so the trellis never runs dry).

        Returns an empty string for a blank or whitespace-only sentence.
        """
        words = sentence.split()
        if not words:
            # Blank input line: nothing to decode.
            return ""

        probMax = {0: {}}
        pointer = {0: {}}
        start_scores = transition_probability_dict["<start>"]

        # NOTE(review): an unseen FIRST word may take any tag, whereas unseen
        # later words are limited to open-class tags. Kept as-is; confirm
        # whether this asymmetry is intended.
        first_word = words[0]
        first_is_known = first_word in wordSpace
        for state in tagSpace:
            if first_is_known:
                if first_word not in emission_probability_dict[state]:
                    continue
                score = emission_probability_dict[state][first_word] + start_scores[state]
            else:
                score = start_scores[state]
            pointer[0][state] = "<start>"
            probMax[0][state] = score

        # Candidate tags for unseen words, in tagSpace iteration order.
        open_states = [state for state in tagSpace if state in openTagList]
        unseen_states = open_states or list(tagSpace)

        for index in range(1,len(words)):
            word = words[index]
            previous = probMax[index-1]
            pointer[index],probMax[index] = {},{}
            if word in wordSpace:
                for state in tagSpace:
                    if word not in emission_probability_dict[state]:
                        continue
                    back, score = self.getMaximumProbability(state, previous, transition_probability_dict)
                    pointer[index][state] = back
                    probMax[index][state] = score + emission_probability_dict[state][word]
            else:
                for state in unseen_states:
                    back, score = self.getMaximumProbability(state, previous, transition_probability_dict)
                    pointer[index][state] = back
                    probMax[index][state] = score

        # Close the path with the transition into "<end>".
        endProb = -sys.maxsize
        endTag  = ""
        for state, score in probMax[len(words)-1].items():
            total = score + transition_probability_dict[state]["<end>"]
            if total > endProb:
                endProb = total
                endTag = state

        # Follow the back-pointers from the last word to the first.
        tagged = []
        tagPrediction = endTag
        for i in range(len(words) - 1, -1, -1):
            tagged.append("{}/{}".format(words[i], tagPrediction))
            tagPrediction = pointer[i][tagPrediction]
        return " ".join(reversed(tagged))

    def main(self):
        """Load the model, tag every input line and write ``./hmmoutput.txt``.

        Tagged sentences are separated by newlines, with no trailing newline.
        """
        hmmModelData = self.read_file(self.modelFile)
        tagSpace,wordSpace,transition_probability_dict,emission_probability_dict,openTagList = self.getTransitionEmissionDict(hmmModelData)
        inputText = self.read_file(self.hmmCodeFile)
        tagged_lines = [
            self.viterbiAlgorithm(text,emission_probability_dict,transition_probability_dict,tagSpace,wordSpace,openTagList)
            for text in inputText
        ]
        self.write_file("\n".join(tagged_lines))


    

# hmmCodeFile = sys.argv[1]
# modelFile = "./hmmmodel.txt"
# obj =  hmm_code(modelFile,hmmCodeFile)
# obj.main()

In [213]:
# Notebook run: tag the raw dev file below with the model in ./hmmmodel.txt.
modelFile = "./hmmmodel.txt"
hmmCodeFile = 'hmm-training-data/it_isdt_dev_raw.txt'

obj = hmm_code(modelFile, hmmCodeFile)
obj.main()

ayush ['SP', 'S', 'A', 'V', 'N']


In [None]:
#  for index in range(1,len(words)):
#             pointer[index],probMax[index] = {},{}
#             for state in tagSpace:
#                 if words[index] in wordSpace and words[index] in emission_probability_dict[state]: 
#                     pointer[index][state],probMax[index][state] = self.getMaximumProbability(state, probMax[index-1], transition_probability_dict)
#                     probMax[index][state] += emission_probability_dict[state][words[index]] 
                    
#                 elif words[index] not in wordSpace:
#                     if(state not in open_tag_list):
#                         continue
#                     pointer[index][state],probMax[index][state] = self.getMaximumProbability(state, probMax[index-1], transition_probability_dict)



# Backup: earlier version of the tagger, without the open-class restriction for unseen words.

In [214]:
import sys 
import math
class hmm_code():
    """Viterbi tagger over a pre-computed HMM model (no open-class handling).

    Each useful line of the model file has the shape::

        ?Emission TAG? word||score word||score ...
        ?Transition TAG? next_tag||score next_tag||score ...

    where ``?`` stands for one delimiter character: the first character of
    the first token and the last character of the second token are dropped.
    Scores are added along a path, so they are presumably log-probabilities
    -- confirm against the training script.
    """

    def __init__(self,modelFile,hmmCodeFile):
        """Store the model path and the path of the raw text to tag."""
        self.modelFile = modelFile
        self.hmmCodeFile = hmmCodeFile

    def read_file(self,fileName):
        """Return all lines of ``fileName`` as a list (line endings kept)."""
        with open(fileName) as fn:
            return fn.readlines()

    def write_file(self,output):
        """Write ``output`` to ``./hmmoutput.txt``, replacing its content."""
        with open("./hmmoutput.txt", "w") as fp:
            fp.write(output)

    def getWordSpace(self,emission_probability_dict):
        """Return the set of words that have an emission entry for any tag."""
        wordSpace = set()
        for word_scores in emission_probability_dict.values():
            wordSpace.update(word_scores)
        return wordSpace

    def getTagSpace(self,transition_probability_dict):
        """Return every source tag of the transition table except ``<start>``."""
        tagSpace = set(transition_probability_dict)
        tagSpace.discard("<start>")
        return tagSpace

    def _parseInto(self,tagReturn,paramReturn,table):
        """Parse ``name||score`` tokens into ``table[tagReturn]``; return ``table``."""
        row = table.setdefault(tagReturn, {})
        for element in paramReturn:
            # Split on the LAST "||" so names containing "||" survive.
            name, prob = element.rsplit("||",1)
            row[name] = float(prob)
        return table

    def getEmissionDict(self,tagReturn,paramReturn,emission_probability_dict):
        """Add one model line's ``word||score`` tokens under ``tagReturn``.

        The dictionary is updated in place and also returned.
        """
        return self._parseInto(tagReturn,paramReturn,emission_probability_dict)

    def getTransitionDict(self,tagReturn,paramReturn,transition_probability_dict):
        """Add one model line's ``tag||score`` tokens under ``tagReturn``.

        The dictionary is updated in place and also returned.
        """
        return self._parseInto(tagReturn,paramReturn,transition_probability_dict)

    def getTransitionEmissionDict(self,modelFile):
        """Build the lookup tables from the lines of a model file.

        Blank lines and lines with a single token are skipped instead of
        raising ``IndexError``; lines whose parameter type is neither
        ``Emission`` nor ``Transition`` are ignored.

        Returns:
            ``(tagSpace, wordSpace, transition_probability_dict,
            emission_probability_dict)``.
        """
        transition_probability_dict = {}
        emission_probability_dict = {}
        for elements in modelFile:
            textReturn = elements.split()
            if len(textReturn) < 2:
                continue
            paramType = textReturn[0][1:]
            tagReturn = textReturn[1][:-1]
            paramReturn = textReturn[2:]
            if paramType == "Emission":
                self.getEmissionDict(tagReturn,paramReturn,emission_probability_dict)
            elif paramType == "Transition":
                self.getTransitionDict(tagReturn,paramReturn,transition_probability_dict)

        tagSpace = self.getTagSpace(transition_probability_dict)
        wordSpace = self.getWordSpace(emission_probability_dict)

        return tagSpace,wordSpace,transition_probability_dict,emission_probability_dict

    def getMaximumProbability(self,current_state, path_prob, transition_probability_dict):
        """Pick the best predecessor for ``current_state``.

        Returns ``(pointer, probMax)``: the tag in ``path_prob`` with the
        highest ``path_prob + transition`` score and that score. Ties keep
        the first candidate; with no candidates the result is
        ``("", -sys.maxsize)``.
        """
        pointer = ""
        probMax = -sys.maxsize
        for item, score_so_far in path_prob.items():
            candidate = score_so_far + transition_probability_dict[item][current_state]
            if candidate > probMax:
                pointer, probMax = item, candidate
        return pointer,probMax

    def viterbiAlgorithm(self,sentence,emission_probability_dict,transition_probability_dict,tagSpace,wordSpace):
        """Tag one sentence and return it as ``word/TAG word/TAG ...``.

        A word found in ``wordSpace`` may only take tags that emit it; a word
        not in ``wordSpace`` may take any tag and contributes no emission
        score. Returns an empty string for a blank or whitespace-only
        sentence.
        """
        words = sentence.split()
        if not words:
            # Blank input line: nothing to decode.
            return ""

        probMax = {}
        pointer = {}
        for index, word in enumerate(words):
            known = word in wordSpace
            column, back_pointers = {}, {}
            for state in tagSpace:
                if known and word not in emission_probability_dict[state]:
                    # This tag never emits the word, so it cannot be on the path.
                    continue
                if index == 0:
                    back = "<start>"
                    score = transition_probability_dict["<start>"][state]
                else:
                    back, score = self.getMaximumProbability(state, probMax[index-1], transition_probability_dict)
                if known:
                    score = score + emission_probability_dict[state][word]
                back_pointers[state] = back
                column[state] = score
            probMax[index] = column
            pointer[index] = back_pointers

        # Close the path with the transition into "<end>".
        endProb = -sys.maxsize
        endTag  = ""
        for state, score in probMax[len(words)-1].items():
            total = score + transition_probability_dict[state]["<end>"]
            if total > endProb:
                endProb = total
                endTag = state

        # Follow the back-pointers from the last word to the first.
        tagged = []
        tagPrediction = endTag
        for i in range(len(words) - 1, -1, -1):
            tagged.append("{}/{}".format(words[i], tagPrediction))
            tagPrediction = pointer[i][tagPrediction]
        return " ".join(reversed(tagged))

    def main(self):
        """Load the model, tag every input line and write ``./hmmoutput.txt``.

        Tagged sentences are separated by newlines, with no trailing newline.
        """
        hmmModelData = self.read_file(self.modelFile)
        tagSpace,wordSpace,transition_probability_dict,emission_probability_dict = self.getTransitionEmissionDict(hmmModelData)
        inputText = self.read_file(self.hmmCodeFile)
        tagged_lines = [
            self.viterbiAlgorithm(text,emission_probability_dict,transition_probability_dict,tagSpace,wordSpace)
            for text in inputText
        ]
        self.write_file("\n".join(tagged_lines))

# Command-line entry point: the first argument is the raw text file to tag.
# NOTE: inside a Jupyter kernel sys.argv[1] is the kernel's own "-f" flag,
# which is why running this cell in the notebook fails with FileNotFoundError.
modelFile = "./hmmmodel.txt"
hmmCodeFile = sys.argv[1]

obj = hmm_code(modelFile, hmmCodeFile)
obj.main()

FileNotFoundError: [Errno 2] No such file or directory: '-f'