In [36]:
import numpy as np
import math
import sys
from collections import defaultdict
from math import expm1
import time
try:
    # Python 2.7
    import urllib2 as ur
    orl2 = True
except:
    #Python 3.4
    import urllib.request as ur
    orl2 = False

In [2]:
def computeInitialProb(trainDataFile,numOfStates):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    numOfStates=min(numOfStates,distinctObservations)
    empiricalCount=np.zeros(shape=numOfStates)
    empiricalFreq=defaultdict(int)
    for n in range(numSequences):
        line = trainFile.readline()#Reading Sequences 1 by 1
        l = line.split(" ")
        startState=int(l[1])
        empiricalFreq[startState] = empiricalFreq[startState]+1
    totalObservations=0
    for i in np.arange(numOfStates):
        empiricalCount[i]=empiricalFreq[i]
        totalObservations=totalObservations+empiricalCount[i]
    initialProb=[count/totalObservations for count in empiricalCount]
    return (numOfStates,distinctObservations,initialProb)

In [3]:
def createRandomMatrixA(numOfStates):
    matrixA=np.ndarray(shape=(numOfStates,numOfStates),dtype=float)
    prob=1.0/(numOfStates*numOfStates)
    matrixA.fill(prob)
    return matrixA
def createRandomMatrixB(numOfStates,distinctObservations):
    matrixB=np.ndarray(shape=(numOfStates,distinctObservations),dtype=float)
    prob=1.0/(numOfStates*distinctObservations)
    matrixB.fill(prob)
    return matrixB

In [4]:
def computeAlpha(observations,a,aTranspose,b,bTranspose,pi,alphaDP):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    alphaDpScaleTime0=0
    alphaDP[0]=pi*bTranspose[observations[0]]
    alphaDpScaleTime0=np.sum(alphaDP[0])
    alphaDP[0]/=alphaDpScaleTime0
    for t in np.arange(1,timePts):
        alphaDpScaleTimeT=0
        for i in np.arange(statesC):
            alphaDP[t][i]=(np.sum(alphaDP[t-1]*aTranspose[i]))*b[i][observations[t]]
        alphaDpScaleTimeT=np.sum(alphaDP[t])
        alphaDP[t]/=alphaDpScaleTimeT
def computeAlphaUnScaled(observations,a,aTranspose,b,bTranspose,pi,alphaDP):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    alphaDpScaleTime0=0
    alphaDP[0]=pi*bTranspose[observations[0]]
    for t in np.arange(1,timePts):
        alphaDpScaleTimeT=0
        for i in np.arange(statesC):
            alphaDP[t][i]=(np.sum(alphaDP[t-1]*aTranspose[i]))*b[i][observations[t]]
def observationsLikelihood(alphaDP):
    timePts=alphaDP.shape[0]
    ans=0.0
    ans=np.sum(alphaDP[timePts-1])
    return ans

In [5]:
def computeBeta(observations,a,b,bTranspose,pi,betaDP):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    betaDP[timePts-1].fill(1)
    for t in np.arange(timePts-2,-1,-1):
        betaDpScaleTimeT=0
        for i in np.arange(statesC):
            betaDP[t][i]=np.sum(a[i]*bTranspose[observations[t+1]]*betaDP[t+1])
        betaDpScaleTimeT=np.sum(betaDP[t])
        betaDP[t]/=betaDpScaleTimeT
    return betaDP

In [6]:
def computeDiGammaDP(alphaDP,betaDP,a,b,bTranspose,observations):
    observationsC=alphaDP.shape[0]
    statesC=alphaDP.shape[1]
    diGammaDP=np.zeros(shape=(statesC,statesC),dtype=float)
    diGammaDenom=observationsLikelihood(alphaDP)
    for i in np.arange(statesC):
        for t in np.arange(observationsC-1):
            diGammaDP[i]+=alphaDP[t][i]*a[i]*bTranspose[observations[t+1]]*betaDP[t+1]
    diGammaDP/=diGammaDenom
    return diGammaDP
def computeTransitionProbabilityA(alphaDP,betaDP,a,b,bTranspose,observations):
    statesC=alphaDP.shape[1]
    newlyComputedTransitionProbA=np.zeros(shape=(statesC,statesC),dtype=float)
    diGammaDP=computeDiGammaDP(alphaDP,betaDP,a,b,bTranspose,observations)
    diGammaDPSumGrpByJ=np.apply_along_axis(np.sum,1,diGammaDP)
    for i in np.arange(statesC):    
        if (diGammaDPSumGrpByJ[i]==0):
            newlyComputedTransitionProbA[i]=0.0
        else:
            newlyComputedTransitionProbA[i]=diGammaDP[i]/diGammaDPSumGrpByJ[i]
    return newlyComputedTransitionProbA   

In [7]:
def computeGammaDP(alphaDP,betaDP):
    gammaDenom=observationsLikelihood(alphaDP)
    gammaDP=alphaDP*betaDP#[Time][State]
    gammaDP/=gammaDenom
    return gammaDP
def computeObsrProbNum(gammaDP,i,vk,observations):
    gammaDPi=gammaDP[i]
    return np.sum(gammaDPi[np.where(observations==vk)])
def computeTransitionProbabilityB(alphaDP,betaDP,a,b,observations,observationDict):
    statesC=a.shape[0]
    observationsC=b.shape[1]
    newlyComputedObsrProbB=np.zeros(shape=(observationsC,statesC),dtype=float)#Ideal Shape should be transposed
    gammaDP=computeGammaDP(alphaDP,betaDP)#[t][state]
    gammaDP=gammaDP.transpose()
    for i in np.arange(statesC):
        obsrProbDenom =np.sum(gammaDP[i])
        for vk in observationDict:
            newlyComputedObsrProbB[i][vk]=computeObsrProbNum(gammaDP,i,vk,observations)/obsrProbDenom
    return newlyComputedObsrProbB

In [8]:
#Change Convergence Criteria to be more reasonable/Useful
def isConverged(count,convergenceIters):
    if count>=convergenceIters:
        return True
    return False
def Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict):
    count=0
    updatedA=A
    updatedB=B
    while isConverged(count,convergenceIters)==False:
        #Expectation(E)-Step
        alphaDP=np.zeros(shape=(observations.shape[0],updatedA.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        betaDP=np.zeros(shape=(observations.shape[0],updatedA.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        updatedATranspose=updatedA.transpose()
        updatedBTranspose=updatedB.transpose()
        computeAlpha(observations,updatedA,updatedATranspose,updatedB,updatedBTranspose,pi,alphaDP)
        computeBeta(observations,updatedA,updatedB,updatedBTranspose,pi,betaDP)
        #Maximization(M)-Step
        newA=computeTransitionProbabilityA(alphaDP,betaDP,updatedA,updatedB,updatedBTranspose,observations)
        newB=computeTransitionProbabilityB(alphaDP,betaDP,updatedA,updatedB,observations,observationDict)
        updatedA=newA
        updatedB=newB
        count=count+1
    return (updatedA,updatedB)

In [9]:
def trainHMM(trainDataFile,A,B,pi,convergenceIters,maxSequences=-1):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    observationDict=np.arange(distinctObservations)
    updatedA=np.NaN
    updatedB=np.NaN
    isAUpdated=False
    if(maxSequences==-1):
        usedSeqs=numSequences
    else:
        usedSeqs=min(maxSequences,numSequences)
    actuallyUsedSeqs=0
    for n in range(usedSeqs):
        line = trainFile.readline()#Reading Sequences 1 by 1
        l = line.split(" ")
        if(int(l[0])<=1):
            continue
        actuallyUsedSeqs+=1
        observations=np.array([int(i) for i in l[1:len(l)]])
        learnedParams=Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict)
        if isAUpdated==False:
            isAUpdated=True
            updatedA=learnedParams[0]
            updatedB=learnedParams[1]
        else:
            updatedA+=learnedParams[0]
            updatedB+=learnedParams[1]
    updatedA=updatedA/actuallyUsedSeqs
    updatedB=updatedB/actuallyUsedSeqs
    return (updatedA,updatedB)

In [10]:
def trainModel(fileLoc,maxNoOfStates,convergenceIters,maxSequences=-1):
    start = time.time()
    initialProbs=computeInitialProb(fileLoc,maxNoOfStates)
    end = time.time()
    print("Computed Initial Prob. in ", end - start ,"seconds")
    pi=initialProbs[2]
    numOfStates=initialProbs[0]
    distinctObservations=initialProbs[1]
    A=createRandomMatrixA(numOfStates)
    B=createRandomMatrixB(numOfStates,distinctObservations)
    trainedParams=trainHMM(fileLoc,A,B,pi,convergenceIters,maxSequences)
    trainedParams=trainedParams+(pi,)
    end=time.time()
    print("For ",maxSequences," Sequences : Total Training Time ",end-start," seconds")
    return trainedParams

In [11]:
(A,B,pi)=trainModel('Data/0.spice.train.txt',4,7)

Computed Initial Prob. in  0.21244215965270996 seconds
For  -1  Sequences : Total Training Time  376.17602610588074  seconds


In [12]:
A

array([[ 0.17721276,  0.17721276,  0.17721276,  0.32573225],
       [ 0.17721276,  0.17721276,  0.17721276,  0.32573225],
       [ 0.17721276,  0.17721276,  0.17721276,  0.32573225],
       [ 0.26319066,  0.26319066,  0.26319066,  0.21042802]])

In [13]:
B

array([[ 0.33795274,  0.20093494,  0.12926566,  0.33184666],
       [ 0.33795274,  0.20093494,  0.12926566,  0.33184666],
       [ 0.33795274,  0.20093494,  0.12926566,  0.33184666],
       [ 0.02362414,  0.0242984 ,  0.01624732,  0.93583015]])

In [40]:
pi

[0.0, 0.0, 0.0, 1.0]

In [14]:
def getHmmRank(prefix,A,ATranspose,B,BTranspose,pi,uniqueSymbols,shouldScale=False):
    likelihoods=[]
    for i in np.arange(uniqueSymbols):
        prefix.append(i)
        observations=np.array(prefix)
        alphaDP=np.zeros(shape=(observations.shape[0],A.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        if shouldScale:
            computeAlpha(observations,B,ATranspose,B,BTranspose,pi,alphaDP)
        else:
            computeAlphaUnScaled(observations,B,ATranspose,B,BTranspose,pi,alphaDP)
        obsrLikelihood=observationsLikelihood(alphaDP)
        prefix.pop()
        likelihoods.append((i,obsrLikelihood))
    likelihoods=sorted(likelihoods, key=lambda x: -x[1])
    ranks=[i[0] for i in likelihoods]
    return ranks

In [37]:
def list_to_string(l):
    s=str(l[0])
    for x in l[1:]:
        s+= " " + str(x)
    return(s)
def formatString(string_in):
    """ Replace white spaces by %20 """
    return string_in.strip().replace(" ", "%20")
# get the test first prefix: the only element of the test set
def get_first_prefix(test_file):
    """ This function is called for the public test file(Which only has 1 line)
    """
    f = open(test_file)
    prefix = f.readline()
    f.close()
    return prefix
def predictOnSpicePublicData(problem_number,name):
    problem_number = str(problem_number)
    user_id = '68'
    #name = "hmm_Baseline"
    #train_file = 'Data/0.spice.train.txt'
    prefix_file = 'Data/'+problem_number+'.spice.public.test.txt'
    first_prefix = get_first_prefix(prefix_file)
    prefix_number=1
    # get the next symbol ranking on the first prefix
    p=first_prefix.split()
    prefix=[int(i) for i in p[1:len(p)]]#prefix holds the sequence of values in the public test file(Note:It has only 1 Seq)
    print("Prefix ",prefix)
    ranking=getHmmRank(prefix,A,A.transpose(),B,B.transpose(),pi,A.shape[0])
    print("Model Ranking ",ranking)
    ranking_string=list_to_string(ranking[:5])
    #print("Prefix number: " + str(prefix_number) + " Ranking: " + ranking_string + " Prefix: " + first_prefix)
    first_prefix = formatString(first_prefix)

    # transform the ranking to follow submission format
    ranking_string=formatString(ranking_string)

    # create the url to submit the ranking
    name=name+"_Ver1.3"
    url_base = 'http://spice.lif.univ-mrs.fr/submit.php?user=' + user_id +\
        '&problem=' + problem_number + '&submission=' + name + '&'
    url = url_base + 'prefix=' + first_prefix + '&prefix_number=1' + '&ranking=' +\
        ranking_string
    response = ur.urlopen(url)
    print("URL ",url)
    content = response.read()
    print("Response from SPiCe ",content)#Content is a new Sequence returned from the SPiCe server: We will need to predict for this seq
    if not orl2:
        # Needed for python 3.4...
        content= content.decode('utf-8')
    list_element = content.split()
    head = str(list_element[0])
    return content,url_base

In [38]:
spiceContentOnPubFile,url_base=predictOnSpicePublicData(0,"hmm_baseline")

Prefix  [3, 0, 3, 0, 1, 3, 3]
Model Ranking  [3, 0, 1, 2]
URL  http://spice.lif.univ-mrs.fr/submit.php?user=68&problem=0&submission=hmm_baseline_Ver1.3&prefix=7%203%200%203%200%201%203%203&prefix_number=1&ranking=3%200%201%202
Response from SPiCe  b'2 3 3 \n'


In [34]:
def evaluateOnSpiceTrainDataSet(prevContent,url_base):
    prefix_number = 2
    head=''
    content=prevContent
    while(head != '[Error]' and head != '[Success]'):
        prefix = content[:-1]#Fetch the Sequence returned from Spice Server and exclude the last '\n'
        # Get the ranking
        p=prefix.split()
        prefix_list=[int(i) for i in p[1:len(p)]]
        ranking = getHmmRank(prefix_list,A,A.transpose(),B,B.transpose(),pi,A.shape[0])
        ranking_string=list_to_string(ranking[:5])#Here At least alphabet should be 4: Else may get Runtime error
        if prefix_number % 200 == 0:
            print("Prefix number: " + str(prefix_number) + " Ranking: " + ranking_string + " Prefix: " + prefix)
        # Format the ranking
        ranking_string = formatString(ranking_string)
        # create prefix with submission needed format
        prefix=formatString(prefix)
        # Create the url with your ranking to get the next prefix
        url = url_base + 'prefix=' + prefix + '&prefix_number=' +\
            str(prefix_number) + '&ranking=' + ranking_string
        # Get the answer of the submission on current prefix
        response = ur.urlopen(url)
        content = response.read()
        if not orl2:
            # Needed for Python 3.4...
            content= content.decode('utf-8')
        list_element = content.split()
        # modify head in case it is finished or an erro occured
        head = str(list_element[0])
        # change prefix number
        prefix_number += 1
    # Post-treatment
    # The score is the last element of content (in case of a public test set)
    print(content)
    list_element = content.split()
    score = (list_element[-1])
    print(score)

In [39]:
evaluateOnSpiceTrainDataSet(spiceContentOnPubFile,url_base)

Prefix number: 200 Ranking: 3 0 1 2 Prefix: 8 3 0 1 3 3 1 2 1 
Prefix number: 400 Ranking: 3 0 1 2 Prefix: 7 3 3 1 0 1 3 3 
Prefix number: 600 Ranking: 3 0 1 2 Prefix: 3 3 0 1 
Prefix number: 800 Ranking: 3 0 1 2 Prefix: 1 3 
Prefix number: 1000 Ranking: 3 0 1 2 Prefix: 2 3 0 
[Success] Last prefix of the test set. The score of the submission named hmm_baseline_Ver1.3 on problem 0 is 0.85064155226946

0.85064155226946
