In [1]:
import numpy as np
import math
import sys
from collections import defaultdict
from math import expm1
import time
try:
    # Python 2.7
    import urllib2 as ur
    orl2 = True
except:
    #Python 3.4
    import urllib.request as ur
    orl2 = False
from collections import Counter

In [2]:
def computeInitialProb1(trainDataFile,numOfStates):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    numOfStates=min(numOfStates,distinctObservations)
    empiricalCount=np.zeros(shape=numOfStates)
    empiricalFreq=defaultdict(int)
    for n in range(numSequences):
        line = trainFile.readline()#Reading Sequences 1 by 1
        l = line.split(" ")
        startState=int(l[1])
        empiricalFreq[startState] = empiricalFreq[startState]+1
    totalObservations=0
    for i in np.arange(numOfStates):
        empiricalCount[i]=empiricalFreq[i]
        totalObservations=totalObservations+empiricalCount[i]
    initialProb=[count/totalObservations for count in empiricalCount]
    return (numOfStates,distinctObservations,initialProb)
def computeInitialProb(trainDataFile,numOfStates):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    numOfStates=min(numOfStates,distinctObservations)
    empiricalDistr=Counter()
    for n in range(numSequences):
        line = trainFile.readline()#Reading Sequences 1 by 1
        line=line.rstrip("\n")
        l = line.split(" ")
        l=l[1:]
        lDistr=Counter(l)
        empiricalDistr+=lDistr
    totalSymbolsSeen=sum(empiricalDistr.values())
    initialProb=[]
    for i in np.arange(numOfStates):
        initialProb.append((1.0*empiricalDistr[str(i)])/totalSymbolsSeen)
    return (numOfStates,distinctObservations,initialProb)

In [3]:
def createRandomMatrixA(numOfStates):
    matrixA=np.ndarray(shape=(numOfStates,numOfStates),dtype=float)
    prob=1.0/(numOfStates*numOfStates)
    matrixA.fill(prob)
    return matrixA
def createRandomMatrixB(numOfStates,distinctObservations):
    matrixB=np.ndarray(shape=(numOfStates,distinctObservations),dtype=float)
    prob=1.0/(numOfStates*distinctObservations)
    matrixB.fill(prob)
    return matrixB

In [38]:
def computeAlpha(observations,a,aTranspose,b,bTranspose,pi,alphaDP,alphaDPScaleT):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    alphaDpScaleTime0=0
    #Vec(alphaDP[0])=pi[0]*b[0][ob[0]],pi[1]*b[1][ob[0]],pi[2]*b[2][ob[0]],....,pi[n-1]*b[n-1][ob[0]]
    #Vec(alphaDP[0])=pi[0]*bTrans[ob[0]][0],pi[1]*bTrans[ob[0]][1],pi[2]*bTrans[ob[0]][2],....,pi[n-1]*bTrans[ob[0]][n-1]
    alphaDP[0]=pi*bTranspose[observations[0]]
    alphaDPScaleT[0]=1.0/np.sum(alphaDP[0])
    alphaDP[0]*=alphaDPScaleT[0]
    for t in np.arange(1,timePts):
        for i in np.arange(statesC):
            alphaDP[t][i]=b[i][observations[t]]*(np.sum(alphaDP[t-1]*aTranspose[i]))
        alphaDPScaleT[t]=1.0/np.sum(alphaDP[t])
        alphaDP[t]*=alphaDPScaleT[t]
def observationsLikelihood(alphaDPScaleT):
    return 1.0/np.prod(alphaDPScaleT)

In [39]:
def computeBeta(observations,a,b,bTranspose,pi,betaDP):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    betaDP[timePts-1].fill(1)
    for t in np.arange(timePts-2,-1,-1):
        betaDpScaleTimeT=0
        for i in np.arange(statesC):
            betaDP[t][i]=np.sum(a[i]*bTranspose[observations[t+1]]*betaDP[t+1])
        betaDpScaleTimeT=1.0/np.sum(betaDP[t])
        betaDP[t]*=betaDpScaleTimeT
    return betaDP

In [40]:
def validateAlphaDP(alphaDP,betaDP):
    observationsC=alphaDP.shape[0]
    statesC=alphaDP.shape[1]
    ans=[]
    for t in np.arange(observationsC-1):
        ans.append(np.sum(alphaDP[t]*betaDP[t]))
    print("Alpha-BETA Validation ",ans)

In [41]:
def computeGammaDP(diGammaDP):
    return np.apply_along_axis(np.sum,2,diGammaDP)
def computeDiGammaDP(alphaDP,alphaDPScaleT,betaDP,a,b,bTranspose,observations):
    observationsC=alphaDP.shape[0]
    statesC=alphaDP.shape[1]
    diGammaDP=np.zeros(shape=(observationsC,statesC,statesC),dtype=float)
    diGammaDenom=observationsLikelihood(alphaDPScaleT)
    for i in np.arange(statesC):
        for t in np.arange(observationsC-1):
            diGammaDP[t][i]=alphaDP[t][i]*a[i]*bTranspose[observations[t+1]]*betaDP[t+1]
    diGammaDP/=diGammaDenom
    return diGammaDP

In [42]:
def computeTransitionProbabilityA(diGammaDP,gammaDP):
    diGammaIJSumMatrix=np.apply_along_axis(np.sum,0,diGammaDP)
    gammaDPISumMatrix=np.apply_along_axis(np.sum,0,gammaDP)
    timePts=diGammaIJSumMatrix.shape[0]
    for i in np.arange(timePts):
        diGammaIJSumMatrix[i]/=gammaDPISumMatrix[i]
    return diGammaIJSumMatrix

In [43]:
def computeObsrProbNum(gammaDPT,i,vk,observations):
    gammaDPi=gammaDPT[i]
    return np.sum(gammaDPi[np.where(observations==vk)])
def computeTransitionProbabilityB(gammaDP,observations,observationDict):
    statesC=gammaDP.shape[1]
    observationsC=len(observationDict)
    newlyComputedObsrProbB=np.zeros(shape=(statesC,observationsC),dtype=float)#Ideal Shape should be transposed
    gammaDPISumMatrix=np.apply_along_axis(np.sum,0,gammaDP)
    gammaDPT=gammaDP.transpose()
    for i in np.arange(statesC):
        for vk in observationDict:
            newlyComputedObsrProbB[i][vk]=computeObsrProbNum(gammaDPT,i,vk,observations)/gammaDPISumMatrix[i]
    return newlyComputedObsrProbB
def computeTransitionProbabilityB1(alphaDP,alphaDPScaleT,betaDP,a,b,observations,observationDict):
    statesC=a.shape[0]
    observationsC=b.shape[1]
    newlyComputedObsrProbB=np.zeros(shape=(observationsC,statesC),dtype=float)#Ideal Shape should be transposed
    #gammaDP=computeGammaDP(alphaDP,betaDP,alphaDPScaleT)#[t][state]
    gammaDP=gammaDP.transpose()
    for i in np.arange(statesC):
        obsrProbDenom =np.sum(gammaDP[i])
        for vk in observationDict:
            newlyComputedObsrProbB[i][vk]=computeObsrProbNum(gammaDP,i,vk,observations)/obsrProbDenom
    return newlyComputedObsrProbB

In [44]:
#Change Convergence Criteria to be more reasonable/Useful
def isConverged(count,convergenceIters):
    if count>=convergenceIters:
        return True
    return False
def Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict):
    count=0
    updatedA=A
    updatedB=B
    while isConverged(count,convergenceIters)==False:
        #Expectation(E)-Step
        alphaDP=np.zeros(shape=(observations.shape[0],updatedA.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        betaDP=np.zeros(shape=(observations.shape[0],updatedA.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        alphaDPScaleT=np.ones(shape=(observations.shape[0]))
        updatedATranspose=updatedA.transpose()
        updatedBTranspose=updatedB.transpose()
        computeAlpha(observations,updatedA,updatedATranspose,updatedB,updatedBTranspose,pi,alphaDP,alphaDPScaleT)
        computeBeta(observations,updatedA,updatedB,updatedBTranspose,pi,betaDP)
        #validateAlphaDP(alphaDP,betaDP)
        diGammaDP=computeDiGammaDP(alphaDP,alphaDPScaleT,betaDP,updatedA,updatedB,updatedBTranspose,observations)
        gammaDP=computeGammaDP(diGammaDP)#[t][state]
        #Maximization(M)-Step
        newA=computeTransitionProbabilityA(diGammaDP,gammaDP)
        #newB=computeTransitionProbabilityB(alphaDP,alphaDPScaleT,betaDP,updatedA,updatedB,observations,observationDict)
        newB=computeTransitionProbabilityB(gammaDP,observations,observationDict)
        updatedA=newA
        updatedB=newB
        count=count+1
    return (updatedA,updatedB)

In [56]:
def trainHMM(trainDataFile,A,B,pi,convergenceIters,maxSequences=-1):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    observationDict=np.arange(distinctObservations)
    updatedA=np.NaN
    updatedB=np.NaN
    isAUpdated=False
    if(maxSequences==-1):
        usedSeqs=numSequences
    else:
        usedSeqs=min(maxSequences,numSequences)
    actuallyUsedSeqs=0
    for n in range(usedSeqs):
        line = trainFile.readline()#Reading Sequences 1 by 1
        line=line.rstrip("\n")
        l = line.split(" ")
        if(int(l[0])<=1):
            continue
        actuallyUsedSeqs+=1
        observations=np.array([int(i) for i in l[1:len(l)]])
        learnedParams=Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict)
        if isAUpdated==False:
            isAUpdated=True
            updatedA=learnedParams[0]
            updatedB=learnedParams[1]
        else:
            updatedA+=learnedParams[0]
            updatedB+=learnedParams[1]
    updatedA=updatedA/actuallyUsedSeqs
    updatedB=updatedB/actuallyUsedSeqs
    return (updatedA,updatedB)

In [57]:
def trainModel(fileLoc,maxNoOfStates,convergenceIters,maxSequences=-1):
    start = time.time()
    initialProbs=computeInitialProb(fileLoc,maxNoOfStates)
    end = time.time()
    print("Computed Initial Prob. in ", end - start ,"seconds")
    pi=initialProbs[2]
    numOfStates=initialProbs[0]
    distinctObservations=initialProbs[1]
    A=createRandomMatrixA(numOfStates)
    B=createRandomMatrixB(numOfStates,distinctObservations)
    trainedParams=trainHMM(fileLoc,A,B,pi,convergenceIters,maxSequences)
    trainedParams=trainedParams+(pi,)
    end=time.time()
    print("For ",maxSequences," Sequences : Total Training Time ",end-start," seconds")
    return trainedParams

In [60]:
old_settings = np.seterr(all='ignore')  #seterr to known value
np.seterr(all='raise')#{'over': 'ignore', 'divide': 'ignore', 'invalid': 'ignore','under': 'ignore'}
(A,B,pi)=trainModel('Data/0.spice.train1.txt',4,4,2)
np.seterr(**old_settings)  # reset to default {'over': 'raise', 'divide': 'ignore', 'invalid': 'ignore', 'under': 'ignore'}
#(A,B,pi)=trainModel('Data/1.spice.train.txt',20,7,1)

Computed Initial Prob. in  0.0057408809661865234 seconds


FloatingPointError: invalid value encountered in true_divide

In [27]:
A

array([[ 0.24748808,  0.24642955,  0.24962722,  0.25645515],
       [ 0.24923185,  0.24287502,  0.2405963 ,  0.26729682],
       [ 0.25084286,  0.23898292,  0.23128661,  0.27888761],
       [ 0.24439687,  0.25169186,  0.26399802,  0.23991324]])

In [46]:
B

array([[ 0.10557188,  0.5276764 ,  0.        ,  0.36675172],
       [ 0.10957474,  0.54557272,  0.        ,  0.34485254],
       [ 0.12045131,  0.593053  ,  0.        ,  0.28649569],
       [ 0.08524658,  0.43384218,  0.        ,  0.48091124]])

In [23]:
def getHmmRank(prefix,A,ATranspose,B,BTranspose,pi,uniqueSymbols):
    likelihoods=[]
    for i in np.arange(uniqueSymbols):
        prefix.append(i)
        observations=np.array(prefix)
        alphaDP=np.zeros(shape=(observations.shape[0],A.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        alphaDPScaleT=np.ones(shape=(observations.shape[0]))
        computeAlpha(observations,B,ATranspose,B,BTranspose,pi,alphaDP,alphaDPScaleT)
        obsrLikelihood=observationsLikelihood(alphaDPScaleT)
        prefix.pop()
        likelihoods.append((i,obsrLikelihood))
    likelihoods=sorted(likelihoods, key=lambda x: -x[1])
    ranks=[i[0] for i in likelihoods]
    return ranks

In [25]:
def list_to_string(l):
    s=str(l[0])
    for x in l[1:]:
        s+= " " + str(x)
    return(s)
def formatString(string_in):
    """ Replace white spaces by %20 """
    return string_in.strip().replace(" ", "%20")
# get the test first prefix: the only element of the test set
def get_first_prefix(test_file):
    """ This function is called for the public test file(Which only has 1 line)
    """
    f = open(test_file)
    prefix = f.readline()
    f.close()
    return prefix
def predictOnSpicePublicData(problem_number,name):
    problem_number = str(problem_number)
    user_id = '68'
    #name = "hmm_Baseline"
    #train_file = 'Data/0.spice.train.txt'
    prefix_file = 'Data/'+problem_number+'.spice.public.test.txt'
    first_prefix = get_first_prefix(prefix_file)
    prefix_number=1
    # get the next symbol ranking on the first prefix
    p=first_prefix.split()
    prefix=[int(i) for i in p[1:len(p)]]#prefix holds the sequence of values in the public test file(Note:It has only 1 Seq)
    print("Prefix ",prefix)
    ranking=getHmmRank(prefix,A,A.transpose(),B,B.transpose(),pi,A.shape[0])
    print("Model Ranking ",ranking)
    ranking_string=list_to_string(ranking[:5])
    #print("Prefix number: " + str(prefix_number) + " Ranking: " + ranking_string + " Prefix: " + first_prefix)
    first_prefix = formatString(first_prefix)

    # transform the ranking to follow submission format
    ranking_string=formatString(ranking_string)

    # create the url to submit the ranking
    #name=name+"_Ver1.7.2"
    name=name
    url_base = 'http://spice.lif.univ-mrs.fr/submit.php?user=' + user_id +\
        '&problem=' + problem_number + '&submission=' + name + '&'
    url = url_base + 'prefix=' + first_prefix + '&prefix_number=1' + '&ranking=' +\
        ranking_string
    response = ur.urlopen(url)
    print("URL ",url)
    content = response.read()
    print("Response from SPiCe ",content)#Content is a new Sequence returned from the SPiCe server: We will need to predict for this seq
    if not orl2:
        # Needed for python 3.4...
        content= content.decode('utf-8')
    list_element = content.split()
    head = str(list_element[0])
    return content,url_base

In [26]:
spiceContentOnPubFile,url_base=predictOnSpicePublicData(0,"hmm_baseline_v1")

Prefix  [3, 0, 3, 0, 1, 3, 3]
Model Ranking  [3, 0, 1, 2]
URL  http://spice.lif.univ-mrs.fr/submit.php?user=68&problem=0&submission=hmm_baseline_v1&prefix=7%203%200%203%200%201%203%203&prefix_number=1&ranking=3%200%201%202
Response from SPiCe  b'2 3 3 \n'


In [27]:
def evaluateOnSpiceTrainDataSet(prevContent,url_base):
    prefix_number = 2
    head=''
    content=prevContent
    while(head != '[Error]' and head != '[Success]'):
        prefix = content[:-1]#Fetch the Sequence returned from Spice Server and exclude the last '\n'
        # Get the ranking
        p=prefix.split()
        prefix_list=[int(i) for i in p[1:len(p)]]
        ranking = getHmmRank(prefix_list,A,A.transpose(),B,B.transpose(),pi,A.shape[0])
        ranking_string=list_to_string(ranking[:5])#Here At least alphabet should be 4: Else may get Runtime error
        if prefix_number % 200 == 0:
            print("Prefix number: " + str(prefix_number) + " Ranking: " + ranking_string + " Prefix: " + prefix)
        # Format the ranking
        ranking_string = formatString(ranking_string)
        # create prefix with submission needed format
        prefix=formatString(prefix)
        # Create the url with your ranking to get the next prefix
        url = url_base + 'prefix=' + prefix + '&prefix_number=' +\
            str(prefix_number) + '&ranking=' + ranking_string
        # Get the answer of the submission on current prefix
        response = ur.urlopen(url)
        content = response.read()
        if not orl2:
            # Needed for Python 3.4...
            content= content.decode('utf-8')
        list_element = content.split()
        # modify head in case it is finished or an erro occured
        head = str(list_element[0])
        # change prefix number
        prefix_number += 1
    # Post-treatment
    # The score is the last element of content (in case of a public test set)
    print(content)
    list_element = content.split()
    score = (list_element[-1])
    print(score)

In [28]:
evaluateOnSpiceTrainDataSet(spiceContentOnPubFile,url_base)

Prefix number: 200 Ranking: 3 0 1 2 Prefix: 8 3 0 1 3 3 1 2 1 
Prefix number: 400 Ranking: 3 0 1 2 Prefix: 7 3 3 1 0 1 3 3 
Prefix number: 600 Ranking: 3 0 1 2 Prefix: 3 3 0 1 
Prefix number: 800 Ranking: 3 0 1 2 Prefix: 1 3 
Prefix number: 1000 Ranking: 3 0 1 2 Prefix: 2 3 0 
[Success] Last prefix of the test set. The score of the submission named hmm_baseline_v1 on problem 0 is 0.85064155226946

0.85064155226946


In [25]:
evaluateOnSpiceTrainDataSet(spiceContentOnPubFile,url_base)

Prefix number: 200 Ranking: 3 0 1 2 Prefix: 8 3 0 1 3 3 1 2 1 
Prefix number: 400 Ranking: 3 0 1 2 Prefix: 7 3 3 1 0 1 3 3 
Prefix number: 600 Ranking: 3 0 1 2 Prefix: 3 3 0 1 
Prefix number: 800 Ranking: 3 0 1 2 Prefix: 1 3 
Prefix number: 1000 Ranking: 3 0 1 2 Prefix: 2 3 0 
[Success] Last prefix of the test set. The score of the submission named hmm_baseline_Ver1.5 on problem 0 is 0.85064155226946

0.85064155226946
