In [3]:
import numpy as np
import math
import sys
from collections import defaultdict
from math import expm1
import time
try:
    # Python 2.7: urlopen lives in urllib2.
    import urllib2 as ur
    orl2 = True
except ImportError:
    # Python 3: urlopen moved to urllib.request. Only a missing module is
    # handled here so that unrelated failures are not silently swallowed.
    import urllib.request as ur
    orl2 = False
from collections import Counter

In [4]:
def computeInitialProb(trainDataFile,numOfStates,randomized=False):
    """Derive initial state weights from the empirical symbol frequencies of a training file.

    The file starts with a header line ``<numSequences> <distinctObservations>``.
    Each of the following ``numSequences`` lines holds one sequence; its first
    token is skipped (presumably the sequence length -- confirm against the
    data format) and the remaining tokens are counted as symbols.

    Parameters:
        trainDataFile: path of the training file.
        numOfStates: requested number of hidden states.
        randomized: when False the state count is capped at
            ``distinctObservations`` and state ``i`` gets the frequency of
            symbol ``i``; when True state ``i`` gets the frequency of symbol
            ``i % distinctObservations``.

    Returns:
        ``(numOfStates, distinctObservations, initialProb)`` where
        ``initialProb`` is a list of floats of length ``numOfStates``.

    Raises:
        ValueError: if the file contains no symbols at all.

    NOTE(review): the returned weights are raw symbol frequencies and are not
    renormalised, so they sum to 1 only when every symbol is used exactly once.
    """
    empiricalDistr = Counter()
    # The context manager guarantees the file is closed even if parsing fails.
    with open(trainDataFile, "r") as trainFile:
        # split() without arguments tolerates trailing spaces and newlines.
        headerLine = trainFile.readline().split()
        numSequences = int(headerLine[0])
        distinctObservations = int(headerLine[1])  # total number of distinct observations
        for _ in range(numSequences):
            tokens = trainFile.readline().split()
            empiricalDistr.update(tokens[1:])
    totalSymbolsSeen = sum(empiricalDistr.values())
    if totalSymbolsSeen == 0:
        raise ValueError("No symbols found in training file: " + str(trainDataFile))
    if randomized:
        symbolIds = [i % distinctObservations for i in range(numOfStates)]
    else:
        numOfStates = min(numOfStates, distinctObservations)
        symbolIds = range(numOfStates)
    initialProb = [(1.0 * empiricalDistr[str(s)]) / totalSymbolsSeen for s in symbolIds]
    return (numOfStates,distinctObservations,initialProb)

In [5]:
def createRandomMatrixA(numOfStates):
    """Return a (numOfStates x numOfStates) float matrix with every entry 1/numOfStates.

    Despite the name, the matrix is uniform rather than random.
    """
    uniformWeight = 1.0 / (numOfStates)
    return np.full((numOfStates, numOfStates), uniformWeight, dtype=float)
def createRandomMatrixB(numOfStates,distinctObservations):
    """Return a (numOfStates x distinctObservations) float matrix with every entry 1/distinctObservations.

    Despite the name, the matrix is uniform rather than random.
    """
    uniformWeight = 1.0 / (distinctObservations)
    return np.full((numOfStates, distinctObservations), uniformWeight, dtype=float)

In [15]:
def scaleVector(ndArrayToScale,valueToDivide):
    """Divide ``ndArrayToScale`` in place by ``valueToDivide``.

    When ``valueToDivide`` is 0 the array is zeroed in place instead of being
    divided. The function returns None; the caller observes the result through
    the array it passed in (typically a row view of a DP table).
    """
    if valueToDivide==0:
        # Assign through the array: a plain ``ndArrayToScale = 0`` would only
        # rebind the local name and leave the caller's data untouched.
        ndArrayToScale[...]=0
    else:
        ndArrayToScale/=float(valueToDivide)
def computeAlpha(observations,a,aTranspose,b,bTranspose,pi,alphaDP,shouldScale=True):
    """Fill ``alphaDP`` in place with the forward variables of one observation sequence.

    Parameters:
        observations: 1-D array of symbol indices (``.shape[0]`` time steps).
        a, aTranspose: state transition matrix and its transpose.
        b, bTranspose: emission matrix (indexed ``b[state][symbol]``) and its transpose.
        pi: initial state weights.
        alphaDP: (time steps x states) table that receives the result.
        shouldScale: when True each row is rescaled by its own sum via ``scaleVector``.

    Returns None; nothing is written when the sequence is empty.
    """
    numStates = a.shape[0]
    numSteps = observations.shape[0]
    if numSteps < 1:
        return
    # Initialisation: weight of each state emitting the first symbol.
    alphaDP[0] = pi * bTranspose[observations[0]]
    if shouldScale:
        scaleVector(alphaDP[0], np.sum(alphaDP[0]))
    # Induction over the remaining time steps.
    for step in range(1, numSteps):
        previousAlpha = alphaDP[step - 1]
        for state in range(numStates):
            incoming = np.sum(previousAlpha * aTranspose[state])
            alphaDP[step][state] = b[state][observations[step]] * (incoming)
        if shouldScale:
            scaleVector(alphaDP[step], np.sum(alphaDP[step]))
def getObservationLikelihood(observations,a,aTranspose,b,bTranspose,pi):
    """Return the sum of the last row of the unscaled forward table for ``observations``.

    The table is built by ``computeAlpha`` with scaling disabled, so the value
    is the (unnormalised, if ``pi`` is unnormalised) likelihood of the sequence.
    """
    numSteps = len(observations)
    # Table layout: count of observations x count of hidden states.
    alphaDP = np.zeros(shape=(numSteps, a.shape[0]))
    computeAlpha(observations, a, aTranspose, b, bTranspose, pi, alphaDP, False)
    finalAlpha = alphaDP[numSteps - 1]
    return np.sum(finalAlpha)

In [7]:
def computeBeta(observations,a,b,bTranspose,pi,betaDP):
    """Fill ``betaDP`` in place with row-normalised backward variables and return it.

    The last row is set to ones; every earlier row is computed from the row
    after it and then multiplied by the reciprocal of its own sum (or by 0 when
    that sum is 0). ``b`` and ``pi`` are accepted but not used.

    Returns ``betaDP``, or None when the sequence is empty.
    """
    numStates = a.shape[0]
    numSteps = observations.shape[0]
    if numSteps < 1:
        return
    lastStep = numSteps - 1
    betaDP[lastStep].fill(1)
    # Walk backwards from the second-to-last step down to step 0.
    for step in reversed(range(lastStep)):
        for state in range(numStates):
            betaDP[step][state] = np.sum(a[state] * bTranspose[observations[step + 1]] * betaDP[step + 1])
        rowTotal = np.sum(betaDP[step])
        rescale = 0 if rowTotal == 0 else 1.0 / rowTotal
        betaDP[step] *= rescale
    return betaDP

In [16]:
def computeGammaDP(diGammaDP):
    """Collapse the di-gamma table over its last axis.

    ``diGammaDP`` is indexed [t][i][j]; the result is indexed [t][i] and holds
    the sum over j.
    """
    gammaDP = np.sum(diGammaDP, axis=2)
    return gammaDP
def computeDiGammaDP(alphaDP,betaDP,a,b,bTranspose,observations):
    """Build the di-gamma table for one observation sequence.

    Entry [t][i][j] is ``alphaDP[t][i] * a[i][j] * b[j][observations[t+1]] * betaDP[t+1][j]``,
    after which every time slice is divided by its own total (slices whose
    total is 0 are set to 0). The table covers t = 0 .. len(observations)-2.
    ``b`` is accepted but not used.
    """
    numSteps = len(observations)
    numStates = alphaDP.shape[1]
    diGammaDP = np.zeros(shape=(numSteps - 1, numStates, numStates), dtype=float)
    for step in range(numSteps - 1):
        for state in range(numStates):
            diGammaDP[step][state] = alphaDP[step][state] * a[state] * bTranspose[observations[step + 1]] * betaDP[step + 1]
    # One normaliser per time step: the sum over both state axes.
    stepTotals = np.sum(diGammaDP, axis=(1, 2))
    for step in range(numSteps - 1):
        if stepTotals[step] == 0:
            diGammaDP[step] = 0
        else:
            diGammaDP[step] /= stepTotals[step]
    return diGammaDP

In [17]:
def computeTransitionProbabilityA(diGammaDP,gammaDP):
    """Re-estimate the transition matrix from the di-gamma and gamma tables.

    Row i of the result is the time-sum of ``diGammaDP[:, i, :]`` divided by
    the time-sum of ``gammaDP[:, i]``; rows whose divisor is 0 are set to 0.
    """
    transitionMass = np.sum(diGammaDP, axis=(0))
    stateOccupancy = np.sum(gammaDP, axis=(0))
    for state in range(transitionMass.shape[0]):
        occupancy = stateOccupancy[state]
        if occupancy != 0:
            transitionMass[state] /= occupancy
        else:
            transitionMass[state] = 0
    return transitionMass

In [18]:
def computeObsrProbNum(gammaDPT,i,vk,observations):
    """Sum the gamma values of state ``i`` over the time steps where symbol ``vk`` was observed.

    ``gammaDPT`` is indexed [state][t]; ``observations`` is the symbol sequence
    aligned with the t axis.
    """
    stateGamma = gammaDPT[i]
    matchingSteps = np.where(observations == vk)
    return np.sum(stateGamma[matchingSteps])
def computeTransitionProbabilityB(gammaDP,observations,observationDict):
    """Re-estimate the emission matrix B from the gamma table.

    Entry [i][vk] is the gamma mass of state i at the steps where ``vk`` was
    observed, divided by the total gamma mass of state i (0 when that total is
    0). The last observation is dropped first because gamma, being derived from
    di-gamma, is only defined for t = 0 .. T-2. The values in
    ``observationDict`` are used directly as column indices.
    """
    # Align the observations with the t axis of gammaDP (0 .. T-2).
    observations = observations[:len(observations) - 1]
    numStates = gammaDP.shape[1]
    numSymbols = len(observationDict)
    newlyComputedObsrProbB = np.zeros(shape=(numStates, numSymbols), dtype=float)
    stateOccupancy = np.sum(gammaDP, axis=(0))
    gammaDPT = gammaDP.transpose()
    for state in range(numStates):
        occupancy = stateOccupancy[state]
        for vk in observationDict:
            if occupancy == 0:
                newlyComputedObsrProbB[state][vk] = 0
            else:
                newlyComputedObsrProbB[state][vk] = computeObsrProbNum(gammaDPT, state, vk, observations) / occupancy
    return newlyComputedObsrProbB

In [19]:
# TODO: change the convergence criterion to something more reasonable/useful.
def isConverged(count,convergenceIters):
    """Return True once ``count`` has reached ``convergenceIters``, else False.

    This is a fixed iteration budget; no model quantity is inspected.
    """
    return bool(count >= convergenceIters)
def Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict):
    """Run forward-backward EM re-estimation of A and B on a single observation sequence.

    Each iteration rebuilds the alpha and beta tables from the current
    parameters (E-step), derives di-gamma and gamma from them, and then
    recomputes A and B (M-step). ``pi`` is passed through unchanged and is not
    re-estimated. Iteration stops when ``isConverged`` reports True.

    Returns:
        ``(updatedA, updatedB)``; these are the input ``A`` and ``B`` objects
        themselves if no iteration runs.
    """
    currentA, currentB = A, B
    iteration = 0
    while not isConverged(iteration, convergenceIters):
        # Expectation (E) step. Both tables are count of observations x count of hidden states.
        tableShape = (observations.shape[0], currentA.shape[0])
        alphaDP = np.zeros(shape=tableShape)
        betaDP = np.zeros(shape=tableShape)
        currentATranspose = currentA.transpose()
        currentBTranspose = currentB.transpose()
        computeAlpha(observations, currentA, currentATranspose, currentB, currentBTranspose, pi, alphaDP)
        computeBeta(observations, currentA, currentB, currentBTranspose, pi, betaDP)
        diGammaDP = computeDiGammaDP(alphaDP, betaDP, currentA, currentB, currentBTranspose, observations)
        gammaDP = computeGammaDP(diGammaDP)  # indexed [t][state]
        # Maximization (M) step.
        currentA = computeTransitionProbabilityA(diGammaDP, gammaDP)
        currentB = computeTransitionProbabilityB(gammaDP, observations, observationDict)
        iteration += 1
    return (currentA, currentB)

In [20]:
def trainHMM(trainDataFile,A,B,pi,convergenceIters,maxSequences=-1):
    """Train A and B on each sequence independently and return the averaged matrices.

    The file starts with a header line ``<numSequences> <distinctObservations>``
    followed by one sequence per line. The first token of a sequence line
    (presumably its length -- confirm against the data format) must be greater
    than 1 for the line to be used; the remaining tokens are the integer
    symbols. Every used sequence is trained from the same starting ``A``/``B``
    with ``Forward_Backward_EM_Algo`` and the results are averaged.

    Parameters:
        trainDataFile: path of the training file.
        A, B: starting transition and emission matrices (not modified).
        pi: initial state weights.
        convergenceIters: number of EM iterations per sequence.
        maxSequences: maximum number of sequence lines to read; -1 reads all
            lines announced in the header. Skipped lines count towards the limit.

    Returns:
        ``(averagedA, averagedB)``.

    Raises:
        ValueError: if no sequence line was usable for training.
    """
    sumA = None
    sumB = None
    actuallyUsedSeqs = 0
    # The context manager closes the file even when parsing or training fails.
    with open(trainDataFile, "r") as trainFile:
        headerLine = trainFile.readline().split()
        numSequences = int(headerLine[0])
        distinctObservations = int(headerLine[1])  # total number of distinct observations
        observationDict = np.arange(distinctObservations)
        if maxSequences == -1:
            usedSeqs = numSequences
        else:
            usedSeqs = min(maxSequences, numSequences)
        for _ in range(usedSeqs):
            # split() tolerates trailing whitespace; blank lines and a
            # premature end of file yield no tokens and are skipped.
            tokens = trainFile.readline().split()
            if not tokens or int(tokens[0]) <= 1:
                continue
            actuallyUsedSeqs += 1
            observations = np.array([int(tok) for tok in tokens[1:]])
            print("Observation ",observations)
            learnedA, learnedB = Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict)
            if sumA is None:
                # Accumulate into fresh copies: the EM routine may hand back the
                # caller's own A/B, which must not be modified by the += below.
                sumA = np.array(learnedA, dtype=float)
                sumB = np.array(learnedB, dtype=float)
            else:
                sumA += learnedA
                sumB += learnedB
    if actuallyUsedSeqs == 0:
        raise ValueError("No usable training sequence (length > 1) found in " + str(trainDataFile))
    return (sumA / actuallyUsedSeqs, sumB / actuallyUsedSeqs)

In [21]:
def trainModel(fileLoc,maxNoOfStates,convergenceIters,maxSequences=-1,initialProbRandomized=False):
    """Build the starting parameters from ``fileLoc``, train on it and return ``(A, B, pi)``.

    The initial weights come from ``computeInitialProb``, the starting matrices
    from ``createRandomMatrixA``/``createRandomMatrixB`` and the training from
    ``trainHMM``. Timing information is printed along the way.
    """
    startTime = time.time()
    initialProbs = computeInitialProb(fileLoc, maxNoOfStates, initialProbRandomized)
    print("Inital Probs ",initialProbs)
    print("Computed Initial Prob. in ", time.time() - startTime ,"seconds")
    numOfStates, distinctObservations, pi = initialProbs[0], initialProbs[1], initialProbs[2]
    startA = createRandomMatrixA(numOfStates)
    startB = createRandomMatrixB(numOfStates, distinctObservations)
    # Append pi so the caller receives the complete parameter triple.
    trainedParams = trainHMM(fileLoc, startA, startB, pi, convergenceIters, maxSequences) + (pi,)
    print("For ",maxSequences," Sequences : Total Training Time ",time.time()-startTime," seconds")
    return trainedParams

In [24]:
# Train with every floating-point anomaly (divide, invalid, over, under)
# raised as an exception. np.errstate restores the previous error handling on
# exit, including when training fails part-way through.
old_settings = np.geterr()  # kept for reference: the settings in force before training
with np.errstate(all='raise'):
    (A,B,pi)=trainModel('Data/0.spice.train.txt',8,10,1,True)
#(A,B,pi)=trainModel('Data/1.spice.train.txt',20,7,1)

Inital Probs  (8, 4, [0.23949632215434485, 0.2009447422737536, 0.10146282674645722, 0.4580961088254443, 0.23949632215434485, 0.2009447422737536, 0.10146282674645722, 0.4580961088254443])
Computed Initial Prob. in  0.6592514514923096 seconds
Observation  [3 0 3 1 3 1 3]
For  1  Sequences : Total Training Time  0.7039992809295654  seconds


{'divide': 'raise', 'invalid': 'raise', 'over': 'raise', 'under': 'raise'}

In [25]:
# Display the learned transition matrix A and emission matrix B.
(A, B)

(array([[  3.84603472e-05,   1.25988264e-01,   3.73952700e-01,
           2.05757059e-05,   3.84603472e-05,   1.25988264e-01,
           3.73952700e-01,   2.05757059e-05],
        [  3.37158404e-02,   2.56813628e-05,   1.23558906e-05,
           4.66246122e-01,   3.37158404e-02,   2.56813628e-05,
           1.23558906e-05,   4.66246122e-01],
        [  1.28650966e-02,   3.52292092e-07,   1.07876011e-17,
           4.87134551e-01,   1.28650966e-02,   3.52292092e-07,
           1.07876011e-17,   4.87134551e-01],
        [  1.63744904e-06,   5.30658312e-02,   4.46932531e-01,
           3.46381928e-22,   1.63744904e-06,   5.30658312e-02,
           4.46932531e-01,   3.46381928e-22],
        [  3.84603472e-05,   1.25988264e-01,   3.73952700e-01,
           2.05757059e-05,   3.84603472e-05,   1.25988264e-01,
           3.73952700e-01,   2.05757059e-05],
        [  3.37158404e-02,   2.56813628e-05,   1.23558906e-05,
           4.66246122e-01,   3.37158404e-02,   2.56813628e-05,
           1.2

In [38]:
# Run settings 4,10,5000 -- presumably maxNoOfStates=4, convergenceIters=10,
# maxSequences=5000 (TODO confirm; the training call for this run is not shown).
(A, B)

(array([[ 0.1364827 ,  0.18664557,  0.31405828,  0.16281345],
        [ 0.15608442,  0.1496722 ,  0.22001805,  0.27422533],
        [ 0.1701589 ,  0.15330267,  0.17353171,  0.30300671],
        [ 0.13618877,  0.16192936,  0.39828321,  0.10359866]]),
 array([[ 0.1201357 ,  0.11696151,  0.08324975,  0.47965303],
        [ 0.22581657,  0.18487907,  0.08772761,  0.30157675],
        [ 0.31548659,  0.2241051 ,  0.08116688,  0.17924144],
        [ 0.02593511,  0.02730334,  0.0258246 ,  0.72093695]]))

In [36]:
# Run settings 4,4,5000 -- presumably maxNoOfStates=4, convergenceIters=4,
# maxSequences=5000 (TODO confirm; the training call for this run is not shown).
(A, B)

(array([[ 0.1966799 ,  0.20011403,  0.21340815,  0.18979792],
        [ 0.19678702,  0.19577753,  0.19968287,  0.20775258],
        [ 0.19253404,  0.18672076,  0.18550081,  0.23524439],
        [ 0.1882928 ,  0.20259477,  0.25646367,  0.15264876]]),
 array([[ 0.18102743,  0.14381412,  0.06991829,  0.40524015],
        [ 0.20809308,  0.16294735,  0.07514728,  0.35381229],
        [ 0.25498474,  0.19355009,  0.08561325,  0.26585191],
        [ 0.08771256,  0.07386999,  0.04053156,  0.59788589]]))

In [34]:
# Run settings 16,10,5000 -- presumably maxNoOfStates=16, convergenceIters=10,
# maxSequences=5000 (TODO confirm; the training call for this run is not shown).
(A, B)

(array([[ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05],
        [ 0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,  0.05,
          0

In [32]:
# Run settings 16,10,1000 -- presumably maxNoOfStates=16, convergenceIters=10,
# maxSequences=1000 (TODO confirm; the training call for this run is not shown).
(A, B)

(array([[ 0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446],
        [ 0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446],
        [ 0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446],
        [ 0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446,  0.05115446,  0.05115446,  0.05115446,  0.05115446,
          0.05115446],
        [ 0.05115446

In [30]:
# Display the learned transition matrix A and emission matrix B.
(A, B)

(array([[ 0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882],
        [ 0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882],
        [ 0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882],
        [ 0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882,  0.05330882,  0.05330882,  0.05330882,  0.05330882,
          0.05330882],
        [ 0.05330882

In [39]:
def getHmmRank(prefix,A,ATranspose,B,BTranspose,pi,uniqueSymbols):
    """Rank the candidate next symbols for ``prefix``, most likely first.

    For every symbol in ``0 .. uniqueSymbols-1`` the likelihood of
    ``prefix + [symbol]`` is computed with ``getObservationLikelihood``; the
    symbols are returned sorted by decreasing likelihood, ties keeping
    ascending symbol order.

    Parameters:
        prefix: sequence of already observed symbol indices (left unmodified).
        A, ATranspose: transition matrix and its transpose.
        B, BTranspose: emission matrix and its transpose.
        pi: initial state weights.
        uniqueSymbols: number of candidate symbols; every candidate is used as
            a column index into ``B``.

    Returns:
        List of symbol indices (plain ints).
    """
    # Work on a copy so the caller's list is never touched, even if the
    # likelihood computation raises half-way through.
    history = list(prefix)
    likelihoods = []
    for symbol in range(int(uniqueSymbols)):
        observations = np.array(history + [symbol])
        obsrLikelihood = getObservationLikelihood(observations,A,ATranspose,B,BTranspose,pi)
        likelihoods.append((symbol, obsrLikelihood))
    # sorted() is stable, so equal likelihoods keep ascending symbol order.
    likelihoods = sorted(likelihoods, key=lambda pair: -pair[1])
    return [symbol for symbol, _ in likelihoods]

In [40]:
def list_to_string(l):
    """Join the ``str()`` of every element of ``l`` with single spaces.

    An empty input yields the empty string.
    """
    return " ".join(str(x) for x in l)
def formatString(string_in):
    """Strip surrounding whitespace and encode every remaining space as %20."""
    trimmed = string_in.strip()
    return "%20".join(trimmed.split(" "))
# get the test first prefix: the only element of the test set
def get_first_prefix(test_file):
    """Return the first line of ``test_file``, including its trailing newline.

    Called for the public test file, which holds only one line.
    """
    # The context manager closes the file even if reading fails.
    with open(test_file) as f:
        return f.readline()
def predictOnSpicePublicData(problem_number,name):
    """Submit the ranking for the first public test prefix of a SPiCe problem.

    Reads the single prefix from ``Data/<problem_number>.spice.public.test.txt``,
    ranks the candidate next symbols with the module-level model ``A``, ``B``,
    ``pi`` and sends the five best to the SPiCe server.

    Parameters:
        problem_number: SPiCe problem identifier (converted with ``str``).
        name: submission name placed in the URL.

    Returns:
        ``(content, url_base)``: the server reply (decoded to text on
        Python 3) and the URL prefix to reuse for the following submissions.
    """
    problem_number = str(problem_number)
    user_id = '68'
    prefix_file = 'Data/'+problem_number+'.spice.public.test.txt'
    first_prefix = get_first_prefix(prefix_file)
    # The first token of the line is skipped; the rest are the observed symbols.
    tokens = first_prefix.split()
    prefix = [int(tok) for tok in tokens[1:]]
    print("Prefix ",prefix)
    # Candidates are symbols, i.e. columns of B. Using the number of states
    # (A.shape[0]) would index past B or leave symbols unranked whenever the
    # state count differs from the alphabet size.
    ranking = getHmmRank(prefix,A,A.transpose(),B,B.transpose(),pi,B.shape[1])
    print("Model Ranking ",ranking)
    # Transform prefix and ranking to the submission format.
    ranking_string = formatString(list_to_string(ranking[:5]))
    first_prefix = formatString(first_prefix)

    # Create the url to submit the ranking.
    url_base = 'http://spice.lif.univ-mrs.fr/submit.php?user=' + user_id +\
        '&problem=' + problem_number + '&submission=' + name + '&'
    url = url_base + 'prefix=' + first_prefix + '&prefix_number=1' + '&ranking=' +\
        ranking_string
    response = ur.urlopen(url)
    try:
        print("URL ",url)
        content = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()
    # The reply is the next prefix to predict for.
    print("Response from SPiCe ",content)
    if not orl2:
        # urlopen returns bytes on Python 3.
        content = content.decode('utf-8')
    return content,url_base

In [41]:
# Submit the first public prefix of problem 0 and keep the server reply
# together with the URL prefix for the follow-up submissions.
spiceContentOnPubFile, url_base = predictOnSpicePublicData(0, "hmm_baseline_v1.3")

Prefix  [3, 0, 3, 0, 1, 3, 3]
Model Ranking  [3, 0, 1, 2]
URL  http://spice.lif.univ-mrs.fr/submit.php?user=68&problem=0&submission=hmm_baseline_v1.3&prefix=7%203%200%203%200%201%203%203&prefix_number=1&ranking=3%200%201%202
Response from SPiCe  b'2 3 3 \n'


In [42]:
def evaluateOnSpiceTrainDataSet(prevContent,url_base):
    """Answer the prefixes served by the SPiCe server until it reports the end.

    Starting from ``prevContent`` (the reply to the first submission), each
    received prefix is ranked with the module-level model ``A``, ``B``, ``pi``
    and submitted; the reply carries the next prefix. The loop ends when the
    reply starts with ``[Error]`` or ``[Success]``. The final reply and its
    last token (the score, for a public test set) are printed.

    Parameters:
        prevContent: text of the previous server reply, ending in a newline.
        url_base: URL prefix returned by ``predictOnSpicePublicData``.

    Raises:
        RuntimeError: if the server returns an empty reply.
    """
    prefix_number = 2
    head = ''
    content = prevContent
    while(head != '[Error]' and head != '[Success]'):
        # Drop the trailing '\n' of the sequence returned by the server.
        prefix = content[:-1]
        # The first token is skipped; the rest are the observed symbols.
        prefix_list = [int(tok) for tok in prefix.split()[1:]]
        # Candidates are symbols, i.e. columns of B (not the state count A.shape[0]).
        ranking = getHmmRank(prefix_list,A,A.transpose(),B,B.transpose(),pi,B.shape[1])
        # At most the five best-ranked symbols are submitted.
        ranking_string = list_to_string(ranking[:5])
        if prefix_number % 200 == 0:
            print("Prefix number: " + str(prefix_number) + " Ranking: " + ranking_string + " Prefix: " + prefix)
        # Put ranking and prefix into the submission format.
        ranking_string = formatString(ranking_string)
        prefix = formatString(prefix)
        # Create the url with the ranking to get the next prefix.
        url = url_base + 'prefix=' + prefix + '&prefix_number=' +\
            str(prefix_number) + '&ranking=' + ranking_string
        response = ur.urlopen(url)
        try:
            content = response.read()
        finally:
            # Release the connection whether or not the read succeeded.
            response.close()
        if not orl2:
            # urlopen returns bytes on Python 3.
            content = content.decode('utf-8')
        list_element = content.split()
        if not list_element:
            raise RuntimeError("Empty response from the SPiCe server for prefix number " + str(prefix_number))
        # The first token tells whether the run finished or an error occurred.
        head = str(list_element[0])
        prefix_number += 1
    # Post-treatment: the score is the last element of the final reply.
    print(content)
    score = content.split()[-1]
    print(score)

In [None]:
# Answer all remaining prefixes served by the SPiCe server and print the final score.
evaluateOnSpiceTrainDataSet(spiceContentOnPubFile, url_base)

In [25]:
# Answer all remaining prefixes served by the SPiCe server and print the final score.
evaluateOnSpiceTrainDataSet(spiceContentOnPubFile, url_base)

Prefix number: 200 Ranking: 3 0 1 2 Prefix: 8 3 0 1 3 3 1 2 1 
Prefix number: 400 Ranking: 3 0 1 2 Prefix: 7 3 3 1 0 1 3 3 
Prefix number: 600 Ranking: 3 0 1 2 Prefix: 3 3 0 1 
Prefix number: 800 Ranking: 3 0 1 2 Prefix: 1 3 
Prefix number: 1000 Ranking: 3 0 1 2 Prefix: 2 3 0 
[Success] Last prefix of the test set. The score of the submission named hmm_baseline_Ver1.5 on problem 0 is 0.85064155226946

0.85064155226946
