In [1]:
import numpy as np
import math
import sys
from collections import defaultdict
from math import expm1
import time

In [2]:
def computeInitialProb(trainDataFile,numOfStates):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    numOfStates=min(numOfStates,distinctObservations)
    empiricalCount=np.zeros(shape=numOfStates)
    empiricalFreq=defaultdict(int)
    for n in range(numSequences):
        line = trainFile.readline()#Reading Sequences 1 by 1
        l = line.split(" ")
        startState=int(l[1])
        empiricalFreq[startState] = empiricalFreq[startState]+1
    totalObservations=0
    for i in np.arange(numOfStates):
        empiricalCount[i]=empiricalFreq[i]
        totalObservations=totalObservations+empiricalCount[i]
    initialProb=[count/totalObservations for count in empiricalCount]
    return (numOfStates,distinctObservations,initialProb)

In [3]:
def createRandomMatrixA(numOfStates):
    matrixA=np.ndarray(shape=(numOfStates,numOfStates),dtype=float)
    prob=1.0/(numOfStates*numOfStates)
    matrixA.fill(prob)
    return matrixA
def createRandomMatrixB(numOfStates,distinctObservations):
    matrixB=np.ndarray(shape=(numOfStates,distinctObservations),dtype=float)
    prob=1.0/(numOfStates*distinctObservations)
    matrixB.fill(prob)
    return matrixB

In [4]:
def computeAlpha(observations,a,aTranspose,b,bTranspose,pi,alphaDP):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    alphaDpScaleTime0=0
    #bTranspose=b.transpose()
    alphaDP[0]=pi*bTranspose[observations[0]]
    alphaDpScaleTime0=np.sum(alphaDP[0])
    alphaDP[0]/=alphaDpScaleTime0
    #aTranspose=a.transpose()
    for t in np.arange(1,timePts):
        alphaDpScaleTimeT=0
        for i in np.arange(statesC):
            alphaDP[t][i]=(np.sum(alphaDP[t-1]*aTranspose[i]))*b[i][observations[t]]
        alphaDpScaleTimeT=np.sum(alphaDP[t])
        alphaDP[t]/=alphaDpScaleTimeT
def observationsLikelihood(alphaDP):
    timePts=alphaDP.shape[0]
    ans=0.0
    ans=np.sum(alphaDP[timePts-1])
    return ans

In [5]:
def computeBeta(observations,a,b,bTranspose,pi,betaDP):
    statesC=a.shape[0]
    timePts=observations.shape[0]
    if timePts<1:
        return
    betaDP[timePts-1].fill(1)
    #bTranspose=b.transpose()
    for t in np.arange(timePts-2,-1,-1):
        betaDpScaleTimeT=0
        for i in np.arange(statesC):
            betaDP[t][i]=np.sum(a[i]*bTranspose[observations[t+1]]*betaDP[t+1])
        betaDpScaleTimeT=np.sum(betaDP[t])
        betaDP[t]/=betaDpScaleTimeT
    return betaDP

In [6]:
def computeDiGammaDP(alphaDP,betaDP,a,b,bTranspose,observations):
    observationsC=alphaDP.shape[0]
    statesC=alphaDP.shape[1]
    diGammaDP=np.zeros(shape=(statesC,statesC),dtype=float)
    diGammaDenom=observationsLikelihood(alphaDP)
    #bTranspose=b.transpose()
    for i in np.arange(statesC):
        for t in np.arange(observationsC-1):
            diGammaDP[i]+=alphaDP[t][i]*a[i]*bTranspose[observations[t+1]]*betaDP[t+1]
    diGammaDP/=diGammaDenom
    return diGammaDP
def computeTransitionProbabilityA(alphaDP,betaDP,a,b,bTranspose,observations):
    statesC=alphaDP.shape[1]
    newlyComputedTransitionProbA=np.zeros(shape=(statesC,statesC),dtype=float)
    diGammaDP=computeDiGammaDP(alphaDP,betaDP,a,b,bTranspose,observations)
    diGammaDPSumGrpByJ=np.apply_along_axis(np.sum,1,diGammaDP)
    for i in np.arange(statesC):    
        if (diGammaDPSumGrpByJ[i]==0):
            newlyComputedTransitionProbA[i]=0.0
        else:
            newlyComputedTransitionProbA[i]=diGammaDP[i]/diGammaDPSumGrpByJ[i]
    return newlyComputedTransitionProbA   

In [7]:
def computeGammaDP(alphaDP,betaDP):
    gammaDenom=observationsLikelihood(alphaDP)
    gammaDP=alphaDP*betaDP#[Time][State]
    gammaDP/=gammaDenom
    return gammaDP
def computeTransitionProbabilityB(alphaDP,betaDP,a,b,observations,observationDict):
    statesC=a.shape[0]
    observationsC=b.shape[1]
    newlyComputedObsrProbB=np.zeros(shape=(observationsC,statesC),dtype=float)#Ideal Shape should be transposed
    gammaDP=computeGammaDP(alphaDP,betaDP)#[t][state]
    obsrProbDenomVec=np.apply_along_axis(np.sum,0,gammaDP)#Sum Across Rows i.e. Across each state
    for vk in observationDict:
            newlyComputedObsrProbB[vk]=np.sum(gammaDP[np.where(observations==vk)])/obsrProbDenomVec
    newlyComputedObsrProbB=newlyComputedObsrProbB.transpose()#Back to Shape[i][vk]
    return newlyComputedObsrProbB

In [8]:
#Change Convergence Criteria to be more reasonable/Useful
def isConverged(count,convergenceIters):
    if count>=convergenceIters:
        return True
    return False
def Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict):
    count=0
    updatedA=A
    updatedB=B
    while isConverged(count,convergenceIters)==False:
        #Expectation(E)-Step
        alphaDP=np.zeros(shape=(observations.shape[0],updatedA.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        betaDP=np.zeros(shape=(observations.shape[0],updatedA.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        updatedATranspose=updatedA.transpose()
        updatedBTranspose=updatedB.transpose()
        computeAlpha(observations,updatedA,updatedATranspose,updatedB,updatedBTranspose,pi,alphaDP)
        computeBeta(observations,updatedA,updatedB,updatedBTranspose,pi,betaDP)
        #Maximization(M)-Step
        newA=computeTransitionProbabilityA(alphaDP,betaDP,updatedA,updatedB,updatedBTranspose,observations)
        newB=computeTransitionProbabilityB(alphaDP,betaDP,updatedA,updatedB,observations,observationDict)
        updatedA=newA
        updatedB=newB
        count=count+1
    return (updatedA,updatedB)

In [9]:
def trainHMM(trainDataFile,A,B,pi,convergenceIters,maxSequences=-1):
    trainFile=open(trainDataFile,"r")
    metaDataLine = trainFile.readline()
    headerLine = metaDataLine.split(" ")
    numSequences = int(headerLine[0])
    distinctObservations= int(headerLine[1])#Total Number of Distinct Observations
    observationDict=np.arange(distinctObservations)
    updatedA=np.NaN
    updatedB=np.NaN
    isAUpdated=False
    if(maxSequences==-1):
        usedSeqs=numSequences
    else:
        usedSeqs=min(maxSequences,numSequences)
    actuallyUsedSeqs=0
    for n in range(usedSeqs):
        line = trainFile.readline()#Reading Sequences 1 by 1
        l = line.split(" ")
        if(int(l[0])<=1):
            continue
        actuallyUsedSeqs+=1
        observations=np.array([int(i) for i in l[1:len(l)]])
        learnedParams=Forward_Backward_EM_Algo(observations,A,B,pi,convergenceIters,observationDict)
        if isAUpdated==False:
            isAUpdated=True
            updatedA=learnedParams[0]
            updatedB=learnedParams[1]
        else:
            updatedA+=learnedParams[0]
            updatedB+=learnedParams[1]
    updatedA=updatedA/actuallyUsedSeqs
    updatedB=updatedB/actuallyUsedSeqs
    return (updatedA,updatedB)

In [24]:
def trainModel(fileLoc,maxNoOfStates,convergenceIters,maxSequences=-1):
    start = time.time()
    initialProbs=computeInitialProb(fileLoc,maxNoOfStates)
    end = time.time()
    print("Computed Initial Prob. in ", end - start ,"seconds")
    pi=initialProbs[2]
    numOfStates=initialProbs[0]
    distinctObservations=initialProbs[1]
    A=createRandomMatrixA(numOfStates)
    B=createRandomMatrixB(numOfStates,distinctObservations)
    trainedParams=trainHMM(fileLoc,A,B,pi,convergenceIters,maxSequences)
    trainedParams=trainedParams+(pi,)
    end=time.time()
    print("For ",maxSequences," Sequences : Total Training Time ",end-start," seconds")
    return trainedParams

In [11]:
(A,B)=trainModel('Data/1.spice.train.txt',20,7,1)
#(A,B)

Computed Initial Prob. in  0.373828649520874 seconds
For  1  Sequences : Total Training Time  1.0151755809783936  seconds


In [12]:
%timeit trainModel('Data/1.spice.train.txt',20,7,15)


Computed Initial Prob. in  0.16173624992370605 seconds
For  15  Sequences : Total Training Time  4.560023069381714  seconds
Computed Initial Prob. in  0.15972256660461426 seconds
For  15  Sequences : Total Training Time  4.6220409870147705  seconds
Computed Initial Prob. in  0.15830111503601074 seconds
For  15  Sequences : Total Training Time  4.576323509216309  seconds
Computed Initial Prob. in  0.1583256721496582 seconds
For  15  Sequences : Total Training Time  4.626648426055908  seconds
1 loops, best of 3: 4.58 s per loop


In [13]:
%timeit trainModel('Data/1.spice.train.txt',20,7,15)
#(A,B)

Computed Initial Prob. in  0.16886186599731445 seconds
For  15  Sequences : Total Training Time  4.5674731731414795  seconds
Computed Initial Prob. in  0.196702241897583 seconds
For  15  Sequences : Total Training Time  4.59807562828064  seconds
Computed Initial Prob. in  0.15811681747436523 seconds
For  15  Sequences : Total Training Time  4.655432939529419  seconds
Computed Initial Prob. in  0.22752022743225098 seconds
For  15  Sequences : Total Training Time  4.61660361289978  seconds
1 loops, best of 3: 4.6 s per loop


In [13]:
%timeit trainModel('Data/1.spice.train.txt',20,7,150)

Computed Initial Prob. in  0.16386866569519043 seconds
For  150  Sequences : Total Training Time  49.785178661346436  seconds
Computed Initial Prob. in  0.16109967231750488 seconds
For  150  Sequences : Total Training Time  51.17705059051514  seconds
Computed Initial Prob. in  0.22313308715820312 seconds
For  150  Sequences : Total Training Time  50.26441955566406  seconds
Computed Initial Prob. in  0.15892481803894043 seconds
For  150  Sequences : Total Training Time  49.6641047000885  seconds
1 loops, best of 3: 49.7 s per loop


In [14]:
trainModel('Data/0.spice.train.txt',20,7,150)
#(A,B)

Computed Initial Prob. in  0.15481114387512207 seconds
For  150  Sequences : Total Training Time  2.766151189804077  seconds


(array([[ 0.25065612,  0.25065612,  0.25065612,  0.13614352],
        [ 0.25065612,  0.25065612,  0.25065612,  0.13614352],
        [ 0.25065612,  0.25065612,  0.25065612,  0.13614352],
        [ 0.28793509,  0.28793509,  0.28793509,  0.13619472]]),
 array([[ 0.89089087,  0.66680948,  0.41210032,  2.25830004],
        [ 0.89089087,  0.66680948,  0.41210032,  2.25830004],
        [ 0.89089087,  0.66680948,  0.41210032,  2.25830004],
        [ 0.83195173,  0.6505008 ,  0.41041104,  1.87133779]]))

In [15]:
(A,B,pi)=trainModel('Data/0.spice.train.txt',20,7,1500)
print(A,B)

Computed Initial Prob. in  0.062479496002197266 seconds
For  1500  Sequences : Total Training Time  24.869220495224  seconds


(array([[ 0.24017683,  0.24017683,  0.24017683,  0.13540768],
        [ 0.24017683,  0.24017683,  0.24017683,  0.13540768],
        [ 0.24017683,  0.24017683,  0.24017683,  0.13540768],
        [ 0.28817546,  0.28817546,  0.28817546,  0.13547361]]),
 array([[ 0.93656972,  0.57671503,  0.36336451,  2.41459482],
        [ 0.93656972,  0.57671503,  0.36336451,  2.41459482],
        [ 0.93656972,  0.57671503,  0.36336451,  2.41459482],
        [ 0.90163462,  0.55596422,  0.3553712 ,  1.89068153]]))

In [27]:
(A,B,pi)=trainModel('Data/0.spice.train.txt',20,7,15000)

Computed Initial Prob. in  0.06518864631652832 seconds
For  15000  Sequences : Total Training Time  254.3390028476715  seconds


In [28]:
(A,B,pi)=trainModel('Data/0.spice.train.txt',20,7)

Computed Initial Prob. in  0.06556081771850586 seconds
For  -1  Sequences : Total Training Time  355.021213054657  seconds


In [40]:
A,type(A)

(array([[ 0.24010455,  0.24010455,  0.24010455,  0.13705686],
        [ 0.24010455,  0.24010455,  0.24010455,  0.13705686],
        [ 0.24010455,  0.24010455,  0.24010455,  0.13705686],
        [ 0.28762596,  0.28762596,  0.28762596,  0.13712213]]), numpy.ndarray)

In [47]:
a6=[2,3,4,5]
a7=np.array(a6)
print(a7,type(a7))

[2 3 4 5] <class 'numpy.ndarray'>


In [48]:
def getHmmRank(prefix,A,ATranspose,B,BTranspose,pi,uniqueSymbols):
    likelihoods=[]
    for i in np.arange(uniqueSymbols):
        prefix.append(i)
        observations=np.array(prefix)
        alphaDP=np.zeros(shape=(observations.shape[0],A.shape[0]))# Count_of_Observations*Count_of_Hidden_States
        computeAlpha(observations,B,ATranspose,B,BTranspose,pi,alphaDP)
        obsrLikelihood=observationsLikelihood(alphaDP)
        prefix.pop()
        likelihoods.append((i,obsrLikelihood))
        print(likelihoods[i])
    ranks=[i[0] for i in likelihoods]
    return ranks

In [49]:
def list_to_string(l):
    s=str(l[0])
    for x in l[1:]:
        s+= " " + str(x)
    return(s)
def formatString(string_in):
    """ Replace white spaces by %20 """
    return string_in.strip().replace(" ", "%20")
# State the problem number
problem_number = '0'

# and the user id (given during registration)
user_id = '68'

# name of this submission (no space or special character)

name = "hmm_Baseline"

train_file = 'Data/0.spice.train.txt'
#(A,B,pi)=trainModel('Data/0.spice.train.txt',20,7)
prefix_file = 'Data/0.spice.public.test.txt'

In [51]:
# get the test first prefix: the only element of the test set
def get_first_prefix(test_file):
    """ get the only prefix in test_file
        This function is called for the public test file(Which only has 1 line)
    """
    f = open(test_file)
    prefix = f.readline()
    f.close()
    return prefix
first_prefix = get_first_prefix(prefix_file)
prefix_number=1
# get the next symbol ranking on the first prefix
p=first_prefix.split()
prefix=[int(i) for i in p[1:len(p)]]#prefix holds the sequence of values in the public test file(Note:It has only 1 Seq)
print("Prefix ",prefix)
ranking=getHmmRank(prefix,A,A.transpose(),B,B.transpose(),pi,A.shape[0])
print(ranking)
ranking_string=list_to_string(ranking[:5])
print("Prefix number: " + str(prefix_number) + " Ranking: " + ranking_string + " Prefix: " + first_prefix)

Prefix  [3, 0, 3, 0, 1, 3, 3]
(0, 1.0)
(1, 1.0)
(2, 1.0)
(3, 1.0)
[0, 1, 2, 3]
Prefix number: 1 Ranking: 0 1 2 3 Prefix: 7 3 0 3 0 1 3 3


In [52]:
print(A)

[[ 0.24010455  0.24010455  0.24010455  0.13705686]
 [ 0.24010455  0.24010455  0.24010455  0.13705686]
 [ 0.24010455  0.24010455  0.24010455  0.13705686]
 [ 0.28762596  0.28762596  0.28762596  0.13712213]]


In [53]:
print(B)

[[ 0.94886774  0.56048745  0.38347267  2.39554741]
 [ 0.94886774  0.56048745  0.38347267  2.39554741]
 [ 0.94886774  0.56048745  0.38347267  2.39554741]
 [ 0.91057537  0.54420096  0.37487202  1.87687201]]
