# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from hmmlearn import hmm
import warnings
from constants import *
import math
import random
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# Helper Methods

In [2]:
def generate_random_sample(X, size):
    '''
    Draw `size` distinct elements from X uniformly at random.

    Parameters
    ----------
    X : iterable
        Population to sample from; it is materialised with list(X).
    size : int
        Number of elements to draw (without replacement).

    Returns
    -------
    numpy.ndarray
        The sampled elements stacked along the first axis. An array is
        returned for every size, including 0 and 1.

    Raises
    ------
    ValueError
        Propagated from random.sample when size is negative or larger
        than the population.
    '''
    # Build the array in one step instead of re-allocating it with
    # np.concatenate once per sampled element.
    return np.array(random.sample(list(X), size))

def convert_values_to_list(list_val):
    '''
    Wrap every element of list_val in its own one-item list,
    e.g. [1, 2, 3] -> [[1], [2], [3]]
    '''
    return [[item] for item in list_val]

def conversion_list_of_list(X, DIMENSION):
    '''
    Flatten a collection of sequences into one column of single-value
    observations.

    Example
    -------
    X = [[3, 2, 2], [4, 8, 10]], DIMENSION = 3
    returns ([[3], [2], [2], [4], [8], [10]], [3, 3])

    Parameters
    ----------
    X : sequence of sequences
        One inner sequence per training sequence.
    DIMENSION : int
        Length reported for every sequence in the returned lengths list.
        The caller must pass the true sequence length; it is not checked
        against the data here.

    Returns
    -------
    Z : numpy.ndarray
        Every value of every sequence, in order, each wrapped as [value].
        Has shape (0, 1) when X is empty.
    lengths : list of int
        [DIMENSION] repeated once per sequence in X.
    '''
    columns = [[[val] for val in val_list] for val_list in X]

    # assign array of lengths for HMM
    lengths = [DIMENSION] * len(columns)

    if not columns:
        # Nothing to stack; return an empty column instead of failing on
        # an index into an empty list.
        return np.empty((0, 1)), lengths

    # Stack all sequences in a single call rather than growing the array
    # pairwise, which copies the accumulated data on every step.
    Z = np.concatenate(columns)

    return Z, lengths

# Score a sequence under every given HMM and return the index of the best one
def likelihood_sequence(sequence, HMM_array):
    '''
    Call score(sequence, [len(sequence)]) on each model in HMM_array and
    return the position of the model with the highest score.
    When several models tie, the first of them wins.
    '''
    seq_lengths = [len(sequence)]
    model_scores = [candidate.score(sequence, seq_lengths) for candidate in HMM_array]
    best_score = max(model_scores)
    return model_scores.index(best_score)

def HMM_model_stats(model):
    '''
    Print the transition matrix of a model followed by the mean and
    standard deviation of every hidden state.

    Parameters
    ----------
    model : object
        Must expose transmat_, n_components, means_ and covars_.
        covars_[i] may be a single variance, a vector of per-feature
        variances or a square covariance matrix.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    print("*************************************")
    print("Transition matrix")
    print(model.transmat_)
    print("*************************************")
    print("Means and stds of each hidden state")
    for i in range(model.n_components):
        covariance = np.asarray(model.covars_[i])
        # The variances sit on the diagonal of a matrix; a scalar or vector
        # entry already is the variance list. Handling all three avoids the
        # failure of math.sqrt on anything larger than a single value.
        if covariance.ndim == 2:
            variances = np.diag(covariance)
        else:
            variances = np.atleast_1d(covariance)
        print("Hidden state {0}".format(i))
        print("mean = ", model.means_[i])
        print("std = ", np.sqrt(variances).tolist())
        print()

def BIC(HMM, X, dimension=None):
    '''
    Penalised score of one model on data X:
    HMM.score(X) - (D/2) * log(len(X)).

    Parameters
    ----------
    HMM : object
        Model exposing score(X) and n_components.
    X : sequence
        Observations passed to HMM.score; len(X) is the sample count.
    dimension : int, optional
        Multiplier applied to len(X) in the penalty term. Defaults to the
        module-level DIMENSION when omitted.

    Returns
    -------
    float
        The penalised score.
    '''
    if dimension is None:
        dimension = DIMENSION
    # Score the model that was passed in, not whatever global `model`
    # happens to be bound to.
    log_likelihood = HMM.score(X)
    num_hidden_states = HMM.n_components
    num_samples = len(X)
    # D counts transition matrix, emission matrix, sequences estimated (Z), covariance matrix
    D = num_hidden_states + 2*(num_hidden_states**2) + num_samples*dimension
    return log_likelihood - (D/2)*np.log(num_samples)
    
def BIC_array(HMM_array,X_i):
    '''
    Add up the penalised score of every model on its own subset.
    Subset X_i[k] is flattened with conversion_list_of_list and scored by
    HMM_array[k]; the per-model value is score - (D/2)*log(sample count).
    '''
    running_total = 0
    for position, subset in enumerate(X_i):
        hmm_model = HMM_array[position]
        observations, seq_lengths = conversion_list_of_list(subset, DIMENSION)
        subset_score = hmm_model.score(observations, seq_lengths)
        n_states = hmm_model.n_components
        n_obs = len(observations)
        # D counts transition matrix, emission matrix, sequences estimated (Z), covariance matrix
        penalty_size = n_states + 2*(n_states**2) + n_obs*DIMENSION
        running_total += subset_score - (penalty_size/2)*np.log(n_obs)
    return running_total

def likelihood_array(HMM_array,X_i):
    '''
    Add up the score of every model on its own subset.
    Subset X_i[k] is flattened with conversion_list_of_list and scored by
    HMM_array[k].
    '''
    running_total = 0
    for position, subset in enumerate(X_i):
        observations, seq_lengths = conversion_list_of_list(subset, DIMENSION)
        running_total += HMM_array[position].score(observations, seq_lengths)
    return running_total

def plot_BIC(list_k, BIC_score):
    '''
    Draw BIC_score against list_k as a line with circular markers
    and display the figure.
    '''
    plt.subplot(111)
    plt.plot(list_k, BIC_score, marker='o')
    plt.title('BIC')
    plt.xlabel('Value of K')
    plt.ylabel('Objective')
    plt.show()

# Load Data and Clean

In [3]:
# Read the comma-separated dataset and record its size
df_main = pd.read_csv('Data/cleaned.txt', sep=",")
LENGTH = df_main.shape[0]
DIMENSION = df_main.shape[1]
print("Dataset size is",LENGTH)
print("Features are", DIMENSION)
print(df_main.head(5))
# All later cells work on the log2-transformed values
X = np.log2(df_main.values)
print("****************************")
print("First 5 log2 values\n",X[:5])

Dataset size is 11087
Features are 3
     cdRPKM0    cdRPKM1    cdRPKM2
0   8.539825   5.879642   5.990043
1  13.837680   4.128452   6.882499
2  39.456786  62.505536  84.519655
3   2.104887   3.488833   2.908098
4  10.010800  17.066902   9.484017
****************************
First 5 log2 values
 [[ 3.09420655  2.55572822  2.58256628]
 [ 3.79053016  2.04560095  2.78293239]
 [ 5.30220154  5.96591207  6.40121497]
 [ 1.07374244  1.80274443  1.54007587]
 [ 3.32348535  4.09312932  3.24549826]]


# Generate subsets of data for K HMMs

In [16]:
K = 3
HMM_array = []
# One bucket of sequences per HMM
X_i = [[] for _ in range(K)]

# Sequences for initial HMM estimation: row i goes to bucket i mod K
for i in range(LENGTH):
    X_i[i % K].append(list(X[i]))

# Train K HMMs

In [17]:
# Fit one 3-state spherical Gaussian HMM on each initial subset
for i in range(K):
    X_temp, lengths = conversion_list_of_list(X_i[i], DIMENSION)
    model = hmm.GaussianHMM(n_components=3, covariance_type='spherical')
    model.fit(X_temp, lengths)
    HMM_array.append(model)

In [18]:
# Summed score of the initial models, each evaluated on its own subset
likelihood_prev = likelihood_array(HMM_array,X_i)
print("Likelihood for iteration",0,"is",likelihood_prev)

Likelihood for iteration 0 is -56315.3467566


## Statistics for K HMMs

In [None]:
# Print the transition matrix and per-state mean/std of each of the K models
for i in range(K):
    print("Statistics for HMM Model", i)
    HMM_model_stats(HMM_array[i])
    print('\n\n')

# Check likelihood and do assignments

In [19]:
# Pass counter shown in the progress messages of the clustering loop
NUM_ITERATIONS = 1
# Cluster index -> number of sequences assigned in the previous pass
NUM_CLUSTER_PREV = {}
# Cluster index -> number of sequences assigned in the current pass
NUM_CLUSTER_NOW = {}

In [20]:
#initialize empty subsets of data
X_i = []

for i in range(K):
    X_i.append([])

In [21]:
# Hard-assignment clustering: alternate between (1) assigning every sequence
# to the HMM that scores it highest and (2) re-fitting each HMM on the
# sequences assigned to it, until no sequence changes cluster.

# Safety cap: each pass re-fits freshly initialised models, so the
# assignments are not guaranteed to ever become perfectly stable.
MAX_CLUSTERING_PASSES = 500
# Cluster index chosen for every row of X in the previous pass
assignments_prev = None

while (True):
    # Assign all sequences to HMM models

    print("************ Check likelihood of sequence in HMM  *********")
    # Empty the subsets here so a pass never inherits sequences from an
    # earlier pass or from the initial split.
    X_i = [[] for _ in range(K)]
    assignments_now = []
    NUM_CLUSTER_NOW = {}
    for x in X:
        sequence = convert_values_to_list(x)
        hmm_index = likelihood_sequence(sequence, HMM_array)
        X_i[hmm_index].append(list(x))
        assignments_now.append(hmm_index)
        NUM_CLUSTER_NOW[hmm_index] = NUM_CLUSTER_NOW.get(hmm_index, 0) + 1
    print("************ Checking likelihood done  *********")


    # Re-estimate parameters for new HMMs
    print("************ Re-estimating HMM *********")
    HMM_array = []
    for i in range(K):
        model = hmm.GaussianHMM(n_components=3,covariance_type='spherical')
        X_temp, lengths = conversion_list_of_list(X_i[i], DIMENSION)
        model.fit(X_temp, lengths)
        HMM_array.append(model)
    print("************ Re-estimation done *********")
    print("Previous assignments",NUM_CLUSTER_PREV)
    print("Current assignments",NUM_CLUSTER_NOW)
    likelihood_curr = likelihood_array(HMM_array,X_i)
    print("Likelihood for iteration",NUM_ITERATIONS,"is",likelihood_curr)
    print("*****************************************")

    # if no reassignments, then break.
    # Compare the per-sequence assignments, not the cluster sizes: two
    # sequences swapping clusters leaves every size unchanged.
    if assignments_now == assignments_prev:
        break
    if NUM_ITERATIONS >= MAX_CLUSTERING_PASSES:
        print("Stopping after", NUM_ITERATIONS, "iterations without stable assignments")
        break

    assignments_prev = assignments_now
    NUM_CLUSTER_PREV = NUM_CLUSTER_NOW
    print("Num iterations is:", NUM_ITERATIONS)
    NUM_ITERATIONS += 1
    likelihood_prev = likelihood_curr

************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {}
Current assignments {0: 5464, 2: 592, 1: 5031}
Likelihood for iteration 1 is -42154.3231539
*****************************************
Num iterations is: 1
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {0: 5464, 2: 592, 1: 5031}
Current assignments {0: 4814, 1: 5139, 2: 1134}
Likelihood for iteration 2 is -40289.0881891
*****************************************
Num iterations is: 2
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {0: 4814, 1: 5139, 2

Likelihood for iteration 21 is -37437.6622948
*****************************************
Num iterations is: 21
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {0: 3680, 2: 3714, 1: 3693}
Current assignments {0: 3676, 2: 3699, 1: 3712}
Likelihood for iteration 22 is -37439.6901228
*****************************************
Num iterations is: 22
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {0: 3676, 2: 3699, 1: 3712}
Current assignments {0: 3679, 2: 3677, 1: 3731}
Likelihood for iteration 23 is -37435.4835508
*****************************************
Num iterations is: 23
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood

************ Re-estimation done *********
Previous assignments {2: 3642, 0: 3713, 1: 3732}
Current assignments {2: 3649, 0: 3713, 1: 3725}
Likelihood for iteration 42 is -37353.146622
*****************************************
Num iterations is: 42
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3649, 0: 3713, 1: 3725}
Current assignments {2: 3648, 0: 3724, 1: 3715}
Likelihood for iteration 43 is -37346.314563
*****************************************
Num iterations is: 43
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3648, 0: 3724, 1: 3715}
Current assignments {2: 3649, 0: 3729, 1: 3709}
Likelihood for iteration 44 is -37340.5308811
*******************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3646, 0: 3766, 1: 3675}
Current assignments {2: 3648, 0: 3764, 1: 3675}
Likelihood for iteration 63 is -37304.9092784
*****************************************
Num iterations is: 63
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3648, 0: 3764, 1: 3675}
Current assignments {2: 3650, 0: 3763, 1: 3674}
Likelihood for iteration 64 is -37304.9929418
*****************************************
Num iterations is: 64
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3650, 0: 3763, 1: 3674}
Current assignments {

Likelihood for iteration 83 is -37241.68267
*****************************************
Num iterations is: 83
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3785, 0: 3786, 1: 3516}
Current assignments {2: 3795, 0: 3777, 1: 3515}
Likelihood for iteration 84 is -37238.8014297
*****************************************
Num iterations is: 84
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3795, 0: 3777, 1: 3515}
Current assignments {2: 3799, 0: 3771, 1: 3517}
Likelihood for iteration 85 is -37234.1416762
*****************************************
Num iterations is: 85
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood d

************ Re-estimation done *********
Previous assignments {2: 3752, 0: 3773, 1: 3562}
Current assignments {2: 3750, 0: 3772, 1: 3565}
Likelihood for iteration 104 is -37220.2148657
*****************************************
Num iterations is: 104
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3750, 0: 3772, 1: 3565}
Current assignments {2: 3746, 0: 3773, 1: 3568}
Likelihood for iteration 105 is -37220.2349781
*****************************************
Num iterations is: 105
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3746, 0: 3773, 1: 3568}
Current assignments {2: 3738, 0: 3775, 1: 3574}
Likelihood for iteration 106 is -37219.566224
*************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3720, 0: 3765, 1: 3602}
Current assignments {2: 3717, 0: 3765, 1: 3605}
Likelihood for iteration 125 is -37199.3695775
*****************************************
Num iterations is: 125
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3717, 0: 3765, 1: 3605}
Current assignments {2: 3718, 0: 3764, 1: 3605}
Likelihood for iteration 126 is -37199.5288913
*****************************************
Num iterations is: 126
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {2: 3718, 0: 3764, 1: 3605}
Current assignmen

# Calculate BIC for these assignments

In [None]:
# Penalised score summed over all models and their assigned subsets
print(BIC_array(HMM_array,X_i))

In [15]:
# Same computation as the previous cell; this run's output is shown below
print(BIC_array(HMM_array,X_i))

-484262.352425


# Noise Cluster

In [None]:
# Flatten every row of X into single-value observations and build the
# matching per-sequence lengths list for fitting
Z, lengths = conversion_list_of_list(X,DIMENSION)

In [None]:
# Noise cluster: a single HMM fitted on the whole flattened dataset (Z)

model_noise = hmm.GaussianHMM(n_components=3,covariance_type='spherical')
model_noise.fit(Z, lengths)

In [None]:
# Print the transition matrix and per-state mean/std of the noise model
HMM_model_stats(model_noise)