# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from hmmlearn import hmm
import warnings
from constants import *
import math
import random
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# Helper Methods

In [2]:
def generate_random_sample(X, size):
    '''
    Given a list X, 
    generate random samples of given size
    '''
    Z_temp = random.sample(list(X), size)
    
    #Concatenation
    Z = [Z_temp[0]]
    for val in Z_temp[1:]:
        Z = np.concatenate([Z,[val]])
    
    return Z

def convert_values_to_list(list_val):
    '''
    Given a list X = [1 2 3] , 
    return X = [[1],[2],[3]]
    '''
    X = []
    for i in list_val:
        X.append([i])
    return X

def conversion_list_of_list(X, DIMENSION):
    '''
    Given a list X with values in lists,
    X = [[ 3  2  2],[4 8 10]]
    Convert each value to a list
    Return list of lists, array of lengths of each sequence
    X = [[3] [2] [2] [4] [8] [10]]
    ''' 
    X_new = []
    length = len(X)
    for idx, val_list in enumerate(X):
        Y = []
        for val in val_list:
            Y.append([val])
        X_new.append(Y)

    #Concatenation
    Z = X_new[0]
    for val_list in X_new[1:]:
        Z = np.concatenate([Z,val_list])

    # assign array of lengths for HMM
    lengths = [DIMENSION]*length
    
    return Z,lengths

#Calculate likelihood for given sequence according to given HMMs and return HMM
def likelihood_sequence(sequence, HMM_array):
    '''
    Given list of K HMMs and sequence,
    determines likelihood of sequence under all HMM models
    Returns index of HMM which has max likelihood
    ''' 
    scores = []
    length = [len(sequence)]
    for i, HMM in enumerate(HMM_array):
        calculated_score = HMM.score(sequence, length)
        scores.append(calculated_score)
    idx = scores.index(max(scores))
    return idx

def HMM_model_stats(model):
    '''
    Details of HMM model
    ''' 
    print("*************************************")
    print("Transition matrix")
    print(model.transmat_)
    print("*************************************")
    print("Means and stds of each hidden state")
    for i in range(model.n_components):
        print("Hidden state {0}".format(i))
        print("mean = ", model.means_[i])
        print("std = ", [math.sqrt(model.covars_[i])])
        print()

def BIC(HMM,X):
    LogLikelihood = model.score(X)
    num_hidden_states = model.n_components
    # D counts transition matrix, emission matrix, sequences estimated (Z), covariance matrix
    D = (num_hidden_states) + 2*(num_hidden_states**2) + len(X)*DIMENSION 
    BIC = LogLikelihood - (D/2)*np.log(len(X))
    return BIC
    
def BIC_array(HMM_array,X_i):
    BIC_total = 0
    for i in range(len(X_i)):
        model = HMM_array[i]
        X, lengths = conversion_list_of_list(X_i[i],DIMENSION)
        LogLikelihood = model.score(X,lengths)
        num_hidden_states = model.n_components
        # D counts transition matrix, emission matrix, sequences estimated (Z), covariance matrix
        D = num_hidden_states + 2*(num_hidden_states**2) + len(X)*DIMENSION 
        BIC = LogLikelihood - (D/2)*np.log(len(X))
        BIC_total+= BIC
    return BIC_total

def likelihood_array(HMM_array,X_i):
    likelihood_total = 0
    for i in range(len(X_i)):
        if(len(X_i[i])!=0):
            model = HMM_array[i]
            X, lengths = conversion_list_of_list(X_i[i],DIMENSION)
            LogLikelihood = model.score(X,lengths)
            likelihood = LogLikelihood 
            likelihood_total+= likelihood
    return likelihood_total

def plot_BIC(list_k, BIC_score):
    fig = plt.subplot(111)
    plt.plot(list_k, BIC_score, marker='o')  
    plt.xlabel('Value of K')
    plt.ylabel('Objective')
    plt.title('BIC')
    plt.show() 

# Load Data and Clean

In [3]:
df = pd.read_csv('Data/merged.txt', sep=",", na_values=['-'])
df = df.dropna()
df = df[['cdReads0','cdReads1','cdReads2','cdRPKM0','cdRPKM1','cdRPKM2']]

# Filter cdReads

In [4]:
df = df[(df['cdReads0'] >= 50) & (df['cdReads1'] >= 50) & (df['cdReads2'] >= 50)]

In [5]:
#Dataset
df_main = df[['cdReads0','cdReads1','cdReads2']]
LENGTH,DIMENSION = df_main.shape
print("Dataset size is",LENGTH)
print("Features are", DIMENSION)
print(df_main.head(5))
X = np.log2(df_main.values)
print("****************************")
print("First 5 log2 values\n",X[:5])

Dataset size is 2072
Features are 3
    cdReads0  cdReads1  cdReads2
6      113.0     180.0     292.0
19     455.0     340.0     326.0
34     353.0     397.0     499.0
57     113.0     150.0     137.0
81     126.0     175.0     202.0
****************************
First 5 log2 values
 [[ 6.82017896  7.4918531   8.18982456]
 [ 8.82972274  8.40939094  8.34872815]
 [ 8.46352437  8.6329952   8.96289601]
 [ 6.82017896  7.22881869  7.09803208]
 [ 6.97727992  7.45121111  7.65821148]]


# Arrays with HMM models for 1<=K<=25

In [6]:
HMM_K_ARRAYS_1 = []
X_i_K_ARRAYS_1 = []

# Check likelihood and do assignments

In [7]:
K_values_1 = [28]

In [12]:
for K in K_values_1:
    HMM_array = []
    X_i = []
    print("**************** K =", K ,"************************")
    for i in range(K):
        X_i.append([])

    # Sequences for initial HMM estimation
    for i in range(LENGTH):
        for j in range(K):
            if(i%K==j):
                X_i[j].append(list(X[i]))
    
    NUM_ITERATIONS = 0
    NUM_CLUSTER_PREV = {}
    NUM_CLUSTER_NOW = {}
                
    for i in range(K):
        model = hmm.GaussianHMM(n_components=3,covariance_type='spherical')
        X_temp, lengths = conversion_list_of_list(X_i[i],DIMENSION)
        model.fit(X_temp, lengths)
        HMM_array.append(model)
    
    likelihood_prev = likelihood_array(HMM_array,X_i)
    print("Likelihood for iteration",NUM_ITERATIONS,"is",likelihood_prev)
    NUM_ITERATIONS+=1
    while (True):
        # Assign all sequences to HMM models

        print("************ Check likelihood of sequence in HMM  *********")
        NUM_CLUSTER_NOW = {}
        for x in X:
            sequence = convert_values_to_list(x)
            hmm_index = likelihood_sequence(sequence, HMM_array)
            X_i[hmm_index].append(list(x))
            if (hmm_index not in NUM_CLUSTER_NOW):
                NUM_CLUSTER_NOW[hmm_index] = 1
            else:
                NUM_CLUSTER_NOW[hmm_index] += 1
        print("************ Checking likelihood done  *********")

        # Re-estimate parameters for new HMMs
        print("************ Re-estimating HMM *********")
        HMM_array_prev = HMM_array
        HMM_array = []
        for i in range(K):
            model = hmm.GaussianHMM(n_components=3,covariance_type='spherical')
            if(len(X_i[i])!=0):
                X_temp, lengths = conversion_list_of_list(X_i[i], DIMENSION)
                model.fit(X_temp, lengths)
                HMM_array.append(model)
            else:
                HMM_array.append(HMM_array_prev[i])
        print("************ Re-estimation done *********")
        print("Previous assignments",NUM_CLUSTER_PREV)
        print("Current assignments",NUM_CLUSTER_NOW)
        likelihood_curr = likelihood_array(HMM_array,X_i)
        print("Likelihood for iteration",NUM_ITERATIONS,"is",likelihood_curr)
        print("*****************************************")

        # if no reassignments, then break
        if ((NUM_CLUSTER_PREV == NUM_CLUSTER_NOW) or NUM_ITERATIONS>100):
            HMM_K_ARRAYS_1[26]=HMM_array
            X_i_K_ARRAYS_1[26]=X_i
            break
        else:
            # initialize empty subsets of data for next iteration
            X_i = []
            for i in range(K):
                X_i.append([])

            NUM_CLUSTER_PREV = NUM_CLUSTER_NOW
            print("Num iterations is:", NUM_ITERATIONS)
            NUM_ITERATIONS += 1
            likelihood_prev = likelihood_curr
    print("**********************************************************\n\n")

**************** K = 28 ************************
Likelihood for iteration 0 is -5379.4633368
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {}
Current assignments {12: 196, 2: 65, 25: 148, 26: 171, 4: 35, 6: 53, 1: 181, 0: 288, 13: 238, 15: 93, 24: 199, 21: 10, 14: 73, 19: 56, 10: 44, 8: 22, 22: 83, 11: 35, 5: 1, 3: 34, 17: 11, 20: 28, 23: 6, 7: 1, 27: 1}
Likelihood for iteration 1 is -7442.59907648
*****************************************
Num iterations is: 1
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {12: 196, 2: 65, 25: 148, 26: 171, 4: 35, 6: 53, 1: 181, 0: 288, 13: 238, 15: 93, 24: 199, 21: 10, 14: 73, 19: 56, 10: 44, 8: 22, 22: 83, 11: 35, 5: 1, 3: 

Likelihood for iteration 11 is 1776.74197475
*****************************************
Num iterations is: 11
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {7: 47, 2: 52, 20: 55, 26: 88, 12: 56, 8: 61, 16: 48, 10: 94, 0: 110, 22: 100, 23: 75, 25: 115, 1: 88, 13: 165, 19: 92, 24: 160, 21: 63, 14: 59, 9: 43, 11: 113, 5: 1, 17: 82, 4: 57, 3: 32, 15: 72, 18: 40, 6: 89, 27: 15}
Current assignments {7: 48, 2: 53, 20: 56, 26: 84, 12: 57, 8: 59, 16: 48, 10: 94, 0: 101, 22: 99, 23: 76, 25: 111, 1: 87, 13: 164, 19: 94, 24: 156, 21: 61, 14: 59, 9: 46, 11: 113, 5: 1, 17: 84, 4: 56, 3: 34, 15: 72, 18: 51, 27: 18, 6: 90}
Likelihood for iteration 12 is 1800.21422786
*****************************************
Num iterations is: 12
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
*

Likelihood for iteration 22 is 1886.68303715
*****************************************
Num iterations is: 22
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {7: 49, 2: 53, 20: 58, 26: 72, 12: 51, 8: 56, 22: 99, 10: 92, 23: 93, 9: 65, 25: 107, 1: 87, 13: 146, 19: 95, 0: 88, 24: 114, 21: 60, 14: 57, 16: 58, 11: 122, 5: 1, 17: 90, 27: 32, 3: 33, 4: 51, 15: 70, 18: 81, 6: 92}
Current assignments {7: 50, 2: 53, 20: 58, 26: 68, 12: 51, 8: 58, 22: 98, 10: 92, 23: 93, 9: 69, 25: 107, 1: 87, 13: 146, 19: 96, 0: 85, 24: 113, 21: 60, 14: 57, 16: 59, 11: 122, 5: 1, 17: 90, 27: 32, 3: 33, 4: 51, 15: 70, 18: 81, 6: 92}
Likelihood for iteration 23 is 1892.25181227
*****************************************
Num iterations is: 23
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
****

Likelihood for iteration 33 is 1933.49696432
*****************************************
Num iterations is: 33
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Previous assignments {7: 48, 2: 62, 20: 57, 9: 76, 12: 50, 8: 59, 22: 95, 10: 83, 23: 101, 25: 110, 1: 84, 26: 65, 13: 147, 19: 98, 0: 80, 24: 103, 21: 60, 14: 57, 16: 81, 11: 121, 5: 1, 17: 86, 27: 32, 3: 32, 4: 51, 15: 70, 18: 74, 6: 89}
Current assignments {7: 48, 2: 62, 20: 57, 9: 76, 12: 50, 8: 59, 22: 95, 10: 83, 23: 101, 25: 109, 1: 84, 26: 65, 13: 147, 19: 98, 0: 81, 24: 104, 21: 60, 14: 57, 16: 82, 11: 121, 5: 1, 17: 85, 27: 32, 3: 32, 4: 51, 15: 70, 18: 74, 6: 88}
Likelihood for iteration 34 is 1932.98429296
*****************************************
Num iterations is: 34
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
**

Likelihood for iteration 44 is 1939.06883094
*****************************************


IndexError: list assignment index out of range

In [14]:
HMM_K_ARRAY_26 = []
X_i_K_ARRAYS_26 = []
HMM_K_ARRAY_26.append(HMM_array)
X_i_K_ARRAYS_26.append(X_i)

In [15]:
import dill
dill.dump_session('notebook_env4.db')

# Calculate BIC for these assignments

In [None]:
print(BIC_array(HMM_array,X_i))

In [None]:
print(BIC_array(HMM_array,X_i))

# Noise Cluster

In [None]:
# Get list of lists to fit
Z, lengths = conversion_list_of_list(X,DIMENSION)

In [None]:
# Noise cluster 

model_noise = hmm.GaussianHMM(n_components=3,covariance_type='spherical')
model_noise.fit(Z, lengths)

In [None]:
HMM_model_stats(model_noise)

# Generate subsets of data for K HMMs

In [None]:
K = 3
HMM_array = []
X_i = []

for i in range(K):
    X_i.append([])

# Sequences for initial HMM estimation
for i in range(LENGTH):
    for j in range(K):
        if(i%K==j):
            X_i[j].append(list(X[i]))

In [None]:
for i in range(K):
    model = hmm.GaussianHMM(n_components=3,covariance_type='spherical')
    X_temp, lengths = conversion_list_of_list(X_i[i],DIMENSION)
    model.fit(X_temp, lengths)
    HMM_array.append(model)

In [None]:
likelihood_prev = likelihood_array(HMM_array,X_i)
print("Likelihood for iteration",0,"is",likelihood_prev)

## Statistics for K HMMs

In [None]:
for i in range(K):
    print("Statistics for HMM Model", i)
    HMM_model_stats(HMM_array[i])
    print('\n\n')

In [None]:
NUM_ITERATIONS = 1
NUM_CLUSTER_PREV = {}
NUM_CLUSTER_NOW = {}

In [None]:
#initialize empty subsets of data
X_i = []

for i in range(K):
    X_i.append([])