# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from hmmlearn import hmm
import warnings
from constants import *
import math
import os
import random
import matplotlib.pyplot as plt
import dill
from functools import reduce

# Helper Methods

In [2]:
def generate_random_sample(X, size):
    '''
    Given a list X, 
    generate random samples of given size
    '''
    Z_temp = random.sample(list(X), size)
    
    #Concatenation
    Z = [Z_temp[0]]
    for val in Z_temp[1:]:
        Z = np.concatenate([Z,[val]])
    
    return Z

def convert_values_to_list(list_val):
    '''
    Given a list X = [1 2 3] , 
    return X = [[1],[2],[3]]
    '''
    X = []
    for i in list_val:
        X.append([i])
    return X

def conversion_list_of_list(X, DIMENSION):
    '''
    Given a list X with values in lists,
    X = [[ 3  2  2],[4 8 10]]
    Convert each value to a list
    Return list of lists, array of lengths of each sequence
    X = [[3] [2] [2] [4] [8] [10]]
    ''' 
    X_new = []
    length = len(X)
    for idx, val_list in enumerate(X):
        Y = []
        for val in val_list:
            Y.append([val])
        X_new.append(Y)

    #Concatenation
    Z = X_new[0]
    for val_list in X_new[1:]:
        Z = np.concatenate([Z,val_list])

    # assign array of lengths for HMM
    lengths = [DIMENSION]*length
    
    return Z,lengths

#Calculate likelihood for given sequence according to given HMMs and return HMM
def likelihood_sequence(sequence, HMM_array):
    '''
    Given list of K HMMs and sequence,
    determines likelihood of sequence under all HMM models
    Returns index of HMM which has max likelihood
    ''' 
    scores = []
    length = [len(sequence)]
    for i, HMM in enumerate(HMM_array):
        calculated_score = HMM.score(sequence, length)
        scores.append(calculated_score)
    idx = scores.index(max(scores))
    return idx

def HMM_model_stats(model):
    '''
    Details of HMM model
    ''' 
    print("*************************************")
    print("Transition matrix")
    print(model.transmat_)
    print("*************************************")
    print("Means and stds of each hidden state")
    for i in range(model.n_components):
        print("Hidden state {0}".format(i))
        print("mean = ", model.means_[i])
        print("std = ", [np.sqrt(model.covars_[i])])
        print()

def BIC(model,X,lengths):
    LogLikelihood = model.score(X,lengths)
    num_hidden_states = model.n_components
    # D counts transition matrix (emission estimated by PDF), means = num_hidden_states  
    # covariance matrix = num_hidden_states
    D = num_hidden_states**2 + 2*num_hidden_states
    BIC = LogLikelihood - (D/2)*np.log(len(X))
    return BIC
    
def BIC_array(HMM_array,X_i):
    BIC_total = 0
    for i in range(len(X_i)):
        model = HMM_array[i]
        X, lengths = conversion_list_of_list(X_i[i],DIMENSION)
        BIC_total+= BIC(model,X,lengths)
    return BIC_total

def likelihood_array(HMM_array,X_i):
    likelihood_total = 1
    for i in range(len(X_i)):
        if(len(X_i[i])>=HMM_array[i].n_components):
            model = HMM_array[i]
            X, lengths = conversion_list_of_list(X_i[i],DIMENSION)
            LogLikelihood = model.score(X,lengths)
            likelihood = LogLikelihood 
            likelihood_total*= likelihood
    return likelihood_total

def plot_BIC(list_k, BIC_score):
    fig = plt.subplot(111)
    plt.plot(list_k, BIC_score, marker='o')  
    plt.xlabel('Value of K')
    plt.ylabel('Objective')
    plt.title('BIC')
    plt.show() 

def print_stats(assignments,length):
    for i in range(length):
        if(i%100==0):
            print(assignments[i])   

# Merge data

In [13]:
previous = 'RPKMOutput/GeneNames/RNASeq/geneTXCD_RPKMoutput_'
timepoints = ['ATCACG-s_6_1_genome.txt','TTAGGC-s_6_1_genome.txt',
             'CAGATC-s_6_1_genome.txt','GCCAAT-s_6_1_genome.txt',
             'CTTGTA-s_6_1_genome.txt'
            ]
dataframes = []
for idx,file in enumerate(timepoints):
    df = pd.read_csv(previous+file, sep="	", na_values=['-'])
    df = df.dropna(how='any')
    df = df[['AccNum', 'GeneName','cdReads',
       'cdRPKM']]
    df.rename(columns={'cdRPKM':'cdRPKM'+str(idx), 'cdReads':'cdReads'+str(idx)}, inplace=True)
    dataframes.append(df)

In [14]:
df_rnaseq = reduce(lambda left,right: pd.merge(left,right,on=['AccNum','GeneName']), dataframes)

In [15]:
df_rnaseq.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4
0,NM_017847,ODR4,50.0,6.642349,62.0,5.591876,71.0,6.004645,30.0,5.369988,51.0,8.507454
1,NM_001143986,TLE6,4.0,0.421957,6.0,0.429708,3.0,0.201468,3.0,0.426413,3.0,0.397381
2,NM_001003803,ATP5S,29.0,8.115351,52.0,9.879314,52.0,9.263817,19.0,7.164127,22.0,7.730539
3,NM_001003800,BICD2,778.0,54.937502,1155.0,55.37139,1444.0,64.913315,706.0,67.17294,883.0,78.293873
4,NM_016649,ESF1,17.0,1.20607,26.0,1.252307,10.0,0.451649,9.0,0.860333,20.0,1.781686


In [17]:
# df_rnaseq.to_csv('RPKMOutput/RNASeq.txt',sep=' ',index=False)

In [18]:
previous = 'RPKMOutput/GeneNames/RPF/geneTXCD_RPKMoutput_'
timepoints = ['ATCACG-s_7_1_genome.txt','TTAGGC-s_7_1_genome.txt',
             'CAGATC-s_7_1_genome.txt','GCCAAT-s_7_1_genome.txt',
             'CTTGTA-s_7_1_genome.txt'
            ]
dataframes = []
for idx,file in enumerate(timepoints):
    df = pd.read_csv(previous+file, sep="	", na_values=['-'])
    df = df.dropna(how='any')
    df = df[['AccNum', 'GeneName','cdReads',
       'cdRPKM']]
    df.rename(columns={'cdRPKM':'cdRPKM'+str(idx), 'cdReads':'cdReads'+str(idx)}, inplace=True)
    dataframes.append(df)
df_rpf = reduce(lambda left,right: pd.merge(left,right,on=['AccNum','GeneName']), dataframes)
df_rpf.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4
0,NM_017847,ODR4,93.0,16.487335,62.0,13.206484,49.0,15.968568,37.0,16.540714,39.0,20.464235
1,NM_001143986,TLE6,4.0,0.563098,2.0,0.338285,2.0,0.517555,1.0,0.354984,1.0,0.416666
2,NM_001003803,ATP5S,81.0,30.248942,83.0,37.241851,71.0,48.740039,46.0,43.317961,26.0,28.738355
3,NM_001003800,BICD2,501.0,47.210965,389.0,44.043594,284.0,49.195554,257.0,61.069347,230.0,64.149959
4,NM_016649,ESF1,69.0,6.532635,52.0,5.915217,41.0,7.135519,26.0,6.207228,26.0,7.28578


In [19]:
# df_rpf.to_csv('RPKMOutput/RPF.txt',sep=' ',index=False)

In [20]:
# df_rnaseq = df_rnaseq[['AccNum','GeneName', 'cdRPKM0', 'cdRPKM1',
#        'cdRPKM2', 'cdRPKM3', 'cdRPKM4']]
# df_rpf = df_rpf[['AccNum','GeneName', 'cdRPKM0', 'cdRPKM1',
#        'cdRPKM2', 'cdRPKM3', 'cdRPKM4']]
df_TE = reduce(lambda left,right: pd.merge(left,right,on=['AccNum','GeneName']), [df_rnaseq,df_rpf])
df_TE.head()

Unnamed: 0,AccNum,GeneName,cdReads0_x,cdRPKM0_x,cdReads1_x,cdRPKM1_x,cdReads2_x,cdRPKM2_x,cdReads3_x,cdRPKM3_x,...,cdReads0_y,cdRPKM0_y,cdReads1_y,cdRPKM1_y,cdReads2_y,cdRPKM2_y,cdReads3_y,cdRPKM3_y,cdReads4_y,cdRPKM4_y
0,NM_017847,ODR4,50.0,6.642349,62.0,5.591876,71.0,6.004645,30.0,5.369988,...,93.0,16.487335,62.0,13.206484,49.0,15.968568,37.0,16.540714,39.0,20.464235
1,NM_001143986,TLE6,4.0,0.421957,6.0,0.429708,3.0,0.201468,3.0,0.426413,...,4.0,0.563098,2.0,0.338285,2.0,0.517555,1.0,0.354984,1.0,0.416666
2,NM_001003803,ATP5S,29.0,8.115351,52.0,9.879314,52.0,9.263817,19.0,7.164127,...,81.0,30.248942,83.0,37.241851,71.0,48.740039,46.0,43.317961,26.0,28.738355
3,NM_001003800,BICD2,778.0,54.937502,1155.0,55.37139,1444.0,64.913315,706.0,67.17294,...,501.0,47.210965,389.0,44.043594,284.0,49.195554,257.0,61.069347,230.0,64.149959
4,NM_016649,ESF1,17.0,1.20607,26.0,1.252307,10.0,0.451649,9.0,0.860333,...,69.0,6.532635,52.0,5.915217,41.0,7.135519,26.0,6.207228,26.0,7.28578


In [21]:
df_TE[['cdRPKM0_x']] = df_TE[['cdRPKM0_x']].div(df_TE['cdRPKM0_y'].values,axis=0)
df_TE[['cdRPKM1_x']] = df_TE[['cdRPKM1_x']].div(df_TE['cdRPKM1_y'].values,axis=0)
df_TE[['cdRPKM2_x']] = df_TE[['cdRPKM2_x']].div(df_TE['cdRPKM2_y'].values,axis=0)
df_TE[['cdRPKM3_x']] = df_TE[['cdRPKM3_x']].div(df_TE['cdRPKM3_y'].values,axis=0)
df_TE[['cdRPKM4_x']] = df_TE[['cdRPKM4_x']].div(df_TE['cdRPKM4_y'].values,axis=0)
for i in range(0,5):
    df_TE.rename(columns={'cdRPKM'+str(i)+'_x':'TE'+str(i)}, inplace=True)


In [22]:
df_TE.keys()

Index(['AccNum', 'GeneName', 'cdReads0_x', 'TE0', 'cdReads1_x', 'TE1',
       'cdReads2_x', 'TE2', 'cdReads3_x', 'TE3', 'cdReads4_x', 'TE4',
       'cdReads0_y', 'cdRPKM0_y', 'cdReads1_y', 'cdRPKM1_y', 'cdReads2_y',
       'cdRPKM2_y', 'cdReads3_y', 'cdRPKM3_y', 'cdReads4_y', 'cdRPKM4_y'],
      dtype='object')

In [23]:
df_TE = df_TE[['AccNum', 'GeneName','TE0','TE1','TE2','TE3','TE4']]
df_TE.head()

Unnamed: 0,AccNum,GeneName,TE0,TE1,TE2,TE3,TE4
0,NM_017847,ODR4,0.402876,0.423419,0.376029,0.324653,0.415723
1,NM_001143986,TLE6,0.749349,1.270257,0.389269,1.201215,0.953718
2,NM_001003803,ATP5S,0.268285,0.265275,0.190066,0.165385,0.268997
3,NM_001003800,BICD2,1.16366,1.257195,1.319496,1.099945,1.220482
4,NM_016649,ESF1,0.184622,0.211709,0.063296,0.138602,0.244543


In [24]:
# df_TE.to_csv('RPKMOutput/TE.txt',sep=' ',index=False)

# Load Data and Clean

In [25]:
df_rnaseq.keys()

Index(['AccNum', 'GeneName', 'cdReads0', 'cdRPKM0', 'cdReads1', 'cdRPKM1',
       'cdReads2', 'cdRPKM2', 'cdReads3', 'cdRPKM3', 'cdReads4', 'cdRPKM4'],
      dtype='object')

In [26]:
#Dataset
df_rnaseq = df_rnaseq[(df_rnaseq['cdReads0'] >= 50) & (df_rnaseq['cdReads1'] >= 50) & (df_rnaseq['cdReads2'] >= 50)& (df_rnaseq['cdReads3'] >= 50)& (df_rnaseq['cdReads4'] >= 50)]
df_main = df_rnaseq[['cdRPKM0','cdRPKM1','cdRPKM2','cdRPKM3','cdRPKM4']]
LENGTH,DIMENSION = df_main.shape
print("Dataset size is",LENGTH)
print("Features are", DIMENSION)
print(df_main.head(5))
X = np.log2(df_main.values)
print("****************************")
print("First 5 log2 values\n",X[:5])

Dataset size is 6830
Features are 5
      cdRPKM0    cdRPKM1    cdRPKM2    cdRPKM3    cdRPKM4
3   54.937502  55.371390  64.913315  67.172940  78.293873
5   49.166094  35.735701  40.321640  35.461616  35.952554
8   35.030842  32.456656  35.944257  37.945878  41.572410
12  24.639646  22.985487  17.653063  16.492664  19.366281
14  41.400942  41.318228  31.891348  33.470470  34.830747
****************************
First 5 log2 values
 [[ 5.7797194   5.79106884  6.02044253  6.06980827  6.29082751]
 [ 5.61959185  5.1592942   5.33348242  5.1481864   5.16802236]
 [ 5.13055377  5.02044247  5.16768936  5.24587127  5.37755449]
 [ 4.62290962  4.52265131  4.14184663  4.04375252  4.27547504]
 [ 5.37159168  5.36870648  4.99509317  5.06481692  5.1222895 ]]


# Arrays with HMM models for 1<=K<=25

In [27]:
HMM_K_ARRAYS = []
X_i_K_ARRAYS = []

# Check likelihood and do assignments

In [28]:
K_values = range(2,26)

In [29]:
for K in K_values:
    HMM_array = []
    X_i = []
    print("**************** K =", K ,"************************")
    for i in range(K):
        X_i.append([])
        
    NUM_ITERATIONS = 0
    NUM_CLUSTER_PREV = {}
    NUM_CLUSTER_NOW = {}
    
    # Sequences for initial HMM estimation
    # Make K subsets data of LENGTH
    for i in range(LENGTH):
        for j in range(K):
            if(i%K==j):
                X_i[j].append(list(X[i]))
                NUM_CLUSTER_PREV[i] = j
                
    for i in range(K):
        model = hmm.GaussianHMM(n_components=3,covariance_type='diag')
        X_temp, lengths = conversion_list_of_list(X_i[i],DIMENSION)
        model.fit(X_temp, lengths)
        HMM_array.append(model)
    
    likelihood_prev = likelihood_array(HMM_array,X_i)
    print("Likelihood for iteration",NUM_ITERATIONS,"is",likelihood_prev)
    NUM_ITERATIONS+=1
    while (True):
        # Assign all sequences to HMM models

        print("************ Check likelihood of sequence in HMM  *********")
        NUM_CLUSTER_NOW = {}
        for idx,x in enumerate(X):
            sequence = convert_values_to_list(x)
            hmm_index = likelihood_sequence(sequence, HMM_array)
            X_i[hmm_index].append(list(x))
            NUM_CLUSTER_NOW[idx] = hmm_index
        print("************ Checking likelihood done  *********")

        # Re-estimate parameters for new HMMs
        print("************ Re-estimating HMM *********")
        HMM_array_prev = HMM_array
        HMM_array = []
        for i in range(K):
            model = hmm.GaussianHMM(n_components=3,covariance_type='diag')
            if(len(X_i[i])>=model.n_components):
                X_temp, lengths = conversion_list_of_list(X_i[i], DIMENSION)
                model.fit(X_temp, lengths)
                HMM_array.append(model)
            else:
                HMM_array.append(HMM_array_prev[i])
        print("************ Re-estimation done *********")
        likelihood_curr = likelihood_array(HMM_array,X_i)
        print("Likelihood for iteration",NUM_ITERATIONS,"is",likelihood_curr)
        print("*****************************************")

        # if no reassignments, then break
        if ((NUM_CLUSTER_PREV == NUM_CLUSTER_NOW) or NUM_ITERATIONS>40):
            HMM_K_ARRAYS.append(HMM_array)
            X_i_K_ARRAYS.append(X_i)
            break
        else:
            # initialize empty subsets of data for next iteration
            X_i = []
            for i in range(K):
                X_i.append([])

            NUM_CLUSTER_PREV = NUM_CLUSTER_NOW
            print("Num iterations is:", NUM_ITERATIONS)
            NUM_ITERATIONS += 1
            likelihood_prev = likelihood_curr
    print("**********************************************************\n\n")

**************** K = 2 ************************
Likelihood for iteration 0 is 449291319.043
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 1 is 1366805366.85
*****************************************
Num iterations is: 1
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 2 is 139405870.878
*****************************************
Num iterations is: 2
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 3 is 137601213.814
*****************************************
Num iterations is: 3
***********

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 28 is 135485655.502
*****************************************
Num iterations is: 28
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 29 is 135439043.128
*****************************************
Num iterations is: 29
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 30 is 135403444.188
*****************************************
Num iterations is: 30
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
*******

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 14 is -108597370476.0
*****************************************
Num iterations is: 14
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 15 is -108494516807.0
*****************************************
Num iterations is: 15
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 16 is -108374437981.0
*****************************************
Num iterations is: 16
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
*

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 12 is 4.01368914192e+13
*****************************************
Num iterations is: 12
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 13 is 4.00265907666e+13
*****************************************
Num iterations is: 13
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 14 is 4.01481799263e+13
*****************************************
Num iterations is: 14
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *****

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 39 is 3.95607952644e+13
*****************************************
Num iterations is: 39
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 40 is 3.95645985778e+13
*****************************************
Num iterations is: 40
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 41 is 3.96139715662e+13
*****************************************
**********************************************************


**************** K = 5 ************************
Likelihood for iteration 0 is -4.35403255341e+19
*********

Likelihood for iteration 24 is -4.07221832523e+15
*****************************************
Num iterations is: 24
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 25 is -4.27682887781e+15
*****************************************
Num iterations is: 25
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 26 is -4.43113869256e+15
*****************************************
Num iterations is: 26
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 27 is -4.57408672395e+15
********************************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 10 is 6.49658075388e+16
*****************************************
Num iterations is: 10
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 11 is 7.03192247011e+16
*****************************************
Num iterations is: 11
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 12 is 5.70200438047e+16
*****************************************
Num iterations is: 12
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *****

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 37 is -5.88951501804e+16
*****************************************
Num iterations is: 37
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 38 is -5.88107940062e+16
*****************************************
Num iterations is: 38
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 39 is -5.74745523765e+16
*****************************************
Num iterations is: 39
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM **

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 23 is 6.84841718257e+19
*****************************************
Num iterations is: 23
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 24 is 6.9489287886e+19
*****************************************
Num iterations is: 24
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 25 is 6.47911637984e+19
*****************************************
Num iterations is: 25
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM ******

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 9 is -1.38403519164e+21
*****************************************
Num iterations is: 9
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 10 is -1.02501926814e+21
*****************************************
Num iterations is: 10
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 11 is -7.13828549519e+20
*****************************************
Num iterations is: 11
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM ****

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 36 is 6.89332919315e+19
*****************************************
Num iterations is: 36
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 37 is 6.39853244309e+19
*****************************************
Num iterations is: 37
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 38 is 6.40595180682e+19
*****************************************
Num iterations is: 38
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *****

Likelihood for iteration 21 is -5.80138499693e+21
*****************************************
Num iterations is: 21
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 22 is -9.0180580013e+21
*****************************************
Num iterations is: 22
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 23 is -1.18362893838e+22
*****************************************
Num iterations is: 23
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 24 is -1.37570805574e+22
*********************************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 7 is -7.09851418957e+26
*****************************************
Num iterations is: 7
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 8 is 4.76666774299e+24
*****************************************
Num iterations is: 8
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 9 is 4.96672844846e+26
*****************************************
Num iterations is: 9
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********


************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 34 is 1.21924436948e+27
*****************************************
Num iterations is: 34
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 35 is 1.02652277898e+27
*****************************************
Num iterations is: 35
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 36 is 8.44983238867e+26
*****************************************
Num iterations is: 36
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *****

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 20 is 7.91973831071e+29
*****************************************
Num iterations is: 20
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 21 is 6.09802409727e+29
*****************************************
Num iterations is: 21
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 22 is 4.76215675964e+29
*****************************************
Num iterations is: 22
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *****

Likelihood for iteration 5 is -1.3046704772e+31
*****************************************
Num iterations is: 5
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 6 is -3.59016753147e+30
*****************************************
Num iterations is: 6
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 7 is 1.6135709486e+29
*****************************************
Num iterations is: 7
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 8 is -1.23203533584e+29
*****************************************


Likelihood for iteration 32 is 2.54282479155e+29
*****************************************
Num iterations is: 32
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 33 is 2.07950590542e+29
*****************************************
Num iterations is: 33
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 34 is 2.03926039661e+29
*****************************************
Num iterations is: 34
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 35 is 1.35108779773e+29
************************************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 18 is -4.06602360477e+35
*****************************************
Num iterations is: 18
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 19 is -2.79123580073e+35
*****************************************
Num iterations is: 19
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 20 is -1.84932327747e+35
*****************************************
Num iterations is: 20
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM **

Likelihood for iteration 3 is 7.3586256659e+33
*****************************************
Num iterations is: 3
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 4 is -2.115749394e+32
*****************************************
Num iterations is: 4
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 5 is -6.26165282107e+34
*****************************************
Num iterations is: 5
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 6 is -2.94347172215e+36
*****************************************
N

Likelihood for iteration 30 is 7.48911070562e+37
*****************************************
Num iterations is: 30
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 31 is 7.79236712539e+37
*****************************************
Num iterations is: 31
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 32 is 7.73366080579e+37
*****************************************
Num iterations is: 32
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 33 is 8.24826151525e+37
************************************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 16 is -3.64305123172e+35
*****************************************
Num iterations is: 16
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 17 is 6.94086807654e+35
*****************************************
Num iterations is: 17
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 18 is -2.24803441933e+36
*****************************************
Num iterations is: 18
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM ***

Likelihood for iteration 1 is 4.90965546958e+56
*****************************************
Num iterations is: 1
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 2 is -3.31477057035e+32
*****************************************
Num iterations is: 2
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 3 is -5.69307547842e+39
*****************************************
Num iterations is: 3
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 4 is 7.53702627413e+39
*****************************************

Likelihood for iteration 28 is 2.22534889327e+39
*****************************************
Num iterations is: 28
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 29 is 2.81677429353e+39
*****************************************
Num iterations is: 29
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 30 is 6.46762989507e+39
*****************************************
Num iterations is: 30
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 31 is 7.61799546264e+39
************************************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 14 is -6.81961045642e+44
*****************************************
Num iterations is: 14
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 15 is -7.84177221196e+44
*****************************************
Num iterations is: 15
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 16 is -7.5528918601e+44
*****************************************
Num iterations is: 16
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM ***

Likelihood for iteration 3 is -1.61637320707e+38
*****************************************
Num iterations is: 3
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 4 is 1.02237949294e+43
*****************************************
Num iterations is: 4
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 5 is -1.17224482581e+44
*****************************************
Num iterations is: 5
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 6 is 1.98482505242e+46
*****************************************

Likelihood for iteration 30 is 2.7155961432e+44
*****************************************
Num iterations is: 30
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 31 is 2.29494277315e+44
*****************************************
Num iterations is: 31
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 32 is 1.02285763079e+44
*****************************************
Num iterations is: 32
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 33 is -9.93474482074e+43
************************************

************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 16 is -6.49638145778e+47
*****************************************
Num iterations is: 16
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 17 is -5.43934260683e+47
*****************************************
Num iterations is: 17
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 18 is -5.45003075701e+47
*****************************************
Num iterations is: 18
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM **

Likelihood for iteration 1 is 2.67608693219e+68
*****************************************
Num iterations is: 1
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 2 is 8.46765428404e+40
*****************************************
Num iterations is: 2
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 3 is 9.7623896282e+42
*****************************************
Num iterations is: 3
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 4 is 8.41687093183e+44
*****************************************
Nu

Likelihood for iteration 28 is 9.70948481122e+49
*****************************************
Num iterations is: 28
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 29 is 9.6411931424e+49
*****************************************
Num iterations is: 29
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 30 is 8.06065892968e+49
*****************************************
Num iterations is: 30
************ Check likelihood of sequence in HMM  *********
************ Checking likelihood done  *********
************ Re-estimating HMM *********
************ Re-estimation done *********
Likelihood for iteration 31 is 7.03535227407e+49
*************************************

KeyboardInterrupt: 

In [30]:
print(len(HMM_K_ARRAYS))
print(len(X_i_K_ARRAYS))

19
19


In [31]:
dill.dump_session('../Weights/HMM_GaussianHMM_5points_log2_RNASeq.db')

# Calculate BIC for these assignments

In [None]:
print(BIC_array(HMM_array,X_i))

In [None]:
print(BIC_array(HMM_array,X_i))

# Noise Cluster

In [None]:
# Get list of lists to fit
Z, lengths = conversion_list_of_list(X,DIMENSION)

In [None]:
# Noise cluster 

model_noise = hmm.GMMHMM(n_components=3,n_mix=3,covariance_type='spherical')
model_noise.fit(Z, lengths)

In [None]:
HMM_model_stats(model_noise)

# Generate subsets of data for K HMMs

In [46]:
K = 3
HMM_array = []
X_i = []

for i in range(K):
    X_i.append([])

# Sequences for initial HMM estimation
for i in range(LENGTH):
    for j in range(K):
        if(i%K==j):
            X_i[j].append(list(X[i]))

In [47]:
for i in range(K):
    model = hmm.GaussianHMM(n_components=3, covariance_type='full')
    X_temp, lengths = conversion_list_of_list(X_i[i],DIMENSION)
    model.fit(X_temp, lengths)

## Statistics for K HMMs

In [48]:
HMM_model_stats(model)
print('\n\n')
model.predict([[7.56030946],[5.96591207],[6.40121497]],lengths=[3])

*************************************
Transition matrix
[[  9.99276869e-01   6.38482140e-04   8.46487741e-05]
 [  1.93144205e-04   9.99806856e-01   2.07287602e-13]
 [  1.68090394e-02   2.12582122e-11   9.83190961e-01]]
*************************************
Means and stds of each hidden state
Hidden state 0
mean =  [ 5.85591189]
std =  [array([[ 0.49587845]])]

Hidden state 1
mean =  [ 7.75447354]
std =  [array([[ 1.05975496]])]

Hidden state 2
mean =  [ 4.32879483]
std =  [array([[ 0.68723547]])]






array([1, 1, 1])

In [None]:
K = 3
HMM_array = []
X_i = []

for i in range(K):
    X_i.append([])

# Sequences for initial HMM estimation
for i in range(LENGTH):
    for j in range(K):
        if(i%K==j):
            X_i[j].append(list(X[i]))
for i in range(K):
    model = hmm.GaussianHMM(n_components=3, covariance_type='diag')
    X_temp, lengths = conversion_list_of_list(X_i[i],DIMENSION)
    model.fit(X_temp, lengths)
    HMM_array.append(model)
for i in range(K):
    print("Statistics for HMM Model", i)
    HMM_model_stats(HMM_array[i])
    print('\n\n')

In [None]:
NUM_ITERATIONS = 1
NUM_CLUSTER_PREV = {}
NUM_CLUSTER_NOW = {}

In [None]:
#initialize empty subsets of data
X_i = []

for i in range(K):
    X_i.append([])

In [None]:
likelihood_prev = likelihood_array(HMM_array,X_i)
print("Likelihood for iteration",0,"is",likelihood_prev)