# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from hmmlearn import hmm
import warnings
from constants import *
import math
import random
import matplotlib.pyplot as plt
import collections, operator
warnings.filterwarnings("ignore")

# Dummy HMM example

In [2]:
# #dummy HMM
# X1 = [ [3.09420655],  [2.55572822],  [2.58256628]]
# X2 = [ [3.79053016],  [2.04560095],  [2.78293239]]
# X = np.concatenate([X1, X2])
# print(X)
# lengths = [len(X1), len(X2)]
# model = hmm.GaussianHMM(n_components=3)
# model.fit(X, lengths)
# Z = model.predict(X)

# Helper Methods

In [6]:
def generate_random_sample(X, size):
    '''
    Given a list X, 
    generate random samples of given size
    '''
    Z_temp = random.sample(list(X), size)
    #Concatenation
    Z = [Z_temp[0]]
    for val in Z_temp[1:]:
        Z = np.concatenate([Z,[val]])
    
    return Z

def convert_values_to_list(list_val):
    '''
    Given a list X = [1 2 3] , 
    return X = [[1],[2],[3]]
    '''
    X = []
    for i in list_val:
        X.append([i])
    return X

def conversion_list_of_list(X):
    '''
    Given a list X with values in lists,
    X = [[ 3  2  2]]
    Convert each value to a list
    Return list of lists, array of lengths of each sequence
    X = [[3] [2] [2]]
    ''' 
    X_new = []
    length = len(X)
    for idx, val_list in enumerate(X):
        Y = []
        for val in val_list:
            Y.append([val])
        X_new.append(Y)
    
#     print("First 5 values of lists of lists:")
#     for idx, i in enumerate(X_new[:5]):
#         print("Index",idx,"is",i)

    #Concatenation
    Z = X_new[0]
    for val_list in X_new[1:]:
        Z = np.concatenate([Z,val_list])

    # assign array of lengths for HMM
    lengths = [DIMENSION]*length
#     print("*************************************")
#     print("Length of model to be fitted:",len(Z))
#     print("Total lengths", len(lengths)*DIMENSION)
    return Z,lengths

#Calculate likelihood for given sequence according to given HMMs and return HMM
def likelihood_sequence(sequence, HMM_array, model_noise):
    '''
    Given list of K HMMs and sequence,
    determines likelihood of sequence under all HMM models
    Returns index of HMM which has max likelihood
    ''' 
    score = model_noise.score(sequence)
    idx = -1
    for i,HMM in enumerate(HMM_array):
        calc_score = HMM.score(sequence)
        if(score < calc_score):
            idx = i
            score = calc_score
    return idx   

def HMM_model_stats(model):
    '''
    Details of HMM model
    ''' 
    print("*************************************")
    print("Transition matrix")
    print(model.transmat_)
    print("*************************************")
    print("Means and vars of each hidden state")
    for i in range(model.n_components):
        print("Hidden state {0}".format(i))
        print("mean = ", model.means_[i])
        print("std = ", [math.sqrt(model.covars_[i])])
        print()
    
#Check increasing sequence 
def monotonic_increase(x):
    dx = np.diff(x)
    return np.all(dx >= 0)

#Check decreasing sequence 
def monotonic_decrease(x):
    dx = np.diff(x)
    return np.all(dx <= 0)

#Check increasing then decreasing sequence 
def increase_decrease(x):
    return x[1]>=x[0] and x[2]<=x[1]

#Check decreasing then increasing sequence 
def decrease_increase(x):
    return x[1]<=x[0] and x[2]>=x[1]

# Load Data and Clean

In [7]:
#Dataset
DIMENSION = 3
df_main = pd.read_csv('Data/cleaned.txt', sep=",")
print("Dataset size is",len(df_main))
print(df_main.head(5))
X = np.log2(df_main.values)
print("****************************")
print("First 5 log2 values\n",X[:5])

Dataset size is 11087
     cdRPKM0    cdRPKM1    cdRPKM2
0   8.539825   5.879642   5.990043
1  13.837680   4.128452   6.882499
2  39.456786  62.505536  84.519655
3   2.104887   3.488833   2.908098
4  10.010800  17.066902   9.484017
****************************
First 5 log2 values
 [[ 3.09420655  2.55572822  2.58256628]
 [ 3.79053016  2.04560095  2.78293239]
 [ 5.30220154  5.96591207  6.40121497]
 [ 1.07374244  1.80274443  1.54007587]
 [ 3.32348535  4.09312932  3.24549826]]


In [8]:
# Get list of lists to fit
Z, lengths = conversion_list_of_list(X)

# Noise Cluster

In [9]:
# Noise cluster 

model_noise = hmm.GaussianHMM(n_components=3, covariance_type='spherical',n_iter=3)
model_noise.fit(Z, lengths)

GaussianHMM(algorithm='viterbi', covariance_type='spherical',
      covars_prior=0.01, covars_weight=1, init_params='stmc',
      means_prior=0, means_weight=0, min_covar=0.001, n_components=3,
      n_iter=3, params='stmc', random_state=None, startprob_prior=1.0,
      tol=0.01, transmat_prior=1.0, verbose=False)

In [10]:
HMM_model_stats(model_noise)

*************************************
Transition matrix
[[ 0.56184189  0.25425283  0.18390527]
 [ 0.21676412  0.76926958  0.0139663 ]
 [ 0.24299447  0.02136202  0.73564351]]
*************************************
Means and vars of each hidden state
Hidden state 0
mean =  [ 3.63251272]
std =  [1.1112236719384652]

Hidden state 1
mean =  [ 5.3243617]
std =  [1.446425736567844]

Hidden state 2
mean =  [ 1.74913881]
std =  [1.4693773305448643]



# Generate subsets of data for 4 HMMs

In [11]:
#Generate K HMMs

K = 4
HMM_array = []
X_i = []

# 0 = monotone increasing, 1 = monotone decreasing, 2 = increasing then decreasing, 3 = decreasing then increasing
for i in range(K):
    X_i.append([])

# Sequences
for i in range(len(df_main)):
    # Monotone increasing
    if(monotonic_increase(X[i])):
        X_i[0].append(list(X[i]))
    elif(monotonic_decrease(X[i])):
        X_i[1].append(list(X[i]))
    elif(increase_decrease(X[i])):
        X_i[2].append(list(X[i]))
    elif(decrease_increase(X[i])):
        X_i[3].append(list(X[i])) 
        
# Check sizes of each subX
sum=0
for i in X_i:
    print("Length is",len(i))
    sum+=len(i)
print("Total length is",sum)

Length is 2514
Length is 1725
Length is 3430
Length is 3418
Total length is 11087


# Train 4 HMMs

In [12]:
for i in range(K):
    model = hmm.GaussianHMM(n_components=3)
    X_temp, lengths = conversion_list_of_list(X_i[i])
    model.fit(X_temp, lengths)
    HMM_array.append(model)

In [13]:
for i in range(K):
    print("Statistics for HMM Model", i)
    HMM_model_stats(HMM_array[i])
    print('\n\n')

Statistics for HMM Model 0
*************************************
Transition matrix
[[  7.74186228e-01   5.11789403e-05   2.25762593e-01]
 [  6.55530255e-17   9.99999841e-01   1.59452657e-07]
 [  8.95119560e-08   8.37918781e-02   9.16208032e-01]]
*************************************
Means and vars of each hidden state
Hidden state 0
mean =  [ 1.60292297]
std =  [1.1095786521100743]

Hidden state 1
mean =  [ 5.58917279]
std =  [0.9841313359470235]

Hidden state 2
mean =  [ 3.64542082]
std =  [0.5962794502107487]




Statistics for HMM Model 1
*************************************
Transition matrix
[[  9.99999915e-01   8.52650245e-08   3.28019996e-17]
 [  5.37186489e-02   9.46280703e-01   6.48356877e-07]
 [  3.20083921e-10   1.79282679e-02   9.82071732e-01]]
*************************************
Means and vars of each hidden state
Hidden state 0
mean =  [ 1.27287753]
std =  [1.3054982389083356]

Hidden state 1
mean =  [ 3.88889313]
std =  [0.7414751285802]

Hidden state 2
mean =  [ 6.400

# Check likelihood and do assignments

In [14]:
NUM_CLUSTER = {}
for x in X:
    sequence = convert_values_to_list(x)
    hmm_index = likelihood_sequence(sequence, HMM_array, model)
    if(hmm_index not in NUM_CLUSTER):
        NUM_CLUSTER[hmm_index] = 1
    else:
        NUM_CLUSTER[hmm_index]+= 1

In [15]:
print(NUM_CLUSTER)

{-1: 4233, 0: 2665, 2: 3567, 1: 622}


# Calculate BIC for these assignments