In [1]:
import numpy as np
import pandas as pd
from pomegranate import *
import random
import os
import itertools
from functools import reduce

### Methods

In [2]:
def init(K,X,seed=None):
    """Build the starting point for EM: a random K-way partition with one tree per part.

    Every row of X is assigned to one of K clusters uniformly at random, a
    Chow-Liu Bayesian network is fitted to each cluster, and the
    Classification Maximum Likelihood (CML) of that partition is computed as
    sum_k [ sum(log P_k(x)) over rows in cluster k  +  n_k * log(alpha_k) ].

    K: Number of clusters
    X: Data; indexed row-wise as X[i], with the row count taken from X.shape[0]
    seed: Optional seed for the random partition. When None (default) the
          module-level `random` generator is used, exactly as before; when
          given, a private random.Random(seed) makes the partition repeatable.

    Return: x_k, models, alpha_k, indices_array, CML
        x_k: list of K lists holding the rows assigned to each cluster
        models: list of K fitted networks
        alpha_k: list of K mixing proportions (cluster size / number of rows)
        indices_array: list of K lists holding the row indices of each cluster
        CML: CML of the initial partition
    """
    LENGTH = X.shape[0]
    rng = random if seed is None else random.Random(seed)
    x_k = [[] for _ in range(K)] #initialize K empty data arrays
    indices_array = [[] for _ in range(K)]
    print("**************** K =", K ,"************************")

    # Sequences for initial CL Multinet Estimation
    # Make K subsets of data
    for i in range(LENGTH):
        cluster = rng.randint(0,K-1)
        x_k[cluster].append(X[i])
        indices_array[cluster].append(i)

    models = []
    alpha_k = []
    for i in range(K):
        print("Length of model",i+1,":",len(x_k[i]))
        alpha_k.append(len(x_k[i])/LENGTH)
        models.append(BayesianNetwork.from_samples(x_k[i],algorithm='chow-liu'))

    # The structures are printed one per line below; the alphas get their own
    # labelled line afterwards (previously they were also printed under this label).
    print("Initial Model Structures")
    for model in models:
        print(model.structure)

    print("Initial Alphas",alpha_k)

    CML = 0
    for x, model, alpha in zip(x_k, models, alpha_k):
        CML += sum(np.log(model.probability(x))) + len(x)*np.log(alpha)
    print("Initial CML",CML)

    return x_k,models,alpha_k,indices_array,CML
           
def e_step(K,X,x_k,models,alpha_k,indices_array):
    """Run the E and C steps of classification EM for one iteration.

    Each row of X is scored under every current model, weighted by that
    model's mixing proportion, and hard-assigned to the highest-scoring
    cluster (ties go to the lowest cluster index). The mixing proportions
    and the Chow-Liu networks are then re-estimated from the new partition.

    K: Number of clusters
    X: Data (iterated row by row)
    x_k: Previous partition; accepted for interface compatibility, not read
    models: models from the (m-1)th step
    alpha_k: alphas from the previous step
    indices_array: previous index partition; accepted for interface
                   compatibility, not read

    Return: x_k, models, alpha_k, indices_array (all rebuilt for this iteration)
    """
    # Derive the row count from the argument instead of relying on a
    # notebook-level LENGTH variable that may not exist (or may be stale).
    LENGTH = len(X)
    x_k = [[] for _ in range(K)] #initialize K empty data arrays for C step (assign)
    indices_array = [[] for _ in range(K)]

    #E Step: Calculate each point's posterior probability for K clusters (trees)
    for row_idx,x in enumerate(X):
        model_prob = []
        for model in models: # K trees
            try:
                model_prob.append(model.probability(x))
            except KeyError: #if a point doesn't exist in a tree, then the probability is zero
                model_prob.append(0)
        total = [p*a for p,a in zip(model_prob,alpha_k)]
        best = total.index(max(total)) #index of the max (unnormalised) posterior
        #C step: Assign the point to the tree that maximizes its posterior probability
        x_k[best].append(x)
        indices_array[best].append(row_idx)

    alpha_k = [len(x_k[i])/LENGTH for i in range(K)]
    models = [BayesianNetwork.from_samples(x_k[j],algorithm='chow-liu') for j in range(K)]

    return x_k,models, alpha_k,indices_array

def m_step(K,x_k, models, alpha_k,CML):
    """Evaluate the Classification Maximum Likelihood (CML) of the current partition.

    CML = sum over clusters k of [ sum(log P_k(x)) for x in cluster k
                                   + n_k * log(alpha_k) ]

    K: Number of clusters
    x_k: Data partition produced by the preceding E/C step
    models: models fitted to that partition
    alpha_k: mixing proportions of that partition
    CML: CML of the previous iteration; accepted for interface
         compatibility, it does not influence the result

    Return: CML, models (the models are passed through unchanged)
    """
    #M step: Calculate the CML criterion
    CML = 0
    for j in range(K):
        x = x_k[j]
        if len(x) == 0:
            # An empty cluster contributes nothing; evaluating it would give
            # 0*log(0) = nan and poison the whole criterion.
            continue
        CML += sum(np.log(models[j].probability(x))) + len(x)*np.log(alpha_k[j])
    return CML, models

# def s_step(K,X):

#     LENGTH, DIMENSION = X.shape
    
#     #S Step
#     x_k = [[] for i in range(K)] #initialize K empty data arrays
#     #S step: Assign data-points randomly
#     for j in range(LENGTH):
#         x_k[random.randint(0,K-1)].append(X[j])
#     alpha_k = [len(x_k[k])/LENGTH for k in range(K)]
#     models = []
#     for j in range(K):
#         model = BayesianNetwork.from_samples(x_k[j],algorithm='chow-liu') 
#         models.append(model)

#     #M step: Calculate the CML criterion and re-estimate parameters
#     init_CML = CML
#     CML=0
#     for j in range(K):
#         x = x_k[j]
#         model = models[j]
#         CML+=sum(np.log(model.probability(x)))+len(x)*np.log(alpha_k[j])
#     print("New CML is:", CML)   


def save(indices_array, path, data=None):
    """Write the gene names and accession numbers of every cluster to text files.

    For K = len(indices_array) clusters the files are written to
        <path><K>/Genes/Gene<i>.txt
        <path><K>/AccNums/AccNum<i>.txt
    with i counting from 1 and one value per line. The two sub-directories are
    created if they do not exist yet.

    indices_array: array of the indices (row positions in the original data)
                   for each cluster
    path: path prefix to save to, sample:
          '/Users/akankshitadash/Desktop/Bayesian Networks1/RPF_chrE/'.
          It is concatenated directly with K, so it should end with a path
          separator.
    data: Optional DataFrame with 'GeneName' and 'AccNum' columns. When None
          (default) the notebook-level DataFrame `df` is used, as before.

    Return: None
    """
    frame = df if data is None else data
    base = path+str(len(indices_array))
    genes_dir = base+'/Genes/'
    acc_dir = base+'/AccNums/'
    os.makedirs(genes_dir, exist_ok=True)
    os.makedirs(acc_dir, exist_ok=True)

    for idx,indices in enumerate(indices_array):
        # One positional lookup per cluster instead of one per row.
        rows = frame.iloc[list(indices)]
        genes = rows['GeneName'].tolist()
        acc_nums = rows['AccNum'].tolist()
        print(len(indices),len(genes),len(acc_nums))
        with open(genes_dir+'Gene'+str(idx+1)+'.txt','w') as f:
            f.writelines("%s\n" % gene for gene in genes)
        with open(acc_dir+'AccNum'+str(idx+1)+'.txt','w') as f:
            f.writelines("%s\n" % acc_num for acc_num in acc_nums)
    
def em(K,X,path,max_iter=100,tol=0.0):
    """Cluster X into K groups with classification EM and save the result.

    Starts from the random partition produced by init(), then alternates
    e_step() and m_step() until the CML stops changing or max_iter iterations
    have run, and finally writes the clusters with save().

    K: Number of clusters
    X: Discrete data
    path: path to save to
    max_iter: Maximum number of EM iterations (default 100)
    tol: Stop once the CML changes by at most this much between two
         iterations. The default 0.0 stops only when the CML is exactly
         unchanged.
    Return: None
    """
    x_k,models,alpha_k,indices_array,CML = init(K,X) #initialize K models
    prev_CML = CML

    for _ in range(max_iter):
        x_k, models, alpha_k,indices_array = e_step(K,X,x_k,models,alpha_k,indices_array)
        CML, models = m_step(K,x_k, models, alpha_k,CML)
        # The equality test is kept alongside the tolerance so that two equal
        # infinite values still count as converged (their difference is nan).
        if CML == prev_CML or abs(CML-prev_CML) <= tol:
            break
        prev_CML = CML
        print("CML is",CML)
    save(indices_array,path)

In [3]:
# df_rnaseq = pd.read_csv('AdjustedRPKMOutput/RNASeq_chrE.txt',sep='\t')
# df_rpf = pd.read_csv('AdjustedRPKMOutput/RPF_chrE.txt',sep='\t')
# df_rnaseq = df_rnaseq[(df_rnaseq['cdReads0'] >= 10) & (df_rnaseq['cdReads1'] >= 10) & (df_rnaseq['cdReads2'] >= 10)& (df_rnaseq['cdReads3'] >= 10)& (df_rnaseq['cdReads4'] >= 10)]
# df_rpf = df_rpf[(df_rpf['cdReads0'] >= 10) & (df_rpf['cdReads1'] >= 10) & (df_rpf['cdReads2'] >= 10)& (df_rpf['cdReads3'] >= 10)& (df_rpf['cdReads4'] >= 10)]
# df_TE = reduce(lambda left,right: pd.merge(left,right,on=['AccNum','GeneName']), [df_rpf,df_rnaseq])
# df_TE[['cdRPKM0_x']] = df_TE[['cdRPKM0_x']].div(df_TE['cdRPKM0_y'].values,axis=0)
# df_TE[['cdRPKM1_x']] = df_TE[['cdRPKM1_x']].div(df_TE['cdRPKM1_y'].values,axis=0)
# df_TE[['cdRPKM2_x']] = df_TE[['cdRPKM2_x']].div(df_TE['cdRPKM2_y'].values,axis=0)
# df_TE[['cdRPKM3_x']] = df_TE[['cdRPKM3_x']].div(df_TE['cdRPKM3_y'].values,axis=0)
# df_TE[['cdRPKM4_x']] = df_TE[['cdRPKM4_x']].div(df_TE['cdRPKM4_y'].values,axis=0)
# for i in range(0,5):
#     df_TE.rename(columns={'cdRPKM'+str(i)+'_x':'TE'+str(i)}, inplace=True)
# df_TE = df_TE[['AccNum','GeneName','TE0','TE1','TE2','TE3','TE4']]
# df_TE.to_csv('AdjustedRPKMOutput/TE_chrE.txt',sep='\t')

### Read Data

In [4]:
# Load the tab-separated TE_chrE_filtered table into the notebook-level
# DataFrame `df` (save() reads `df` when writing cluster members).
# NOTE(review): `path` is reused for the output directory in the
# "Perform EM" cell further down.
path = 'AdjustedRPKMOutput/TE_chrE_filtered.txt'
df = pd.read_csv(path,sep='\t')

In [5]:
# Preview the first five rows of the loaded table.
df.head(5)

Unnamed: 0,AccNum,GeneName,TE0,TE1,TE2,TE3,TE4,foldTE1,foldTE2,foldTE3,foldTE4
0,NM_017847,ODR4,1.318009,1.243997,1.416672,1.717463,1.757317,-0.074012,0.098663,0.399454,0.439308
1,NM_001003803,ATP5S,1.904575,1.918597,2.401017,2.690534,2.385348,0.014021,0.496441,0.785959,0.480773
2,NM_001003800,BICD2,-0.212253,-0.326054,-0.394399,-0.043,0.203561,-0.113801,-0.182145,0.169253,0.415814
3,NM_016647,THEM6,0.05771,0.211936,0.366747,0.71446,0.559378,0.154226,0.309037,0.656751,0.501668
4,NM_016640,MRPS30,0.562779,0.555941,0.268715,0.379276,0.831364,-0.006838,-0.294065,-0.183503,0.268585


In [6]:
# List the column labels of the loaded table.
df.keys()

Index(['AccNum', 'GeneName', 'TE0', 'TE1', 'TE2', 'TE3', 'TE4', 'foldTE1',
       'foldTE2', 'foldTE3', 'foldTE4'],
      dtype='object')

In [7]:
# Feature matrix: the four foldTE columns as a NumPy array, with every entry
# rounded to zero decimals.
X = df[['foldTE1',
       'foldTE2', 'foldTE3', 'foldTE4']].values.round()

In [8]:
# Show (number of rows, number of features).
print(X.shape)

(5642, 4)


In [9]:
# Notebook-level row and column counts of X.
# NOTE(review): check whether any function relies on these module-level names
# instead of deriving them from its own X argument.
LENGTH, DIMENSION = X.shape

In [10]:
# Smallest value in X (used below as the lower edge of the bins).
print(np.min(X))

-2.0


In [11]:
# Largest value in X (used below to set the upper edge of the bins).
print(np.max(X))

4.0


#### Digitize the data

In [12]:
# Replace every value of X by the index of the bin it falls into.
# `bin_size` is the number of bins; `step` is the resulting bin width.
# NOTE(review): X was already rounded to whole numbers when it was built, so
# only a few of the bin indices can occur - confirm the fine binning is intended.
# NOTE(review): this cell overwrites X, so running it a second time digitizes
# the bin indices again; re-run the cell that builds X first.
bin_size = 100 #state number of bins here, multiple of 5
step = (np.max(X)-np.min(X))/bin_size
# Bin edges from min(X) up to (but excluding) max(X)+0.1, spaced by `step`.
bins = np.arange(np.min(X),np.max(X)+0.1,step)
print(bins)
X = np.digitize(X,bins)

[-2.   -1.94 -1.88 -1.82 -1.76 -1.7  -1.64 -1.58 -1.52 -1.46 -1.4  -1.34
 -1.28 -1.22 -1.16 -1.1  -1.04 -0.98 -0.92 -0.86 -0.8  -0.74 -0.68 -0.62
 -0.56 -0.5  -0.44 -0.38 -0.32 -0.26 -0.2  -0.14 -0.08 -0.02  0.04  0.1
  0.16  0.22  0.28  0.34  0.4   0.46  0.52  0.58  0.64  0.7   0.76  0.82
  0.88  0.94  1.    1.06  1.12  1.18  1.24  1.3   1.36  1.42  1.48  1.54
  1.6   1.66  1.72  1.78  1.84  1.9   1.96  2.02  2.08  2.14  2.2   2.26
  2.32  2.38  2.44  2.5   2.56  2.62  2.68  2.74  2.8   2.86  2.92  2.98
  3.04  3.1   3.16  3.22  3.28  3.34  3.4   3.46  3.52  3.58  3.64  3.7
  3.76  3.82  3.88  3.94  4.    4.06]


In [13]:
# Inspect the first five digitized rows.
print(X[:5])

[[34 34 34 34]
 [34 34 50 34]
 [34 34 34 34]
 [34 34 50 50]
 [34 34 34 34]]


### Sample network

In [14]:
# Fit a single Chow-Liu network to the whole (digitized) data set as a sanity check.
model = BayesianNetwork.from_samples(X,algorithm='chow-liu')

In [15]:
# Show the learned structure of the single network.
model.structure

((), (0,), (1,), (2,))

### Perform EM

In [17]:
# Run EM for K = 4, 5 and 6 clusters; each run writes its clusters under
# <path><K>/.
# NOTE(review): this is a hard-coded absolute path on one machine, and it
# reuses the name `path` that earlier held the input file - consider a
# configurable, relative output directory.
path = '/Users/akankshitadash/Desktop/TE_chrE_filtered/'
for k in range(4,7):
    em(k,X,path)

**************** K = 4 ************************
Length of model 1 : 1417
Length of model 2 : 1430
Length of model 3 : 1404
Length of model 4 : 1391
Initial Model Structures [0.2511520737327189, 0.2534562211981567, 0.2488479262672811, 0.2465437788018433]
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
Initial Alphas [0.2511520737327189, 0.2534562211981567, 0.2488479262672811, 0.2465437788018433]
Initial CML -23468.70964036546
CML is -15735.81127315852
CML is -15668.846703084157
913 913 913
3474 3474 3474
286 286 286
969 969 969
**************** K = 5 ************************
Length of model 1 : 1099
Length of model 2 : 1139
Length of model 3 : 1148
Length of model 4 : 1137
Length of model 5 : 1119
Initial Model Structures [0.19478908188585609, 0.20187876639489544, 0.20347394540942929, 0.20152428216944346, 0.19833392414037576]
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
((), (0,), (1,), (2,))
Init