In [31]:
import numpy as np
import pandas as pd
from pomegranate import *
import random
import os
import itertools
from functools import reduce

### Methods

In [16]:
def init(K,X):
    """Randomly partition the data into K clusters and fit one Chow-Liu tree per cluster.

    K: Number of clusters
    X: Data, a 2-D array whose rows are the samples (X.shape must unpack to two values)

    Return: x_k, models, alpha_k, indices_array, CML
        x_k: list of K lists holding the rows of X assigned to each cluster
        models: list of K networks returned by BayesianNetwork.from_samples
        alpha_k: list of K mixing proportions (cluster size / number of rows)
        indices_array: list of K lists of row indices into X, parallel to x_k
        CML: classification log-likelihood of this initial partition
    """
    LENGTH, _ = X.shape
    x_k = [[] for _ in range(K)] # rows assigned to each cluster
    indices_array = [[] for _ in range(K)] # matching row indices into X
    print("**************** K =", K ,"************************")

    # Assign every row to a uniformly drawn cluster (uses the global `random` state).
    for i in range(LENGTH):
        cluster = random.randint(0,K-1)
        x_k[cluster].append(X[i])
        indices_array[cluster].append(i)

    models = []
    alpha_k = []
    for i in range(K):
        print("Length of model",i+1,":",len(x_k[i]))
        alpha_k.append(len(x_k[i])/LENGTH)
        # NOTE(review): a cluster left empty by the random draw is passed to
        # from_samples as an empty list -- confirm how pomegranate handles that.
        models.append(BayesianNetwork.from_samples(x_k[i],algorithm='chow-liu'))

    # The heading used to be printed together with alpha_k, which mislabelled the
    # alphas as structures; the alphas are reported once, under their own label.
    print("Initial Model Structures")
    for model in models:
        print(model.structure)

    print("Initial Alphas",alpha_k)

    # CML = sum over clusters of [ sum_i log P_k(x_i) + n_k * log(alpha_k) ]
    CML = 0
    for x, model, alpha in zip(x_k, models, alpha_k):
        CML+=sum(np.log(model.probability(x)))+len(x)*np.log(alpha)
    print("Initial CML",CML)

    return x_k,models,alpha_k,indices_array,CML
           
def e_step(K,X,x_k,models,alpha_k,indices_array):
    """This method performs the E and C steps of classification EM for the mth iteration,
    then refits one Chow-Liu tree per cluster.

    K: Number of clusters
    X: Data (iterated row by row; len(X) is the number of samples)
    x_k: Previously classified data in the (m-1)th step (not read; replaced by the new assignment)
    models: models from the (m-1)th step
    alpha_k: alphas from the previous step
    indices_array: previous per-cluster indices (not read; rebuilt from scratch)

    Return: x_k, models, alpha_k, indices_array for the mth step
    """
    # Use the size of the data actually passed in instead of a notebook-level
    # global, so the function works on any X and on a fresh kernel.
    n_samples = len(X)

    x_k_temp = [[] for _ in range(K)] # rows assigned to each cluster in this step
    indices_array = [[] for _ in range(K)] # matching row indices into X

    #E Step: Calculate each point's posterior probability for K clusters (trees)
    for row_idx,x in enumerate(X):
        model_prob = []
        for model in models: # K trees
            try:
                model_prob.append(model.probability(x))
            except KeyError: #if a point doesn't exist in a tree, then the probability is zero
                model_prob.append(0)
        # Unnormalised posterior: likelihood under tree k times its mixing weight.
        total = [a*b for a,b in zip(model_prob,alpha_k)]
        # list.index returns the first maximum, so ties go to the lowest cluster index.
        max_prob_idx = total.index(max(total))
        x_k_temp[max_prob_idx].append(x)
        indices_array[max_prob_idx].append(row_idx)

    #C step: Assign data-points to the trees that maximize their posterior probability
    x_k = x_k_temp
    alpha_k = [len(cluster)/n_samples for cluster in x_k]
    # NOTE(review): a cluster that received no points is passed to from_samples
    # as an empty list -- confirm how pomegranate handles that.
    models = [BayesianNetwork.from_samples(cluster,algorithm='chow-liu') for cluster in x_k]

    return x_k,models, alpha_k,indices_array

def m_step(K,x_k, models, alpha_k,CML):
    """This method evaluates the Classification Maximum Likelihood (CML) criterion
    for the current partition.

    K: Number of clusters
    x_k: data assigned to each cluster (list of K collections of rows)
    models: one fitted model per cluster; each must expose .probability(rows)
    alpha_k: mixing proportion of each cluster
    CML: previous CML value (accepted for interface compatibility; not used)

    Return: CML, models
        CML: sum over clusters of [ sum_i log P_k(x_i) + n_k * log(alpha_k) ]
        models: the same list that was passed in, unchanged
    """
    CML = 0
    for j in range(K):
        x = x_k[j]
        if len(x) == 0:
            # An empty cluster contributes nothing. Evaluating the formula would
            # give 0 * log(0) = nan and make the whole criterion nan.
            continue
        CML+=sum(np.log(models[j].probability(x)))+len(x)*np.log(alpha_k[j])
    return CML, models

# def s_step(K,X):

#     LENGTH, DIMENSION = X.shape
    
#     #S Step
#     x_k = [[] for i in range(K)] #initialize K empty data arrays
#     #S step: Assign data-points randomly
#     for j in range(LENGTH):
#         x_k[random.randint(0,K-1)].append(X[j])
#     alpha_k = [len(x_k[k])/LENGTH for k in range(K)]
#     models = []
#     for j in range(K):
#         model = BayesianNetwork.from_samples(x_k[j],algorithm='chow-liu') 
#         models.append(model)

#     #M step: Calculate the CML criterion and re-estimate parameters
#     init_CML = CML
#     CML=0
#     for j in range(K):
#         x = x_k[j]
#         model = models[j]
#         CML+=sum(np.log(model.probability(x)))+len(x)*np.log(alpha_k[j])
#     print("New CML is:", CML)   


def save(indices_array, path, data=None):
    """This method saves the gene names and accession numbers of each cluster to text files

    indices_array: array of the indices (of the original data) for each cluster
    path: path prefix to save to, sample: '/Users/akankshitadash/Desktop/Bayesian Networks1/RPF_chrE/'
        The number of clusters is appended directly to this prefix, so it should
        end with a path separator. The Genes/ and AccNums/ subdirectories are
        created if they do not exist yet.
    data: DataFrame with 'GeneName' and 'AccNum' columns, indexed positionally by
        the entries of indices_array. Defaults to the notebook-level `df`.

    Files written (one line per member of the cluster, i = 1..K):
        <path><K>/Genes/Gene<i>.txt
        <path><K>/AccNums/AccNum<i>.txt

    Return: None
    """
    if data is None:
        data = df # fall back to the frame loaded at notebook level

    out_dir = path+str(len(indices_array))
    genes_dir = out_dir+'/Genes/'
    acc_nums_dir = out_dir+'/AccNums/'
    # Create the output tree up front so open(..., 'w') cannot fail on a missing directory.
    os.makedirs(genes_dir, exist_ok=True)
    os.makedirs(acc_nums_dir, exist_ok=True)

    for idx,indices in enumerate(indices_array):
        rows = data.iloc[list(indices)]
        genes = rows['GeneName'].tolist()
        acc_nums = rows['AccNum'].tolist()
        print(len(indices),len(genes),len(acc_nums))
        with open(genes_dir+'Gene'+str(idx+1)+'.txt','w') as f:
            for gene in genes:
                f.write("%s\n" % gene)
        with open(acc_nums_dir+'AccNum'+str(idx+1)+'.txt','w') as f:
            for acc_num in acc_nums:
                f.write("%s\n" % acc_num)
    
def em(K,X,path,max_iter=100):
    """This method performs classification EM with K Chow-Liu trees and saves the clusters

    K: Number of clusters
    X: Discrete data
    path: path to save to (passed through to save())
    max_iter: upper bound on the number of EM iterations (default 100)
    Return: None
    """

    x_k,models,alpha_k,indices_array,CML = init(K,X) #initialize K models
    prev_CML = CML

    for _ in range(max_iter):
        x_k, models, alpha_k,indices_array = e_step(K,X,x_k,models,alpha_k,indices_array)
        CML, models = m_step(K,x_k, models, alpha_k,CML)
        # Stop once the criterion is exactly unchanged from the previous iteration.
        if CML == prev_CML:
            break
        prev_CML = CML
        print("CML is",CML)
    save(indices_array,path)

In [64]:
# df_rnaseq = pd.read_csv('AdjustedRPKMOutput/RNASeq_chrE.txt',sep='\t')
# df_rpf = pd.read_csv('AdjustedRPKMOutput/RPF_chrE.txt',sep='\t')
# df_rnaseq = df_rnaseq[(df_rnaseq['cdReads0'] >= 10) & (df_rnaseq['cdReads1'] >= 10) & (df_rnaseq['cdReads2'] >= 10)& (df_rnaseq['cdReads3'] >= 10)& (df_rnaseq['cdReads4'] >= 10)]
# df_rpf = df_rpf[(df_rpf['cdReads0'] >= 10) & (df_rpf['cdReads1'] >= 10) & (df_rpf['cdReads2'] >= 10)& (df_rpf['cdReads3'] >= 10)& (df_rpf['cdReads4'] >= 10)]
# df_TE = reduce(lambda left,right: pd.merge(left,right,on=['AccNum','GeneName']), [df_rpf,df_rnaseq])
# df_TE[['cdRPKM0_x']] = df_TE[['cdRPKM0_x']].div(df_TE['cdRPKM0_y'].values,axis=0)
# df_TE[['cdRPKM1_x']] = df_TE[['cdRPKM1_x']].div(df_TE['cdRPKM1_y'].values,axis=0)
# df_TE[['cdRPKM2_x']] = df_TE[['cdRPKM2_x']].div(df_TE['cdRPKM2_y'].values,axis=0)
# df_TE[['cdRPKM3_x']] = df_TE[['cdRPKM3_x']].div(df_TE['cdRPKM3_y'].values,axis=0)
# df_TE[['cdRPKM4_x']] = df_TE[['cdRPKM4_x']].div(df_TE['cdRPKM4_y'].values,axis=0)
# for i in range(0,5):
#     df_TE.rename(columns={'cdRPKM'+str(i)+'_x':'TE'+str(i)}, inplace=True)
# df_TE = df_TE[['AccNum','GeneName','TE0','TE1','TE2','TE3','TE4']]
# df_TE.to_csv('AdjustedRPKMOutput/TE_chrE.txt',sep='\t')

### Read Data

In [82]:
# Load the tab-separated TE table. `df` is also read as a global by save().
# Note: the name `path` is reassigned to the output directory in the "Perform EM" cell.
path = 'AdjustedRPKMOutput/TE_chrE.txt'
df = pd.read_csv(path,sep='\t')

In [83]:
# Preview the first five rows of the loaded table.
df.head(5)

Unnamed: 0.1,Unnamed: 0,AccNum,GeneName,TE0,TE1,TE2,TE3,TE4
0,0,NM_017847,ODR4,2.493218,2.368538,2.66969,3.288575,3.380689
1,1,NM_001003803,ATP5S,3.743987,3.780551,5.281752,6.455525,5.224701
2,2,NM_001003800,BICD2,0.863188,0.797715,0.760806,0.970635,1.151537
3,3,NM_016647,THEM6,1.040812,1.158241,1.289442,1.640869,1.473634
4,4,NM_016640,MRPS30,1.477112,1.470127,1.204734,1.300689,1.779367


In [84]:
# List the column labels of df.
df.keys()

Index(['Unnamed: 0', 'AccNum', 'GeneName', 'TE0', 'TE1', 'TE2', 'TE3', 'TE4'], dtype='object')

In [85]:
# Take the five TE columns as a NumPy array and round every value to zero decimals.
X = df[['TE0', 'TE1', 'TE2', 'TE3', 'TE4']].values.round()

In [86]:
# Show (number of rows, number of columns) of X.
print(X.shape)

(8989, 5)


In [87]:
# Row and column counts of X, kept as notebook-level names.
LENGTH, DIMENSION = X.shape

In [88]:
# Smallest value anywhere in X (lower end of the binning range below).
print(np.min(X))

0.0


In [89]:
# Largest value anywhere in X (upper end of the binning range below).
print(np.max(X))

130.0


#### Digitize the data

In [90]:
# Discretize X into equal-width bins spanning [min(X), max(X)].
bin_size = 100 # number of bins (original note: state number of bins here, multiple of 5)
step = (np.max(X)-np.min(X))/bin_size # width of one bin
# np.arange excludes its stop value, so 0.1 is added to keep max(X) as the last edge.
bins = np.arange(np.min(X),np.max(X)+0.1,step)
print(bins)
# Replace each value by the index of the bin it falls in. X is overwritten here,
# so running this cell a second time would re-bin the already binned data.
X = np.digitize(X,bins)

[  0.    1.3   2.6   3.9   5.2   6.5   7.8   9.1  10.4  11.7  13.   14.3
  15.6  16.9  18.2  19.5  20.8  22.1  23.4  24.7  26.   27.3  28.6  29.9
  31.2  32.5  33.8  35.1  36.4  37.7  39.   40.3  41.6  42.9  44.2  45.5
  46.8  48.1  49.4  50.7  52.   53.3  54.6  55.9  57.2  58.5  59.8  61.1
  62.4  63.7  65.   66.3  67.6  68.9  70.2  71.5  72.8  74.1  75.4  76.7
  78.   79.3  80.6  81.9  83.2  84.5  85.8  87.1  88.4  89.7  91.   92.3
  93.6  94.9  96.2  97.5  98.8 100.1 101.4 102.7 104.  105.3 106.6 107.9
 109.2 110.5 111.8 113.1 114.4 115.7 117.  118.3 119.6 120.9 122.2 123.5
 124.8 126.1 127.4 128.7 130. ]


In [91]:
# First five rows of the digitized data (bin indices).
print(X[:5])

[[2 2 3 3 3]
 [4 4 4 5 4]
 [1 1 1 1 1]
 [1 1 1 2 1]
 [1 1 1 1 2]]


### Sample network

In [92]:
# Fit a single network to the whole digitized data set with the 'chow-liu' algorithm.
model = BayesianNetwork.from_samples(X,algorithm='chow-liu')

In [93]:
# Display the structure of the network fitted above.
model.structure

((), (0,), (1,), (2,), (3,))

### Perform EM

In [94]:
# Run EM for K = 4, 5 and 6 clusters; em() saves each clustering under `path`.
# NOTE(review): this is an absolute, machine-specific output location, and it reuses
# the name `path` that held the input file earlier -- consider a configurable
# output directory under a distinct name.
path = '/Users/akankshitadash/Desktop/Bayesian Networks/TE_chrE/'
for k in range(4,7):
    em(k,X,path)

**************** K = 4 ************************
Length of model 1 : 2263
Length of model 2 : 2184
Length of model 3 : 2242
Length of model 4 : 2300
Initial Model Structures [0.25175214150628544, 0.24296362220491713, 0.24941595283123819, 0.25586828345755924]
((), (0,), (3,), (4,), (0,))
((), (0,), (1,), (2,), (3,))
((), (0,), (1,), (2,), (3,))
((), (0,), (3,), (4,), (0,))
Initial Alphas [0.25175214150628544, 0.24296362220491713, 0.24941595283123819, 0.25586828345755924]
Initial CML -51789.74909720743
CML is -40753.85083045007
CML is -40139.77588574818
CML is -39912.50266608162
CML is -39841.51866478809
CML is -39837.07330030018
CML is -39830.29867896126
CML is -39822.69611387217
CML is -39821.46375103855
CML is -39799.085130095584
CML is -39782.56032112075
CML is -39777.99799448479
3657 3657 3657
1202 1202 1202
2680 2680 2680
1450 1450 1450
**************** K = 5 ************************
Length of model 1 : 1869
Length of model 2 : 1797
Length of model 3 : 1790
Length of model 4 : 1782
