In [2]:
import math
import os
from functools import reduce

import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans,AffinityPropagation
from sklearn.mixture import GaussianMixture as GMM
from sklearn.model_selection import StratifiedKFold

In [3]:
def gmm(X, k, random_state=None):
    """Fit a k-component Gaussian mixture to X and group its rows by component.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Rows to cluster.
    k : int
        Number of mixture components.
    random_state : int or None, optional
        Forwarded to the mixture model; pass an int for a reproducible fit.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    list of list
        k lists; list j holds, in input order, the rows of X assigned to
        component j. Rows are appended as-is (no conversion).
    """
    model = GMM(n_components=k, covariance_type='full',
                random_state=random_state)
    model.fit(X)
    # Label every row with a single predict call instead of one call per row.
    labels = model.predict(X)
    X_predicted = [[] for _ in range(k)]
    for x, label in zip(X, labels):
        X_predicted[label].append(x)
    return X_predicted

In [4]:
def kmeans(X, k, random_state=None):
    """Run k-means++ on X and group its rows by assigned cluster.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Rows to cluster; each row must support ``.tolist()``.
    k : int
        Number of clusters.
    random_state : int or None, optional
        Forwarded to KMeans; pass an int for a reproducible clustering.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    list of list of list
        k lists; list j holds, in input order, the rows of X (converted to
        plain Python lists) assigned to cluster j.
    """
    model = KMeans(n_clusters=k, init='k-means++', random_state=random_state)
    model.fit(X)
    # Label every row with a single predict call instead of one call per row.
    labels = model.predict(X)
    X_predicted = [[] for _ in range(k)]
    for x, label in zip(X, labels):
        X_predicted[label].append(x.tolist())
    return X_predicted

In [10]:
# Relative paths of the two tab-separated input tables read by the next cell.
RNASeq_path = 'Replicate2/RNASeq_chrE_mockT0added.txt' 
RPF_path = 'Replicate2/RPF_chrE_mockT0added.txt'

In [11]:
# Read both tab-separated tables (RNASeq first, then RPF).
RNASeq, RPF = (
    pd.read_csv(table_path, sep='\t')
    for table_path in (RNASeq_path, RPF_path)
)

In [12]:
# Preview the first rows of the RNASeq table.
RNASeq.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4,cdReads5,cdRPKM5
0,NM_017847,ODR4,65.0,6.602898,38.0,4.059277,70.0,6.54327,63.0,5.507023,38.0,2.443294,48.0,2.633165
1,NM_001003803,ATP5S,41.0,8.773294,38.0,8.550792,51.0,10.042101,41.0,7.549494,33.0,4.469549,21.0,2.426687
2,NM_001003800,BICD2,1082.0,58.423302,944.0,53.601229,1126.0,55.946484,1067.0,49.576787,711.0,24.299615,752.0,21.92766
3,NM_016649,ESF1,45.0,2.441212,19.0,1.083903,53.0,2.645723,33.0,1.540501,29.0,0.995777,27.0,0.790993
4,NM_016647,THEM6,200.0,44.229936,174.0,40.464994,161.0,32.763309,124.0,23.597344,65.0,9.098517,69.0,8.240452


In [13]:
# Preview the first rows of the RPF table.
RPF.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4,cdReads5,cdRPKM5
0,NM_017847,ODR4,194.0,24.619391,178.0,22.843822,189.0,22.942286,83.0,16.106297,44.0,5.221536,55.0,10.502382
1,NM_001003803,ATP5S,112.0,29.939962,108.0,29.196458,121.0,30.939829,51.0,20.847089,28.0,6.999408,23.0,9.251467
2,NM_001003800,BICD2,525.0,35.413798,516.0,35.199468,576.0,37.165074,300.0,30.944003,274.0,17.283584,278.0,28.216787
3,NM_016649,ESF1,259.0,17.55283,261.0,17.887971,194.0,12.576171,88.0,9.119522,83.0,5.260118,66.0,6.7304
4,NM_016647,THEM6,110.0,30.390187,109.0,30.453722,95.0,25.105195,74.0,31.261833,44.0,11.367459,28.0,11.639874


### Keep only genes with cdReads >= 10 in every cdReads column

In [14]:
# Keep only rows whose six cdReads columns (cdReads0..cdReads5) are all >= 10,
# separately for each table.
read_cols = ['cdReads%d' % t for t in range(6)]
# .copy() makes each filtered table an independent frame, so the column
# assignments in later cells do not write into a slice of the loaded table
# (which triggers pandas' SettingWithCopyWarning).
RNASeq = RNASeq[(RNASeq[read_cols] >= 10).all(axis=1)].copy()
RPF = RPF[(RPF[read_cols] >= 10).all(axis=1)].copy()

### Apply log2

In [15]:
# Overwrite the six cdRPKM columns of RNASeq with their base-2 logarithm.
# The columns are replaced in place, so re-running this cell applies log2 again.
rnaseq_rpkm_cols = ['cdRPKM%d' % t for t in range(6)]
RNASeq[rnaseq_rpkm_cols] = RNASeq[rnaseq_rpkm_cols].apply(np.log2)

In [16]:
# Overwrite the six cdRPKM columns of RPF with their base-2 logarithm.
# The columns are replaced in place, so re-running this cell applies log2 again.
rpf_rpkm_cols = ['cdRPKM%d' % t for t in range(6)]
RPF[rpf_rpkm_cols] = RPF[rpf_rpkm_cols].apply(np.log2)

In [17]:
# Add log2cdRPKM1..5: each cdRPKM column minus cdRPKM0.
for t in range(1, 6):
    RNASeq['log2cdRPKM%d' % t] = RNASeq['cdRPKM%d' % t] - RNASeq['cdRPKM0']

In [18]:
# Add log2cdRPKM1..5: each cdRPKM column minus cdRPKM0.
for t in range(1, 6):
    RPF['log2cdRPKM%d' % t] = RPF['cdRPKM%d' % t] - RPF['cdRPKM0']

In [19]:
# Preview RNASeq, which by now carries the log2cdRPKM1..5 columns.
RNASeq.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4,cdReads5,cdRPKM5,log2cdRPKM1,log2cdRPKM2,log2cdRPKM3,log2cdRPKM4,log2cdRPKM5
0,NM_017847,ODR4,65.0,2.723099,38.0,2.021223,70.0,2.710012,63.0,2.461273,38.0,1.288828,48.0,1.396798,-0.701876,-0.013088,-0.261827,-1.434272,-1.326301
1,NM_001003803,ATP5S,41.0,3.133119,38.0,3.096058,51.0,3.327989,41.0,2.91638,33.0,2.160129,21.0,1.278988,-0.037061,0.194871,-0.216739,-0.972989,-1.854131
2,NM_001003800,BICD2,1082.0,5.868472,944.0,5.744194,1126.0,5.805976,1067.0,5.631593,711.0,4.602862,752.0,4.45468,-0.124278,-0.062496,-0.236879,-1.26561,-1.413792
3,NM_016649,ESF1,45.0,1.287598,19.0,0.116236,53.0,1.403662,33.0,0.6234,29.0,-0.006106,27.0,-0.338264,-1.171362,0.116065,-0.664198,-1.293703,-1.625862
4,NM_016647,THEM6,200.0,5.466951,174.0,5.338602,161.0,5.034009,124.0,4.560553,65.0,3.185631,69.0,3.042723,-0.128349,-0.432942,-0.906399,-2.28132,-2.424228


In [20]:
# Preview RPF, which by now carries the log2cdRPKM1..5 columns.
RPF.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4,cdReads5,cdRPKM5,log2cdRPKM1,log2cdRPKM2,log2cdRPKM3,log2cdRPKM4,log2cdRPKM5
0,NM_017847,ODR4,194.0,4.621723,178.0,4.513732,189.0,4.519937,83.0,4.009553,44.0,2.384474,55.0,3.392645,-0.107991,-0.101786,-0.61217,-2.237249,-1.229078
1,NM_001003803,ATP5S,112.0,4.904,108.0,4.867721,121.0,4.951393,51.0,4.381774,28.0,2.807233,23.0,3.209682,-0.036279,0.047393,-0.522226,-2.096768,-1.694318
2,NM_001003800,BICD2,525.0,5.14624,516.0,5.137482,576.0,5.215876,300.0,4.951588,274.0,4.11133,278.0,4.818482,-0.008758,0.069636,-0.194652,-1.034909,-0.327758
3,NM_016649,ESF1,259.0,4.133632,261.0,4.160918,194.0,3.652621,88.0,3.188958,83.0,2.395095,66.0,2.750692,0.027286,-0.481011,-0.944673,-1.738537,-1.38294
4,NM_016647,THEM6,110.0,4.925534,109.0,4.928547,95.0,4.649914,74.0,4.96633,44.0,3.506838,28.0,3.541004,0.003013,-0.27562,0.040797,-1.418696,-1.38453


# TE

### Median normalization (run the next cell only when median-normalized fold changes are wanted)

In [21]:
# Centre each RNASeq fold-change column on zero by subtracting its own median.
for t in range(1, 6):
    fold_col = 'log2cdRPKM%d' % t
    RNASeq[fold_col] = RNASeq[fold_col] - np.median(RNASeq[fold_col])

In [22]:
# Keep rows whose five fold-change columns all lie within [-0.5, 0.5].
stable_fold_cols = ['log2cdRPKM%d' % t for t in range(1, 6)]
within_band = RNASeq[stable_fold_cols].abs().le(0.5).all(axis=1)
filteredRNASeq = RNASeq[within_band]

In [23]:
# (rows, columns) of the filtered RNASeq table.
filteredRNASeq.shape

(6090, 19)

In [24]:
# Preview the filtered RNASeq table.
filteredRNASeq.head()

Unnamed: 0,AccNum,GeneName,cdReads0,cdRPKM0,cdReads1,cdRPKM1,cdReads2,cdRPKM2,cdReads3,cdRPKM3,cdReads4,cdRPKM4,cdReads5,cdRPKM5,log2cdRPKM1,log2cdRPKM2,log2cdRPKM3,log2cdRPKM4,log2cdRPKM5
1,NM_001003803,ATP5S,41.0,3.133119,38.0,3.096058,51.0,3.327989,41.0,2.91638,33.0,2.160129,21.0,1.278988,0.010445,0.274601,0.168953,0.443472,-0.021818
2,NM_001003800,BICD2,1082.0,5.868472,944.0,5.744194,1126.0,5.805976,1067.0,5.631593,711.0,4.602862,752.0,4.45468,-0.076772,0.017234,0.148813,0.150851,0.418521
6,NM_016640,MRPS30,378.0,5.311337,315.0,5.120866,312.0,4.914494,296.0,4.741809,216.0,3.844151,202.0,3.51841,-0.142965,-0.317113,-0.183836,-0.050725,0.039386
9,NM_001080825,TMEM120B,48.0,2.706026,36.0,2.363552,42.0,2.393378,46.0,2.427886,23.0,0.984794,20.0,0.554095,-0.294968,-0.232917,0.107552,-0.30477,-0.319618
10,NM_001008781,FAT3,2287.0,4.53551,2135.0,4.508853,2538.0,4.565743,1734.0,3.919418,1261.0,3.01679,1123.0,2.620515,0.020849,0.109963,-0.230399,-0.102258,-0.082682


In [25]:
# Inner-join RPF (left, suffix _x) with filteredRNASeq (right, suffix _y)
# on the two gene identifier columns.
TE = RPF.merge(filteredRNASeq, on=['AccNum', 'GeneName'])

In [26]:
# (rows, columns) of the merged table.
TE.shape

(5801, 36)

In [27]:
# Preview the merged table (columns suffixed _x and _y by the merge).
TE.head()

Unnamed: 0,AccNum,GeneName,cdReads0_x,cdRPKM0_x,cdReads1_x,cdRPKM1_x,cdReads2_x,cdRPKM2_x,cdReads3_x,cdRPKM3_x,...,cdRPKM3_y,cdReads4_y,cdRPKM4_y,cdReads5_y,cdRPKM5_y,log2cdRPKM1_y,log2cdRPKM2_y,log2cdRPKM3_y,log2cdRPKM4_y,log2cdRPKM5_y
0,NM_001003803,ATP5S,112.0,4.904,108.0,4.867721,121.0,4.951393,51.0,4.381774,...,2.91638,33.0,2.160129,21.0,1.278988,0.010445,0.274601,0.168953,0.443472,-0.021818
1,NM_001003800,BICD2,525.0,5.14624,516.0,5.137482,576.0,5.215876,300.0,4.951588,...,5.631593,711.0,4.602862,752.0,4.45468,-0.076772,0.017234,0.148813,0.150851,0.418521
2,NM_016640,MRPS30,400.0,5.71403,413.0,5.77636,368.0,5.52962,158.0,4.986657,...,4.741809,216.0,3.844151,202.0,3.51841,-0.142965,-0.317113,-0.183836,-0.050725,0.039386
3,NM_001080825,TMEM120B,63.0,3.419422,56.0,3.265685,75.0,3.606845,23.0,2.578407,...,2.427886,23.0,0.984794,20.0,0.554095,-0.294968,-0.232917,0.107552,-0.30477,-0.319618
4,NM_001008781,FAT3,1009.0,3.676058,1008.0,3.690816,973.0,3.559528,441.0,3.094686,...,3.919418,1261.0,3.01679,1123.0,2.620515,0.020849,0.109963,-0.230399,-0.102258,-0.082682


In [28]:
# TE<t> = cdRPKM<t>_x - cdRPKM<t>_y for t = 0..5, i.e. the left-hand (_x)
# value minus the right-hand (_y) value of the merged table.
# NOTE(review): presumably "TE" stands for translation efficiency - confirm.
te_by_timepoint = {
    'TE%d' % t: TE['cdRPKM%d_x' % t] - TE['cdRPKM%d_y' % t]
    for t in range(6)
}
# Build a fresh frame (identifiers + TE0..TE5) instead of mutating and
# renaming the merged one and then slicing it: .assign returns a new object,
# so adding columns to TE in later cells does not raise SettingWithCopyWarning.
# TE is rebound here, so the merge cell must be re-run before re-running this one.
TE = TE[['AccNum', 'GeneName']].assign(**te_by_timepoint)

In [29]:
# Add foldTE1..5: each TE column minus TE0.
for t in range(1, 6):
    TE['foldTE%d' % t] = TE['TE%d' % t] - TE['TE0']

In [30]:
# Preview TE with the foldTE columns added.
TE.head()

Unnamed: 0,AccNum,GeneName,TE0,TE1,TE2,TE3,TE4,TE5,foldTE1,foldTE2,foldTE3,foldTE4,foldTE5
0,NM_001003803,ATP5S,1.770882,1.771663,1.623404,1.465394,0.647103,1.930694,0.000782,-0.147478,-0.305488,-1.123778,0.159812
1,NM_001003800,BICD2,-0.722232,-0.606712,-0.5901,-0.680005,-0.491531,0.363802,0.11552,0.132132,0.042227,0.230701,1.086034
2,NM_016640,MRPS30,0.402693,0.655493,0.615126,0.244848,0.162678,0.974128,0.252801,0.212433,-0.157844,-0.240015,0.571436
3,NM_001080825,TMEM120B,0.713396,0.902133,1.213467,0.150521,0.884143,2.177961,0.188737,0.500071,-0.562875,0.170746,1.464565
4,NM_001008781,FAT3,-0.859451,-0.818037,-1.006214,-0.824733,-1.08978,-0.158767,0.041414,-0.146763,0.034719,-0.230329,0.700684


In [31]:
# Column labels of TE.
TE.keys()

Index(['AccNum', 'GeneName', 'TE0', 'TE1', 'TE2', 'TE3', 'TE4', 'TE5',
       'foldTE1', 'foldTE2', 'foldTE3', 'foldTE4', 'foldTE5'],
      dtype='object')

In [36]:
# (rows, columns) of TE.
TE.shape

(5801, 13)

In [37]:
# Write TE as a tab-separated file without the index column.
TE.to_csv('Replicate2/TE_chrE_filtered_mockT0added.txt',sep='\t',index=False)

In [None]:
# Preview TE again.
TE.head()

In [32]:
# Feature matrix for clustering: the five foldTE columns as a NumPy array,
# one row per row of TE.
fold_te_cols = ['foldTE%d' % t for t in range(1, 6)]
X = TE[fold_te_cols].values

In [33]:
# First five rows of the feature matrix.
X[:5]

array([[ 7.81569213e-04, -1.47477741e-01, -3.05487608e-01,
        -1.12377837e+00,  1.59812365e-01],
       [ 1.15519875e-01,  1.32132342e-01,  4.22273735e-02,
         2.30701248e-01,  1.08603414e+00],
       [ 2.52800685e-01,  2.12433253e-01, -1.57844410e-01,
        -2.40014523e-01,  5.71435515e-01],
       [ 1.88736996e-01,  5.00071126e-01, -5.62875450e-01,
         1.70746320e-01,  1.46456495e+00],
       [ 4.14142582e-02, -1.46762887e-01,  3.47188262e-02,
        -2.30329139e-01,  7.00684411e-01]])

In [35]:
# def plot_heatmap(X,idx):
#     plt.subplot(3, 2, idx+1)
# #     plt.figure()
#     sns.heatmap(X,xticklabels=['foldTE1',
#        'foldTE2', 'foldTE3', 'foldTE4'])
#     plt.title('Heatmap'+str(idx+1))

# list_nums = [5]   
# for num in list_nums:
#     clusters = kmeans(X,num)
#     idx=0
#     for cluster in clusters:
#         print(len(cluster))
#         plot_heatmap(cluster)
#         idx+=1
        
# Cluster the rows of X with k-means for k = 4, 5, 6 and, for every cluster,
# write the member gene names and accession numbers to
# <TE_CLUSTER_ROOT>/<k>/Gene<j>.txt and AccNum<j>.txt (j is 1-based).
# NOTE(review): absolute, user-specific path kept from the original cell;
# point it at a project-relative directory before sharing the notebook.
TE_CLUSTER_ROOT = '/Users/akankshitadash/Desktop/Replicate2/MockT0Added/'
# Fixed seed so that re-running the cell reproduces the same clusters.
TE_CLUSTER_SEED = 0

# Cluster membership is read back through row position, so X must have
# exactly one row per row of TE, in the same order.
assert len(X) == len(TE), "X must be built from the current TE table"

list_nums = [4,5,6]
for num in list_nums:
    model = KMeans(n_clusters=num, init='k-means++',
                   random_state=TE_CLUSTER_SEED)
    model.fit(X)
    labels = model.predict(X)

    out_dir = os.path.join(TE_CLUSTER_ROOT, str(num))
    os.makedirs(out_dir, exist_ok=True)

    for idx in range(num):
        # Select members with the label mask instead of matching float
        # feature values against TE row by row: exact, and linear time.
        members = TE[labels == idx]
        genes = members['GeneName'].tolist()
        acc_nums = members['AccNum'].tolist()
        print(len(members))
        # Second line kept so the printed output format is unchanged.
        print(len(members),len(genes))
        with open(os.path.join(out_dir, 'Gene'+str(idx+1)+'.txt'),'w') as f:
            for gene in genes:
                f.write("%s\n" % gene)
        with open(os.path.join(out_dir, 'AccNum'+str(idx+1)+'.txt'),'w') as f:
            for acc_num in acc_nums:
                f.write("%s\n" % acc_num)


1600
1600 1600
1446
1446 1446
2343
2343 2343
412
412 412
283
283 283
1904
1904 1904
1640
1640 1640
1021
1021 1021
953
953 953
869
869 869
933
933 933
1772
1772 1772
1051
1051 1051
899
899 899
277
277 277


In [None]:
# def plot_heatmap(X):
#     plt.figure()
#     sns.heatmap(X,xticklabels=['TE0', 'TE1', 'TE2', 'TE3', 'TE4', 'foldTE1',
#        'foldTE2', 'foldTE3', 'foldTE4'])
#     plt.title('Heatmap')
#     plt.savefig('NonChrE/KMeansOutputTE/Heatmap'+str(len(X)))
#     plt.show()

# list_nums = [5]   
# for num in list_nums:
#     clusters = kmeans(X,num)
#     idx=0
#     for cluster in clusters:
#         print(len(cluster))
#         plot_heatmap(cluster)
#         idx+=1

# RPF

In [None]:
# Column labels of RPF.
RPF.keys()

In [None]:
# Preview RPF.
RPF.head()

In [None]:
# Inner-join RPF (left, suffix _x) with filteredRNASeq (right, suffix _y)
# on the two gene identifier columns.
RPF_new = RPF.merge(filteredRNASeq, on=['AccNum', 'GeneName'])

In [None]:
# Keep the identifiers plus the left-hand (_x, i.e. RPF) cdRPKM0..4 columns,
# with the merge suffix stripped from their names.
# NOTE(review): only cdRPKM0..cdRPKM4 are kept although the tables loaded in
# this notebook also have cdRPKM5 - confirm that dropping it is intended.
rpf_new_rpkm_cols = ['cdRPKM%d' % t for t in range(5)]
RPF_new = RPF_new.rename(
    columns={col + '_x': col for col in rpf_new_rpkm_cols}
)
# .copy() yields an independent frame rather than a slice of the merged one.
RPF_new = RPF_new[['AccNum', 'GeneName'] + rpf_new_rpkm_cols].copy()

In [None]:
# Write RPF_new as a tab-separated file without the index column.
# NOTE(review): absolute, user-specific output path - will not exist on other machines.
RPF_new.to_csv('/Users/akankshitadash/Desktop/RPF_NonchrE.txt',sep='\t',index=False)

In [None]:
# (rows, columns) of RPF_new.
RPF_new.shape

In [None]:
# Feature matrix for clustering: cdRPKM0..4 of RPF_new as a NumPy array.
# This rebinds X, replacing any feature matrix built earlier.
rpf_feature_cols = ['cdRPKM%d' % t for t in range(5)]
X = RPF_new[rpf_feature_cols].values

In [None]:
# Cluster the rows of X with k-means for k = 4, 5, 6 and, for every cluster,
# write the member gene names and accession numbers to
# NonChrE/KMeansOutputRPF/<k>/Gene<j>.txt and AccNum<j>.txt (j is 1-based).
RPF_CLUSTER_ROOT = 'NonChrE/KMeansOutputRPF/'
# Fixed seed so that re-running the cell reproduces the same clusters.
RPF_CLUSTER_SEED = 0

# Cluster membership is read back through row position, so X must have
# exactly one row per row of RPF_new, in the same order.
assert len(X) == len(RPF_new), "X must be built from the current RPF_new table"

list_nums = [4,5,6]
for num in list_nums:
    model = KMeans(n_clusters=num, init='k-means++',
                   random_state=RPF_CLUSTER_SEED)
    model.fit(X)
    labels = model.predict(X)

    for idx in range(num):
        print(np.count_nonzero(labels == idx))

    out_dir = os.path.join(RPF_CLUSTER_ROOT, str(num))
    os.makedirs(out_dir, exist_ok=True)

    for idx in range(num):
        # Select members with the label mask. The previous per-row lookup
        # called .tolist() on a plain float (AttributeError) and matched
        # rows by float equality on three columns only.
        members = RPF_new[labels == idx]
        genes = members['GeneName'].tolist()
        acc_nums = members['AccNum'].tolist()
        print(len(members),len(genes))
        with open(os.path.join(out_dir, 'Gene'+str(idx+1)+'.txt'),'w') as f:
            for gene in genes:
                f.write("%s\n" % gene)
        with open(os.path.join(out_dir, 'AccNum'+str(idx+1)+'.txt'),'w') as f:
            for acc_num in acc_nums:
                f.write("%s\n" % acc_num)

In [None]:
import random
def plot_heatmap(X):
    """Draw X as a seaborn heatmap on a new figure, save it and show it.

    X is passed straight to ``sns.heatmap`` and labelled with the five
    column names cdRPKM0..cdRPKM4. The figure is saved under
    'NonChrE/KMeansOutputRPF/' with a file name derived from ``len(X)``, so
    two inputs with the same number of rows write to the same file.
    """
    column_labels = ['cdRPKM0', 'cdRPKM1', 'cdRPKM2', 'cdRPKM3', 'cdRPKM4']
    fig, ax = plt.subplots()
    sns.heatmap(X, xticklabels=column_labels, ax=ax)
    ax.set_title('Heatmap')
    fig.savefig('NonChrE/KMeansOutputRPF/Heatmap'+str(len(X)))
    plt.show()

# For each k in list_nums, cluster X and draw one heatmap per cluster,
# printing the cluster size first.
list_nums = [5]
for num in list_nums:
    for cluster in kmeans(X, num):
        print(len(cluster))
        plot_heatmap(cluster)

In [None]:
# Display the log2cdRPKM1 column of RNASeq.
RNASeq['log2cdRPKM1']

# Median Normalize

In [None]:
# One histogram per RPF fold-change column (log2cdRPKM1..4), each preceded by
# the printed median of that column. The four formerly copy-pasted cells
# differed only in the column and the hour shown in the title.
rpf_hist_specs = [
    ('log2cdRPKM1', 2),
    ('log2cdRPKM2', 4),
    ('log2cdRPKM3', 6),
    ('log2cdRPKM4', 8),
]
rpf_hist_bins = np.arange(-2.0,2.0,0.1)

for fold_col, hpi in rpf_hist_specs:
    plt.hist(RPF[fold_col], bins = rpf_hist_bins)
    print(np.median(RPF[fold_col]))
    plt.xlabel('log2fold values RPF')
    plt.ylabel('Frequency')
    plt.title('log2fold: %d hpi/0hpi' % hpi)
    plt.grid(True)
    plt.show()

In [None]:
# Add log2cdRPKM1..4 to RNASeq_2: each cdRPKM column minus cdRPKM0.
# NOTE(review): RNASeq_2 is not defined in any cell visible in this notebook;
# on a fresh kernel this cell would raise NameError - confirm where it comes from.
for t in range(1, 5):
    RNASeq_2['log2cdRPKM%d' % t] = RNASeq_2['cdRPKM%d' % t] - RNASeq_2['cdRPKM0']

In [None]:
# Preview the first five rows of RNASeq.
RNASeq.head(5)

In [None]:
# Preview the first five rows of RNASeq_2.
RNASeq_2.head(5)

In [None]:
# Keep rows of RNASeq_2 whose log2cdRPKM1..4 all lie within [-0.5, 0.5].
# NOTE(review): RNASeq_2 is not defined in any cell visible in this notebook.
stable_cols_rnaseq_2 = ['log2cdRPKM%d' % t for t in range(1, 5)]
filteredRNASeq = RNASeq_2[
    RNASeq_2[stable_cols_rnaseq_2].abs().le(0.5).all(axis=1)
]



In [None]:
# Keep rows of RNASeq whose log2cdRPKM1..4 all lie within [-0.5, 0.5].
# This rebinds filteredRNASeq, replacing any earlier result of the same name.
stable_cols_rnaseq = ['log2cdRPKM%d' % t for t in range(1, 5)]
filteredRNASeq = RNASeq[
    RNASeq[stable_cols_rnaseq].abs().le(0.5).all(axis=1)
]



In [None]:
# Preview the first five rows of the filtered table.
filteredRNASeq.head(5)

In [None]:
# Print (rows, columns) of the filtered table.
print(filteredRNASeq.shape)

In [None]:
# Inner-join RPF (left, suffix _x) with the unfiltered RNASeq table
# (right, suffix _y) on the two gene identifier columns, then preview it.
TE = RPF.merge(RNASeq, on=['AccNum', 'GeneName'])
TE.head()

In [None]:
# (rows, columns) of the merged table.
TE.shape

In [None]:
# TE<t> = cdRPKM<t>_x - cdRPKM<t>_y for t = 0..4, i.e. the left-hand (_x)
# value minus the right-hand (_y) value of the merged table.
# NOTE(review): only indices 0..4 are used here although the tables loaded in
# this notebook also have cdRPKM5 - confirm that this is intended.
te_cols_0_to_4 = {
    'TE%d' % t: TE['cdRPKM%d_x' % t] - TE['cdRPKM%d_y' % t]
    for t in range(5)
}
# Build a fresh frame (identifiers + TE0..TE4) instead of mutating and
# renaming the merged one and then slicing it: .assign returns a new object,
# so adding columns to TE in later cells does not raise SettingWithCopyWarning.
TE = TE[['AccNum', 'GeneName']].assign(**te_cols_0_to_4)
TE.head()

In [None]:
# Add foldTE1..4: each TE column minus TE0.
for t in range(1, 5):
    TE['foldTE%d' % t] = TE['TE%d' % t] - TE['TE0']

In [None]:
# Preview the first five rows of TE.
TE.head(5)

In [None]:
# One histogram per TE fold-change column (foldTE1..4). The four formerly
# copy-pasted cells differed only in the column and the hour shown in the title.
te_hist_specs = [
    ('foldTE1', 2),
    ('foldTE2', 4),
    ('foldTE3', 6),
    ('foldTE4', 8),
]
te_hist_bins = np.arange(-2.0,2.0,0.1)

for fold_col, hpi in te_hist_specs:
    plt.hist(TE[fold_col], bins = te_hist_bins)
    plt.xlabel('log2fold values TE')
    plt.ylabel('Frequency')
    plt.title('log2fold: %d hpi/0hpi' % hpi)
    plt.grid(True)
    plt.show()

In [None]:
# Write TE as a tab-separated file without the index column.
# NOTE(review): absolute, user-specific output path - will not exist on other machines.
TE.to_csv('/Users/akankshitadash/Desktop/TE_medianNormalize.txt',sep='\t',index=False)

In [None]:
# Load one tab-separated RPF table per file in `timepoints`, keep the two
# identifier columns plus cdReads/cdRPKM, and suffix the latter two with the
# file's position in the list (cdReads0, cdRPKM0, ...).
# NOTE(review): `dataframes` is rebound by any later cell that assigns the same
# name, so merge or save this list before running such a cell.
previous = 'AdjustedRPKMOutput/RPF_chrE/geneTXCD_RPKMoutput_EVadjusted_ReadOutput_'
timepoints = ['ATCACG-s_7_1_genome.txt','TTAGGC-s_7_1_genome.txt',
             'CAGATC-s_7_1_genome.txt','GCCAAT-s_7_1_genome.txt',
             'CTTGTA-s_7_1_genome.txt'
            ]
dataframes = []
for idx, filename in enumerate(timepoints):
    # '-' marks a missing value in the input; rows with any missing value
    # (in any column of the file) are dropped before the column selection.
    raw = pd.read_csv(previous + filename, sep='\t', na_values=['-'])
    complete = raw.dropna(how='any')
    # Non-inplace rename returns a new frame, so nothing is written into a
    # column slice (no SettingWithCopyWarning).
    df = complete[['AccNum', 'GeneName', 'cdReads', 'cdRPKM']].rename(
        columns={'cdRPKM': 'cdRPKM'+str(idx), 'cdReads': 'cdReads'+str(idx)}
    )
    dataframes.append(df)

In [None]:
# Load one tab-separated RNASeq table per file in `timepoints`, keep the two
# identifier columns plus cdReads/cdRPKM, and suffix the latter two with the
# file's position in the list (cdReads0, cdRPKM0, ...).
# This rebinds `previous`, `timepoints` and `dataframes`.
previous = 'AdjustedRPKMOutput/RNASeq_chrE/geneTXCD_RPKMoutput_EVadjusted_ReadOutput_'
timepoints = ['ATCACG-s_6_1_genome.txt','TTAGGC-s_6_1_genome.txt',
             'CAGATC-s_6_1_genome.txt','GCCAAT-s_6_1_genome.txt',
             'CTTGTA-s_6_1_genome.txt'
            ]
dataframes = []
for idx, filename in enumerate(timepoints):
    # '-' marks a missing value in the input; rows with any missing value
    # (in any column of the file) are dropped before the column selection.
    raw = pd.read_csv(previous + filename, sep='\t', na_values=['-'])
    complete = raw.dropna(how='any')
    # Non-inplace rename returns a new frame, so nothing is written into a
    # column slice (no SettingWithCopyWarning).
    df = complete[['AccNum', 'GeneName', 'cdReads', 'cdRPKM']].rename(
        columns={'cdRPKM': 'cdRPKM'+str(idx), 'cdReads': 'cdReads'+str(idx)}
    )
    dataframes.append(df)

In [None]:
# Successively inner-join every frame in `dataframes` on the two gene
# identifier columns, then write the result as a tab-separated file.
gene_keys = ['AccNum', 'GeneName']
df_rnaseq = reduce(
    lambda joined, nxt: joined.merge(nxt, on=gene_keys),
    dataframes,
)
df_rnaseq.to_csv('AdjustedRPKMOutput/RNASeq_chrE.txt', sep='\t', index=False)