In [11]:
import pandas as pd
import numpy as np
import pickle
from cmapPy.pandasGEXpress.parse import parse

from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

## Predicting cell viability for the whole LINCS-L1000 study
Using our previously built models, we will predict cell viability for the whole LINCS-L1000 study.

In [46]:
# Load the previously pickled CTRP and Achilles models.
# The context managers guarantee the files are closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# were produced by this project.
with open('../results/model/final_models/ctrp.pkl','rb') as fin:
    model_ctrp=pickle.load(fin)
with open('../results/model/final_models/achilles.pkl','rb') as fin:
    model_achilles=pickle.load(fin)

In [47]:
# We have to feed genes to the models in the same column order as the signature
# tables read here, so we load both tables and take the order from their columns.
signatures_ctrp=pd.read_csv('../results/CTRP/signatures_merged_lm.csv',
                            sep=',',header=0,index_col=[0])
signatures_achilles=pd.read_csv('../results/Achilles/signatures_merged_lm.csv',
                                sep=',',header=0,index_col=[0])
# Index.equals compares both labels and order, and (unlike an elementwise `!=`)
# returns False instead of raising when the two files have a different number of columns.
assert signatures_achilles.columns.equals(signatures_ctrp.columns), \
    'CTRP and Achilles signature tables do not share the same gene columns/order'
gene_order=signatures_achilles.columns
# only the column order is needed from here on, so drop the tables
del signatures_ctrp
del signatures_achilles

In [34]:
# let's start with GSE92742: load the signature metadata (indexed by its first column)
sig_info=pd.read_table('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_sig_info.txt',
                       sep='\t',header=0,index_col=[0],low_memory=False)
# Columns to store predicted values. They are initialised with NaN (float) rather
# than integer 0, so that assigning float predictions later does not rely on
# silent dtype upcasting and any signature that never gets a prediction is
# visibly missing instead of looking like a predicted value of 0.
sig_info['Achilles_prediction']=np.nan
sig_info['CTRP_prediction']=np.nan

In [35]:
# As reading in all ~400,000 signatures at once would be too memory consuming,
# we predict in batches of `batch_size` signatures.
batch_size=10000
n_signatures=sig_info.shape[0]
gctx_path='../data/LINCS/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx'
# Stepping over the start offsets never produces an empty batch, even when
# n_signatures is an exact multiple of batch_size (or 0); slicing clamps the
# last batch to the end of the index automatically.
for i,start in enumerate(range(0,n_signatures,batch_size)):
    print(i)  # progress: batch number
    sig_ids=sig_info.index[start:start+batch_size]
    signatures=parse(gctx_path,cid=sig_ids,rid=gene_order)
    # transpose to signatures x genes and explicitly re-select so that rows follow
    # sig_ids and columns follow the gene order used by the models
    signatures=signatures.data_df.T.loc[sig_ids,gene_order]
    sig_info.loc[sig_ids,'Achilles_prediction']=model_achilles.predict(signatures)
    sig_info.loc[sig_ids,'CTRP_prediction']=model_ctrp.predict(signatures)
sig_info.to_csv('../results/predictions/GSE92742_pred.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47


In [48]:
# and continue with GSE70138: load the signature metadata (indexed by its first column)
sig_info=pd.read_table('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info.txt',
                       sep='\t',header=0,index_col=[0],low_memory=False)
# Columns to store predicted values; NaN (float) placeholders keep the dtype
# compatible with the float predictions and make unfilled rows detectable.
sig_info['Achilles_prediction']=np.nan
sig_info['CTRP_prediction']=np.nan
# Predict in batches of `batch_size` signatures to limit memory use.
batch_size=10000
n_signatures=sig_info.shape[0]
gctx_path='../data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx'
# Stepping over the start offsets never produces an empty batch, even when
# n_signatures is an exact multiple of batch_size (or 0); slicing clamps the
# last batch to the end of the index automatically.
for i,start in enumerate(range(0,n_signatures,batch_size)):
    print(i)  # progress: batch number
    sig_ids=sig_info.index[start:start+batch_size]
    signatures=parse(gctx_path,cid=sig_ids,rid=gene_order)
    # transpose to signatures x genes and explicitly re-select so that rows follow
    # sig_ids and columns follow the gene order used by the models
    signatures=signatures.data_df.T.loc[sig_ids,gene_order]
    sig_info.loc[sig_ids,'Achilles_prediction']=model_achilles.predict(signatures)
    sig_info.loc[sig_ids,'CTRP_prediction']=model_ctrp.predict(signatures)
sig_info.to_csv('../results/predictions/GSE70138_pred.csv')

0
1
2
3
4
5
6
7
8
9
10
11


In [53]:
# Merging the GSE70138 and GSE92742 predictions into a single table.
# Only these metadata columns plus the two prediction columns are kept.
keep_cols=['pert_id','pert_iname','pert_type','cell_id','pert_idose','pert_itime',
           'Achilles_prediction','CTRP_prediction']
gse92742=pd.read_csv('../results/predictions/GSE92742_pred.csv',
                     sep=',',header=0,index_col=[0],low_memory=False)[keep_cols]
gse70138=pd.read_csv('../results/predictions/GSE70138_pred.csv',
                     sep=',',header=0,index_col=[0],low_memory=False)[keep_cols]
# Replace '?' by 'u' in the GSE92742 dose strings (presumably a mis-encoded micro
# sign in the unit -- TODO confirm against the raw sig_info file).
# Non-string entries (e.g. NaN for a missing dose) are passed through unchanged
# instead of raising AttributeError; .assign builds a new frame rather than
# writing into the column-subset selection.
gse92742=gse92742.assign(
    pert_idose=gse92742['pert_idose'].map(
        lambda dose: dose.replace('?','u') if isinstance(dose,str) else dose))
merged=pd.concat([gse92742,gse70138])
merged.to_csv('../results/predictions/merged_pred.csv',sep=',')