In [1]:
import pandas as pd
import numpy as np
import pickle
from cmapPy.pandasGEXpress.parse import parse

from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

## Predicting cell viability for the whole LINCS-L1000 study
Using our previously built models, we will predict cell viability for the whole LINCS-L1000 study.

In [46]:
#we load the previously pickled models
#NOTE(review): pickle.load can execute arbitrary code - only load model files you trust
#`with` guarantees the file handles are closed even if unpickling fails
with open('../results/model/final_models/ctrp.pkl','rb') as fin:
    model_ctrp=pickle.load(fin)
with open('../results/model/final_models/achilles.pkl','rb') as fin:
    model_achilles=pickle.load(fin)

In [47]:
#the models expect the genes in the same column order as the signature tables,
#so we recover that order from the column headers of both tables
ctrp_genes=pd.read_table('../results/CTRP/signatures_merged_lm.csv',
                         sep=',',header=0,index_col=[0]).columns
achilles_genes=pd.read_table('../results/Achilles/signatures_merged_lm.csv',
                             sep=',',header=0,index_col=[0]).columns
#both tables must list the genes in exactly the same order
assert np.sum(achilles_genes!=ctrp_genes)==0
gene_order=achilles_genes

In [34]:
# let's start with GSE92742
sig_info=pd.read_table('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_sig_info.txt',
                       sep='\t',header=0,index_col=[0],low_memory=False)
# columns to store predicted values; initialised as float NaN (not integer 0) so that
# float predictions can be written without a dtype clash and any signature that is
# never predicted stays visibly missing instead of looking like a real 0
sig_info['Achilles_prediction']=np.nan
sig_info['CTRP_prediction']=np.nan

In [35]:
#as reading in all ~400,000 signatures would be too memory consuming, we will do it in batches
batch_size=10000
gctx_path='../data/LINCS/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx'
#stepping by batch_size (instead of looping n//batch_size+1 times) never yields an empty
#trailing batch when the number of signatures is an exact multiple of batch_size
for i,start in enumerate(range(0,sig_info.shape[0],batch_size)):
    print(i)
    #slicing clips at the end of the index, so the last batch may simply be shorter
    sig_ids=sig_info.index[start:start+batch_size]
    signatures=parse(gctx_path,cid=sig_ids,rid=gene_order)
    #samples as rows, genes as columns, both in the order the models expect
    signatures=signatures.data_df.T.loc[sig_ids,gene_order]
    sig_info.loc[sig_ids,'Achilles_prediction']=model_achilles.predict(signatures)
    sig_info.loc[sig_ids,'CTRP_prediction']=model_ctrp.predict(signatures)
sig_info.to_csv('../results/predictions/GSE92742_pred.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47


In [48]:
#and continue with GSE70138
sig_info=pd.read_table('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info.txt',
                       sep='\t',header=0,index_col=[0],low_memory=False)
# columns to store predicted values; initialised as float NaN (not integer 0) so that
# float predictions can be written without a dtype clash and any signature that is
# never predicted stays visibly missing instead of looking like a real 0
sig_info['Achilles_prediction']=np.nan
sig_info['CTRP_prediction']=np.nan
batch_size=10000
gctx_path='../data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx'
#stepping by batch_size (instead of looping n//batch_size+1 times) never yields an empty
#trailing batch when the number of signatures is an exact multiple of batch_size
for i,start in enumerate(range(0,sig_info.shape[0],batch_size)):
    print(i)
    #slicing clips at the end of the index, so the last batch may simply be shorter
    sig_ids=sig_info.index[start:start+batch_size]
    signatures=parse(gctx_path,cid=sig_ids,rid=gene_order)
    #samples as rows, genes as columns, both in the order the models expect
    signatures=signatures.data_df.T.loc[sig_ids,gene_order]
    sig_info.loc[sig_ids,'Achilles_prediction']=model_achilles.predict(signatures)
    sig_info.loc[sig_ids,'CTRP_prediction']=model_ctrp.predict(signatures)
sig_info.to_csv('../results/predictions/GSE70138_pred.csv')

0
1
2
3
4
5
6
7
8
9
10
11


In [53]:
#merging GSE70138 and GSE92742
#metadata and prediction columns kept in the merged table (same for both studies)
keep_cols=['pert_id','pert_iname','pert_type','cell_id','pert_idose','pert_itime',
           'Achilles_prediction','CTRP_prediction']
#.copy() gives independent frames, so the column assignment below does not write
#into a view of the frame that was read from disk (SettingWithCopyWarning)
gse92742=pd.read_table('../results/predictions/GSE92742_pred.csv',
                       sep=',',header=0,index_col=[0],low_memory=False)[keep_cols].copy()
gse70138=pd.read_table('../results/predictions/GSE70138_pred.csv',
                       sep=',',header=0,index_col=[0],low_memory=False)[keep_cols].copy()
#literal (non-regex) replacement of '?' by 'u' in the dose strings;
#the .str accessor passes missing values through instead of raising
gse92742['pert_idose']=gse92742['pert_idose'].str.replace('?','u',regex=False)
merged=pd.concat([gse92742,gse70138])
merged.to_csv('../results/predictions/merged_pred.csv',sep=',')

## Predicting cell viability for NCI60
Using our previously built models, we predict cell viabilities for compounds in NCI60.

At first we have to match NCI60 and LINCS instances.

In [12]:
#name of one NCI60 response file (read from ../data/NCI60/ by match_prediction_with_nci60)
fname='CANCER60GI50.LST'

In [13]:
#just some helper function
def remove_space(x):
    """Return `x` without its final character if that character is a space.

    Only a single trailing space is removed; any other string, including the
    empty string, is returned unchanged.
    """
    #endswith (unlike x[-1]) is safe for the empty string
    return x[:-1] if x.endswith(' ') else x

In [38]:
def match_prediction_with_nci60(fname):
    """matches NCI60 drug sensitivity data with LINCS-L1000 predictions (and also CTRP data)

    Parameters
    ----------
    fname : str
        Name of a comma separated NCI60 response file inside ``../data/NCI60/``.

    Writes ``../results/NCI60/<fname without its last 4 characters>_validation.csv``
    and returns None. Only NCI60 rows that could be matched to at least one
    24 h LINCS-L1000 prediction are written; the CTRP columns stay NaN for rows
    without a matching CTRP measurement.
    """
    pred_cols=['CTRP_prediction','Achilles_prediction']
    ctrp_cols=['pred_pv_high_conc','area_under_curve']
    #read id matching (BRD id -> list of NSC ids)
    #NOTE(review): pickle.load can execute arbitrary code - only load trusted files
    with open('../data/NCI60/BRD_NSC_match.pkl','rb') as fin:
        id_match=pickle.load(fin,encoding='latin1')
    nscs=list({nsc for brd in id_match for nsc in id_match[brd]})
    #match cell lines
    cell_match=pd.read_table('../data/NCI60/NCI60_CTRP_cell_matching.csv',sep=',',header=0,index_col=[0])
    cell_match['NCI60']=cell_match['NCI60'].apply(remove_space)
    cell_match=cell_match[~pd.isnull(cell_match['CTRP'])]
    #read the actual NCI60 data
    nci60=pd.read_table('../data/NCI60/'+fname, sep=',',header=0,index_col=None)
    #.copy() after filtering: the frame is modified below and must not be a view
    nci60=nci60[np.in1d(nci60['NSC'],nscs)].copy()
    nci60['CELL']=nci60['CELL'].apply(remove_space)
    nci60=nci60[np.in1d(nci60['CELL'],cell_match['NCI60'])]
    #the response metric is read from the 4th column from the end
    #(assumes the layout of the NCI60 files - TODO confirm)
    response=nci60.columns[-4]
    nci60=nci60.loc[:,['NSC','LCONC','CELL',response]].copy()
    #delta is the difference between maximal concentration and effective concentration
    nci60['DELTA']=-nci60[response]-nci60['LCONC']
    #getting l1000 data
    l1000pred=pd.read_table('../results/predictions/merged_pred.csv',sep=',',header=0,index_col=[0],low_memory=False)
    l1000pred=l1000pred[l1000pred['pert_itime']=='24 h']
    l1000pred=l1000pred[np.in1d(l1000pred['pert_id'],list(id_match.keys()))]
    l1000pred=l1000pred[np.in1d(l1000pred['cell_id'],cell_match['CTRP'])].copy()
    #from here on cell_match is a Series mapping CTRP cell names to NCI60 cell names
    cell_match=cell_match.set_index('CTRP')['NCI60']
    l1000pred['cell_id']=cell_match[l1000pred['cell_id']].values
    #minimal predicted viability for every cell line - compound pair
    #(one groupby instead of rescanning the whole frame once per pair)
    l1000mins=l1000pred.groupby(['cell_id','pert_id'],sort=False)[pred_cols].min().reset_index()
    #make reverse ID dict (NSC id -> list of BRD ids)
    rev_id_match={}
    for brd,nsc_list in id_match.items():
        for nsc in nsc_list:
            rev_id_match.setdefault(nsc,[]).append(brd)
    #match l1000 with nci60
    indexes=nci60.index[np.in1d(nci60['CELL'],l1000mins['cell_id'])]
    nci60['CTRP_prediction']=np.nan
    nci60['Achilles_prediction']=np.nan
    for i in indexes:
        nsc,cell=nci60.loc[i,['NSC','CELL']]
        brd=rev_id_match[nsc]
        fil=np.in1d(l1000mins['pert_id'],brd)&(l1000mins['cell_id']==cell).values
        if fil.any():
            #explicit axis=0: column-wise mean over all matching BRD ids
            #(np.mean(frame) reduces to a single scalar on recent pandas)
            nci60.loc[i,pred_cols]=l1000mins.loc[fil,pred_cols].mean(axis=0).values
    nci60=nci60[~pd.isnull(nci60['CTRP_prediction'])].copy()
    #match with ctrp
    ctrp=pd.read_table('../results/CTRP/ctrp_DR.csv',sep=',',header=0,index_col=[0])
    ctrp=ctrp[np.in1d(ctrp['broad_cpd_id'],list(id_match.keys()))]
    ctrp=ctrp[np.in1d(ctrp['ccl_name'],cell_match.index)].copy()
    ctrp['ccl_name']=cell_match[ctrp['ccl_name']].values
    nci60['pred_pv_high_conc']=np.nan
    nci60['area_under_curve']=np.nan
    for i in nci60.index:
        nsc,cell=nci60.loc[i,['NSC','CELL']]
        brd=rev_id_match[nsc]
        fil=np.in1d(ctrp['broad_cpd_id'],brd)&(ctrp['ccl_name']==cell).values
        if fil.any():
            nci60.loc[i,ctrp_cols]=ctrp.loc[fil,ctrp_cols].mean(axis=0).values
    nci60.to_csv('../results/NCI60/'+fname[:-4]+'_validation.csv',sep=',')

In [40]:
# run the matching once per NCI60 drug response metric (GI50, LC50 and TGI files)
nci60_metric_files=('CANCER60GI50.LST','CANCER60LC50.LST','CANCER60TGI.LST')
for fname in nci60_metric_files:
    match_prediction_with_nci60(fname)

Now we will select the maximal-concentration instances for each compound, and from them take the maximal (least toxic) and minimal (most toxic) predicted cell viability values (these usually come from different cell lines). This gives us a general insight into our predictions.

In [67]:
#load the merged LINCS-L1000 predictions (first column is used as the index)
data=pd.read_csv(
    '../results/predictions/merged_pred.csv',
    sep=',',
    header=0,
    index_col=0,
    low_memory=False,
)

In [68]:
#keep only compound / ligand treatments with a 24 h perturbation time
is_compound=np.in1d(data['pert_type'],['trt_cp','trt_lig'])
is_24h=data['pert_itime']=='24 h'
data=data[is_compound & is_24h]

In [69]:
#concentration conversions, needed to be able to select the highest concentration
#a dose string is "<amount> <unit>": split it once and reuse the two parts
dose_parts=data['pert_idose'].apply(lambda dose:dose.split())
dose_unit=dose_parts.apply(lambda parts:parts[1])
dose_amount=dose_parts.apply(lambda parts:parts[0])
fil=dose_unit=='ng/uL'
#concentrations measured in ng/uL
data_ng=data[fil].copy()
#concentrations measured in nM / uM
data_um=data[~fil].copy()
data_um['cc']=dose_amount[~fil].astype(float).values
data_um['unit']=dose_unit[~fil].values
#express nM doses on the same scale as the rest (divide by 1000)
nm_rows=data_um.index[data_um['unit']=='nM']
data_um.loc[nm_rows,'cc']=data_um.loc[nm_rows,'cc']/1000
data_ng['cc']=dose_amount[fil].astype(float).values

In [61]:
#sort by concentration in DESCENDING order and keep the first row of every
#perturbation - cell pair, i.e. the instance with the highest concentration
#NOTE(review): this cell's execution count is out of sequence with its neighbours -
#confirm that it is meant to be part of the pipeline
pair_cols=['pert_iname','cell_id']
data_ng=(data_ng.sort_values('cc',ascending=False)
                .drop_duplicates(pair_cols,keep='first')
                .copy())
data_um=(data_um.sort_values('cc',ascending=False)
                .drop_duplicates(pair_cols,keep='first')
                .copy())

In [70]:
#keep only highest concentration instances
def _keep_highest_concentration(frame):
    """Return the rows of `frame` whose `cc` equals the maximal `cc` of their `pert_iname`.

    Rows with a missing `cc` are dropped. The per-compound maximum is computed with a
    single groupby instead of rescanning the whole frame once per compound.
    """
    max_cc=frame.groupby('pert_iname')['cc'].transform('max')
    #drop rows below their compound's maximum and rows without a concentration
    return frame[frame['cc'].notnull() & ~(frame['cc']<max_cc)]

data_ng=_keep_highest_concentration(data_ng)
data_um=_keep_highest_concentration(data_um)

In [76]:
#average the remaining instances of every perturbation - cell pair
#numeric_only=True: the frames still carry string columns (e.g. pert_idose), which
#recent pandas versions refuse to average instead of silently dropping them
data_um=data_um.groupby(['pert_iname','cell_id']).mean(numeric_only=True).reset_index()
data_ng=data_ng.groupby(['pert_iname','cell_id']).mean(numeric_only=True).reset_index()

In [77]:
#select lowest and highest cell viability
def _extreme_rows(frame,keep):
    """Keep one row per `pert_iname` of an already sorted `frame`.

    `keep` is 'first' or 'last' and is passed on to drop_duplicates. The result is
    ordered by (pert_iname, cell_id) and gets a fresh 0..n-1 index, so the min and
    max tables of one frame line up row by row.
    """
    return (frame.drop_duplicates(['pert_iname'],keep=keep)
                 .sort_values(['pert_iname','cell_id'])
                 .reset_index(drop=True))

#after the descending sort the first row of a compound is its highest prediction
#and the last row its lowest one
data_ng=data_ng.sort_values('Achilles_prediction',ascending=False)
data_ng_max=_extreme_rows(data_ng,'first')
data_ng_min=_extreme_rows(data_ng,'last')
data_um=data_um.sort_values('Achilles_prediction',ascending=False)
data_um_max=_extreme_rows(data_um,'first')
data_um_min=_extreme_rows(data_um,'last')
#the index is reset inside the helper, so no loop variable overwrites the
#notebook-level `data` frame any more

In [78]:
#sanity check: the min and max tables must list the same compounds in the same order
assert (data_um_min['pert_iname']==data_um_max['pert_iname']).all()
assert (data_ng_min['pert_iname']==data_ng_max['pert_iname']).all()

In [79]:
def _min_max_table(frame_min,frame_max):
    """Put the lowest / highest predictions of every compound side by side."""
    return pd.DataFrame({'pert_iname':frame_min['pert_iname'],
                         'cell_max':frame_max['cell_id'],
                         'cell_min':frame_min['cell_id'],
                         'pred_max':frame_max['Achilles_prediction'],
                         'pred_min':frame_min['Achilles_prediction']},
                        index=frame_min.index,
                        columns=['pert_iname','cell_max','cell_min','pred_max','pred_min'])

#one table for the nM / uM doses and one for the ng/uL doses
results_um=_min_max_table(data_um_min,data_um_max)
results_ng=_min_max_table(data_ng_min,data_ng_max)

In [80]:
#stack both tables under one fresh 0..n-1 index and save them
results=pd.concat((results_um,results_ng),ignore_index=True)
results.to_csv('../results/predictions/predicted_min_max.csv',sep=',')

And we select the minimal predicted cell viability for VCAP, PC3 and other cell lines.

In [244]:
#read predicted values, keep compound / ligand treatments only and
#order them by increasing predicted viability (lowest prediction first)
data=pd.read_csv('../results/predictions/merged_pred.csv',sep=',',header=0,index_col=0,low_memory=False)
fil=np.in1d(data['pert_type'],['trt_cp','trt_lig'])
data=data[fil].sort_values('Achilles_prediction')

In [245]:
#`data` is sorted by increasing prediction, so the first row kept for every
#compound is its lowest predicted viability within the given cell line group
fil_vc=data['cell_id']=='VCAP'
fil_pc=data['cell_id']=='PC3'

data_vcap=data[fil_vc].drop_duplicates('pert_iname')
data_pc=data[fil_pc].drop_duplicates('pert_iname')
#everything that is neither PC3 nor VCAP
data_other=data[~(fil_pc|fil_vc)].drop_duplicates('pert_iname')

In [246]:
#restrict the three tables to the compounds present in all of them and
#sort every table by compound name so that their rows line up
shared_names=set(data_vcap['pert_iname'])
shared_names&=set(data_pc['pert_iname'])
shared_names&=set(data_other['pert_iname'])
pert_inames=list(shared_names)

data_vcap=data_vcap[np.in1d(data_vcap['pert_iname'],pert_inames)].sort_values('pert_iname')
data_pc=data_pc[np.in1d(data_pc['pert_iname'],pert_inames)].sort_values('pert_iname')
data_other=data_other[np.in1d(data_other['pert_iname'],pert_inames)].sort_values('pert_iname')

In [247]:
#sanity check: all three tables list the same compounds in the same order
pc_names=data_pc['pert_iname'].values
assert np.array_equal(pc_names,data_vcap['pert_iname'].values)
assert np.array_equal(pc_names,data_other['pert_iname'].values)

In [248]:
#one row per shared compound: lowest predicted viability in VCAP, PC3 and the other cell lines
results=pd.DataFrame({'pert_iname':data_pc['pert_iname'].values,
                      'VCAP':data_vcap['Achilles_prediction'].values,
                      'PC3':data_pc['Achilles_prediction'].values,
                      'Other':data_other['Achilles_prediction'].values},
                     columns=['pert_iname','VCAP','PC3','Other'])

In [249]:
#save the per-compound VCAP / PC3 / Other predictions as a comma separated file
results.to_csv('../results/predictions/predicted_prostate.csv',sep=',')