In [1]:
import pandas as pd
import numpy as np
from cmapPy.pandasGEXpress.parse import parse

## First, we get the cell viabilities from the CTRP data.

In [76]:
# Load the raw CTRP cell viability table together with the three CTRP
# metadata tables (cell lines, compounds, experiments). Every file is read
# with the same options: tab separated, first row as header, no index column.
ctrp_read_options=dict(sep='\t',header=0,index_col=None)
ctrp_raw=pd.read_table('../data/CTRP/v20.data.per_cpd_post_qc.txt',
                       **ctrp_read_options)
cell_info=pd.read_table('../data/CTRP/v20.meta.per_cell_line.txt',
                        **ctrp_read_options)
compound_info=pd.read_table('../data/CTRP/v20.meta.per_compound.txt',
                            **ctrp_read_options)
experiment_info=pd.read_table('../data/CTRP/v20.meta.per_experiment.txt',
                              **ctrp_read_options)

In [77]:
# Translate the ids used in the raw CTRP table into the identifiers needed
# for matching with LINCS: experiment_id -> master_ccl_id -> ccl_name and
# master_cpd_id -> broad_cpd_id. A dot is printed after each step.
ctrp_proc=ctrp_raw[['experiment_id','master_cpd_id',
                    'cpd_conc_umol','cpd_avg_pv']].copy()
print('Preprocessing',end='')

# keep one metadata row per (experiment, cell line) pair
experiment_info=experiment_info.drop_duplicates(subset=['experiment_id',
                                                        'master_ccl_id'])
print('.',end='')

# index the experiment table by experiment_id so it can be used as a lookup
experiment_info.index=experiment_info['experiment_id']
print('.',end='')
experiment_ids=ctrp_proc['experiment_id'].values
ctrp_proc['master_ccl_id']=experiment_info['master_ccl_id'].loc[experiment_ids].values
print('.',end='')

# same lookup pattern: cell line id -> cell line name
cell_info.index=cell_info['master_ccl_id']
print('.',end='')
cell_line_ids=ctrp_proc['master_ccl_id'].values
ctrp_proc['ccl_name']=cell_info['ccl_name'].loc[cell_line_ids].values
print('.',end='')

# and compound id -> Broad compound id
compound_info.index=compound_info['master_cpd_id']
print('.',end='')
compound_ids=ctrp_proc['master_cpd_id'].values
ctrp_proc['broad_cpd_id']=compound_info['broad_cpd_id'].loc[compound_ids].values
print('.')

# only the translated identifiers, the concentration and the viability are kept
ctrp_proc=ctrp_proc[['ccl_name','broad_cpd_id',
                     'cpd_conc_umol','cpd_avg_pv']]
print('Done!')

Preprocessing.......
Done!


Now we have a dataframe with Cell lines, Compound IDs, Compound concentrations and Cell viabilities as columns

In [78]:
# show the first five rows and the dimensions of the processed CTRP table
print(ctrp_proc.head(n=5))
print('Shape:', ctrp_proc.shape)

  ccl_name   broad_cpd_id  cpd_conc_umol  cpd_avg_pv
0     CAS1  BRD-K46556387        0.00030      0.9303
1     CAS1  BRD-K46556387        0.00061      0.8337
2     CAS1  BRD-K46556387        0.00120      1.0460
3     CAS1  BRD-K46556387        0.00240      1.0910
4     CAS1  BRD-K46556387        0.00490      1.0190
Shape: (6171005, 4)


Now we can read some metadata to select the intersection (same cell lines and compounds) between LINCS-L1000 and CTRP.

In [79]:
# Read the LINCS metadata needed to intersect with CTRP: the cell line table
# and the perturbagen tables of both GEO series. All files are tab separated
# with a header row and no index column.
lincs_meta_options=dict(sep='\t',header=0,index_col=None)

#cell line metadata is same for gse92742 and gse70138
# NOTE(review): the GSE92742 cell_info file is read from the GSE70138
# directory - confirm that this is where the file actually lives.
gse92742_cell=pd.read_table('../data/LINCS/GSE70138/GSE92742_Broad_LINCS_cell_info.txt',
                            **lincs_meta_options)
lincs_cells=list(set(gse92742_cell['cell_id']))

gse92742_comp=pd.read_table('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_pert_info.txt',
                            **lincs_meta_options)
gse70138_comp=pd.read_table('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_pert_info.txt',
                            **lincs_meta_options)
# a compound counts as a LINCS compound if it appears in either series
lincs_compounds=list(set(gse92742_comp['pert_id']).union(gse70138_comp['pert_id']))

In [80]:
#we select data from ctrp with lincs cell lines and compounds
# np.isin replaces the deprecated np.in1d; for these 1-D columns the resulting
# boolean mask is the same. A row is kept only when both its cell line and
# its compound are present in LINCS.
fil=np.isin(ctrp_proc['ccl_name'],lincs_cells)&np.isin(ctrp_proc['broad_cpd_id'],lincs_compounds)
ctrp_proc=ctrp_proc[fil]
print('Shape:',ctrp_proc.shape)

Shape: (238005, 4)


For duplicated entries in CTRP (same cell line, same drug, same concentration) we calculate and keep the average cell viability (of course, detecting duplicates on float values such as concentrations is a bit delicate, but here we only want to remove exact duplicates).

In [81]:
# Average the viability of rows sharing the same cell line, compound and
# concentration, turn the group keys back into ordinary columns and store
# the table for the matching step.
duplicate_keys=['ccl_name','broad_cpd_id','cpd_conc_umol']
ctrp_proc=ctrp_proc.groupby(duplicate_keys).mean().reset_index()
print('Shape:',ctrp_proc.shape)
ctrp_proc.to_csv('../results/CTRP/cell_viability_ctrp.csv',sep=',')

Shape: (221306, 4)


## Now we can match CTRP and LINCS-L1000
We will match CTRP and LINCS-L1000 instances based on cell line, drug and concentration. As the concentrations in CTRP and LINCS-L1000 are not the same, we match each LINCS-L1000 instance with the CTRP instance that has the closest concentration (and the same cell line and drug), as long as the absolute log10 concentration difference is smaller than 0.2 (~1.6-fold concentration difference).

In [131]:
def get_closest_cc_ctrp(l1000, ctrp=None):
    """Select the CTRP instance with the closest concentration for one L1000 instance.

    Parameters
    ----------
    l1000 : mapping / pandas.Series
        One L1000 signature row; 'pert_id', 'cell_id' and 'log10_pert_dose'
        are read from it.
    ctrp : pandas.DataFrame, optional
        CTRP table with the columns 'broad_cpd_id', 'ccl_name',
        'log10_cpd_conc_umol' and 'cpd_avg_pv'. When omitted, the global
        variable CTRP is used (the original behaviour).

    Returns
    -------
    pandas.Series
        Labels 'cpd_avg_pv' and 'log10_cpd_conc_umol' of the CTRP row with the
        same compound and cell line whose log10 concentration is closest to
        the L1000 dose. When no CTRP row shares the compound and cell line,
        both values are NaN.
    """
    if ctrp is None:
        ctrp=CTRP
    result_columns=['cpd_avg_pv','log10_cpd_conc_umol']
    fil=(ctrp['broad_cpd_id']==l1000['pert_id']) & (ctrp['ccl_name']==l1000['cell_id'])
    if not fil.any():
        # Always return a Series with the same labels: mixing scalar NaN and
        # Series results makes the shape of DataFrame.apply(..., axis=1)
        # depend on which kind of result comes first.
        return pd.Series(np.nan,index=result_columns)
    ctrp_temp=ctrp.loc[fil,result_columns].copy()
    # distance between the CTRP and the L1000 concentration on the log10 scale
    ctrp_temp['delta_cc']=np.abs(ctrp_temp['log10_cpd_conc_umol']-l1000['log10_pert_dose'])
    j=ctrp_temp.sort_values('delta_cc').index[0]
    return ctrp_temp.loc[j,result_columns]

In [132]:
# Reload the processed CTRP viabilities written earlier in this notebook and
# add the log10 concentration used for matching.
CTRP=pd.read_csv('../results/CTRP/cell_viability_ctrp.csv',sep=',',
                 header=0,index_col=[0])
CTRP['log10_cpd_conc_umol']=np.log10(CTRP['cpd_conc_umol'])
# distinct cell lines and compounds present in CTRP
cell_lines=list(set(CTRP['ccl_name']))
compounds=list(set(CTRP['broad_cpd_id']))

# LINCS signature metadata of both GEO series (tab separated, header row)
sig_info_options=dict(sep='\t',header=0,index_col=None,low_memory=False)
sig_info_gse92742=pd.read_table('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_sig_info.txt',
                                **sig_info_options)
sig_info_gse70138=pd.read_table('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info.txt',
                                **sig_info_options)
print('Shape GSE92742 sig_info:',sig_info_gse92742.shape)
print('Shape GSE70138 sig_info:',sig_info_gse70138.shape)

Shape GSE92742 sig_info: (473647, 12)
Shape GSE70138 sig_info: (118050, 8)


In [133]:
#filter for cell line and compound intersection with ctrp
# np.isin replaces the deprecated np.in1d (same boolean mask for 1-D input).
# The filtered frames are explicit copies so that the columns added to them
# later are not written into a slice of the full sig_info tables.
fil=np.isin(sig_info_gse92742['pert_id'],compounds) & np.isin(sig_info_gse92742['cell_id'],cell_lines)
sig_info_gse92742=sig_info_gse92742[fil].copy()
fil=np.isin(sig_info_gse70138['pert_id'],compounds) & np.isin(sig_info_gse70138['cell_id'],cell_lines)
sig_info_gse70138=sig_info_gse70138[fil].copy()
print('Shape GSE92742 sig_info:',sig_info_gse92742.shape)
print('Shape GSE70138 sig_info:',sig_info_gse70138.shape)

Shape GSE92742 sig_info: (16010, 12)
Shape GSE70138 sig_info: (12693, 8)


In [134]:
#for matching we use log10 concentration, we start with gse92742
# the check only guarantees a single unit string is present (expected: uM)
assert len(set(sig_info_gse92742['pert_dose_unit']))==1 #all doses are in uM
# work on an explicit copy: sig_info_gse92742 is a filtered selection, and
# assigning columns to such a selection triggers SettingWithCopyWarning
sig_info_gse92742=sig_info_gse92742.copy()
sig_info_gse92742['pert_dose']=sig_info_gse92742['pert_dose'].astype(float)
fil=sig_info_gse92742['pert_dose']!=0.0 #remove 0 concentration instances
sig_info_gse92742=sig_info_gse92742[fil].copy()
# pert_dose is already float here, so no second cast is needed
sig_info_gse92742['log10_pert_dose']=np.log10(sig_info_gse92742['pert_dose'])

In [142]:
#takes some time
# for every L1000 signature look up the CTRP instance (same compound and cell
# line) with the closest concentration
sig_info_gse92742_nearest=sig_info_gse92742.apply(get_closest_cc_ctrp,axis=1)
# axis must be passed by keyword: a positional axis argument to pd.concat is
# deprecated and rejected by pandas 2.x
sig_info_gse92742=pd.concat([sig_info_gse92742,sig_info_gse92742_nearest],axis=1)
# drop signatures without any matching CTRP instance
fil=~pd.isnull(sig_info_gse92742['cpd_avg_pv'])
sig_info_gse92742=sig_info_gse92742[fil]
sig_info_gse92742.to_csv('../results/CTRP/sig_info_gse92742_viab.csv')

In [160]:
#let's do this with gse70138, gse70138 does not have pert_dose and pert_dose unit
#only pert_idose, which is a string like '10.0 um'
# work on an explicit copy: sig_info_gse70138 is a filtered selection, and
# assigning columns to such a selection triggers SettingWithCopyWarning
sig_info_gse70138=sig_info_gse70138.copy()
# split 'value unit' into a float dose and its unit string
sig_info_gse70138['pert_dose']=sig_info_gse70138['pert_idose'].apply(lambda x:float(x.split()[0]))
sig_info_gse70138['pert_dose_unit']=sig_info_gse70138['pert_idose'].apply(lambda x:x.split()[1])
# check the units of gse70138 itself (the check used to look at gse92742 again);
# it only guarantees a single unit string is present (expected: um)
assert len(set(sig_info_gse70138['pert_dose_unit']))==1 #all doses are in um
#remove 0 and log transform
fil=sig_info_gse70138['pert_dose']!=0.0 #remove 0 concentration instances
sig_info_gse70138=sig_info_gse70138[fil].copy()
# pert_dose is already float here, so no second cast is needed
sig_info_gse70138['log10_pert_dose']=np.log10(sig_info_gse70138['pert_dose'])

In [162]:
#takes some time
# for every L1000 signature look up the CTRP instance (same compound and cell
# line) with the closest concentration
sig_info_gse70138_nearest=sig_info_gse70138.apply(get_closest_cc_ctrp,axis=1)
# axis must be passed by keyword: a positional axis argument to pd.concat is
# deprecated and rejected by pandas 2.x
sig_info_gse70138=pd.concat([sig_info_gse70138,sig_info_gse70138_nearest],axis=1)
# drop signatures without any matching CTRP instance
fil=~pd.isnull(sig_info_gse70138['cpd_avg_pv'])
sig_info_gse70138=sig_info_gse70138[fil]
sig_info_gse70138.to_csv('../results/CTRP/sig_info_gse70138_viab.csv')

Now we can read the corresponding gene expression signatures. We will only work with the actual measured (landmark) genes.

In [168]:
#also takes some time
# ids of the signatures that have a matched CTRP instance
sig_ids_gse70138=sig_info_gse70138['sig_id'].tolist()
sig_ids_gse92742=sig_info_gse92742['sig_id'].tolist()

# restrict the rows of the expression matrices to the genes flagged pr_is_lm==1
gene_info=pd.read_table('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',sep='\t')
fil=gene_info['pr_is_lm']==1
gene_ids=gene_info.loc[fil,'pr_gene_id'].astype(str).tolist()

# parse only the selected signatures (cid) and genes (rid), then transpose
# the resulting data frame
signatures_gse92742=parse(
    '../data/LINCS/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
    cid=sig_ids_gse92742,rid=gene_ids).data_df.T
signatures_gse70138=parse(
    '../data/LINCS/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx',
    cid=sig_ids_gse70138,rid=gene_ids).data_df.T

signatures_gse70138.to_csv('../results/CTRP/signatures_gse70138.csv',sep=',')
signatures_gse92742.to_csv('../results/CTRP/signatures_gse92742.csv',sep=',')

Finally, we can merge our data regarding GSE70138 and GSE92742, and we are ready with the CTRP preprocessing step.

In [190]:
# Reload the signature tables with their matched CTRP viabilities, index them
# by sig_id, keep the columns needed downstream and drop every signature whose
# matched CTRP concentration differs by 0.2 or more on the log10 scale.
sig_info_gse92742=pd.read_table('../results/CTRP/sig_info_gse92742_viab.csv',
                                sep=',',header=0,index_col=[0])
sig_info_gse70138=pd.read_table('../results/CTRP/sig_info_gse70138_viab.csv',
                                sep=',',header=0,index_col=[0])
sig_info_gse92742.index=sig_info_gse92742['sig_id']
sig_info_gse92742=sig_info_gse92742.loc[:,['pert_id','cell_id','log10_pert_dose','pert_itime',
                                           'log10_cpd_conc_umol','cpd_avg_pv']]
fil=np.abs(sig_info_gse92742['log10_pert_dose']-sig_info_gse92742['log10_cpd_conc_umol'])<0.2
# the filtered frame used to be assigned to the misspelled name
# fig_info_gse92742, which left sig_info_gse92742 unfiltered
sig_info_gse92742=sig_info_gse92742[fil]

sig_info_gse70138.index=sig_info_gse70138['sig_id']
sig_info_gse70138=sig_info_gse70138.loc[:,['pert_id','cell_id','log10_pert_dose','pert_itime',
                                           'log10_cpd_conc_umol','cpd_avg_pv']]
fil=np.abs(sig_info_gse70138['log10_pert_dose']-sig_info_gse70138['log10_cpd_conc_umol'])<0.2
sig_info_gse70138=sig_info_gse70138[fil]

In [191]:
# display the (rows, columns) dimensions of the current sig_info_gse70138 table
sig_info_gse70138.shape

(9826, 6)

In [188]:
# Re-read the GSE70138 signature/viability table from the results folder.
# NOTE(review): when the notebook is run top to bottom this replaces the
# sig_info_gse70138 frame that the In [190] cell indexed by sig_id and
# filtered by dose difference. The lower execution count (In [188]) suggests
# a leftover cell - confirm whether it should be removed.
sig_info_gse70138=pd.read_table('../results/CTRP/sig_info_gse70138_viab.csv',
                                sep=',',header=0,index_col=[0])