In [75]:
import pandas as pd
import numpy as np
from cmapPy.pandasGEXpress.parse import parse

## First, we extract cell viabilities from the CTRP data.

In [76]:
# Load the raw CTRP cell viability table (tab-separated, first row is the header)
ctrp_raw=pd.read_csv('../data/CTRP/v20.data.per_cpd_post_qc.txt',sep='\t')
# Load the CTRP metadata tables describing cell lines, compounds and experiments
cell_info=pd.read_csv('../data/CTRP/v20.meta.per_cell_line.txt',sep='\t')
compound_info=pd.read_csv('../data/CTRP/v20.meta.per_compound.txt',sep='\t')
experiment_info=pd.read_csv('../data/CTRP/v20.meta.per_experiment.txt',sep='\t')

In [77]:
# Using the metadata tables we preprocess the CTRP data: every measurement is
# annotated with its cell line name and Broad compound ID, which gives a
# format that can be matched with LINCS.
ctrp_proc=ctrp_raw[['experiment_id','master_cpd_id',
                    'cpd_conc_umol','cpd_avg_pv']].copy()
print('Preprocessing',end='')
# keep a single row per (experiment, cell line) pair ...
experiment_info=experiment_info.drop_duplicates(subset=['experiment_id',
                                                        'master_ccl_id'])
print('.',end='')
# ... and index it by experiment ID (the column itself is kept)
experiment_info=experiment_info.set_index('experiment_id',drop=False)
print('.',end='')
# experiment ID -> master cell line ID; the label lookup raises on unknown IDs
ctrp_proc['master_ccl_id']=(experiment_info['master_ccl_id']
                            .loc[ctrp_proc['experiment_id'].to_numpy()]
                            .to_numpy())
print('.',end='')
cell_info=cell_info.set_index('master_ccl_id',drop=False)
print('.',end='')
# master cell line ID -> cell line name
ctrp_proc['ccl_name']=(cell_info['ccl_name']
                       .loc[ctrp_proc['master_ccl_id'].to_numpy()]
                       .to_numpy())
print('.',end='')
compound_info=compound_info.set_index('master_cpd_id',drop=False)
print('.',end='')
# master compound ID -> Broad compound ID
ctrp_proc['broad_cpd_id']=(compound_info['broad_cpd_id']
                           .loc[ctrp_proc['master_cpd_id'].to_numpy()]
                           .to_numpy())
print('.')
# keep only the annotated columns needed downstream
ctrp_proc=ctrp_proc[['ccl_name','broad_cpd_id',
                     'cpd_conc_umol','cpd_avg_pv']]
print('Done!')

Preprocessing.......
Done!


Now we have a dataframe with cell lines, compound IDs, compound concentrations and cell viabilities as columns.

In [78]:
# Show the first five rows and the dimensions of the annotated table
print(ctrp_proc.iloc[:5])
print('Shape:',ctrp_proc.shape)

  ccl_name   broad_cpd_id  cpd_conc_umol  cpd_avg_pv
0     CAS1  BRD-K46556387        0.00030      0.9303
1     CAS1  BRD-K46556387        0.00061      0.8337
2     CAS1  BRD-K46556387        0.00120      1.0460
3     CAS1  BRD-K46556387        0.00240      1.0910
4     CAS1  BRD-K46556387        0.00490      1.0190
Shape: (6171005, 4)


Now we can read some metadata to select the intersection (same cell line and compound) between LINCS-L1000 and CTRP.

In [79]:
# Cell line metadata is the same for GSE92742 and GSE70138, so it is read once.
# NOTE(review): the GSE92742 cell info file is read from the GSE70138
# directory here -- confirm that this is where the file actually lives.
gse92742_cell=pd.read_csv('../data/LINCS/GSE70138/GSE92742_Broad_LINCS_cell_info.txt',
                          sep='\t')
lincs_cells=list(set(gse92742_cell['cell_id']))
# Perturbation metadata of the two LINCS datasets
gse92742_comp=pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_pert_info.txt',
                          sep='\t')
gse70138_comp=pd.read_csv('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_pert_info.txt',
                          sep='\t')
# union of the perturbation IDs present in either dataset
lincs_compounds=list(
    set(gse92742_comp['pert_id'])|set(gse70138_comp['pert_id'])
)

In [80]:
# Keep only the CTRP rows whose cell line AND compound are both present in
# LINCS.
# np.isin is used instead of np.in1d: np.in1d is deprecated as of NumPy 2.0,
# and for these 1-D inputs np.isin returns the same boolean mask.
fil=np.isin(ctrp_proc['ccl_name'],lincs_cells)&np.isin(ctrp_proc['broad_cpd_id'],lincs_compounds)
ctrp_proc=ctrp_proc[fil]
print('Shape:',ctrp_proc.shape)

Shape: (238005, 4)


For duplicated entries in CTRP (same cell line, same drug, same concentration) we calculate and keep the average cell viability (of course, testing floats such as concentrations for equality is somewhat fragile, but here we only want to remove exact duplicates).

In [81]:
# Collapse rows sharing the same cell line, compound and concentration into
# one row holding the mean of the remaining column(s); the group keys are
# turned back into ordinary columns afterwards.
ctrp_proc=(ctrp_proc
           .groupby(['ccl_name','broad_cpd_id','cpd_conc_umol'])
           .mean()
           .reset_index())
print('Shape:',ctrp_proc.shape)
# the row index is written as the first column of the CSV file
ctrp_proc.to_csv('../results/CTRP/cell_viability_ctrp.csv',sep=',')

Shape: (221306, 4)


## Now we can match CTRP and LINCS-L1000
We will match CTRP and LINCS-L1000 instances based on cell line, drug and concentration. As the concentrations in CTRP and LINCS-L1000 are not the same, we will match each LINCS-L1000 instance with the CTRP instance that has the closest concentration (and the same cell line and drug), as long as the absolute log10 concentration difference is smaller than 0.2 (~1.6-fold concentration difference, since $10^{0.2} \approx 1.58$).

In [88]:
# Reload the averaged CTRP viabilities; the first CSV column is used as the row index
ctrp=pd.read_csv('../results/CTRP/cell_viability_ctrp.csv',index_col=[0])
# add the log10-transformed concentration as an extra column
ctrp=ctrp.assign(log10_cpd_conc_umol=np.log10(ctrp['cpd_conc_umol']))
# distinct cell lines and compounds present in CTRP
cell_lines,compounds=(list(set(ctrp[column]))
                      for column in ('ccl_name','broad_cpd_id'))
# read LINCS signature metadata of both datasets
sig_info_gse92742=pd.read_csv('../data/LINCS/GSE92742/GSE92742_Broad_LINCS_sig_info.txt',
                              sep='\t',low_memory=False)
sig_info_gse70138=pd.read_csv('../data/LINCS/GSE70138/GSE70138_Broad_LINCS_sig_info.txt',
                              sep='\t',low_memory=False)

In [90]:
#filter for cell line and compound intersection with ctrp
fil=np.in1d(sig_info_gse92742['pert_id'],compounds) & np.in1d(sig_info_gse92742['cell_id'],cell_lines)
sig_info_gse92742=sig_info_gse92742[fil]    
fil=np.in1d(sig_info_gse70138['pert_id'],compounds) & np.in1d(sig_info_gse70138['cell_id'],cell_lines)    
sig_info_gse70138=sig_info_gse70138[fil]