**Author:** Benoît BAILLIF

**Purpose:**
- Merge GSE92742 and GSE70138 perturbagen and signature info  
- Select only compounds (not genetic perturbagens)  
- Process data (canonicalize SMILES with RDKit, harmonize doses to µM and times to hours)

**Input:**
- data/raw/
 - GSE92742_Broad_LINCS_pert_info.txt
 - GSE92742_Broad_LINCS_sig_info.txt
 - GSE70138_Broad_LINCS_pert_info.txt
 - GSE70138_Broad_LINCS_sig_info_2017-03-06.txt
 
**Output:** 
- data/processed/
 - cmp_info_cmap.csv
 - sig_info_cmap.csv

In [1]:
import os
import numpy  as np
import pandas as pd
from rdkit import Chem

# Functions

In [2]:
def dose_to_micromolar(dose_string) :
    """Convert a dose (string) to micromolar if needed

    Args:
        dose_string (str) : dose expressed as a string, generally '<value> <unit>'
            (e.g. '500 nM', '10 µM', '10.0 um')

    Returns:
        float : the dose value, divided by 1000 when the unit is nanomolar
            (matched case-insensitively, so 'nM' and 'nm' are both handled).
            The value is returned unchanged for any other unit or when no unit
            is given. NaN is returned when the dose is None, empty or the
            '-666' missing-value sentinel.
    """
    if dose_string is None :
        return float('nan')

    value_unit = str(dose_string).split()

    # '-666' is the sentinel used in the LINCS tables for a missing value
    if not value_unit or value_unit[0] == '-666' :
        return float('nan')

    value = float(value_unit[0])
    # A dose without unit used to raise IndexError ; treat the unit as absent instead
    unit = value_unit[1] if len(value_unit) > 1 else ''

    # Units are not written with a consistent case across datasets ('nM' vs 'um')
    if unit.lower() == 'nm' :
        value = value / 1000

    return value

# Input

In [3]:
# Directory holding the raw GEO text files. The trailing slash matters :
# the input paths below are built by plain string concatenation.
raw_data_directory = 'data/raw/'

In [4]:
# Perturbagen description table of GSE92742
GSE92742_pert_info_file_name = 'GSE92742_Broad_LINCS_pert_info.txt'
GSE92742_pert_info_path = os.path.join(raw_data_directory,
                                       GSE92742_pert_info_file_name)

In [5]:
# Signature description table of GSE92742
GSE92742_sig_info_file_name = 'GSE92742_Broad_LINCS_sig_info.txt'
GSE92742_sig_info_path = os.path.join(raw_data_directory,
                                      GSE92742_sig_info_file_name)

In [6]:
# Perturbagen description table of GSE70138
GSE70138_pert_info_file_name = 'GSE70138_Broad_LINCS_pert_info.txt'
GSE70138_pert_info_path = os.path.join(raw_data_directory,
                                       GSE70138_pert_info_file_name)

In [7]:
# Signature description table of GSE70138
GSE70138_sig_info_file_name = 'GSE70138_Broad_LINCS_sig_info_2017-03-06.txt'
GSE70138_sig_info_path = os.path.join(raw_data_directory,
                                      GSE70138_sig_info_file_name)

# Output

In [8]:
# Directory receiving the processed tables. The trailing slash matters :
# the output paths below are built by plain string concatenation.
processed_data_directory = 'data/processed/'
# makedirs also creates a missing parent directory ('data/'), which os.mkdir
# cannot do, and exist_ok=True replaces the separate existence check.
os.makedirs(processed_data_directory, exist_ok=True)

In [9]:
# Destination of the curated compound table
cmp_info_cmap_path = os.path.join(processed_data_directory, 'cmp_info_cmap.csv')

In [10]:
# Destination of the curated compound signature table
sig_info_cmap_path = os.path.join(processed_data_directory, 'sig_info_cmap.csv')

# Load GSE92742 perturbagen info

In [11]:
# Tab-separated table ; dtype='str' keeps codes such as '-666' as text
pert_info_GSE92742 = pd.read_csv(
    GSE92742_pert_info_path,
    sep='\t',
    dtype='str',
)

In [12]:
# Report the number of rows of the GSE92742 perturbagen table, then preview it
n_pert_GSE92742 = len(pert_info_GSE92742)
print('Number of perturbagens in GSE92742 : ', n_pert_GSE92742,
      ' (controls included)', sep='')

pert_info_GSE92742.head()

Number of perturbagens in GSE92742 : 51383 (controls included)


Unnamed: 0,pert_id,pert_iname,pert_type,is_touchstone,inchi_key_prefix,inchi_key,canonical_smiles,pubchem_cid
0,56582,AKT2,trt_oe,0,-666,-666,-666,-666
1,5981,HSF1,trt_oe,0,-666,-666,-666,-666
2,7150,NFE2L2,trt_oe,0,-666,-666,-666,-666
3,ABL1_G2A,ABL1,trt_oe.mut,0,-666,-666,-666,-666
4,ABL1_T315I,ABL1,trt_oe.mut,0,-666,-666,-666,-666


In [13]:
# inchi_key_prefix is removed because it is already contained in inchi_key
pert_info_GSE92742 = pert_info_GSE92742.drop(columns=['inchi_key_prefix'])

# Load GSE70138 Perturbagen info

In [14]:
# Tab-separated table ; dtype='str' loads every value as text
pert_info_GSE70138 = pd.read_csv(
    GSE70138_pert_info_path,
    sep='\t',
    dtype='str',
)

In [15]:
# Report the number of rows of the GSE70138 perturbagen table, then preview it
n_pert_GSE70138 = len(pert_info_GSE70138)
print('Number of perturbagens in GSE70138 : ', n_pert_GSE70138,
      ' (controls excluded)', sep='')

pert_info_GSE70138.head()

Number of perturbagens in GSE70138 : 2170 (controls excluded)


Unnamed: 0,pert_id,canonical_smiles,inchi_key,pert_iname,pert_type
0,BRD-K70792160,CCN(CC)CCCCN1c2ccccc2Oc2ccc(Cl)cc12,GYBXAGDWMCJZJK-UHFFFAOYSA-N,10-DEBC,trt_cp
1,BRD-K68552125,CCCCCCCCCCCCCC(=O)O[C@@H]1[C@@H](C)[C@]2(O)[C@...,PHEDXBVPIONUQT-RGYGYFBISA-N,phorbol-myristate-acetate,trt_cp
2,BRD-K92301463,CCCCC(C)(C)[C@H](O)\C=C\[C@H]1[C@H](O)CC(=O)[C...,QAOBBBBDJSWHMU-WMBBNPMCSA-N,"16,16-dimethylprostaglandin-e2",trt_cp
3,BRD-A29731977,CCCCCC(=O)O[C@@]1(CCC2C3CCC4=CC(=O)CC[C@]4(C)C...,DOMWKUIIPQCAJU-JKPPDDDBSA-N,17-hydroxyprogesterone-caproate,trt_cp
4,BRD-K07954936,OC(=O)CCCC[C@@H]1SC[C@@H]2NC(=N)N[C@H]12,WWVANQJRLPIHNS-ZKWXMUAHSA-N,2-iminobiotin,trt_cp


# Compile the 2 GEO perturbagen info

In [16]:
# Stack both perturbagen tables, GSE92742 first. The two tables do not have the
# same columns (GSE70138 has no is_touchstone / pubchem_cid), so sort=False is
# passed explicitly, as for the signature tables : the columns keep the order of
# the first table whatever the pandas version, and missing cells become NaN.
pert_info_cmap = pd.concat([pert_info_GSE92742, pert_info_GSE70138], sort=False)

In [17]:
# Drop duplicated perturbagens, identified by pert_id. Only the first occurrence
# is kept, i.e. the GSE92742 one, which carries the pubchem_cid information.
pert_info_cmap = pert_info_cmap.drop_duplicates(subset=['pert_id'], keep='first')

In [18]:
# Number of distinct perturbagens left after the merge
n_pert_cmap = len(pert_info_cmap)
print('Total number of perturbagens in compiled GEO datasets : ', n_pert_cmap, sep='')

Total number of perturbagens in compiled GEO datasets : 52641


In [19]:
# Preview the first rows of the merged perturbagen table
pert_info_cmap.head()

Unnamed: 0,pert_id,pert_iname,pert_type,is_touchstone,inchi_key,canonical_smiles,pubchem_cid
0,56582,AKT2,trt_oe,0,-666,-666,-666
1,5981,HSF1,trt_oe,0,-666,-666,-666
2,7150,NFE2L2,trt_oe,0,-666,-666,-666
3,ABL1_G2A,ABL1,trt_oe.mut,0,-666,-666,-666
4,ABL1_T315I,ABL1,trt_oe.mut,0,-666,-666,-666


In [20]:
# Number of perturbagens per type ; missing types would be counted as NaN
# More info on : https://clue.io/connectopedia/perturbagen_types_and_controls
pert_info_cmap['pert_type'].value_counts(dropna=False)

trt_cp             21299
trt_sh             18493
trt_sh.cgs          4345
trt_sh.css          3807
trt_oe              3492
trt_lig              622
trt_xpr              353
trt_oe.mut           135
ctl_vector            80
ctl_vector.cns         8
ctl_vehicle            3
ctl_vehicle.cns        2
ctl_untrt              1
ctl_untrt.cns          1
Name: pert_type, dtype: int64

In [21]:
# Number of perturbagens per touchstone flag, NaN being counted as its own value
pert_info_cmap['is_touchstone'].value_counts(dropna=False)
# More info on : https://clue.io/connectopedia/the_touchstone_dataset
# NaN values correspond to GSE70138 perturbagens, whose table has no is_touchstone column

0      29806
1      21577
NaN     1258
Name: is_touchstone, dtype: int64

# Select compounds only

In [22]:
# Compounds are selected through pert_type == 'trt_cp'. Note that the DMSO vehicle
# belongs to the ctl_vehicle category and is therefore not part of this selection.
cmp_info_cmap = pert_info_cmap.loc[pert_info_cmap['pert_type'].eq('trt_cp')].copy()

In [23]:
# Number of compounds, and the same count once the DMSO vehicle is added
n_cmp_cmap = len(cmp_info_cmap)
print('Number of compounds in CMap : ', n_cmp_cmap, sep='')
print('+ 1 (DMSO vehicule) = ', n_cmp_cmap + 1, sep='')

Number of compounds in CMap : 21299
+ 1 (DMSO vehicule) = 21300


In [24]:
# Every remaining row has pert_type == 'trt_cp', so the column carries no information
cmp_info_cmap = cmp_info_cmap.drop(columns=['pert_type'])

In [25]:
# '-666' (missing-value code) and 'restricted' are placeholders, not structures
incorrect_smiles = cmp_info_cmap['canonical_smiles'].isin(['-666', 'restricted'])
# .copy() makes the filtered table independent from the one it was sliced from,
# so that the column assignments done later do not raise SettingWithCopyWarning
cmp_info_cmap = cmp_info_cmap[~incorrect_smiles].copy()

In [26]:
# Make RDKit canonical SMILES
def _canonize_smiles(smiles) :
    """Return the RDKit canonical SMILES of a SMILES string

    Args:
        smiles (str) : SMILES to canonize

    Returns:
        str or None : canonical SMILES, None if the input is not a string or
            cannot be parsed by RDKit
    """
    if not isinstance(smiles, str) :
        return None
    mol = Chem.MolFromSmiles(smiles)  # RDKit returns None on a parsing failure
    if mol is None :
        return None
    return Chem.MolToSmiles(mol)

cmp_info_cmap['canonical_smiles'] = cmp_info_cmap['canonical_smiles'].apply(_canonize_smiles)
# Compounds that RDKit cannot parse are discarded like the placeholder SMILES,
# instead of stopping the notebook on MolToSmiles(None)
cmp_info_cmap = cmp_info_cmap[cmp_info_cmap['canonical_smiles'].notna()].copy()

In [27]:
# Preview the compound table after SMILES canonization
cmp_info_cmap.head()

Unnamed: 0,pert_id,pert_iname,is_touchstone,inchi_key,canonical_smiles,pubchem_cid
98,BRD-A00100033,nifurtimox,1,ARFHIAQFJWUCFH-UHFFFAOYSA-N,CC1CS(=O)(=O)CCN1N=Cc1ccc([N+](=O)[O-])o1,6842999
99,BRD-A00150179,5-hydroxytryptophan,0,QSHLMQDRPXXYEE-UHFFFAOYSA-N,NC(Cc1c[nH]c2cccc(O)c12)C(=O)O,589768
100,BRD-A00267231,hemado,1,KOCIMZNSNPOGOP-UHFFFAOYSA-N,CCCCC#Cc1nc(NC)c2ncn(C3OC(CO)C(O)C3O)c2n1,4043357
101,BRD-A00420644,SA-3676,0,ASCBUEVCEVGOFP-UHFFFAOYSA-N,CCN1c2ccccc2NC2N=C(OC)C(c3ccccc3)C21,2853908
102,BRD-A00474148,BRD-A00474148,0,RCGAUPRLRFZAMS-UHFFFAOYSA-N,O=C1Cc2cc([S+](=O)([O-])N3CCN(c4ccc(O)cc4)CC3)...,44825297


In [28]:
# Compounds kept after structure curation, and how many were discarded
n_correct_cmp_cmap = len(cmp_info_cmap)
print('Number of compounds in CMap with correct structure: ', n_correct_cmp_cmap, sep='')
print('So we have ', n_cmp_cmap - n_correct_cmp_cmap, ' incorrect compounds', sep='')

Number of compounds in CMap with correct structure: 21220
So we have 79 incorrect compounds


In [30]:
# pert_id of every compound kept above, used further down to filter the signatures.
# Despite the _list suffix, .values yields a NumPy array.
cmap_correct_cmp_list = cmp_info_cmap['pert_id'].values

# Load GSE92742 signature info

In [31]:
# Tab-separated table ; dtype='str' loads every value as text
sig_info_GSE92742 = pd.read_csv(
    GSE92742_sig_info_path,
    sep='\t',
    dtype='str',
)

In [32]:
# Report the number of rows of the GSE92742 signature table, then preview it
n_sig_GSE92742 = len(sig_info_GSE92742)
print('Number of signatures in GSE92742 : ', n_sig_GSE92742,
      ' (controls included)', sep='')

sig_info_GSE92742.head()

Number of signatures in GSE92742 : 473647 (controls included)


Unnamed: 0,sig_id,pert_id,pert_iname,pert_type,cell_id,pert_dose,pert_dose_unit,pert_idose,pert_time,pert_time_unit,pert_itime,distil_id
0,AML001_CD34_24H:A05,DMSO,DMSO,ctl_vehicle,CD34,0.1,%,0.1 %,24,h,24 h,AML001_CD34_24H_X1_F1B10:A05
1,AML001_CD34_24H:A06,DMSO,DMSO,ctl_vehicle,CD34,0.1,%,0.1 %,24,h,24 h,AML001_CD34_24H_X3_F1B10:A06
2,AML001_CD34_24H:B05,DMSO,DMSO,ctl_vehicle,CD34,0.1,%,0.1 %,24,h,24 h,AML001_CD34_24H_X1_F1B10:B05|AML001_CD34_24H_X...
3,AML001_CD34_24H:B06,DMSO,DMSO,ctl_vehicle,CD34,0.1,%,0.1 %,24,h,24 h,AML001_CD34_24H_X3_F1B10:B06
4,AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,BRD-A03772856,trt_cp,CD34,0.37037,µM,500 nM,24,h,24 h,AML001_CD34_24H_X1_F1B10:J04|AML001_CD34_24H_X...


In [33]:
# Add a GEO column storing the dataset each signature comes from
sig_info_GSE92742['GEO'] = 'GSE92742'

# Load GSE70138 signature info

In [34]:
# Tab-separated table ; dtype='str' keeps codes such as '-666' as text
sig_info_GSE70138 = pd.read_csv(
    GSE70138_sig_info_path,
    sep='\t',
    dtype='str',
)

In [35]:
# Report the number of rows of the GSE70138 signature table, then preview it.
# The table does contain control signatures (DMSO rows typed ctl_vehicle are
# visible in the preview), so the message states 'controls included'.
n_sig_GSE70138 = sig_info_GSE70138.shape[0]
print('Number of signatures in GSE70138 : ' + str(n_sig_GSE70138) + 
      ' (controls included)')

sig_info_GSE70138.head()

Number of signatures in GSE70138 : 118050 (controls excluded)


Unnamed: 0,sig_id,pert_id,pert_iname,pert_type,cell_id,pert_idose,pert_itime,distil_id
0,LJP005_A375_24H:A03,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A03|LJP005_A375_24H_X2_...
1,LJP005_A375_24H:A04,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A04|LJP005_A375_24H_X2_...
2,LJP005_A375_24H:A05,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A05|LJP005_A375_24H_X2_...
3,LJP005_A375_24H:A06,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A06|LJP005_A375_24H_X2_...
4,LJP005_A375_24H:A07,BRD-K76908866,CP-724714,trt_cp,A375,10.0 um,24 h,LJP005_A375_24H_X1_B19:A07|LJP005_A375_24H_X2_...


In [36]:
# Add a GEO column storing the dataset each signature comes from
sig_info_GSE70138['GEO'] = 'GSE70138'

# Merge the 2 GEO signature info

In [37]:
# Stack both signature tables, GSE92742 first ; sort=False leaves the columns
# unsorted, and columns absent from one table are filled with NaN
sig_info_cmap = pd.concat(
    [sig_info_GSE92742, sig_info_GSE70138],
    sort=False,
)

In [38]:
# Number of signatures once both datasets are stacked
n_sig_cmap = len(sig_info_cmap)
print('Number of signatures in both GEO : ', n_sig_cmap, sep='')

Number of signatures in both GEO : 591697


# Select signatures from compounds only

In [39]:
# Keep the signatures of compound treatments, plus those of the DMSO vehicle
is_compound = sig_info_cmap['pert_type'].eq('trt_cp')
is_dmso = sig_info_cmap['pert_id'].eq('DMSO')
sig_info_cmap_cmp = sig_info_cmap.loc[is_compound | is_dmso]

In [40]:
# Number of signatures selected as compound or DMSO
n_sig_cmp = len(sig_info_cmap_cmp)
print('Total number of compound (+ DMSO vehicule) signatures in cmap : ', n_sig_cmp, sep='')

Total number of compound (+ DMSO vehicule) signatures in cmap : 333273


In [41]:
# Keep only the signatures of compounds having a correct structure.
# The filter is applied to the compound/DMSO selection made above rather than to
# the full signature table, which would silently discard that selection and could
# bring back signatures of another pert_type sharing a pert_id with a compound.
# .copy() makes the result independent from the table it is sliced from.
has_correct_structure = sig_info_cmap_cmp['pert_id'].isin(cmap_correct_cmp_list)
sig_info_cmap_cmp = sig_info_cmap_cmp[has_correct_structure].copy()

In [42]:
# Remove redundant columns : dose and time are already combined (value + unit) in
# pert_idose and pert_itime ; pert_type is dropped because all rows are trt_cp
columns_to_drop = [
    'pert_dose',
    'pert_dose_unit',
    'pert_time',
    'pert_time_unit',
    'pert_type',
]
sig_info_cmap_cmp = sig_info_cmap_cmp.drop(columns=columns_to_drop)

In [44]:
# Curate dose and time : pert_idose becomes a float expressed in µM and pert_itime
# an integer number of hours (first token of strings such as '24 h') ; both
# columns are then renamed so that their unit is explicit
sig_info_cmap_cmp = sig_info_cmap_cmp.assign(
    pert_idose=sig_info_cmap_cmp['pert_idose'].apply(dose_to_micromolar),
    pert_itime=sig_info_cmap_cmp['pert_itime'].apply(lambda itime : int(itime.split()[0])),
).rename(columns={'pert_idose' : 'pert_idose (µM)',
                  'pert_itime' : 'pert_itime (h)'})

In [45]:
# Preview the curated compound signature table
sig_info_cmap_cmp.head()

Unnamed: 0,sig_id,pert_id,pert_iname,cell_id,pert_idose (µM),pert_itime (h),distil_id,GEO
4,AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,BRD-A03772856,CD34,0.5,24,AML001_CD34_24H_X1_F1B10:J04|AML001_CD34_24H_X...,GSE92742
5,AML001_CD34_24H:BRD-A03772856:1.11111,BRD-A03772856,BRD-A03772856,CD34,1.0,24,AML001_CD34_24H_X1_F1B10:J03|AML001_CD34_24H_X...,GSE92742
6,AML001_CD34_24H:BRD-A03772856:10,BRD-A03772856,BRD-A03772856,CD34,10.0,24,AML001_CD34_24H_X1_F1B10:I03|AML001_CD34_24H_X...,GSE92742
7,AML001_CD34_24H:BRD-A03772856:3.33333,BRD-A03772856,BRD-A03772856,CD34,3.0,24,AML001_CD34_24H_X1_F1B10:I04|AML001_CD34_24H_X...,GSE92742
8,AML001_CD34_24H:BRD-A19037878:1.11111,BRD-A19037878,trichostatin-a,CD34,1.0,24,AML001_CD34_24H_X1_F1B10:F05|AML001_CD34_24H_X...,GSE92742


In [46]:
# Number of signatures left for compounds with a correct structure
n_sig_cmap_cmp = len(sig_info_cmap_cmp)
print('Total number of correct compound signatures in cmap : ', n_sig_cmap_cmp, sep='')

Total number of correct compound signatures in cmap : 310114


In [47]:
# Save the curated compound signature table ; the DataFrame index is not written
sig_info_cmap_cmp.to_csv(sig_info_cmap_path, index=False)

In [64]:
# The 10 cell lines with the most compound signatures
sig_info_cmap_cmp['cell_id'].value_counts().head(10)

MCF7      41608
PC3       34724
VCAP      34674
A375      27352
HT29      26236
A549      23483
HA1E      20982
YAPC      10061
HELA      10057
HCC515    10024
Name: cell_id, dtype: int64

In [49]:
# Number of signatures per dose (µM)
sig_info_cmap_cmp['pert_idose (µM)'].value_counts()

10.00    139056
5.00      46741
0.04      17578
0.37      17546
1.11      17527
          ...  
4.00          6
4.50          6
8.00          6
2.50          6
1.70          6
Name: pert_idose (µM), Length: 95, dtype: int64

In [50]:
# Number of signatures per treatment time (h), NaN being counted as its own value
sig_info_cmap_cmp['pert_itime (h)'].value_counts(dropna=False)

24    209613
6      95577
3       4866
48        58
Name: pert_itime (h), dtype: int64

# Cell line choice for downstream analysis

In [77]:
# We first chose the 9 core cell lines presented in the original L1000 paper
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5990023/
core_cell_lines = ['MCF7', 'PC3', 'A549', 'A375', 'HEPG2', 'VCAP', 'HCC515', 'HT29', 'HA1E']

In [78]:
# Keep the signatures measured at 10 µM and 24 h in one of the core cell lines
concentration_is_10um = sig_info_cmap_cmp['pert_idose (µM)'].eq(10)
time_is_24h = sig_info_cmap_cmp['pert_itime (h)'].eq(24)
cell_line_is_core = sig_info_cmap_cmp['cell_id'].isin(core_cell_lines)
sig_info_10uM_24h = sig_info_cmap_cmp.loc[concentration_is_10um
                                          & time_is_24h
                                          & cell_line_is_core]

In [80]:
# Number of 10 µM / 24 h signatures per core cell line
sig_info_10uM_24h['cell_id'].value_counts()

PC3       10314
MCF7      10195
VCAP       7554
A549       6069
HA1E       4400
A375       3962
HT29       3605
HCC515     2098
HEPG2       544
Name: cell_id, dtype: int64

In [82]:
# HepG2 has few 10 µM / 24 h signatures, so it is left out of the downstream analysis
sig_info_10uM_24h = sig_info_10uM_24h.loc[sig_info_10uM_24h['cell_id'].ne('HEPG2')]

In [92]:
# Distinct compounds having at least one signature in the selection
used_compounds = sig_info_10uM_24h['pert_id'].unique()
n_used_compounds = len(used_compounds)
print('There are ', n_used_compounds,
      ' compounds profiled at 10 µM and 24 h in the 8 selected cell lines', sep='')

There are 9035 compounds profiled at 10 µM and 24 h in the 8 selected cell lines


In [93]:
# Flag (1 / 0) the compounds that are part of the 10 µM / 24 h selection
is_used_compound = cmp_info_cmap['pert_id'].isin(used_compounds)
cmp_info_cmap['used_compound'] = is_used_compound.astype(int)

In [96]:
# Save the curated compound table, including the used_compound flag ;
# the DataFrame index is not written
cmp_info_cmap.to_csv(cmp_info_cmap_path, index=False)

In [86]:
# One row per (compound, cell line) pair : the first signature met is the one kept
unique_sig_info = sig_info_10uM_24h.drop_duplicates(
    subset=['pert_id', 'cell_id'],
    keep='first',
)

In [88]:
# Number of distinct compounds profiled per cell line
unique_sig_info['cell_id'].value_counts()

PC3       8071
MCF7      7546
VCAP      6365
A549      5267
HA1E      3646
A375      3525
HT29      3192
HCC515    1932
Name: cell_id, dtype: int64