- PDB has metadata for drugs. Use for clustering. Error analysis (which drugs are less predictive).

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utils
# Make the project-local helper modules under ./utils_py importable.
# The base directory is the current working directory (the __file__-based variant is
# commented out), so the notebook has to be started from the folder that contains utils_py.
file_path = os.getcwd()  # os.path.dirname(os.path.relpath(__file__))
utils_path = os.path.abspath(os.path.join(file_path, 'utils_py'))
sys.path.append(utils_path)
import utils_all as utils_py
import utils

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Seed constant; it is not referenced in the cells visible in this part of the notebook.
SEED=0

Using TensorFlow backend.


# Utils

In [2]:
# Root directory of the Pilot1 benchmark data.
# The PILOT1_DATADIR environment variable, when set, overrides the original
# machine-specific location so the notebook can run on other machines unchanged.
DATADIR = os.environ.get('PILOT1_DATADIR', '/Users/apartin/work/jdacs/Benchmarks/Data/Pilot1')

# Single-drug dose-response table
fname_single_drug_growth = 'rescaled_combined_single_drug_growth'
# fname_drug_growth = 'combined_single_drug_growth'

# Drug-combination dose-response table
fname_combo_drug_growth = 'ComboDrugGrowth_Nov2017.csv'

# NCI60 cell-name mapping file
fname_nci60_cellname = 'NCI60_CELLNAME_to_Combo.txt'

# Built from fname_nci60_cellname so the file name is spelled in one place only
cellmap_path = os.path.join(DATADIR, fname_nci60_cellname)

# Expression table to use (the full RNA-seq table is the commented-out alternative)
# expression_data = 'combined_rnaseq_data'
expression_data = 'combined_rnaseq_data_lincs1000'

In [3]:
# Output directory of this notebook, relative to the current working directory.
# It is created if missing; exist_ok=True makes re-running the cell harmless.
OUTDIR = './save_pharmaco_vs_us'
os.makedirs(OUTDIR, exist_ok=True)

# Load dose response - single drug (df1)

In [4]:
# Read the tab-separated single-drug dose-response table (all sources).
# 'na', '-' and the empty string are parsed as missing values.
df_resp_org = pd.read_csv(
    os.path.join(DATADIR, fname_single_drug_growth),
    sep='\t',
    engine='c',
    na_values=['na', '-', ''],
    dtype={
        'SOURCE': str,
        'DRUG_ID': str,
        'CELLNAME': str,
        'CONCUNIT': str,
        'LOG_CONCENTRATION': np.float32,
        'EXPID': str,
        'GROWTH': np.float32,
    },
)
print(f'Shape {df_resp_org.shape}\n')
print(df_resp_org.SOURCE.value_counts())
display(df_resp_org[:2])

Shape (27769716, 7)

NCI60    18862308
CTRP      6171005
GDSC      1894212
SCLC       389510
SCL        301336
CCLE        93251
gCSI        58094
Name: SOURCE, dtype: int64


Unnamed: 0,SOURCE,DRUG_ID,CELLNAME,CONCUNIT,LOG_CONCENTRATION,EXPID,GROWTH
0,CCLE,CCLE.1,CCLE.1321N1,M,-8.60206,fake_exp,117.339996
1,CCLE,CCLE.1,CCLE.1321N1,M,-8.09691,fake_exp,122.0


### Pre-proc response

In [5]:
# Keep only the response rows whose SOURCE is CCLE
df_resp = df_resp_org.loc[df_resp_org['SOURCE'].eq('CCLE')]
# df_resp = df_resp_org[df_resp_org['SOURCE']=='gCSI']
print(f'Shape {df_resp.shape}\n')
print(df_resp.nunique())

Shape (93251, 7)

SOURCE                  1
DRUG_ID                24
CELLNAME              504
CONCUNIT                1
LOG_CONCENTRATION       8
EXPID                   2
GROWTH               2035
dtype: int64


In [6]:
# Shorten the column names and keep only the columns used downstream, in a fixed order.
# LOGCONC keeps its sign (no DOSE column is derived) and GROWTH is not rescaled here.
df_resp = (
    df_resp
    .rename(columns={
        'CELLNAME': 'CELL',
        'DRUG_ID': 'DRUG',
        'EXPID': 'STUDY',
        'LOG_CONCENTRATION': 'LOGCONC',
    })
    .loc[:, ['SOURCE', 'CELL', 'DRUG', 'LOGCONC', 'GROWTH', 'STUDY']]
)

### Cell lines vs drugs

In [7]:
# Cell-by-drug table: each value is the number of response rows
# (i.e. drug concentrations applied) for that [cell, drug] pair
cd1 = (
    pd.crosstab(df_resp['CELL'], df_resp['DRUG'])
    .rename_axis(None, axis='columns')  # remove the 'DRUG' label from the column axis
    .reset_index()                      # turn the CELL index into a regular column
)
print('cd1.shape', cd1.shape)
cd1[:3]

cd1.shape (504, 25)


Unnamed: 0,CELL,CCLE.1,CCLE.10,CCLE.11,CCLE.12,CCLE.13,CCLE.14,CCLE.15,CCLE.16,CCLE.17,...,CCLE.22,CCLE.23,CCLE.24,CCLE.3,CCLE.4,CCLE.5,CCLE.6,CCLE.7,CCLE.8,CCLE.9
0,CCLE.1321N1,8,8,8,8,0,8,8,8,0,...,8,8,8,8,8,8,8,8,8,8
1,CCLE.22RV1,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
2,CCLE.42MGBA,8,8,8,8,0,8,8,8,0,...,8,8,8,8,8,8,8,8,8,8


In [8]:
# df.groupby(['CELLNAME']).agg({'DRUG_ID': ['unique', 'nunique']})

In [9]:
# Data from PharmacoDB
# The file is read relative to the working directory; the row index is named 'CELL'
# and then moved into a regular column so that cd2 has the same layout as cd1.
cd2 = pd.read_csv('ccle_sensnum', sep='\t').rename_axis('CELL').reset_index()
print('cd2.shape', cd2.shape)
# cd2.iloc[:3, :10]
cd2[:3]

cd2.shape (504, 25)


Unnamed: 0,CELL,17-AAG,AEW541,AZD0530,AZD6244,Erlotinib,Irinotecan,L-685458,lapatinib,LBW242,...,PD-0332991,Crizotinib,PHA-665752,PLX4720,RAF265,Sorafenib,TAE684,TKI258,Topotecan,Vandetanib
0,22RV1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,5637,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,639-V,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Check consistency of cell line names between PharmacoDB and our data

In [10]:
# Characters removed from a name before comparison.
# Raw string: '\|' is not a valid escape sequence in a normal string literal
# (DeprecationWarning, SyntaxWarning on newer Python); the value is unchanged.
rmv = r' .-_*/\|{}[]()#$%@!~'


def _normalize_name(name, drop=rmv):
    """Return `name` upper-cased, with every character listed in `drop` removed."""
    return ''.join(c.upper() for c in name if c not in drop)


# Our names: normalize, then remove the 'CCLE' source tag left over from the 'CCLE.' prefix
n1 = [_normalize_name(s).replace('CCLE', '') for s in cd1['CELL']]

# PharmacoDB names: normalize only
n2 = [_normalize_name(s) for s in cd2['CELL']]

In [11]:
# Number of shared normalized names, then the names found only on our side
# and only on the PharmacoDB side
names_ours, names_pharmaco = set(n1), set(n2)
print(len(names_ours & names_pharmaco))
print(list(names_ours - names_pharmaco))
print(list(names_pharmaco - names_ours))

492
['SNU16', 'JURKAT', 'KO52', 'LU99', '786O', 'KPNSI9S', 'SNU1', 'NIHOVCAR3', 'KNS81', 'MDAMB435S', 'HEC1B', 'COLO320']
['OVCAR3', 'HEC1', 'NCISNU1', 'LU99A', 'JURKAT,CLONEE61', 'KPNS19S', 'MDAMB435', 'NCISNU16', 'COLO320HSR', 'K052', 'KNS81FD', '7860']


### TODO: Extract selected drugs

In [12]:
# (ap) extract selected drugs for CCLE
# df_selected_drugs[df_selected_drugs['Drug'].map(lambda s: True if 'CCLE' in s.split('.')[0] else False)]

# Load expression data

In [13]:
# Load the expression table in two passes: the header is read first so that
# every column after the first one can be parsed as float32
path = os.path.join(DATADIR, expression_data)
df_cols = pd.read_table(path, engine='c', nrows=0)
dtype_dict = {col: np.float32 for col in df_cols.columns[1:]}
df_exp = pd.read_table(path, engine='c', dtype=dtype_dict)
print(df_exp.shape)

(15196, 943)


In [14]:
# Keep the samples whose identifier contains the literal text 'CCLE'
df_exp = df_exp[df_exp['Sample'].str.contains('CCLE', regex=False)].reset_index(drop=True)
print(f'Shape {df_exp.shape}\n')

Shape (1018, 943)



### Pre-proc expression

In [15]:
# Embedding data source into features by onehot
# THE IDEA IS TO USE THIS AS REGULARIZER FOR BATCH EFFECT!!(??)

# prefixes = df_exp['Sample'].str.extract('^([^.]*)', expand=False).rename('Source')
# sources = prefixes.drop_duplicates().reset_index(drop=True)
# df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.')
# df_source = pd.concat([sources, df_source], axis=1)
# df_source

In [16]:
# Keep the 'Sample' identifier column on its own so it can be re-attached
# after the gene features have been processed.
# The merge with the one-hot encoded data source is disabled (commented-out lines below).
df1 = df_exp['Sample']
# df_sample_source = pd.concat([df1, prefixes], axis=1)
# df1 = df_sample_source.merge(df_source, on='Source', how='left').drop('Source', axis=1)
# print(f'Embedding RNAseq data source into features: {df1.shape[1]-1} additional columns')

In [17]:
# Gene features only: drop the sample identifier and prefix every gene column with 'rnaseq.'.
# The axis is passed by keyword because the positional form drop('Sample', 1)
# is no longer accepted by pandas 2.x.
df2 = df_exp.drop('Sample', axis=1).add_prefix('rnaseq.')

In [18]:
# Run the gene features through the project helper utils.impute_and_scale.
# NOTE(review): presumably 'std' selects standard scaling and 'mean' mean imputation —
# confirm against the helper, which is not visible here.
scaling = 'std'
imputing = 'mean'
df2 = utils.impute_and_scale(df2, scaling, imputing)

In [19]:
# Re-attach the sample identifiers to the processed gene features
# (column-wise concat; rows are aligned on the index)
df_exp = pd.concat([df1, df2], axis=1)

In [20]:
# Report the shape of the re-assembled expression table
print('Loaded combined RNAseq data: ', df_exp.shape)

Loaded combined RNAseq data:  (1018, 943)


### Extract cell lines with response data

In [21]:
# One row per cell line that has response data; the column is named 'Sample'
# so it can be joined with the expression table
sample = (
    df_resp[['CELL']]
    .rename(columns={'CELL': 'Sample'})
    .drop_duplicates()
    .sort_values('Sample')
    .reset_index(drop=True)
)
sample.shape

(504, 1)

In [22]:
# Default (inner) join: only cell lines present in both the response and the
# expression data are kept
df_exp_with_resp = pd.merge(sample, df_exp, on='Sample')
print(df_exp_with_resp.shape)
print(f'{df_exp_with_resp.shape[0]} CCLE samples with expression and response data')

(474, 943)
474 CCLE samples with expression and response data


Only 474 of the 504 CCLE cell lines with response data also have expression data (FF got the same result)

# Load drug data

In [23]:
# load_drug_info()
# Read the drug metadata with every column as object dtype, prefix the PubChem id
# with 'PubChem.CID.', and expose that value a second time under the 'Drug' column
path = os.path.join(DATADIR, 'drug_info')
df = pd.read_table(path, dtype=object)
df['PUBCHEM'] = 'PubChem.CID.' + df['PUBCHEM']
df_info = df.assign(Drug=df['PUBCHEM'])
print(df_info.shape)
df_info[:2]

(846, 7)


Unnamed: 0,ID,NAME,CLEAN_NAME,SMILES,INCHIKEY,PUBCHEM,Drug
0,CCLE.1,AEW541,AEW541,C1CN(C1)CC2CC(C2)N3C=C(C4=C3N=CN=C4N)C5=CC(=CC...,AECDBHGVIIRMOI-UHFFFAOYSA-N,PubChem.CID.11476171,PubChem.CID.11476171
1,CCLE.10,ZD-6474,ZD6474,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...,UHTHHESEBZOYNR-UHFFFAOYSA-N,PubChem.CID.3081361,PubChem.CID.3081361


In [24]:
# Number of drugs per study; the study is the part of the ID before the first '.'
df_info['ID'].str.split('.').str[0].value_counts()  # (ap)

CTRP    544
GDSC    262
CCLE     24
gCSI     16
Name: ID, dtype: int64

In [25]:
# Keep the drugs whose ID contains the literal text 'CCLE'
df_info = df_info[df_info['ID'].str.contains('CCLE', regex=False)].reset_index(drop=True)
print(f'Shape {df_info.shape}\n')

Shape (24, 7)



In [26]:
# Unique CLEAN_NAME values per source; the source is the ID prefix before the first '.'
# (df_info['CLEAN_NAME']=='AEW541').sum()
# df_info['CLEAN_NAME'].nunique()
tmp = df_info.assign(SOURCE=df_info['ID'].map(lambda s: s.split('.')[0]))
tmp.groupby(['SOURCE']).agg({'CLEAN_NAME': 'unique'}).reset_index()

Unnamed: 0,SOURCE,CLEAN_NAME
0,CCLE,"[AEW541, ZD6474, PANOBINOSTAT, SORAFENIB, IRIN..."


### Check consistency of drug names between PharmacoDB and our data

In [27]:
# Sorted NAME values of the drugs whose ID prefix (text before the first '.') contains 'CCLE'
d1 = sorted(df_info.loc[df_info['ID'].map(lambda s: 'CCLE' in s.split('.')[0]), 'NAME'])

In [28]:
# Sorted PharmacoDB drug names: every cd2 column after the first one ('CELL')
d2 = sorted(cd2.columns[1:])

In [29]:
# Number of shared drug names, then the names found only in our table and only in PharmacoDB.
# The comparison is exact and case-sensitive, so 'Lapatinib' and 'lapatinib' count as different.
drugs_ours, drugs_pharmaco = set(d1), set(d2)
print(len(drugs_ours & drugs_pharmaco))
print(drugs_ours - drugs_pharmaco)
print(drugs_pharmaco - drugs_ours)

20
{'Lapatinib', 'Paclitaxel', 'PF2341066', 'ZD-6474'}
{'Vandetanib', 'lapatinib', 'paclitaxel', 'Crizotinib'}


### load_drug_set_descriptors( drug_set = 'Combined_PubChem' )

In [30]:
# Load the Dragon7 descriptor table of the combined PubChem drug set in two passes:
# the header is read first so that every column after the first one is parsed as float32;
# 'na', '-' and the empty string are parsed as missing values
path = os.path.join(DATADIR, 'Combined_PubChem_dragon7_descriptors.tsv')
df_cols = pd.read_table(path, engine='c', nrows=0)
total = len(df_cols.columns) - 1  # number of columns excluding the first one
dtype_dict = {col: np.float32 for col in df_cols.columns[1:]}
df = pd.read_table(path, engine='c', dtype=dtype_dict, na_values=['na', '-', ''])

print(df.shape)
df[:2]

(517, 5271)


Unnamed: 0,NAME,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL
0,PubChem.CID.10026128,438.51001,7.831,36.275002,56.813999,37.387001,63.286999,0.648,1.015,0.668,...,,,,,,,,,,
1,PubChem.CID.10027278,460.48999,8.688,36.691002,54.224998,37.141998,59.958,0.692,1.023,0.701,...,,,,,,,,,,


In [31]:
# Split the descriptor table into the drug identifier (df1, column renamed to 'Drug')
# and the drug features (df2: descriptors and fingerprints)
df1 = df[['NAME']].rename(columns={'NAME': 'Drug'})
# The axis is passed by keyword because the positional form drop('NAME', 1)
# is no longer accepted by pandas 2.x.
df2 = df.drop('NAME', axis=1)

In [32]:
# Run the drug features through the project helper with scaling and imputing both set to None.
# NOTE(review): what impute_and_scale does for None / dropna=None is not visible here —
# confirm against the helper.
scaling = None
imputing = None
df2 = utils.impute_and_scale(df2, scaling, imputing, dropna=None)

In [33]:
# Re-attach the drug identifier to the processed features (column-wise concat,
# rows aligned on the index) and keep an independent copy under the name df_desc
df = pd.concat([df1, df2], axis=1)
df_desc = df.copy()
print(df_desc.shape)

(517, 5271)


In [34]:
# Shapes of the drug metadata and drug descriptor tables
for label, frame in (('df_info', df_info), ('df_desc', df_desc)):
    print(label, frame.shape)

df_info (24, 7)
df_desc (517, 5271)


In [35]:
# Preview the first two rows of both drug tables
display(df_info[:2], df_desc[:2])

Unnamed: 0,ID,NAME,CLEAN_NAME,SMILES,INCHIKEY,PUBCHEM,Drug
0,CCLE.1,AEW541,AEW541,C1CN(C1)CC2CC(C2)N3C=C(C4=C3N=CN=C4N)C5=CC(=CC...,AECDBHGVIIRMOI-UHFFFAOYSA-N,PubChem.CID.11476171,PubChem.CID.11476171
1,CCLE.10,ZD-6474,ZD6474,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...,UHTHHESEBZOYNR-UHFFFAOYSA-N,PubChem.CID.3081361,PubChem.CID.3081361


Unnamed: 0,Drug,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL
0,PubChem.CID.10026128,438.51001,7.831,36.275002,56.813999,37.387001,63.286999,0.648,1.015,0.668,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PubChem.CID.10027278,460.48999,8.688,36.691002,54.224998,37.141998,59.958,0.692,1.023,0.701,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Important!
# Re-key the descriptors by study drug ID: join on the PubChem-based 'Drug' key,
# drop that key, and rename 'ID' to 'Drug'.
# NOTE(review): df_desc is overwritten, so this cell does not appear to be re-runnable
# without first re-running the cells that build df_desc.
df_desc = (
    df_info[['ID', 'Drug']]
    .merge(df_desc, on='Drug')
    .drop('Drug', axis=1)
    .rename(columns={'ID': 'Drug'})
)
print(df_desc.shape)
df_desc[:2]

(24, 5271)


Unnamed: 0,Drug,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL
0,CCLE.1,439.609985,7.09,39.143002,61.439999,41.619999,69.682999,0.631,0.991,0.671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCLE.10,475.399994,8.804,34.717999,54.522999,36.597,61.16,0.643,1.01,0.678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Merge dataframes

In [37]:
# Shapes of the response, expression and descriptor tables before merging
for frame in (df_resp, df_exp, df_desc):
    print(frame.shape)

(93251, 6)
(1018, 943)
(24, 5271)


In [55]:
# Missing-value counts: per column for the response table, in total for the other two.
# NOTE(review): this cell was executed out of order (In [55]). Its recorded output lists
# the renamed response columns (Source, Sample, ...), which only exist after the rename
# cell that follows it. Restart the kernel and run all cells to refresh the output.
print(df_resp.isna().sum())
print('NA values in expression', df_exp.isna().sum().sum())
print('NA values in descriptors', df_desc.isna().sum().sum())

Source     0
Sample     0
Drug       0
LOGCONC    0
Growth     0
Study      0
dtype: int64
NA values in expression 0
NA values in descriptors 389


In [38]:
# Rename the response columns so the join keys match the other tables
# ('Sample' in the expression table, 'Drug' in the descriptor table)
df_resp = df_resp.rename(
    columns={
        'CELL': 'Sample',
        'DRUG': 'Drug',
        'SOURCE': 'Source',
        'GROWTH': 'Growth',
        'STUDY': 'Study',
    }
)

In [39]:
# For each of the three tables: print its name and shape, then show its first two rows
for label, frame in (('df_resp', df_resp), ('df_exp', df_exp), ('df_desc', df_desc)):
    print(label)
    print(frame.shape)
    display(frame[:2])

df_resp
(93251, 6)


Unnamed: 0,Source,Sample,Drug,LOGCONC,Growth,Study
0,CCLE,CCLE.1321N1,CCLE.1,-8.60206,117.339996,fake_exp
1,CCLE,CCLE.1321N1,CCLE.1,-8.09691,122.0,fake_exp


df_exp
(1018, 943)


Unnamed: 0,Sample,rnaseq.AARS,rnaseq.ABCB6,rnaseq.ABCC5,rnaseq.ABCF1,rnaseq.ABCF3,rnaseq.ABHD4,rnaseq.ABHD6,rnaseq.ABL1,rnaseq.ACAA1,...,rnaseq.ZMIZ1,rnaseq.ZMYM2,rnaseq.ZNF131,rnaseq.ZNF274,rnaseq.ZNF318,rnaseq.ZNF395,rnaseq.ZNF451,rnaseq.ZNF586,rnaseq.ZNF589,rnaseq.ZW10
0,CCLE.22RV1,0.64346,1.665659,-0.003287,-1.612203,0.440622,-0.603358,-0.688464,-0.25927,0.177445,...,-1.105644,0.636475,0.150561,-0.28004,0.931946,0.295963,1.2412,1.413897,0.753615,0.892602
1,CCLE.2313287,1.464804,1.039212,-0.30937,0.023618,0.33263,1.231317,0.944933,-0.957579,0.93068,...,-0.572183,0.575539,0.609772,-0.104828,-0.055401,-0.619046,1.37306,0.961168,0.649255,1.093517


df_desc
(24, 5271)


Unnamed: 0,Drug,MW,AMW,Sv,Se,Sp,Si,Mv,Me,Mp,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL
0,CCLE.1,439.609985,7.09,39.143002,61.439999,41.619999,69.682999,0.631,0.991,0.671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCLE.10,475.399994,8.804,34.717999,54.522999,36.597,61.16,0.643,1.01,0.678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Merge response and expression**

In [40]:
# Number of distinct samples in the response table, then in the expression table
for frame in (df_resp, df_exp):
    print(frame['Sample'].nunique())

504
1018


In [41]:
# Attach the expression features to every response row.
# Inner join on 'Sample': response rows of cell lines without expression data are dropped.
df_resp_exp = pd.merge(df_resp, df_exp, on='Sample', how='inner')
print(df_resp_exp.shape)
print(df_resp_exp['Sample'].nunique())
df_resp_exp[:2]

(87665, 948)
474


Unnamed: 0,Source,Sample,Drug,LOGCONC,Growth,Study,rnaseq.AARS,rnaseq.ABCB6,rnaseq.ABCC5,rnaseq.ABCF1,...,rnaseq.ZMIZ1,rnaseq.ZMYM2,rnaseq.ZNF131,rnaseq.ZNF274,rnaseq.ZNF318,rnaseq.ZNF395,rnaseq.ZNF451,rnaseq.ZNF586,rnaseq.ZNF589,rnaseq.ZW10
0,CCLE,CCLE.22RV1,CCLE.1,-8.60206,101.879997,fake_exp,0.64346,1.665659,-0.003287,-1.612203,...,-1.105644,0.636475,0.150561,-0.28004,0.931946,0.295963,1.2412,1.413897,0.753615,0.892602
1,CCLE,CCLE.22RV1,CCLE.1,-8.09691,125.0,fake_exp,0.64346,1.665659,-0.003287,-1.612203,...,-1.105644,0.636475,0.150561,-0.28004,0.931946,0.295963,1.2412,1.413897,0.753615,0.892602


**Merge descriptors to response and expression**

In [42]:
df_resp_exp_desc = df_resp_exp.merge(df_desc, on='Drug')