In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections

import warnings
warnings.filterwarnings('ignore')

# Utils
# Make the project's data utilities importable. The location is resolved
# relative to the current working directory, because `__file__` is not
# available inside a notebook.
file_path = os.getcwd()  # os.path.dirname(os.path.relpath(__file__))
utils_path = os.path.abspath(os.path.join(file_path, '../src/data'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# import utils_models
import utils_data as utils

In [2]:
# Root directory that holds the Pilot1 data files. Set the PILOT1_DATADIR
# environment variable to point the notebook at another location; the fallback
# is the path the notebook was originally written against.
DATADIR = os.environ.get('PILOT1_DATADIR',
                         '/Users/apartin/work/jdacs/Benchmarks/Data/Pilot1')

RSP_FILENAME = 'combined_single_response_agg'  # response data filename
# RSP_FILENAME = 'ChemPartner_single_response_agg'  # response data filename

DSC_FILENAME = 'Combined_PubChem_dragon7_descriptors.tsv'  # drug descriptors data filename
# DSC_FILENAME = 'pan_drugs_dragon7_descriptors.tsv'

DRUG_META_FILENAME = 'drug_info'  # drug metadata filename

In [3]:
# ========================================================================
#       Args TODO: add to argparse
# ========================================================================
# --- Data selection ---
# sources = ['ccle', 'gcsi', 'gdsc', 'ctrp']
sources = ['ccle', 'gcsi', 'gdsc', 'ctrp', 'nci60']
cell_features = ['rna']  # ['rna', 'cnv']
drug_features = ['dsc']  # ['dsc', 'fng']

# --- Parsing / filtering options ---
na_values = ['na', '-', '']  # strings that the table readers treat as missing
dropna_thres = 0.4           # threshold handed to utils.dropna
tidy_data_format = 'parquet'
verbose = True

# --- Fibroblast samples ---
# Analysis of fibro samples is implemented in ccle_fibroblast.py and ccle_preproc.R
drop_fibro = True
fibro_names = [
    'CCLE.HS229T', 'CCLE.HS739T', 'CCLE.HS840T', 'CCLE.HS895T', 'CCLE.RKN',
    'CTRP.Hs-895-T', 'CTRP.RKN', 'GDSC.RKN', 'gCSI.RKN',
]

# --- Per-feature-type settings ---
# Prefix to add to feature names based on feature types
fea_prfx_dict = dict(rna='cell_rna.',
                     cnv='cell_cnv.',
                     dsc='drug_dsc.',
                     fng='drug_fng.')

# Storage dtype for each feature type
prfx_dtypes = dict(rna=np.float32,
                   cnv=np.int8,
                   dsc=np.float32,
                   fng=np.int8)

In [4]:
# ========================================================================
#       Load response data
# ========================================================================
print(f'Loading combined response ... {RSP_FILENAME}')

# Response columns; every one of them is parsed as float32.
rsp_cols = ['AUC', 'AUC1', 'EC50', 'EC50se',
            'R2fit', 'Einf', 'IC50',
            'HS', 'AAC1', 'DSS1']

# Identifier columns are read as strings; the float columns are derived from
# rsp_cols so that the two lists cannot drift apart.
rsp_dtypes = {'SOURCE': str, 'CELL': str, 'DRUG': str}
rsp_dtypes.update({col: np.float32 for col in rsp_cols})

# `warn_bad_lines=True` is intentionally not passed: it only had an effect
# together with `error_bad_lines=False` (never set here), and the keyword was
# removed in pandas 2.0. Malformed lines still raise, as before.
rsp = pd.read_table(os.path.join(DATADIR, RSP_FILENAME), sep='\t',
                    na_values=na_values,
                    dtype=rsp_dtypes)

# Normalise the source names to lower case; the vectorised accessor leaves
# missing values untouched instead of raising on them.
rsp['SOURCE'] = rsp['SOURCE'].str.lower()
print(f'rsp.shape {rsp.shape}')


Loading combined response ... combined_single_response_agg
rsp.shape (4484081, 14)


In [5]:
# ========================================================================
#   Load rna (combined_dataset)
# ========================================================================
print('Loading rna-seq ... ')
lincs = utils.CombinedRNASeqLINCS(
    datadir=DATADIR,
    dataset='raw',
    sources=sources,
    na_values=na_values,
    verbose=False,
)
rna = lincs._df_rna
cmeta = lincs._meta

# Rename the key columns in place so they match the names used for merging
# ('CELL', 'SOURCE').
rna.rename(columns={'Sample': 'CELL'}, inplace=True)
cmeta.rename(columns={'Sample': 'CELL', 'source': 'SOURCE'}, inplace=True)

# Add the rna feature prefix to every column after the first one, skipping
# names that already contain it.
rna_prfx = fea_prfx_dict['rna']
rna_renames = {col: rna_prfx + col for col in rna.columns[1:] if rna_prfx not in col}
rna = rna.rename(columns=rna_renames)
print(f'rna.shape {rna.shape}')

# Impute missing values
rna = utils.impute_values(data=rna, fea_prfx_dict=fea_prfx_dict)


Loading rna-seq ... 
rna.shape (2917, 943)


In [6]:
# ========================================================================
#   Load drug descriptors
# ========================================================================
print(f'Loading drug descriptors ... {DSC_FILENAME}')
path = os.path.join(DATADIR, DSC_FILENAME)

# Read only the header first, so that every column after the first one can be
# parsed directly with the descriptor dtype.
cols = pd.read_table(path, engine='c', nrows=0)
dtype_dict = {c: prfx_dtypes['dsc'] for c in cols.columns[1:]}

# `warn_bad_lines=True` is intentionally not passed: it only had an effect
# together with `error_bad_lines=False` (never set here), and the keyword was
# removed in pandas 2.0. Malformed lines still raise, as before.
dsc = pd.read_table(path, dtype=dtype_dict, na_values=na_values)
dsc = dsc.rename(columns={'NAME': 'PUBCHEM'})

# Add the descriptor feature prefix to every column after the first one,
# skipping names that already contain it.
dsc = dsc.rename(columns={c: fea_prfx_dict['dsc']+c for c in dsc.columns[1:] if fea_prfx_dict['dsc'] not in c})
print(f'dsc.shape {dsc.shape}')


# ------------------
# Filter descriptors
# ------------------
# dsc.nunique(dropna=True).value_counts()
# dsc.nunique(dropna=True).sort_values()

print('Drop descriptors with *lots* of NA values ...')
#utils.plot_dsc_na_dist(dsc=dsc, savepath=os.path.join(OUTDIR, 'dsc_hist_ratio_of_na.png'))
dsc = utils.dropna(df=dsc, axis=1, th=dropna_thres)
print(f'dsc.shape {dsc.shape}')
# dsc.isna().sum().sort_values(ascending=False)

# There are descriptors for which there is a single unique value excluding NA (drop those)
print('Drop descriptors that have a single unique value (excluding NAs) ...')
col_idx = dsc.nunique(dropna=True).values==1  # boolean mask over columns
dsc = dsc.iloc[:, ~col_idx]
print(f'dsc.shape {dsc.shape}')

# There are still lots of descriptors which have only a few unique values
# We can categorize those values. E.g.: 564 descriptors have only 2 unique vals,
# and 154 descriptors have only 3 unique vals, etc.
# TODO: use utility code from p1h_alex/utils/data_preproc.py that transform those
# features into categorical and also applies an appropriate imputation.
# dsc.nunique(dropna=True).value_counts()[:10]
# dsc.nunique(dropna=True).value_counts().sort_index()[:10]

# Impute missing values
dsc = utils.impute_values(data=dsc, fea_prfx_dict=fea_prfx_dict)

# Drop low var cols
# tmp, idx = utils_all.drop_low_var_cols(df=dsc, skipna=False)

if verbose:
    print(f'dsc memory usage: {sys.getsizeof(dsc)/1e9:.3f} GB')


Loading drug descriptors ... Combined_PubChem_dragon7_descriptors.tsv
dsc.shape (517, 5271)
Drop descriptors with *lots* of NA values ...
dsc.shape (517, 3838)
Drop descriptors that have a single unique value (excluding NAs) ...
dsc.shape (517, 2734)
dsc memory usage: 0.006 GB


In [7]:
# ========================================================================
#   Load drug meta
# ========================================================================
print(f'Loading drug metadata ... {DRUG_META_FILENAME}')
dmeta_path = os.path.join(DATADIR, DRUG_META_FILENAME)
dmeta = pd.read_table(dmeta_path, dtype=object)  # every column is kept as a Python object

# Prepend the 'PubChem.CID.' prefix to the PUBCHEM ids.
dmeta['PUBCHEM'] = 'PubChem.CID.' + dmeta['PUBCHEM']

# SOURCE is the lower-cased text before the first '.' of the drug ID; it is
# inserted as the first column.
drug_source = dmeta['ID'].map(lambda drug_id: drug_id.split('.')[0].lower())
dmeta.insert(loc=0, column='SOURCE', value=drug_source)

dmeta = dmeta.rename(columns={'ID': 'DRUG'})
print(f'dmeta.shape {dmeta.shape}')

Loading drug metadata ... drug_info
dmeta.shape (846, 7)


### Preview the first rows of each table

In [8]:
# First two rows of the response table
rsp.head(2)

Unnamed: 0,SOURCE,CELL,DRUG,STUDY,AUC,IC50,EC50,EC50se,R2fit,Einf,HS,AAC1,AUC1,DSS1
0,ccle,CCLE.1321N1,CCLE.1,fake_exp,0.833,4.883,5.183,0.5746,0.9864,0.2017,0.7463,0.1017,0.8983,0.0551
1,ccle,CCLE.1321N1,CCLE.10,fake_exp,0.7909,5.217,5.217,2.284,0.6526,0.0,0.8993,0.1136,0.8864,0.0728


In [9]:
# First two rows and first five columns of the expression table
rna.iloc[:, :5].head(2)

Unnamed: 0,CELL,cell_rna.AARS,cell_rna.ABCB6,cell_rna.ABCC5,cell_rna.ABCF1
0,CCLE.22RV1,8.31,7.17,4.12,5.64
1,CCLE.2313287,8.94,6.3,3.83,6.6


In [10]:
# First two rows and first five columns of the descriptor table
dsc.iloc[:, :5].head(2)

Unnamed: 0,PUBCHEM,drug_dsc.MW,drug_dsc.AMW,drug_dsc.Sv,drug_dsc.Se
0,PubChem.CID.10026128,438.51001,7.831,36.275002,56.813999
1,PubChem.CID.10027278,460.48999,8.688,36.691002,54.224998


In [11]:
# First two rows of the drug metadata table
dmeta.head(2)

Unnamed: 0,SOURCE,DRUG,NAME,CLEAN_NAME,SMILES,INCHIKEY,PUBCHEM
0,ccle,CCLE.1,AEW541,AEW541,C1CN(C1)CC2CC(C2)N3C=C(C4=C3N=CN=C4N)C5=CC(=CC...,AECDBHGVIIRMOI-UHFFFAOYSA-N,PubChem.CID.11476171
1,ccle,CCLE.10,ZD-6474,ZD6474,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...,UHTHHESEBZOYNR-UHFFFAOYSA-N,PubChem.CID.3081361


In [12]:
# ========================================================================
#   Merge the data
# ========================================================================
# Data tables: rsp, rna, dsc, cmeta, dmeta
#   (rsp, rna): on 'CELL'
#   (rsp, dsc): on pubchem through fields in dmeta
print('==========================')
print('... Start merging data ...')
print('==========================')

# Update rsp with additional drug field 'PUBCHEM' (this will be used to merge with descriptors)
print('\nMerge response (rsp) with drug metadata (dmeta) on DRUG in order to add PUBCHEM (required for descriptors) ...')
print(f'rsp.shape   {rsp.shape}')
print(f'dmeta.shape {dmeta.shape}')

# This cell overwrites `rsp`, so a second execution would otherwise merge a
# frame that already has PUBCHEM and produce PUBCHEM_x / PUBCHEM_y instead.
# Dropping a stale column first makes the cell safe to re-run.
if 'PUBCHEM' in rsp.columns:
    rsp = rsp.drop(columns='PUBCHEM')

# Left join: every response row is kept; drugs without a dmeta entry get NA.
rsp = pd.merge(rsp, dmeta[['DRUG', 'PUBCHEM']], on='DRUG', how='left')
print(f'rsp.shape   {rsp.shape}')

na_counts = rsp[['DRUG', 'PUBCHEM']].isna().sum()
print(f'NA values after merging rsp and dmeta: \n{na_counts}')
print('')

# Number of unique drugs / PubChem ids per source after the merge
print(rsp.groupby('SOURCE').agg({'DRUG': 'nunique', 'PUBCHEM': 'nunique'}).reset_index())


... Start merging data ...

Merge response (rsp) with drug metadata (dmeta) on DRUG in order to add PUBCHEM (required for descriptors) ...
rsp.shape   (4484081, 14)
dmeta.shape (846, 7)
rsp.shape   (4484081, 15)
NA values after merging rsp and dmeta: 
DRUG             0
PUBCHEM    3973264
dtype: int64

  SOURCE   DRUG  PUBCHEM
0   ccle     24       24
1   ctrp    544      370
2   gcsi     16       16
3   gdsc    249      235
4  nci60  52671        0
5    scl    445        0
6   sclc    526        0


In [13]:
# --------------------
# Merge rsp with cmeta
# --------------------
print('\nMerge response (rsp) and cell metadata (cmeta) ...')
print(f'rsp.shape   {rsp.shape}')
print(f'cmeta.shape {cmeta.shape}')

# Only the cell key and these annotation columns are taken from cmeta;
# the left join keeps every response row.
cmeta_cols = ['CELL', 'core_str', 'csite', 'ctype', 'simplified_csite', 'simplified_ctype']
rsp1 = rsp.merge(cmeta[cmeta_cols], on='CELL', how='left')
print(f'rsp1.shape  {rsp1.shape}')
print('')

# Number of unique cells / drugs / PubChem ids per source after the merge
per_source = rsp1.groupby('SOURCE').agg({'CELL': 'nunique',
                                         'DRUG': 'nunique',
                                         'PUBCHEM': 'nunique'})
print(per_source.reset_index())

# Release the pre-merge frame; rsp1 supersedes it.
del rsp


Merge response (rsp) and cell metadata (cmeta) ...
rsp.shape   (4484081, 15)
cmeta.shape (2917, 20)
rsp1.shape  (4484081, 20)

  SOURCE  CELL   DRUG  PUBCHEM
0   ccle   504     24       24
1   ctrp   887    544      370
2   gcsi   409     16       16
3   gdsc  1075    249      235
4  nci60    59  52671        0
5    scl    65    445        0
6   sclc    70    526        0


In [14]:
# --------------
# Merge with rna
# --------------
print('\nMerge with expression (rna) ...')
print(f'rsp1.shape {rsp1.shape}')
print(f'rna.shape  {rna.shape}')

# Inner join: response rows whose CELL has no expression profile are dropped.
rsp2 = rsp1.merge(rna, on='CELL', how='inner')
print(f'rsp2.shape {rsp2.shape}')
print('')

# Number of unique cells / drugs / PubChem ids per source after the merge
per_source = rsp2.groupby('SOURCE').agg({'CELL': 'nunique',
                                         'DRUG': 'nunique',
                                         'PUBCHEM': 'nunique'})
print(per_source.reset_index())

# Release the pre-merge frame; rsp2 supersedes it.
del rsp1


Merge with expression (rna) ...
rsp1.shape (4484081, 20)
rna.shape  (2917, 943)
rsp2.shape (4301221, 962)

  SOURCE  CELL   DRUG  PUBCHEM
0   ccle   474     24       24
1   ctrp   812    544      370
2   gcsi   357     16       16
3   gdsc   670    249      235
4  nci60    59  52671        0


In [15]:
# --------------
# Merge with dsc
# --------------
print('\nMerge with descriptors (dsc) ...')
print(f'rsp2.shape {rsp2.shape}')
print(f'dsc.shape  {dsc.shape}')

# Inner join: rows whose PUBCHEM id has no descriptor entry are dropped.
data = rsp2.merge(dsc, on='PUBCHEM', how='inner')
print(f'data.shape {data.shape}')
print('')

# Number of unique cells / drugs / PubChem ids per source after the merge
per_source = data.groupby('SOURCE').agg({'CELL': 'nunique',
                                         'DRUG': 'nunique',
                                         'PUBCHEM': 'nunique'})
print(per_source.reset_index())

# Release the pre-merge frame; data supersedes it.
del rsp2


Merge with descriptors (dsc) ...
rsp2.shape (4301221, 962)
dsc.shape  (517, 2734)
data.shape (406889, 3695)

  SOURCE  CELL  DRUG  PUBCHEM
0   ccle   474    24       24
1   ctrp   812   370      370
2   gcsi   357    16       16
3   gdsc   670   236      235
