In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections

# Utils
# Make the project's data utilities importable. The module directory is
# resolved as ../src/data relative to the current working directory (the
# __file__-based variant is kept in the trailing comment for script use).
file_path = os.getcwd()  # os.path.dirname(os.path.relpath(__file__))
utils_path = os.path.abspath(os.path.join(file_path, '../src/data'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# import utils_models
import utils_data as utils

print(file_path)

/vol/ml/apartin/projects/cell-line-drug-sensitivity/notebooks


In [2]:
# Root directory of the Pilot1 benchmark data. Defaults to the cluster path;
# set the PILOT1_DATADIR environment variable to point at another copy
# (e.g. '/Users/apartin/work/jdacs/Benchmarks/Data/Pilot1') instead of
# editing this cell when switching machines.
DATADIR = os.environ.get('PILOT1_DATADIR', '/vol/ml/apartin/Benchmarks/Data/Pilot1')

RSP_FILENAME = 'combined_single_response_agg'  # response data filename (combined sources)
RSP_FILENAME_CHEM = 'ChemPartner_single_response_agg'  # response data filename (ChemPartner)

#DSC_FILENAME = 'Combined_PubChem_dragon7_descriptors.tsv'  # drug descriptors data filename
DSC_FILENAME = 'pan_drugs_dragon7_descriptors.tsv'  # drug descriptors data filename

DRUG_META_FILENAME = 'drug_info'  # drug metadata filename


In [3]:
# ========================================================================
#       Arg parser
# ========================================================================
# Hard-coded stand-ins for command-line arguments (no parser is built here).
drop_fibro = True  # if True, rsp rows whose CELL is listed in `fibro_names` are removed
drop_bad = True  # if True, rsp rows with AUC == 0, EC50se == 0 and R2fit == 0 are removed
dropna_thres = 0.4  # passed as `th` to utils.dropna when filtering descriptor columns
verbose = True  # if True, the descriptor table's memory usage is printed
drug_features = ['dsc']  # ['dsc', 'fng']
cell_features = ['rna']  # ['rna', 'cnv']

In [4]:
# ========================================================================
#       Other settings
# ========================================================================
# Data sources handed to the rna-seq loader
sources = [
    'ccle',
    'gcsi',
    'gdsc',
    'ctrp',
    'nci60',
    'chempartner',
]
# Strings that the table readers treat as missing values
na_values = ['na', '-', '']
tidy_file_name, tidy_file_format = 'tidy_data', 'parquet'

# Response columns
rsp_cols = [
    'AUC', 'AUC1',
    'EC50', 'EC50se',
    'R2fit', 'Einf', 'IC50',
    'HS', 'AAC1', 'DSS1',
]

# The analysis of the fibroblast samples is implemented in ccle_fibroblast.py and ccle_preproc.R
fibro_names = [
    'CCLE.HS229T', 'CCLE.HS739T', 'CCLE.HS840T', 'CCLE.HS895T', 'CCLE.RKN',
    'CTRP.Hs-895-T', 'CTRP.RKN',
    'GDSC.RKN',
    'gCSI.RKN',
]

# Prefix added to feature names, keyed by feature type
fea_prfx_dict = dict(rna='cell_rna.', cnv='cell_cnv.',
                     dsc='drug_dsc.', fng='drug_fng.')

# Storage dtype per feature type
prfx_dtypes = dict(rna=np.float32, cnv=np.int8,
                   dsc=np.float32, fng=np.int8)


In [5]:
# ========================================================================
#       Load response data
# ========================================================================
# Both response files are read with the same dtypes: three string id columns
# plus the float32 response metrics listed in `rsp_cols`.
rsp_dtypes = {'SOURCE': str, 'CELL': str, 'DRUG': str}
rsp_dtypes.update({col: np.float32 for col in rsp_cols})


def read_response(filename):
    """Read one tab-separated response file located in DATADIR.

    Args:
        filename: name of the file inside DATADIR.

    Returns:
        DataFrame parsed with `rsp_dtypes`, with the strings in `na_values`
        treated as missing.

    Note:
        `warn_bad_lines` is deliberately not passed. It only had an effect in
        combination with `error_bad_lines=False` (never set here) and the
        keyword was removed in pandas 2.0, where passing it raises TypeError.
    """
    return pd.read_table(os.path.join(DATADIR, filename), sep='\t',
                         na_values=na_values, dtype=rsp_dtypes)


# Combined response
print(f'Loading combined response ... {RSP_FILENAME}')
rsp = read_response(RSP_FILENAME)

# Chempartner response
print(f'Loading chempartner response ... {RSP_FILENAME_CHEM}')
rsp_chem = read_response(RSP_FILENAME_CHEM)
# Keep only the part of SOURCE before the first underscore. The .str accessor
# passes missing values through instead of raising on them.
rsp_chem['SOURCE'] = rsp_chem['SOURCE'].str.split('_').str[0]

# Merge rsp from combined and chempartner. ignore_index avoids duplicate row
# labels (both inputs are numbered from 0).
rsp = pd.concat([rsp, rsp_chem], axis=0, ignore_index=True)

rsp['SOURCE'] = rsp['SOURCE'].str.lower()
print(f'rsp.shape {rsp.shape}')

# Replace -Inf and Inf with nan
rsp = rsp.replace([np.inf, -np.inf], np.nan)

print(rsp.groupby('SOURCE').agg({'CELL': 'nunique', 'DRUG': 'nunique'}).reset_index())


Loading combined response ... combined_single_response_agg
Loading chempartner response ... ChemPartner_single_response_agg
rsp.shape (4485109, 14)
        SOURCE  CELL   DRUG
0         ccle   504     24
1  chempartner    49     39
2         ctrp   887    544
3         gcsi   409     16
4         gdsc  1075    249
5        nci60    59  52671
6          scl    65    445
7         sclc    70    526


In [6]:
# ========================================================================
#   Drop fibroblast
# ========================================================================
if drop_fibro:
    print('\n\nDrop fibroblast samples ...')
    # Vectorized membership test (replaces a per-row Python lambda).
    # Only rsp is filtered here: rna and cmeta are loaded in later cells.
    id_drop = rsp['CELL'].isin(fibro_names)
    print(f'Drops {id_drop.sum()} rsp data points.')
    rsp = rsp.loc[~id_drop, :]
    print(f'rsp.shape {rsp.shape}')



Drop fibroblast samples ...
Drops 1285 rsp data points.
rsp.shape (4483824, 14)


In [7]:
# ========================================================================
#   Drop 'bad' points (from Yitan)
# ========================================================================
if drop_bad:
    print('\n\nDrop bad samples ...')
    # A point is flagged when AUC, EC50se and R2fit are all exactly zero
    id_drop = rsp[['AUC', 'EC50se', 'R2fit']].eq(0).all(axis=1)
    print(f'Drops {id_drop.sum()} rsp data points.')
    rsp = rsp[~id_drop]
    print(f'rsp.shape {rsp.shape}')



Drop bad samples ...
Drops 4520 rsp data points.
rsp.shape (4479304, 14)


In [8]:
# ========================================================================
#   Load rna (combined_dataset)
# ========================================================================
print('Loading rna-seq ... ')
lincs = utils.CombinedRNASeqLINCS(
    datadir=DATADIR,
    dataset='raw',
    sources=sources,
    na_values=na_values,
    verbose=False,
)
rna = lincs._df_rna
cmeta = lincs._meta

# Align the key column names with the response table (renamed in place on
# the frames held by `lincs`)
rna.rename(columns={'Sample': 'CELL'}, inplace=True)
cmeta.rename(columns={'Sample': 'CELL', 'source': 'SOURCE'}, inplace=True)

# Add the rna feature prefix to every column after the first, skipping
# names that already contain it
rna_prefix = fea_prfx_dict['rna']
rna_renames = {name: rna_prefix + name for name in rna.columns[1:] if rna_prefix not in name}
rna = rna.rename(columns=rna_renames)
print(f'rna.shape {rna.shape}')

# Impute missing values
rna = utils.impute_values(data=rna, fea_prfx_dict=fea_prfx_dict)

cell_meta_summary = cmeta.groupby('SOURCE').agg({'CELL': 'nunique', 'ctype': 'nunique', 'csite': 'nunique'})
print(cell_meta_summary.reset_index())


Loading rna-seq ... 
rna.shape (2917, 943)
  SOURCE  CELL  ctype  csite
0   ccle  1018     65     27
1   ctrp   812     55     24
2   gcsi   357     40     22
3   gdsc   670     44     24
4  nci60    60     20      9


In [9]:
# ========================================================================
#   Load drug descriptors
# ========================================================================
print(f'Loading drug descriptors ... {DSC_FILENAME}')
path = os.path.join(DATADIR, DSC_FILENAME)
# Read only the header first so that every column after the first one can be
# assigned the descriptor dtype before the full read.
cols = pd.read_table(path, engine='c', nrows=0)
dtype_dict = {c: prfx_dtypes['dsc'] for c in cols.columns[1:]}
# `warn_bad_lines` is deliberately not passed: it only had an effect together
# with `error_bad_lines=False` (never set here) and was removed in pandas 2.0,
# where passing it raises TypeError.
dsc = pd.read_table(path, dtype=dtype_dict, na_values=na_values)
#dsc.rename(columns={'NAME': 'PUBCHEM'}, inplace=True) # used in the old code
dsc = dsc.rename(columns={'NAME': 'DRUG'})
# Add the descriptor prefix to every column after the first, skipping names
# that already contain it
dsc = dsc.rename(columns={c: fea_prfx_dict['dsc']+c for c in dsc.columns[1:] if fea_prfx_dict['dsc'] not in c})
print(f'dsc.shape {dsc.shape}')


# ------------------
# Filter descriptors
# ------------------
# dsc.nunique(dropna=True).value_counts()
# dsc.nunique(dropna=True).sort_values()

print('Drop descriptors with *lots* of NA values ...')
#utils.plot_dsc_na_dist(dsc=dsc, savepath=os.path.join(OUTDIR, 'dsc_hist_ratio_of_na.png'))
dsc = utils.dropna(df=dsc, axis=1, th=dropna_thres)
print(f'dsc.shape {dsc.shape}')
# dsc.isna().sum().sort_values(ascending=False)

# There are descriptors for which there is a single unique value excluding NA (drop those)
print('Drop descriptors that have a single unique value (excluding NAs) ...')
col_idx = dsc.nunique(dropna=True).values == 1
dsc = dsc.iloc[:, ~col_idx]
print(f'dsc.shape {dsc.shape}')

# There are still lots of descriptors which have only a few unique values
# We can categorize those values. E.g.: 564 descriptors have only 2 unique vals,
# and 154 descriptors have only 3 unique vals, etc.
# TODO: use utility code from p1h_alex/utils/data_preproc.py that transforms those
# features into categorical and also applies an appropriate imputation.
# dsc.nunique(dropna=True).value_counts()[:10]
# dsc.nunique(dropna=True).value_counts().sort_index()[:10]

# Impute missing values
dsc = utils.impute_values(data=dsc, fea_prfx_dict=fea_prfx_dict)

# Drop low var cols
# tmp, idx = utils_all.drop_low_var_cols(df=dsc, skipna=False)

if verbose:
    print('dsc memory usage: {:.3f} GB'.format(sys.getsizeof(dsc)/1e9))


Loading drug descriptors ... pan_drugs_dragon7_descriptors.tsv
dsc.shape (1801, 5271)
Drop descriptors with *lots* of NA values ...
dsc.shape (1801, 3838)
Drop descriptors that have a single unique value (excluding NAs) ...
dsc.shape (1801, 2821)
dsc memory usage: 0.020 GB


In [10]:
# ========================================================================
#   Load drug meta
# ========================================================================
# NOTE: the only executable statement in this cell is the bare string below;
# being the cell's last expression, it is echoed as the cell output. The
# previous drug-metadata loading code is kept commented out for reference.
""" We don't need drug meta with the new descriptors file. """
# DRUG_META_FILENAME = 'drug_info'
# print(f'Loading drug metadata ... {DRUG_META_FILENAME}')
# dmeta = pd.read_table(os.path.join(DATADIR, DRUG_META_FILENAME), dtype=object)
# dmeta['PUBCHEM'] = 'PubChem.CID.' + dmeta['PUBCHEM']
# dmeta.insert(loc=0, column='SOURCE', value=dmeta['ID'].map(lambda x: x.split('.')[0].lower()))
# dmeta.rename(columns={'ID': 'DRUG'}, inplace=True)
# print(f'dmeta.shape {dmeta.shape}')


" We don't need drug meta with the new descriptors file. "

### Preview the loaded tables (first rows)

In [11]:
# First two response rows
rsp.head(2)

Unnamed: 0,SOURCE,CELL,DRUG,STUDY,AUC,IC50,EC50,EC50se,R2fit,Einf,HS,AAC1,AUC1,DSS1
0,ccle,CCLE.1321N1,CCLE.1,fake_exp,0.833,4.883,5.183,0.5746,0.9864,0.2017,0.7463,0.1017,0.8983,0.0551
1,ccle,CCLE.1321N1,CCLE.10,fake_exp,0.7909,5.217,5.217,2.284,0.6526,0.0,0.8993,0.1136,0.8864,0.0728


In [12]:
# First two rows, first five columns of the expression table
rna.head(2).iloc[:, :5]

Unnamed: 0,CELL,cell_rna.AARS,cell_rna.ABCB6,cell_rna.ABCC5,cell_rna.ABCF1
0,CCLE.22RV1,8.31,7.17,4.12,5.64
1,CCLE.2313287,8.94,6.3,3.83,6.6


In [13]:
# First two rows, first five columns of the descriptor table
dsc.head(2).iloc[:, :5]

Unnamed: 0,DRUG,drug_dsc.MW,drug_dsc.AMW,drug_dsc.Sv,drug_dsc.Se
0,CCLE.10,475.399994,8.804,34.717999,54.522999
1,CCLE.11,349.470001,7.132,30.761999,48.796001


In [14]:
# dmeta.iloc[:2]

In [15]:
# ========================================================================
#   Merge the data
# ========================================================================
# Data tables: rsp, cmeta, rna, dsc
# (rsp, cmeta): on 'CELL'
# (rsp, rna):   on 'CELL'
# (rsp, dsc):   on 'DRUG'
# These notes are comments rather than bare string literals so that the cell
# does not echo a stray string as its output.
print('==========================')
print('... Start merging data ...')
print('==========================')

# dmeta is not needed with the new descriptors file, which is merged on DRUG
# directly. The old step, which added the 'PUBCHEM' drug field to rsp for the
# descriptor merge, is kept below for reference.
# print('\nMerge response (rsp) with drug metadata (dmeta) on DRUG in order to add PUBCHEM (required for descriptors) ...')
# print(f'rsp.shape   {rsp.shape}')
# print(f'dmeta.shape {dmeta.shape}')
# rsp = pd.merge(rsp, dmeta[['DRUG', 'PUBCHEM']], on='DRUG', how='left')
# print(f'rsp.shape   {rsp.shape}')
# print('NA values after merging rsp and dmeta: \n{}'.format(rsp[['DRUG', 'PUBCHEM']].isna().sum()))
# print('')
# print(rsp.groupby('SOURCE').agg({'DRUG': 'nunique', 'PUBCHEM': 'nunique'}).reset_index())


... Start merging data ...


' No need dmeta with the new descriptors file. '

In [16]:
# --------------------
# Merge rsp with cmeta
# --------------------
# Left join on CELL: every response row is kept and the selected cell
# metadata columns are attached to it.
cmeta_cols = ['CELL', 'core_str', 'csite', 'ctype', 'simplified_csite', 'simplified_ctype']

print('\nMerge response (rsp) and cell metadata (cmeta) ...')
print(f'rsp.shape   {rsp.shape}')
print(f'cmeta.shape {cmeta.shape}')
rsp1 = rsp.merge(cmeta[cmeta_cols], how='left', on='CELL')
print(f'rsp1.shape  {rsp1.shape}')
print('')
# Unique cell lines per source after the merge
print(rsp1.groupby('SOURCE')['CELL'].nunique().reset_index())
# Release the pre-merge frame (re-running this cell then requires re-running
# the cells that build rsp)
del rsp



Merge response (rsp) and cell metadata (cmeta) ...
rsp.shape   (4479304, 14)
cmeta.shape (2917, 20)
rsp1.shape  (4479304, 19)

        SOURCE  CELL
0         ccle   499
1  chempartner    49
2         ctrp   885
3         gcsi   408
4         gdsc  1074
5        nci60    59
6          scl    65
7         sclc    70


In [17]:
# --------------
# Merge with rna
# --------------
# Inner join on CELL: response rows without expression data are dropped.
print('\nMerge with expression (rna) ...')
print(f'rsp1.shape {rsp1.shape}')
print(f'rna.shape  {rna.shape}')
rsp2 = rsp1.merge(rna, how='inner', on='CELL')
print(f'rsp2.shape {rsp2.shape}')
print('')
# Unique cell lines per source after the merge
print(rsp2.groupby('SOURCE')['CELL'].nunique().reset_index())
# Release the previous stage
del rsp1



Merge with expression (rna) ...
rsp1.shape (4479304, 19)
rna.shape  (2917, 943)
rsp2.shape (4296966, 961)

        SOURCE  CELL
0         ccle   469
1  chempartner    49
2         ctrp   810
3         gcsi   356
4         gdsc   669
5        nci60    59


In [18]:
# --------------
# Merge with dsc
# --------------
# Inner join on DRUG: response rows without descriptors are dropped.
print('\nMerge with descriptors (dsc) ...')
print(f'rsp2.shape {rsp2.shape}')
print(f'dsc.shape  {dsc.shape}')
#data = pd.merge(rsp2, dsc, on='PUBCHEM', how='inner')
data = rsp2.merge(dsc, how='inner', on='DRUG')
print(f'data.shape {data.shape}')
print('')
# Unique cell lines and drugs per source after the merge
per_source = data.groupby('SOURCE').agg({'CELL': 'nunique', 'DRUG': 'nunique'})
print(per_source.reset_index())
# Release the previous stage
del rsp2



Merge with descriptors (dsc) ...
rsp2.shape (4296966, 961)
dsc.shape  (1801, 2821)
data.shape (704035, 3781)

        SOURCE  CELL  DRUG
0         ccle   469    24
1  chempartner    49    37
2         ctrp   810   495
3         gcsi   356    16
4         gdsc   669   239
5        nci60    59  1006


In [19]:
# First three rows, first four columns of the merged table
data.head(3).iloc[:, :4]

Unnamed: 0,SOURCE,CELL,DRUG,STUDY
0,ccle,CCLE.22RV1,CCLE.1,fake_exp
1,ccle,CCLE.42MGBA,CCLE.1,fake_exp
2,ccle,CCLE.5637,CCLE.1,fake_exp


In [20]:
# Work on an independent (deep) copy so that `data` itself stays untouched
df = data.copy(deep=True)

In [21]:
# Keep only the rows that have a value for the target column
print(df.shape)
target_name = 'AUC'
has_target = df[target_name].notna()
df = df[has_target]
print(df.shape)

(704035, 3781)
(704034, 3781)
