# We aim to explore how learning curves behave as a function of drug and cell diversity

In [6]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
import sys
from pathlib import Path

import sklearn
import numpy as np
import pandas as pd
from glob import glob

import matplotlib
import matplotlib.pyplot as plt

# Make all python scripts available in the path.
# '../' is a relative entry, so it is resolved against the current working
# directory when the imports run; presumably the project modules imported below
# live in the notebook's parent directory -- TODO confirm.
sys.path.append('../')

import pp_utils
import lrn_crv_plot
from build_tidy_data import load_dsc, load_rna, load_rsp
# from keras.models import load_model
%matplotlib inline

# Capture the kernel's current working directory; the path settings defined
# further down are built relative to it.
filepath = Path.cwd()
print(filepath)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/vol/ml/apartin/projects/LearningCurves/notebooks


In [7]:
# NOTE(review): relative data directory. The visible loading cells use DATADIR
# and never reference `datapath` -- confirm whether it is still needed.
datapath = Path('./data')

In [16]:
# Default settings: input/output locations, expressed relative to the
# notebook's working directory (`filepath`).
DATADIR = filepath.joinpath('../data')
OUTDIR = filepath.joinpath('../')

# Response data
RSP_FILENAME = 'combined_single_response_agg'

# Drug descriptors data (new)
DSC_FILENAME = 'pan_drugs_dragon7_descriptors.tsv'

# Metadata tables; both are read from DATADIR as tab-separated files.
DRUG_META_FILENAME = 'drug_info'
CELL_META_FILENAME = 'combined_cancer_types'

In [17]:
# Settings
na_values = ['na', '-', '']
fea_prfx_dct = {'rna': 'GE_', 'cnv': 'CNV_', 'snp': 'SNP_',
                'dsc': 'DD_', 'fng': 'FNG_'}

prfx_dtypes = {'rna': np.float32, 'cnv': np.int8, 'snp': np.int8,
               'dsc': np.float32, 'fng': np.int8}

In [26]:
# Load the response, RNA-Seq and drug-descriptor tables through the project
# loaders (each one prints its own progress log).
rsp = load_rsp(RSP_FILENAME, src=None)
rna = load_rna(DATADIR, rna_norm='raw', float_type=prfx_dtypes['rna'])
dsc = load_dsc(DSC_FILENAME, float_type=prfx_dtypes['dsc'])

# Cell metadata: tab-separated file without a header row, so the two column
# names are supplied explicitly.
cmeta = pd.read_csv(
    DATADIR / CELL_META_FILENAME,
    sep='\t',
    header=None,
    names=['CELL', 'CANCER_TYPE'],
)

# Drug metadata: tab-separated file; ID/NAME/CLEAN_NAME are renamed so the
# identifier column is called 'DRUG', the key used when merging.
dmeta = pd.read_csv(DATADIR / DRUG_META_FILENAME, sep='\t').rename(
    columns={'ID': 'DRUG', 'NAME': 'DRUG_NAME', 'CLEAN_NAME': 'DRUG_CLEAN_NAME'}
)


Load response from ... /vol/ml/apartin/projects/LearningCurves/data/combined_single_response_agg

Drop samples with low R2fit ...
Dropped 429932 rsp data points.
rsp.shape (4054148, 13)

Extract specific sources.
rsp.shape (4054148, 13)

Load RNA-Seq ... {datadir / fname}
Impute NA values ...
Cols with missing values (before impute): 0
rna.shape (2917, 943)

Load drug descriptors ... /vol/ml/apartin/projects/LearningCurves/data/pan_drugs_dragon7_descriptors.tsv
Drop descriptors with too many NA values ...
dsc.shape (1801, 3838)
Drop descriptors that have a single unique value (excluding NAs) ...
dsc.shape (1801, 2821)
Impute NA values ...
Cols with missing values (before impute): 1196
Cols with missing values (after impute): 0
dsc.shape (1801, 2821)


In [27]:
# Build the combined table starting from the response rows:
#   - cell and drug metadata are attached with left joins (response rows kept
#     even when no metadata matches);
#   - RNA-Seq and descriptor features are attached with inner joins (rows
#     without matching features are dropped).
data = (
    rsp
    .merge(cmeta, on='CELL', how='left')
    .merge(dmeta, on='DRUG', how='left')
    .merge(rna, on='CELL', how='inner')
    .merge(dsc, on='DRUG', how='inner')
)
# print(sys.getsizeof(data)/1e9)

In [31]:
# Keep every column of `data` whose name does not contain 'DD_'.
# NOTE(review): this is a substring test, not a prefix test -- a column with
# 'DD_' anywhere in its name is excluded too; confirm that is intended.
subset_cols = [col for col in data.columns if 'DD_' not in col]

In [36]:
# Flag rows of `data` that repeat an earlier row when compared on
# `subset_cols` only (pandas' default keep='first' leaves the first
# occurrence unflagged). The result is a boolean Series aligned with `data`.
# Earlier variants of this check, kept for reference:
# subset_cols = dsc.columns[1:]
# data.duplicated(subset=subset_cols).sum()
tmp = data.duplicated(subset=subset_cols)