In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
from pathlib import Path
import sys
from time import time
import numpy as np
import pandas as pd

import sklearn
from collections import OrderedDict
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

SEED = 42

# Load data

In [2]:
datadir = Path('../../data/yitan/Data')
ccl_folds_dir = Path('../../data/yitan/CCL_10Fold_Partition')
pdm_folds_dir = Path('../../data/yitan/PDM_10Fold_Partition')

In [3]:
# Un-pickle files
import _pickle as cp

pkl_file = open(datadir/'CCL_PDM_TransferLearningData_rmFactor_0.0_ddNorm_std.pkl', 'rb')

res = cp.load(pkl_file)
genomics = cp.load(pkl_file)
drug = cp.load(pkl_file)

pkl_file.close()

In [4]:
print('res     ', res.shape)
print('genomics', genomics.shape)
print('drug    ', drug.shape)

res      (708662, 5)
genomics (1430, 4582)
drug     (1402, 4392)


# First look at the data

In [5]:
display(res[:2])
display(genomics.iloc[:2, :7])
display(drug.iloc[:2, :7])

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID,area_under_curve,groupID
0,CCLE,CCL_61,Drug_11,0.7153,0.0
1,CCLE,CCL_61,Drug_1,0.9579,0.9164


Unnamed: 0,geneGE_AARS,geneGE_ABCB6,geneGE_ABCC5,geneGE_ABCF1,geneGE_ABCF3,geneGE_ABHD4,geneGE_ABHD6
CCL_1,-0.125161,-0.400237,-0.960208,0.575207,-0.468406,-0.136257,0.083319
CCL_10,-0.217106,0.354776,-1.164841,0.328071,-0.735267,0.23299,-0.174979


Unnamed: 0,DD_MW|num,DD_AMW|num,DD_Sv|num,DD_Se|num,DD_Sp|num,DD_Si|num,DD_Mv|num
Drug_1,0.123446,0.526234,-0.07218,-0.088861,-0.05846,-0.0831,-0.009539
Drug_10,0.053188,1.9661,-0.333843,-0.379081,-0.359584,-0.398841,1.172374


In [6]:
res.groupby('SOURCE').agg({'ccl_name': 'nunique', 'ctrpDrugID': 'nunique'}).reset_index()

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID
0,CCLE,474,24
1,CTRP,812,494
2,GDSC,670,238
3,NCI60,59,987
4,PDM,473,18
5,gCSI,357,16


# Update dfs

In [7]:
gen = genomics
drg = drug

In [8]:
# Bring in the row index
res = res.reset_index()
res = res.rename(columns={'index': 'idx'})
res[:2]

Unnamed: 0,idx,SOURCE,ccl_name,ctrpDrugID,area_under_curve,groupID
0,0,CCLE,CCL_61,Drug_11,0.7153,0.0
1,1,CCLE,CCL_61,Drug_1,0.9579,0.9164


# Where are the genomic features coming from?
Gene expression is coming from CCLE or NCI60.

In [9]:
# gen_ = genomics.reset_index().rename(columns={'index': 'ccl_name'})

# display(gen_.iloc[:2, :5])
# display(res[:2])

In [10]:
# aa = pd.merge(gen_[['ccl_name']], res[['SOURCE', 'ccl_name']], on='ccl_name', how='inner')
# # aa = aa.drop_duplicates().reset_index(drop=True)
# print(aa.shape)
# print(aa['SOURCE'].value_counts())

# bb = aa[aa['ccl_name']=='CCL_1']['SOURCE'].value_counts()
# print(bb)

In [11]:
# del gen_, aa, bb

# What features are available?

In [12]:
# def cnt_fea(df, fea_sep='_', verbose=True):
#     """ Count the number of features per feature type. """
#     dct = {}
#     unq_prfx = df.columns.map(lambda x: x.split(fea_sep)[0]).unique() # unique feature prefixes
#     for prfx in unq_prfx:
#         fea_type_cols = [c for c in df.columns if (c.split(fea_sep)[0]) in prfx] # all fea names of specific type
#         dct[prfx] = len(fea_type_cols)
#     if verbose: print(dct)
#     return dct

# cnt_fea(gen, fea_sep='_');
# cnt_fea(drg, fea_sep='_');

In [13]:
# def extract_subset_fea(df, fea_list, fea_sep='_'):
#     """ Extract features based feature prefix name. """
#     fea = [c for c in df.columns if (c.split(fea_sep)[0]) in fea_list]
#     df = df[fea]
#     return df

# tmp = extract_subset_fea(gen, fea_list=['geneGE', 'c2cpMaxGE'], fea_sep='_')
# cnt_fea(tmp, fea_sep='_', verbose=True);

In [14]:
# def extract_unq_fea_dfs(df, fea_sep='_'):
#     """ Generate dict where each element is a separate df with unique feature type. """
#     dct_fea_prfx = cnt_fea(df, fea_sep=fea_sep)
#     dct_dfs = {}
#     for k in dct_fea_prfx.keys():
#         fea_type_cols = [c for c in df.columns if (c.split(fea_sep)[0]) in k]
#         dct_dfs[k] = df[fea_type_cols]
#     return dct_dfs

# # Genomic dfs
# gen_dct = extract_unq_fea_dfs(gen, fea_sep='_')
# display(gen_dct['geneGE'].shape)
# display(gen_dct['c2cpMaxGE'].shape)
# display(gen_dct['c2cpMinGE'].shape)

# # Drug dfs
# drg_dct = extract_unq_fea_dfs(drg, fea_sep='_')
# display(drg_dct['DD'].shape)
# display(drg_dct['ECFP'].shape)
# display(drg_dct['PFP'].shape)

## Some functions for the pytroch dataset

In [15]:
# res_ = res.copy()
# gen_ = gen.copy()
# drg_ = drg.copy()

# # Extract specific fea types
# gen_fea_list = ['geneGE']
# drg_fea_list = ['DD']

# cnt_fea(gen_, fea_sep='_', verbose=True);
# gen_ = extract_subset_fea(df=gen_, fea_list=gen_fea_list, fea_sep='_')
# cnt_fea(gen_, fea_sep='_', verbose=True);

# cnt_fea(drg_, fea_sep='_', verbose=True);
# drg_ = extract_subset_fea(df=drg, fea_list=drg_fea_list, fea_sep='_')
# cnt_fea(drg_, fea_sep='_', verbose=True);

In [16]:
# # Bring the labels in
# gen_ = gen_.reset_index().rename(columns={'index': 'ccl_name'})
# drg_ = drg_.reset_index().rename(columns={'index': 'ctrpDrugID'})

# # Extract src and fold
# src = 'CCLE'
# fold = 0
# path = ccl_folds_dir/f'{src}/cv_{fold}' # 'TestList.txt'

# tr_id = pd.read_csv(path/'TrainList.txt', header=None).squeeze().values
# vl_id = pd.read_csv(path/'ValList.txt', header=None).squeeze().values
# te_id = pd.read_csv(path/'TestList.txt', header=None).squeeze().values

# # Show how much is left for train, val, and test
# tr_sz, vl_sz, te_sz = len(tr_id), len(vl_id), len(te_id)
# sz = tr_sz + vl_sz + te_sz
# print(tr_sz/sz)

# # Retain specific source and shuffle
# print(res_.shape)
# res_ = res_[ res['SOURCE'].isin([src]) ]
# # res_ = res_.sample(frac=1.0, random_state=42).reset_index(drop=True)
# print(res_.shape)

In [17]:
# # Merge data
# len(res_.ccl_name.unique())

# mrg = pd.merge(res_, gen_, on='ccl_name', how='inner')
# mrg = pd.merge(mrg, drg_, on='ctrpDrugID', how='inner')
# print(mrg.shape)
# display(mrg[:2])

In [18]:
# # Get tr, vl, and te sets
# tr_df = mrg[mrg['ccl_name'].isin(tr_id)]
# vl_df = mrg[mrg['ccl_name'].isin(vl_id)]
# te_df = mrg[mrg['ccl_name'].isin(te_id)]

# print(tr_df.shape)
# print(vl_df.shape)
# print(te_df.shape)

# PyTorch data generator

In [19]:
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [20]:
print(res.shape)
print(gen.shape)
print(drg.shape)

(708662, 6)
(1430, 4582)
(1402, 4392)


In [21]:
# drg_df = drg.copy()
# gen_df = gen.copy()

# drg_fea_dct = {idx: row.values for idx, row in drg_df.iterrows()}
# gen_fea_dct = {idx: row.values for idx, row in gen_df.iterrows()}

# res.values

### Test code of pytroch dataset

In [22]:
# res_df = res.copy()
# drg_df = drg.copy()
# gen_df = gen.copy()

# tr_ph = 'train'
# fold = 0
# src = 'CCLE'
# gen_fea_list = ['geneGE']
# drg_fea_list = ['DD']

# res_df = res_df[ res_df['SOURCE'].isin([src]) ]
# drg_df = extract_subset_fea(drg_df, drg_fea_list, fea_sep='_')
# gen_df = extract_subset_fea(gen_df, gen_fea_list, fea_sep='_')

# path = ccl_folds_dir/f'{src}/cv_{fold}'

# res_arr = res_df.values
# drg_fea_dct = {idx: row.values for idx, row in drg_df.iterrows()}
# gen_fea_dct = {idx: row.values for idx, row in gen_df.iterrows()}

## Pytorch dataset

In [23]:
class CCLDataset(Dataset):
    # discuss.pytorch.org/t/data-processing-as-a-batch-way/14154
    # github.com/utkuozbulak/pytorch-custom-dataset-examples#incorporating-pandas
    # nbviewer.jupyter.org/github/FraPochetti/KagglePlaygrounds/blob/master/NYC%20Taxi%20Fares%20Prediction.ipynb
    
    def __init__(self,
                 res_df: pd.DataFrame,
                 gen_df: pd.DataFrame,
                 drg_df: pd.DataFrame,
                 ccl_folds_dir: str,
                 src: str,
                 fold: int,
                 tr_ph: str,
                 gen_fea_list: list=None,
                 drg_fea_list: list=None,
                 fea_sep: str='_',
                 drg_dsc_preproc: str=None,   # TODO
                 cell_rna_preproc: str=None,  # TODO
                 verbose: bool=True):
        """ 
        Args:
            res_df (pd.DataFrame) : drug response df
            gen_df (pd.DataFrame) : genomics feature df
            drg_df (pd.DataFrame) : drug feature df
            ccl_folds_dir (str) : folder path that contains cv patitions in text files
            src (str) : source name
            fold (int) : fold index
            tr_ph (str) : training phase ('tr', 'vl', 'te')
            gen_fea_list (list) : list of prefixes of genomics features to retain
            drg_fea_list (list) : list of prefixes of drug features to retain
            fea_sep (str) : separator of feature prefix that indicates type and feature name
            drg_dsc_preproc (str) : TODO: not implemented
            cell_rna_preproc (str) : TODO: not implemented
            verbose : bool=True
        """
        
        # ============================================
        # Initialize
        # ============================================
        self.ccl_folds_dir = ccl_folds_dir
        self.src = src
        self.fold = fold
        self.tr_ph = tr_ph.lower()
        self.gen_fea_list = gen_fea_list
        self.drg_fea_list = drg_fea_list
        self.fea_sep = fea_sep
        self.drg_dsc_preproc = None
        self.cell_rna_preproc = None

        # ============================================
        # Get the ccl names
        # ============================================
        if self.tr_ph in ['tr', 'train', 'training']:
            self.ids_fname = 'TrainList.txt'
        elif self.tr_ph in ['vl', 'val', 'validation']:
            self.ids_fname = 'ValList.txt'
        elif self.tr_ph in ['te', 'test', 'testing']:
            self.ids_fname = 'TestList.txt'
        else:
            raise ValueError('Wrong `tr_ph` specified.')
            
        self.ids_path = self.ccl_folds_dir/f'{self.src}/cv_{self.fold}'/self.ids_fname # 'TestList.txt'        
        self.ids_list = pd.read_csv(self.ids_path, header=None).squeeze().values
        
        # ============================================
        # Load dfs
        # ============================================
        res_df = res_df[ res_df['SOURCE'].isin([src]) ]
        self.res_df = res_df[ res_df['ccl_name'].isin( self.ids_list ) ]
        
        self.gen_df = drg_df if self.gen_fea_list is None else self.extract_subset_fea(gen_df, fea_list=self.gen_fea_list, fea_sep=self.fea_sep)
        self.drg_df = drg_df if self.drg_fea_list is None else self.extract_subset_fea(drg_df, fea_list=self.drg_fea_list, fea_sep=self.fea_sep)
        
        # ============================================
        # Public attributes
        # ============================================
        self.cells = self.res_df['ccl_name'].unique().tolist()
        self.drugs = self.res_df['ctrpDrugID'].unique().tolist()
        self.num_records = len(self.res_df)
        self.gen_dim = self.gen_df.shape[1]
        self.drg_dim = self.drg_df.shape[1]
        
        self.gen_fea_cnt = self.cnt_fea(self.gen_df, fea_sep=self.fea_sep, verbose=False)
        self.drg_fea_cnt = self.cnt_fea(self.drg_df, fea_sep=self.fea_sep, verbose=False)
        
        # ============================================
        # Convert dfs to arrays and dict for faster access
        # ============================================
        self.res_arr = self.res_df.values
        self.gen_fea_dct = {idx: row.values for idx, row in self.gen_df.iterrows()}
        self.drg_fea_dct = {idx: row.values for idx, row in self.drg_df.iterrows()}

        # ============================================
        # Summary
        # ============================================
        if verbose:
            print('=' * 80)
            print(f'Data source: {self.src}')
            print(f'Phase: {tr_ph}')
            print(f'Data points: {self.num_records}')
            print(f'Unique cells: {len(self.cells)}')
            print(f'Unique drugs: {len(self.drugs)}')
            print(f'gen_df.shape: {self.gen_df.shape}')
            print(f'drg_df.shape: {self.drg_df.shape}')
            print(f'Genomic features: {self.drg_fea_cnt}')
            print(f'Drug features:    {self.gen_fea_cnt}')
            

        
    def __len__(self):
        return len(self.res_arr)

    
    def __getitem__(self, index):
        """ 
        Ref: github.com/xduan7/UnoPytorch/blob/master/utils/datasets/drug_resp_dataset.py
        Look for __getitem__ in DrugRespDataset
        
        res indices: [idx, SOURCE, ccl_name, ctrpDrugID, area_under_curve, groupID]
        """
        res = self.res_arr[index]
        
        idx = res[0]
        src = res[1]
        ccl_id = res[2]
        drg_id = res[3]
        auc = res[4]
        
        ccl_fea = self.gen_fea_dct[ccl_id]
        drg_fea = self.drg_fea_dct[drg_id]
        
        # Cast values
        ccl_fea = ccl_fea.astype(np.float32)
        drg_fea = drg_fea.astype(np.float32)
                
        return idx, src, ccl_id, drg_id, auc, ccl_fea, drg_fea
    
    
    def extract_subset_fea(self, df, fea_list, fea_sep='_'):
        """ Extract features based feature prefix name. """
        fea = [c for c in df.columns if (c.split(fea_sep)[0]) in fea_list]
        df = df[fea]
        return df    
    
    
    def cnt_fea(self, df, fea_sep='_', verbose=True):
        """ Count the number of features per feature type. """
        dct = {}
        unq_prfx = df.columns.map(lambda x: x.split(fea_sep)[0]).unique() # unique feature prefixes
        for prfx in unq_prfx:
            fea_type_cols = [c for c in df.columns if (c.split(fea_sep)[0]) in prfx] # all fea names of specific type
            dct[prfx] = len(fea_type_cols)
        if verbose: print(dct)
        return dct

In [24]:
# Define datasets
gen_fea_list = ['geneGE']
drg_fea_list = ['DD']

ds_kwargs = {
    'res_df': res,
    'gen_df': gen,
    'drg_df': drg,
    'ccl_folds_dir': ccl_folds_dir,
    'src': 'CCLE',
    'gen_fea_list': gen_fea_list,
    'drg_fea_list': drg_fea_list,
    'fea_sep': '_'}

fold = 0
tr_ds = CCLDataset(tr_ph = 'tr', fold=fold, **ds_kwargs)
vl_ds = CCLDataset(tr_ph = 'vl', fold=fold, **ds_kwargs)
te_ds = CCLDataset(tr_ph = 'te', fold=fold, **ds_kwargs)

Data source: CCLE
Phase: tr
Data points: 8755
Unique cells: 378
Unique drugs: 24
gen_df.shape: (1430, 1927)
drg_df.shape: (1402, 2344)
Genomic features: {'DD': 2344}
Drug features:    {'geneGE': 1927}
Data source: CCLE
Phase: vl
Data points: 1117
Unique cells: 48
Unique drugs: 24
gen_df.shape: (1430, 1927)
drg_df.shape: (1402, 2344)
Genomic features: {'DD': 2344}
Drug features:    {'geneGE': 1927}
Data source: CCLE
Phase: te
Data points: 1099
Unique cells: 48
Unique drugs: 24
gen_df.shape: (1430, 1927)
drg_df.shape: (1402, 2344)
Genomic features: {'DD': 2344}
Drug features:    {'geneGE': 1927}


## Data loaders
TODO: Need to confirm the rows generated by the loaders are consistent with the rows that I get from merged df!

In [26]:
# Define data loaders
batch_size = 1
num_workers = 1
tr_loader_kwargs = {'batch_size': batch_size, 'shuffle': True, 'num_workers': num_workers}
vl_loader_kwargs = {'batch_size': batch_size, 'shuffle': False, 'num_workers': num_workers} # 4*batch_size
te_loader_kwargs = {'batch_size': batch_size, 'shuffle': False, 'num_workers': num_workers} # 4*batch_size

tr_loader = DataLoader(tr_ds, **tr_loader_kwargs)
vl_loader = DataLoader(vl_ds, **vl_loader_kwargs)
te_loader = DataLoader(te_ds, **te_loader_kwargs)

In [27]:
ret = next(iter(tr_loader))  # (idx, SOURCE, ccl_id, drg_id, auc, ccl_fea, drg_fea)

idx = ret[0].item()
src = ret[1]
cell_id = ret[2]
drug_id = ret[3]
auc = ret[4].item()
cell_fea = ret[5]
drug_fea = ret[6]

print('cell_fea:', cell_fea.shape)
print('drug_fea:', drug_fea.shape)
ret

cell_fea: torch.Size([1, 1927])
drug_fea: torch.Size([1, 2344])


[tensor([3128]),
 ('CCLE',),
 ('CCL_361',),
 ('Drug_15',),
 tensor([0.8596], dtype=torch.float64),
 tensor([[-0.0517,  0.0277, -0.0287,  ..., -0.3625, -1.0558,  1.4393]]),
 tensor([[-0.2059, -0.1298, -0.1793,  ...,  0.0000,  0.0000,  0.0000]])]

In [28]:
# # print( sum(tr_ds.res_df['ccl_name']==ret[2][0]) )
# # print( sum(tr_ds.res_df['ctrpDrugID']==ret[3][0]) )
# # sum( (tr_ds.res_df['ccl_name']==ret[2][0]) & (tr_ds.res_df['ctrpDrugID']==ret[3][0]) )

# tr_ds.res_df.loc[ret[0]]

In [29]:
# # Now, look at the merged dataset
# tmp = tr_df[tr_df['idx']==ret[0].item()]
# display(tmp.iloc[:, :10])

# ge = extract_subset_fea(df=tmp, fea_list=gen_fea_list, fea_sep='_')
# dd = extract_subset_fea(df=tmp, fea_list=drg_fea_list, fea_sep='_')

# print('ge.shape', ge.shape)
# print('dd.shape', dd.shape)

## Define NN
Look at "Constructing and initializing neural networks" in https://github.com/xduan7/UnoPytorch/blob/master/uno_pytorch.py

In [None]:
class NN(nn.Module):
    def __init__(self):
        pass
    
    def forward(self, x):
        pass
        return x
    

In [None]:
nn = NN(ge_dim, dd_dim).to(device)

In [None]:
res_opt = get_opt()

## Training/validation loops
github.com/xduan7/UnoPytorch/blob/master/uno_pytorch.py

In [None]:
max_epochs = 100