In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
from pathlib import Path
import sys
from time import time
import numpy as np
import pandas as pd

import sklearn
from collections import OrderedDict
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# NOTE(review): SEED is None and is not referenced in the visible cells; the shuffle of
# `res` further down hard-codes random_state=42 instead. Consider one shared seed.
SEED = None

In [2]:
# Input locations, given relative to the directory the notebook is run from.
datadir = Path('../../data/yitan/Data')  # holds the pickled data file loaded below
ccl_folds_dir = Path('../../data/yitan/CCL_10Fold_Partition')  # <source>/cv_<fold> id lists (used below)
pdm_folds_dir = Path('../../data/yitan/PDM_10Fold_Partition')  # not used in the visible cells

In [3]:
import _pickle as cp

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
# Three consecutive loads read three objects from the same stream, in order:
# response table, genomic features, drug features.
# The `with` block closes the file even if one of the loads raises.
with open(datadir/'CCL_PDM_TransferLearningData_rmFactor_0.0_ddNorm_std.pkl', 'rb') as pkl_file:
    res = cp.load(pkl_file)
    genomics = cp.load(pkl_file)
    drug = cp.load(pkl_file)

In [4]:
# Report the (rows, columns) shape of each loaded table; labels are padded to align.
for label, frame in (('res     ', res), ('genomics', genomics), ('drug    ', drug)):
    print(label, frame.shape)

res      (708662, 5)
genomics (1430, 4582)
drug     (1402, 4392)


In [5]:
def cnt_feas(df, sep='_'):
    """ Print how many columns fall under each feature-type prefix.

    The prefix of a column is the part of its name before the first `sep`.

    Args:
        df: object whose `columns` yields string column names.
        sep: delimiter separating the prefix from the rest of the name.

    Returns:
        None. The {prefix: count} dict is printed, keyed in order of first
        appearance.
    """
    counts = {}
    for col in df.columns:
        prefix = col.split(sep)[0]
        counts[prefix] = counts.get(prefix, 0) + 1
    print(counts)

In [6]:
# Print the per-prefix column counts for the genomic and the drug feature tables.
for feature_df in (genomics, drug):
    cnt_feas(feature_df, sep='_')

{'geneGE': 1927, 'c2cpMaxGE': 1328, 'c2cpMinGE': 1327}
{'DD': 2344, 'ECFP': 1024, 'PFP': 1024}


In [7]:
# Show the first two rows of each table; display() renders its arguments one after another.
display(res[:2], genomics[:2], drug[:2])

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID,area_under_curve,groupID
0,CCLE,CCL_61,Drug_11,0.7153,0.0
1,CCLE,CCL_61,Drug_1,0.9579,0.9164


Unnamed: 0,geneGE_AARS,geneGE_ABCB6,geneGE_ABCC5,geneGE_ABCF1,geneGE_ABCF3,geneGE_ABHD4,geneGE_ABHD6,geneGE_ABI1,geneGE_ABL1,geneGE_ABL2,...,c2cpMinGE_NABA_COLLAGENS,c2cpMinGE_NABA_ECM_GLYCOPROTEINS,c2cpMinGE_NABA_ECM_REGULATORS,c2cpMinGE_NABA_ECM_AFFILIATED,c2cpMinGE_NABA_PROTEOGLYCANS,c2cpMinGE_NABA_SECRETED_FACTORS,c2cpMinGE_NABA_CORE_MATRISOME,c2cpMinGE_NABA_MATRISOME_ASSOCIATED,c2cpMinGE_NABA_BASEMENT_MEMBRANES,c2cpMinGE_NABA_MATRISOME
CCL_1,-0.125161,-0.400237,-0.960208,0.575207,-0.468406,-0.136257,0.083319,-0.351853,0.41738,0.851592,...,-0.791218,0.808121,0.194184,-0.926096,-0.166554,0.226442,0.397394,-0.212669,1.256016,-0.119846
CCL_10,-0.217106,0.354776,-1.164841,0.328071,-0.735267,0.23299,-0.174979,0.073205,-0.424111,0.435865,...,0.627679,0.399675,0.703399,0.444488,0.552236,-0.118246,0.490652,0.493968,-0.277873,0.594839


Unnamed: 0,DD_MW|num,DD_AMW|num,DD_Sv|num,DD_Se|num,DD_Sp|num,DD_Si|num,DD_Mv|num,DD_Me|num,DD_Mp|num,DD_Mi|num,...,PFP_1015|int,PFP_1016|int,PFP_1017|int,PFP_1018|int,PFP_1019|int,PFP_1020|int,PFP_1021|int,PFP_1022|int,PFP_1023|int,PFP_1024|int
Drug_1,0.123446,0.526234,-0.07218,-0.088861,-0.05846,-0.0831,-0.009539,-0.082688,0.154432,0.23353,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
Drug_10,0.053188,1.9661,-0.333843,-0.379081,-0.359584,-0.398841,1.172374,1.412555,0.955882,0.357831,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


## Get batch

In [8]:
src = 'CCLE'  # matched against res['SOURCE'] below and used as the fold sub-folder name
fold = 0  # selects the cv_<fold> directory
path = ccl_folds_dir/f'{src}/cv_{fold}' # folder the TrainList/ValList/TestList .txt files are read from

In [9]:
# Each list file is read as a header-less table. squeeze('columns') collapses a
# one-column table to a Series even when the file has a single row; a bare
# squeeze() would return a scalar in that case and the `.values` access would fail.
tr_id = pd.read_csv(path/'TrainList.txt', header=None).squeeze('columns').values
vl_id = pd.read_csv(path/'ValList.txt', header=None).squeeze('columns').values
te_id = pd.read_csv(path/'TestList.txt', header=None).squeeze('columns').values

In [10]:
# Number of ids in each split, and the share of all ids that fall in the training split.
tr_sz, vl_sz, te_sz = (len(ids) for ids in (tr_id, vl_id, te_id))
sz = sum((tr_sz, vl_sz, te_sz))
tr_sz / sz

0.7974683544303798

In [11]:
# Keep only the rows of the selected source, then shuffle them with a fixed seed
# and renumber the index. The shape is printed before and after.
print(res.shape)
res = (
    res.loc[res['SOURCE'].isin([src])]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
print(res.shape)

(708662, 5)
(10971, 5)


In [13]:
# Peek at the first three rows of the filtered, shuffled response table.
res.head(3)

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID,area_under_curve,groupID
0,CCLE,CCL_721,Drug_23,0.8402,0.3803
1,CCLE,CCL_1029,Drug_16,0.9736,0.9583
2,CCLE,CCL_838,Drug_20,0.7968,0.0


In [28]:
# Take the first 32 rows as one batch and count the distinct cell lines in it.
res_b = res.iloc[:32]
len(res_b['ccl_name'].unique())

32

In [32]:
# Name the index first, then move it into a column. This produces the merge-key
# column whether or not the index already carries a name (reset_index() only
# creates a column called 'index' when the index is unnamed).
gen_b = genomics.rename_axis('ccl_name').reset_index()
drug_b = drug.rename_axis('ctrpDrugID').reset_index()

In [33]:
# Attach the genomic features to each batch row via the cell-line name (inner join by default).
mrg = res_b.merge(gen_b, on='ccl_name')

In [35]:
# Rebuild from res_b rather than from the previous value of `mrg`, so re-running
# this cell does not merge the drug features a second time (which would produce
# suffixed duplicate columns). Both joins use pandas' default inner merge.
mrg = res_b.merge(gen_b, on='ccl_name').merge(drug_b, on='ctrpDrugID')

In [37]:
# Preview the first five rows of the merged response + genomic + drug table.
mrg.head(n=5)

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID,area_under_curve,groupID,geneGE_AARS,geneGE_ABCB6,geneGE_ABCC5,geneGE_ABCF1,geneGE_ABCF3,...,PFP_1015|int,PFP_1016|int,PFP_1017|int,PFP_1018|int,PFP_1019|int,PFP_1020|int,PFP_1021|int,PFP_1022|int,PFP_1023|int,PFP_1024|int
0,CCLE,CCL_721,Drug_23,0.8402,0.3803,0.96781,1.036133,-0.302806,-0.382533,-2.022989,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
1,CCLE,CCL_847,Drug_23,0.8238,0.0,-0.6994,-0.987664,0.558943,0.068202,0.948403,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
2,CCLE,CCL_1029,Drug_16,0.9736,0.9583,0.557073,-0.617276,-0.666227,0.489379,-0.60275,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,CCLE,CCL_838,Drug_20,0.7968,0.0,-1.311603,-1.600102,1.332247,-0.048945,0.7941,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
4,CCLE,CCL_309,Drug_20,0.7725,0.0,-0.807465,0.632148,-0.717308,-0.35584,0.233752,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
