# Try different decompositions, specifically those that assume the data is non-negative.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
from pathlib import Path
import sys
from time import time
import numpy as np
import pandas as pd

import sklearn
from collections import OrderedDict
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

SEED = 42

In [2]:
# utils_path = Path(os.getcwd()) / '../../utils'
# sys.path.append(str(utils_path))
# import utils

# Load data

In [3]:
# Input locations, given relative to the notebook's working directory.
# datadir holds the pickled feature file; the PCA CSV dumps are written here too.
datadir = Path('../../data/yitan/Data')
# presumably the 10-fold partition files for CCL and PDM — not used in the cells shown here; TODO confirm
ccl_folds_dir = Path('../../data/yitan/CCL_10Fold_Partition')
pdm_folds_dir = Path('../../data/yitan/PDM_10Fold_Partition')
# Name of the pickle file (inside datadir) that is loaded below.
fea_data_name = 'CCL_PDM_TransferLearningData_rmFactor_0.0_ddNorm_std.pkl'

In [4]:
# Un-pickle files
import _pickle as cp

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
# The file holds three objects pickled back to back, so they must be read in
# this exact order. The `with` block closes the handle even if a load fails.
with open(datadir/fea_data_name, 'rb') as pkl_file:
    res = cp.load(pkl_file)
    ccl = cp.load(pkl_file)
    drg = cp.load(pkl_file)

In [5]:
# Report the shape of each of the three loaded tables, one per line.
print('res: {}\nccl: {}\ndrg: {}'.format(res.shape, ccl.shape, drg.shape))

res: (708662, 5)
ccl: (1430, 4582)
drg: (1402, 4392)


# First look at the data

In [6]:
# Preview the first two rows of each table; the two wide tables are cut to
# their first seven columns.
display(
    res[:2],
    ccl.iloc[:2, :7],
    drg.iloc[:2, :7],
)

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID,area_under_curve,groupID
0,CCLE,CCL_61,Drug_11,0.7153,0.0
1,CCLE,CCL_61,Drug_1,0.9579,0.9164


Unnamed: 0,geneGE_AARS,geneGE_ABCB6,geneGE_ABCC5,geneGE_ABCF1,geneGE_ABCF3,geneGE_ABHD4,geneGE_ABHD6
CCL_1,-0.125161,-0.400237,-0.960208,0.575207,-0.468406,-0.136257,0.083319
CCL_10,-0.217106,0.354776,-1.164841,0.328071,-0.735267,0.23299,-0.174979


Unnamed: 0,DD_MW|num,DD_AMW|num,DD_Sv|num,DD_Se|num,DD_Sp|num,DD_Si|num,DD_Mv|num
Drug_1,0.123446,0.526234,-0.07218,-0.088861,-0.05846,-0.0831,-0.009539
Drug_10,0.053188,1.9661,-0.333843,-0.379081,-0.359584,-0.398841,1.172374


In [7]:
# Number of distinct `ccl_name` and `ctrpDrugID` values per `SOURCE`.
(
    res
    .groupby('SOURCE')
    .agg(dict.fromkeys(['ccl_name', 'ctrpDrugID'], 'nunique'))
    .reset_index()
)

Unnamed: 0,SOURCE,ccl_name,ctrpDrugID
0,CCLE,474,24
1,CTRP,812,494
2,GDSC,670,238
3,NCI60,59,987
4,PDM,473,18
5,gCSI,357,16


# Get subset of features

In [8]:
def cnt_fea(df, fea_sep='_', verbose=True):
    """ Count the number of features (columns) per feature type.

    The feature type of a column is the part of its name that precedes the
    first occurrence of `fea_sep` (e.g. 'geneGE' for 'geneGE_AARS').

    Args:
        df: DataFrame whose column names are strings.
        fea_sep: separator between the feature-type prefix and the rest of
            the column name.
        verbose: if True, print the resulting dict.

    Returns:
        dict mapping each prefix to the number of columns that carry it,
        ordered by first appearance of the prefix in `df.columns`.
    """
    dct = {}
    for col in df.columns:
        prfx = col.split(fea_sep)[0]
        # Count by exact prefix. A substring test would also credit a column
        # to every longer prefix that merely contains its prefix
        # (e.g. 'a_1' would be counted under 'ab').
        dct[prfx] = dct.get(prfx, 0) + 1
    if verbose: print(dct)
    return dct

def extract_subset_fea(df, fea_list, fea_sep='_'):
    """ Keep only the columns whose feature-type prefix appears in fea_list.

    The prefix of a column is the part of its name before the first `fea_sep`.
    Column order follows the order in `df`.
    """
    selected = []
    for col_name in df.columns:
        prefix = col_name.split(fea_sep)[0]
        if prefix in fea_list:
            selected.append(col_name)
    return df[selected]

In [9]:
# Feature-type prefixes (the part of a column name before the first '_')
# to keep from the ccl and drg tables, respectively.
ccl_fea_list = ['geneGE']
drg_fea_list = ['DD']

In [10]:
# Print the per-type column counts of ccl, keep only the columns whose prefix
# is in ccl_fea_list, then print the counts again as a check.
# The trailing ';' suppresses the notebook echo of the returned dict.
cnt_fea(ccl);
ge = extract_subset_fea(ccl, fea_list=ccl_fea_list)
cnt_fea(ge);

{'geneGE': 1927, 'c2cpMaxGE': 1328, 'c2cpMinGE': 1327}
{'geneGE': 1927}


In [11]:
# Print the per-type column counts of drg, keep only the columns whose prefix
# is in drg_fea_list, then print the counts again as a check.
# The trailing ';' suppresses the notebook echo of the returned dict.
cnt_fea(drg);
dd = extract_subset_fea(drg, fea_list=drg_fea_list)
cnt_fea(dd);

{'DD': 2344, 'ECFP': 1024, 'PFP': 1024}
{'DD': 2344}


# PCA

In [12]:
def plt_explained_variance_pca(pca):
    """ Draw the per-component explained-variance ratio of a fitted PCA.

    Opens a new 8x6 figure and plots `pca.explained_variance_ratio_` as a
    dashed line with circle markers over a grid. Returns nothing.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(pca.explained_variance_ratio_, '--o')
    ax.set_xlabel('Principal component', fontsize=14)
    ax.set_ylabel('Variance explained (ratio)', fontsize=14)
    ax.grid(True)
    
def pca_exp_var(pca, k):
    """ Print the summed explained-variance ratio of the first k components. """
    leading_ratios = pca.explained_variance_ratio_[:k]
    total = np.sum(leading_ratios)
    print(total)
    
def dump_pca_data(datadir, pca_x, k, fea_name, index=False):
    """ Save the first k columns of a PCA-transformed DataFrame to CSV.

    The file is written to `<datadir>/<fea_name>_pca<k>.csv`.

    Args:
        datadir: output directory (str or Path).
        pca_x: DataFrame of PCA-transformed data, one column per component.
        k: number of leading columns to write; must satisfy
            1 <= k <= pca_x.shape[1].
        fea_name: prefix used in the output file name.
        index: whether to write the row labels. The default (False) keeps the
            original behavior, which drops the row labels of `pca_x`.

    Raises:
        ValueError: if k is outside the available range. Without this check
            the column slice would silently truncate (or, for a negative k,
            count from the end) and the file name would misstate how many
            components it holds.
    """
    n_avail = pca_x.shape[1]
    if not 1 <= k <= n_avail:
        raise ValueError(f'k must be between 1 and {n_avail}, got {k}')
    outpath = Path(datadir) / f'{fea_name}_pca{k}.csv'
    pca_x.iloc[:, :k].to_csv(outpath, index=index)

### GE (gene expression)

In [13]:
# Row labels and column/file-name prefix used by the GE PCA cells below.
# NOTE: `index` and `fea_name` are reassigned in the DD section; re-run this
# cell before re-running any GE cell out of order.
index = ge.index
fea_name = 'ge'

In [14]:
# Fit a PCA with n components on the GE features and wrap the projected data
# in a DataFrame that keeps the row labels and names the columns
# '<fea_name>_PC1' ... '<fea_name>_PC<n>'.
n = 1400
ge_pca = PCA(n_components=n, random_state=SEED, svd_solver='auto')
ge_pca_x = pd.DataFrame(
    ge_pca.fit_transform(ge),
    index=index,
    columns=[f'{fea_name}_PC{pc}' for pc in range(1, n + 1)],
)

In [15]:
# Explained variance summed over all fitted components
# (k equals the n_components value used when fitting ge_pca).
pca_exp_var(ge_pca, k=1400)

0.9999387036197616


In [16]:
# Print the cumulative explained-variance ratio of the first k GE components
# and write those components to datadir as '<fea_name>_pca<k>.csv'.
# NOTE: fea_name is set in an earlier cell and reassigned in the DD section.
k=120
pca_exp_var(ge_pca, k)
dump_pca_data(datadir, ge_pca_x, k, fea_name=fea_name);

0.7543514715445558


In [17]:
# Print the cumulative explained-variance ratio of the first k GE components
# and write those components to datadir as '<fea_name>_pca<k>.csv'.
# NOTE: fea_name is set in an earlier cell and reassigned in the DD section.
k=550
pca_exp_var(ge_pca, k)
dump_pca_data(datadir, ge_pca_x, k, fea_name=fea_name);

0.9505486040316659


In [18]:
# Print the cumulative explained-variance ratio of the first k GE components
# and write those components to datadir as '<fea_name>_pca<k>.csv'.
# The same k feeds both calls, so the printed ratio always matches the file
# that is written.
# NOTE: fea_name is set in an earlier cell and reassigned in the DD section.
k = 940
pca_exp_var(ge_pca, k)
dump_pca_data(datadir, ge_pca_x, k, fea_name=fea_name);

0.9909181163806056


### DD (drug descriptors)

In [19]:
# Row labels and column/file-name prefix used by the DD PCA cells below.
# NOTE: this overwrites the `index` and `fea_name` values set for the GE
# section; GE cells re-run after this point would pick up the DD values.
index = dd.index
fea_name = 'dd'

In [20]:
# Fit a PCA with n components on the DD features and wrap the projected data
# in a DataFrame that keeps the row labels and names the columns
# '<fea_name>_PC1' ... '<fea_name>_PC<n>'.
n = 1400
dd_pca = PCA(n_components=n, random_state=SEED, svd_solver='auto')
dd_pca_x = pd.DataFrame(
    dd_pca.fit_transform(dd),
    index=index,
    columns=[f'{fea_name}_PC{pc}' for pc in range(1, n + 1)],
)

In [21]:
# Explained variance summed over all fitted components
# (k equals the n_components value used when fitting dd_pca).
pca_exp_var(dd_pca, k=1400)

1.0


In [22]:
# Print the cumulative explained-variance ratio of the first k DD components
# and write those components to datadir as '<fea_name>_pca<k>.csv'.
# NOTE: relies on fea_name having been set in the DD section's first cell.
k=120
pca_exp_var(dd_pca, k)
dump_pca_data(datadir, dd_pca_x, k, fea_name=fea_name);

0.8572525902667022


In [23]:
# Print the cumulative explained-variance ratio of the first k DD components
# and write those components to datadir as '<fea_name>_pca<k>.csv'.
# NOTE: relies on fea_name having been set in the DD section's first cell.
k=500
pca_exp_var(dd_pca, k)
dump_pca_data(datadir, dd_pca_x, k, fea_name=fea_name);

0.9912240461620668
