# 0. Setup

In [2]:
# pip install --user GitPython
# pip install --user rdkit

In [3]:
# Verify we're in the correct working directory
import os
import git 
from pathlib import Path

def get_project_root():
    """Return the working-tree directory of the enclosing git repository.

    The repository is looked up starting from the current directory, with
    parent directories searched as well, and its working-tree directory is
    returned as a ``Path``.
    """
    repo = git.Repo('.', search_parent_directories=True)
    return Path(repo.working_tree_dir)

# Resolve the repository root once and make it the working directory, so the
# relative paths used later in the notebook (e.g. "data/GDSC") resolve from it.
root = get_project_root()

os.chdir(root)
# Last expression of the cell: displays the working directory now in effect.
os.getcwd()

'/Users/seraphinashi/Desktop/DataFusion/DrugResponse_Omics_Molecules'

In [4]:
# NOTE(review): `plot_folder` is not referenced by any visible cell; the
# plotting cells hard-code 'results/images/GDSC/' instead — confirm which
# location is intended.
plot_folder = "images/GDSC/"

## import packages, models, trainers

In [5]:
import argparse
import logging
import sys
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200

import torch
from torch import nn, optim, Tensor
from torch.nn import functional as F
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

# Record the PyTorch version and thread count next to the results.
print('pytorch version:', torch.__version__)
print('orig num threads:', torch.get_num_threads())



pytorch version: 1.13.1
orig num threads: 4


In [6]:
from models import *
from trainers import *
from losses import *
from utils import *
# from cpd_smiles_embed import *

In [7]:
import random
# One seed shared by every random number generator used in the notebook.
seed = 42

# Python-level sources of randomness.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU and CUDA).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN: request deterministic behaviour and switch off benchmarking.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# 1. Prepare dataset

## Load 

In [8]:
# Every input table is a CSV in `data_folder`; the first column of each file
# becomes the row index.
data_folder = "data/GDSC"
csv_files = ("c_data.csv", "c_meta.csv", "d_data.csv", "cdr.csv")

c_data, c_meta, d_data, cdr = (
    pd.read_csv(os.path.join(data_folder, csv_file), index_col=0)
    for csv_file in csv_files
)
# RNAseq_meta['COSMIC_ID'] = RNAseq_meta['COSMIC_ID'].astype(int)

In [9]:
# Dimensions of the response table as loaded, before any filtering.
cdr.shape

(847, 174)

## Prepare data
Skin cancer

In [10]:
# Cancer types kept for this analysis.
c_types = ["SKCM"]

# Restrict the metadata to those types, then keep only the rows of c_data and
# cdr whose index appears among the remaining COSMIC ids.
is_selected_type = c_meta["cancer_type"].isin(c_types)
c_meta = c_meta.loc[is_selected_type]

selected_ids = c_meta["COSMIC_ID"]
c_data = c_data.loc[c_data.index.isin(selected_ids)]
cdr = cdr.loc[cdr.index.isin(selected_ids)]

In [11]:
# Dimensions of the response table after the cancer-type filter.
cdr.shape

(54, 174)

In [12]:
# 1. Prepare c_meta.
# Names of the id and type columns in the raw metadata table.
c_meta_id_col_name = 'COSMIC_ID'
c_meta_type_col_name = 'cancer_type'

# Keep only those two columns, rename them to 'C_ID' / 'C_type', and drop
# rows whose id is missing.
c_meta = c_meta[[c_meta_id_col_name, c_meta_type_col_name]]
c_meta = c_meta.rename(columns = {c_meta_id_col_name:'C_ID', c_meta_type_col_name:'C_type'})
c_meta = c_meta[~c_meta['C_ID'].isnull()]

# get_CCL_meta_codes comes from a star import. It presumably aligns c_meta to
# the c_data row ids and adds the numeric 'code' column used later — TODO
# confirm against its definition.
c_meta, meta_map = get_CCL_meta_codes(c_data.index.values, c_meta)
# Cast the index to string so it matches the string indices of the other tables.
c_meta.index = c_meta.index.astype(str)

print(f"Cancer type coding map: ")
print(meta_map)

Cancer type coding map: 
  C_type  code  count
0   SKCM     0     54


### Get clusters from the previous training run of the model

In [13]:
# Predictions saved by a previous training run; only the cell-line name and
# its assigned cluster are used here.
cdr_hat = pd.read_csv(os.path.join(data_folder, "GDSC_skin_cdr_hat.csv"), index_col = 0)

# One row per distinct (c_name, cluster) pair, with c_name cast to string so
# it can be matched against the string index of c_meta. Built as a single
# chain so the cast is never an assignment into a frame derived by slicing.
cdr_hat_tmp = (
    cdr_hat[['c_name', 'cluster']]
    .drop_duplicates()
    .assign(c_name=lambda frame: frame['c_name'].astype(str))
)

# Attach the previous cluster to every cell line in c_meta. how='left' keeps
# cell lines without a match. The cluster then takes the role of 'C_type'.
c_meta_tmp = c_meta.assign(c_name=c_meta.index.values.astype(str))
c_meta_new = (
    pd.merge(c_meta_tmp, cdr_hat_tmp, on='c_name', how='left')
    .loc[:, ['c_name', 'cluster']]
    .rename(columns={'c_name': 'C_ID', 'cluster': 'C_type'})
)

# Same helper as used for the original c_meta (star import; semantics not
# visible here).
c_meta_new, meta_map_new = get_CCL_meta_codes(c_data.index.values, c_meta_new)
c_meta_new.index = c_meta_new.index.astype(str)

print(f"Cancer type coding map: ")
print(meta_map_new)

Cancer type coding map: 
  C_type  code  count
0     -1     0     17
3      0     1     37


In [14]:
# 2. Prepare c_data.
## Requirement:
##   1. the index (row names) holds the cancer cell line names
# Cast the index to string so it can be matched against cdr and c_meta.
c_data.index = c_data.index.astype(str)
c_data.shape

(54, 5703)

In [15]:
# 3. Prepare d_data.
## Requirement:
##   1. the index (row names) holds the drug names
# The commented-out lines below are an alternative that derives d_data from
# SMILES strings; it is disabled, and d_data is read from CSV instead.
# cpd_smiles = cpd_smiles[['drug_id', 'smiles']]
# cpd_smiles = cpd_smiles.set_index('drug_id')

# d_data = smiles_to_AtonBondDescriptor_PCAembedings(cpd_smiles)
# Cast the index to string so it can be matched against the cdr column names.
d_data.index = d_data.index.astype(str)

d_data.shape

(174, 75)

In [16]:
# 4. Prepare cdr.
## Requirements:
##   1. the index (row names) holds the cancer cell line names
##   2. the columns (column names) hold the drug names
cdr.index = cdr.index.astype("str")

# Keep only the drugs / cell lines present in both tables. The shared labels
# are collected in the order they appear in cdr rather than via a set
# intersection: iteration order of a set of strings changes between
# interpreter sessions, which would make the row/column order (and everything
# trained on it) differ from one kernel restart to the next.
d_names = set(d_data.index)
common_drugs = [d for d in dict.fromkeys(cdr.columns) if d in d_names]
cdr = cdr[common_drugs]
d_data = d_data.loc[common_drugs]

c_names = set(c_data.index)
common_cancers = [c for c in dict.fromkeys(cdr.index) if c in c_names]
cdr = cdr.loc[common_cancers]
c_data = c_data.loc[common_cancers]
c_meta = c_meta.loc[common_cancers]

print(f'cdr shape: {cdr.shape}')
print(f'c_data shape: {c_data.shape}')
print(f'c_meta shape: {c_meta.shape}')
print(f'd_data shape: {d_data.shape}')

cdr shape: (54, 174)
c_data shape: (54, 5703)
c_meta shape: (54, 2)
d_data shape: (174, 75)


# 2. Hyperparameters

In [18]:
# Use the first CUDA device when PyTorch reports one; otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# torch.cuda.set_device(device)
print(device)

cpu


In [20]:
class Train_Args:
    """Container of training options.

    Options are plain class attributes. Item syntax (``args['lr']``,
    ``args['lr'] = x``, ``'lr' in args``) is forwarded to attribute access,
    so assigning through an instance does not modify the class defaults.
    """

    valid_size = 0.2 #@param {type: "float"}

    n_epochs = 200 #@param {type: "integer"}
    batch_size = 50 #@param {type: "integer"}
    lr = 0.01 #@param {type: "float"}

    # Loss weights with the C_ prefix
    C_VAE_loss_weight = 1 #@param {type: "float"}
    C_recon_loss_weight = 0.1 #@param {type: "float"}
    C_kld_weight = 0.5 #@param {type: "float"}
    C_cluster_distance_weight = 150 #@param {type: "float"}

    # Loss weights with the D_ prefix
    D_VAE_loss_weight = 1 #@param {type: "float"}
    D_recon_loss_weight = 1 #@param {type: "float"}
    D_kld_weight = 0.2 #@param {type: "float"}
    D_cluster_distance_weight = 50 #@param {type: "float"}

    predict_loss_weight = 2000 #@param {type: "float"}

    # Save locations under data/model_fits/
    cVAE_save_path = 'data/model_fits/GDSC_skin2_c_vae' #@param
    dVAE_save_path = 'data/model_fits/GDSC_skin2_d_vae_skin' #@param

    c_p_save_path = 'data/model_fits/GDSC_skin2_c_vae_predictor' #@param
    d_p_save_path = 'data/model_fits/GDSC_skin2_d_vae_predictor' #@param

    def __contains__(self, key):
        """Return True when an attribute named ``key`` exists."""
        return hasattr(self, key)

    def __getitem__(self, key):
        """Return the attribute named ``key``."""
        return getattr(self, key)

    def __setitem__(self, key, val):
        """Set the attribute named ``key`` on this instance."""
        setattr(self, key, val)

    

class CDPModel_sub_Args:
    """Container of model-architecture options.

    Options are plain class attributes. Item syntax (``args['name']``,
    ``args['name'] = x``, ``'name' in args``) is forwarded to attribute
    access, so assigning through an instance does not modify the class
    defaults.
    """

    # c_VAE
    c_input_dim = 0 #@param {type: "integer"}
    c_h_dims = [1024, 512, 256] #@param {type: "vactor"}
    c_latent_dim = 32 #@param {type: "integer"}

    # d_VAE
    d_input_dim = 0 #@param {type: "integer"}
    d_h_dims = [64]  #@param {type: "vactor"}
    d_latent_dim = 32 #@param {type: "integer"}

    # predictor (the first hidden size is twice p_sec_dim)
    p_sec_dim = 16 #@param {type: "integer"}
    p_h_dims = [2 * p_sec_dim, 16]  #@param {type: "vactor"}

    # all
    drop_out = 0  #@param {type: "float"}

    # sensitive threshold
    sens_cutoff = 0.5

    def __contains__(self, key):
        """Return True when an attribute named ``key`` exists."""
        return hasattr(self, key)

    def __getitem__(self, key):
        """Return the attribute named ``key``."""
        return getattr(self, key)

    def __setitem__(self, key, val):
        """Set the attribute named ``key`` on this instance."""
        setattr(self, key, val)
    



In [21]:
train_args = Train_Args()

# K is the number of distinct values in c_meta['code'], not counting -1.
cluster_codes = c_meta.loc[c_meta['code'] != -1, 'code']
K = len(cluster_codes.unique())

# The input widths are taken from the feature tables.
CDPmodel_args = CDPModel_sub_Args()
CDPmodel_args['c_input_dim'] = c_data.shape[1]
CDPmodel_args['d_input_dim'] = d_data.shape[1]

# Warn when either feature table has no columns.
if CDPmodel_args['c_input_dim'] <= 0:
    warnings.warn('\nCancer Cell line feature number not specified')
if CDPmodel_args['d_input_dim'] <= 0:
    warnings.warn('\nDrug feature number not specified')

# 3. Train Model

In [22]:
import inspect

# `CDPmodel` initially names the model constructor (from the star imports)
# and is rebound here to the model instance that the following cells use.
# Resolve the constructor first, so that re-running this cell builds a fresh
# model instead of calling the existing instance.
CDPmodel_constructor = (
    CDPmodel
    if inspect.isclass(CDPmodel) or inspect.isroutine(CDPmodel)
    else type(CDPmodel)
)
CDPmodel = CDPmodel_constructor(K, CDPmodel_args)

In [None]:
# Number of rounds passed to fit(); also reused by the plotting cells.
n_rounds = 3
returns = CDPmodel.fit(c_data, c_meta, d_data, cdr, train_args, n_rounds=n_rounds, device = device)
# Older 7-value unpacking, kept for reference:
# c_meta, c_meta_hist, d_sens_hist, losses_train_hist_list, best_epos_list, c_latent_list, d_latent_list = returns
# fit() is unpacked into nine values; note that c_meta is overwritten by the
# first of them.
c_meta, c_meta_hist, d_sens_hist, losses_train_hist_list, best_epos_list, C_VAE_init_losses, D_VAE_init_losses, c_latent_list, d_latent_list = returns


=> Initialize C-VAE:


# 4. Results and visualizations

In [None]:
# Predict for every cell-line/drug pair. This rebinds `cdr_hat`, which
# earlier held the predictions loaded from CSV.
cdr_hat = CDPmodel.predict(c_data, d_data)

# Contingency table: assigned cluster (rows) against predicted response (columns).
pd.crosstab(
    index=cdr_hat['cluster'],
    columns=cdr_hat['cdr_hat'],
    rownames=['cluster'],
    colnames=['cdr_hat'],
)

In [None]:
cdr_train_hat = CDPmodel.predict(c_data, d_data)

# Reshape the observed response matrix to long format with one row per
# (c_name, d_name) pair, then join the predictions on those two keys. The
# outer join keeps pairs that appear on only one side.
cdr_long = (
    cdr.assign(c_name=cdr.index.values)
    .melt(id_vars='c_name', value_name='value')
    .rename(columns={'variable': 'd_name', 'value': 'cdr'})
)
cdr_train_rslt = pd.merge(cdr_long, cdr_train_hat, on=['c_name', 'd_name'], how='outer')

cdr_train_rslt

## Clusters

In [None]:
# Cluster sizes from the 'code' and 'code_latest' columns of c_meta_hist.
codes_before = c_meta_hist['code'].value_counts()
codes_after = c_meta_hist['code_latest'].value_counts()

print('Cancer clustering before:')
print(codes_before)
print('Cancer clustering after:')
print(codes_after)

In [None]:
# Counts from the 'sensitive_k' and 'sensitive_k_latest' columns of d_sens_hist.
sens_before = d_sens_hist['sensitive_k'].value_counts()
sens_after = d_sens_hist['sensitive_k_latest'].value_counts()

print('Sensitive to clusters before:')
print(sens_before)
print('Sensitive to clusters after:')
print(sens_after)

## Visualizations

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# One cell-line latent-space figure per cluster index, each saved to its own PNG.
for k in range(K):
    c_latent_png = f'results/images/GDSC/GDSC_skin_c_latent_k{k}.png'
    plot_c_PCA_latent(
        c_data, c_latent_list, c_meta_hist, n_rounds,
        legend_title='cluster',
        k=k,
        plot_save_path=c_latent_png,
    )

In [None]:
# One drug latent-space figure per cluster index, each saved to its own PNG.
for k in range(K):
    d_latent_png = f'results/images/GDSC/GDSC_skin_d_latent_k{k}.png'
    plot_d_PCA_latent(
        d_data, d_latent_list, d_sens_hist, n_rounds,
        legend_title='cluster',
        k=k,
        plot_save_path=d_latent_png,
    )

In [None]:
# Loss curves for every (cluster index, round) combination; the histories are
# indexed as [round][cluster].
for k in range(K):
    print(f'k = {k}:')
    for b in range(n_rounds):
        print(f'round {b}:')
        round_losses = losses_train_hist_list[b][k]
        round_best_epochs = best_epos_list[b][k]
        plot_training_losses_train_test_2cols(
            round_losses,
            best_epoch_1round=round_best_epochs,
            plot_save_path=f'results/images/GDSC/GDSC_skin_losses_b{b}_k{k}.png',
        )
        

In [None]:
def plot_pre_training_losses_train_test_2cols(losses_train_hist_list_1round, best_epoch_1round = None, plot_save_path=''):
    """Plot train/test prediction-loss curves for two loss histories on a 2x2 grid.

    Row 0 is drawn from ``losses_train_hist_list_1round[0]`` (titled
    "(c) D-VAE & Predictor losses") and row 1 from
    ``losses_train_hist_list_1round[1]`` (titled "(g) C-VAE & Predictor
    losses"). The left column shows ``"prediction_loss_train"`` and the right
    column ``"prediction_loss_test"``, both plotted against ``"epoch"``.

    Parameters
    ----------
    losses_train_hist_list_1round : sequence
        At least two objects that can be indexed with the keys ``"epoch"``,
        ``"prediction_loss_train"`` and ``"prediction_loss_test"``.
    best_epoch_1round : sequence, optional
        ``best_epoch_1round[i]`` is marked with a dashed red vertical line on
        both panels of row ``i``. ``None`` or any empty sequence (list, tuple,
        array) draws no marker.
    plot_save_path : str, optional
        When non-empty, the figure is saved there at 1200 dpi before being
        shown.
    """
    # A length check rather than `!= []`: comparing a tuple or NumPy array
    # with `[]` does not reliably express "is empty".
    has_best_epoch = best_epoch_1round is not None and len(best_epoch_1round) > 0

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
    fig.suptitle('')

    row_titles = ('(c) D-VAE & Predictor losses', '(g) C-VAE & Predictor losses')
    panel_columns = (('prediction_loss_train', 'train'), ('prediction_loss_test', 'test'))

    for row, row_title in enumerate(row_titles):
        losses = losses_train_hist_list_1round[row]
        epochs = np.array(losses["epoch"])
        for col, (loss_key, split) in enumerate(panel_columns):
            ax = axs[row, col]
            ax.plot(epochs, np.array(losses[loss_key]), label=f"prediction loss ({split})")
            if has_best_epoch:
                ax.axvline(x=best_epoch_1round[row], color='r', linestyle='--')
            ax.set_title(f'{row_title} [{split}]')
            # As before, only the right-hand (test) panels carry a legend.
            if split == 'test':
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.tight_layout()

    if plot_save_path != '':
        plt.savefig(plot_save_path, dpi=1200)

    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)








In [None]:
# Prediction-loss figures for every (cluster index, round) combination; the
# histories are indexed as [round][cluster]. Nothing is saved to disk here.
for k in range(K):
    print(f'k = {k}:')
    for b in range(n_rounds):
        print(f'round {b}:')
        round_losses = losses_train_hist_list[b][k]
        round_best_epochs = best_epos_list[b][k]
        plot_pre_training_losses_train_test_2cols(
            round_losses,
            best_epoch_1round=round_best_epochs,
        )
        

# New model with 2 clusters

In [None]:
# One row per distinct (c_name, cluster) pair from the current `cdr_hat`, with
# c_name cast to string. Built as a single chain so the cast is never an
# assignment into a frame derived by slicing.
cdr_hat_tmp = (
    cdr_hat[['c_name', 'cluster']]
    .drop_duplicates()
    .assign(c_name=lambda frame: frame['c_name'].astype(str))
)

# Number of rows per cluster label.
cdr_hat_tmp['cluster'].value_counts()

In [None]:
# Attach the cluster from cdr_hat_tmp to every cell line in c_meta, matching
# on the string cell-line name; how='left' keeps cell lines without a match.
c_meta_tmp = c_meta.copy()
c_meta_tmp['c_name'] = c_meta_tmp.index.values.astype(str)
c_meta_new = pd.merge(c_meta_tmp, cdr_hat_tmp, on = 'c_name', how = 'left')
# Keep only name and cluster, renamed so that the cluster takes the role of 'C_type'.
c_meta_new = c_meta_new.loc[:, ['c_name', 'cluster']]
c_meta_new = c_meta_new.rename(columns = {'c_name':'C_ID', 'cluster':'C_type'})

# get_CCL_meta_codes comes from a star import; presumably it recodes 'C_type'
# into the numeric 'code' column — TODO confirm against its definition.
c_meta_new, meta_map_new = get_CCL_meta_codes(c_data.index.values, c_meta_new)
c_meta_new.index = c_meta_new.index.astype(str)

print(f"Cancer type coding map: ")
print(meta_map_new)

In [None]:
# From here on the cluster-based metadata replaces c_meta; the previous
# c_meta is no longer reachable under this name.
c_meta = c_meta_new

## 2. Hyperparameters

In [None]:
train_args = Train_Args()

# K is the number of distinct values in c_meta['code'], not counting -1.
cluster_codes = c_meta.loc[c_meta['code'] != -1, 'code']
K = len(cluster_codes.unique())

# The input widths are taken from the feature tables.
CDPmodel_args = CDPModel_sub_Args()
CDPmodel_args['c_input_dim'] = c_data.shape[1]
CDPmodel_args['d_input_dim'] = d_data.shape[1]

# Warn when either feature table has no columns.
if CDPmodel_args['c_input_dim'] <= 0:
    warnings.warn('\nCancer Cell line feature number not specified')
if CDPmodel_args['d_input_dim'] <= 0:
    warnings.warn('\nDrug feature number not specified')

In [None]:
# Display the number of clusters computed from the updated c_meta.
K

## 3. Train Model

In [None]:
import inspect

# By this point `CDPmodel` has been rebound to a model instance (section 3),
# so calling `CDPmodel(K, ...)` would call that instance rather than construct
# a model. Recover the constructor from the instance when necessary.
CDPmodel_constructor = (
    CDPmodel
    if inspect.isclass(CDPmodel) or inspect.isroutine(CDPmodel)
    else type(CDPmodel)
)
CDPmodel_new = CDPmodel_constructor(K, CDPmodel_args)

### 