# 0. Setup

In [1]:
# Verify we're in the correct working directory: the folders used later in this
# notebook ("data/simulations", "images/simulation/") are given as relative paths.
import os
os.getcwd()   

'/Users/seraphinashi/Desktop/DataFusion/DrugResponse_Omics_Molecules/scripts_model'

In [2]:
# Move from the scripts folder up to the project root so the relative paths used
# below ("data/simulations", "images/simulation/") resolve. The step is derived
# from the current directory instead of a hard-coded absolute user path, so it
# works on any machine and does nothing when the cell is run a second time.
if os.path.basename(os.getcwd()) == 'scripts_model':
    os.chdir(os.pardir)
os.getcwd()

'/Users/seraphinashi/Desktop/DataFusion/DrugResponse_Omics_Molecules'

In [3]:
# Relative folder name (resolved against the current working directory);
# presumably where the simulation figures are written -- it is not referenced
# again in this part of the notebook.
plot_folder = "images/simulation/"

## Import packages, models, and trainers

In [4]:
# pip install torchvision

In [5]:
import argparse
import logging
import sys
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200

import torch
from torch import nn, optim, Tensor
from torch.nn import functional as F
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

In [6]:
# Record the installed PyTorch version next to the results
print('pytorch version:', torch.__version__)

pytorch version: 1.13.1


In [7]:
# Report the thread count PyTorch reports before anything in this notebook changes it
print('orig num threads:', torch.get_num_threads())

orig num threads: 4


In [8]:
from ccl_VAE import *
from d_VAE import *
from predictor import *
from utils import *

In [9]:
import random

# One seed shared by every random number generator touched in this notebook.
seed = 42

# Python-level sources of randomness: hash seed variable, `random`, NumPy.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: default generator, current CUDA device, all CUDA devices.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN: request deterministic algorithms and switch off benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# 1. Prepare dataset

## Load 

In [10]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [11]:
# All simulated tables sit in one folder; each is read with its first column as the row index.
simu_folder = "data/simulations"

RNAseq, RNAseq_meta, d_fp, cdr = (
    pd.read_csv(os.path.join(simu_folder, file_name), index_col=0)
    for file_name in (
        "simu1_RNAseq.csv",
        "simu1_RNAseq_meta.csv",
        "simu1_d_fp.csv",
        "simu1_cdr.csv",
    )
)

## Convert to tensor

In [12]:
# get_CCL_meta_codes receives the RNAseq column names together with the metadata and
# returns the updated metadata (with a 'code' column) plus the coding map printed below.
RNAseq_meta, meta_map = get_CCL_meta_codes(RNAseq.columns.values, RNAseq_meta)

# Report the coding map and how many samples carry each code.
print(
    f"Cancer type coding map: {meta_map}",
    "Count of each coded cancer type:",
    RNAseq_meta['code'].value_counts(),
    sep="\n",
)

Cancer type coding map: [('grp2', 1) ('grp1', 0)]
Count of each coded cancer type:
1    41
0    35
Name: code, dtype: int64


# 2. Define Model

In [13]:
# Transpose the feature tables. For c_data the rows become the former RNAseq
# columns (the names that were passed to get_CCL_meta_codes); for d_data the rows
# become the former d_fp columns (presumably one per drug -- confirm).
c_data = RNAseq.T
d_data = d_fp.T

# NOTE: c_meta is the very same object as RNAseq_meta (no copy is made), so any
# in-place edit through one name is visible through the other. c_meta_org is an
# independent snapshot taken here, before any later cell edits the codes.
c_meta = RNAseq_meta
c_meta_org = c_meta.copy()

# cdr keeps its loaded name; the former `cdr = cdr` self-assignment did nothing
# and has been removed.

## Hyperparameters

In [14]:
# Use the first CUDA device when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [15]:
# ---------------- shared by all models ----------------
# Number of distinct cancer-type codes; one model triple is built per code.
K = len(c_meta['code'].unique())

valid_size = 0.1               # passed to train_test_split as test_size
l_r = 0.05                     # NOTE: the C_VAE optimizer below is built with lr=1e-2, not with this value
n_epochs, batch_size = 100, 50 # n_epochs goes to train_c_VAE, batch_size to the DataLoaders
drop_out = 0                   # not referenced in this part of the notebook

# ---------------- c_VAE ----------------
c_input_dim = len(c_data.columns)   # number of columns of c_data
c_h_dims = [128, 64]
c_latent_dim = 32
c_cluster_distance_weight = 2

# ---------------- d_VAE ----------------
d_input_dim = len(d_data.columns)   # number of columns of d_data
d_h_dims = [128, 64]
d_latent_dim = 32
d_cluster_distance_weight = 2

# ---------------- predictor ----------------
p_sec_dim = 16
p_h_dims = [2 * p_sec_dim, 16]

In [16]:
# Sanity check: number of distinct cancer-type codes (the same expression that defines K)
len(c_meta['code'].unique())

2

## Define C-VAE, D-VAE, and Predictor

In [17]:
# One (C-VAE, D-VAE, predictor) triple per cluster. Within a cluster the three
# models are created in exactly this order before the next cluster is handled.
c_vae_list, d_vae_list, predictor_list = [], [], []

for k in range(K):
    c_vae_list.append(
        c_VAE(
            input_dim=c_input_dim,
            h_dims=c_h_dims,
            latent_dim=c_latent_dim,
        ).to(device)
    )
    d_vae_list.append(
        d_VAE(
            input_dim=d_input_dim,
            h_dims=d_h_dims,
            latent_dim=d_latent_dim,
        ).to(device)
    )
    # The predictor takes the two latent sizes as its input dimensions.
    predictor_list.append(
        Predictor(
            c_input_dim=c_latent_dim,
            d_input_dim=d_latent_dim,
            sec_dim=p_sec_dim,
            h_dims=p_h_dims,
        ).to(device)
    )

## Train c_vae

In [18]:
import time

In [19]:
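# TODO: the per-cluster steps in the cells below are meant to run for every cluster;
# for now they are executed for a single, manually chosen k.
# for k in range(0,K):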
    

In [20]:
# Index of the cluster handled by the cells below: it selects c_vae_list[k] and is
# passed to train_c_VAE as cluster_label.
k = 0

In [21]:
#=================================================================================
# Train C_VAE
##---------------------
## prepare data
# Hold out a `valid_size` fraction of the rows of c_data for validation and fetch
# the metadata rows that belong to each split.
X_train, X_valid = train_test_split(c_data, test_size=valid_size, random_state=42)
X_meta_train, X_meta_valid = (
    get_CCL_meta(split.index.values, c_meta) for split in (X_train, X_valid)
)

# Float tensors on the selected device (features and metadata, train then validation).
X_trainTensor, X_meta_trainTensor, X_validTensor, X_meta_validTensor = (
    torch.FloatTensor(frame.values).to(device)
    for frame in (X_train, X_meta_train, X_valid, X_meta_valid)
)

# Each dataset pairs the features with their metadata.
train_dataset = TensorDataset(X_trainTensor, X_meta_trainTensor)
valid_dataset = TensorDataset(X_validTensor, X_meta_validTensor)

# Both loaders are created with shuffle=True.
dataloaders_C = {
    'train': DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True),
    'val': DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True),
}
X_trainDataLoader, X_validDataLoader = dataloaders_C['train'], dataloaders_C['val']

##---------------------
## define optimizer
# Adam over the parameters of the k-th C_VAE, with a plateau-based LR scheduler on top.
optimizer_e = optim.Adam(c_vae_list[k].parameters(), lr=1e-2)
exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

##---------------------
## update C_VAE
# train_c_VAE returns the trained model (stored back into the list) and the training loss.
print(f"Training C_VAE {k}")
start = time.time()
c_vae_list[k], loss_train = train_c_VAE(
    vae=c_vae_list[k],
    data_loaders=dataloaders_C,
    optimizer=optimizer_e,
    scheduler=exp_lr_scheduler_e,
    n_epochs=n_epochs,
    cluster_label=k,
    cluster_distance_weight=c_cluster_distance_weight,
)
end = time.time()
print(f"   Running time: {end - start}")


Training C_VAE 0
   Running time: 0.7658121585845947


In [22]:
# parameters() returns a generator, so this displays only the generator object
# (see the output below), not the parameter values themselves.
c_vae_list[k].parameters()

<generator object Module.parameters at 0x7fa048551890>

In [23]:
#=================================================================================
# Cell lines in cluster k whose latent representation is not close to the cluster
# centroid are dropped from the cluster: their code is set to -1.

# The latent codes are only used for distance bookkeeping here, so no autograd
# graph is recorded while encoding.
with torch.no_grad():
    c_latent = c_vae_list[k].encode(torch.from_numpy(c_data.values).float().to(device), repram=False)

# Always look the metadata up by the row names of c_data so that its rows follow
# the rows of c_latent. The previous guard,
# `any(c_data.index.values == c_meta.index.values)`, ran the lookup as soon as one
# position matched, skipped it exactly when no position matched, and failed when
# the two indexes had different lengths.
c_meta = get_CCL_meta(c_data.index.values, c_meta)

idx_cluster = c_meta.code == k
cluster_mask = idx_cluster.to_numpy(dtype=bool)
# Row numbers, within the full table, of the members of cluster k.
cluster_positions = np.flatnonzero(cluster_mask)

c_cluster_latent = c_latent[torch.as_tensor(cluster_mask, device=c_latent.device)]
c_centroid = c_cluster_latent.mean(dim=0)

# Distance of every cluster member to the centroid: one row per member, one column.
c_cluster_distances = torch.cdist(c_cluster_latent, c_centroid.view(1, -1))
c_outlier_idx = find_outliers_IQR(c_cluster_distances)[0]

# NOTE(review): assumes find_outliers_IQR(...)[0] holds row positions (or a boolean
# mask) relative to c_cluster_distances, i.e. relative to the cluster subset --
# confirm against its definition in utils.
outlier_rows = np.asarray(
    c_outlier_idx.detach().cpu() if torch.is_tensor(c_outlier_idx) else c_outlier_idx
).ravel()
if outlier_rows.dtype == bool:
    outlier_rows = np.flatnonzero(outlier_rows)
outlier_rows = outlier_rows.astype(np.intp)

# Translate the subset-relative positions into positions in the full table before
# clearing them; indexing the full-length mask directly with subset positions
# clears the wrong rows unless the cluster members happen to come first.
idx_cluster_updated = idx_cluster.copy()
idx_cluster_updated.iloc[cluster_positions[outlier_rows]] = False

# .loc writes into c_meta itself; the chained form `c_meta.code[mask] = value`
# may end up writing to a temporary object instead.
c_meta.loc[idx_cluster, 'code'] = -1
c_meta.loc[idx_cluster_updated, 'code'] = k

In [24]:
#=================================================================================
# Train D_VAE and predictor
##---------------------
## prepare data
# NOTE: this cell rebinds c_latent (tensor -> DataFrame) and cdr (wide -> long
# table), so it has to run exactly once after the previous cell.

### latent representation of the cell lines that stay in cluster k
keep_mask = np.asarray(idx_cluster_updated, dtype=bool)
# .cpu() makes the NumPy conversion work when `device` is a CUDA device as well.
c_latent = pd.DataFrame(
    c_latent[torch.as_tensor(keep_mask, device=c_latent.device)].detach().cpu().numpy(),
    index=c_data.index[keep_mask])
c_meta = get_CCL_meta(c_latent.index.values, c_meta)

### all drugs: d_data is used unchanged (the former `d_data = d_data` did nothing)

### corresponding cdr, reshaped to long format with columns c_name, d_name, cdr
# The row labels become the c_name column on a copy (the wide table is not edited
# in place), and the output column names are passed to melt directly instead of
# renaming its default 'variable' / 'value' columns afterwards.
cdr = cdr.assign(c_name=cdr.index.values).melt(
    id_vars='c_name', var_name='d_name', value_name='cdr')
# Keep only the rows whose c_name is one of the retained cell lines.
cdr = cdr.loc[cdr.c_name.isin(c_latent.index.values)]