# 0. Setup

In [1]:
# Verify we're in the correct working directory
import os
import git 
from pathlib import Path

def get_project_root():
    """Return the working-tree directory of the enclosing git repository as a Path.

    The repository is looked up starting from the current directory and
    continuing through its parents, so the notebook can be started from
    any sub-folder of the project.
    """
    repo = git.Repo('.', search_parent_directories=True)
    return Path(repo.working_tree_dir)

# Locate the repository root via get_project_root()
root = get_project_root()

# Run the rest of the notebook from that root so relative paths resolve against it
os.chdir(root)
# Last expression of the cell: displays the new working directory
os.getcwd()

'/Users/seraphinashi/Desktop/DataFusion/DrugResponse_Omics_Molecules'

In [2]:
# Relative folder for simulation figures (presumably where plots get saved;
# it is not referenced in the visible cells — TODO confirm usage)
plot_folder = "images/simulation/"

## Import packages, models, and trainers

In [3]:
import argparse
import logging
import sys
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 200

import torch
from torch import nn, optim, Tensor
from torch.nn import functional as F
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

# Record the PyTorch version and the current CPU thread count alongside the results
print('pytorch version:', torch.__version__)
print('orig num threads:', torch.get_num_threads())

pytorch version: 1.13.1
orig num threads: 4


In [4]:
from models import *
from trainers import *
from losses import *
from utils import *

In [5]:
import random

seed = 42

# Python-level generators
random.seed(seed)
np.random.seed(seed)
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# only reaches processes launched afterwards, not hashing in the running kernel.
os.environ['PYTHONHASHSEED'] = str(seed)

# PyTorch generators: default (CPU), current CUDA device, all CUDA devices
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# Ask cuDNN for deterministic algorithms and switch off its auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# 1. Prepare dataset

## Load data

In [6]:
simu_folder = "data/simulations"

# The four simulated tables share one naming scheme and all use their first
# column as the index.
RNAseq, RNAseq_meta, d_fp, cdr = (
    pd.read_csv(os.path.join(simu_folder, f"simu1_{table}.csv"), index_col=0)
    for table in ("RNAseq", "RNAseq_meta", "d_fp", "cdr")
)

# Code the cancer types using the labels exactly as loaded. The RNAseq column
# names are passed as the sample ids (presumably the cell-line ids).
c_meta, meta_map = get_CCL_meta_codes(RNAseq.columns.values, RNAseq_meta)
print(f"Cancer type coding map: {meta_map}")
print("Count of each coded cancer type:")
print(c_meta['code'].value_counts())

Cancer type coding map: [('grp2', 1) ('grp1', 0)]
Count of each coded cancer type:
1    41
0    35
Name: code, dtype: int64


In [7]:
# Optional relabelling step: when switched on, every 'grp2' entry of C_type is
# renamed to 'grp1' and the cancer-type codes are rebuilt from the merged labels.
two_grp = False
if two_grp:
    is_grp2 = RNAseq_meta['C_type'] == 'grp2'
    RNAseq_meta.loc[is_grp2, 'C_type'] = 'grp1'

    c_meta, meta_map = get_CCL_meta_codes(RNAseq.columns.values, RNAseq_meta)
    print(f"Cancer type coding map: {meta_map}")
    print("Count of each coded cancer type:")
    print(c_meta['code'].value_counts())

In [8]:
# Transpose the expression and fingerprint tables (presumably so that each row
# is one cell line / one drug — TODO confirm against CDPmodel.fit).
c_data = RNAseq.T
d_data = d_fp.T

# Use string row labels for the response table, then keep an independent copy
# of it under the name that is passed to training.
cdr.index = cdr.index.astype("str")
cdr_org = cdr.copy()

In [9]:
# Display the coded metadata table (the saved output shows columns code / k0 / k1)
c_meta

Unnamed: 0,code,k0,k1
1240142,0,1,0
1240183,0,1,0
1240190,0,1,0
1247873,0,1,0
1290908,0,1,0
...,...,...,...
909728,1,0,1
910399,1,0,1
910900,1,0,1
910931,1,0,1


# 2. Hyperparameters

In [10]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [11]:
class Train_Args:
    """Container for the training settings used by this notebook.

    Settings are stored as class attributes. Instances additionally support
    dict-style access: ``args['lr']`` reads, ``args['lr'] = x`` writes (on the
    instance only) and ``'lr' in args`` tests for existence.
    """

    # --- validation split ---
    valid_size = 0.1 #@param {type: "float"}

    # --- optimisation ---
    n_epochs = 100 #@param {type: "integer"}
    batch_size = 50 #@param {type: "integer"}
    lr = 0.01 #@param {type: "float"}

    # --- loss term weights ---
    c_cluster_distance_weight = 100 #@param {type: "float"}
    d_cluster_distance_weight = 100 #@param {type: "float"}
    predict_loss_weight = 1000 #@param {type: "float"}

    # --- save paths for the VAEs ---
    c_save_path = 'data/model_fits/c_vae.pkl' #@param
    d_save_path = 'data/model_fits/d_vae.pkl' #@param

    # --- save paths for the VAE + predictor fits ---
    c_p_save_path = 'data/model_fits/c_vae_predictor.pkl' #@param
    d_p_save_path = 'data/model_fits/d_vae_predictor.pkl' #@param

    def __contains__(self, key):
        """Return True when an attribute called ``key`` exists."""
        return hasattr(self, key)

    def __getitem__(self, key):
        """Return the attribute called ``key`` (AttributeError if absent)."""
        return getattr(self, key)

    def __setitem__(self, key, val):
        """Set the attribute called ``key`` to ``val`` on this instance."""
        setattr(self, key, val)
    

class CDPModel_sub_Args:
    """Container for the architecture settings of one CDP sub-model.

    Settings are stored as class attributes. Instances additionally support
    dict-style access: ``args['c_input_dim']`` reads, ``args[...] = x`` writes
    (on the instance only) and ``'drop_out' in args`` tests for existence.
    """

    # c_VAE (input dim is a placeholder, overwritten once the data is loaded)
    c_input_dim = 0 #@param {type: "integer"}
    c_h_dims = [64] #@param {type: "vactor"}
    c_latent_dim = 32 #@param {type: "integer"}

    # d_VAE (input dim is a placeholder, overwritten once the data is loaded)
    d_input_dim = 0 #@param {type: "integer"}
    d_h_dims = [64]  #@param {type: "vactor"}
    d_latent_dim = 32 #@param {type: "integer"}

    # predictor; p_h_dims is evaluated once, here, from the class-level p_sec_dim
    p_sec_dim = 16 #@param {type: "integer"}
    p_h_dims = [p_sec_dim*2, 16]  #@param {type: "vactor"}

    # all
    drop_out = 0  #@param {type: "float"}

    def __contains__(self, key):
        """Return True when an attribute called ``key`` exists."""
        return hasattr(self, key)

    def __getitem__(self, key):
        """Return the attribute called ``key`` (AttributeError if absent)."""
        return getattr(self, key)

    def __setitem__(self, key, val):
        """Set the attribute called ``key`` to ``val`` on this instance."""
        setattr(self, key, val)


In [12]:
train_args = Train_Args()
CDPmodel_args = CDPModel_sub_Args()

# Number of distinct cancer-type codes
K = len(c_meta['code'].unique())

# Input widths are taken from the data: one input feature per column
CDPmodel_args['c_input_dim'] = c_data.shape[1]
CDPmodel_args['d_input_dim'] = d_data.shape[1]

# Warn if either table turned out to have no feature columns
for dim_key, message in (
    ('c_input_dim', '\nCancer Cell line feature number not specified'),
    ('d_input_dim', '\nDrug feature number not specified'),
):
    if CDPmodel_args[dim_key] <= 0:
        warnings.warn(message)

# 3. Train Model

In [13]:
# Build the model for K clusters.
# NOTE: this rebinds the name `CDPmodel` from the imported class to the instance;
# the later cells rely on that name, but re-running this cell needs a fresh
# `from models import *` first.
CDPmodel = CDPmodel(K, CDPmodel_args)

# Train for one round. `device` must be passed by keyword: a positional argument
# after `n_rounds=1` is a SyntaxError.
# Assumes the parameter is named `device` — TODO confirm against CDPmodel.fit.
returns = CDPmodel.fit(c_data, c_meta, d_data, cdr_org, train_args, n_rounds=1, device=device)

# fit returns a 4-tuple; unpacking overwrites `c_meta` with the returned version
c_meta, c_meta_hist, d_sens_hist, losses_train_hist_list = returns

TypeError: fit() got multiple values for argument 'n_rounds'

In [None]:
# NOTE(review): `c_name` is not defined in any visible cell, so this membership
# check raises NameError on a fresh kernel — looks like leftover debugging;
# define `c_name` first or drop the cell.
c_name in CDPmodel.c_clusters_in_trainnig[0]

# 4. Results and visualizations

## Clusters

In [None]:
# Extend the history with the codes for b=0 (presumably what creates the
# `code_b0` column read below), then compare cluster sizes before and after.
c_meta_hist = add_meta_code_b(c_meta_hist, K, b=0)
print('Cancer clustering before:', c_meta_hist['code'].value_counts(), sep='\n')
print('Cancer clustering after:', c_meta_hist['code_b0'].value_counts(), sep='\n')

In [None]:
# Add the sensitivity labels first for b=-1, then for b=0 (presumably what
# creates `sensitive_k` and `sensitive_k_b0`), and compare their counts.
for b_step in (-1, 0):
    d_sens_hist = add_sensk_to_d_sens_hist(d_sens_hist, K, b = b_step)
print('Sensitive to clusters before:', d_sens_hist['sensitive_k'].value_counts(), sep='\n')
print('Sensitive to clusters after:', d_sens_hist['sensitive_k_b0'].value_counts(), sep='\n')

## Visualizations

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# Encode all rows of c_data with each of the K sub-models' c_VAE and plot the
# latent space, labelled by the original and the updated cluster codes.
# The input tensor is identical for every k, so it is built once.
c_inputs = torch.from_numpy(c_data.values).float().to(device)
for k in range(K):
    c_latent_k = CDPmodel.CDPmodel_list[k].c_VAE.encode(c_inputs, repram=False)
    # .cpu() is needed before .numpy() when `device` is a GPU; on CPU it is a no-op
    plot_PCA_latent(latent = c_latent_k.detach().cpu().numpy(),
                    label_org = c_meta_hist['code'].astype(str),
                    label_updates = c_meta_hist['code_b0'].astype(str),
                    legend_title = "cluster",
                    k = k)


In [None]:
# Show the rows whose updated code equals the string '-1'.
# NOTE(review): the plotting cell casts code_b0 with .astype(str), which suggests
# the column may not hold strings; if so this filter selects nothing — confirm dtype.
c_meta_hist.loc[c_meta_hist.code_b0 == '-1']

In [None]:
# Encode all rows of d_data with each of the K sub-models' d_VAE and plot the
# latent space, labelled by the original and the updated sensitivity labels.
# The input tensor is identical for every k, so it is built once.
d_inputs = torch.from_numpy(d_data.values).float().to(device)
for k in range(K):
    d_latent_k = CDPmodel.CDPmodel_list[k].d_VAE.encode(d_inputs, repram=False)
    # .cpu() is needed before .numpy() when `device` is a GPU; on CPU it is a no-op
    plot_PCA_latent(latent = d_latent_k.detach().cpu().numpy(),
                    label_org = d_sens_hist['sensitive_k'],
                    label_updates = d_sens_hist['sensitive_k_b0'],
                    legend_title = "Sensitive to cluster k",
                    k = k)

In [None]:
# Plot the first entry of the training-loss history
# (presumably one entry per cluster/sub-model — TODO confirm)
plot_training_losses(losses_train_hist_list[0])

In [None]:
# Plot the second entry of the training-loss history
# (presumably one entry per cluster/sub-model — TODO confirm)
plot_training_losses(losses_train_hist_list[1])

In [None]:
# Inspect element [1] of the second loss-history entry
# (its structure is not visible here; looks like ad-hoc inspection)
losses_train_hist_list[1][1]