In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys

In [3]:
import torch
import torch.nn as nn

In [4]:
sys.path.append("../src/baselines/CODE-AE/code/")

In [5]:
import datetime
import logging
import os
import time
import torch
import random
import pickle

In [6]:
from torch import nn
from torch.nn import functional as F

from functools import cached_property

from torch.nn import Linear, ReLU, Sequential
from sklearn.metrics import average_precision_score, ndcg_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
torch.manual_seed(2020)
random.seed(2020)
np.random.seed(2020)

In [9]:
# To avoid randomness in DataLoaders - https://pytorch.org/docs/stable/notes/randomness.html
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)
    
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7ff611635e10>

In [10]:
import itertools
from torch.utils.data import TensorDataset, DataLoader
import json
import train_code_adv

In [11]:
class arguments():
    def __init__(self, is_train=True):
        self.method = "code_adv" # we will use CODE-AE ADV since that was the best performing one in the original paper.
        if is_train:
            self.retrain_flag = True
        else:
            self.retrain_flag = False
        self.pdtc_flag = False
        self.norm_flag = False
        self.measurement = "AUC"

In [12]:
args = arguments()

In [13]:
params_grid = {
    "pretrain_num_epochs": [100], #originally 300
    "train_num_epochs": [50], # originally 500
    "dop": [0.1] # originally 0.1
}

if args.method not in ['code_adv', 'adsn', 'adae', 'dsnw']:
    params_grid.pop('pretrain_num_epochs')

keys, values = zip(*params_grid.items())
update_params_dict_list = [dict(zip(keys, v)) for v in itertools.product(*values)]

In [14]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.device_count() > 0:
        torch.cuda.manual_seed_all(seed)

In [15]:
sample_id = 2

#### Load Data

In [16]:
cl_train_df = pd.read_csv(f"../data/diffusion_pretraining/cl_diffusion_train_sample{sample_id}.csv", index_col=0)
cl_train_df.shape

(1569, 7776)

In [17]:
cl_test_df = pd.read_csv(f"../data/diffusion_pretraining/cl_diffusion_test_sample{sample_id}.csv", index_col=0)
cl_test_df.shape

(175, 7776)

In [18]:
cl_train_dataset = TensorDataset(torch.tensor(cl_train_df.values), torch.tensor(cl_train_df.values))
cl_test_dataset = TensorDataset(torch.tensor(cl_test_df.values), torch.tensor(cl_test_df.values))
cl_train_dataloader = DataLoader(cl_train_dataset, batch_size=256, shuffle=True)
cl_test_dataloader = DataLoader(cl_test_dataset, batch_size=256, shuffle=False)

In [19]:
tcga_train_df = pd.read_csv(f"../data/diffusion_pretraining/tcga_diffusion_train_sample{sample_id}.csv", index_col=0)
tcga_train_df.shape

(476, 7776)

In [20]:
tcga_test_df = pd.read_csv(f"../data/diffusion_pretraining/tcga_diffusion_test_sample{sample_id}.csv", index_col=0)
tcga_test_df.shape

(120, 7776)

In [21]:
tcga_train_dataset = TensorDataset(torch.tensor(tcga_train_df.values), torch.tensor(tcga_train_df.values))
tcga_test_dataset = TensorDataset(torch.tensor(tcga_test_df.values), torch.tensor(tcga_test_df.values))
tcga_train_dataloader = DataLoader(tcga_train_dataset, batch_size=256, shuffle=True)
tcga_test_dataloader = DataLoader(tcga_test_dataset, batch_size=256, shuffle=False)

In [22]:
def unlabeled_dataloaders_mutations(batch_size, seed):
    set_seed(seed)
    # train cell line mutations
    cl_train_df = pd.read_csv(f"../data/diffusion_pretraining/cl_diffusion_train_sample{sample_id}.csv", index_col=0)
    cl_train_dataset = TensorDataset(torch.tensor(cl_train_df.values, dtype=torch.float32))
    cl_train_dataloader = DataLoader(cl_train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    # test cell line mutations
    cl_test_df = pd.read_csv(f"../data/diffusion_pretraining/cl_diffusion_test_sample{sample_id}.csv", index_col=0)
    cl_test_dataset = TensorDataset(torch.tensor(cl_test_df.values, dtype=torch.float32))
    cl_test_dataloader = DataLoader(cl_test_dataset, batch_size=batch_size, shuffle=False)

    # train patient mutations
    tcga_train_df = pd.read_csv(f"../data/diffusion_pretraining/tcga_diffusion_train_sample{sample_id}.csv", index_col=0)
    tcga_train_dataset = TensorDataset(torch.tensor(tcga_train_df.values, dtype=torch.float32))
    tcga_train_dataloader = DataLoader(tcga_train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    
    # test patient mutations
    tcga_test_df = pd.read_csv(f"../data/diffusion_pretraining/tcga_diffusion_test_sample{sample_id}.csv", index_col=0)
    tcga_test_dataset = TensorDataset(torch.tensor(tcga_test_df.values, dtype=torch.float32))
    tcga_test_dataloader = DataLoader(tcga_test_dataset, batch_size=batch_size, shuffle=False)
    
    return (cl_train_dataloader, cl_test_dataloader), (tcga_train_dataloader, tcga_test_dataloader)

In [23]:
def generate_encoded_features(encoder, dataloader, normalize_flag=False):
    """

    :param normalize_flag:
    :param encoder:
    :param dataloader:
    :return:
    """
    encoder.eval()
    raw_feature_tensor = dataloader.dataset.tensors[0].cpu()
    label_tensor = dataloader.dataset.tensors[1].cpu()

    encoded_feature_tensor = encoder.cpu()(raw_feature_tensor)
    if normalize_flag:
        encoded_feature_tensor = torch.nn.functional.normalize(encoded_feature_tensor, p=2, dim=1)
    return encoded_feature_tensor, label_tensor


def load_pickle(pickle_file):
    data = []
    with open(pickle_file, 'rb') as f:
        try:
            while True:
                data.append(pickle.load(f))
        except EOFError:
            pass

    return data


def wrap_training_params(training_params, type='unlabeled'):
    aux_dict = {k: v for k, v in training_params.items() if k not in ['unlabeled', 'labeled']}
    aux_dict.update(**training_params[type])

    return aux_dict


def safe_make_dir(new_folder_name):
    if not os.path.exists(new_folder_name):
        os.makedirs(new_folder_name)
    else:
        print(new_folder_name, 'exists!')


def dict_to_str(d):
    return "_".join(["_".join([k, str(v)]) for k, v in d.items()])


In [24]:
# From https://github.com/XieResearchGroup/CODE-AE/blob/main/code/pretrain_hyper_main.py
def pretrain(args, update_params_dict):
    if args.method == 'dsn':
        train_fn = train_dsn.train_dsn
    elif args.method == 'adae':
        train_fn = train_adae.train_adae
    elif args.method == 'coral':
        train_fn = train_coral.train_coral
    elif args.method == 'dae':
        train_fn = train_dae.train_dae
    elif args.method == 'vae':
        train_fn = train_vae.train_vae
    elif args.method == 'vaen':
        train_fn = train_vae.train_vae
    elif args.method == 'ae':
        train_fn = train_ae.train_ae
    elif args.method == 'code_mmd':
        train_fn = train_code_mmd.train_code_mmd
    elif args.method == 'code_base':
        train_fn = train_code_base.train_code_base
    elif args.method == 'dsna':
        train_fn = train_dsna.train_dsna
    else:
        train_fn = train_code_adv.train_code_adv

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    

    with open(os.path.join('../src/baselines/CODE-AE/code/model_save/train_params.json'), 'r') as f:
        training_params = json.load(f)

    training_params['unlabeled'].update(update_params_dict)
    param_str = dict_to_str(update_params_dict)

    if not args.norm_flag:
        method_save_folder = os.path.join('../src/baselines/CODE-AE/code/model_save', args.method)
    else:
        method_save_folder = os.path.join('../src/baselines/CODE-AE/code/model_save', f'{args.method}_norm')

    training_params.update(
        {
            'device': device,
            'input_dim': 7776,
            'model_save_folder': os.path.join(method_save_folder, param_str),
            'es_flag': False,
            'retrain_flag': args.retrain_flag,
            'norm_flag': args.norm_flag
        })

    safe_make_dir(training_params['model_save_folder'])
    random.seed(2020)

    
    s_dataloaders, t_dataloaders = unlabeled_dataloaders_mutations(
        batch_size=training_params['unlabeled']['batch_size'],
        seed=2020,
    )

    # start unlabeled training
    encoder, historys = train_fn(s_dataloaders=s_dataloaders,
                                 t_dataloaders=t_dataloaders,
                                 **wrap_training_params(training_params, type='unlabeled'))
    with open(os.path.join(training_params['model_save_folder'], f'unlabel_train_history.pickle'),
              'wb') as f:
        for history in historys:
            pickle.dump(dict(history), f)


In [25]:
from collections import defaultdict
from copy import deepcopy

In [26]:
from scipy.stats import zscore

In [27]:
import fine_tuning

In [28]:
def get_labeled_dataloader(drug, batch_size):
    # returns cell lines with response to drug mentioned
    cl_responses_df = pd.read_csv("/data/ajayago/copied_from_cdal1/yiming_data_folder/dataset/CellLine/patient_auc.csv")

    # filter by drug specified
    drug_specific_audrc = cl_responses_df[cl_responses_df.drug_name == drug]
    if len(drug_specific_audrc) == 0:
        print(f"Drug {drug} not found in cell lines.")
        return 
    ccle_train_df = pd.read_csv(f"../data/diffusion_pretraining/cl_diffusion_train_sample{sample_id}.csv", index_col = 0)
    ccle_test_df = pd.read_csv(f"../data/diffusion_pretraining/cl_diffusion_test_sample{sample_id}.csv", index_col = 0)
    train_features = ccle_train_df[ccle_train_df.index.isin(drug_specific_audrc.depmap_id)]
    test_features = ccle_test_df[ccle_test_df.index.isin(drug_specific_audrc.depmap_id)]

    print(train_features.shape)
    print(test_features.shape)

    if len(train_features) == 0 or len(test_features) == 0:
        print("No train/test samples for this drug")
        return
        
    # convert AUDRC to binary based on Z score
    zscores_from_auc = zscore(drug_specific_audrc["auc"].values, nan_policy="omit")
    drug_specific_audrc["zscores"] = zscores_from_auc
    zscore_threshold = 0.0
    train_features = train_features.merge(drug_specific_audrc, left_on=train_features.index, right_on=drug_specific_audrc.depmap_id)
    test_features = test_features.merge(drug_specific_audrc, left_on=test_features.index, right_on=drug_specific_audrc.depmap_id)
    # print(train_features.shape)
    # print(test_features.shape)
    # print(train_features.columns[-10:])

    if len(train_features) == 0 or len(test_features) == 0:
        print("No train/test samples for this drug")
        return
        
    train_features["labels"] = train_features["auc"].apply(lambda x: int(x < zscore_threshold))
    test_features["labels"] = test_features["auc"].apply(lambda x: int(x < zscore_threshold))
    train_features.set_index(["key_0"], inplace=True, drop=True)
    test_features.set_index(["key_0"], inplace=True, drop=True)
    # print(train_features.head())
    
    ccle_train = torch.from_numpy(train_features.values[:, :7776].astype('float32'))
    ccle_labels_train = torch.from_numpy(train_features["labels"].values)
    ccle_test = torch.from_numpy(test_features.values[:, :7776].astype('float32'))
    ccle_labels_test = torch.from_numpy(test_features["labels"].values)
    train_labeled_ccle_dataset = TensorDataset(ccle_train, ccle_labels_train)
    test_labeled_ccle_dataset = TensorDataset(ccle_test, ccle_labels_test)
    train_labeled_ccle_dataloader = DataLoader(train_labeled_ccle_dataset, batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker)
    test_labeled_ccle_dataloader = DataLoader(test_labeled_ccle_dataset, batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker)

    return train_labeled_ccle_dataloader, test_labeled_ccle_dataloader, None

In [29]:
def fine_tune(args, drug, update_params_dict):
    if args.method == 'dsn':
        train_fn = train_dsn.train_dsn
    elif args.method == 'adae':
        train_fn = train_adae.train_adae
    elif args.method == 'coral':
        train_fn = train_coral.train_coral
    elif args.method == 'dae':
        train_fn = train_dae.train_dae
    elif args.method == 'vae':
        train_fn = train_vae.train_vae
    elif args.method == 'ae':
        train_fn = train_ae.train_ae
    elif args.method == 'code_mmd':
        train_fn = train_code_mmd.train_code_mmd
    elif args.method == 'code_base':
        train_fn = train_code_base.train_code_base
    elif args.method == 'dsna':
        train_fn = train_dsna.train_dsna
    else:
        train_fn = train_code_adv.train_code_adv

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with open(os.path.join('../src/baselines/CODE-AE/code/model_save/train_params.json'), 'r') as f:
        training_params = json.load(f)

    training_params['unlabeled'].update(update_params_dict)
    param_str = dict_to_str(update_params_dict)

    if not args.norm_flag:
        method_save_folder = os.path.join('../src/baselines/CODE-AE/code/model_save', args.method)
    else:
        method_save_folder = os.path.join('../src/baselines/CODE-AE/code/model_save', f'{args.method}_norm')

    training_params.update(
        {
            'device': device,
            'input_dim': 7776,
            'model_save_folder': os.path.join(method_save_folder, param_str),
            'es_flag': False,
            'retrain_flag': args.retrain_flag,
            'norm_flag': args.norm_flag
        })
    if args.pdtc_flag:
        task_save_folder = os.path.join(f'{method_save_folder}', args.measurement, 'pdtc', drug)
    else:
        task_save_folder = os.path.join(f'{method_save_folder}', args.measurement, drug)

    safe_make_dir(training_params['model_save_folder'])
    safe_make_dir(task_save_folder)

    random.seed(2020)

    s_dataloaders, t_dataloaders = unlabeled_dataloaders_mutations(
        batch_size=training_params['unlabeled']['batch_size'],
        seed=2020,
    )


    # start unlabeled training
    encoder, historys = train_fn(s_dataloaders=s_dataloaders,
                                 t_dataloaders=t_dataloaders,
                                 **wrap_training_params(training_params, type='unlabeled'))
    if args.retrain_flag:
        with open(os.path.join(training_params['model_save_folder'], f'unlabel_train_history.pickle'),
                  'wb') as f:
            for history in historys:
                pickle.dump(dict(history), f)

    # labeled_ccle_dataloader, labeled_tcga_dataloader = data.get_labeled_dataloaders(
    #     gex_features_df=gex_features_df,
    #     seed=2020,
    #     batch_size=training_params['labeled']['batch_size'],
    #     drug=drug,
    #     threshold=args.a_thres,
    #     days_threshold=args.days_thres,
    #     ccle_measurement=args.measurement,
    #     ft_flag=False,
    #     pdtc_flag=args.pdtc_flag
    # )
    # ml_baseline_history = defaultdict(list)
    # ccle_encoded_feature_tensor, ccle_label_tensor = generate_encoded_features(encoder, labeled_ccle_dataloader,
    #                                                                            normalize_flag=args.norm_flag)
    # tcga_encoded_feature_tensor, tcga_label_tensor = generate_encoded_features(encoder, labeled_tcga_dataloader,
    #                                                                            normalize_flag=args.norm_flag)
    # ml_baseline_history['enet'].append(
    #     ml_baseline.n_time_cv(
    #         model_fn=ml_baseline.classify_with_enet,
    #         n=int(args.n),
    #         train_data=(
    #             ccle_encoded_feature_tensor.detach().cpu().numpy(),
    #             ccle_label_tensor.detach().cpu().numpy()
    #         ),
    #         test_data=(
    #             tcga_encoded_feature_tensor.detach().cpu().numpy(),
    #             tcga_label_tensor.detach().cpu().numpy()
    #         ),
    #         metric=args.metric
    #     )[1]
    # )
    #
    # with open(os.path.join(task_save_folder, f'{param_str}_ft_baseline_results.json'), 'w') as f:
    #     json.dump(ml_baseline_history, f)

    ft_evaluation_metrics = defaultdict(list)
    train_labeled_ccle_dataloader, test_labeled_ccle_dataloader, labeled_rad51_dataloader = get_labeled_dataloader(
        batch_size=training_params['labeled']['batch_size'],
        drug=drug
    )

    ft_encoder = deepcopy(encoder)
    
    # print(train_labeled_ccle_dataloader.dataset.tensors[1].sum())
    # print(test_labeled_ccle_dataloader.dataset.tensors[1].sum())
    # print(labeled_rad51_dataloader.dataset.tensors[1].sum())

    target_classifier, ft_historys = fine_tuning.fine_tune_encoder(
        encoder=ft_encoder,
        train_dataloader=train_labeled_ccle_dataloader,
        val_dataloader=test_labeled_ccle_dataloader,
        test_dataloader=None,
        seed=2020,
        normalize_flag=args.norm_flag,
        task_save_folder=task_save_folder,
        **wrap_training_params(training_params, type='labeled')
    ) # here we are only training the network, so setting test_dataloader = None
    
    return target_classifier
#     ft_evaluation_metrics['best_index'].append(ft_historys[-2]['best_index'])
#     for metric in ['auroc', 'acc', 'aps', 'f1', 'auprc']:
#         ft_evaluation_metrics[metric].append(ft_historys[-1][metric][ft_historys[-2]['best_index']])

#     with open(os.path.join(task_save_folder, f'{param_str}_ft_evaluation_results.json'), 'w') as f:
#         json.dump(ft_evaluation_metrics, f)

In [30]:
test_args = arguments(is_train=True)

In [31]:
patient_responses_df = pd.read_csv("/data/ajayago/copied_from_cdal1/ajayago_home_folder/processed/TCGA_drug_response_010222.csv")
patient_responses_df

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EC,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EC,Doxorubicin,Partial Response,1,DOXORUBICIN
2,TCGA-G2-A2EC,Vinblastine,Partial Response,1,VINBLASTINE
3,TCGA-G2-A2EC,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-G2-A2EJ,Paclitaxel,Stable Disease,0,PACLITAXEL
...,...,...,...,...,...
1244,TCGA-BG-A0VZ,Cisplatin,Complete Response,1,CISPLATIN
1245,TCGA-BG-A0VZ,Paclitaxel,Complete Response,1,PACLITAXEL
1246,TCGA-BG-A0VZ,Doxorubicin,Complete Response,1,DOXORUBICIN
1247,TCGA-BG-A0VT,Carboplatin,Complete Response,1,CARBOPLATIN


In [32]:
patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_test_df.index)].drug.value_counts()

drug
CISPLATIN           39
PACLITAXEL          34
5-FLUOROURACIL      29
CARBOPLATIN         27
CYCLOPHOSPHAMIDE    18
LEUCOVORIN          18
DOXORUBICIN         14
DOCETAXEL           13
GEMCITABINE         12
ETOPOSIDE            9
OXALIPLATIN          9
VINORELBINE          7
CAPECITABINE         6
CETUXIMAB            5
PEMETREXED           5
IRINOTECAN           4
EPIRUBICIN           4
ANASTROZOLE          3
METHOTREXATE         2
ERLOTINIB            2
TRASTUZUMAB          2
VINBLASTINE          2
TAMOXIFEN            2
GOSERELIN            1
FULVESTRANT          1
LETROZOLE            1
TOPOTECAN            1
FOTEMUSTINE          1
LOMUSTINE            1
DACARBAZINE          1
VEMURAFENIB          1
DIDOX                1
MITOMYCIN-C          1
Name: count, dtype: int64

In [33]:
patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_train_df.index)].drug.value_counts()

drug
CISPLATIN             167
5-FLUOROURACIL         96
CARBOPLATIN            90
PACLITAXEL             79
CYCLOPHOSPHAMIDE       51
                     ... 
PNU-159548              1
RESIQUIMOD              1
ERIBULIN                1
METHYLPREDNISOLONE      1
TOPOTECAN               1
Name: count, Length: 68, dtype: int64

In [34]:
train_tcga_with_response = patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_train_df.index)].reset_index(drop=True)
train_tcga_with_response

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EC,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EC,Doxorubicin,Partial Response,1,DOXORUBICIN
2,TCGA-G2-A2EC,Vinblastine,Partial Response,1,VINBLASTINE
3,TCGA-G2-A2EC,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-G2-A2EJ,Paclitaxel,Stable Disease,0,PACLITAXEL
...,...,...,...,...,...
968,TCGA-E6-A8L9,Carboplatin,Complete Response,1,CARBOPLATIN
969,TCGA-2E-A9G8,Paclitaxel,Complete Response,1,PACLITAXEL
970,TCGA-2E-A9G8,Carboplatin,Complete Response,1,CARBOPLATIN
971,TCGA-BG-A0VT,Carboplatin,Complete Response,1,CARBOPLATIN


In [35]:
test_tcga_with_response = patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_test_df.index)].reset_index(drop=True)
test_tcga_with_response

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EF,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EF,Vinblastine,Partial Response,1,VINBLASTINE
2,TCGA-G2-A2EF,Doxorubicin,Partial Response,1,DOXORUBICIN
3,TCGA-G2-A2EF,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-DK-A3IQ,Gemcitabine,Stable Disease,0,GEMCITABINE
...,...,...,...,...,...
271,TCGA-QS-A8F1,Carboplatin,Clinical Progressive Disease,0,CARBOPLATIN
272,TCGA-QS-A8F1,Paclitaxel,Clinical Progressive Disease,0,PACLITAXEL
273,TCGA-BG-A0VZ,Cisplatin,Complete Response,1,CISPLATIN
274,TCGA-BG-A0VZ,Paclitaxel,Complete Response,1,PACLITAXEL


In [36]:
# with atleast 10 samples in test tcga samples and also present in cell lines
drugs2evaluate = [
                "CISPLATIN",
                  "5-FLUOROURACIL", 
                  "GEMCITABINE", 
                  "PACLITAXEL",
                  "DOXORUBICIN",
                  "CYCLOPHOSPHAMIDE",
                  "DOCETAXEL"]
len(drugs2evaluate)

7

In [37]:
models = {}
for drug in drugs2evaluate:
    for param_dict in update_params_dict_list:
        models[drug] = fine_tune(args=test_args, drug=drug, update_params_dict=param_dict)

../src/baselines/CODE-AE/code/model_save/code_adv/pretrain_num_epochs_100_train_num_epochs_50_dop_0.1 exists!
../src/baselines/CODE-AE/code/model_save/code_adv/AUC/CISPLATIN exists!
AE training epoch 0
AE training epoch 50
confounder wgan training epoch 0
(489, 7776)
(50, 7776)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Fine tuning epoch 300
Fine tuning epoch 350
Fine tuning epoch 400
Fine tuning epoch 450
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tun

In [38]:
test_tcga_with_response

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EF,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EF,Vinblastine,Partial Response,1,VINBLASTINE
2,TCGA-G2-A2EF,Doxorubicin,Partial Response,1,DOXORUBICIN
3,TCGA-G2-A2EF,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-DK-A3IQ,Gemcitabine,Stable Disease,0,GEMCITABINE
...,...,...,...,...,...
271,TCGA-QS-A8F1,Carboplatin,Clinical Progressive Disease,0,CARBOPLATIN
272,TCGA-QS-A8F1,Paclitaxel,Clinical Progressive Disease,0,PACLITAXEL
273,TCGA-BG-A0VZ,Cisplatin,Complete Response,1,CISPLATIN
274,TCGA-BG-A0VZ,Paclitaxel,Complete Response,1,PACLITAXEL


In [39]:
for k, v in models.items():
    v.eval()

In [40]:
set(models.keys()) & set(test_tcga_with_response.drug)

{'5-FLUOROURACIL',
 'CISPLATIN',
 'CYCLOPHOSPHAMIDE',
 'DOCETAXEL',
 'DOXORUBICIN',
 'GEMCITABINE',
 'PACLITAXEL'}

In [41]:
predictions = []
y_true = []
drugs = []
for idx, row in test_tcga_with_response.iterrows():
    if row["drug"] in models.keys():
        m = models[row["drug"]]
        inp = torch.from_numpy(tcga_test_df.loc[row["patient.arr"]].values.astype(np.float32)).to(torch.device("cuda:0"))
        predictions.append(nn.Sigmoid()(m(inp)).cpu().detach().numpy())
        y_true.append(row["response_cat"])
        drugs.append(row["drug"])

In [42]:
len(predictions), len(y_true)

(159, 159)

In [43]:
from sklearn.metrics import roc_auc_score, average_precision_score
auroc = roc_auc_score(y_true, predictions)
auprc = average_precision_score(y_true, predictions)

In [44]:
print(f"AUROC = {auroc}, AUPRC = {auprc}")

AUROC = 0.5106719367588932, AUPRC = 0.73922963305143


In [45]:
# per drug
res_df = pd.DataFrame(predictions, columns=["y_pred"])
res_df["y_true"] = y_true
res_df["drug"] = drugs

In [46]:
res_df

Unnamed: 0,y_pred,y_true,drug
0,1.532811e-07,1,DOXORUBICIN
1,1.601508e-07,1,CISPLATIN
2,1.436534e-06,0,GEMCITABINE
3,2.381338e-07,0,DOXORUBICIN
4,2.525214e-07,0,CISPLATIN
...,...,...,...
154,5.943990e-08,1,PACLITAXEL
155,4.865178e-08,0,PACLITAXEL
156,2.631779e-07,1,CISPLATIN
157,3.595757e-08,1,PACLITAXEL


In [47]:
res_df.y_true.value_counts()

y_true
1    115
0     44
Name: count, dtype: int64

In [48]:
for d in res_df.drug.unique():
    try:
        subset_df = res_df[res_df.drug == d]
        auroc = roc_auc_score(subset_df["y_true"], subset_df["y_pred"])
        auprc = average_precision_score(subset_df["y_true"], subset_df["y_pred"])
        print(f"Drug {d} | AUROC = {auroc}, AUPRC = {auprc}")
        print(subset_df.shape)
        print(subset_df.y_true.value_counts())
    except:
        continue

Drug DOXORUBICIN | AUROC = 0.696969696969697, AUPRC = 0.9248297157388068
(14, 3)
y_true
1    11
0     3
Name: count, dtype: int64
Drug CISPLATIN | AUROC = 0.6413793103448276, AUPRC = 0.8190869157266278
(39, 3)
y_true
1    29
0    10
Name: count, dtype: int64
Drug GEMCITABINE | AUROC = 0.5833333333333334, AUPRC = 0.5660714285714286
(12, 3)
y_true
0    6
1    6
Name: count, dtype: int64
Drug PACLITAXEL | AUROC = 0.6177777777777778, AUPRC = 0.8500460441001377
(34, 3)
y_true
1    25
0     9
Name: count, dtype: int64
Drug 5-FLUOROURACIL | AUROC = 0.4666666666666667, AUPRC = 0.7329797000140506
(29, 3)
y_true
1    20
0     9
Name: count, dtype: int64
Drug CYCLOPHOSPHAMIDE | AUROC = 0.5, AUPRC = 0.9292325219714925
(18, 3)
y_true
1    16
0     2
Name: count, dtype: int64
Drug DOCETAXEL | AUROC = 0.75, AUPRC = 0.8356150793650794
(13, 3)
y_true
1    8
0    5
Name: count, dtype: int64
