In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import os

In [4]:
import pickle

In [5]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [6]:
from scipy.stats import pearsonr, spearmanr

In [7]:
# Root folder for per-model run outputs. The scoring cells read
# "<saved_folder>/<model>/saved_model_annotated_mutations/..." for every model
# whose name does not contain "DiffDRP".
saved_folder = "/data//papers_data/systematic_assessment/run_files/"

In [8]:
# Methods whose saved predictions are scored below.
# Entries that are commented out are left out of the evaluation.
models = [
    "DiffDRP_v7",  # another name for GANDALF
    # "DruID",
    # "drug2tme",
    # "PANCDR",
    # "PREDICT-AI",
]

### Annotated mutations

In [9]:
# Load the Experiment 2 / Setting A patient pickles, one file per fold.
# Later cells index each loaded object with "test".
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load files written by a trusted pipeline.
exp2A_dir = "/data//papers_data/systematic_assessment/input_types/annotated_mutations/Experiment2/SettingA"


def _load_exp2A_fold(fold_idx):
    """Return the object unpickled from ``patients_fold{fold_idx}_processed.pkl``."""
    with open(f"{exp2A_dir}/patients_fold{fold_idx}_processed.pkl", "rb") as f:
        return pickle.load(f)


exp2A_fold0 = _load_exp2A_fold(0)
exp2A_fold1 = _load_exp2A_fold(1)
exp2A_fold2 = _load_exp2A_fold(2)

In [10]:
# Distinct drugs present in the fold-0 test split.
exp2A_fold0["test"]["drug_name"].unique()

array(['BUPARLISIB', 'CISPLATIN', 'FLUOROURACIL', 'GEMCITABINE',
       'PACLITAXEL', 'SORAFENIB', 'TEMOZOLOMIDE'], dtype=object)

In [11]:
# Class balance of the recist label in the fold-0 test split.
exp2A_fold0["test"]["recist"].value_counts()

recist
0    67
1    48
Name: count, dtype: int64

In [12]:
# Class balance of the recist label in the fold-1 test split.
exp2A_fold1["test"]["recist"].value_counts()

recist
0    66
1    48
Name: count, dtype: int64

In [13]:
# Class balance of the recist label in the fold-2 test split.
exp2A_fold2["test"]["recist"].value_counts()

recist
0    64
1    49
Name: count, dtype: int64

In [14]:
# Number of fold-0 test rows per drug.
exp2A_fold0["test"]["drug_name"].value_counts()

drug_name
TEMOZOLOMIDE    30
CISPLATIN       27
SORAFENIB       15
GEMCITABINE     14
FLUOROURACIL    12
BUPARLISIB       9
PACLITAXEL       8
Name: count, dtype: int64

#### Drug-based results

In [15]:
# fold 0
# Per-drug AUROC, AUPRC and Pearson correlation of every model's fold-0 patient
# predictions. Result: res_df0, one row per (drug, method).
fold = 0
# Drugs that are scored from their own "_tuned4<DRUG>" prediction file.
tuned_drugs = ["FLUOROURACIL", "GEMCITABINE", "PACLITAXEL", "TEMOZOLOMIDE", "CISPLATIN"]
result_columns = ["drug_name", "AUROC", "AUPRC", "pearsonr", "pearsonr_pval", "method", "fold"]
# Rows are collected here and turned into a DataFrame once, instead of growing a
# frame row by row through the private DataFrame._append.
fold_records = []
for m in models:
    if "DiffDRP" not in m:
        path = f"{saved_folder}/{m}/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}.csv"
    else:
        path = f"/data//papers_data/{m}/run_files/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}.csv"
    if not os.path.exists(path):
        # Report the gap: a model without predictions contributes no rows at all.
        print(f"[fold {fold}] no prediction file for {m}, skipping: {path}")
        continue
    df = pd.read_csv(path, index_col=0)
    # Column-wise concat aligns rows on the index.
    # NOTE(review): assumes the prediction file is indexed like the test split - confirm.
    combined_df = pd.concat([exp2A_fold0["test"][["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]], df], axis=1)
    for d in combined_df.drug_name.unique():
        if d in tuned_drugs:
            # y_true / y_pred come from the drug-specific file, not from combined_df.
            # NOTE(review): this path always uses the /data//papers_data/<model>/run_files
            # layout, also for models without "DiffDRP" in their name - confirm.
            subset_df = pd.read_csv(f"/data//papers_data/{m}/run_files/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}_tuned4{d}.csv")
        else:
            subset_df = combined_df[combined_df.drug_name == d]
        if len(subset_df["y_true"].value_counts()) < 2:
            continue  # only one class present: AUROC / AUPRC are not computed
        auroc_val = round(roc_auc_score(subset_df["y_true"], subset_df["y_pred"]), 4)
        auprc_val = round(average_precision_score(subset_df["y_true"], subset_df["y_pred"]), 4)
        pearsonr_val = pearsonr(subset_df["y_true"], subset_df["y_pred"])
        fold_records.append({"drug_name": d, "AUROC": auroc_val, "AUPRC": auprc_val, "pearsonr": pearsonr_val.statistic, "pearsonr_pval": pearsonr_val.pvalue, "method": m, "fold": fold})
# Passing the column list keeps the schema stable even when no row was produced.
res_df0 = pd.DataFrame(fold_records, columns=result_columns)
res_df0

Unnamed: 0,drug_name,AUROC,AUPRC,pearsonr,pearsonr_pval,method,fold
0,CISPLATIN,0.6304,0.9317,-0.00211,0.991667,DiffDRP_v7,0
1,FLUOROURACIL,0.6571,0.7406,0.378179,0.225467,DiffDRP_v7,0
2,GEMCITABINE,0.6667,0.6541,0.325246,0.256503,DiffDRP_v7,0
3,PACLITAXEL,0.7143,0.9617,-0.075306,0.859334,DiffDRP_v7,0
4,SORAFENIB,0.1429,0.0769,-0.27678,0.317961,DiffDRP_v7,0
5,TEMOZOLOMIDE,0.6923,0.2541,0.234628,0.212026,DiffDRP_v7,0


In [16]:
# AUROC: fold-0 value per drug (columns) for each method (rows).
res_df0.pivot_table(values="AUROC", index="method", columns="drug_name", aggfunc="mean")

drug_name,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,SORAFENIB,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DiffDRP_v7,0.6304,0.6571,0.6667,0.7143,0.1429,0.6923


In [17]:
# AUPRC: fold-0 value per drug (columns) for each method (rows).
# "mean" is pivot_table's default aggregation, written out explicitly here.
res_df0.pivot_table(values="AUPRC", index="method", columns="drug_name", aggfunc="mean")

drug_name,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,SORAFENIB,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DiffDRP_v7,0.9317,0.7406,0.6541,0.9617,0.0769,0.2541


In [18]:
# Number of fold-0 test rows per (drug, recist) pair. This shows which drugs
# have both classes present and can therefore be scored.
exp2A_fold0["test"][["drug_name", "recist"]].value_counts()

drug_name     recist
TEMOZOLOMIDE  0         26
CISPLATIN     1         23
SORAFENIB     0         14
BUPARLISIB    0          9
GEMCITABINE   0          8
FLUOROURACIL  1          7
PACLITAXEL    1          7
GEMCITABINE   1          6
FLUOROURACIL  0          5
CISPLATIN     0          4
TEMOZOLOMIDE  1          4
PACLITAXEL    0          1
SORAFENIB     1          1
Name: count, dtype: int64

In [19]:
# fold 1
# Per-drug AUROC, AUPRC and Pearson correlation of every model's fold-1 patient
# predictions. Result: res_df1, one row per (drug, method).
fold = 1
# Drugs that are scored from their own "_tuned4<DRUG>" prediction file.
tuned_drugs = ["FLUOROURACIL", "GEMCITABINE", "PACLITAXEL", "TEMOZOLOMIDE", "CISPLATIN"]
result_columns = ["drug_name", "AUROC", "AUPRC", "pearsonr", "pearsonr_pval", "method", "fold"]
# Rows are collected here and turned into a DataFrame once, instead of growing a
# frame row by row through the private DataFrame._append.
fold_records = []
for m in models:
    if "DiffDRP" not in m:
        path = f"{saved_folder}/{m}/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}.csv"
    else:
        path = f"/data//papers_data/{m}/run_files/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}.csv"
    if not os.path.exists(path):
        # Report the gap: a model without predictions contributes no rows at all.
        print(f"[fold {fold}] no prediction file for {m}, skipping: {path}")
        continue
    df = pd.read_csv(path, index_col=0)
    # Column-wise concat aligns rows on the index.
    # NOTE(review): assumes the prediction file is indexed like the test split - confirm.
    combined_df = pd.concat([exp2A_fold1["test"][["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]], df], axis=1)
    for d in combined_df.drug_name.unique():
        if d in tuned_drugs:
            # y_true / y_pred come from the drug-specific file, not from combined_df.
            # NOTE(review): this path always uses the /data//papers_data/<model>/run_files
            # layout, also for models without "DiffDRP" in their name - confirm.
            subset_df = pd.read_csv(f"/data//papers_data/{m}/run_files/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}_tuned4{d}.csv")
        else:
            subset_df = combined_df[combined_df.drug_name == d]
        if len(subset_df["y_true"].value_counts()) < 2:
            continue  # only one class present: AUROC / AUPRC are not computed
        auroc_val = round(roc_auc_score(subset_df["y_true"], subset_df["y_pred"]), 4)
        auprc_val = round(average_precision_score(subset_df["y_true"], subset_df["y_pred"]), 4)
        pearsonr_val = pearsonr(subset_df["y_true"], subset_df["y_pred"])
        fold_records.append({"drug_name": d, "AUROC": auroc_val, "AUPRC": auprc_val, "pearsonr": pearsonr_val.statistic, "pearsonr_pval": pearsonr_val.pvalue, "method": m, "fold": fold})
# Passing the column list keeps the schema stable even when no row was produced.
res_df1 = pd.DataFrame(fold_records, columns=result_columns)
res_df1
            

Unnamed: 0,drug_name,AUROC,AUPRC,pearsonr,pearsonr_pval,method,fold
0,BUPARLISIB,0.875,0.5,0.263912,0.492597,DiffDRP_v7,1
1,CISPLATIN,0.6058,0.9279,0.059161,0.756149,DiffDRP_v7,1
2,FLUOROURACIL,0.7857,0.9038,0.428056,0.189044,DiffDRP_v7,1
3,GEMCITABINE,0.5417,0.3889,0.108924,0.749881,DiffDRP_v7,1
4,PACLITAXEL,0.6875,0.9294,0.224748,0.532454,DiffDRP_v7,1
5,TEMOZOLOMIDE,0.5556,0.2527,0.04791,0.801498,DiffDRP_v7,1


In [20]:
# AUROC: fold-1 value per drug (columns) for each method (rows).
# "mean" is pivot_table's default aggregation, written out explicitly here.
res_df1.pivot_table(values="AUROC", index="method", columns="drug_name", aggfunc="mean")

drug_name,BUPARLISIB,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DiffDRP_v7,0.875,0.6058,0.7857,0.5417,0.6875,0.5556


In [21]:
# AUPRC: fold-1 value per drug (columns) for each method (rows).
# "mean" is pivot_table's default aggregation, written out explicitly here.
res_df1.pivot_table(values="AUPRC", index="method", columns="drug_name", aggfunc="mean")

drug_name,BUPARLISIB,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DiffDRP_v7,0.5,0.9279,0.9038,0.3889,0.9294,0.2527


In [22]:
# Number of fold-1 test rows per (drug, recist) pair. This shows which drugs
# have both classes present and can therefore be scored.
exp2A_fold1["test"][["drug_name", "recist"]].value_counts()

drug_name     recist
TEMOZOLOMIDE  0         27
CISPLATIN     1         26
SORAFENIB     0         13
BUPARLISIB    0          8
GEMCITABINE   0          8
PACLITAXEL    1          8
FLUOROURACIL  1          7
CISPLATIN     0          4
FLUOROURACIL  0          4
TEMOZOLOMIDE  1          3
GEMCITABINE   1          3
PACLITAXEL    0          2
BUPARLISIB    1          1
Name: count, dtype: int64

In [23]:
# fold 2
# Per-drug AUROC, AUPRC and Pearson correlation of every model's fold-2 patient
# predictions. Result: res_df2, one row per (drug, method).
fold = 2
# Drugs that are scored from their own "_tuned4<DRUG>" prediction file.
tuned_drugs = ["FLUOROURACIL", "GEMCITABINE", "PACLITAXEL", "TEMOZOLOMIDE", "CISPLATIN"]
result_columns = ["drug_name", "AUROC", "AUPRC", "pearsonr", "pearsonr_pval", "method", "fold"]
# Rows are collected here and turned into a DataFrame once, instead of growing a
# frame row by row through the private DataFrame._append.
fold_records = []
for m in models:
    if "DiffDRP" not in m:
        path = f"{saved_folder}/{m}/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}.csv"
    else:
        path = f"/data//papers_data/{m}/run_files/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}.csv"
    if not os.path.exists(path):
        # Report the gap: a model without predictions contributes no rows at all.
        print(f"[fold {fold}] no prediction file for {m}, skipping: {path}")
        continue
    df = pd.read_csv(path, index_col=0)
    # Column-wise concat aligns rows on the index.
    # NOTE(review): assumes the prediction file is indexed like the test split - confirm.
    combined_df = pd.concat([exp2A_fold2["test"][["sample_id", "drug_name", "recist", "mappedProject", "dataset_name"]], df], axis=1)
    for d in combined_df.drug_name.unique():
        if d in tuned_drugs:
            # y_true / y_pred come from the drug-specific file, not from combined_df.
            # NOTE(review): this path always uses the /data//papers_data/<model>/run_files
            # layout, also for models without "DiffDRP" in their name - confirm.
            subset_df = pd.read_csv(f"/data//papers_data/{m}/run_files/saved_model_annotated_mutations/prediction_patients_val_corr_2A_ALL_fold{fold}_tuned4{d}.csv")
        else:
            subset_df = combined_df[combined_df.drug_name == d]
        if len(subset_df["y_true"].value_counts()) < 2:
            continue  # only one class present: AUROC / AUPRC are not computed
        auroc_val = round(roc_auc_score(subset_df["y_true"], subset_df["y_pred"]), 4)
        auprc_val = round(average_precision_score(subset_df["y_true"], subset_df["y_pred"]), 4)
        pearsonr_val = pearsonr(subset_df["y_true"], subset_df["y_pred"])
        fold_records.append({"drug_name": d, "AUROC": auroc_val, "AUPRC": auprc_val, "pearsonr": pearsonr_val.statistic, "pearsonr_pval": pearsonr_val.pvalue, "method": m, "fold": fold})
# Passing the column list keeps the schema stable even when no row was produced.
res_df2 = pd.DataFrame(fold_records, columns=result_columns)
res_df2
            

Unnamed: 0,drug_name,AUROC,AUPRC,pearsonr,pearsonr_pval,method,fold
0,BUPARLISIB,0.375,0.1667,-0.120743,0.756993,DiffDRP_v7,2
1,CISPLATIN,0.6667,0.8684,0.192412,0.326624,DiffDRP_v7,2
2,FLUOROURACIL,0.75,0.9004,0.269771,0.422409,DiffDRP_v7,2
3,GEMCITABINE,0.6481,0.7193,0.358994,0.188819,DiffDRP_v7,2
4,PACLITAXEL,0.9167,0.9762,0.767714,0.026128,DiffDRP_v7,2
5,SORAFENIB,0.0769,0.0769,-0.292309,0.310524,DiffDRP_v7,2
6,TEMOZOLOMIDE,0.6875,0.2536,0.221448,0.257426,DiffDRP_v7,2


In [24]:
# AUROC: fold-2 value per drug (columns) for each method (rows).
# "mean" is pivot_table's default aggregation, written out explicitly here.
res_df2.pivot_table(values="AUROC", index="method", columns="drug_name", aggfunc="mean")

drug_name,BUPARLISIB,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,SORAFENIB,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DiffDRP_v7,0.375,0.6667,0.75,0.6481,0.9167,0.0769,0.6875


In [25]:
# AUPRC: fold-2 value per drug (columns) for each method (rows).
# "mean" is pivot_table's default aggregation, written out explicitly here.
res_df2.pivot_table(values="AUPRC", index="method", columns="drug_name", aggfunc="mean")

drug_name,BUPARLISIB,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,SORAFENIB,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DiffDRP_v7,0.1667,0.8684,0.9004,0.7193,0.9762,0.0769,0.2536


In [26]:
# Number of fold-2 test rows per (drug, recist) pair. This shows which drugs
# have both classes present and can therefore be scored.
exp2A_fold2["test"][["drug_name", "recist"]].value_counts()

drug_name     recist
CISPLATIN     1         24
TEMOZOLOMIDE  0         24
SORAFENIB     0         13
GEMCITABINE   0          9
BUPARLISIB    0          8
FLUOROURACIL  1          7
PACLITAXEL    1          6
GEMCITABINE   1          6
CISPLATIN     0          4
FLUOROURACIL  0          4
TEMOZOLOMIDE  1          4
PACLITAXEL    0          2
BUPARLISIB    1          1
SORAFENIB     1          1
Name: count, dtype: int64

In [27]:
# Mean AUROC over the folds (pivot_table's default aggregation): one row per
# method, one column per drug. BUPARLISIB and SORAFENIB are left out of the table.
# The drop is chained instead of done in place, so the cell builds the frame in
# a single expression.
drug_res_df_auroc = (
    pd.concat([res_df0, res_df1, res_df2])
    .pivot_table(index="method", values="AUROC", columns="drug_name")
    .reset_index()
    .drop(["BUPARLISIB", "SORAFENIB"], axis=1)
)

In [28]:
# CODE-AE reference AUROCs, entered as literals (they are not computed in this notebook).
code_ae_auroc = {"method": "CODE-AE", "CISPLATIN": 0.6322, "FLUOROURACIL": 0.5381, "GEMCITABINE": 0.5085, "PACLITAXEL": 0.3611, "TEMOZOLOMIDE": 0.433166667}
# pd.concat replaces the private DataFrame._append. Filtering out an existing
# CODE-AE row first keeps the cell idempotent when it is run more than once.
drug_res_df_auroc = pd.concat(
    [drug_res_df_auroc[drug_res_df_auroc["method"] != "CODE-AE"], pd.DataFrame([code_ae_auroc])],
    ignore_index=True,
)
drug_res_df_auroc.set_index("method")

Unnamed: 0_level_0,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DiffDRP_v7,0.6343,0.730933,0.618833,0.772833,0.645133
CODE-AE,0.6322,0.5381,0.5085,0.3611,0.433167


In [29]:
# Standard deviation of AUROC across folds, per method and drug.
# aggfunc="std" is pandas' own reducer (sample std, ddof=1). Passing np.std
# currently resolves to the same reducer but emits a FutureWarning, because a
# later pandas version will call the NumPy function (ddof=0) directly.
pd.concat([res_df0, res_df1, res_df2]).pivot_table(index="method", values="AUROC", columns="drug_name", aggfunc="std").reset_index().drop(["BUPARLISIB", "SORAFENIB"], axis=1)

  pd.concat([res_df0, res_df1, res_df2]).pivot_table(index="method", values="AUROC", columns="drug_name", aggfunc=np.std).reset_index().drop(["BUPARLISIB", "SORAFENIB"], axis=1)


drug_name,method,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
0,DiffDRP_v7,0.030637,0.066386,0.067444,0.125311,0.077575


In [30]:
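# CODE-AE standard deviations, listed per drug as: AUROC, AUPRC
# (no value is listed for CISPLATIN)
# FLUOROURACIL: 0.1606, 0.1435
# GEMCITABINE: 0.0503, 0.0701
# PACLITAXEL: 0.3155, 0.0574
# TEMOZOLOMIDE: 0.3123, 0.0929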

In [31]:
# Mean AUPRC over the folds (pivot_table's default aggregation): one row per
# method, one column per drug. BUPARLISIB and SORAFENIB are left out of the table.
# The drop is chained instead of done in place, so the cell builds the frame in
# a single expression.
drug_res_df_auprc = (
    pd.concat([res_df0, res_df1, res_df2])
    .pivot_table(index="method", values="AUPRC", columns="drug_name")
    .reset_index()
    .drop(["BUPARLISIB", "SORAFENIB"], axis=1)
)

In [32]:
# CODE-AE reference AUPRCs, entered as literals (they are not computed in this notebook).
code_ae_auprc = {"method": "CODE-AE", "CISPLATIN": 0.9059, "FLUOROURACIL": 0.6665, "GEMCITABINE": 0.4735, "PACLITAXEL": 0.8208, "TEMOZOLOMIDE": 0.1756333}
# pd.concat replaces the private DataFrame._append. Filtering out an existing
# CODE-AE row first keeps the cell idempotent when it is run more than once.
drug_res_df_auprc = pd.concat(
    [drug_res_df_auprc[drug_res_df_auprc["method"] != "CODE-AE"], pd.DataFrame([code_ae_auprc])],
    ignore_index=True,
)
drug_res_df_auprc.set_index("method")

Unnamed: 0_level_0,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DiffDRP_v7,0.909333,0.848267,0.587433,0.955767,0.253467
CODE-AE,0.9059,0.6665,0.4735,0.8208,0.175633


In [33]:
# Standard deviation of AUPRC across folds, per method and drug.
# aggfunc="std" is pandas' own reducer (sample std, ddof=1). Passing np.std
# currently resolves to the same reducer but emits a FutureWarning, because a
# later pandas version will call the NumPy function (ddof=0) directly.
pd.concat([res_df0, res_df1, res_df2]).pivot_table(index="method", values="AUPRC", columns="drug_name", aggfunc="std").reset_index().drop(["BUPARLISIB", "SORAFENIB"], axis=1)

  pd.concat([res_df0, res_df1, res_df2]).pivot_table(index="method", values="AUPRC", columns="drug_name", aggfunc=np.std).reset_index().drop(["BUPARLISIB", "SORAFENIB"], axis=1)


drug_name,method,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
0,DiffDRP_v7,0.0355,0.093258,0.174998,0.023958,0.000709


In [33]:
# Mean Pearson correlation across folds, per method and drug.
# BUPARLISIB and SORAFENIB are left out, as in the AUROC / AUPRC tables.
all_folds_df = pd.concat([res_df0, res_df1, res_df2])
all_folds_df.pivot_table(values="pearsonr", index="method", columns="drug_name", aggfunc="mean").drop(columns=["BUPARLISIB", "SORAFENIB"])

drug_name,CISPLATIN,FLUOROURACIL,GEMCITABINE,PACLITAXEL,TEMOZOLOMIDE
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DiffDRP_v7,0.083155,0.358669,0.264388,0.305718,0.167995
