In [1]:
import json
import os
import warnings
from pathlib import Path

from pprint import pprint
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Base directory for every path in this notebook: the kernel's current working
# directory (Path(__file__).parent is not usable here, so the cwd stands in for it).
filepath = Path.cwd()
print(filepath)

# Column names shared by later cells; canc_col_name identifies the sample column
# that the plotting code pivots on.
canc_col_name, drug_col_name = "improve_sample_id", "improve_chem_id"

/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all


In [2]:
main_models_path = filepath / 'models'  # dir containing the collection of models

# One sub-directory per model. Plain files that happen to live next to the model
# directories are skipped so that later cells do not treat them as models.
models_paths_list = sorted(p for p in main_models_path.glob('*') if p.is_dir())
if not models_paths_list:
    raise FileNotFoundError(f"No model directories found in {main_models_path}")

# Preview one entry: the second model when there is one, otherwise the only model
# (a bare [1] would raise IndexError for a single-model collection).
models_paths_list[:2][-1]

PosixPath('/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/GraphDRP')

In [6]:
# Aggregate runtimes from all models
#
# Reads <model_dir>/postproc.csa.<model>.improve_output/runtimes.csv for every model
# directory, stacks the tables row-wise, and writes the result next to the notebook.
# Models without the file are reported with a warning and collected in `missing_files`.

res_fname = 'runtimes.csv'
out_fname = 'all_models_' + res_fname

agg_df_list = []
missing_files = []

for model_dir in models_paths_list:
    model_name = model_dir.name
    pp_res_path = model_dir / f"postproc.csa.{model_name}.improve_output"
    print(pp_res_path)
    try:
        rr = pd.read_csv(pp_res_path / res_fname, sep=',')
        agg_df_list.append(rr)
    except FileNotFoundError:
        warnings.warn(f"File not found! {pp_res_path}", UserWarning)
        missing_files.append(pp_res_path)

# pd.concat() fails with an unhelpful "No objects to concatenate" on an empty list;
# fail early with a message that says what was actually being looked for.
if not agg_df_list:
    raise FileNotFoundError(
        f"No '{res_fname}' was found under any of the {len(models_paths_list)} model directories"
    )

# ignore_index=True gives the combined table a unique 0..N-1 index instead of
# repeating each per-model index (the CSV is unaffected: it is written without index).
df = pd.concat(agg_df_list, axis=0, ignore_index=True)
df.to_csv(filepath / out_fname, index=False)
pprint(df.shape)
pprint(df.nunique())
pprint(df[:4])

/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/DeepCDR/postproc.csa.DeepCDR.improve_output
/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/GraphDRP/postproc.csa.GraphDRP.improve_output
/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/tCNNS/postproc.csa.tCNNS.improve_output
(1100, 9)
hours          5
minutes       44
seconds     1100
src            5
trg            5
split         10
tot_mins      66
model          2
stage          3
dtype: int64
   hours  minutes    seconds   src   trg  split  tot_mins    model       stage
0      0        2  33.573939  CCLE  CCLE      0         2  DeepCDR  preprocess
1      0        2  32.301698  CCLE  CCLE      1         2  DeepCDR  preprocess
2      0        2  34.182968  CCLE  CCLE      2         2  DeepCDR  preprocess
3      0        2  34.474254  CCLE  CCLE      3         2  DeepCDR  preprocess


  app.launch_new_instance()


In [7]:
# Aggregate scores from all models
#
# Reads <model_dir>/postproc.csa.<model>.improve_output/all_scores.csv for every model
# directory, stacks the tables row-wise, and writes the result next to the notebook.
# Models without the file are reported with a warning and collected in `missing_files`.

res_fname = 'all_scores.csv'
out_fname = 'all_models_' + res_fname

agg_df_list = []
missing_files = []

for model_dir in models_paths_list:
    model_name = model_dir.name
    pp_res_path = model_dir / f"postproc.csa.{model_name}.improve_output"
    print(pp_res_path)
    try:
        rr = pd.read_csv(pp_res_path / res_fname, sep=',')
        agg_df_list.append(rr)
    except FileNotFoundError:
        warnings.warn(f"File not found! {pp_res_path}", UserWarning)
        missing_files.append(pp_res_path)

# pd.concat() fails with an unhelpful "No objects to concatenate" on an empty list;
# fail early with a message that says what was actually being looked for.
if not agg_df_list:
    raise FileNotFoundError(
        f"No '{res_fname}' was found under any of the {len(models_paths_list)} model directories"
    )

# ignore_index=True gives the combined table a unique 0..N-1 index instead of
# repeating each per-model index (the CSV is unaffected: it is written without index).
df = pd.concat(agg_df_list, axis=0, ignore_index=True)
df.to_csv(filepath / out_fname, index=False)
pprint(df.shape)
pprint(df.nunique())
pprint(df[:4])

/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/DeepCDR/postproc.csa.DeepCDR.improve_output
/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/GraphDRP/postproc.csa.GraphDRP.improve_output
/nfs/lambda_stor_01/data/apartin/projects/IMPROVE/pan-models/csa_postproc_all/models/tCNNS/postproc.csa.tCNNS.improve_output
(3750, 6)
met         5
split      10
value    3750
src         5
trg         5
model       3
dtype: int64
   met  split     value   src   trg    model
0  mse      0  0.006245  CCLE  CCLE  DeepCDR
1  mse      1  0.006012  CCLE  CCLE  DeepCDR
2  mse      2  0.006210  CCLE  CCLE  DeepCDR
3  mse      3  0.005937  CCLE  CCLE  DeepCDR


## Plot

In [2]:
def gen_plot(df: pd.DataFrame, fpath: str, tr_data: str, errorbar: bool=False, show: bool=False,
             model_name=None):
    """Plot mean predictions per drug as grouped bars (one bar per sample) and save the figure.

    The table is pivoted to drugs x samples, drugs are ordered by the mean prediction
    of the first sample found in ``df``, and every bar is annotated with its value.

    Args:
        df: Long-format table with the columns 'drug_name', the sample column named by
            the module-level ``canc_col_name``, 'pred_mean', and (only when
            ``errorbar`` is True) 'pred_std'. Each (drug, sample) pair must occur at
            most once, otherwise ``DataFrame.pivot`` raises ValueError.
        fpath: Destination passed to ``savefig`` (a str or a path-like object).
        tr_data: Name of the training data; only used in the figure title.
        errorbar: Draw 'pred_std' as error bars on top of the bars.
        show: Display the figure after saving. When False the figure is closed so
            that repeated calls do not accumulate open figures.
        model_name: Model name for the title. When omitted, the module-level
            ``model_name`` variable is used if it exists (the behavior before this
            parameter was added).

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("gen_plot() received an empty DataFrame; nothing to plot.")

    if model_name is None:
        # Backward compatibility: earlier versions read the notebook-level variable.
        model_name = globals().get('model_name', '')

    value_cols = ['pred_mean', 'pred_std'] if errorbar else ['pred_mean']
    df_pivot = df.pivot(index='drug_name', columns=canc_col_name, values=value_cols)

    # Order the drugs by the prediction of one reference sample (the first one in df)
    cells = df[canc_col_name].unique()
    df_pivot = df_pivot.sort_values(('pred_mean', cells[0]))

    # Take the drug labels from the *sorted* pivot: the bars below are drawn in this
    # order, so labels taken from df's row order would end up under the wrong bars.
    drugs = df_pivot.index
    index = np.arange(len(drugs))  # The label locations

    # Two samples keep the historical 0.4 bar width; more samples get narrower bars
    # so that the bars of one drug never spill into the next drug's slot.
    n_cells = len(cells)
    bar_width = min(0.4, 0.8 / n_cells)

    # Request one color per sample; a bare palette would silently drop samples
    # beyond its length when zipped with `cells`.
    colors = sns.color_palette("pastel", n_colors=n_cells)

    # Plotting
    fig, ax = plt.subplots(figsize=(14, 8))

    # Plot each sample as one series of bars, shifted by its position in the group
    for i, (cell, color) in enumerate(zip(cells, colors)):
        pred_mean = df_pivot['pred_mean'][cell].values
        bar_kwargs = dict(label=cell, color=color)
        if errorbar:
            bar_kwargs.update(yerr=df_pivot['pred_std'][cell].values, capsize=5, ecolor='black',
                              error_kw=dict(linestyle='--', linewidth=1))
        bars = ax.bar(index + i * bar_width, pred_mean, bar_width, **bar_kwargs)

        # Add annotations
        for bar, mean in zip(bars, pred_mean):
            yval = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2, yval, f'{mean:.2f}', ha='center', va='bottom')

    # Add labels, title, and legend
    title_prefix = f'{model_name}: ' if model_name else ''
    ax.set_xlabel('Drug')
    ax.set_ylabel('Predicted AUC (mean across data splits)')
    ax.set_title(f'{title_prefix}Predicted AUC by Drug and PDO (trained on {tr_data})')
    # Center each tick under its group of bars (equals index + bar_width / 2 for two samples)
    ax.set_xticks(index + bar_width * (n_cells - 1) / 2)
    ax.set_xticklabels(drugs, rotation=45)
    ax.legend()
    ax.grid()

    # The y axis starts at 0.3 rather than 0; bars below that value are cut off.
    ax.set_ylim(0.3, None)

    fig.tight_layout()

    # Save the plot to a file
    fig.savefig(fpath, dpi=300)

    # Honor the `show` flag (it used to be ignored and the figure was always shown)
    if show:
        plt.show()
    else:
        plt.close(fig)

In [18]:
# Choose which aggregated predictions to load and plot.
# Alternative settings kept for reference (IMPROVE models):
#   model_name: "DeepTTC", "GraphDRP", "HIDRA", "IGTD", "PaccMann_MCA"
#   source = "CTRPv2", target = "PDMR"
model_name, source, target = "UNO", "all", "PDMR"

# Tab-separated table of aggregated predictions for the selected model / source / target
agg_df = pd.read_csv(
    filepath.joinpath("plots_outdir", f"agg_preds_{model_name}_{source}_{target}.tsv"),
    sep="\t",
)

In [None]:
# Draw the grouped bar chart for the loaded predictions and save it as a PNG
gen_plot(
    df=agg_df,
    fpath=filepath.joinpath("plots_outdir", f'plot_{model_name}_{source}-{target}.png'),
    tr_data=source,
    errorbar=False,
    show=True,
)