## Output ROC scores for the otherclones dataset using the bortezomib signature

**Gregory Way, 2021**

How well can a bortezomib-specific classifier separate clones resistant to cb5083 and ixazomib?

In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, average_precision_score

import plotnine as gg

from utils.metrics import get_metrics, get_metric_pipeline

In [2]:
# Seed NumPy's global random number generator for reproducibility.
# Note: the permutation loop further down reseeds this generator on every iteration.
np.random.seed(56789)

In [3]:
# Analysis configuration

# Name of the dataset whose singscore results are evaluated
dataset = "otherclones"

# Input: singscore results file for the chosen dataset
sig_dir = pathlib.Path("results") / "singscore"
results_file = sig_dir / f"singscore_results_{dataset}.tsv.gz"

# Output: directory that receives the performance tables
output_dir = pathlib.Path("results") / "performance"

# Number of shuffled metric runs to perform
num_permutations = 100
# Forwarded to get_metric_pipeline as its `threshold` argument
threshold = 0

# Comparison name -> list of metadata columns, forwarded to get_metric_pipeline
# (presumably the grouping columns for each comparison; confirm in utils.metrics)
metric_comparisons = dict(dataset=["Metadata_dataset"])

# Values of Metadata_dataset for which ROC information is computed
roc_model_split_focus = ["ixazomib", "cb5083"]

In [4]:
# Read the tab-separated singscore results into a data frame
results_df = pd.read_table(results_file)

# Report the dimensions, then preview the first rows
print(results_df.shape)
results_df.head()

(420, 28)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,TotalScore,TotalDispersion,UpScore,UpDispersion,DownScore,DownDispersion,Metadata_permuted_p_value,dataset,min_permuted_value,max_permuted_value
0,218698,B02,2020_08_24_Batch9,12640,5x10^3 cells/well,1,WT_parental,20200818,218698,48 hr,...,-0.056488,381.7695,-0.071243,215.7183,0.014755,166.0512,0.702,otherclones,-0.155786,0.15683
1,218698,B03,2020_08_24_Batch9,8927,5x10^3 cells/well,2,WT clone 04,20200818,218698,48 hr,...,0.010667,707.2002,0.03311,391.4064,-0.022443,315.7938,0.487,otherclones,-0.155786,0.15683
2,218698,B04,2020_08_24_Batch9,6044,5x10^3 cells/well,3,WT clone 05,20200818,218698,48 hr,...,0.101739,488.5167,0.115978,239.4399,-0.014239,249.0768,0.145,otherclones,-0.155786,0.15683
3,218698,B05,2020_08_24_Batch9,7530,5x10^3 cells/well,4,WT clone 06,20200818,218698,48 hr,...,0.346523,320.9829,0.184524,199.4097,0.161999,121.5732,0.001,otherclones,-0.155786,0.15683
4,218698,B06,2020_08_24_Batch9,4726,5x10^3 cells/well,5,WT clone 07,20200818,218698,48 hr,...,-0.089659,394.3716,-0.099516,164.5686,0.009858,229.803,0.827,otherclones,-0.155786,0.15683


In [5]:
# Compute performance metrics on the real (unshuffled) predictions
# NOTE(review): the effect of `signature=False` is defined in utils.metrics; confirm there
real_pipeline_kwargs = dict(shuffle=False, signature=False, threshold=threshold)
real_metric_results = get_metric_pipeline(
    results_df, metric_comparisons, [dataset], **real_pipeline_kwargs
)

In [6]:
# Recompute the performance metrics on shuffled predictions, once per permutation,
# collecting one list of data frames per comparison
all_shuffle_results = {name: [] for name in metric_comparisons}
for permutation in range(num_permutations):
    # Reseed NumPy's global generator with the permutation index
    np.random.seed(permutation)
    shuffle_metric_results = get_metric_pipeline(
        results_df,
        metric_comparisons,
        datasets=[dataset],
        shuffle=True,
        signature=False,
        threshold=threshold,
    )
    # Tag each result with the permutation it came from before storing it
    for compare, results_list in all_shuffle_results.items():
        results_list.append(
            shuffle_metric_results[compare].assign(permutation=permutation)
        )

In [7]:
# Collect ROC AUC values and ROC curve tables for each focus split,
# once with shuffle=True and once with shuffle=False
roc_scores = []
roc_curve_data = []
for split in roc_model_split_focus:
    # Restrict to rows whose Metadata_dataset equals the current split
    results_subset_df = results_df.query("Metadata_dataset == @split")
    for shuffle in (True, False):
        roc_auc_val, roc_df = get_metrics(
            df=results_subset_df, return_roc_curve=True, shuffle=shuffle
        )
        roc_scores.append(
            {"roc_auc": roc_auc_val, "model_split": split, "shuffled": shuffle}
        )
        roc_curve_data.append(roc_df.assign(model_split=split, shuffled=shuffle))

# One row per (split, shuffle) combination
roc_scores_df = pd.DataFrame(roc_scores)
# Stack the per-combination curve tables into a single frame
roc_curve_data_df = pd.concat(roc_curve_data).reset_index(drop=True)

In [8]:
# Make sure the output directory exists; to_csv does not create missing directories
output_dir.mkdir(parents=True, exist_ok=True)

# Output performance results: one real and one shuffled table per comparison
for compare in metric_comparisons:
    full_results_df = real_metric_results[compare]
    # Combine the per-permutation tables into a single frame
    shuffle_results_df = pd.concat(all_shuffle_results[compare]).reset_index(drop=True)

    output_file = pathlib.Path(output_dir, f"{compare}_{dataset}_metric_performance.tsv")
    full_results_df.to_csv(output_file, sep="\t", index=False)

    output_file = pathlib.Path(output_dir, f"{compare}_{dataset}_shuffle_metric_performance.tsv")
    shuffle_results_df.to_csv(output_file, sep="\t", index=False)

# Output ROC results: AUC summary table and the full curve data
output_file = pathlib.Path(output_dir, f"{dataset}_bortezomibsignature_roc_auc.tsv")
roc_scores_df.to_csv(output_file, sep="\t", index=False)

output_file = pathlib.Path(output_dir, f"{dataset}_bortezomibsignature_roc_curve.tsv")
roc_curve_data_df.to_csv(output_file, sep="\t", index=False)