## Output ROC scores for the otherclones dataset scored with the bortezomib signature

**Gregory Way, 2021**

How well is a bortezomib-specific classifier able to separate clones resistant to cb5083 and ixazomib?

In [6]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, average_precision_score

import plotnine as gg

from utils.metrics import get_metrics, get_metric_pipeline

In [7]:
# Fix the state of NumPy's global random number generator for this notebook
np.random.seed(seed=56789)

In [8]:
# Set constants
dataset = "otherclones"

# Input: singscore results table for the dataset named above
sig_dir = pathlib.Path("results") / "singscore"
results_file = sig_dir / f"singscore_results_{dataset}.tsv.gz"

# Output: directory that receives the performance tables
output_dir = pathlib.Path("results") / "performance"

# Number of shuffled runs, and the threshold passed to the metric pipeline
num_permutations = 100
threshold = 0

# Comparison name -> metadata columns (presumably the grouping columns used
# by the metric pipeline; confirm against utils.metrics)
metric_comparisons = {"dataset": ["Metadata_dataset"]}

# Values of Metadata_dataset for which ROC curves are computed
roc_model_split_focus = ["ixazomib", "cb5083"]

In [9]:
# Load the tab-separated singscore results
# NOTE(review): the displayed table includes an "Unnamed: 0" column, presumably
# an index that was written alongside the data - confirm whether it is needed
results_df = pd.read_csv(results_file, delimiter="\t")

print(results_df.shape)
results_df.head()

(200, 28)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,TotalScore,TotalDispersion,UpScore,UpDispersion,DownScore,DownDispersion,Metadata_permuted_p_value,dataset,min_permuted_value,max_permuted_value
0,221057,B05,2021_08_02_Batch24,1814,2.5x10^3 cells/well,4,WT clone 10,20210728,221057,48 hr,...,0.299626,446.2626,0.216704,170.499,0.082922,275.7636,0.001,otherclones,-0.15444,0.159013
1,221057,B06,2021_08_02_Batch24,5481,2.5x10^3 cells/well,5,WT clone 12,20210728,221057,48 hr,...,-0.396296,266.1267,-0.288504,135.6579,-0.107792,130.4688,1.0,otherclones,-0.15444,0.159013
2,221057,B10,2021_08_02_Batch24,1925,2.5x10^3 cells/well,4,WT clone 10,20210728,221057,48 hr,...,0.132283,522.6165,0.131789,255.7485,0.000494,266.868,0.096,otherclones,-0.15444,0.159013
3,221057,B11,2021_08_02_Batch24,3910,2.5x10^3 cells/well,5,WT clone 12,20210728,221057,48 hr,...,-0.218328,334.3263,-0.107143,186.0663,-0.111185,148.26,0.992,otherclones,-0.15444,0.159013
4,221057,C02,2021_08_02_Batch24,2230,2.5x10^3 cells/well,10,BZ007,20210728,221057,48 hr,...,0.280254,309.1221,0.193638,166.7925,0.086616,142.3296,0.002,otherclones,-0.15444,0.159013


In [10]:
# Get performance metrics using real (unshuffled) predictions
real_metric_results = get_metric_pipeline(
    results_df, metric_comparisons, [dataset],
    shuffle=False, signature=False, threshold=threshold,
)

In [6]:
# Get performance metrics using shuffled predictions
# One list of per-permutation metric tables is collected for each comparison
all_shuffle_results = {compare: [] for compare in metric_comparisons}
for permutation_idx in range(num_permutations):
    # Reseed the global generator with the permutation number before each run
    np.random.seed(permutation_idx)
    shuffle_metric_results = get_metric_pipeline(
        results_df, metric_comparisons,
        datasets=[dataset], shuffle=True, signature=False, threshold=threshold,
    )
    for compare, collected in all_shuffle_results.items():
        # Tag every table with the permutation it came from
        collected.append(
            shuffle_metric_results[compare].assign(permutation=permutation_idx)
        )

In [7]:
# Get ROC curve information for model sets
# Each focus split is evaluated twice: first shuffled, then unshuffled
roc_scores = []
roc_curve_data = []
for split in roc_model_split_focus:
    results_subset_df = results_df.query("Metadata_dataset == @split")
    for shuffle in (True, False):
        roc_auc_val, roc_df = get_metrics(
            df=results_subset_df,
            return_roc_curve=True,
            shuffle=shuffle,
        )

        # One summary row per (split, shuffle) pair, plus the tagged curve table
        roc_scores.append((roc_auc_val, split, shuffle))
        roc_curve_data.append(roc_df.assign(model_split=split, shuffled=shuffle))

roc_scores_df = pd.DataFrame(
    roc_scores, columns=["roc_auc", "model_split", "shuffled"]
)
roc_curve_data_df = pd.concat(roc_curve_data, ignore_index=True)

In [8]:
# Output performance results
# Create the output directory first: to_csv does not create missing parent
# directories, so every write below would fail if it did not exist yet
output_dir.mkdir(parents=True, exist_ok=True)

for compare in metric_comparisons:
    full_results_df = real_metric_results[compare]
    # Stack the per-permutation tables into a single frame with a fresh index
    shuffle_results_df = pd.concat(all_shuffle_results[compare]).reset_index(drop=True)

    output_file = output_dir / f"{compare}_{dataset}_metric_performance.tsv"
    full_results_df.to_csv(output_file, sep="\t", index=False)

    output_file = output_dir / f"{compare}_{dataset}_shuffle_metric_performance.tsv"
    shuffle_results_df.to_csv(output_file, sep="\t", index=False)

# Output ROC results
output_file = output_dir / f"{dataset}_bortezomibsignature_roc_auc.tsv"
roc_scores_df.to_csv(output_file, sep="\t", index=False)

output_file = output_dir / f"{dataset}_bortezomibsignature_roc_curve.tsv"
roc_curve_data_df.to_csv(output_file, sep="\t", index=False)