## Output ROC scores for the otherclones datasets with the bortezomib signature applied

**Gregory Way, 2021**

How well is a bortezomib-specific classifier able to separate clones resistant to cb5083 and ixazomib?

**Yu Han, 2021**

How well is a bortezomib-specific classifier able to separate WT and BZ clone types?

I did not make any major changes to Greg's original script 10.0, other than changing variable names so that it loads the new batch results produced by scripts 8.1 and 9.1.

In [99]:
import sys
import pathlib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, average_precision_score

import plotnine as gg

from utils.metrics import get_metrics, get_metric_pipeline

In [100]:
# Seed NumPy's global random number generator.
# Note: the permutation loop further down calls np.random.seed again on every iteration.
np.random.seed(56789)

In [135]:
# Notebook-wide configuration

# Dataset label: embedded in the input/output file names and handed to the metric pipeline
dataset = "otherclones"

# Input location of the singscore results table
sig_dir = pathlib.Path("results") / "singscore"
results_file = sig_dir / f"singscore_results_LAST_BATCH_VALIDATION{dataset}.tsv.gz"

# Directory that receives the performance tables
output_dir = pathlib.Path("results") / "performance"

# Threshold forwarded to the metric pipeline, and the number of shuffled reruns
threshold = 0
num_permutations = 100

# Comparison name -> metadata columns, passed to the metric pipeline
metric_comparisons = dict(dataset=["Metadata_dataset"])

# Values matched against the Metadata_dataset column when building ROC curves
roc_model_split_focus = ["WT", "BZ"]

In [136]:
# Read the tab-separated results table, then inspect its size and the class
# indicator column to confirm that two classes are present
# (e.g., resistant and sensitive)
results_df = pd.read_csv(results_file, sep="\t")

print(results_df.shape)
results_df.Metadata_clone_type_indicator

(200, 28)


0      0
1      0
2      0
3      0
4      1
      ..
195    1
196    1
197    1
198    1
199    1
Name: Metadata_clone_type_indicator, Length: 200, dtype: int64

In [116]:
# Run the metric pipeline on the predictions as-is (no shuffling)
real_metric_options = {"shuffle": False, "signature": False, "threshold": threshold}
real_metric_results = get_metric_pipeline(
    results_df, metric_comparisons, [dataset], **real_metric_options
)

In [125]:
# Build a permutation baseline: rerun the metric pipeline with shuffle=True once
# per permutation, reseeding NumPy with the permutation index before each run
all_shuffle_results = {key: [] for key in metric_comparisons}
for seed in range(num_permutations):
    np.random.seed(seed)
    shuffle_metric_results = get_metric_pipeline(
        results_df,
        metric_comparisons,
        datasets=[dataset],
        shuffle=True,
        signature=False,
        threshold=threshold,
    )
    # Tag every result table with its permutation index before storing it
    for compare, collected in all_shuffle_results.items():
        collected.append(shuffle_metric_results[compare].assign(permutation=seed))

In [128]:
# Sanity check: tally how many rows carry each clone type indicator value
results_df["Metadata_clone_type_indicator"].value_counts()

0    120
1     80
Name: Metadata_clone_type_indicator, dtype: int64

In [121]:
# Collect one ROC AUC record and one ROC curve table per (split, shuffled) pair
roc_scores = []
roc_curve_data = []
for split in roc_model_split_focus:
    # Keep only the rows whose Metadata_dataset matches the current split
    results_subset_df = results_df.query("Metadata_dataset == @split")
    # The shuffled run comes first, then the real one, for every split
    for is_shuffled in (True, False):
        auc_value, curve_df = get_metrics(
            df=results_subset_df, return_roc_curve=True, shuffle=is_shuffled
        )
        roc_scores.append(pd.Series([auc_value, split, is_shuffled]))
        roc_curve_data.append(curve_df.assign(model_split=split, shuffled=is_shuffled))

# One row per (split, shuffled) pair
roc_scores_df = pd.DataFrame(roc_scores)
roc_scores_df.columns = ["roc_auc", "model_split", "shuffled"]

# Stack all curve tables under a fresh 0..n-1 index
roc_curve_data_df = pd.concat(roc_curve_data, ignore_index=True)

In [122]:
# Display the ROC AUC table (one row per model split and shuffled flag)
roc_scores_df

Unnamed: 0,roc_auc,model_split,shuffled
0,0.55,WT,True
1,0.7,WT,False
2,0.4,BZ,True
3,0.7,BZ,False


In [123]:
# Display the concatenated ROC curve table built in the ROC cell above
roc_curve_data_df

Unnamed: 0,fpr,tpr,threshold,model_split,shuffled
0,0.0000,0.000,1.522342,WT,True
1,0.0000,0.025,0.522342,WT,True
2,0.0125,0.025,0.489874,WT,True
3,0.0250,0.025,0.449164,WT,True
4,0.0250,0.050,0.408079,WT,True
5,0.0375,0.050,0.394142,WT,True
6,0.0375,0.075,0.391508,WT,True
7,0.0500,0.075,0.388999,WT,True
8,0.0625,0.075,0.388378,WT,True
9,0.0625,0.100,0.379831,WT,True


In [129]:
# Write all result tables as tab-separated files under output_dir.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# destination exists first (no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

# Output performance results: one real and one shuffled table per comparison
for compare in metric_comparisons:
    full_results_df = real_metric_results[compare]
    # Stack the per-permutation tables into a single frame with a fresh index
    shuffle_results_df = pd.concat(all_shuffle_results[compare]).reset_index(drop=True)

    output_file = output_dir / f"{compare}_{dataset}_metric_performance_LAST_BATCH_VALIDATION.tsv"
    full_results_df.to_csv(output_file, sep="\t", index=False)

    output_file = output_dir / f"{compare}_{dataset}_shuffle_metric_performance_LAST_BATCH_VALIDATION.tsv"
    shuffle_results_df.to_csv(output_file, sep="\t", index=False)

# Output ROC results: AUC summary table and the full curve coordinates
output_file = output_dir / f"{dataset}_bortezomibsignature_roc_auc_LAST_BATCH_VALIDATION.tsv"
roc_scores_df.to_csv(output_file, sep="\t", index=False)

output_file = output_dir / f"{dataset}_bortezomibsignature_roc_curve_LAST_BATCH_VALIDATION.tsv"
roc_curve_data_df.to_csv(output_file, sep="\t", index=False)