In [1]:
import sys
import pathlib
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, average_precision_score

import plotnine as gg

sys.path.insert(0, "../3.bulk-signatures/")
from utils.metrics import get_metrics, get_metric_pipeline

In [2]:
# Seed NumPy's global random number generator for the notebook.
# Note: the permutation loop further down calls np.random.seed(i) on every
# iteration, so this value only governs random draws made before that loop.
np.random.seed(5678)

In [3]:
# Set constants
dataset = "bortezomib"

# Input location of the singscore results table for this dataset.
# NOTE(review): there is no separator between "singscore_results" and the
# dataset name; the recorded output shows the file loads under this name, so
# it is kept as-is -- confirm the naming is intentional upstream.
sig_dir = pathlib.Path("results") / "singscore"
results_file = sig_dir / f"singscore_results{dataset}.tsv.gz"

# Directory the performance tables are written to at the end of the notebook
output_dir = pathlib.Path("results") / "performance"

# Number of shuffled runs of the metric pipeline, and the threshold value
# forwarded to get_metric_pipeline
num_permutations = 25
threshold = 0

# Comparison name -> list of metadata column names handed to the pipeline
# (presumably the columns metrics are grouped by -- confirm in utils.metrics)
split_column = "Metadata_model_split"
metric_comparisons = {
    "total": [split_column],
    "plate": [split_column, "Metadata_Plate"],
    "sample": [split_column, "Metadata_clone_number"],
}

In [4]:
# Load data
# Read the tab-separated results table configured in the constants cell
results_df = pd.read_csv(results_file, delimiter="\t")

# Report (rows, columns), then show the first rows as the cell's output
results_shape = results_df.shape
print(results_shape)
results_df.head()

(405, 28)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_batch,Metadata_cell_count,Metadata_cell_density,Metadata_celltype_shorthand_from_plate_graph,Metadata_clone_number,Metadata_date,Metadata_plate_map_name,Metadata_time_to_adhere,...,TotalScore,TotalDispersion,UpScore,UpDispersion,DownScore,DownDispersion,Metadata_permuted_p_value,dataset,min_permuted_value,max_permuted_value
0,219907,B02,2021_03_03_Batch12,6139,2.5x10^3 cells/well,1.0,WT_parental,20210205.0,219814,48 hr,...,-0.228172,2882.1744,-0.123536,1598.2428,-0.104636,1283.9316,0.977,bortezomib,-0.187707,0.189248
1,219907,B03,2021_03_03_Batch12,4567,2.5x10^3 cells/well,2.0,CloneA,20210205.0,219814,48 hr,...,0.097773,825.8082,0.135187,545.5968,-0.037414,280.2114,0.188,bortezomib,-0.187707,0.189248
2,219907,B04,2021_03_03_Batch12,5624,2.5x10^3 cells/well,3.0,CloneE,20210205.0,219814,48 hr,...,0.244962,1498.9086,0.156689,539.6664,0.088273,959.2422,0.014,bortezomib,-0.187707,0.189248
3,219907,B05,2021_03_03_Batch12,5894,2.5x10^3 cells/well,4.0,WT clone 01,20210205.0,219814,48 hr,...,0.136947,1676.8206,0.144912,849.5298,-0.007965,827.2908,0.113,bortezomib,-0.187707,0.189248
4,219907,B06,2021_03_03_Batch12,1277,2.5x10^3 cells/well,5.0,WT clone 02,20210205.0,219814,48 hr,...,-0.384914,1478.1522,-0.25937,524.8404,-0.125544,953.3118,1.0,bortezomib,-0.187707,0.189248


In [5]:
# Using real predictions
# Keyword options for the un-shuffled run; the positional arguments are the
# results table, the comparison definitions, and a one-element dataset list.
real_pipeline_options = {
    "shuffle": False,
    "signature": False,
    "threshold": threshold,
}

# The returned object is indexed by comparison name when results are saved below
real_metric_results = get_metric_pipeline(
    results_df, metric_comparisons, [dataset], **real_pipeline_options
)

  recall = tps / tps[-1]


In [6]:
# Using shuffled predictions
# One list per comparison name; each list collects one metric table per permutation
all_shuffle_results = {level: [] for level in metric_comparisons}

for permutation_idx in range(num_permutations):
    # Re-seed NumPy's global RNG with the permutation index before each run
    # (presumably the pipeline's shuffle draws from that RNG -- confirm in utils.metrics)
    np.random.seed(permutation_idx)

    shuffle_metric_results = get_metric_pipeline(
        results_df,
        metric_comparisons,
        [dataset],
        shuffle=True,
        signature=False,
        threshold=threshold,
    )

    # Tag every table with the permutation it came from and file it under its comparison
    for level, collected in all_shuffle_results.items():
        collected.append(
            shuffle_metric_results[level].assign(permutation=permutation_idx)
        )

In [7]:
# Write the real and shuffled metric tables, one pair of TSV files per comparison.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists first; otherwise a fresh checkout fails on the first write.
output_dir.mkdir(parents=True, exist_ok=True)

for compare in metric_comparisons:
    full_results_df = real_metric_results[compare]
    # Stack the per-permutation tables into one frame with a clean 0..n-1 index
    shuffle_results_df = pd.concat(all_shuffle_results[compare]).reset_index(drop=True)

    # Metrics computed from the real predictions
    output_file = pathlib.Path(output_dir, f"{compare}_{dataset}_metric_performance.tsv")
    full_results_df.to_csv(output_file, sep="\t", index=False)

    # Metrics computed from the shuffled predictions (all permutations)
    output_file = pathlib.Path(output_dir, f"{compare}_{dataset}_shuffle_metric_performance.tsv")
    shuffle_results_df.to_csv(output_file, sep="\t", index=False)