# STraDA assessment

In [None]:
# Version of the assessed results; used below as the directory prefix of the
# result tables that are loaded.
curr_version = "0.2.0"

In [None]:
def metricsCount(df, dataset, zoom=None):
    """Plot, for one dataset, the number of calls by source and status.

    One facet is drawn per value of the ``type`` column; inside a facet the
    bars are grouped by ``source`` and coloured by ``status``.

    :param df: Table with at least the columns ``dataset``, ``source``,
        ``status`` and ``type``.
    :param dataset: Value of the ``dataset`` column selecting the rows to plot.
    :param zoom: Optional upper limit applied to the y axis of every facet;
        ``None`` keeps the automatic scale.
    """
    # One colour per status, listed in the same order as hue_order
    # (bright variant of the green / orange / red / grey palette).
    hue_palette = sns.color_palette(["#1ac938", "#ff7c00", "#e8000b", "#a3a3a3"])
    hue_order = ["TP", "FN", "FP", "-"]

    g = sns.catplot(
        data=df[df.dataset == dataset],
        kind="count",
        x="source",
        hue="status",
        col="type",
        hue_order=hue_order,
        palette=hue_palette,
    )
    # Leave room above the facets for the figure title.
    g.fig.subplots_adjust(top=0.8)
    g.fig.suptitle('Fusions callers comparison on ' + dataset)
    if zoom is not None:
        # Apply the limit to every facet actually drawn instead of assuming
        # that exactly two of them exist.
        for ax in g.axes.flat:
            ax.set_ylim(0, zoom)
    plt.show()

def perfTable(df):
    """Display the status counts, precision and recall of each source.

    NOTE: a second ``perfTable`` is defined further down in this file and
    rebinds the name, so this version is shadowed once both have been run.

    :param df: Table with at least the columns ``source`` and ``status``.
        Only the statuses "TP", "FP", "FN" and "-" are shown.
    """
    def ratio(numerator, denominator):
        # A source without any TP+FP (or TP+FN) has an undefined metric:
        # show "nan" instead of raising ZeroDivisionError.
        value = numerator / denominator if denominator else float("nan")
        return "{:.4f}".format(value)

    col_order = ['source', 'TP', 'FP', 'FN', '-', 'precision', 'recall']
    rows = []
    for source in sorted(set(df.source)):
        curr_row = {"source": source, "TP": 0, "FP": 0, "FN": 0, "-": 0}
        curr_row.update(df[df.source == source]["status"].value_counts().to_dict())
        curr_row["precision"] = ratio(curr_row["TP"], curr_row["TP"] + curr_row["FP"])
        curr_row["recall"] = ratio(curr_row["TP"], curr_row["TP"] + curr_row["FN"])
        rows.append(curr_row)
    # Passing the columns to the constructor also works when there are no rows.
    display(pd.DataFrame(rows, columns=col_order))

def perfTable(df):
    """Display the performance of each source and their pairwise significance.

    Three tables are displayed:

    1. the status counts, precision and recall of each source;
    2. the p-values of the pairwise comparison of the precisions;
    3. the p-values of the pairwise comparison of the recalls.

    The p-values come from ``scipy.stats.fisher_exact`` (two-sided by default)
    applied on the 2x2 table [[TP_a, TP_b], [FP_a, FP_b]] for precision and
    [[TP_a, TP_b], [FN_a, FN_b]] for recall. The diagonal of the significance
    tables is left empty.

    :param df: Table with at least the columns ``source`` and ``status``.
        Only the statuses "TP", "FP", "FN" and "-" are shown.
    """
    def ratio(numerator, denominator):
        # A source without any TP+FP (or TP+FN) has an undefined metric:
        # show "nan" instead of raising ZeroDivisionError.
        value = numerator / denominator if denominator else float("nan")
        return "{:.4f}".format(value)

    sources = sorted(set(df.source))

    # Performance
    perf_cols = ['source', 'TP', 'FP', 'FN', '-', 'precision', 'recall']
    res_by_src = {}
    for source in sources:
        curr_row = {"source": source, "TP": 0, "FP": 0, "FN": 0, "-": 0}
        curr_row.update(df[df.source == source]["status"].value_counts().to_dict())
        curr_row["precision"] = ratio(curr_row["TP"], curr_row["TP"] + curr_row["FP"])
        curr_row["recall"] = ratio(curr_row["TP"], curr_row["TP"] + curr_row["FN"])
        res_by_src[source] = curr_row
    # Passing the columns to the constructor also works when there are no rows.
    display(pd.DataFrame([res_by_src[src] for src in sources], columns=perf_cols))

    # Significance
    sig_rows = []
    for curr_source in sources:
        curr_res = res_by_src[curr_source]
        sig_row = {"source": curr_source}
        for cmp_source in sources:
            if cmp_source == curr_source:
                prec_pvalue = rec_pvalue = ""
            else:
                cmp_res = res_by_src[cmp_source]
                # The cells of a contingency table must be disjoint counts:
                # the second row holds the failures (FP or FN), not the totals
                # that would count the TP twice.
                _, prec_pvalue = stats.fisher_exact([
                    [curr_res["TP"], cmp_res["TP"]],
                    [curr_res["FP"], cmp_res["FP"]]
                ])
                _, rec_pvalue = stats.fisher_exact([
                    [curr_res["TP"], cmp_res["TP"]],
                    [curr_res["FN"], cmp_res["FN"]]
                ])
            sig_row["(prec) " + cmp_source] = prec_pvalue
            sig_row["(rec) " + cmp_source] = rec_pvalue
        sig_rows.append(sig_row)
    for prefix in ("(prec) ", "(rec) "):
        sig_cols = ["source"] + [prefix + src for src in sources]
        display(pd.DataFrame(sig_rows, columns=sig_cols))

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "whitegrid" style for the plots drawn afterwards.
sns.set_style("whitegrid")

## Load datasets results

In [None]:
# Load the result details of both evaluation levels and tag each table with
# its level so the rows can still be told apart once merged.
df_genes = pd.read_csv(curr_version + "/results_details_genes.tsv", sep="\t")
df_genes["type"] = "genes"
df_breakpoints = pd.read_csv(curr_version + "/results_details_breakpoints.tsv", sep="\t")
df_breakpoints["type"] = "breakpoints"
# ignore_index gives the merged table unique row labels instead of repeating
# the labels of the two input tables.
df = pd.concat([df_genes, df_breakpoints], ignore_index=True)

## 2- Literature dataset

### 2.1- Description

### 2.2- Results

In [None]:
# Value of the "dataset" column selecting the rows used by the next cells.
curr_dataset = "Heyer_2019"

In [None]:
# Full-scale counts first, then the same plot with the y axis capped at 60.
metricsCount(df, curr_dataset)
metricsCount(df, curr_dataset, zoom=60)

In [None]:
# One performance report per evaluation level found in the "type" column.
for data_type in sorted(df["type"].unique()):
    print(data_type.capitalize())
    is_selected = (df["dataset"] == curr_dataset) & (df["type"] == data_type)
    perfTable(df[is_selected])

## 3- Simulated dataset

### 3.1- Description

### 3.2- Results

In [None]:
# Value of the "dataset" column selecting the rows used by the next cells.
curr_dataset = "simulated"

In [None]:
# Full-scale counts first, then the same plot with the y axis capped at 600.
metricsCount(df, curr_dataset)
metricsCount(df, curr_dataset, zoom=600)

In [None]:
# One performance report per evaluation level found in the "type" column.
for data_type in sorted(df["type"].unique()):
    print(data_type.capitalize())
    is_selected = (df["dataset"] == curr_dataset) & (df["type"] == data_type)
    perfTable(df[is_selected])

## 4- Synthetic dataset

### 4.1- Description

### 4.2- Results

In [None]:
# Value of the "dataset" column selecting the rows used by the next cells.
curr_dataset = "Tembe_2014"

In [None]:
# Full-scale counts first, then the same plot with the y axis capped at 400.
metricsCount(df, curr_dataset)
metricsCount(df, curr_dataset, zoom=400)

In [None]:
# One performance report per evaluation level found in the "type" column.
for data_type in sorted(df["type"].unique()):
    print(data_type.capitalize())
    is_selected = (df["dataset"] == curr_dataset) & (df["type"] == data_type)
    perfTable(df[is_selected])