# MInITI assessment

In [None]:
import os
import sys
import statistics
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
sns.set_style("whitegrid")

In [None]:
# Preprocess

def getLoci(dataset_df):
    """
    Return the sorted names of the loci described in the datasets table.

    A locus is detected from each column named "train_nb_<locus>_stable".
    The sample-level column "train_nb_spl_stable" is ignored.
    """
    locus_pattern = re.compile("^train_nb_(.+)_stable$")
    hits = (
        locus_pattern.match(title)
        for title in dataset_df.columns
        if title != "train_nb_spl_stable"
    )
    return sorted(hit.group(1) for hit in hits if hit is not None)


def getPredStatusEval(row, locus):
    """
    Compare the observed and expected status of one element in one result row.

    Returns "Undetermined" when the observed or the expected status is
    "Undetermined", "right" when both are equal and "wrong" otherwise.
    """
    observed = row["{}_observed_status".format(locus)]
    if observed == "Undetermined":
        return "Undetermined"
    expected = row["{}_expected_status".format(locus)]
    if expected == "Undetermined":
        return "Undetermined"
    return "right" if expected == observed else "wrong"

In [None]:
# Accuracy

def getAccuracyDf(loci, results_df):
    """
    Summarize the predictions by dataset, configuration and locus.

    Only the rows where the expected status of the locus is not
    "Undetermined" are counted.

    :param loci: Names of the evaluated elements (loci and/or "spl").
    :param results_df: Results table with the columns dataset_id, config,
        classifier, min_support, padding, stitching, duplicates and, for each
        element, <element>_expected_status and <element>_observed_status.
    :return: One row by (dataset, config, locus) with the configuration
        parameters, the accuracy (None when no prediction is determined) and
        the number of right, wrong and missing predictions.
    """
    accuracy_rows = []
    datasets_ids = list(set(results_df["dataset_id"]))
    configurations = sorted(set(results_df["config"]))
    for dataset_id in datasets_ids:
        for config in configurations:
            # Select sub-dataframe for dataset and configuration
            dataset = results_df[
                (results_df["dataset_id"] == dataset_id) &
                (results_df["config"] == config)
            ]
            if dataset.empty:
                # This configuration has not been run on this dataset: there
                # is nothing to count and no parameter to report.
                continue
            # Get right, wrong and Undetermined counts
            ct_status_by_locus = {
                locus: {"right": 0, "wrong": 0, "Undetermined": 0} for locus in loci
            }
            for idx, row in dataset.iterrows():
                for locus in loci:
                    if row["{}_expected_status".format(locus)] != "Undetermined":
                        status = getPredStatusEval(row, locus)
                        ct_status_by_locus[locus][status] += 1
            # The parameters are read from the last row of the selection
            last_row = dataset.iloc[-1]
            # Summarize by locus
            for locus in loci:
                ct_status = ct_status_by_locus[locus]
                nb_determined = ct_status["right"] + ct_status["wrong"]
                accuracy_rows.append([
                    dataset_id,
                    locus,
                    config,
                    last_row["classifier"],
                    last_row["min_support"],
                    last_row["padding"],
                    last_row["stitching"],
                    last_row["duplicates"],
                    (None if nb_determined == 0 else ct_status["right"] / nb_determined),
                    ct_status["right"],
                    ct_status["wrong"],
                    ct_status["Undetermined"]
                ])
    accuracy_df = pd.DataFrame.from_records(
        accuracy_rows,
        columns=[
            "dataset_id", "locus",
            "config", "classifier", "min_support", "padding", "stitching", "duplicates",
            "accuracy", "nb_right_prediction", "nb_wrong_prediction", "nb_without_prediction"
        ]
    )
    return accuracy_df

In [None]:
# Consensus method

def getMajority(status):
    """
    Return the status predicted by the strict majority of the votes.

    Each vote must be "MSI", "MSS" or "Undetermined". The result is
    "Undetermined" when "MSI" and "MSS" have the same number of votes.
    """
    tally = {"MSI": 0, "MSS": 0, "Undetermined": 0}
    for vote in status:
        tally[vote] += 1
    balance = tally["MSI"] - tally["MSS"]
    if balance > 0:
        return "MSI"
    if balance < 0:
        return "MSS"
    return "Undetermined"


def getAgreement(status):
    """
    Return the shared status when all the votes are identical.

    The result is "Undetermined" when the votes differ or when there is no
    vote.
    """
    votes = list(status)
    if len(set(votes)) != 1:
        return "Undetermined"
    return votes[0]


def getMethodsConsensusDf(res_df, loci, used_clf=None, consensus_method="majority"):
    """
    Build the result rows of a consensus classifier from several classifiers.

    The rows of res_df are grouped by dataset, library and configuration
    without its first comma-separated field. A consensus row is produced as
    soon as a group has received as many rows as there are used classifiers.
    Groups that never reach this number produce nothing.

    :param res_df: Results table with the columns dataset_id, lib_name,
        spl_name, config, classifier, padding, min_support, stitching,
        duplicates and, for "spl" and each locus, <element>_expected_status
        and <element>_observed_status.
    :param loci: Names of the loci for which a consensus is computed, in
        addition to the sample ("spl").
    :param used_clf: Classifiers combined in the consensus. By default all the
        classifiers found in res_df.
    :param consensus_method: "majority" uses getMajority, any other value uses
        getAgreement. The value is also used as the classifier name of the
        produced rows.
    :return: DataFrame with one row by completed group.
    """
    if used_clf is None:
        used_clf = sorted(list(set(res_df["classifier"])))
    nb_used_clf = len(used_clf)
    consensus_data = {}  # Groups still waiting for rows, by group ID
    consensus_rows = []
    for idx_row, curr_row in res_df.iterrows():
        # The group ID drops the first field of config (everything before the
        # first comma). It is computed for every row, so each config must
        # contain a comma.
        res_id = "{}_{}_{}".format(curr_row["dataset_id"], curr_row["lib_name"], curr_row["config"].split(",", 1)[1])
        if curr_row["classifier"] in used_clf:
            if res_id not in consensus_data:
                # First row of the group: store the expected status and start
                # the list of observed status
                consensus_data[res_id] = {
                    "dataset_id": curr_row["dataset_id"],
                    "lib_name": curr_row["lib_name"],
                    "spl_name": curr_row["spl_name"]
                }
                for elt in ["spl"] + loci:
                    consensus_data[res_id][elt + "_expected_status"] = curr_row[elt + "_expected_status"]
                    consensus_data[res_id][elt + "_observed_status"] = [curr_row[elt + "_observed_status"]]
            else:
                for elt in ["spl"] + loci:
                    consensus_data[res_id][elt + "_observed_status"].append(curr_row[elt + "_observed_status"])
            # The group is complete when it holds one row by used classifier
            if len(consensus_data[res_id]["spl_observed_status"]) == nb_used_clf:
                cons_row = {k: v for k, v in consensus_data[res_id].items()}
                cons_row["classifier"] = consensus_method
                # The parameters are copied from the row that completes the group
                cons_row["padding"] = curr_row["padding"]
                cons_row["min_support"] = curr_row["min_support"]
                cons_row["stitching"] = curr_row["stitching"]
                cons_row["duplicates"] = curr_row["duplicates"]
                cons_row["config"] = "clf={},{}".format(consensus_method, curr_row["config"].split(",", 1)[1])
                for elt in ["spl"] + loci:
                    # Replace the list of observed status by their consensus
                    if consensus_method == "majority":
                        cons_row[elt + "_observed_status"] = getMajority(cons_row[elt + "_observed_status"])
                    else:
                        cons_row[elt + "_observed_status"] = getAgreement(cons_row[elt + "_observed_status"])
                    # The consensus has no score and no support
                    cons_row[elt + "_pred_score"] = None
                    cons_row[elt + "_pred_is_ok"] = None
                    if elt != "spl":
                        cons_row[elt + "_pred_support"] = None
                for elt in ["spl"] + loci:
                    cons_row[elt + "_pred_is_ok"] = getPredStatusEval(cons_row, elt)
                consensus_rows.append(cons_row)
                # Forget the group: later rows with the same ID start a new one
                del(consensus_data[res_id])
    return pd.DataFrame(consensus_rows)

In [None]:
# Prediction score

def getLociDf(loci, results_df):
    """
    Return one row by (result row, locus) with the prediction score and the
    evaluation of the prediction.

    The loci with an "Undetermined" expected status are skipped. The
    prediction status is the value returned by getPredStatusEval followed by
    "_classif".
    """
    out_columns = ["dataset_id", "locus", "config", "classifier", "prediction_score", "prediction_status"]
    records = []
    for _, res in results_df.iterrows():
        for locus in loci:
            if res["{}_expected_status".format(locus)] == "Undetermined":
                continue
            records.append([
                res["dataset_id"],
                locus,
                res["config"],
                res["classifier"],
                res["{}_pred_score".format(locus)],
                getPredStatusEval(res, locus) + "_classif"
            ])
    return pd.DataFrame.from_records(records, columns=out_columns)


def writeScorePredStatus(loci, results_df, nb_col=1, subplots_adjust=0.9):
    """
    Plot, for each locus, the prediction scores of the right and wrong
    predictions by classifier.

    The undetermined predictions and the rows of the consensus classifier
    (named by the global retained["consensus"]["method"]) are excluded.

    :param loci: Names of the plotted elements (one panel by element).
    :param results_df: Results table accepted by getLociDf.
    :param nb_col: Number of panels by row.
    :param subplots_adjust: Top of the panels area, to leave room for the title.
    """
    scores_df = getLociDf(loci, results_df)
    is_determined = scores_df["prediction_status"] != "Undetermined_classif"
    scores_df = scores_df[is_determined]
    is_scored_clf = scores_df["classifier"] != retained["consensus"]["method"]
    scores_df = scores_df[is_scored_clf]
    classifiers_order = sorted(set(scores_df["classifier"]))
    grid = sns.catplot(
        data=scores_df,
        kind="box",
        x="classifier",
        order=classifiers_order,
        y="prediction_score",
        hue="prediction_status",
        hue_order=["right_classif", "wrong_classif"],
        col="locus",
        col_wrap=nb_col,
        medianprops={"linewidth": 2, "color": 'firebrick'}
    )
    for panel in grid.axes.flat:
        panel.set(xlabel='Classifiers', ylabel='Prediction score')
        panel.tick_params(axis='x', rotation=90)
    plt.subplots_adjust(top=subplots_adjust, hspace=0.2)
    figure = plt.gcf()
    figure.suptitle("Confidence scores evaluation")
    plt.show()

In [None]:
# Prediction status

def writePredStatus(accuracy_df, nb_col=1, subplots_adjust=0.9):
    """
    Plot, for each locus, the percentage of wrong and missing predictions by
    configuration.

    Each row of accuracy_df gives one point by status. The rows without any
    counted prediction are skipped.

    :param accuracy_df: Table with the columns dataset_id, locus, config,
        nb_right_prediction, nb_wrong_prediction and nb_without_prediction.
    :param nb_col: Number of panels by row.
    :param subplots_adjust: Top of the panels area, to leave room for the title.
    """
    count_columns = ["nb_right_prediction", "nb_wrong_prediction", "nb_without_prediction"]
    # Agglomerate dataset info
    status_rows = []
    for idx, row in accuracy_df.iterrows():
        nb_evaluated = sum(row[column] for column in count_columns)
        if nb_evaluated == 0:
            continue
        for status in count_columns:
            # "nb_right_prediction" becomes "right_classif", etc.
            prediction_status = status.split("_")[1] + "_classif"
            status_rows.append([
                row["dataset_id"],
                row["locus"],
                row["config"],
                prediction_status,
                row[status] * 100 / nb_evaluated
            ])
    status_df = pd.DataFrame.from_records(status_rows, columns=["dataset_id", "locus", "config", "prediction_status", "% of samples"])
    # Plot status: only the failures (wrong or missing prediction) are shown
    prediction_status_order = ["wrong_classif", "without_classif"]
    status_df = status_df[status_df["prediction_status"] != "right_classif"]
    sns.catplot(
        y="% of samples",
        x="prediction_status",
        hue="config",
        col="locus",
        col_wrap=nb_col,
        data=status_df,
        kind="box",
        order=prediction_status_order,
        medianprops=dict(linewidth=2, color='firebrick')
    )
    plt.subplots_adjust(top=subplots_adjust, hspace=0.2)
    plt.gcf().suptitle("Classification accuracy")
    plt.show()

## 1. Load data

In [None]:
# Classifiers and parameter values used to filter the tables of the next
# sections.
retained = {
    # Classifier names kept in the release summary tables
    "classifiers": {"MSIsensor-pro_pro", "mSINGSUp", "agreement", "LogisticRegression"},
    "consensus": {
        # Classifiers combined to build the consensus rows
        "components": ["MSIsensor-pro_pro", "LogisticRegression"],
        # Consensus rule, also used as classifier name of the consensus rows
        "method": "agreement"
    },
    # Values compared with the result columns of the same name
    "duplicates": "with",
    "min_support": 150,
    "padding": 2,
    "stitching": "without"
}

In [None]:
# Init data: load the datasets description and the classification results
# (tab-separated files read from the working directory)
dataset_df = pd.read_csv("mmr_v1_datasets.tsv", sep='\t')
results_df = pd.read_csv("mmr_v1_results.tsv", sep='\t')
# Alternative input files (disabled)
#dataset_df = pd.read_csv("solid_tumor_v5.1_datasets.tsv", sep='\t')
#results_df = pd.read_csv("solid_tumor_v5.1_results.tsv", sep='\t')
# Loci names are deduced from the columns of the datasets description
loci = getLoci(dataset_df)

In [None]:
# Add the sample name and the evaluation of each prediction to the results
# The sample name is the part of the library name before the first "_"
results_df["spl_name"] = results_df["lib_name"].apply(lambda name: name.split("_")[0])
# Evaluation of the prediction for each locus, then for the whole sample
for locus in loci:
    results_df["{}_pred_is_ok".format(locus)] = results_df.apply(getPredStatusEval, axis=1, args=(locus,))
results_df["spl_pred_is_ok"] = results_df.apply(getPredStatusEval, axis=1, args=("spl",))

In [None]:
# Add consensus classifier: the combined classifiers and the consensus rule
# come from retained["consensus"]
consensus_df = getMethodsConsensusDf(results_df, loci, retained["consensus"]["components"], retained["consensus"]["method"])
# NOTE: ignore_index is not set, so the consensus rows keep their own index
# labels and the labels of the merged table are not unique
results_df = pd.concat([results_df, consensus_df], sort=False)

## 2. Results from standard datasets

### 2.1. Datasets

In [None]:
def datasetsComposition(dataset_df, mode="rate"):
    """
    Plot, by locus, the distribution of the unstable elements in the train and
    test parts of the datasets.

    The loci are detected from the columns named "train_nb_<locus>_unstable"
    and are plotted in alphabetical order. A dataset part without stable nor
    unstable element for a locus is skipped for this locus.

    :param dataset_df: Datasets table with, for each locus, the columns
        <train|test>_nb_<locus>_stable and <train|test>_nb_<locus>_unstable.
    :param mode: "rate" plots the ratio unstable / (stable + unstable), any
        other value plots the number of unstable.
    """
    # Sorted to get the same order of boxes at each run (a set has no order)
    loci = sorted({
        title.replace("train_nb_", "").replace("_unstable", "")
        for title in dataset_df.columns
        if title.startswith("train_nb_") and title.endswith("_unstable")
    })
    # Datasets descriptions
    desc_rows = []
    for idx, row in dataset_df.iterrows():
        for dataset_type in ["train", "test"]:
            for locus in loci:
                nb_unstable = row["{}_nb_{}_unstable".format(dataset_type, locus)]
                nb_stable = row["{}_nb_{}_stable".format(dataset_type, locus)]
                nb_determined = nb_unstable + nb_stable
                if nb_determined != 0:
                    desc_rows.append([
                        dataset_type,
                        locus,
                        nb_unstable / nb_determined,
                        nb_stable,
                        nb_unstable
                    ])
    desc_df = pd.DataFrame.from_records(desc_rows, columns=["dataset_type", "locus", "unstable_ratio", "nb_stable", "nb_unstable"])
    # Plot: the two modes only differ by the plotted column and the title
    if mode == "rate":
        y_column = "unstable_ratio"
        title_tpl = "Rate of unstable ({} datasets)"
    else:
        y_column = "nb_unstable"
        title_tpl = "Number of unstable ({} datasets)"
    g = sns.boxplot(x="locus", y=y_column, hue="dataset_type", data=desc_df, medianprops=dict(linewidth=2, color='firebrick'))
    plt.subplots_adjust(top=0.95)
    locs, labels = plt.xticks()
    g.set_xticklabels(labels, rotation=90)
    plt.gcf().suptitle(title_tpl.format(len(dataset_df)))
    plt.show()

# First the ratio of unstable by locus, then the number of unstable (any mode
# other than "rate" plots the number)
datasetsComposition(dataset_df, "rate")
datasetsComposition(dataset_df, "count")

### 2.2. Loci

In [None]:
# Accuracy by dataset, configuration and locus (consensus rows included)
loci_acc_df = getAccuracyDf(loci, results_df)

#### 2.2.a. Accuracies

##### Find best parameters

In [None]:
# For each configuration: sum of the accuracies and sum of the ranks obtained
# in each dataset (rank 0 is the best mean accuracy of the dataset)
# NOTE(review): a missing accuracy (locus without determined prediction) is
# added as-is in the sum and in the mean - confirm this is intended
by_config = {}
for curr_dataset in sorted(set(loci_acc_df["dataset_id"])):
    acc_dataset_df = loci_acc_df[loci_acc_df["dataset_id"] == curr_dataset]
    acc_by_config = {}
    for idx, row in acc_dataset_df.iterrows():
        if row["config"] not in by_config:
            by_config[row["config"]] = {
                "accuracy_sum": 0,
                "classifier": row["classifier"],
                "padding": row["padding"],
                "min_support": row["min_support"],
                "duplicates": row["duplicates"],
                "stitching": row["stitching"],
                "config": row["config"],
                "rank_sum": 0,
                #"rank_median": 0
            }
        if row["config"] not in acc_by_config:
            acc_by_config[row["config"]] = []
        acc_by_config[row["config"]].append(row["accuracy"])  # List because there is one accuracy by locus in the dataset for the config
        by_config[row["config"]]["accuracy_sum"] += row["accuracy"]
    # Manage rank by config
    for cfg, accuracies in acc_by_config.items():
        acc_by_config[cfg] = np.mean(accuracies)
    sorted_acc = sorted(acc_by_config.items(), key=lambda elt: elt[1], reverse=True)
    rank = 0
    prev_acc = sorted_acc[0][1]
    for cfg, acc in sorted_acc:
        if prev_acc != acc:
            rank += 1
            # Compare the next config with this one, not with the best one:
            # configs with the same mean accuracy must share the same rank
            prev_acc = acc
        by_config[cfg]["rank_sum"] += rank

acc_sum_df = pd.DataFrame(list(by_config.values()))

In [None]:
# Relation between the two scores of the configurations
sns.regplot(x="accuracy_sum", y="rank_sum", data=acc_sum_df)
plt.show()

In [None]:
# Show the 100 configurations with the highest sum of accuracies
# (display is not imported here: presumably the notebook builtin)
pd.set_option('display.max_rows', None)
display(acc_sum_df.sort_values("accuracy_sum", ascending=False).head(100))
# Back to a short display: the limit is set to 10, not to its previous value
pd.set_option('display.max_rows', 10)

In [None]:
# Sum of accuracies by configuration (one horizontal bar by configuration)
g = sns.barplot(data=acc_sum_df, y="config", x="accuracy_sum")
plt.gcf().set_size_inches(6, 8)  # Default [6, 4]
plt.gcf().set_dpi(500)  # Default 72
plt.show()

In [None]:
# Show the 100 configurations with the lowest sum of ranks
# (display is not imported here: presumably the notebook builtin)
pd.set_option('display.max_rows', None)
display(acc_sum_df.sort_values("rank_sum", ascending=True).head(100))
# Back to a short display: the limit is set to 10, not to its previous value
pd.set_option('display.max_rows', 10)

In [None]:
# Sum of ranks by configuration (one horizontal bar by configuration)
g = sns.barplot(data=acc_sum_df, y="config", x="rank_sum")
plt.gcf().set_size_inches(6, 8)  # Default [6, 4]
plt.gcf().set_dpi(500)  # Default 72
plt.show()

##### Results

In [None]:
# Keep the accuracies obtained with the retained parameters, then plot the
# failures by locus
retained_mask = (
    loci_acc_df["duplicates"].eq(retained["duplicates"])
    & loci_acc_df["min_support"].eq(retained["min_support"])
    & loci_acc_df["padding"].eq(retained["padding"])
    & loci_acc_df["stitching"].eq(retained["stitching"])
)
filtered_df = loci_acc_df.loc[retained_mask]
writePredStatus(filtered_df, nb_col=3, subplots_adjust=0.85)

#### 2.2.b. Prediction score

In [None]:
# Plot the prediction scores by locus for the results obtained with the
# retained parameters
retained_mask = (
    results_df["duplicates"].eq(retained["duplicates"])
    & results_df["min_support"].eq(retained["min_support"])
    & results_df["padding"].eq(retained["padding"])
    & results_df["stitching"].eq(retained["stitching"])
)
writeScorePredStatus(loci, results_df[retained_mask], nb_col=3, subplots_adjust=0.92)

#### 2.2.c. Hard loci

In [None]:
# Results of the consensus classifier obtained with the retained parameters
retained_mask = (
    results_df["classifier"].eq(retained["consensus"]["method"])
    & results_df["duplicates"].eq(retained["duplicates"])
    & results_df["min_support"].eq(retained["min_support"])
    & results_df["padding"].eq(retained["padding"])
    & results_df["stitching"].eq(retained["stitching"])
)
retained_agreement_df = results_df[retained_mask]

In [None]:
# For each locus: show the libraries wrongly predicted by the consensus in at
# least half of their determined evaluations
for locus in loci:
    # One entry by library with at least one determined evaluation for the locus
    stat_by_lib = {lib: {"library": lib, "datasets": [], "expected": None,  "right_pred": 0, "wrong_pred": 0} for lib in set(retained_agreement_df[retained_agreement_df[locus + "_pred_is_ok"] != "Undetermined"]["lib_name"])}
    for row_idx, row in retained_agreement_df.iterrows():
        pred_status = row[locus + "_pred_is_ok"]
        if pred_status != "Undetermined":
            stat_by_lib[row["lib_name"]]["expected"] = row[locus + "_expected_status"]
            # pred_status is "right" or "wrong": increments right_pred or wrong_pred
            stat_by_lib[row["lib_name"]][pred_status + "_pred"] += 1
            stat_by_lib[row["lib_name"]]["datasets"].append(row["dataset_id"])
    locus_hard_df = pd.DataFrame(stat_by_lib.values())
    # The denominator is at least 1: every library kept above has been counted
    locus_hard_df["wrong_rate"] = locus_hard_df.apply(lambda row: row["wrong_pred"] / (row["right_pred"] + row["wrong_pred"]), axis=1)

    print(locus)
    # display is not imported here: presumably the notebook builtin
    pd.set_option('display.max_rows', None)
    display(locus_hard_df[locus_hard_df["wrong_rate"] >= 0.5])
    pd.set_option('display.max_rows', 10)

### 2.3. Sample

In [None]:
# Accuracy by dataset and configuration at sample level ("spl" is evaluated
# like a locus)
spl_acc_df = getAccuracyDf(["spl"], results_df)

#### 2.3.a. Accuracies

In [None]:
# For each configuration: sum of the sample accuracies and sum of the ranks
# obtained in each dataset (rank 0 is the best accuracy of the dataset)
# NOTE(review): a missing accuracy (dataset without determined prediction) is
# added as-is in the sum and in the mean - confirm this is intended
by_config = {}
for curr_dataset in sorted(set(spl_acc_df["dataset_id"])):
    acc_dataset_df = spl_acc_df[spl_acc_df["dataset_id"] == curr_dataset]
    acc_by_config = {}
    for idx, row in acc_dataset_df.iterrows():
        if row["config"] not in by_config:
            by_config[row["config"]] = {
                "accuracy_sum": 0,
                "classifier": row["classifier"],
                "padding": row["padding"],
                "min_support": row["min_support"],
                "duplicates": row["duplicates"],
                "stitching": row["stitching"],
                "config": row["config"],
                "rank_sum": 0,
                #"rank_median": 0
            }
        if row["config"] not in acc_by_config:
            acc_by_config[row["config"]] = []
        acc_by_config[row["config"]].append(row["accuracy"])
        by_config[row["config"]]["accuracy_sum"] += row["accuracy"]
    # Manage rank by config
    for cfg, accuracies in acc_by_config.items():
        acc_by_config[cfg] = np.mean(accuracies)
    sorted_acc = sorted(acc_by_config.items(), key=lambda elt: elt[1], reverse=True)
    rank = 0
    prev_acc = sorted_acc[0][1]
    for cfg, acc in sorted_acc:
        if prev_acc != acc:
            rank += 1
            # Compare the next config with this one, not with the best one:
            # configs with the same accuracy must share the same rank
            prev_acc = acc
        by_config[cfg]["rank_sum"] += rank

acc_sum_df = pd.DataFrame(list(by_config.values()))

In [None]:
# Relation between the two scores of the configurations (sample level)
sns.regplot(x="accuracy_sum", y="rank_sum", data=acc_sum_df)
plt.show()

In [None]:
# Show the 100 configurations with the highest sum of sample accuracies
# (display is not imported here: presumably the notebook builtin)
pd.set_option('display.max_rows', None)
display(acc_sum_df.sort_values("accuracy_sum", ascending=False).head(100))
# Back to a short display: the limit is set to 10, not to its previous value
pd.set_option('display.max_rows', 10)

In [None]:
# Keep the sample accuracies obtained with the retained parameters, then plot
# the failures
retained_mask = (
    spl_acc_df["duplicates"].eq(retained["duplicates"])
    & spl_acc_df["min_support"].eq(retained["min_support"])
    & spl_acc_df["padding"].eq(retained["padding"])
    & spl_acc_df["stitching"].eq(retained["stitching"])
)
filtered_df = spl_acc_df.loc[retained_mask]
writePredStatus(filtered_df, nb_col=1, subplots_adjust=0.85)

#### 2.3.b. Prediction score

In [None]:
# Plot the sample prediction scores for the results obtained with the retained
# parameters
retained_mask = (
    results_df["duplicates"].eq(retained["duplicates"])
    & results_df["min_support"].eq(retained["min_support"])
    & results_df["padding"].eq(retained["padding"])
    & results_df["stitching"].eq(retained["stitching"])
)
writeScorePredStatus(["spl"], results_df[retained_mask], nb_col=3, subplots_adjust=0.92)

#### 2.3.c. Hard samples

In [None]:
# Results of the consensus classifier obtained with the retained parameters
retained_mask = (
    results_df["classifier"].eq(retained["consensus"]["method"])
    & results_df["duplicates"].eq(retained["duplicates"])
    & results_df["min_support"].eq(retained["min_support"])
    & results_df["padding"].eq(retained["padding"])
    & results_df["stitching"].eq(retained["stitching"])
)
retained_agreement_df = results_df[retained_mask]
# Samples for which the consensus differs from the expected status
is_wrong_spl = retained_agreement_df["spl_pred_is_ok"].eq("wrong")
wrong_spl_df = retained_agreement_df[is_wrong_spl]

In [None]:
# Show all the wrong sample predictions of the consensus
# (display is not imported here: presumably the notebook builtin)
pd.set_option('display.max_rows', None)
display(wrong_spl_df)
# Back to a short display: the limit is set to 10, not to its previous value
pd.set_option('display.max_rows', 10)

In [None]:
# Number of consensus rows (right, wrong or undetermined) for the libraries
# with at least one wrong sample prediction
len(retained_agreement_df[retained_agreement_df["lib_name"].isin(wrong_spl_df["lib_name"])])

## 3. Results from balanced datasets

### 3.1. Datasets

In [None]:
def getBalancedDf(locus, results_df, random_seed):
    """
    Return the results restricted, in each dataset, to as many stable as
    unstable samples for the selected element.

    In each dataset the samples expected "MSS" and the samples expected "MSI"
    are randomly reduced to the size of the smallest of the two groups. All
    the result rows of the selected samples are kept.

    :param locus: Element (locus or "spl") used to balance the samples.
    :param results_df: Results table with the columns dataset_id, spl_name and
        <locus>_expected_status.
    :param random_seed: Seed of the random generator, set again for each
        dataset.
    :return: Balanced results with a new index starting from 0. Empty table
        with the same columns when results_df has no row.
    """
    status_column = "{}_expected_status".format(locus)
    balanced_parts = []
    # Sorted to get the same order of rows at each run (a set has no order)
    for curr_dataset in sorted(set(results_df["dataset_id"])):
        curr_dataset_df = results_df[results_df["dataset_id"] == curr_dataset]
        expected_status = curr_dataset_df[status_column]
        stable_ids = sorted(set(curr_dataset_df[expected_status == "MSS"]["spl_name"]))
        unstable_ids = sorted(set(curr_dataset_df[expected_status == "MSI"]["spl_name"]))
        sampling_size = min(len(stable_ids), len(unstable_ids))
        random.seed(random_seed)
        selected_spl = random.sample(stable_ids, sampling_size) + random.sample(unstable_ids, sampling_size)
        balanced_parts.append(
            curr_dataset_df[curr_dataset_df["spl_name"].isin(selected_spl)]
        )
    if not balanced_parts:
        return pd.DataFrame(columns=results_df.columns)
    # One concatenation at the end: DataFrame.append no longer exists in
    # pandas 2 and copied the whole table at each dataset
    return pd.concat(balanced_parts, sort=False, ignore_index=True)

# NOTE(review): `locus` is not set in this cell. It is the value left by the
# last `for locus in ...` loop executed before, so the balancing depends on
# the cells run earlier - confirm which element (a locus or "spl") is expected
balanced_results_df = getBalancedDf(locus, results_df, 0)

In [None]:
# Consensus results of the balanced datasets obtained with the retained
# parameters
retained_agreement_df = balanced_results_df[
    (balanced_results_df.classifier == retained["consensus"]["method"]) &
    (balanced_results_df["duplicates"] == retained["duplicates"]) &
    (balanced_results_df["min_support"] == retained["min_support"]) &
    (balanced_results_df["padding"] == retained["padding"]) &
    (balanced_results_df["stitching"] == retained["stitching"])
]
# Number of stable and unstable by locus: one value by dataset and status.
# The selection is not named dataset_df to keep the datasets description
# loaded at the beginning of the notebook.
desc_rows = []
for dataset_id in set(retained_agreement_df["dataset_id"]):
    curr_dataset_df = retained_agreement_df[retained_agreement_df["dataset_id"] == dataset_id]
    for locus in loci:
        expected_status = curr_dataset_df[locus + "_expected_status"]
        for status, expected_value in [("stable", "MSS"), ("unstable", "MSI")]:
            desc_rows.append({
                "dataset": dataset_id,
                "locus": locus,
                "status": status,
                "count": len(curr_dataset_df[expected_status == expected_value])
            })
desc_df = pd.DataFrame(desc_rows)
g = sns.boxplot(x="locus", y="count", hue="status", data=desc_df, medianprops=dict(linewidth=2, color='firebrick'))
locs, labels = plt.xticks()
g.set_xticklabels(labels, rotation=90)
plt.show()

### 3.2. Loci

In [None]:
# Accuracy by dataset, configuration and locus on the balanced results
loci_acc_balanced_df = getAccuracyDf(loci, balanced_results_df)

#### 3.2.a. Accuracies

In [None]:
# For each configuration: sum of the accuracies and sum of the ranks obtained
# in each balanced dataset (rank 0 is the best mean accuracy of the dataset)
# NOTE(review): a missing accuracy (locus without determined prediction) is
# added as-is in the sum and in the mean - confirm this is intended
by_config = {}
for curr_dataset in sorted(set(loci_acc_balanced_df["dataset_id"])):
    acc_dataset_df = loci_acc_balanced_df[loci_acc_balanced_df["dataset_id"] == curr_dataset]
    acc_by_config = {}
    for idx, row in acc_dataset_df.iterrows():
        if row["config"] not in by_config:
            by_config[row["config"]] = {
                "accuracy_sum": 0,
                "classifier": row["classifier"],
                "padding": row["padding"],
                "min_support": row["min_support"],
                "duplicates": row["duplicates"],
                "stitching": row["stitching"],
                "config": row["config"],
                "rank_sum": 0,
                "rank_median": 0
            }
        if row["config"] not in acc_by_config:
            acc_by_config[row["config"]] = []
        acc_by_config[row["config"]].append(row["accuracy"])  # List because there is one accuracy by locus in the dataset for the config
        by_config[row["config"]]["accuracy_sum"] += row["accuracy"]
    # Manage rank by config
    for cfg, accuracies in acc_by_config.items():
        acc_by_config[cfg] = np.mean(accuracies)
    sorted_acc = sorted(acc_by_config.items(), key=lambda elt: elt[1], reverse=True)
    rank = 0
    prev_acc = sorted_acc[0][1]
    for cfg, acc in sorted_acc:
        if prev_acc != acc:
            rank += 1
            # Compare the next config with this one, not with the best one:
            # configs with the same mean accuracy must share the same rank
            prev_acc = acc
        by_config[cfg]["rank_sum"] += rank

acc_sum_df = pd.DataFrame(list(by_config.values()))

In [None]:
# Relation between the two scores of the configurations (balanced datasets)
sns.regplot(x="accuracy_sum", y="rank_sum", data=acc_sum_df)
plt.show()

In [None]:
#pd.set_option('display.max_rows', None)
#display(acc_sum_df.sort_values("accuracy_sum", ascending=False).head(100))
#pd.set_option('display.max_rows', 10)

In [None]:
# Sum of accuracies by configuration on the balanced datasets
g = sns.barplot(data=acc_sum_df, y="config", x="accuracy_sum")
plt.gcf().set_size_inches(6, 8)  # Default [6, 4]
plt.gcf().set_dpi(500)  # Default 72
plt.show()

In [None]:
#pd.set_option('display.max_rows', None)
#display(acc_sum_df.sort_values("rank_sum", ascending=True).head(100))
#pd.set_option('display.max_rows', 10)

In [None]:
# Sum of ranks by configuration on the balanced datasets
g = sns.barplot(data=acc_sum_df, y="config", x="rank_sum")
plt.gcf().set_size_inches(6, 8)  # Default [6, 4]
plt.gcf().set_dpi(500)  # Default 72
plt.show()

In [None]:
# Keep the balanced accuracies obtained with the retained parameters, then
# plot the failures by locus
retained_mask = (
    loci_acc_balanced_df["duplicates"].eq(retained["duplicates"])
    & loci_acc_balanced_df["min_support"].eq(retained["min_support"])
    & loci_acc_balanced_df["padding"].eq(retained["padding"])
    & loci_acc_balanced_df["stitching"].eq(retained["stitching"])
)
filtered_df = loci_acc_balanced_df.loc[retained_mask]
writePredStatus(filtered_df, nb_col=3, subplots_adjust=0.85)

#### 3.2.b. Prediction score

In [None]:
# Plot the prediction scores by locus for the balanced results obtained with
# the retained parameters
retained_mask = (
    balanced_results_df["duplicates"].eq(retained["duplicates"])
    & balanced_results_df["min_support"].eq(retained["min_support"])
    & balanced_results_df["padding"].eq(retained["padding"])
    & balanced_results_df["stitching"].eq(retained["stitching"])
)
writeScorePredStatus(loci, balanced_results_df[retained_mask], nb_col=3, subplots_adjust=0.92)

## 4. Release summary

### 4.1. Loci

In [None]:
# Accuracies by locus restricted to the retained classifiers and parameters
loci_acc_df = getAccuracyDf(loci, results_df)
retained_mask = (
    loci_acc_df["classifier"].isin(retained["classifiers"])
    & loci_acc_df["duplicates"].eq(retained["duplicates"])
    & loci_acc_df["min_support"].eq(retained["min_support"])
    & loci_acc_df["padding"].eq(retained["padding"])
    & loci_acc_df["stitching"].eq(retained["stitching"])
)
filtered_df = loci_acc_df.loc[retained_mask]

In [None]:
# Total number of right, wrong and missing predictions by configuration
# (all datasets and loci together). Only the count columns are summed, so the
# grouping column is not processed by the aggregation.
count_names = {
    "nb_right_prediction": "nb_true",
    "nb_wrong_prediction": "nb_false",
    "nb_without_prediction": "nb_undetermined"
}
loci_acc_cmp_df = filtered_df.groupby("config")[list(count_names)].sum()
loci_acc_cmp_df = loci_acc_cmp_df.rename(columns=count_names)
loci_acc_cmp_df = loci_acc_cmp_df.reset_index()

In [None]:
# Configurations with the most wrong predictions first
loci_acc_cmp_df.sort_values("nb_false", ascending=False)

### 4.2. Sample

In [None]:
# Sample accuracies restricted to the retained classifiers and parameters
spl_acc_df = getAccuracyDf(["spl"], results_df)
retained_mask = (
    spl_acc_df["classifier"].isin(retained["classifiers"])
    & spl_acc_df["duplicates"].eq(retained["duplicates"])
    & spl_acc_df["min_support"].eq(retained["min_support"])
    & spl_acc_df["padding"].eq(retained["padding"])
    & spl_acc_df["stitching"].eq(retained["stitching"])
)
filtered_df = spl_acc_df.loc[retained_mask]

In [None]:
# Total number of right, wrong and missing sample predictions by
# configuration (all datasets together). Only the count columns are summed, so
# the grouping column is not processed by the aggregation.
count_names = {
    "nb_right_prediction": "nb_true",
    "nb_wrong_prediction": "nb_false",
    "nb_without_prediction": "nb_undetermined"
}
spl_acc_cmp_df = filtered_df.groupby("config")[list(count_names)].sum()
spl_acc_cmp_df = spl_acc_cmp_df.rename(columns=count_names)
spl_acc_cmp_df = spl_acc_cmp_df.reset_index()

In [None]:
# Configurations with the most wrong sample predictions first
spl_acc_cmp_df.sort_values("nb_false", ascending=False)