## IOKR as MS Base Scorer

This notebook reproduces the IOKR performance values reported in Table 5 (Section 4.3.2).

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
import itertools as it

from scipy.stats import wilcoxon

sys.path.append(".")
from local_utils import IDIR_CASMI, IDIR_EA

from msmsrt_scorer.experiments.EA_Massbank.plot_and_table_utils import IDIR_METFRAG, _label_p, load_results

In [2]:
# Forwarded as `param_selection_measure=` to the IDIR_EA / IDIR_CASMI directory builders.
param_selection_measure: str = "topk_auc"

# Forwarded as `method=` to every `load_results` call.
eval_method: str = "casmi"

**Load the results:**

In [5]:
# Order-probability functions to load results for. Each entry is paired (via zip) with the
# base directory at the same position in a dataset's `base_dirs` list below.
MAKE_ORDER_PROB = ["sigmoid"]

# Keyword arguments passed unchanged to both result-directory builders.
_shared_idir_kwargs = dict(
    tree_method="random", n_random_trees=32, D_value_method=None, mode="application",
    param_selection_measure=param_selection_measure, margin_type="max", participant="IOKR__696a17f3")

# One entry per dataset:
#   (dataset label, directory builder, builder-specific kwargs, base_dirs,
#    [(ion_mode, max_n_ms2, n_samples), ...])
# The two builders differ only in their score-normalisation keyword.
_dataset_specs = [
    ("EA (Massbank)", IDIR_EA, {"norm_scores": "none"},
     ["../EA_Massbank/results__TFG__platt"],
     [("positive", 100, 100), ("negative", 65, 50)]),
    ("CASMI 2016", IDIR_CASMI, {"norm_order_scores": False},
     ["../CASMI_2016/results__TFG__platt"],
     [("positive", 75, 50), ("negative", 50, 50)]),
]

# Collect one frame per (dataset, function, ionization) and concatenate once at the end,
# instead of re-copying an ever-growing frame with pd.concat on every iteration.
_frames = []
for _dataset_label, _idir_fn, _extra_idir_kwargs, _base_dirs, _ion_settings in _dataset_specs:
    for make_order_prob, base_dir in zip(MAKE_ORDER_PROB, _base_dirs):
        for ion_mode, max_n_ms2, n_samples in _ion_settings:
            _idir = _idir_fn(
                ion_mode=ion_mode, base_dir=base_dir, make_order_prob=make_order_prob,
                **_shared_idir_kwargs, **_extra_idir_kwargs)

            # Only the first element of what `load_results` returns is used here.
            _res = load_results(
                _idir, "MS + RT (our, %s)" % make_order_prob, max_n_ms2, n_samples=n_samples, method=eval_method)[0]
            _res["Dataset"] = _dataset_label
            _res["Ionization"] = ion_mode
            _res["Function"] = make_order_prob

            _frames.append(_res)

res = pd.concat(_frames)

# De-duplicate on every column except the last one, then drop the "sample" column.
# NOTE(review): the last column is presumably "Function" (it is assigned last above), so rows
# that differ only in the order-probability function collapse into one -- confirm that
# `load_results` does not already return a "Function" column in another position.
res = res.drop_duplicates(subset=res.columns[:-1]).drop("sample", axis=1)

# Sanity check on the number of loaded rows. 250 = 100 + 50 + 50 + 50, the sum of `n_samples`
# over the four configurations -- presumably one row per sample and method; TODO confirm.
_n_only_ms = int((res.Method == "Only MS").sum())
_n_ms_rt = int((res.Method != "Only MS").sum())
assert _n_only_ms == 250, "Expected 250 'Only MS' rows, found %d" % _n_only_ms
assert _n_ms_rt == 250, "Expected 250 'MS + RT' rows, found %d" % _n_ms_rt

In [6]:
# Two-stage average: first the mean within each (Method, Dataset, Ionization, Function)
# group, then the mean of those group means per (Method, Function), so every
# dataset/ionization combination carries equal weight. Values are rounded to one decimal
# and the "Function" and "Top-3" columns are left out of the displayed table.
(
    res
    .groupby(["Method", "Dataset", "Ionization", "Function"]).mean()
    .groupby(["Method", "Function"]).mean()
    .round(1)
    .reset_index()
    .drop(columns=["Function", "Top-3"])
)

Unnamed: 0,Method,Top-1,Top-5,Top-10,Top-20
0,"MS + RT (our, sigmoid)",26.8,51.9,62.3,70.3
1,Only MS,25.1,49.5,60.3,67.6
