## Influence of the Candidate Sets

Reproduce the result tables for the experiments comparing the different candidate set definitions (see Section 4.3.3 and Table 6).

In [11]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

from msmsrt_scorer.experiments.CASMI_2016.plot_and_table_utils import IDIR, IDIR_METFRAG, _label_p, load_results

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Forwarded to every IDIR(...) call in this notebook as "param_selection_measure".
param_selection_measure = "topk_auc"
# Forwarded to every load_results(...) call in this notebook as "method".
eval_method = "casmi"

### MS Base Scorer: MetFrag

**Using all candidates:**

In [16]:
# Input directories.
# IDIR settings shared by both ionization modes; the two calls below differ only in
# "ion_mode". No "participant" is passed here, so IDIR falls back to its default.
_idir_kwargs = dict(
    tree_method="random", n_random_trees=32, D_value_method=None,
    base_dir="results__TFG__platt", mode="application",
    param_selection_measure=param_selection_measure, make_order_prob="sigmoid",
    norm_order_scores=False, margin_type="max", restrict_candidates_to_correct_mf=None,
)

# Positive ionization mode.
# NOTE(review): the third positional argument of load_results (75 here, 50 for the
# negative mode) is not explained in this notebook -- confirm its meaning in plot_and_table_utils.
_idir = IDIR(ion_mode="positive", **_idir_kwargs)
_results_pos = load_results(_idir, "MS + RT (our)", 75, n_samples=50, method=eval_method)[0]
_results_pos["Ionization"] = "positive"

# Negative ionization mode.
_idir = IDIR(ion_mode="negative", **_idir_kwargs)
_results_neg = load_results(_idir, "MS + RT (our)", 50, n_samples=50, method=eval_method)[0]
_results_neg["Ionization"] = "negative"

# Stack both modes into one frame; the original row indices of each part are kept.
results = pd.concat([_results_pos, _results_neg])

P_TEST = "wilcoxon_oneside"

# Column name -> k that is handed to _label_p for that column.
_topk_columns = {"Top-1": 1, "Top-5": 5, "Top-10": 10, "Top-20": 20}

# One row per "Method"; every Top-k cell is whatever _label_p returns for that group's
# column values. "k=k" freezes the current k for each lambda (avoids late binding).
(
    results
    .drop(columns=["sample"])
    .groupby("Method")
    .agg({
        col: (lambda x, k=k: _label_p(x, k, P_TEST, results))
        for col, k in _topk_columns.items()
    })
    .reset_index()
)

Unnamed: 0,Method,Top-1,Top-5,Top-10,Top-20
0,MS + RT (our),14.4 (***),43.4 (***),54.6 (***),65.9 (***)
1,Only MS,11.4,40.7,51.2,63.2


**Using only candidates with correct molecular formula:**

In [17]:
# Input directories.
# IDIR settings shared by both ionization modes; the two calls below differ only in
# "ion_mode". Here restrict_candidates_to_correct_mf=True, and no "participant" is
# passed, so IDIR falls back to its default.
_idir_kwargs = dict(
    tree_method="random", n_random_trees=32, D_value_method=None,
    base_dir="results__TFG__platt", mode="application",
    param_selection_measure=param_selection_measure, make_order_prob="sigmoid",
    norm_order_scores=False, margin_type="max", restrict_candidates_to_correct_mf=True,
)

# Positive ionization mode.
# NOTE(review): the third positional argument of load_results (75 here, 50 for the
# negative mode) is not explained in this notebook -- confirm its meaning in plot_and_table_utils.
_idir = IDIR(ion_mode="positive", **_idir_kwargs)
_results_pos = load_results(_idir, "MS + RT (our)", 75, n_samples=50, method=eval_method)[0]
_results_pos["Ionization"] = "positive"

# Negative ionization mode.
_idir = IDIR(ion_mode="negative", **_idir_kwargs)
_results_neg = load_results(_idir, "MS + RT (our)", 50, n_samples=50, method=eval_method)[0]
_results_neg["Ionization"] = "negative"

# Stack both modes into one frame; the original row indices of each part are kept.
results = pd.concat([_results_pos, _results_neg])

P_TEST = "wilcoxon_oneside"

# Column name -> k that is handed to _label_p for that column.
_topk_columns = {"Top-1": 1, "Top-5": 5, "Top-10": 10, "Top-20": 20}

# One row per "Method"; every Top-k cell is whatever _label_p returns for that group's
# column values. "k=k" freezes the current k for each lambda (avoids late binding).
(
    results
    .drop(columns=["sample"])
    .groupby("Method")
    .agg({
        col: (lambda x, k=k: _label_p(x, k, P_TEST, results))
        for col, k in _topk_columns.items()
    })
    .reset_index()
)

Unnamed: 0,Method,Top-1,Top-5,Top-10,Top-20
0,MS + RT (our),17.5 (***),48.3 (***),59.7 (***),70.5 (***)
1,Only MS,13.1,46.0,56.9,68.7


### MS Base Scorer: IOKR

**Using all candidates:**

In [19]:
# Input directories.
# IDIR settings shared by both ionization modes; the two calls below differ only in
# "ion_mode". The IOKR results are selected via participant="IOKR__696a17f3".
_idir_kwargs = dict(
    tree_method="random", n_random_trees=32, D_value_method=None,
    base_dir="results__TFG__platt", mode="application",
    param_selection_measure=param_selection_measure, make_order_prob="sigmoid",
    norm_order_scores=False, margin_type="max", restrict_candidates_to_correct_mf=None,
    participant="IOKR__696a17f3",
)

# Positive ionization mode.
# NOTE(review): the third positional argument of load_results (75 here, 50 for the
# negative mode) is not explained in this notebook -- confirm its meaning in plot_and_table_utils.
_idir = IDIR(ion_mode="positive", **_idir_kwargs)
_results_pos = load_results(_idir, "MS + RT (our)", 75, n_samples=50, method=eval_method)[0]
_results_pos["Ionization"] = "positive"

# Negative ionization mode.
_idir = IDIR(ion_mode="negative", **_idir_kwargs)
_results_neg = load_results(_idir, "MS + RT (our)", 50, n_samples=50, method=eval_method)[0]
_results_neg["Ionization"] = "negative"

# Stack both modes into one frame; the original row indices of each part are kept.
results = pd.concat([_results_pos, _results_neg])

P_TEST = "wilcoxon_oneside"

# Column name -> k that is handed to _label_p for that column.
_topk_columns = {"Top-1": 1, "Top-5": 5, "Top-10": 10, "Top-20": 20}

# One row per "Method"; every Top-k cell is whatever _label_p returns for that group's
# column values. "k=k" freezes the current k for each lambda (avoids late binding).
(
    results
    .drop(columns=["sample"])
    .groupby("Method")
    .agg({
        col: (lambda x, k=k: _label_p(x, k, P_TEST, results))
        for col, k in _topk_columns.items()
    })
    .reset_index()
)

Unnamed: 0,Method,Top-1,Top-5,Top-10,Top-20
0,MS + RT (our),26.1 (***),48.3 (***),59.9 (***),69.1 (***)
1,Only MS,24.4,46.0,58.4,65.5


**Using only candidates with correct molecular formula:**

In [21]:
# Input directories.
# IDIR settings shared by both ionization modes; the two calls below differ only in
# "ion_mode". The IOKR results are selected via participant="IOKR__696a17f3", and
# restrict_candidates_to_correct_mf=True is set.
_idir_kwargs = dict(
    tree_method="random", n_random_trees=32, D_value_method=None,
    base_dir="results__TFG__platt", mode="application",
    param_selection_measure=param_selection_measure, make_order_prob="sigmoid",
    norm_order_scores=False, margin_type="max", restrict_candidates_to_correct_mf=True,
    participant="IOKR__696a17f3",
)

# Positive ionization mode.
# NOTE(review): the third positional argument of load_results (75 here, 50 for the
# negative mode) is not explained in this notebook -- confirm its meaning in plot_and_table_utils.
_idir = IDIR(ion_mode="positive", **_idir_kwargs)
_results_pos = load_results(_idir, "MS + RT (our)", 75, n_samples=50, method=eval_method)[0]
_results_pos["Ionization"] = "positive"

# Negative ionization mode.
_idir = IDIR(ion_mode="negative", **_idir_kwargs)
_results_neg = load_results(_idir, "MS + RT (our)", 50, n_samples=50, method=eval_method)[0]
_results_neg["Ionization"] = "negative"

# Stack both modes into one frame; the original row indices of each part are kept.
results = pd.concat([_results_pos, _results_neg])

P_TEST = "wilcoxon_oneside"

# Column name -> k that is handed to _label_p for that column.
_topk_columns = {"Top-1": 1, "Top-5": 5, "Top-10": 10, "Top-20": 20}

# One row per "Method"; every Top-k cell is whatever _label_p returns for that group's
# column values. "k=k" freezes the current k for each lambda (avoids late binding).
(
    results
    .drop(columns=["sample"])
    .groupby("Method")
    .agg({
        col: (lambda x, k=k: _label_p(x, k, P_TEST, results))
        for col, k in _topk_columns.items()
    })
    .reset_index()
)

Unnamed: 0,Method,Top-1,Top-5,Top-10,Top-20
0,MS + RT (our),30.2,52.3,66.2 (***),74.7
1,Only MS,30.6,53.9,65.3,74.8
