# Sorting Perturbation Experiment

This notebook organizes the analysis of the sorting perturbation experiment.

__Note__: If you are running this notebook in Colab, uncomment and run the following cell to install the project and its dependencies.

In [None]:
# %pip install "git+https://github.com/allenai/open-mds.git"

Run the following cells to import the required packages and load some helper functions.

In [None]:
from pathlib import Path

import pandas as pd
import scipy.stats as stats

from open_mds.common import util

# Significance level: the null hypothesis is rejected when a test's p-value falls below this threshold
THRESHOLD = 0.01

Point the variable `data_dir` to the location of a directory that contains the results of running the [`run_summarization.py`](../scripts/run_summarization.py) script for one or more models.

In [None]:
data_dir = "../output/results/"
# Make sure the directory exists and contains the expected subdirectories.
# The listing uses pathlib instead of a shell `ls` so that it also works on Windows and for paths
# that contain spaces; a missing directory raises FileNotFoundError right here.
sorted(entry.name for entry in Path(data_dir).iterdir())

Then run the following cell to produce the tabulated results and run the significance tests.

In [None]:
# Column-oriented accumulator: one list per column of the final results table.
# Every processed (dataset, model, selection strategy, metric) combination appends one value to each list.
results = {
    "dataset": [],
    "model": [],
    "selection_strategy": [],
    "metric": [],
    "baseline": [],
    "perturbed": [],
    "difference": [],
    "significant": [],
}

# `iterdir()` yields entries in arbitrary order, so sort them to keep the row order of the results
# table reproducible across runs, and skip anything that is not a directory (e.g. stray files).
for subdir in sorted(Path(data_dir).iterdir()):
    if not subdir.is_dir():
        continue

    # Some datasets have blind test splits, and so we evaluate on the validation set
    # HuggingFace assigns a different prefix to the keys in the output json, so set that here
    metric_key_prefix = "eval" if subdir.name in {"ms2", "cochrane"} else "predict"

    # The metrics we want to check for significance
    metric_columns = [
        f"{metric_key_prefix}_rouge_avg_fmeasure",
        f"{metric_key_prefix}_bertscore_f1",
    ]
    # Load the results as dataframes
    baseline_df, perturbed_df = util.load_results_dicts(
        data_dir=subdir,
        metric_columns=metric_columns,
        metric_key_prefix=metric_key_prefix,
        # Only retain data that pertains to the perturbation experiments
        load_retrieval_results=False,
    )

    # We only care about sorting results
    perturbed_df = perturbed_df[perturbed_df[f"{metric_key_prefix}_perturbation"] == "sorting"]

    # Perform the significance test for all models, selection strategies, and metrics
    for model_name_or_path in perturbed_df.model_name_or_path.unique():
        # The per-model selections do not depend on the strategy or the metric, so compute them once
        baseline_model_df = baseline_df[baseline_df.model_name_or_path == model_name_or_path]
        is_model = perturbed_df.model_name_or_path == model_name_or_path

        for selection_strategy in ["random", "oracle"]:
            # Isolate the results from one experiment with a single combined mask. Chaining two boolean
            # selections would index the already-filtered frame with a mask built from the unfiltered
            # one, which makes pandas warn that the boolean key "will be reindexed".
            is_strategy = perturbed_df[f"{metric_key_prefix}_selection_strategy"] == selection_strategy
            experiment_df = perturbed_df[is_model & is_strategy]

            for metric in metric_columns:
                baseline_scores = baseline_model_df[metric]
                perturbed_scores = experiment_df[metric]
                # NOTE(review): the `<metric>_delta` column is presumably produced by
                # `util.load_results_dicts` -- confirm against that helper.
                perturbed_scores_delta = experiment_df[f"{metric}_delta"]

                # Report any significant differences.
                # This is a paired test, so it assumes both score series have the same length and are
                # aligned example-by-example -- TODO confirm that the loader guarantees this.
                _, pvalue = stats.ttest_rel(baseline_scores, perturbed_scores)
                if pvalue < THRESHOLD:
                    print(
                        f"Model {model_name_or_path} with selection strategy {selection_strategy} has a"
                        f" significant difference in {metric} with p-value {pvalue}."
                        f" Baseline: {baseline_scores.mean()}, Perturbed: {perturbed_scores.mean()}"
                    )

                # Collect the results we are interested in (metric name is stored without the split prefix)
                metric_key = metric.removeprefix(f"{metric_key_prefix}_")
                results["dataset"].append(subdir.name)
                results["model"].append(model_name_or_path)
                results["selection_strategy"].append(selection_strategy)
                results["metric"].append(metric_key)
                results["baseline"].append(round(baseline_scores.mean(), 2))
                results["perturbed"].append(round(perturbed_scores.mean(), 2))
                results["difference"].append(round(perturbed_scores_delta.mean(), 2))
                results["significant"].append(pvalue < THRESHOLD)

# Build the table in one go from the accumulated columns
results_df = pd.DataFrame(results)

In [None]:
# Preview the first five rows of the tabulated results
results_df.head(n=5)

You may wish to subset the results dataframe by dataset, for example:

In [None]:
# Keep only the rows whose `dataset` column equals "multinews"
results_df.loc[results_df["dataset"] == "multinews"]