# Human Evaluation

This notebook organizes the collection of data and the analysis for the human evaluation.

__Note__: If you are running this notebook in Colab, uncomment and run the following cell to install the project and its dependencies.

In [None]:
# %pip install "git+https://github.com/allenai/open-mds.git"

Run the following cells to import the required packages and load some helper functions.

In [None]:
import random
from pathlib import Path

import pandas as pd
import scipy
import scipy.stats

from open_mds.common import util

# Seed Python's global `random` module for reproducibility. The sampling cells further down
# also build their own `random.Random(random_seed)` instances from this same seed, so that
# re-running one of them in isolation reproduces the same draws.
random_seed = 42
random.seed(random_seed)

## Setup & Data Collection

These are the parameters of the human evaluation. Please provide the directory containing the raw experimental results (`data_dir`), along with the model (`model_name_or_path`), `retriever`, and `top_k_strategy` to evaluate. You should also select the number of examples (`num_samples`) to draw for human annotation.

The following cells will collect the data for human evaluation and save it to disk.

In [None]:
# Parameters of the human evaluation: which model / retriever / top-k strategy to compare
# against the baseline, and how many examples to hand to the annotators
model_name_or_path = "allenai/PRIMERA-multinews"
retriever = "sparse"
top_k_strategy = "mean"
num_samples = 50

# Input: directory holding the raw experimental results
data_dir = "../output/results/multinews/"

# Output: everything produced for (and collected from) the human evaluation lives here
output_dir = "../output/human_eval/"
output_root = Path(output_dir)
output_root.mkdir(parents=True, exist_ok=True)

examples_path = output_root / "examples.tsv"
model_key_path = output_root / "model_key.tsv"
annotations_path = output_root / "annotations"

First, load the raw data and subset it according to the parameters provided above.

In [None]:
# Some datasets have blind test splits, and so we evaluate on the validation set
# HuggingFace assigns a different prefix to the keys in the output json, so set that here
metric_key_prefix = "eval" if Path(data_dir).name in {"ms2", "cochrane"} else "predict"

# The metrics we want to record results for
metric_columns = [
    f"{metric_key_prefix}_rouge_avg_fmeasure",
    f"{metric_key_prefix}_bertscore_f1",
]

# Load the results as dataframes
baseline_df, retrieval_df = util.load_results_dicts(
    data_dir=data_dir,
    metric_columns=metric_columns,
    metric_key_prefix=metric_key_prefix,
    # Only retain data that pertains to the retrieval experiments
    load_perturbation_results=False,
    load_training_results=False,
)

# Filter to just the data we want to compare
baseline_df = baseline_df[baseline_df.model_name_or_path == model_name_or_path]
retrieval_df = retrieval_df.loc[
    (retrieval_df.model_name_or_path == model_name_or_path)
    & (retrieval_df[f"{metric_key_prefix}_retriever"] == retriever)
    & (retrieval_df[f"{metric_key_prefix}_top_k_strategy"] == top_k_strategy)
]

# Fail early with an actionable message if the filters matched nothing (e.g. a typo in one of the
# parameters). Otherwise the sanity checks below pass vacuously on two empty dataframes and the
# problem only surfaces later as an opaque sampling error.
if baseline_df.empty or retrieval_df.empty:
    raise ValueError(
        f"No results found in '{data_dir}' for model_name_or_path='{model_name_or_path}', "
        f"retriever='{retriever}' and top_k_strategy='{top_k_strategy}' "
        f"({len(baseline_df)} baseline rows, {len(retrieval_df)} retrieval rows)."
    )

# Reset the index so that it is numbered from 0...N like the baseline dataframe
retrieval_df = retrieval_df.reset_index()

# Sanity check that baseline and retrieval dfs have the same index and correspond to the same examples
assert baseline_df.index.equals(retrieval_df.index), "Baseline and retrieval results are not index-aligned"
assert baseline_df[f"{metric_key_prefix}_labels"].equals(
    retrieval_df[f"{metric_key_prefix}_labels"]
), "Baseline and retrieval results do not refer to the same examples"

Then, randomly sample the examples for the human evaluation

In [None]:
# A dedicated, freshly seeded generator makes this cell idempotent: re-running it
# always draws the same examples
rng = random.Random(random_seed)
candidate_ids = retrieval_df.index.tolist()
example_ids = rng.sample(candidate_ids, num_samples)

# Pull the same rows out of both dataframes
sampled_baseline = baseline_df.loc[example_ids]
sampled_retrieval = retrieval_df.loc[example_ids]

# Sanity check that the sampled instances correspond to the same examples
labels_column = f"{metric_key_prefix}_labels"
sampled_baseline_labels = sampled_baseline[labels_column].tolist()
sampled_retrieval_labels = sampled_retrieval[labels_column].tolist()
assert sampled_baseline_labels == sampled_retrieval_labels

Finally, collect the data we need for human evaluation in the correct format and save it to disk

In [None]:
# Column names under which the reference summaries and the model outputs are stored
labels_column = f"{metric_key_prefix}_labels"
preds_column = f"{metric_key_prefix}_preds"

# Pull out the reference summaries and both models' predictions for the sampled examples,
# passing every text through `util.sanitize_text` on the way
target_summaries = list(map(util.sanitize_text, sampled_baseline[labels_column].tolist()))
baseline_summaries = list(map(util.sanitize_text, sampled_baseline[preds_column].tolist()))
retrieval_summaries = list(map(util.sanitize_text, sampled_retrieval[preds_column].tolist()))

In [None]:
# Ensure that repeated runs of this cell produce the same results
rng = random.Random(random_seed)
# For every example, decide at random whether the baseline summary is presented first
model_orders = rng.choices([True, False], k=num_samples)

# Write both files as UTF-8 explicitly: the platform default encoding is locale dependent and may
# not be able to represent every character in the summaries. The model key is read back with
# pandas in the analysis section, which assumes UTF-8 by default.
with open(examples_path, "w", encoding="utf-8") as examples:
    with open(model_key_path, "w", encoding="utf-8") as model_key:

        # Write the headers
        examples.write("example_id\ttarget_summary\tmodel_1_summary\tmodel_2_summary\n")
        model_key.write("example_id\tmodel_1\tmodel_2\n")

        for example_id, target_summary, baseline_summary, retrieval_summary, baseline_first in zip(
            example_ids, target_summaries, baseline_summaries, retrieval_summaries, model_orders
        ):
            # Randomly order the model summaries; the model key records which model ended up in
            # which column so that the annotators' votes can be resolved during the analysis
            if baseline_first:
                ordered_summaries = (baseline_summary, retrieval_summary)
                ordered_models = ("baseline", "retrieval")
            else:
                ordered_summaries = (retrieval_summary, baseline_summary)
                ordered_models = ("retrieval", "baseline")

            examples.write(f"{example_id}\t{target_summary}\t{ordered_summaries[0]}\t{ordered_summaries[1]}\n")
            model_key.write(f"{example_id}\t{ordered_models[0]}\t{ordered_models[1]}\n")

## Analysis

The following cells will load the human annotations and perform the analysis. The `tsv` files containing the annotations from each human annotator should be in `output_dir / "annotations"`.

In [None]:
# Load the model key
model_key_df = pd.read_csv(model_key_path, sep="\t")

# Load the human annotations, one tsv file per annotator. The files are visited in sorted order
# so that the row order of the combined dataframe does not depend on the filesystem.
annotations_path = Path(output_dir) / "annotations"
annotator_files = sorted(annotations_path.glob("*.tsv"))
if not annotator_files:
    # Without this check, pd.concat fails with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(f"No annotation files (*.tsv) found in '{annotations_path}'")

annotator_dfs = []
for annotator_file in annotator_files:
    # The first three lines are instructions, and the last column is annotator comments, skip
    annotator_dfs.append(pd.read_csv(annotator_file, sep="\t", skiprows=3, usecols=list(range(6))))
annotator_df = pd.concat(annotator_dfs)

In [None]:
BASELINE = "baseline"
# The three possible votes for each facet
A_VOTE = "A"
B_VOTE = "B"
NEITHER = "Neither"
# The two facets we're evaluating
FACETS = ["coverage", "informativeness"]

for facet in FACETS:

    baseline_votes, total_votes = 0, 0

    # `row_idx` is the row label within the annotator's own file; it is used below to look up the
    # matching row of the model key.
    # NOTE(review): this assumes every annotation file lists the examples in the same order as the
    # model key -- confirm against the annotation template.
    for i, (row_idx, ann) in enumerate(annotator_df[facet].items()):

        # Make sure differences aren't due to whitespace or case. Going through str() means that a
        # missing annotation (NaN) is reported by the check below instead of failing on `.strip()`
        ann = str(ann).strip().title()

        # Unlikely, but catch unknown annotations
        if ann not in (A_VOTE, B_VOTE, NEITHER):
            raise ValueError(f'Unknown annotation ("{ann}") in row {i} of the human evaluation results')

        # Skip ties
        if ann == NEITHER:
            continue

        # Collect the votes for the baseline, resolving which model is the baseline based on the model key
        col_name = "model_1" if ann == A_VOTE else "model_2"
        baseline_votes += int(model_key_df.loc[row_idx, col_name].strip().lower() == BASELINE)
        total_votes += 1

    print(f'Results for facet: "{facet}"')
    if total_votes == 0:
        # The binomial test is undefined without at least one trial
        print("No votes other than ties were cast, skipping the binomial test")
    else:
        # Two-sided test of whether the baseline is preferred more or less often than chance
        print(scipy.stats.binomtest(baseline_votes, total_votes, p=0.5, alternative="two-sided"))
    print()