# Summarization Baselines

This notebook computes various simple summarization baselines for the datasets we investigated in the paper.

__Note__: If you are running this notebook in Colab, uncomment and run the following cell to install the project and its dependencies.

In [None]:
# %pip install "git+https://github.com/allenai/open-mds.git"

Run the following cells to import the required packages and load some helper functions.

In [None]:
import random
from typing import List, Optional, Tuple

import nltk
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

from open_mds import metrics
from open_mds.common import util

In [None]:
def summary_baselines(
    dataset, summary_column: str, strategy: str = "random", rng_state: int = 42
) -> Tuple[List[str], List[str]]:
    """For each example, selects the reference summary of a *different* example to act as the generated summary.

    Supported strategies:

    - "random": the summary is drawn uniformly at random from the reference summaries of all other examples.
    - "length-matched": the summary of the other example whose length (in NLTK word tokens) is closest to the
      length of the example's own reference summary is selected. Ties are broken in favour of the example that
      appears first in the dataset, so this strategy is deterministic.

    Args:
        dataset: Object indexable by column name (e.g. a HuggingFace `Dataset`), where `dataset[summary_column]`
            yields one reference summary per example.
        summary_column: Name of the column that holds the reference summaries.
        strategy: Either "random" or "length-matched".
        rng_state: Seed passed to `random.seed` (this reseeds the global `random` module state).

    Returns:
        A tuple `(baseline_summaries, reference_summaries)` with one entry per example.

    Raises:
        ValueError: If `strategy` is not supported, or if the dataset contains exactly one example (in which case
            there is no other example to borrow a summary from).
    """
    # Validate up front so that a typo fails immediately rather than after the (slow) progress bar has started.
    if strategy not in ("random", "length-matched"):
        raise ValueError(f"Invalid strategy: {strategy}")

    random.seed(rng_state)

    # Read the column exactly once: indexing a dataset by column name may materialize the whole column, so doing
    # it inside the loop makes every iteration pay for a full column read.
    summaries = list(dataset[summary_column])
    num_summaries = len(summaries)
    if num_summaries == 1:
        raise ValueError("At least two examples are required to select a summary from a different example.")

    # Only tokenize if the strategy actually needs the lengths
    summary_lens = None
    if strategy == "length-matched":
        summary_lens = [len(nltk.word_tokenize(summary)) for summary in summaries]

    baseline_summaries = []
    for i in tqdm(range(num_summaries), desc=f"Summary baseline (strategy={strategy})", total=num_summaries):
        if strategy == "random":
            baseline_summaries.append(random.choice(summaries[:i] + summaries[i + 1 :]))
        else:
            curr_summary_len = summary_lens[i]
            # min() returns the first index that attains the smallest length difference, skipping the example itself
            idx = min(
                (j for j in range(num_summaries) if j != i),
                key=lambda j: abs(summary_lens[j] - curr_summary_len),
            )
            baseline_summaries.append(summaries[idx])

    return baseline_summaries, summaries


def _first_sentence(text: str) -> str:
    """Returns the first sentence of `text` as segmented by NLTK, or an empty string if there is none."""
    # The tokenizer can return no sentences at all (e.g. for empty input), so never index blindly.
    sentences = nltk.sent_tokenize(text) if text else []
    return sentences[0] if sentences else ""


def document_baselines(
    dataset,
    text_column: str,
    summary_column: str,
    strategy: str = "random",
    lead_only: bool = False,
    doc_sep_token: Optional[str] = None,
    rng_state: int = 42,
) -> Tuple[List[str], List[str]]:
    """For each example, builds a generated summary out of the example's own input documents.

    Supported strategies:

    - "random": a single input document is selected at random.
    - "oracle": the single input document with the greatest ROUGE-1 F1 score against the reference summary is
      selected.
    - "all": every input document is used; the documents are joined with ". ".

    If `lead_only` is True, only the first sentence of each selected document is kept.

    Args:
        dataset: Object indexable by column name (e.g. a HuggingFace `Dataset`).
        text_column: Name of the column that holds the input documents. For nested columns, provide
            "column+key" and the value stored under `key` of each row of `column` is used.
        summary_column: Name of the column that holds the reference summaries.
        strategy: One of "random", "oracle" or "all".
        lead_only: Whether to keep only the first sentence of each selected document.
        doc_sep_token: If provided, each example's documents are a single string that is split on this token.
        rng_state: Seed passed to `random.seed` (this reseeds the global `random` module state).

    Returns:
        A tuple `(baseline_summaries, reference_summaries)` with one entry per example.

    Raises:
        ValueError: If `strategy` is not supported.
    """
    # Validate up front so that a typo fails immediately rather than on the first loop iteration.
    if strategy not in ("random", "oracle", "all"):
        raise ValueError(f"Invalid strategy: {strategy}")

    random.seed(rng_state)
    baseline_summaries = []
    reference_summaries = dataset[summary_column]

    # Some datasets, like Multi-XScience, have a nested structure. Handle that here by allowing user to provide
    # both column names as "column_1+column_2". Only the first "+" separates the column from the nested key.
    document_key = None
    if "+" in text_column:
        text_column, document_key = text_column.split("+", 1)

    for documents, reference in tqdm(
        zip(dataset[text_column], reference_summaries),
        desc=f"Document baseline (strategy={strategy}, lead_only={lead_only})",
        total=len(dataset),
    ):
        # Resolve the nested key before splitting: the separator can only be applied to the text itself, not to
        # the container that holds it.
        if document_key is not None:
            documents = documents[document_key]
        if doc_sep_token is not None:
            documents = documents.split(doc_sep_token)

        if strategy == "random":
            baseline_summary = random.choice(documents)
        elif strategy == "oracle":
            # Score every document against the same reference and keep the best one
            scores = metrics.compute_rouge(
                predictions=documents, references=[reference] * len(documents), rouge_types=["rouge1"]
            )
            best_idx = np.argmax(scores["rouge1"]["fmeasure"])
            baseline_summary = documents[best_idx]
        else:
            baseline_summary = documents[:]

        if lead_only:
            if isinstance(baseline_summary, list):
                baseline_summary = [_first_sentence(document) for document in baseline_summary]
            else:
                baseline_summary = _first_sentence(baseline_summary)

        if isinstance(baseline_summary, list):
            baseline_summary = ". ".join(baseline_summary)

        baseline_summaries.append(baseline_summary.strip())

    return baseline_summaries, reference_summaries


def select_column(dataset, summary_column: str, column: str) -> Tuple[List[str], List[str]]:
    """For each example, selects the text at `column` from the `dataset` to act as the generated summary.

    Args:
        dataset: Object indexable by column name (e.g. a HuggingFace `Dataset`).
        summary_column: Name of the column that holds the reference summaries.
        column: Name of the column whose content is used as the generated summary. If the first example of this
            column is a list, every example is joined into a single string with ". ".

    Returns:
        A tuple `(column_text, reference_summaries)` with one entry per example.
    """
    # Read the column once instead of once for the type check and once more for the values
    column_values = dataset[column]

    # User could provide a column name that maps to a list, so handle that here. The length check keeps an empty
    # column from raising an IndexError when peeking at the first example.
    if len(column_values) > 0 and isinstance(column_values[0], list):
        column_text = [". ".join(example) for example in column_values]
    else:
        column_text = column_values

    return column_text, dataset[summary_column]

Then run the baselines for each dataset. Note that each dataset may take tens of minutes.

In [None]:
# Multi-News (test split): every baseline is scored with ROUGE and its mean average F-measure is printed.
dataset = load_dataset("multi_news", split="test")

# Baseline that borrows the length-matched reference summary of another example
predictions, references = summary_baselines(dataset, summary_column="summary", strategy="length-matched")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Random Summary:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baselines built from the input documents, run in order: oracle document, oracle lead, all lead
for baseline_label, doc_strategy, use_lead in (
    ("Oracle Document:", "oracle", False),
    ("Oracle Lead:", "oracle", True),
    ("All Lead:", "all", True),
):
    predictions, references = document_baselines(
        dataset,
        text_column="document",
        summary_column="summary",
        strategy=doc_strategy,
        lead_only=use_lead,
        doc_sep_token=util.DOC_SEP_TOKENS["multi_news"],
    )
    rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
    print(baseline_label, round(rouge_results["rouge_avg_fmeasure_mean"], 1))

In [None]:
# WCEP-10 (test split, "list" configuration): every baseline is scored with ROUGE and its mean average
# F-measure is printed.
dataset = load_dataset("ccdv/WCEP-10", "list", split="test")

# Baseline that borrows the length-matched reference summary of another example
predictions, references = summary_baselines(dataset, summary_column="summary", strategy="length-matched")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Random Summary:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baselines built from the input documents, run in order: oracle document, oracle lead, all lead
for baseline_label, doc_strategy, use_lead in (
    ("Oracle Document:", "oracle", False),
    ("Oracle Lead:", "oracle", True),
    ("All Lead:", "all", True),
):
    predictions, references = document_baselines(
        dataset,
        text_column="document",
        summary_column="summary",
        strategy=doc_strategy,
        lead_only=use_lead,
    )
    rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
    print(baseline_label, round(rouge_results["rouge_avg_fmeasure_mean"], 1))

In [None]:
# Multi-XScience (test split): every baseline is scored with ROUGE and its mean average F-measure is printed.
dataset = load_dataset("multi_x_science_sum", split="test")

# Baseline that borrows the length-matched reference summary of another example
predictions, references = summary_baselines(dataset, summary_column="related_work", strategy="length-matched")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Random Summary:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baselines built from the input documents, run in order: oracle document, oracle lead, all lead.
# The documents live under the "abstract" key of the nested "ref_abstract" column.
for baseline_label, doc_strategy, use_lead in (
    ("Oracle Document:", "oracle", False),
    ("Oracle Lead:", "oracle", True),
    ("All Lead:", "all", True),
):
    predictions, references = document_baselines(
        dataset,
        text_column="ref_abstract+abstract",
        summary_column="related_work",
        strategy=doc_strategy,
        lead_only=use_lead,
    )
    rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
    print(baseline_label, round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baseline that uses the "abstract" column as the generated summary
predictions, references = select_column(dataset, summary_column="related_work", column="abstract")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Abstract:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

In [None]:
# MS2 (validation split): every baseline is scored with ROUGE and its mean average F-measure is printed.
dataset = load_dataset("allenai/mslr2022", "ms2", split="validation")

# Baseline that borrows the length-matched reference summary of another example
predictions, references = summary_baselines(dataset, summary_column="target", strategy="length-matched")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Random Summary:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baselines built from the input documents, run in order: oracle document, oracle lead, all lead.
# Note, for MS2 and Cochrane we treat the title as the lead, rather than the first sentence of the abstract,
# which is why the lead baselines read the "title" column with lead_only=False.
for baseline_label, source_column, doc_strategy in (
    ("Oracle Document:", "abstract", "oracle"),
    ("Oracle Lead:", "title", "oracle"),
    ("All Lead:", "title", "all"),
):
    predictions, references = document_baselines(
        dataset,
        text_column=source_column,
        summary_column="target",
        strategy=doc_strategy,
        lead_only=False,
    )
    rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
    print(baseline_label, round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baseline that uses the "background" column as the generated summary
predictions, references = select_column(dataset, summary_column="target", column="background")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Background:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

In [None]:
# Cochrane (validation split): every baseline is scored with ROUGE and its mean average F-measure is printed.
dataset = load_dataset("allenai/mslr2022", "cochrane", split="validation")

# Baseline that borrows the length-matched reference summary of another example
predictions, references = summary_baselines(dataset, summary_column="target", strategy="length-matched")
rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
print("Random Summary:", round(rouge_results["rouge_avg_fmeasure_mean"], 1))

# Baselines built from the input documents, run in order: oracle document, oracle lead, all lead.
# Note, for MS2 and Cochrane we treat the title as the lead, rather than the first sentence of the abstract,
# which is why the lead baselines read the "title" column with lead_only=False.
for baseline_label, source_column, doc_strategy in (
    ("Oracle Document:", "abstract", "oracle"),
    ("Oracle Lead:", "title", "oracle"),
    ("All Lead:", "title", "all"),
):
    predictions, references = document_baselines(
        dataset,
        text_column=source_column,
        summary_column="target",
        strategy=doc_strategy,
        lead_only=False,
    )
    rouge_results = metrics.compute_rouge(predictions=predictions, references=references)
    print(baseline_label, round(rouge_results["rouge_avg_fmeasure_mean"], 1))