In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the file's path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install the necessary packages
%pip install rouge_score==0.1.2 datasets transformers evaluate nltk torchmetrics --quiet

# Import the necessary libraries
import torch
import gc
import pandas as pd
import nltk
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, GPT2LMHeadModel, GPT2Tokenizer
from torchmetrics.functional.text.rouge import rouge_score

# Load the CNN/Daily Mail dataset (version 3.0.0), caching it under ./datasets
full_dataset = load_dataset("cnn_dailymail", version="3.0.0", cache_dir="./datasets")

# Number of articles to evaluate on
sample_size = 100

# Keep only the training articles that mention "CNN" within their first 25 characters
cnn_articles = full_dataset["train"].filter(lambda r: "CNN" in r["article"][:25])

# Shuffle with a fixed seed so the selection is reproducible, then take the first `sample_size` rows
sample = cnn_articles.shuffle(seed=42).select(range(sample_size))

# Function to create batches from a list
def batch_generator(data: list, batch_size: int):
    """
    Generate consecutive batches from a list of data.

    Args:
        data (list): The input data to create batches from. Only ``len()`` and
            slicing are used on it.
        batch_size (int): The desired batch size. Must be a positive integer.

    Yields:
        list: A slice of ``data`` holding at most ``batch_size`` items. Only the
        final batch can be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error is raised when iteration starts, not when the
            function is called.
    """
    # A batch size of zero would never advance through the data (an endless
    # stream of empty batches), and a negative one would slice nonsensically.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Slicing clamps the end index, so the last batch is simply shorter.
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]

# Function to compute ROUGE scores
def compute_rouge_score(generated: list, reference: list) -> dict:
    """
    Score generated summaries against their references with ROUGE.

    Each text is stripped, split into sentences with NLTK, and re-joined with
    one sentence per line before being handed to torchmetrics' ``rouge_score``.

    Args:
        generated (list): Generated summaries.
        reference (list): Reference summaries, passed as the targets for
            ``generated``.

    Returns:
        dict: Whatever ``rouge_score`` returns for the two lists, using its
        default settings.
    """

    def _one_sentence_per_line(texts):
        # NOTE(review): presumably the newline separation is meant for the
        # summary-level ROUGE-Lsum variant - confirm against torchmetrics.
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    predictions = _one_sentence_per_line(generated)
    targets = _one_sentence_per_line(reference)
    return rouge_score(predictions, targets)

# Function to summarize using T5 model
def summarize_with_t5(model_checkpoint: str, articles: list, batch_size: int = 8) -> list:
    """
    Produce one summary per article with a T5 checkpoint.

    Every article is prefixed with "summarize: ", tokenized (truncated to 1024
    tokens), and summarized with 2-beam search capped at a generation
    ``max_length`` of 40.

    Args:
        model_checkpoint (str): Name or path of the pre-trained T5 checkpoint.
        articles (list): Articles to summarize.
        batch_size (int, optional): Number of articles per inference batch.
            Defaults to 8.

    Returns:
        list: Decoded summaries, in the same order as ``articles``.
    """
    # Run on the first GPU when one is available, otherwise on the CPU
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Model and tokenizer share the ./datasets cache directory
    model = T5ForConditionalGeneration.from_pretrained(
        model_checkpoint, cache_dir="./datasets"
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint, model_max_length=1024, cache_dir="./datasets"
    )

    def generate_summaries(prompts: list) -> list:
        # Tokenize the whole batch at once, padding to the longest prompt
        encoded = tokenizer(
            prompts, max_length=1024, return_tensors="pt", padding=True, truncation=True
        )
        input_ids = encoded.input_ids.to(device)
        attention_mask = encoded.attention_mask.to(device)

        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            num_beams=2,
            min_length=0,
            max_length=40,
        )
        # Turn the generated token ids back into text without special tokens
        return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    # Add the "summarize: " task prefix to every article
    prompts = ["summarize: " + article for article in articles]

    summaries = []
    for chunk in batch_generator(prompts, batch_size=batch_size):
        summaries.extend(generate_summaries(chunk))

    # Free cached GPU memory, then drop the model/tokenizer and free again
    torch.cuda.empty_cache()
    gc.collect()

    del tokenizer
    del model
    torch.cuda.empty_cache()
    gc.collect()
    return summaries

# Function to summarize using GPT-2 model
def summarize_with_gpt2(model_checkpoint: str, articles: list, batch_size: int = 8) -> list:
    """
    Summarize a list of articles using the GPT-2 model and a "TL;DR:" prompt.

    Each article is truncated to 500 tokens, suffixed with " TL;DR:", and fed
    to the model; the text generated after the prompt is returned as the
    summary.

    Args:
        model_checkpoint (str): The pre-trained GPT-2 model checkpoint.
        articles (list): List of articles to summarize.
        batch_size (int, optional): Batch size for inference. Defaults to 8.

    Returns:
        list: One generated summary per article, in input order, with
        surrounding whitespace stripped.
    """
    # Set the device for computation
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Load the pre-trained GPT-2 tokenizer and model. Padding goes on the left
    # so that every prompt ends at the last input column, right where
    # generation continues.
    tokenizer = GPT2Tokenizer.from_pretrained(
        model_checkpoint, padding_side="left", cache_dir="./datasets"
    )
    # GPT-2 has no dedicated padding token, so the end-of-sequence token is reused
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
    model = GPT2LMHeadModel.from_pretrained(
        model_checkpoint,
        pad_token_id=tokenizer.eos_token_id,
        cache_dir="./datasets",
    ).to(device)

    # Function to perform inference using the GPT-2 model
    def perform_inference(batch: list) -> list:
        # Truncate each article to 500 tokens and decode it back to text, so
        # the " TL;DR:" suffix appended below still fits within 512 tokens
        tmp_inputs = tokenizer(
            batch, max_length=500, return_tensors="pt", padding=True, truncation=True
        )
        tmp_inputs_decoded = tokenizer.batch_decode(
            tmp_inputs.input_ids, skip_special_tokens=True
        )
        inputs = tokenizer(
            [article + " TL;DR:" for article in tmp_inputs_decoded],
            max_length=512,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        # Generate the summary.
        # NOTE(review): max_length counts the (padded) prompt as well, so
        # batches with shorter prompts get a larger generation budget than 32
        # tokens - confirm whether a fixed max_new_tokens was intended.
        summary_ids = model.generate(
            inputs.input_ids.to(device),
            attention_mask=inputs.attention_mask.to(device),
            num_beams=2,
            min_length=0,
            max_length=512 + 32,
        )

        # The returned sequences start with the prompt tokens. Decode only the
        # columns after the prompt instead of searching the decoded text for
        # "TL;DR:", which picks the wrong position when the article itself
        # contains that marker or when the marker cannot be found.
        prompt_length = inputs.input_ids.shape[1]
        return tokenizer.batch_decode(
            summary_ids[:, prompt_length:], skip_special_tokens=True
        )

    try:
        # List to store the decoded summaries
        decoded_summaries = []
        for batch in batch_generator(articles, batch_size=batch_size):  # Iterate over batches
            decoded_summaries += perform_inference(batch)

            # Release cached GPU memory between batches
            torch.cuda.empty_cache()
            gc.collect()

        # Drop the whitespace that separates the prompt from the generated text
        return [summary.strip() for summary in decoded_summaries]
    finally:
        # Clean up resources even when inference fails part-way through
        del tokenizer
        del model
        torch.cuda.empty_cache()
        gc.collect()

# Summarize the sampled articles with T5 and score them against the reference highlights
t5_small_summaries = summarize_with_t5("t5-small", sample["article"])
t5_rouge_scores = compute_rouge_score(t5_small_summaries, sample["highlights"])

# Do the same with GPT-2
gpt2_summaries = summarize_with_gpt2("gpt2", sample["article"])
gpt2_rouge_scores = compute_rouge_score(gpt2_summaries, sample["highlights"])

# Column label in the results table -> key in the ROUGE score dictionaries.
# NOTE(review): the table mixes ROUGE-L F1 with ROUGE-Lsum precision/recall - confirm this is intended.
metric_columns = {
    "ROUGE-1 F1": "rouge1_fmeasure",
    "ROUGE-1 Precision": "rouge1_precision",
    "ROUGE-1 Recall": "rouge1_recall",
    "ROUGE-2 F1": "rouge2_fmeasure",
    "ROUGE-2 Precision": "rouge2_precision",
    "ROUGE-2 Recall": "rouge2_recall",
    "ROUGE-L F1": "rougeL_fmeasure",
    "ROUGE-Lsum Precision": "rougeLsum_precision",
    "ROUGE-Lsum Recall": "rougeLsum_recall",
}

# One row per model: the model name followed by each metric as a plain Python number
results_table = {"Model": ["T5-Small", "GPT-2"]}
results_table.update({
    column: [t5_rouge_scores[metric].item(), gpt2_rouge_scores[metric].item()]
    for column, metric in metric_columns.items()
})
results_df = pd.DataFrame(results_table)

# Show the comparison table
print("ROUGE Scores:")
display(results_df)
                                          


Note: you may need to restart the kernel to use updated packages.


Downloading builder script:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/default to ./datasets/cnn_dailymail/default/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to ./datasets/cnn_dailymail/default/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/288 [00:00<?, ?ba/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

ROUGE Scores:


Unnamed: 0,Model,ROUGE-1 F1,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-2 F1,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-L F1,ROUGE-Lsum Precision,ROUGE-Lsum Recall
0,T5-Small,0.29345,0.390506,0.241019,0.1122,0.150521,0.091911,0.224367,0.3627,0.22251
1,GPT-2,0.184629,0.259675,0.154533,0.044773,0.065962,0.036355,0.14407,0.242341,0.141547
