In [None]:
!pip install datasets
from datasets import load_dataset

try:
  dataset = load_dataset("cnn_dailymail","3.0.0")
  print(f"Features: {dataset['train'].column_names}")
except Exception as e:
  print(f"An error occurred: {e}")


In [None]:
# Preview the second training example: a header with the full article length,
# the first 500 characters of the article, then the reference summary.
sample = dataset["train"][1]
print(
    f""" Article (excerpt of 500 characters, total length: {len(sample["article"])}): """,
    sample["article"][:500],
    sep="\n",
)
print(
    f'\nSummary (length: {len(sample["highlights"])}):',
    sample["highlights"],
    sep="\n",
)

In [None]:
# Keep only the first 2000 characters of the second training article; this
# truncated text is what the summarisers in the following cells receive.
sample_text = dataset["train"][1]["article"][:2000]
# We'll collect the generated summaries of each model in a dictionary
# (keyed by a short model name, filled in by the cells below).
summaries = {}

In [None]:
!pip install nltk
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")
string = "The U.S. are a country. The U.N. is an organization."
sent_tokenize(string)

In [None]:
def three_sentence_summary(text):
    """Return the leading sentences of ``text`` joined by newlines.

    The text is split with ``sent_tokenize`` and at most the first three
    sentences are kept; shorter texts are returned in full.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)


# Record the three-sentence extract of the sample article under "baseline".
summaries["baseline"] = three_sentence_summary(sample_text)

In [None]:
from transformers import pipeline, set_seed
# Fix the random seed before building the pipeline and generating text.
set_seed(42)
# Text-generation pipeline backed by the "gpt2-xl" checkpoint.
pipe = pipeline("text-generation", model="gpt2-xl")
# Prompt = sample article followed by a "TL;DR:" line.
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)

# Drop the first len(gpt2_query) characters of the generated text (assumes the
# pipeline output starts with the prompt -- TODO confirm), then store the
# remainder with one sentence per line.
summaries["gpt2"] = "\n".join( sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

In [None]:
# Summarise the sample text with the "t5-large" checkpoint through the
# summarization pipeline.
pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)
# Store the generated summary with each sentence on its own line.
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))


In [None]:
# Summarise the sample text with the "google/pegasus-cnn_dailymail" checkpoint
# through the summarization pipeline.
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)
# Replace every " .<n>" marker in the output with a period followed by a
# newline before storing the summary.
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")


In [None]:
# Show the reference highlights first, then every collected summary under an
# upper-cased model name, each section followed by a blank line.
print("GROUND TRUTH", dataset["train"][1]["highlights"], "", sep="\n")
for model_name in summaries:
    print(model_name.upper(), summaries[model_name], "", sep="\n")

In [None]:
!pip install sacrebleu
from datasets import load_metric
bleu_metric = load_metric("sacrebleu")

In [None]:
import pandas as pd
import numpy as np
# Toy example: score a prediction that repeats "the" six times against a
# single reference sentence.
bleu_metric.add( prediction="the the the the the the", reference=["the cat is on the mat"])
# Compute the score with the "floor" smoothing method and a smoothing value of 0.
results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
# Round every entry of results["precisions"] to two decimals for display.
results["precisions"] = [np.round(p, 2) for p in results["precisions"]]
# One row per result key, shown as the cell output.
pd.DataFrame.from_dict(results, orient="index", columns=["Value"])


In [None]:
!pip install rouge_score
rouge_metric = load_metric("rouge")


In [None]:
# Score every collected summary against the reference highlights of the same
# article and tabulate the mid F-measure of each ROUGE variant per model.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
reference = dataset["train"][1]["highlights"]
records = []
for model_name in summaries:
    # Add one prediction/reference pair, then compute the score for it.
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
    records.append(rouge_dict)
pd.DataFrame.from_records(records, index=summaries.keys())

In [None]:
def evaluate_summaries_baseline(dataset, metric, column_text="article", column_summary="highlights"):
    """Score the three-sentence baseline on ``dataset`` with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` is
            iterated to obtain the input texts.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the texts to summarise.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all predictions were added.
    """
    predictions = list(map(three_sentence_summary, dataset[column_text]))
    metric.add_batch(predictions=predictions, references=dataset[column_summary])
    return metric.compute()


In [None]:
# Shuffle the test split with a fixed seed and keep its first 1000 rows.
test_sampled = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(1000))
)
# Score the three-sentence baseline on that sample and show the mid F-measure
# of each ROUGE variant as a single "baseline" row.
score = evaluate_summaries_baseline(test_sampled, rouge_metric)
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline"]).transpose()


In [None]:
from tqdm import tqdm
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements.

    Args:
        list_of_elements: Any object supporting ``len()`` and slicing
            (list, tuple, string, ...).
        batch_size: Maximum number of elements per chunk; must be positive.

    Yields:
        Consecutive slices of ``list_of_elements`` holding ``batch_size``
        elements each; the final slice may be shorter.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently yield nothing,
    # and a zero step fails with an unrelated-looking range() error.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    for start in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[start : start + batch_size]
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text="article", column_summary="highlights"):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; both selected columns must
            support ``len()`` and slicing.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Object exposing ``generate(...)``.
            NOTE(review): presumably already placed on ``device`` by the
            caller -- only the input tensors are moved here; confirm.
        tokenizer: Callable returning "input_ids" and "attention_mask"
            tensors, and providing ``decode``.
        batch_size: Number of examples per generation batch.
        device: Target for the input tensors; the default is the module-level
            ``device`` as it was when this function was defined.
        column_text: Name of the column holding the texts to summarise.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))
    batch_pairs = zip(article_batches, target_batches)

    for articles, targets in tqdm(batch_pairs, total=len(article_batches)):
        # Inputs are truncated and padded to exactly 1024 tokens.
        encoded = tokenizer(
            articles,
            max_length=1024,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        generated = model.generate(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
            length_penalty=0.8,
            num_beams=8,
            max_length=128,
        )

        predictions = []
        for token_ids in generated:
            text = tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            # Replace the "<n>" markers in the decoded text with spaces.
            predictions.append(text.replace("<n>", " "))

        metric.add_batch(predictions=predictions, references=targets)

    return metric.compute()
