In [1]:
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


False

In [3]:
pip install datasets transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
from datasets import load_dataset

In [5]:
# Load CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [6]:
from transformers import pipeline, set_seed

In [7]:
# truncate it
sample_text = dataset["train"][1]["article"][:2000]
summaries = {}

## BART

In [8]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

## Measuring the baseline: ROUGE

In [9]:
summaries

{'bart': 'Mentally ill inmates are housed on the "forgotten floor" of Miami-Dade jail.\nMost often, they face drug charges or charges of assaulting an officer.\nJudge Steven Leifman says the arrests often result from confrontations with police.\nHe says about one-third of all people in the county jails are mentally ill.'}

In [10]:
from datasets import load_metric

In [11]:
pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [12]:
rouge = load_metric("rouge")

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
actual = dataset["train"][1]["highlights"]
records = []
rge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [14]:
for model_name in summaries:
  rouge.add(prediction=summaries[model_name], reference=actual)
  score=rouge.compute()
  # print(score)
  tbl = dict((rn, score[rn].mid.fmeasure) for rn in rge_names)
  records.append(tbl)

In [15]:
import pandas as pd

In [16]:
pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart,0.475248,0.222222,0.316832,0.415842


# Evaluating over Entire Test Dataset

In [17]:
from tqdm import tqdm

In [18]:
def chunks(list_ele, batch_size):
  for i in range(0, len(list_ele), batch_size):
    yield list_ele[i : i+batch_size]

In [19]:
def evaluate_bart(dataset, metric, model, tokenizer,
                            batch_size=16, device='cpu',
                            column_text="article",
                            column_summary="highlights"):
    article_batches = [dataset[column_text][i:i+batch_size] for i in range(0, len(dataset[column_text]), batch_size)]
    target_batches = [dataset[column_summary][i:i+batch_size] for i in range(0, len(dataset[column_summary]), batch_size)]

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   max_length=128,
                                   num_beams=4,
                                   length_penalty=2.0,
                                   early_stopping=True)

        decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=True) for summary in summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

In [20]:
test_sampled = dataset["test"].shuffle(seed=1234).select(range(1000))

In [21]:
import torch

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [23]:
# hide_output
from transformers import BartForConditionalGeneration, BartTokenizer

model_ckpt = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt).to(device)
score = evaluate_bart(test_sampled, rouge,
                                model, tokenizer, batch_size=8)
rouge_dict = {rn: score[rn].mid.fmeasure for rn in score.keys()}
pd.DataFrame(rouge_dict, index=["bart"])


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 125/125 [1:55:23<00:00, 55.39s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart,0.427265,0.208488,0.300528,0.364702
