## Loading Libraries

In [15]:
from transformers import AutoTokenizer, BartForConditionalGeneration
import torch
from datasets import load_dataset
import pandas as pd
from rouge import Rouge
from tqdm import tqdm

## Loading model and tokenizer

In [None]:

# One checkpoint name feeds both loaders so the model and tokenizer always match.
MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Move the model's weights onto the selected device.
model = model.to(device)

Using device: cuda


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'

## Loading dataset

In [None]:
# Load the CNN/DailyMail corpus (configuration "3.0.0").
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

# Keep only the leading rows of each split and hold them as DataFrames.
train_dataset = pd.DataFrame(cnn_dataset["train"][:50000])
validation_dataset = pd.DataFrame(cnn_dataset["validation"][:10000])
test_dataset = pd.DataFrame(cnn_dataset["test"][:10000])

# Prepend the task prefix to every article in every split.
# NOTE(review): a "summarize: " prefix is usually associated with T5-style
# models -- confirm it is intended for the BART checkpoint used here.
for split_frame in (train_dataset, validation_dataset, test_dataset):
    split_frame["article"] = "summarize: " + split_frame["article"]

## Generating summarized texts

In [18]:
# Summarise every validation article in fixed-size batches and pair each
# generated summary with its reference highlights in `resultdf`.
#
# Batches are cut by row position (not by DataFrame index label), and the
# final, possibly shorter, batch is processed too. This keeps the two result
# columns the same length even when the row count is not a multiple of the
# batch size, or when the frame's index is not 0..n-1.
batch_size = 25  # articles sent to the model per generate() call

articles = validation_dataset["article"].tolist()
references = validation_dataset["highlights"].tolist()
generated = []

with tqdm(total=len(articles), desc="Processing rows") as progress:
    for start in range(0, len(articles), batch_size):
        batch = articles[start:start + batch_size]

        # Articles are truncated to 512 tokens and padded to the longest
        # article in the batch.
        inputs = tokenizer(batch, max_length=512, return_tensors="pt", truncation=True, padding=True)
        inputs = inputs.to(device)

        # Pass the attention mask explicitly so padded positions are masked
        # by the tokenizer's own mask rather than by whatever generate()
        # would infer from the input ids.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=2,
            min_length=0,
            max_length=50,
        )

        text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        generated.extend(text)
        progress.update(len(batch))

# Guard the invariant the ROUGE step relies on: one summary per reference.
assert len(generated) == len(references), "every article must have a generated summary"

resultdf = pd.DataFrame({"actual_text": references, "generated_text": generated})

Processing rows: 100%|██████████| 10000/10000 [12:26<00:00, 13.39it/s]


## Calculating ROUGE score

In [19]:
def calculate_rouge(summaries, references):
    """Score generated summaries against their references with ROUGE.

    Args:
        summaries: Generated summary texts.
        references: Reference texts, aligned one-to-one with ``summaries``.

    Returns:
        Whatever ``Rouge.get_scores`` returns when called with ``avg=True``.
    """
    return Rouge().get_scores(summaries, references, avg=True)


# Last expression of the cell, so the scores are displayed as its output.
calculate_rouge(resultdf["generated_text"], resultdf["actual_text"])

{'rouge-1': {'r': 0.3416131258146902,
  'p': 0.4527089427163319,
  'f': 0.3800432569333121},
 'rouge-2': {'r': 0.15453648858318156,
  'p': 0.21279432872786946,
  'f': 0.17343168991175326},
 'rouge-l': {'r': 0.3224864629101037,
  'p': 0.4273725747172634,
  'f': 0.35877483938320526}}