## Importing our libraries

In [21]:
#!pip install spacy gensim
#!pip install datasets

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from rouge_score import rouge_scorer

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer



## Importing the Dataset

In [22]:
import datasets

# Load the "cnn_dailymail" dataset, configuration "3.0.0". No split is named
# here, so the result is indexed by split name below.
dataset = datasets.load_dataset("cnn_dailymail", "3.0.0")

# Bind each split to its own top-level name for convenience.
train, test, validation = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

In [23]:
#!pip install transformers torch rouge-score gensim
#!pip install transformers torch rouge-score sumy
#!pip install sumy



In [24]:
#!python -c "import nltk; nltk.download('punkt')"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading the pretrained model and comparing summaries using the ROUGE metric

In [48]:
# Load the pretrained 't5-small' checkpoint together with its matching tokenizer.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move the model to the GPU whenever PyTorch can see one; otherwise it stays
# wherever from_pretrained placed it.
if torch.cuda.is_available():
    model.to("cuda")

def generate_headline(article, max_input_length=512, max_output_length=150,
                      min_length=40, num_beams=4, length_penalty=2.0):
    """Summarize ``article`` with the module-level T5 ``model`` and ``tokenizer``.

    Args:
        article: Text to summarize. It is prefixed with ``"summarize: "``
            before tokenization.
        max_input_length: ``max_length`` used when tokenizing; longer inputs
            are truncated.
        max_output_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``. It is capped
            at ``max_output_length`` so the two limits never contradict.
        num_beams: Number of beams passed to ``model.generate``.
        length_penalty: Length penalty passed to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prefixed_text = f"summarize: {article}"
    inputs = tokenizer(
        prefixed_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Follow the model's own device rather than re-checking CUDA availability,
    # so inputs and weights always end up in the same place.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Forward the mask explicitly instead of leaving generate() to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_output_length,
        min_length=min(min_length, max_output_length),
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def sumy_summarize(article, sentences_count=1):
    """Build an extractive summary of ``article`` with sumy's ``LuhnSummarizer``.

    Args:
        article: Plain text to summarize; parsed with ``Tokenizer("english")``.
        sentences_count: Number of sentences requested from the summarizer.
            Defaults to 1 to keep the output short for ease of comparison.

    Returns:
        The sentences returned by the summarizer, joined by single spaces.
    """
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, sentences_count)
    # str() is the public way to read a sentence's text; it avoids reaching
    # into the private ``_text`` attribute.
    return ' '.join(str(sentence) for sentence in summary)

# Shared ROUGE scorer reporting the rouge1, rouge2 and rougeL variants,
# with the stemmer option enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Taking examples to generate the headlines

In [46]:
# Using the ROUGE metric to do the comparison.
# `dataset` is indexed by split name (dataset['train'], dataset['test'],
# dataset['validation']), so a split has to be picked before sampling rows.
# NOTE(review): 'train' is an assumption; the original `dataset.take(5)` named
# no split. Switch to 'test' or 'validation' if that was the intent.
for example in dataset['train'].select(range(5)):  # Taking 5 samples for our example
    article = example['article']
    original_highlight = example['highlights']
    generated_headline = generate_headline(article)
    sumy_summary = sumy_summarize(article)
    # RougeScorer.score takes (target, prediction). Both systems are scored
    # against the same human-written highlight so the two results are
    # directly comparable.
    t5_scores = scorer.score(original_highlight, generated_headline)
    sumy_scores = scorer.score(original_highlight, sumy_summary)

    print()
    print("Generated Headline (T5):", generated_headline)
    print()
    print("Sumy Summary:", sumy_summary)
    print()
    print("Original Highlight:", original_highlight)
    print()
    print("T5 ROUGE Scores:", t5_scores)
    print()
    print("Sumy ROUGE Scores:", sumy_scores)
    print("----")


Generated Headline (T5): the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. he will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II" despite his growing fame, he says he is keeping his feet firmly on the ground.

Sumy Summary: Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.

Original Highlight: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's earnings from first five Potter films have been held in trust fund .

T5 ROUGE Scores: {'rouge1': Score(precision=0.25862068965517243, recall=0.38461538461538464, fmeasure=0.30927835051546393), 'rouge2': Score(precision=0.19298245614035087, recall=0.28947368