In [None]:
!pip install transformers datasets rouge_score

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from rouge_score import rouge_scorer
import numpy as np

# 1. Dataset: use only the first 1% of the CNN/DailyMail (config "3.0.0")
#    training split.
dataset = load_dataset(
    "cnn_dailymail",
    "3.0.0",
    split="train[:1%]",
)

# 2. Model: the small T5 checkpoint. 't5-base' can be substituted for better
#    quality. The tokenizer and the model are loaded from the same checkpoint
#    name so their vocabularies match.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 3. Preprocess Data (Optimized for Abstractive Summarization)
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for T5 fine-tuning.

    Args:
        examples: A batch of rows providing an ``"article"`` column (source
            documents) and a ``"highlights"`` column (reference summaries).

    Returns:
        The tokenizer output for the ``"summarize: "``-prefixed articles
        (truncated/padded to 512 tokens) with an extra ``"labels"`` entry
        holding the highlight token ids (truncated/padded to 150 tokens),
        where every padding position is replaced by -100.
    """
    inputs = ["summarize: " + doc for doc in examples["article"]]
    # Plain Python lists are returned (no return_tensors): tensors are built
    # later, when batches are collated for the model.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(examples["highlights"], max_length=150, truncation=True, padding="max_length")

    # Padding tokens in the labels must not contribute to the loss. -100 is
    # the index the cross-entropy loss ignores, so swap it in for pad ids.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole subset in batches; adds input_ids / attention_mask /
# labels columns next to the original text columns.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 4. Training (single short pass over the subset)
# Evaluation is left at its default ("no"): the `evaluation_strategy` keyword
# is deprecated in favour of `eval_strategy`, so omitting it keeps this cell
# working on both older and newer transformers releases.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): 3e-5 is below the TrainingArguments default of 5e-5; the
    # T5 documentation suggests 1e-4 to 3e-4 with AdamW. Consider raising it.
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Train for only ONE epoch
    weight_decay=0.01,
    save_steps=10000,
    logging_steps=100,
    push_to_hub=False,
    report_to="none"  # Disable external experiment loggers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    # NOTE(review): newer transformers releases appear to warn that
    # `tokenizer` is deprecated in favour of `processing_class`; kept as-is
    # for compatibility with older versions.
    tokenizer=tokenizer,
)

trainer.train()

# 5. Inference (Abstractive Summarization)
def generate_summary(text):
    """Return an abstractive summary of *text* produced by beam search.

    Args:
        text: The document to summarize. It is prefixed with ``"summarize: "``
            and truncated to 512 tokens, the same limit used when
            preprocessing the training data.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # The model is left in training mode after fine-tuning; switch to eval
    # mode so dropout does not perturb generation.
    model.eval()
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(model.device)
    output = model.generate(
        input_ids,
        max_length=100,
        num_beams=5,  # More beams for better summarization
        early_stopping=True,
        # Exponent applied to sequence length in beam scoring; positive
        # values favour longer sequences, negative values shorter ones.
        length_penalty=1.2,
        # `temperature` is intentionally not passed: it only takes effect
        # with do_sample=True and is ignored by plain beam search.
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# 6. Examples
# Sample passages used to eyeball the quality of the generated summaries.
texts = [
    "Climate change is increasingly affecting global agriculture, posing significant challenges to food security and livelihoods. Rising temperatures, altered precipitation patterns, and increased frequency of extreme weather events are some of the key factors impacting crop yields and livestock productivity. For instance, prolonged droughts can lead to water scarcity, affecting irrigation and reducing crop growth. Similarly, excessive rainfall can cause flooding, damaging crops and soil health.",
    "The Sri Lankan government said on Saturday that the recent killings in the country, including that of a prominent underworld figure, appear to be an attempt to undermine the ongoing investigations into high-profile criminal cases such as the 2019 Easter Sunday suicide bombings. The government’s response came following this week’s courtroom shooting that left the underworld figure dead.  The incident sent shockwaves across the country, leading to the opposition probing the government’s measures to tackle the spate of violence.",
]

# Print each passage, then its summary. The summary is generated only after
# the original text has been printed, so output appears in that order.
for number, text in enumerate(texts, start=1):
    print(f"\n**Original Text {number}:**\n{text}\n")
    summary = generate_summary(text)
    print(f"**Generated Summary {number}:**\n{summary}\n")


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,3.61
200,1.0952


Step,Training Loss
100,3.61
200,1.0952
300,1.0298



**Original Text 1:**
Climate change is increasingly affecting global agriculture, posing significant challenges to food security and livelihoods. Rising temperatures, altered precipitation patterns, and increased frequency of extreme weather events are some of the key factors impacting crop yields and livestock productivity. For instance, prolonged droughts can lead to water scarcity, affecting irrigation and reducing crop growth. Similarly, excessive rainfall can cause flooding, damaging crops and soil health.





**Generated Summary 1:**
Climate change is increasingly affecting global agriculture, posing significant challenges to food security and livestock productivity. Rising temperatures, altered precipitation patterns, and increased frequency of extreme weather events are key factors affecting crop yields and livestock productivity. For example, prolonged droughts can lead to flooding, flooding and reducing crop growth. excessive rainfall can cause flooding, reducing crop growth.


**Original Text 2:**
The Sri Lankan government said on Saturday that the recent killings in the country, including that of a prominent underworld figure, appear to be an attempt to undermine the ongoing investigations into high-profile criminal cases such as the 2019 Easter Sunday suicide bombings. The government’s response came following this week’s courtroom shooting that left the underworld figure dead.  The incident sent shockwaves across the country, leading to the opposition probing the government’s measure