In [1]:
!pip install transformers datasets rouge_score

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from rouge_score import rouge_scorer
import numpy as np

# 1. Load a VERY Small Subset of the Dataset
try:
    dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")  # Use only 1% of the training data
except ValueError as e:
    print(f"Error loading dataset: {e}")
    print("Check dataset name, version, and Hugging Face Hub availability.")
    raise

# 2. Choose a Smaller Model
model_name = 't5-small'  # 't5-small' is much faster than 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 3. Preprocess Data (Simplified)
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length", return_tensors="pt")  # Shorter max length

    labels = tokenizer(examples["highlights"], max_length=64, truncation=True, padding="max_length", return_tensors="pt") # Shorter max length

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 4. Training (Fewer Epochs, Smaller Batch Size)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Skip evaluation during training for speed
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Increase batch size if GPU allows
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Train for only ONE epoch
    weight_decay=0.01,
    save_steps=10000, #avoid saving checkpoints
    logging_steps=100,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    #eval_dataset=tokenized_datasets, #remove eval dataset
    tokenizer=tokenizer,
)

trainer.train()

# 5. Simplified Evaluation (Optional)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []

    for ref, pred in zip(decoded_labels, decoded_preds):
        try:
            score = scorer.score(ref, pred)
            rouge_scores.append(score)
        except Exception as e:
            print(f"Error calculating ROUGE score: {e}")
            print(f"Reference: {ref}")
            print(f"Prediction: {pred}")
            # Handle the error, e.g., by skipping this pair or assigning a default score
            continue

    # Calculate average ROUGE scores, handling cases where rouge_scores is empty
    if rouge_scores:
        avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
        avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
        avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

        return {
            'rouge1': avg_rouge1,
            'rouge2': avg_rouge2,
            'rougeL': avg_rougeL,
        }
    else:
        return {
            'rouge1': 0.0,
            'rouge2': 0.0,
            'rougeL': 0.0,
            'error': "No valid ROUGE scores calculated"
        }

# Override the prediction_step in Trainer to handle labels correctly
class CustomTrainer(Trainer):
    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys = None
    ):
        labels = inputs["labels"].clone()
        labels[labels == tokenizer.pad_token_id] = -100
        inputs["labels"] = labels
        return super().prediction_step(
            model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
        )
# comment out to skip evaluation

#trainer = CustomTrainer(
#    model=model,
#    args=training_args,
#    train_dataset=tokenized_datasets,
#    eval_dataset=tokenized_datasets,
#    tokenizer=tokenizer,
#    compute_metrics=compute_metrics,  # Pass the compute_metrics function here
#)

#trainer.train()
#eval_results = trainer.evaluate()
#print(f"Evaluation Results: {eval_results}")

# 6. Inference
text = "Climate change exerts profound and multifaceted impacts on agriculture, threatening global food security and livelihoods. Rising temperatures, shifting precipitation patterns and more frequent extreme weather events disrupt established agricultural practices. Crop yields dwindle due to heat stress, water scarcity from prolonged droughts and increased vulnerability to pests and diseases in warmer conditions. Livestock face diminished productivity and health challenges. Altered growing seasons disrupt planting and harvesting calendars, straining farmers' ability to predict and adapt. These challenges disproportionately affect vulnerable farming communities, exacerbating social and economic disparities. Mitigating the adverse effects of climate change on agriculture requires sustainable adaptation strategies. Resilient crop varieties, efficient water management and improved infrastructure are essential. Moreover, implementing international agreements like the Paris Agreement is crucial to coordinate global efforts in reducing greenhouse gas emissions, which underlie climate change. As the global population grows, addressing these challenges becomes increasingly urgent to ensure a secure and sustainable food supply for the future."
input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)  # Shorter summary length
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print(summary)


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,3.3807
200,2.6298
300,2.5071


Climate change exerts profound and multifaceted impacts on agriculture. Crop yields dwindle due to heat stress and water scarcity. Altered growing seasons disrupt planting and harvesting calendars. These challenges disproportionately affect vulnerable farming communities.


In [5]:
# Load dataset (increase size for better learning)
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

# Choose a better model
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing function with improved label handling
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=128, truncation=True, padding="max_length")

    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels["input_ids"][i]]
        for i in range(len(labels["input_ids"]))
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training Arguments (More epochs + evaluation)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Train for 3 epochs
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer
)

# Train the model
trainer.train()


  trainer = Trainer(


Step,Training Loss
100,1.787
200,1.6516
300,1.5977
400,1.6156
500,1.5595
600,1.5807
700,1.5724
800,1.5644
900,1.5392
1000,1.5084


TrainOutput(global_step=1077, training_loss=1.5937534002872562, metrics={'train_runtime': 1515.6891, 'train_samples_per_second': 5.683, 'train_steps_per_second': 0.711, 'total_flos': 5244954311393280.0, 'train_loss': 1.5937534002872562, 'epoch': 3.0})

In [6]:

# Inference with improved parameters
texts = ["The U.S. economy added 850,000 jobs in June, a sign of continued recovery as businesses reopen and consumers spend more. The unemployment rate, however, ticked up slightly to 5.9% from 5.8% in May.",
        "A massive wildfire in northern California has scorched over 150,000 acres, forcing thousands to evacuate. Firefighters are struggling to contain the blaze amid high temperatures and strong winds.",
        "Scientists have discovered a new species of dinosaur in Argentina. The creature, named 'Llukalkan aliocranianus,' lived approximately 80 million years ago and is believed to have been a formidable predator.",
        "The Tokyo 2020 Olympics, postponed due to the COVID-19 pandemic, are set to begin with strict health protocols in place. Athletes will undergo regular testing, and spectators will be limited to local residents.",
        "A recent study suggests that drinking coffee may reduce the risk of developing Alzheimer's disease. Researchers found that participants who consumed higher amounts of caffeine had a lower incidence of the neurodegenerative condition.",
        "The United Nations has called for an immediate ceasefire in the ongoing conflict in Yemen. The humanitarian crisis has worsened, with millions facing famine and limited access to medical supplies.",
        "Tech giant Apple has announced plans to invest $1 billion in building a new campus in North Carolina. The facility is expected to create thousands of jobs and bolster the state's economy."]

for text in texts:
  input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt").to(model.device)
  output = model.generate(
      input_ids,
      max_length=100,
      num_beams=3,
      early_stopping=True,
      do_sample=True,
      temperature=0.9,
      top_k=50,
      top_p=0.95
  )
  summary = tokenizer.decode(output[0], skip_special_tokens=True)
  print(summary)


U.S. economy added 850,000 jobs in June, sign of continued recovery . The unemployment rate ticked up slightly to 5.9% from 5.8% .
A massive wildfire in northern California has scorched over 150,000 acres . Firefighters are struggling to contain the blaze amid high temperatures and strong winds .
Scientists have discovered a new species of dinosaur in Argentina . The creature, named 'Lukalkan aliocranianus', lived approximately 80 million years ago .
Athletes will undergo regular testing, and spectators will be limited to local residents . The Tokyo 2020 Olympics are set to begin with strict health protocols in place .
Researchers found people who consumed more caffeine had a lower incidence of Alzheimer's disease .
The humanitarian crisis has worsened, with millions facing famine .
Tech giant Apple has announced plans to invest $1 billion in a new campus . The facility is expected to create thousands of jobs .
