In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells write their
# results and logs beneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Install necessary libraries
!pip install transformers datasets torch accelerate matplotlib evaluate



In [None]:
!pip install rouge_score



In [None]:
# Import required libraries
import torch
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback, EarlyStoppingCallback

from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Fetch CNN/DailyMail (config "3.0.0")
dataset = load_dataset("cnn_dailymail", "3.0.0")


def _shuffled_head(split, count):
    """Shuffle `split` with the fixed seed 42 and keep its first `count` rows."""
    return split.shuffle(seed=42).select(range(count))


# Work on small subsets so a full run stays quick
small_train_dataset = _shuffled_head(dataset["train"], 1000)  # 1,000 training examples
small_validation_dataset = _shuffled_head(dataset["validation"], 100)  # 100 validation examples

# Pretrained T5-small tokenizer and sequence-to-sequence model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Preprocessing function for summarization
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for T5 summarization.

    Args:
        examples: Batch mapping with an "article" list and a "highlights" list
            of strings (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, padded or
        truncated to 512 tokens, plus a "labels" entry with the summary token
        ids padded or truncated to 150 tokens. Padding positions in "labels"
        are set to -100 so they do not contribute to the training loss.
    """
    inputs = ["summarize: " + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target=` tokenizes the summaries as targets; it supersedes the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=150,
        truncation=True,
        padding="max_length",
    )

    # Fixed-length padding is kept so every example has the same shape, but the
    # pad ids must not be scored: -100 is the label value the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Run the preprocessing over both subsets in batches; the raw text columns are
# dropped from the result.
_map_options = {"batched": True, "remove_columns": ["article", "highlights"]}
tokenized_train_dataset = small_train_dataset.map(preprocess_function, **_map_options)
tokenized_validation_dataset = small_validation_dataset.map(preprocess_function, **_map_options)

# ROUGE scorer used by compute_metrics during evaluation
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between predicted tokens and the reference summaries.

    Args:
        eval_pred: Pair `(predictions, labels)`. `predictions` may be a tuple
            whose first element is the logits, a logits array (vocabulary on
            the last axis), or an array of token ids. `labels` holds the
            reference token ids; -100 marks positions to ignore.

    Returns:
        The dict returned by `rouge_metric.compute` for the decoded texts.

    NOTE(review): when `predictions` are logits from a teacher-forced forward
    pass (rather than generated sequences), the resulting ROUGE is likely
    optimistic compared with free-running generation -- confirm before
    reporting these numbers.
    """
    predictions, labels = eval_pred

    # Some models return (logits, extra outputs...); only the logits are needed.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits have one more axis (vocabulary) than the labels; reduce them to
    # token ids directly in NumPy to avoid copying the whole array into torch.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # -100 is not a valid token id, so swap it for the pad id before decoding.
    # Predictions aligned with ignored label positions are blanked as well, so
    # that tokens emitted where there is no reference do not affect the score.
    pad_id = tokenizer.pad_token_id
    ignored = labels == -100
    if predictions.shape == labels.shape:
        predictions = np.where(ignored, pad_id, predictions)
    labels = np.where(ignored, pad_id, labels)

    # Special tokens (padding, end-of-sequence) are dropped while decoding.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    print("ROUGE metrics:", result)
    return result





# Training arguments: evaluate, log and checkpoint once per epoch, and reload
# the checkpoint with the lowest eval loss when training finishes.
# Early-stopping patience is not set here; it is configured on the
# EarlyStoppingCallback passed to the Trainer.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Text_summarizer/abstractive/results",
    # NOTE(review): newer transformers releases call this `eval_strategy`;
    # rename when upgrading -- confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Text_summarizer/abstractive/logs",
    logging_strategy="epoch",  # Log every epoch
    save_strategy="epoch",  # Checkpoint every epoch (must match the evaluation schedule)
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,  # Reload the best checkpoint after training
    metric_for_best_model="eval_loss",  # Best checkpoint / early stopping criterion
    greater_is_better=False,  # Lower eval_loss is better
    report_to="none",  # Avoid using wandb for reporting
    save_total_limit=2,  # Retain at most 2 checkpoints in output_dir
)

# Custom callback for printing metrics and saving models
class PrintMetricsAndSaveModelCallback(TrainerCallback):
    """Print the eval loss after each evaluation and save the model after each epoch.

    The loss is reported from `on_evaluate`, which receives the metrics of the
    evaluation that has just finished. Reading `state.log_history` from
    `on_epoch_end` instead shows the previous epoch's value, because that hook
    runs before the epoch's evaluation is logged.

    The model snapshot is written with the `model` and `args` handed to the
    hook, so the callback does not rely on a module-level `trainer` variable.
    """

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Print the eval loss for the evaluation that just completed.

        Args:
            args: The training arguments (unused here).
            state: Trainer state; `state.epoch` labels the printed line.
            control: Trainer control object (left untouched).
            metrics: Dict of evaluation metrics, or None/empty if unavailable.
        """
        if metrics:
            eval_loss = metrics.get("eval_loss", "N/A")
            print(f"Epoch {state.epoch}: Eval Loss: {eval_loss}")
        else:
            print(f"Epoch {state.epoch}: No evaluation metrics available.")

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Save the current model weights under `<output_dir>/model_epoch_<n>`.

        Args:
            args: The training arguments; `args.output_dir` is the save root.
            state: Trainer state; `int(state.epoch)` numbers the snapshot.
            control: Trainer control object (left untouched).
            model: The model being trained; nothing is saved if it is None.
        """
        if model is None:
            return
        model.save_pretrained(f"{args.output_dir}/model_epoch_{int(state.epoch)}")

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    # None lets the Trainer pick its default collator; the examples are already
    # padded to fixed lengths by preprocess_function, so no extra padding is needed.
    data_collator=None,
    compute_metrics=compute_metrics,
    # Per-epoch reporting/saving plus early stopping after 2 evaluations without improvement
    callbacks=[PrintMetricsAndSaveModelCallback(), EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train the model
trainer.train()

# Visualize training and validation loss
train_results = trainer.state.log_history  # Logs of the training process

# Training entries carry 'loss', evaluation entries carry 'eval_loss'; each
# logged entry also records the epoch it belongs to, which is used as the x value
# so the axis shows real epoch numbers instead of 0-based list positions.
train_losses = [x['loss'] for x in train_results if 'loss' in x]
eval_losses = [x['eval_loss'] for x in train_results if 'eval_loss' in x]
train_epochs = [x['epoch'] for x in train_results if 'loss' in x]
eval_epochs = [x['epoch'] for x in train_results if 'eval_loss' in x]

# Plotting loss
fig, ax = plt.subplots()
ax.plot(train_epochs, train_losses, marker='o', label='Train Loss')
ax.plot(eval_epochs, eval_losses, marker='o', label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss over Epochs')
ax.legend()
plt.show()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.9201,1.182131,0.573781,0.315924,0.525371,0.525643
2,1.1409,1.112833,0.57532,0.317913,0.527198,0.527546
3,1.0806,1.094868,0.57712,0.321183,0.529795,0.529995


Epoch 1.0: No evaluation metrics available.
ROUGE metrics: {'rouge1': 0.5737805652075986, 'rouge2': 0.3159243552709086, 'rougeL': 0.5253706165188925, 'rougeLsum': 0.525642899865516}
Epoch 2.0: Eval Loss: 1.1821306943893433
ROUGE metrics: {'rouge1': 0.5753196666963807, 'rouge2': 0.31791310338791834, 'rougeL': 0.5271983813636716, 'rougeLsum': 0.5275460035214241}
Epoch 3.0: Eval Loss: 1.1128331422805786
ROUGE metrics: {'rouge1': 0.5771204547222801, 'rouge2': 0.32118305545669223, 'rougeL': 0.5297946139660905, 'rougeLsum': 0.5299945873489484}
