In [None]:
# ==============================================================================
# FINAL T5 SUMMARIZATION TRAINING SCRIPT
# This script uses the best hyperparameters found from the previous sweep
# to train and save the final model efficiently.
# ==============================================================================

# --- 1. SETUP: Install necessary libraries ---
print("--- 1. INSTALLING LIBRARIES ---")
!pip install transformers datasets evaluate rouge_score accelerate bert_score -U -qq

import inspect
import os
import random
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import evaluate

# --- 2. GLOBAL CONFIGURATION ---
# Model and dataset identifiers.
MODEL_CHECKPOINT = "t5-small"
DATASET_NAME, DATASET_CONFIG = "cnn_dailymail", "3.0.0"

# Reproducibility and sample sizes.
SEED = 42
K_TRAIN, N_EVAL = 80, 1000  # few-shot training examples / examples per eval split

# Token-length caps applied when tokenizing articles and highlights.
MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 512, 150

# Output locations.
OUTPUT_DIR = "./final_t5_model_output"  # trainer output_dir (checkpoints)
SAVE_PATH = "./final_t5_summarizer"  # Path to save the final model for download

# BEST HYPERPARAMETERS (from your previous run)
BEST_LR, BEST_BATCH_SIZE = 5e-5, 16

# --- 3. DATA PREPARATION (Reproducible) ---
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Returns:
        None. When CUDA is available, all CUDA devices are seeded as well.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    # CUDA generators are only touched when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

seed_everything(SEED)

# The banner reports the configured K_TRAIN instead of a hard-coded number so
# it stays truthful when the constant is changed.
print(f"\n--- 3. LOADING AND SAMPLING DATA (K={K_TRAIN}) ---")
raw_datasets = load_dataset(DATASET_NAME, DATASET_CONFIG)

# Few-shot training subset: K_TRAIN examples drawn from a seeded shuffle.
train_dataset_few_shot = raw_datasets["train"].shuffle(seed=SEED).select(range(K_TRAIN))

# Carve the official validation split into two disjoint parts: the held-out
# "test" part has exactly N_EVAL rows, and the first N_EVAL rows of the
# remainder are used for validation during training.
test_valid_split = raw_datasets["validation"].train_test_split(test_size=N_EVAL, seed=SEED)
validation_dataset = test_valid_split["train"].select(range(N_EVAL))
test_dataset = test_valid_split["test"]  # already N_EVAL rows; no further select needed

small_dataset = DatasetDict({
    "train": train_dataset_few_shot,
    "validation": validation_dataset,
    "test": test_dataset,
})

# Cheap sanity checks on the sampled split sizes.
assert len(small_dataset["train"]) == K_TRAIN
assert len(small_dataset["validation"]) == N_EVAL
assert len(small_dataset["test"]) == N_EVAL

print(small_dataset)

# --- 4. TOKENIZATION ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples, prefix="summarize: "):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with "article" and "highlights" lists of strings.
        prefix: Task prefix prepended to every article before tokenization.

    Returns:
        The tokenizer output for the prefixed articles (truncated to
        MAX_INPUT_LENGTH) with an added "labels" entry holding the token ids of
        the highlights (truncated to MAX_TARGET_LENGTH).
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = small_dataset.map(preprocess_function, batched=True)

# --- 5. METRICS & DATA COLLATOR ---
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores (as percentages) for generated summaries.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays as passed
            by the trainer. ``predictions`` may also arrive as a tuple whose
            first element holds the generated ids.

    Returns:
        Dict mapping each ROUGE key to its score multiplied by 100 and rounded
        to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is used as a padding/ignore value and is not a valid token id, so it
    # must be swapped for the pad token in BOTH arrays before decoding;
    # otherwise `batch_decode` can fail on the negative ids.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}

# Load the model once and share it with the collator. The collator only uses
# the model to prepare decoder inputs from the labels, so loading a second
# pretrained copy just for it wastes time and memory.
model_final = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_final)


# --- 6. FINAL MODEL TRAINING ---
print("\n--- 6. STARTING FINAL MODEL TRAINING WITH BEST PARAMETERS ---")
print(f"Using Learning Rate: {BEST_LR}, Batch Size: {BEST_BATCH_SIZE}")

# NOTE: when Weights & Biases is installed (as on Colab) the trainer may prompt
# for an API key before training; pass report_to="none" below to train without it.
training_args_final = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BEST_BATCH_SIZE,
    per_device_eval_batch_size=8,
    learning_rate=BEST_LR,
    num_train_epochs=15, # Train for 15 epochs as planned
    seed=SEED, # Keep the trainer's seed tied to the notebook-wide SEED
    predict_with_generate=True,
    # Let evaluation generate summaries as long as the tokenized targets.
    # Without this the length cap comes from the model's generation config,
    # which is typically much shorter (20 tokens unless the checkpoint sets
    # one — confirm for your version) and would truncate summaries before
    # ROUGE is computed.
    generation_max_length=MAX_TARGET_LENGTH,
    fp16=torch.cuda.is_available(),
    logging_dir='./logs_final',
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # SAVE SPACE: keep as few checkpoints as possible. With
    # load_best_model_at_end=True the best checkpoint is always retained, so up
    # to two checkpoints (best + most recent) may exist on disk.
    save_total_limit=1,
    load_best_model_at_end=True, # Load the best model at the end of training
    metric_for_best_model="eval_rougeL", # Optimize for ROUGE-L score
    greater_is_better=True,
)

# Newer transformers releases name the tokenizer argument `processing_class`
# and deprecate `tokenizer`; pick whichever the installed version accepts so
# the notebook keeps working across upgrades.
_trainer_init_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_init_params else "tokenizer"

trainer_final = Seq2SeqTrainer(
    model=model_final,
    args=training_args_final,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer_final.train()

# --- 7. SAVE FINAL MODEL & TOKENIZER FOR DEPLOYMENT ---
print(f"\n--- 7. SAVING FINAL MODEL TO: {SAVE_PATH} ---")
trainer_final.save_model(SAVE_PATH)
# Save the tokenizer explicitly so the success message below holds regardless
# of whether the trainer was constructed with a tokenizer attached.
tokenizer.save_pretrained(SAVE_PATH)
print("Model and tokenizer saved successfully!")

# --- 8. FINAL EVALUATION ON TEST SET (with BERTScore) ---
print("\n--- 8. EVALUATING FINAL MODEL ON THE TEST SET ---")

# Allow generated summaries to be as long as the tokenized references; without
# an explicit cap the generation config's default length applies, which can
# truncate summaries well before MAX_TARGET_LENGTH.
test_results = trainer_final.predict(tokenized_datasets["test"], max_length=MAX_TARGET_LENGTH)

# Decode predictions and labels for metric calculation.
preds = test_results.predictions
if isinstance(preds, tuple):
    preds = preds[0]
# -100 marks padding/ignored positions and is not a valid token id: replace it
# in the predictions as well as the labels before decoding.
preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
labels = np.where(test_results.label_ids != -100, test_results.label_ids, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Calculate ROUGE (reported as percentages rounded to 2 decimals)
final_rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
final_rouge_pct = {k: round(v * 100, 2) for k, v in final_rouge.items()}

# Calculate BERTScore (mean F1 over the test examples, scaled to 0-100)
bertscore_metric = evaluate.load("bertscore")
final_bertscore = bertscore_metric.compute(
    predictions=decoded_preds,
    references=decoded_labels,
    lang="en"
)
avg_bertscore_f1 = np.mean(final_bertscore["f1"]) * 100

print("\n\n====================================================================")
print("                    FINAL REPORT DATA")
print("====================================================================")
print(f"Final ROUGE-1 (Test Set): {final_rouge_pct['rouge1']}%")
print(f"Final ROUGE-2 (Test Set): {final_rouge_pct['rouge2']}%")
print(f"Final ROUGE-L (Test Set): {final_rouge_pct['rougeL']}%")
print(f"Final BERTScore F1 (Test Set): {avg_bertscore_f1:.4f}")
print("====================================================================")
print("Project execution complete. Your model is saved and ready for download.")

--- 1. INSTALLING LIBRARIES ---

--- 3. LOADING AND SAMPLING DATA (K=80) ---
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
})


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


--- 6. STARTING FINAL MODEL TRAINING WITH BEST PARAMETERS ---
Using Learning Rate: 5e-05, Batch Size: 16


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbaralabbal[0m ([33mbaralabbal-kathmandu-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.6793,2.555138,23.7919,10.3411,19.2535,19.2292
2,2.4783,2.40409,24.0933,10.6568,19.4805,19.4666
3,2.3617,2.297115,24.2519,10.9251,19.632,19.6205
4,2.2762,2.221111,24.3767,11.1403,19.7513,19.7503
5,2.2016,2.166023,24.2727,11.2349,19.7734,19.7662
6,2.1326,2.127271,24.4186,11.3535,19.8754,19.8471
7,2.1023,2.097389,24.3095,11.2614,19.7865,19.7676
8,2.0686,2.074191,24.321,11.3811,19.7974,19.7929
9,2.003,2.056064,24.3649,11.44,19.8403,19.8311
10,2.012,2.042602,24.1565,11.3109,19.644,19.644


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



--- 7. SAVING FINAL MODEL TO: ./final_t5_summarizer ---
Model and tokenizer saved successfully!

--- 8. EVALUATING FINAL MODEL ON THE TEST SET ---


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




                    FINAL REPORT DATA
Final ROUGE-1 (Test Set): 25.32%
Final ROUGE-2 (Test Set): 12.53%
Final ROUGE-L (Test Set): 21.11%
Final BERTScore F1 (Test Set): 86.8569
Project execution complete. Your model is saved and ready for download.


In [None]:
# 1. Compress the saved model folder into a single zip file.
# The folder name MUST match the path from your script: final_t5_summarizer
!zip -r final_t5_summarizer.zip final_t5_summarizer

# 2. Trigger the download of the zip file to your local machine (Mac).
from google.colab import files
files.download('final_t5_summarizer.zip')

  adding: final_t5_summarizer/ (stored 0%)
  adding: final_t5_summarizer/config.json (deflated 63%)
  adding: final_t5_summarizer/training_args.bin (deflated 53%)
  adding: final_t5_summarizer/tokenizer.json (deflated 74%)
  adding: final_t5_summarizer/tokenizer_config.json (deflated 95%)
  adding: final_t5_summarizer/special_tokens_map.json (deflated 85%)
  adding: final_t5_summarizer/generation_config.json (deflated 28%)
  adding: final_t5_summarizer/spiece.model (deflated 48%)
  adding: final_t5_summarizer/model.safetensors (deflated 12%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>