In [None]:
!pip install transformers datasets accelerate -U
!pip install rouge_score
!pip install evaluate

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.1 pya

In [None]:
# --- 1. Define Model Configurations ---
# One entry per supported checkpoint. Every entry carries a display name, the
# Hub model id, and the token limits applied to inputs and target summaries.
MODEL_CONFIGS = [
    dict(
        name="BART-Large (CNN)",
        model_id="facebook/bart-large-cnn",
        max_input_length=1024,
        target_max_length=150,
    ),
    # T5 often requires a task prefix; that is handled where the inputs are preprocessed.
    dict(
        name="T5-Small",
        model_id="t5-small",
        max_input_length=512,
        target_max_length=128,
    ),
    # PEGASUS often uses shorter inputs; the XSUM variant targets short, 'extreme' summaries.
    dict(
        name="PEGASUS-XSUM",
        model_id="google/pegasus-xsum",
        max_input_length=512,
        target_max_length=64,
    ),
]

# Active configuration: index 0 = BART, 1 = T5, 2 = PEGASUS.
CURRENT_MODEL_CONF = MODEL_CONFIGS[0]

# Unpack the settings of the active configuration into module-level constants.
MODEL_ID, MAX_INPUT_LENGTH, TARGET_MAX_LENGTH = (
    CURRENT_MODEL_CONF[key]
    for key in ("model_id", "max_input_length", "target_max_length")
)

print(f"--- Selected Model: {CURRENT_MODEL_CONF['name']} ({MODEL_ID}) ---")

--- Selected Model: BART-Large (CNN) (facebook/bart-large-cnn) ---


In [None]:
import torch
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# --- Configuration from Step 1 is used here ---

# 2. Load Dataset
# All splits of SAMSum are loaded here; the reduction to a small demo subset
# happens later, after tokenization.
raw_datasets = load_dataset("knkarthick/samsum")

# 3. Load Tokenizer and Model
# Both come from the checkpoint selected in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

# T5 checkpoints get a task prefix prepended to every input; every other
# model gets an empty prefix, i.e. its inputs are left unchanged.
prefix = "summarize: " if "t5" in MODEL_ID else ""

# 4. Preprocessing Function (Tokenization)
def preprocess_function(examples):
    """Tokenize a batch of dialogues and their reference summaries.

    Args:
        examples: A batch mapping with a "dialogue" list (model inputs) and a
            "summary" list (targets) of equal length.

    Returns:
        The tokenizer output for the (prefixed) dialogues with an extra
        "labels" entry holding the token ids of the summaries.
    """
    # Apply prefix for models like T5 (the prefix is empty for other models)
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True
    )

    # Tokenize the targets via `text_target`, which supersedes the deprecated
    # `as_target_tokenizer()` context manager. A separate call is required
    # because targets use a different max length than the inputs.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=TARGET_MAX_LENGTH,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches; the raw text/id columns are dropped so only
# the tokenized fields remain.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    remove_columns=["id", "dialogue", "summary"],
    batched=True,
)

# Keep the demo fast: shuffle with a fixed seed, then keep the first
# 1000 training examples and the first 100 validation examples.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_validation = tokenized_datasets["validation"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(1000))
small_eval_dataset = shuffled_validation.select(range(100))

# 5. Define Data Collator and Metrics
# Labels are padded with -100 (compute_metrics maps that value back to the pad
# token before decoding); sequence lengths are padded to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
)

# ROUGE is the evaluation metric for the generated summaries.
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        A dict mapping every key returned by ``metric.compute`` to its score
        scaled by 100, plus ``"gen_len"`` (mean number of non-pad tokens per
        prediction). All values are plain floats rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return a tuple; the generated ids are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # Labels always carry it (label_pad_token_id); predictions may carry it as
    # well when the evaluation loop pads batches to a common length.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One reference per prediction, so the decoded labels are passed directly.
    rouge_scores = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Scale every score to a percentage. A score is either already a float or
    # an aggregate object exposing `.mid.fmeasure`.
    result = {}
    for k, v in rouge_scores.items():
        if isinstance(v, (float, np.floating)):
            result[k] = v * 100
        elif hasattr(v, 'mid') and hasattr(v.mid, 'fmeasure'):
            result[k] = v.mid.fmeasure * 100
        else:
            # Fallback for unexpected types, or if the structure has changed in a different way
            print(f"Warning: Unexpected ROUGE score type for '{k}': {type(v)}. Attempting direct conversion.")
            try:
                result[k] = float(v) * 100
            except (TypeError, ValueError):
                print(f"Error: Could not convert ROUGE score for '{k}' with value '{v}' to float.")
                result[k] = 0.0 # Assign a default or handle as appropriate

    # Generation length is counted on the cleaned predictions so that -100
    # padding is not mistaken for generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Cast to built-in float so the metrics are plain Python numbers.
    return {k: round(float(v), 4) for k, v in result.items()}

# 6. Define Training Arguments and Trainer
# NOTE(review): with 1000 training examples, a per-device batch size of 4 and
# 3 epochs there are roughly 750 optimizer steps (assuming a single device and
# no gradient accumulation), so warmup_steps=500 covers most of training.
# Confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"./summarization_finetuning/{CURRENT_MODEL_CONF['name']}",
    num_train_epochs=3,                     # **Parameter Change: Number of Epochs**
    per_device_train_batch_size=4,          # **Parameter Change: Training Batch Size**
    per_device_eval_batch_size=4,           # **Parameter Change: Evaluation Batch Size**
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",                  # Save checkpoint after each epoch
    eval_strategy="epoch",                  # Evaluate after each epoch
    learning_rate=2e-5,                     # **Parameter Change: Learning Rate**
    fp16=torch.cuda.is_available(),         # Use mixed precision if a GPU is available
    predict_with_generate=True,             # Essential for generation tasks
    generation_max_length=TARGET_MAX_LENGTH # Use the model-specific max length
)

# The tokenizer is passed as `processing_class`; the `tokenizer=` keyword of
# the trainer is deprecated in recent transformers releases.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 7. Start Training
print("\n--- Starting Training ---")
trainer.train()

print("\n--- Training Complete ---")
# Runs one more evaluation pass on the evaluation subset and prints its metrics.
print(f"Final Evaluation Metrics:\n{trainer.evaluate()}")

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(



--- Starting Training ---


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.3683,1.584098,40.7353,19.8693,31.2017,31.1899,60.05
2,1.2463,1.558801,40.8996,20.3982,30.9036,30.8639,60.07
3,0.8066,1.630622,42.5713,20.0784,31.8126,31.7678,60.76





--- Training Complete ---


Final Evaluation Metrics:
{'eval_loss': 1.630622386932373, 'eval_rouge1': 42.5713, 'eval_rouge2': 20.0784, 'eval_rougeL': 31.8126, 'eval_rougeLsum': 31.7678, 'eval_gen_len': 60.76, 'eval_runtime': 46.6072, 'eval_samples_per_second': 2.146, 'eval_steps_per_second': 0.536, 'epoch': 3.0}
