<table align="center">
  <td align="center"><a target="_blank" href="http://introtodeeplearning.com">
        <img src="https://i.ibb.co/Jr88sn2/mit.png" style="padding-bottom:5px;" />
      Visit MIT Deep Learning</a></td>
  <td align="center"><a target="_blank" href="https://colab.research.google.com/github/MITDeepLearning/introtodeeplearning/blob/master/lab3/LLM_Finetuning.ipynb">
        <img src="https://i.ibb.co/2P3SLwK/colab.png"  style="padding-bottom:5px;" />Run in Google Colab</a></td>
  <td align="center"><a target="_blank" href="https://github.com/MITDeepLearning/introtodeeplearning/blob/master/lab3/LLM_Finetuning.ipynb">
        <img src="https://i.ibb.co/xfJbPmL/github.png"  height="70px" style="padding-bottom:5px;"  />View Source on GitHub</a></td>
</table>

# Copyright Information

In [1]:
# Copyright 2025 MIT Introduction to Deep Learning. All Rights Reserved.
#
# Licensed under the MIT License. You may not use this file except in compliance
# with the License. Use and/or modification of this code outside of MIT Introduction
# to Deep Learning must reference:
#
# © MIT Introduction to Deep Learning
# http://introtodeeplearning.com
#

In [None]:
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import numpy as np

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from lion_pytorch import Lion

  from .autonotebook import tqdm as notebook_tqdm
2025-11-04 09:22:15.483459: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Basic question-answer template
template_without_answer = "<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n"
template_with_answer = template_without_answer + "{answer}<end_of_turn>\n"

# Let's try to put something into the template to see how it looks
print(template_with_answer.format(question="What is your name?", answer="My name is Gemma!"))

<start_of_turn>user
What is your name?<end_of_turn>
<start_of_turn>model
My name is Gemma!<end_of_turn>



In [4]:
# Load the tokenizer for Gemma 2B
model_id = "unsloth/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# How big is the tokenizer?
print(f"Vocab size: {len(tokenizer.get_vocab())}")

Vocab size: 256000


In [5]:
# Lets test out both steps:
text = "Here is some sample text!"
print(f"Original text: {text}")

# Tokenize the text
tokens = tokenizer.encode(text, return_tensors="pt")
print(f"Encoded tokens: {tokens}")

# Decode the tokens
decoded_text = tokenizer.decode(tokens[0], skip_special_tokens=True)
print(f"Decoded text: {decoded_text}")

Original text: Here is some sample text!
Encoded tokens: tensor([[     2,   4858,    603,   1009,   6453,   2793, 235341]])
Decoded text: Here is some sample text!


This is really cool. Now we have a way to move in and out of the token space.

To "chat" with our LLM chatbot, we need to use the tokenizer and the chat template together, in order for the model to respond to the user's question. We can use the templates defined earlier to construct a prompt for the model, without the answer.

In [6]:
prompt = template_without_answer.format(question="What is the capital of France? Use one word.")
print(prompt)

<start_of_turn>user
What is the capital of France? Use one word.<end_of_turn>
<start_of_turn>model



If we were to feed this to the model, it would see that it is now the start of the model's turn, and it would generate the answer to this question.

In [7]:
# Load the model -- note that this may take a few minutes
# Load the model -- note that this may take a few minutes
def apply_lora(model):
    # Define LoRA config
    lora_config = LoraConfig(
        r=8, # rank of the LoRA matrices
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"
        ],
    )

    # Apply LoRA to the model
    lora_model = get_peft_model(model, lora_config)
    return lora_model

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# Optional: enable gradient checkpointing to save memory
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()
# Resize embeddings (pad token added) and attach LoRA adapters
model.resize_token_embeddings(len(tokenizer))
model = apply_lora(model)

if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()
else:
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable params: {trainable} / {total} ({trainable/total*100:.2f}%)")


trainable params: 10,383,360 || all params: 2,624,725,248 || trainable%: 0.3956


In [8]:
### Putting it together to prompt the model and generate a response ###

# 1. Construct the prompt in chat template form
question = "What does MIT stand for?"
prompt = template_without_answer.format(question=question)

# 2. Tokenize the prompt, including attention mask
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

# 3. Generate a sequence of tokens for the answer
with torch.no_grad():
    gen_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # Add attention mask
        max_new_tokens=50,  # Increased tokens
        do_sample=True,  # Enable sampling
        temperature=0.7 # Added temperature
    )

# 4. Decode and print the full text
print(tokenizer.decode(gen_ids[0], skip_special_tokens=True))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


user
What does MIT stand for?
model
MIT stands for **Massachusetts Institute of Technology**. 



In [9]:
prompt = template_without_answer.format(question="What does MIT stand for?")
tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
output = model.generate(tokens, max_new_tokens=20)
print(tokenizer.decode(output[0]))

<bos><start_of_turn>user
What does MIT stand for?<end_of_turn>
<start_of_turn>model
MIT stands for **Massachusetts Institute of Technology**. 
<end_of_turn>


### 1.3.3: Forward pass and loss computation

Now let's define a function to perform a forward pass through the LLM and compute the loss. The forward pass gives us the logits -- which reflect the probability distribution over the next token -- for the next token. We can compute the loss by comparing the predicted logits to the true next token -- our target label. Note that this is effectively a classification problem! So, our loss can be captured by the cross entropy loss, and we can use PyTorch's [`nn.functional.cross_entropy`](https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html) function to compute it.

In [10]:
def forward_and_compute_loss(model, tokens, mask, context_length=512):
    # Truncate to context length
    tokens = tokens[:, :context_length]
    mask = mask[:, :context_length]

    # Construct the input, output, and mask
    x = tokens[:, :-1]
    y = tokens[:, 1:]
    mask = mask[:, 1:]

    # Forward pass to compute logits
    logits = model(x).logits

    # Compute loss
    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)),
        y.view(-1),
        reduction="none"
    )

    # Mask out the loss for non-answer tokens
    loss = loss[mask.view(-1)].mean()

    return loss

## PII Masking Fine-tuning (source_text ➜ target_text)

Adapt the pipeline to a PII masking dataset with columns: `source_text`, `target_text`, `privacy_mask`, `span_labels`, `mbert_text_tokens`, `mbert_bio_labels`, `id`, `language`, `set`. We will fine-tune the model to transform `source_text` into its masked form `target_text`.


In [11]:
# Prompt template specialized for PII masking with few-shot examples

def build_pii_prompt(source_text: str) -> str:
    instruction = (
        "You are a data privacy assistant. Mask all personally identifiable information (PII) "
        "in the following text using the masking scheme shown in the examples. "
        "Output only the masked text.\n\n"
        "Examples:\n\n"
        "Input: My name is John Smith and my email is john.smith@email.com\n"
        "Output: My name is [FIRSTNAME] [LASTNAME] and my email is [EMAIL]\n\n"
        "Input: Call me at 555-123-4567 or visit 123 Main Street, Boston MA 02101\n"
        "Output: Call me at [PHONENUMBER] or visit [STREET] [CITY] [STATE] [ZIPCODE]\n\n"
        "Input: My username is alice_2023 and I was born on 03/15/1990\n"
        "Output: My username is [USERNAME] and I was born on [DOB]\n\n"
        "Input: The SSN is 123-45-6789 and account number is ACC98765\n"
        "Output: The SSN is [SSN] and account number is [ACCOUNTNUMBER]"
    )
    return (
        f"<start_of_turn>user\n{instruction}\n\nText:\n{source_text}\n<end_of_turn>\n"
        f"<start_of_turn>model\n"
    )


In [12]:
# Load a PII masking dataset (expects the listed columns)
# If you have a local file instead of HF dataset, replace with pandas read_csv and Dataset.from_pandas
from datasets import load_dataset

try:
    pii_ds = load_dataset("ai4privacy/pii-masking-300k")
except Exception as e:
    print("Falling back: please provide a local dataset with required columns.")
    raise e

# Keep English and split by provided 'set' column if present
if "language" in pii_ds["train"].column_names:
    pii_ds = pii_ds.filter(lambda ex: ex.get("language", "en") == "English")

# Train/validation split: if dataset has 'set' use it, else do a random split
if "set" in pii_ds["train"].column_names:
    train_split = pii_ds["train"].filter(lambda ex: ex.get("set", "train") == "train")
    valid_split = pii_ds["validation"].filter(lambda ex: ex.get("set", "validation") == "validation")
else:
    split = pii_ds["train"].train_test_split(test_size=0.02, seed=42)
    train_split, valid_split = split["train"], split["test"]

needed_columns = [
    "source_text", "target_text", "privacy_mask", "span_labels",
    "mbert_text_tokens", "mbert_bio_labels", "id", "language", "set"
]

missing = [c for c in ["source_text", "target_text"] if c not in train_split.column_names]
assert not missing, f"Dataset missing required columns: {missing}"

# Map to prompt-target fields the model will use

def add_prompt_fields(example):
    src = example["source_text"]
    tgt = example["target_text"]
    prompt = build_pii_prompt(src)
    example["prompt"] = prompt
    example["target"] = tgt
    return example

# Apply mapping and remove original columns not in needed_columns, keeping the new 'prompt' and 'target'
original_columns = train_split.column_names
columns_to_remove = [col for col in original_columns if col not in needed_columns]

train_split = train_split.map(add_prompt_fields, remove_columns=columns_to_remove)
valid_split = valid_split.map(add_prompt_fields, remove_columns=columns_to_remove)


print("PII dataset:", train_split)
print("Columns:", train_split.column_names[:10])
print({k: train_split[k][0] for k in ["prompt", "target"]})
print("Valid Split -> ", len(valid_split))

PII dataset: Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set', 'prompt', 'target'],
    num_rows: 29908
})
Columns: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set', 'prompt']
{'prompt': '<start_of_turn>user\nYou are a data privacy assistant. Mask all personally identifiable information (PII) in the following text using the masking scheme shown in the examples. Output only the masked text.\n\nExamples:\n\nInput: My name is John Smith and my email is john.smith@email.com\nOutput: My name is [FIRSTNAME] [LASTNAME] and my email is [EMAIL]\n\nInput: Call me at 555-123-4567 or visit 123 Main Street, Boston MA 02101\nOutput: Call me at [PHONENUMBER] or visit [STREET] [CITY] [STATE] [ZIPCODE]\n\nInput: My username is alice_2023 and I was born on 03/15/1990\nOutput: My username is [USERNAME] and I was born on [DOB]\n

In [None]:
# Collate to create input_ids and labels where prompt tokens are ignored (-100)
from torch.utils.data import DataLoader

def tokenize_with_labels(batch, tokenizer, max_length=512):
    prompts = batch["prompt"]
    targets = batch["target"]

    # Tokenize separately so we can compute boundaries
    prompt_enc = tokenizer(prompts, padding=False, truncation=True, max_length=max_length)
    target_enc = tokenizer(targets, padding=False, truncation=True, max_length=max_length)

    input_ids = []
    labels = []
    attention_mask = []

    for p_ids, t_ids in zip(prompt_enc["input_ids"], target_enc["input_ids"]):
        # Build concatenated sequence: [prompt] + [target]
        ids = p_ids + t_ids
        ids = ids[:max_length]

        # Build labels: -100 for prompt tokens, target token ids for target span
        prompt_len = min(len(p_ids), max_length)
        tgt_len = max(0, min(len(t_ids), max_length - prompt_len))
        lab = ([-100] * prompt_len) + t_ids[:tgt_len]
        lab = lab[:len(ids)]

        am = [1] * len(ids)

        input_ids.append(ids)
        labels.append(lab)
        attention_mask.append(am)

    # Pad to max batch length
    batch_max = max(len(x) for x in input_ids)
    def pad_to(x, pad_id):
        return x + [pad_id] * (batch_max - len(x))

    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
    input_ids = [pad_to(x, pad_id) for x in input_ids]
    labels = [pad_to(x, -100) for x in labels]
    attention_mask = [pad_to(x, 0) for x in attention_mask]

    # Create tensors on CPU first, then move to device
    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long).to(model.device),
        "labels": torch.tensor(labels, dtype=torch.long).to(model.device),
        "attention_mask": torch.tensor(attention_mask, dtype=torch.long).to(model.device),
    }

# Ensure pad token is set
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token

# Use a smaller micro-batch size with gradient accumulation to save memory
BATCH_SIZE = 2  # Micro-batch size per forward pass
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch size of 8
MAX_LEN = 384

train_loader_pii = DataLoader(
    train_split, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=lambda b: tokenize_with_labels({k: [ex[k] for ex in b] for k in b[0]}, tokenizer, max_length=MAX_LEN)
)

valid_loader_pii = DataLoader(
    valid_split, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=lambda b: tokenize_with_labels({k: [ex[k] for ex in b] for k in b[0]}, tokenizer, max_length=MAX_LEN)
)

print("Train batches (PII):", len(train_loader_pii))

Train batches (PII): 3739


In [None]:
# Training loop for PII masking (text-to-text)
from lion_pytorch import Lion

def train_pii(model, dataloader, max_steps=200, learning_rate=1e-4, gradient_accumulation_steps=1):
    """
    Train the model on PII masking task.
    
    Args:
        model: The model to train
        dataloader: Training data loader
        max_steps: Maximum optimizer steps
        learning_rate: Learning rate for optimizer
        gradient_accumulation_steps: Number of micro-batches to accumulate before an optimizer step
    
    Returns:
        model: Trained model
    """
    model.train()
    optimizer = Lion(model.parameters(), lr=learning_rate)
    losses = []

    optimizer.zero_grad()
    micro_step = 0
    global_step = 0
    accum_loss = 0.0

    for batch in dataloader:
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = outputs.loss
        loss_value = loss.item()
        losses.append(loss_value)

        (loss / gradient_accumulation_steps).backward()
        accum_loss += loss_value
        micro_step += 1

        if micro_step % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

            if global_step % 10 == 0:
                print(f"step {global_step} loss: {accum_loss / gradient_accumulation_steps:.4f}")

            accum_loss = 0.0

            if global_step >= max_steps:
                break

    remainder = micro_step % gradient_accumulation_steps
    if remainder != 0 and accum_loss > 0.0 and global_step < max_steps:
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        if global_step % 10 == 0:
            print(f"step {global_step} loss: {accum_loss / remainder:.4f}")

    return model


In [15]:
# Inference helper for PII masking

def pii_mask(text: str, max_new_tokens=128, temperature=0.0, only_answer=True):
    prompt = build_pii_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        gen = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=(temperature > 0.0),
            temperature=max(temperature, 1e-6),
        )

    out = tokenizer.decode(gen[0], skip_special_tokens=True)
    if only_answer:
        # Return only the model turn after the prompt
        if "<start_of_turn>model" in out:
            out = out.split("<start_of_turn>model")[-1]
        return out.strip()
    return out

# Example:
# print(pii_mask("My name is John Smith and my SSN is 123-45-6789. I live at 10 Main St, Boston MA."))


In [None]:
# PII evaluation: Character-level F1 score and detailed CSV export
import numpy as np
from collections import Counter
import pandas as pd
from datetime import datetime

def char_f1(pred: str, gold: str):
    """Calculate character-level F1 score between prediction and gold text."""
    pc = Counter(pred)
    gc = Counter(gold)
    overlap = sum((pc & gc).values())
    if overlap == 0:
        return 0.0
    precision = overlap / max(1, sum(pc.values()))
    recall = overlap / max(1, sum(gc.values()))
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

@torch.no_grad()
def evaluate_and_save_f1(model_obj, dset, n=200, csv_filename='pii_evaluation_results.csv', model_name='fine-tuned'):
    """
    Evaluate F1 score on dataset and save detailed results to CSV.
    
    Args:
        model_obj: Model to evaluate
        dset: Dataset to evaluate on
        n: Number of samples to evaluate
        csv_filename: Output CSV filename
        model_name: Name identifier for this model (e.g., 'fine-tuned' or 'base')
    
    Returns:
        avg_f1: Average F1 score
        results_df: DataFrame with detailed results
    """
    model_obj.eval()
    n = min(n, len(dset))
    
    results = []
    f1_scores = []
    
    print(f"Evaluating {model_name} model on {n} samples...")
    
    for i in range(n):
        src = dset[i]["source_text"]
        tgt = dset[i]["target_text"]
        
        # Get prediction
        if model_name == 'fine-tuned':
            pred = pii_mask(src, max_new_tokens=256, temperature=0.0, only_answer=True)
        else:
            # For base model comparison
            pred = pii_mask_with(model_obj, src, max_new_tokens=256)
        
        # Calculate F1
        f1 = char_f1(pred, tgt)
        f1_scores.append(f1)
        
        # Store detailed results
        results.append({
            'sample_id': i,
            'model': model_name,
            'source_text': src[:200],  # Truncate for readability
            'target_text': tgt[:200],
            'prediction': pred[:200],
            'f1_score': f1,
            'source_length': len(src),
            'target_length': len(tgt),
            'prediction_length': len(pred),
            'timestamp': datetime.now().isoformat()
        })
        
        if (i + 1) % 20 == 0:
            print(f"  Processed {i + 1}/{n} samples...")
    
    avg_f1 = float(np.mean(f1_scores)) if f1_scores else 0.0
    
    # Create DataFrame
    results_df = pd.DataFrame(results)
    
    # Save to CSV
    results_df.to_csv(csv_filename, index=False)
    print(f"\n✓ Results saved to: {csv_filename}")
    print(f"  Average F1 Score: {avg_f1:.4f}")
    print(f"  Median F1 Score: {results_df['f1_score'].median():.4f}")
    print(f"  Std Dev: {results_df['f1_score'].std():.4f}")
    
    return avg_f1, results_df


## Fine-tuning

Run the cell below to fine-tune the model on the PII masking task.


In [None]:
FT_STEPS = 2000
LR = 1e-4

model.train()
model = train_pii(
    model,
    train_loader_pii,
    max_steps=FT_STEPS,
    learning_rate=LR,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
)

print("Fine-tuning completed!")


## Save the Fine-tuned Model

Now that fine-tuning is complete, let's save the model.


In [None]:
# === Save full merged model (base + LoRA combined) ===
# This creates a ~5GB standalone model that's easier to use later

print("Merging LoRA adapters into base model...")
merged_model = model.merge_and_unload()

merged_save_dir = "./pii_merged_model"
print(f"Saving model to {merged_save_dir}...")

merged_model.save_pretrained(merged_save_dir)
tokenizer.save_pretrained(merged_save_dir)

print(f"\n✓ Full merged model saved to: {merged_save_dir}")
print(f"✓ Model size: ~5GB")
print(f"\nTo load later (simple, no LoRA needed):")
print(f"  from transformers import AutoModelForCausalLM, AutoTokenizer")
print(f"  model = AutoModelForCausalLM.from_pretrained('{merged_save_dir}', device_map='auto')")
print(f"  tokenizer = AutoTokenizer.from_pretrained('{merged_save_dir}')")


## Inference Demo

Let's test the fine-tuned model on a few validation samples.


In [18]:
N_SHOW = 3
for i in range(N_SHOW):
    src = valid_split[i]["source_text"]
    tgt = valid_split[i]["target_text"]
    pred = pii_mask(src, max_new_tokens=256, temperature=0.0, only_answer=True)
    print(f"--- Sample {i} ---")
    print("Source:\n", src[:500])
    print("\nTarget:\n", tgt[:500])
    print("\nPrediction:\n", pred[:500])
    print()

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Caching is incompatible with gradient checkpointing in Gemma2DecoderLayer. Setting `past_key_values=None`.


--- Sample 0 ---
Source:
 On the video sharing platform for educational content, a lively discussion unfolded among users from different locales within the UK.

The comment thread began with paaltwvkjuijwbj957 expressing admiration for the video's insightful content, followed by 2005zheng.monckton adding a clarification on a complex topic. 43CU chimed in with a question for clarification, an

Target:
 On the video sharing platform for educational content, a lively discussion unfolded among users from different locales within the UK.

The comment thread began with [USERNAME] expressing admiration for the video's insightful content, followed by [USERNAME] adding a clarification on a complex topic. [USERNAME] chimed in with a question for clarification, an

Prediction:
 user
You are a data privacy assistant. Mask all personally identifiable information (PII) in the following text using the masking scheme shown in the examples. Output only the masked text.

Examples:

Input: My name is Joh

## Model Evaluation

This section evaluates the fine-tuned model against the base model using **F1 scores only** (no exact match).

**Generated CSV Files:**

1. **`pii_finetuned_results.csv`** (from Cell 28 - Evaluation)
   - Detailed evaluation of the fine-tuned model
   - Columns: `sample_id`, `model`, `source_text`, `target_text`, `prediction`, `f1_score`, `source_length`, `target_length`, `prediction_length`, `timestamp`

2. **`pii_base_results.csv`** (from Cell 28 - Evaluation)
   - Same format as above, for the base model

3. **`pii_comparison_results.csv`** (from Cell 28 - Evaluation)
   - Combined results from both models for easy comparison


In [None]:
# === Final Evaluation: Fine-tuned vs Base Model (F1 Score Comparison) ===
from transformers import AutoModelForCausalLM

@torch.no_grad()
def pii_mask_with(model_obj, text: str, max_new_tokens=256):
    """Helper function to run inference with any model."""
    prompt = build_pii_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model_obj.device)
    gen = model_obj.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )
    out = tokenizer.decode(gen[0], skip_special_tokens=True)
    if "<start_of_turn>model" in out:
        out = out.split("<start_of_turn>model")[-1]
    return out.strip()

print("="*70)
print("FINAL MODEL COMPARISON")
print("="*70)

# Evaluate fine-tuned model
print("\n1. Evaluating Fine-tuned Model...")
f1_ft, results_ft = evaluate_and_save_f1(
    model, 
    valid_split, 
    n=100,  # Evaluate on 100 samples
    csv_filename='pii_finetuned_results.csv',
    model_name='fine-tuned'
)

# Load and evaluate base model
print("\n2. Loading and Evaluating Base Model...")
base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
f1_base, results_base = evaluate_and_save_f1(
    base_model,
    valid_split,
    n=100,
    csv_filename='pii_base_results.csv',
    model_name='base'
)

# Combine results for side-by-side comparison
print("\n" + "="*70)
print("COMPARISON RESULTS")
print("="*70)
print(f"Fine-tuned Model F1: {f1_ft:.4f}")
print(f"Base Model F1:       {f1_base:.4f}")
print(f"Improvement:         {(f1_ft - f1_base):.4f} ({((f1_ft - f1_base) / f1_base * 100):.1f}%)")
print("="*70)

# Combine both datasets for comparison
comparison_df = pd.concat([results_ft, results_base], ignore_index=True)
comparison_df.to_csv('pii_comparison_results.csv', index=False)
print(f"\n✓ Combined comparison saved to: pii_comparison_results.csv")

# Show some example comparisons
print("\n" + "="*70)
print("SAMPLE PREDICTIONS (First 3)")   
print("="*70)
for i in range(3):
    print(f"\n--- Sample {i} ---")
    print(f"Source: {valid_split[i]['source_text'][:150]}...")
    print(f"\nTarget: {valid_split[i]['target_text'][:150]}...")
    print(f"\nFine-tuned (F1={results_ft.iloc[i]['f1_score']:.3f}): {results_ft.iloc[i]['prediction'][:150]}...")
    print(f"\nBase Model (F1={results_base.iloc[i]['f1_score']:.3f}): {results_base.iloc[i]['prediction'][:150]}...")
    print("-"*70)

Evaluating on 50 samples...

Evaluated 2/50 samples...
Evaluated 4/50 samples...
Evaluated 6/50 samples...
Evaluated 8/50 samples...
Evaluated 10/50 samples...
Evaluated 12/50 samples...
Evaluated 14/50 samples...
Evaluated 16/50 samples...
Evaluated 18/50 samples...
Evaluated 20/50 samples...
Evaluated 22/50 samples...
Evaluated 24/50 samples...
Evaluated 26/50 samples...
Evaluated 28/50 samples...
Evaluated 30/50 samples...
Evaluated 32/50 samples...
Evaluated 34/50 samples...
Evaluated 36/50 samples...
Evaluated 38/50 samples...
Evaluated 40/50 samples...
Evaluated 42/50 samples...
Evaluated 44/50 samples...
Evaluated 46/50 samples...
Evaluated 48/50 samples...
Evaluated 50/50 samples...

FINAL RESULTS
Fine-tuned EM: 0.000 | Base EM: 0.000
Fine-tuned F1: 0.436 | Base F1: 0.408

Improvement: 6.9%
