# Project 1 - Apply Lightweight Fine-Tuning to a Foundation Model

In [20]:
!python --version

Python 3.13.2


## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [21]:
# Optional: !pip install -r requirements.txt
import os
import random
import datetime
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.utils import logging
from peft import (
    LoraConfig,
    get_peft_model,
    AutoPeftModelForCausalLM,
    AutoPeftModelForSequenceClassification,
)

# ---- Simple debugging helpers ----
VERBOSE = True  # set to False to silence debug prints

def debug_print(*args, **kwargs):
    if VERBOSE:
        print(*args, **kwargs)

# ---- Reproducibility ----
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- Logging ----
logging.set_verbosity_warning()
debug_print("PyTorch version:", torch.__version__)



PyTorch version: 2.8.0+cpu


In [22]:
# Load the dataset
# dair-ai/emotion contains 6 labels: sadness, joy, love, anger, fear, surprise
dataset = load_dataset("dair-ai/emotion")
debug_print("Dataset splits:", list(dataset.keys()))
debug_print("Train size:", len(dataset["train"]), "Validation size:", len(dataset.get("validation", [])), "Test size:", len(dataset["test"]))

# Split the train set into train/validation (stratified by default)
train_valid = dataset["train"].train_test_split(test_size=0.1, seed=SEED)
debug_print("Train/valid sizes:", len(train_valid["train"]), len(train_valid["test"]))

# Inspect a couple of samples
debug_print("Train sample:", train_valid["train"][0])
debug_print("Validation sample:", train_valid["test"][0])

Dataset splits: ['train', 'validation', 'test']
Train size: 16000 Validation size: 2000 Test size: 2000
Train/valid sizes: 14400 1600
Train sample: {'text': 'when an alcoholic stood dribbling over a food counter', 'label': 3}
Validation sample: {'text': 'while cycling in the country', 'label': 4}


In [23]:
# Initialize the tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
debug_print("Loaded tokenizer:", model_name)

# Tokenization function
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True)

# Tokenize datasets
debug_print("Tokenizing train and validation splits...")
tokenized_train = train_valid["train"].map(tokenize, batched=True)
tokenized_test = train_valid["test"].map(tokenize, batched=True)
debug_print("Tokenized features:", tokenized_train.column_names)


Loaded tokenizer: distilbert-base-uncased
Tokenizing train and validation splits...
Tokenized features: ['text', 'label', 'input_ids', 'attention_mask']


In [24]:
# Build label mapping from dataset (prevents label-out-of-range issues)
label_list = dataset["train"].features["label"].names
num_labels = len(label_list)
id2label = {i: name for i, name in enumerate(label_list)}
label2id = {name: i for i, name in enumerate(label_list)}
debug_print("Num labels:", num_labels, "Labels:", label_list)

# Show a couple of items to verify labels are within [0, num_labels)
debug_print("Train sample:", train_valid["train"][0])
debug_print("Validation sample:", train_valid["test"][0])

# Initialize the model with correct label space
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
debug_print("Model classifier out_features:", model.classifier.out_features)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num labels: 6 Labels: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
Train sample: {'text': 'when an alcoholic stood dribbling over a food counter', 'label': 3}
Validation sample: {'text': 'while cycling in the country', 'label': 4}
Model classifier out_features: 6


In [25]:
# Quick model check (print only the head to keep output short)
debug_print("Model head:", model.classifier)

Model head: Linear(in_features=768, out_features=6, bias=True)


In [26]:
# Freeze the base model parameters (train only the classifier head)
for param in model.base_model.parameters():
    param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
debug_print(f"Parameters: {total_params:,} total | {total_trainable_params:,} trainable")


Parameters: 66,958,086 total | 595,206 trainable


In [27]:
# Prepare for training

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

# Re-tokenize to be safe if previous cells were re-run
tokenized_train = train_valid["train"].map(tokenize, batched=True)
tokenized_test = train_valid["test"].map(tokenize, batched=True)

data_collator = DataCollatorWithPadding(tokenizer, padding=True)
debug_print("Collator set. Example batch keys:", tokenized_train.features.keys())

training_args = TrainingArguments(
    output_dir=f"./results/{model_name}/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=300,
    learning_rate=3e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)
debug_print("Training args ready. Epochs:", training_args.num_train_epochs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
debug_print("Trainer initialized for base model.")

Collator set. Example batch keys: dict_keys(['text', 'label', 'input_ids', 'attention_mask'])
Training args ready. Epochs: 5
Trainer initialized for base model.


In [28]:
# Evaluate the base model
debug_print("Evaluating base model...")
base_eval = trainer.evaluate()
debug_print("Base eval:", base_eval)
base_accuracy = base_eval.get("eval_accuracy")
print({"base_eval_accuracy": base_accuracy})

Evaluating base model...


Base eval: {'eval_loss': 1.795680284500122, 'eval_model_preparation_time': 0.0011, 'eval_accuracy': 0.134375, 'eval_runtime': 12.494, 'eval_samples_per_second': 128.062, 'eval_steps_per_second': 4.002}
{'base_eval_accuracy': 0.134375}


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [29]:
# Set up a fresh model for LoRA training (same label mapping)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
debug_print("Tokenizer reloaded for LoRA:", model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

debug_print("Configuring LoRA...")
config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["q_lin", "k_lin", "v_lin"],
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
)

fine_tuned_model = get_peft_model(model, config)
debug_print("LoRA adapters injected.")

# Print trainable parameters
fine_tuned_model.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer reloaded for LoRA: distilbert-base-uncased
Configuring LoRA...
LoRA adapters injected.
trainable params: 1,037,574 || all params: 67,995,660 || trainable%: 1.5259


In [30]:
# Prepare for training
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

training_args = TrainingArguments(
    output_dir=f"./results/{model_name}-lora/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=300,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

lora_trainer = Trainer(
    model=fine_tuned_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

  lora_trainer = Trainer(


In [31]:
# Train the LoRA model
debug_print("Starting LoRA training...")
train_out = lora_trainer.train()
debug_print("Training finished. Last metrics:", getattr(train_out, "metrics", {}))

Starting LoRA training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.994676,0.65875
2,1.419200,0.661467,0.75375
3,0.781300,0.546919,0.798125
4,0.598100,0.504998,0.81125
5,0.540000,0.492082,0.8175




Training finished. Last metrics: {'train_runtime': 2472.4807, 'train_samples_per_second': 29.121, 'train_steps_per_second': 0.91, 'total_flos': 997149382636032.0, 'train_loss': 0.8003791842990451, 'epoch': 5.0}


In [32]:
# Evaluate the LoRA model
debug_print("Evaluating LoRA model...")
lora_eval = lora_trainer.evaluate()
debug_print("LoRA eval:", lora_eval)
print({"lora_eval_accuracy": lora_eval.get("eval_accuracy")})

Evaluating LoRA model...




LoRA eval: {'eval_loss': 0.49208173155784607, 'eval_accuracy': 0.8175, 'eval_runtime': 13.4517, 'eval_samples_per_second': 118.944, 'eval_steps_per_second': 3.717, 'epoch': 5.0}
{'lora_eval_accuracy': 0.8175}


In [33]:
# Save the loRA model
fine_tuned_model.save_pretrained(f"./peft/{model_name}-lora")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [34]:
# Load the saved PEFT model and evaluate performance
use_mps = torch.backends.mps.is_available()
device = torch.device("mps" if use_mps else "cpu")
load_dtype = torch.float16 if use_mps else None
debug_print("Eval device:", device, "| use_mps:", use_mps, "| load_dtype:", load_dtype)

loaded_lora_model = AutoPeftModelForSequenceClassification.from_pretrained(
    f"./peft/{model_name}-lora",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    torch_dtype=load_dtype
)

# Move model to device
loaded_lora_model = loaded_lora_model.to(device)
debug_print("PEFT model loaded and moved to device.")

# Evaluate on the test set
lora_eval_trainer = Trainer(
    model=loaded_lora_model,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)
lora_eval_results = lora_eval_trainer.evaluate()
debug_print("PEFT eval results:", lora_eval_results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Eval device: cpu | use_mps: False | load_dtype: None
PEFT model loaded and moved to device.


  lora_eval_trainer = Trainer(


PEFT eval results: {'eval_loss': 0.49208173155784607, 'eval_model_preparation_time': 0.0022, 'eval_accuracy': 0.8175, 'eval_runtime': 12.8343, 'eval_samples_per_second': 124.666, 'eval_steps_per_second': 15.583}


In [35]:
# Compare PEFT vs base model accuracy
peft_accuracy = lora_eval_results.get("eval_accuracy")

try:
    debug_print("Re-evaluating base model for comparison...")
    base_results = trainer.evaluate()
    base_accuracy = base_results.get("eval_accuracy")
except Exception as e:
    debug_print("Base model eval failed:", repr(e))
    base_accuracy = None

print({"peft_accuracy": peft_accuracy, "base_accuracy": base_accuracy})

Re-evaluating base model for comparison...
{'peft_accuracy': 0.8175, 'base_accuracy': 0.134375}
