# Lab 6 - Fine-tuning Llama 3.2 on Medical Dataset

This notebook demonstrates fine-tuning a Llama model on a medical Q&A dataset using LoRA (Low-Rank Adaptation) for efficient training.

## 1. Setup and Imports


In [10]:
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [11]:
import warnings
import os
warnings.filterwarnings("ignore")
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


## Hugging Face Authentication

Run the next cell once to log in with your HF token (needed for gated models).


In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [None]:
from huggingface_hub import login

# Run once to store token locally 
# If you already logged in on this machine, skip rerunning this cell.
login("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")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## 2. Load Model and Tokenizer


In [None]:
import os

model_name = os.environ.get("LLAMA_MODEL", "meta-llama/Llama-3.2-1B-Instruct")
hf_token = os.environ.get("HF_TOKEN")  # set via env or login()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
print(f"Model loaded: {model_name}")
if hf_token is None:
    print("(No HF token provided; if this model is gated, set HF_TOKEN env var or run login().)")


## 3. Configure LoRA

LoRA enables efficient fine-tuning by training low-rank decomposition matrices instead of full model weights:


In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


## 4. Load and Prepare Dataset


In [None]:
dataset = load_dataset("FreedomIntelligence/medical-o1-verifiable-problem")
train_dataset = dataset["train"].select(range(1000))
print(f"Training on {len(train_dataset)} examples")
print(f"Sample: {train_dataset[0]}")


In [None]:
def format_prompt(example):
    question = example.get("question", example.get("Open-ended Verifiable Question", ""))
    answer = example.get("answer", example.get("Ground-True Answer", ""))
    prompt = f"<|user|>\n{question}\n<|assistant|>\nThe answer is: {answer}"
    return {"text": prompt}

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

train_dataset = train_dataset.map(format_prompt)
tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)


## 5. Training


In [None]:
training_args = TrainingArguments(
    output_dir="./llama3_medical_lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    warmup_steps=50,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()
print("Training complete!")


In [None]:
model.save_pretrained("./llama3_medical_lora")
tokenizer.save_pretrained("./llama3_medical_lora")
print("Model saved to: ./llama3_medical_lora")


## 6. Evaluation on Test Set


In [None]:
import random

test_dataset = dataset["train"].select(range(1000, 1020))

def get_prediction(question):
    prompt = f"<|user|>\n{question}\n<|assistant|>\nThe answer is:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, temperature=0.3, top_p=0.9)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("The answer is:")[-1].strip()

correct = 0
for i, example in enumerate(test_dataset):
    question = example.get("Open-ended Verifiable Question", example.get("question", ""))
    ground_truth = example.get("Ground-True Answer", example.get("answer", ""))
    prediction = get_prediction(question)
    is_correct = ground_truth.lower() in prediction.lower()
    if is_correct:
        correct += 1
    print(f"{i+1}. {'✓' if is_correct else '✗'} Pred: {prediction[:50]}... | Truth: {ground_truth[:30]}")

print(f"\nAccuracy: {correct}/{len(test_dataset)} ({100*correct/len(test_dataset):.1f}%)")
