# TP 7: Fine-tune Llama 3.2 on a medical dataset with Hugging Face Transformers and PEFT (LoRA)

### Apolline Hadjal

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import json
import random
import time

# Pick the compute device: use the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU (and warn that training will be slow).
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")
device_note = torch.cuda.get_device_name(0) if has_gpu else "No GPU found - training will be slow"
print(f"Using device: {device} ({device_note})")



Using device: cpu (No GPU found - training will be slow)


In [None]:
# ============================================================
# STEP: LOAD MODEL AND TOKENIZER
# ============================================================

model_name = "meta-llama/Llama-3.2-1B-Instruct"

print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use a dedicated padding token instead of reusing EOS. The language-modeling
# collator used later ignores (label = -100) every position whose id equals
# pad_token_id; if pad == EOS, the end-of-turn token is therefore never
# trained on and the fine-tuned model does not learn when to stop.
# The Llama 3.1/3.2 vocabulary reserves "<|finetune_right_pad_id|>" for this;
# fall back to EOS only if the loaded tokenizer does not have it.
LLAMA_PAD_TOKEN = "<|finetune_right_pad_id|>"
if LLAMA_PAD_TOKEN in tokenizer.get_vocab():
    tokenizer.pad_token = LLAMA_PAD_TOKEN
else:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Half precision is only worthwhile on a GPU. On CPU, float16 kernels are
# slow or missing, so keep full precision there.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=model_dtype,
    device_map={"": device},  # place the whole model on the selected device
)
print(f"Model loaded: {model_name}")

Loading model...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Model loaded: meta-llama/Llama-3.2-1B-Instruct


In [None]:
# ============================================================
# STEP: INITIALIZE LORA CONFIGURATION
# ============================================================

print("\nConfiguring LoRA...")

# Names of the linear sub-modules that receive LoRA adapters, grouped by
# where they sit in each transformer block.
attention_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_targets = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                 # adapter rank
    lora_alpha=32,        # adapter scaling factor
    lora_dropout=0.05,
    bias="none",          # bias terms are left untouched
    target_modules=attention_targets + mlp_targets,
)

# NOTE(review): `model` is rebound to its PEFT-wrapped version, so re-running
# this cell without reloading the base model wraps it a second time.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Configuring LoRA...
trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


In [None]:
# ============================================================
# STEP: LOAD AND FORMAT DATASET (500 EXAMPLES)
# ============================================================

def format_prompt(example):
    """Turn one raw dataset record into a chat-formatted training text.

    Reads the 'Open-ended Verifiable Question' and 'Ground-True Answer'
    fields of ``example`` (a missing field is treated as empty).

    Returns:
        ``{"text": ...}`` where the question is the user turn and
        "The answer is: <answer>" is the assistant turn, or ``None`` when the
        question is empty / shorter than 10 characters or the answer is
        empty / shorter than 2 characters.
    """
    user_text = example.get('Open-ended Verifiable Question', '')
    assistant_text = example.get('Ground-True Answer', '')

    # Question is validated before the answer (short-circuit order matters).
    unusable = (
        not user_text or len(user_text) < 10
        or not assistant_text or len(assistant_text) < 2
    )
    if unusable:
        return None

    # One entry per line of the prompt; joined with single newlines.
    prompt_lines = (
        "<|begin_of_text|>",
        "<|start_header_id|>user<|end_header_id|>",
        f"{user_text}<|eot_id|>",
        "<|start_header_id|>assistant<|end_header_id|>",
        f"The answer is: {assistant_text}<|eot_id|>",
    )
    return {"text": "\n".join(prompt_lines)}

print("\nLoading dataset...")
dataset = load_dataset("FreedomIntelligence/medical-o1-verifiable-problem")

print("Formatting dataset...")
# Work on the first 500 records of the train split.
raw_subset = dataset['train'].select(range(500))

# Drop unusable records BEFORE mapping. `format_prompt` signals a bad record
# by returning None, but `Dataset.map` is not a row filter, and a later
# `.filter(lambda x: x is not None)` never fires because filter callbacks
# always receive a dict. Filtering first makes the rejection explicit and
# guarantees that `map` only ever sees records that yield a {"text": ...} dict.
usable_subset = raw_subset.filter(
    lambda example: format_prompt(example) is not None
)

# Replace every original column with the single formatted "text" column.
train_dataset = usable_subset.map(
    format_prompt,
    remove_columns=dataset['train'].column_names
)

print(f"Training on {len(train_dataset)} examples")


Loading dataset...


README.md: 0.00B [00:00, ?B/s]

medical_o1_verifiable_problem.json:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40644 [00:00<?, ? examples/s]

Formatting dataset...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/498 [00:00<?, ? examples/s]

Training on 498 examples


In [None]:
# ============================================================
# STEP: TOKENIZE TRAIN DATASET
# ============================================================

def tokenize_function(examples):
    """Tokenize a batch of formatted texts and attach causal-LM labels.

    Args:
        examples: mapping with a "text" entry (a batch of strings when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens, PyTorch tensors)
        with an extra "labels" entry: a copy of ``input_ids`` in which every
        padding position is replaced by -100 so the loss ignores it.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    labels = tokenized["input_ids"].clone()
    # Mask by attention mask rather than by token id, so padding is ignored
    # even when the pad token shares its id with a real token (e.g. EOS),
    # while real occurrences of that token are still trained on.
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels
    return tokenized

print("Tokenizing...")

# Tokenize in batches; the columns present before tokenization are dropped so
# only the tokenizer outputs (and labels) remain.
# NOTE(review): `train_dataset` is overwritten in place, so this cell cannot
# be re-run without re-running the formatting cell first.
columns_to_drop = train_dataset.column_names
train_dataset = train_dataset.map(
    tokenize_function,
    remove_columns=columns_to_drop,
    batched=True,
)

Tokenizing...


Map:   0%|          | 0/498 [00:00<?, ? examples/s]

In [None]:
# ============================================================
# STEP: SET UP TRAINING ARGUMENTS
# ============================================================

print("Setting up training...")

# Mixed-precision (FP16) training is enabled only when a CUDA GPU is present.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=10,
    # --- batching: 1 sample per step, gradients accumulated over 4 steps ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # --- logging / checkpointing cadence ---
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    # --- precision ---
    fp16=use_fp16,
)


Setting up training...


In [None]:
# ============================================================
# STEP: USE DATA COLLATOR
# ============================================================

# mlm=False selects the causal (next-token) objective instead of masked-LM.
# NOTE(review): this collator builds the labels itself from input_ids and
# ignores positions equal to the tokenizer's pad token id; if the pad token is
# the same as the end-of-turn/EOS token, those tokens are ignored as well.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# ============================================================
# STEP: SET UP TRAINER
# ============================================================

# Bundle the LoRA-wrapped model, the hyper-parameters, the tokenized training
# set and the batch collator into a single Trainer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)



The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# ============================================================
# STEP: START TRAINING
# ============================================================

# Run the fine-tuning loop, framed by separator lines in the cell output.
separator = "=" * 60
print("\nStarting training...")
print(separator)
trainer.train()
print(separator)
print("Training complete!")


Starting training...




KeyboardInterrupt: 

In [None]:
# Write the (PEFT-wrapped) model and the tokenizer into the same directory so
# they can be reloaded together.
adapter_dir = "./llama3_medical_lora"
print("\nSaving model...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
print(f"Model saved to: {adapter_dir}")

In [None]:

# ============================================================
# STEP 1: LOAD AND SPLIT THE DATASET
# ============================================================

print("\nLoading dataset...")
dataset = load_dataset("FreedomIntelligence/medical-o1-verifiable-problem")

# Everything from index `train_size` onward is treated as held-out test data.
# NOTE(review): the formatting cell trains on range(500) only, so the
# "Training set" range printed below is wider than what was actually used;
# the test set is still disjoint from the training rows.
train_size = 1000
full_split = dataset['train']
total_examples = len(full_split)
test_dataset = full_split.select(range(train_size, total_examples))

# Report the split boundaries
print(f"Total dataset size: {total_examples}")
print(f"Training set: 0 to {train_size}")
print(f"Test set: {train_size} to {total_examples}")

In [None]:

# ============================================================
# STEP 2: SAMPLE TEST EXAMPLES
# ============================================================

# Seed Python's RNG so the same test examples are drawn on every run
random.seed(42)

# Draw 20 distinct positions inside the test set
candidate_positions = range(len(test_dataset))
selected_indices = random.sample(candidate_positions, 20)

# Report what was drawn
print(f"\nRandomly selected {len(selected_indices)} test examples")
preview = selected_indices[:5]
print(f"Indices: {preview}... (showing first 5)")

In [None]:
# ============================================================
# STEP 3: CREATE INFERENCE FUNCTION
# ============================================================

def get_prediction(question, max_tokens=50):
    """Generate the model's answer to a single question.

    Args:
        question: question text, inserted as the user turn of the prompt.
        max_tokens: maximum number of new tokens to generate.

    Returns:
        The generated assistant text, cut at the first ``<|eot_id|>`` (if any)
        and stripped of surrounding whitespace.

    Note:
        Generation uses sampling (temperature 0.3, top-p 0.9), so repeated
        calls can return different answers unless the torch RNG is seeded.
    """
    # Same layout as the training prompt, stopping right after the assistant
    # header so the model continues with the answer.
    prompt = f"""<|begin_of_text|>
<|start_header_id|>user<|end_header_id|>
{question}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

    # Tokenize and move to device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Inference must run in eval mode: after training the model is still in
    # train mode, which would leave the LoRA dropout active while generating.
    model.eval()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.3,
            top_p=0.9,
            do_sample=True,
            # Explicit pad id avoids the per-call fallback warning.
            pad_token_id=tokenizer.pad_token_id,
        )

    # The returned sequence starts with the prompt tokens; keep only what the
    # model generated. Slicing by length is robust even if the question text
    # itself contains header markers.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=False
    )

    # Keep only the assistant turn (text before the first end-of-turn marker).
    return generated_text.split("<|eot_id|>")[0].strip()

In [None]:
# ============================================================
# STEP 4: IMPLEMENT ACCURACY CHECKING
# ============================================================

def check_accuracy(prediction, ground_truth):
    """Decide whether a prediction matches the reference answer.

    Matching is case-insensitive and done in two stages:

    1. exact match   - the whole reference answer appears verbatim inside
       the prediction;
    2. partial match - at least 70% of the reference's key words (stop words
       and words of 2 characters or fewer removed, punctuation ignored)
       appear in the prediction.

    Args:
        prediction: model output text (``None`` is treated as empty).
        ground_truth: reference answer text (``None`` is treated as empty).

    Returns:
        ``(is_correct, match_type)`` where ``match_type`` is one of
        ``"exact_match"``, ``"partial_match"`` or ``"no_match"``.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    pred_lower = (prediction or "").lower()
    truth_lower = (ground_truth or "").strip().lower()

    # An empty reference can never be "matched": without this guard the
    # substring test below would succeed for every prediction ("" in s).
    if not truth_lower:
        return False, "no_match"

    # Exact match: ground truth appears verbatim in prediction
    if truth_lower in pred_lower:
        return True, "exact_match"

    # Partial match: at least 70% of key terms appear
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'of', 'in',
                  'to', 'for', 'with', 'on', 'at', 'by'}

    # Split on non-word characters so punctuation or hyphens glued to a word
    # ("anemia.", "iron-deficiency") do not prevent it from matching.
    truth_words = [
        w for w in re.findall(r"\w+", truth_lower)
        if w not in stop_words and len(w) > 2
    ]

    if not truth_words:
        return False, "no_match"

    matches = sum(1 for word in truth_words if word in pred_lower)
    if matches / len(truth_words) >= 0.7:
        return True, "partial_match"

    return False, "no_match"

In [None]:
# ============================================================
# STEP 5: RUN EVALUATION LOOP
# ============================================================

print("\n" + "="*80)
print("EVALUATING MODEL")
print("=" * 80)

# Per-example records and running counters
results = []
correct_exact = 0
correct_partial = 0
total = 0

# Progress denominator follows the actual sample size instead of a literal 20,
# so the header stays correct if the number of sampled examples changes.
num_examples = len(selected_indices)

start_time = time.time()

for i, idx in enumerate(selected_indices, 1):
    # Extract question and ground truth
    example = test_dataset[idx]
    question = example['Open-ended Verifiable Question']
    ground_truth = example['Ground-True Answer']

    # Display question (truncated if long)
    print(f"\n{'='*80}")
    print(f"TEST {i}/{num_examples}")
    print(f"{'='*80}")
    print(f"Question: {question[:100]}...")
    print(f"Ground Truth: {ground_truth}")

    # Generate prediction
    print("⏳ Generating prediction...")
    prediction = get_prediction(question)
    print(f"Prediction: {prediction[:200]}")

    # Check if prediction is correct
    is_correct, match_type = check_accuracy(prediction, ground_truth)

    # Display result and update the matching counter
    if is_correct:
        if match_type == "exact_match":
            correct_exact += 1
            print("CORRECT (Exact match)")
        else:
            correct_partial += 1
            print("CORRECT (Partial match)")
    else:
        print("INCORRECT")

    # Track running accuracy (exact and partial matches both count as correct)
    total += 1
    current_accuracy = ((correct_exact + correct_partial) / total) * 100
    print(f"\nRunning accuracy: {current_accuracy:.1f}% ({correct_exact + correct_partial}/{total})")

    # Store result for the later summary / JSON export
    results.append({
        'question': question,
        'ground_truth': ground_truth,
        'prediction': prediction,
        'correct': is_correct,
        'match_type': match_type
    })

# Wall-clock duration of the whole loop, in seconds
total_time = time.time() - start_time

In [None]:

# ============================================================
# STEP 6: CALCULATE FINAL METRICS
# ============================================================

print("\n" + "="*80)
print("FINAL RESULTS")
print("="*80)

# Aggregate counts; exact and partial matches both count as correct.
total_correct = correct_exact + correct_partial
total_incorrect = total - correct_exact - correct_partial
accuracy = (total_correct / total) * 100

# Shares of each outcome, as percentages of all evaluated examples
exact_pct = (correct_exact / total) * 100
partial_pct = (correct_partial / total) * 100
incorrect_pct = (total_incorrect / total) * 100

print(f"Total examples evaluated: {total}")
print(f"Exact matches: {correct_exact} ({exact_pct:.1f}%)")
print(f"Partial matches: {correct_partial} ({partial_pct:.1f}%)")
print(f"Total correct: {total_correct} ({accuracy:.1f}%)")
print(f"Incorrect: {total_incorrect} ({incorrect_pct:.1f}%)")
print(f"Total evaluation time: {total_time/60:.1f} minutes")
print(f"Average time per example: {total_time/total:.1f} seconds")

In [None]:
# ============================================================
# STEP 7: ANALYZE DETAILED RESULTS
# ============================================================

print("\n" + "="*80)
print("DETAILED RESULTS")
print("="*80)

# Misses: list every incorrect example (prediction truncated to 100 chars)
incorrect = [entry for entry in results if not entry['correct']]
if not incorrect:
    print("\nALL EXAMPLES CORRECT!")
else:
    print(f"\nINCORRECT EXAMPLES ({len(incorrect)}):")
    print("="*80)
    for rank, entry in enumerate(incorrect, 1):
        print(f"\n{rank}. Question: {entry['question']}")
        print(f"   Ground Truth: {entry['ground_truth']}")
        print(f"   Prediction: {entry['prediction'][:100]}...")

# Hits: show at most the first 5 (prediction truncated to 80 chars)
correct = [entry for entry in results if entry['correct']]
if correct:
    print(f"\nCORRECT EXAMPLES ({len(correct)}):")
    print("="*80)
    for rank, entry in enumerate(correct[:5], 1):
        print(f"\n{rank}. Question: {entry['question']}")
        print(f"   Ground Truth: {entry['ground_truth']}")
        print(f"   Prediction: {entry['prediction'][:80]}...")
        print(f"   Match type: {entry['match_type']}")
    not_shown = len(correct) - 5
    if not_shown > 0:
        print(f"\n... and {not_shown} more correct examples")


In [None]:

# ============================================================
# STEP 8: ASSESS PERFORMANCE
# ============================================================

print("\n" + "="*80)
print("PERFORMANCE ASSESSMENT")
print("="*80)

# (minimum accuracy in %, headline, advice) - checked from best to worst;
# the first tier whose minimum is reached wins.
verdict_tiers = (
    (80, "EXCELLENT! Model is performing very well!",
         "   Your fine-tuning was highly successful."),
    (60, "GOOD! Model learned successfully!",
         "   Consider training longer or with more data for improvement."),
    (40, "MODERATE. Model shows some learning.",
         "   Recommend: Train for more epochs or increase dataset size."),
    (20, "POOR. Model needs significant improvement.",
         "   Recommend: Check data quality, train longer, or use more examples."),
)

for minimum_accuracy, headline, advice in verdict_tiers:
    if accuracy >= minimum_accuracy:
        break
else:
    # Below every tier
    headline = "VERY POOR. Model barely learned."
    advice = "   Recommend: Verify data formatting and retrain from scratch."

print(headline)
print(advice)

In [None]:
# ============================================================
# STEP 9: SAVE RESULTS
# ============================================================

banner = "=" * 80

print("\n" + banner)
print("SAVING RESULTS")
print(banner)

# Everything needed to reproduce or inspect the evaluation: aggregate counts,
# timing, the sampled test indices and the per-example records.
num_correct = correct_exact + correct_partial
results_summary = {
    'total_examples': total,
    'exact_matches': correct_exact,
    'partial_matches': correct_partial,
    'total_correct': num_correct,
    'accuracy_percentage': accuracy,
    'incorrect': total - num_correct,
    'total_time_minutes': total_time / 60,
    'avg_time_per_example': total_time / total,
    'selected_indices': selected_indices,
    'detailed_results': results
}

# Written to the current working directory, overwriting any previous run.
with open('evaluation_results.json', 'w') as out_file:
    json.dump(results_summary, out_file, indent=2)

print("Results saved to: evaluation_results.json")

print("\n" + banner)
print("EVALUATION COMPLETE")
print(banner)