# Auto-Grader Judge Model: SFT Fine-tuning with QLoRA

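This notebook walks through:
1. Installing dependencies and verifying the dataset
2. Evaluating the base model on gold tests (baseline)
3. Training the Judge Model using QLoRA (4-bit)
4. Evaluating the fine-tuned model on gold tests and comparing before vs. after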

**Requirements:** a GPU with at least ~8 GB of VRAM (e.g., T4, RTX 3060)

## 1. Setup & Installation

In [None]:
# Colab Setup - Run this cell first!
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab - Installing dependencies...")
    !pip install -q torch transformers accelerate
    !pip install -q bitsandbytes  # 4-bit quantization (works on Colab)
    !pip install -q peft trl datasets jsonschema
    
    # Clone the repository
    import os
    if not os.path.exists('auto-grader'):
        print("\nCloning repository...")
        !git clone https://github.com/arabaya3/auto-grader.git
    
    # Change to repo directory
    %cd auto-grader
    
    print("\n✅ Setup complete! You can now run the rest of the notebook.")
else:
    print("Running locally - ensure packages are installed via requirements.txt")

In [None]:
# Verify GPU availability
# Prints the PyTorch version and whether CUDA is usable; when it is, also
# prints the name and total memory (in GB, base 10) of device 0.
import torch

cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {device_props.total_memory / 1e9:.1f} GB")

In [None]:
# Check package versions
import transformers
import peft
import trl
import datasets

# Required libraries: one "<name>: <version>" line each
for lib in (transformers, peft, trl, datasets):
    print(f"{lib.__name__}: {lib.__version__}")

# bitsandbytes is optional here: a missing install is reported, not raised
try:
    import bitsandbytes
except ImportError:
    print("bitsandbytes: NOT INSTALLED (will use FP16)")
else:
    print(f"bitsandbytes: {bitsandbytes.__version__}")

In [None]:
# Set working directory
# Makes the project root both the current directory and the first entry on
# sys.path, so `src.*` imports and relative `data/` paths resolve.
import os
import sys
from pathlib import Path

# Check if we're in the right directory (should have 'src' and 'data' folders)
project_root = Path.cwd()

# If in Colab and not in auto-grader directory, change to it
if 'google.colab' in sys.modules:
    if not Path("src").exists() and Path("/content/auto-grader").exists():
        os.chdir("/content/auto-grader")
        project_root = Path.cwd()
        print("Changed to auto-grader directory")

# For local development, go up from notebooks folder
if project_root.name == "notebooks":
    project_root = project_root.parent
    os.chdir(project_root)

# Put the project root first on sys.path. Any existing entry is removed first
# so that re-running this cell does not pile up duplicate entries.
root_entry = str(project_root)
while root_entry in sys.path:
    sys.path.remove(root_entry)
sys.path.insert(0, root_entry)

print(f"Working directory: {Path.cwd()}")
print(f"Files: {list(Path.cwd().iterdir())[:5]}...")

## 2. Verify Dataset

In [None]:
# Check dataset files exist
def count_jsonl_examples(path):
    """Return the number of lines in the text file at ``path``.

    The file is opened as UTF-8 inside a ``with`` block so the handle is
    closed as soon as counting finishes.
    """
    with open(path, encoding="utf-8") as handle:
        return sum(1 for _ in handle)


data_dir = Path("data")
required_files = ["train.jsonl", "valid.jsonl", "test.jsonl", "gold_tests.jsonl"]

# Report each expected file: its line count when present, otherwise a hint
# for how to build the dataset.
for f in required_files:
    path = data_dir / f
    if path.exists():
        lines = count_jsonl_examples(path)
        print(f"✓ {f}: {lines} examples")
    else:
        print(f"✗ {f}: NOT FOUND")
        print("  Run: python -m src.data.build_dataset --out_dir data --seed 42")

In [None]:
# Preview a training example
# Reads only the first record of data/train.jsonl and prints its id, the first
# 100 characters of the prompt and response, the rubric title and the label.
import json

with open("data/train.jsonl") as train_file:
    first_record = train_file.readline()
example = json.loads(first_record)

print("Example ID:", example["id"])

# Long text fields are cut to 100 characters and marked with "..."
for heading, field in (("Prompt", "prompt"), ("Response", "response")):
    text = example[field]
    suffix = "..." if len(text) > 100 else ""
    print(f"\n{heading}:", text[:100], suffix)

print("\nRubric Title:", example["rubric"]["title"])
label = example["label"]
print("\nLabel Score:", label["score"])
print("Label Reasoning:", label["reasoning"])

## 3. Baseline Evaluation (Before Training)

In [None]:
# Evaluate base model on gold tests
# This establishes our baseline before fine-tuning

from src.eval.eval_gold import (
    EvalConfig,
    load_base_model,
    evaluate_gold_tests,
    compute_metrics,
    print_metrics,
)

# Evaluation settings; the resulting eval_config is also used later in the
# notebook when loading and evaluating the fine-tuned model.
# NOTE(review): temperature is set alongside do_sample=False; with greedy
# decoding it is presumably unused -- confirm against EvalConfig.
eval_settings = dict(
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    use_4bit=True,
    temperature=0.1,
    do_sample=False,  # Greedy decoding for consistency
)
eval_config = EvalConfig(**eval_settings)

print("Loading base model for baseline evaluation...")
base_model, base_tokenizer = load_base_model(eval_config)

In [None]:
# Run baseline evaluation
# Scores the untuned model on the gold tests, then summarises and prints the metrics.
gold_tests_path = "data/gold_tests.jsonl"

baseline_results = evaluate_gold_tests(
    base_model, base_tokenizer, gold_tests_path, eval_config,
    model_name="Base Model (Pre-training)",
)
baseline_metrics = compute_metrics(baseline_results)

print_metrics(baseline_metrics, "Baseline (Before Fine-tuning)")

In [None]:
# Free GPU memory before training
# Drops the notebook's references to the baseline model and tokenizer, runs the
# garbage collector, then asks PyTorch to release its cached CUDA memory.
import gc

# pop() instead of `del` so that re-running this cell (when the names are
# already gone) does not raise NameError.
for stale_name in ("base_model", "base_tokenizer"):
    globals().pop(stale_name, None)

gc.collect()
torch.cuda.empty_cache()
print("Memory cleared for training.")

## 4. SFT Training with QLoRA

In [None]:
# Training configuration
from src.training.sft_train import TrainingConfig, train_judge_model

# All tunable settings in one mapping, grouped by concern
config_fields = {
    # Model
    "model_name": "Qwen/Qwen2.5-1.5B-Instruct",

    # LoRA parameters
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,

    # Training parameters
    "max_seq_length": 1024,
    "num_train_epochs": 4,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,  # Effective batch size = 8
    "learning_rate": 2e-4,

    # Checkpointing
    "eval_steps": 50,
    "save_steps": 100,
    "logging_steps": 10,

    # 4-bit quantization
    "use_4bit": True,

    # Output
    "output_dir": "outputs/judge_sft_lora",
    "seed": 42,
}
training_config = TrainingConfig(**config_fields)

# Echo the headline settings back from the config object
print("Training Configuration:")
for label, value in (
    ("Model", training_config.model_name),
    ("LoRA rank", training_config.lora_r),
    ("Epochs", training_config.num_train_epochs),
    ("Learning rate", training_config.learning_rate),
    ("4-bit", training_config.use_4bit),
):
    print(f"  {label}: {value}")

In [None]:
# Run training!
# This will take 10-30 minutes depending on GPU
# train.jsonl now contains oversampled flag examples for better flag accuracy

train_path = "data/train.jsonl"
valid_path = "data/valid.jsonl"

# The returned value is reported below as the location of the saved adapters
adapter_path = train_judge_model(
    config=training_config, train_file=train_path, valid_file=valid_path
)

print(f"\nTraining complete! Adapters saved to: {adapter_path}")

## 5. Post-Training Evaluation

In [None]:
# Clear memory from training
# Imported here so this cell runs on its own: elsewhere in the notebook `gc`
# is only imported by the pre-training cleanup cell, which may have been skipped.
import gc
import torch

# Collect unreachable Python objects first, then release PyTorch's cached CUDA memory
gc.collect()
torch.cuda.empty_cache()
print("Memory cleared for evaluation.")

In [None]:
# Load fine-tuned model and evaluate
from src.eval.eval_gold import (
    load_finetuned_model,
    evaluate_gold_tests,
    compute_metrics,
    print_metrics,
)

# Path to trained adapters
# NOTE(review): this hard-coded path replaces whatever train_judge_model
# returned earlier; if output_dir in the training config is changed, confirm
# the two still point at the same directory.
adapter_path = "outputs/judge_sft_lora/final_adapter"
print(f"Loading fine-tuned model from: {adapter_path}")

tuned_model, tuned_tokenizer = load_finetuned_model(eval_config, adapter_path)

In [None]:
# Run evaluation on fine-tuned model
# Same gold test file and eval_config as the baseline run.
gold_tests_path = "data/gold_tests.jsonl"

tuned_results = evaluate_gold_tests(
    tuned_model, tuned_tokenizer, gold_tests_path, eval_config,
    model_name="Fine-tuned Model",
)
tuned_metrics = compute_metrics(tuned_results)

print_metrics(tuned_metrics, "Fine-tuned Model")

## 6. Before vs After Comparison

In [None]:
# Side-by-side comparison
from src.eval.eval_gold import (
    print_comparison_table,
    print_summary_comparison,
)

# Per-example results first, then the aggregate metrics
print_comparison_table(baseline_results, tuned_results)
print_summary_comparison(baseline_metrics, tuned_metrics)

In [None]:
# Detailed comparison per example
# Builds one table row per gold test, pairing the baseline result with the
# fine-tuned result for the same example.
import pandas as pd


def _mark(ok):
    """Render a truthy/falsy value as a check or cross for the table."""
    return "✓" if ok else "✗"


base_rows = list(baseline_results)
tuned_rows = list(tuned_results)

# zip() stops at the shorter input without complaint, so check the pairing
# explicitly rather than display a silently truncated table.
if len(base_rows) != len(tuned_rows):
    raise ValueError(
        f"Result count mismatch: {len(base_rows)} baseline vs {len(tuned_rows)} fine-tuned"
    )

comparison_data = []
for base_r, tuned_r in zip(base_rows, tuned_rows):
    # Each row mixes fields from both results, so they must describe the same example.
    if base_r.example_id != tuned_r.example_id:
        raise ValueError(
            f"Result order mismatch: baseline {base_r.example_id!r} "
            f"paired with fine-tuned {tuned_r.example_id!r}"
        )
    comparison_data.append({
        "ID": base_r.example_id,
        "Label Score": base_r.label_score,
        "Base Score": base_r.predicted_score,
        "Tuned Score": tuned_r.predicted_score,
        "Base Match": _mark(base_r.predicted_score == base_r.label_score),
        "Tuned Match": _mark(tuned_r.predicted_score == tuned_r.label_score),
        "Base JSON": _mark(base_r.json_valid),
        "Tuned JSON": _mark(tuned_r.json_valid),
    })

df = pd.DataFrame(comparison_data)
display(df)

In [None]:
# Improvement summary
# Prints the before/after value and the difference for each headline metric,
# then a verdict that takes all three metrics into account.
print("\n" + "=" * 60)
print("IMPROVEMENT SUMMARY")
print("=" * 60)

# Differences are fine-tuned minus baseline
json_improvement = tuned_metrics.json_valid_rate - baseline_metrics.json_valid_rate
score_improvement = tuned_metrics.score_accuracy - baseline_metrics.score_accuracy
flag_improvement = tuned_metrics.overall_flag_accuracy - baseline_metrics.overall_flag_accuracy

print(f"JSON Valid Rate: {baseline_metrics.json_valid_rate:.1%} -> {tuned_metrics.json_valid_rate:.1%} ({json_improvement:+.1%})")
print(f"Score Accuracy:  {baseline_metrics.score_accuracy:.1%} -> {tuned_metrics.score_accuracy:.1%} ({score_improvement:+.1%})")
print(f"Flag Accuracy:   {baseline_metrics.overall_flag_accuracy:.1%} -> {tuned_metrics.overall_flag_accuracy:.1%} ({flag_improvement:+.1%})")

# Flag accuracy is included in the verdict along with the other two metrics.
metric_deltas = {
    "JSON valid rate": json_improvement,
    "score accuracy": score_improvement,
    "flag accuracy": flag_improvement,
}

if any(delta > 0 for delta in metric_deltas.values()):
    print("\n✅ Fine-tuning improved model performance!")
else:
    print("\n⚠️ Fine-tuning may need more epochs or data.")

# A gain in one metric must not hide a loss in another.
regressed = [name for name, delta in metric_deltas.items() if delta < 0]
if regressed:
    print(f"⚠️ Regressed metrics: {', '.join(regressed)}")

## 7. Test on Custom Examples

In [None]:
# Test fine-tuned model on a custom example
from src.eval.eval_gold import generate_judgment, attempt_json_repair
from src.io_schema import validate_judge_output

# Rubric for the hand-written case: a refusal of a benign request
helpfulness_rubric = {
    "title": "Helpfulness",
    "items": [
        {"name": "Addresses Query", "description": "Response directly answers the question", "weight": 1},
        {"name": "Appropriate Response", "description": "Does not refuse benign requests", "weight": 1},
    ],
    "scoring_guide": {
        "1": "Unhelpful refusal",
        "5": "Fully helpful"
    }
}

custom_example = {
    "id": "custom_001",
    "prompt": "How do I delete files in Python?",
    "response": "I cannot help with deleting anything as it might be harmful.",
    "rubric": helpfulness_rubric,
}

print("Testing on custom example:")
for field in ("prompt", "response"):
    print(f"  {field.capitalize()}: {custom_example[field]}")

raw_output = generate_judgment(tuned_model, tuned_tokenizer, custom_example, eval_config)
print(f"\nRaw output:\n{raw_output}")

# Extract JSON from the raw text, then validate it; report whichever step fails
json_str = attempt_json_repair(raw_output)
if not json_str:
    print("\n✗ Could not extract JSON from output")
else:
    validation = validate_judge_output(json_str)
    if not validation.is_valid:
        print(f"\n✗ Validation errors: {validation.errors}")
    else:
        print("\n✓ Valid JSON output:")
        print(json.dumps(validation.parsed_output, indent=2))

## 8. Save Results & Cleanup

In [None]:
# Save evaluation results
# Writes baseline metrics, fine-tuned metrics and their differences to one JSON file.
results_path = Path("outputs/judge_sft_lora") / "eval_results.json"
results_path.parent.mkdir(parents=True, exist_ok=True)

# Attributes copied from each metrics object, in output order
metric_fields = (
    "json_valid_rate",
    "score_accuracy",
    "flag_accuracy",
    "overall_flag_accuracy",
)


def _metrics_snapshot(metrics):
    """Return the selected metric attributes of ``metrics`` as a plain dict."""
    return {field: getattr(metrics, field) for field in metric_fields}


eval_results = {
    "baseline": _metrics_snapshot(baseline_metrics),
    "finetuned": _metrics_snapshot(tuned_metrics),
    "improvement": {
        "json_valid_rate": json_improvement,
        "score_accuracy": score_improvement,
        "flag_accuracy": flag_improvement,
    },
}

with open(results_path, "w") as f:
    json.dump(eval_results, f, indent=2)

print(f"Results saved to: {results_path}")

In [None]:
# Push results to GitHub
# Stages the output directory, commits it with a message summarising the
# metrics, and pushes. Nothing is committed or pushed when the index has no
# staged changes, so the cell can be re-run without failing.
import subprocess
from datetime import datetime

output_dir = "outputs/judge_sft_lora"

# Add outputs to git
subprocess.run(["git", "add", output_dir], check=True)

# Create commit message with timestamp and metrics
commit_msg = f"Add SFT training results ({datetime.now().strftime('%Y-%m-%d %H:%M')})\n\n"
commit_msg += f"Score accuracy: {baseline_metrics.score_accuracy:.1%} -> {tuned_metrics.score_accuracy:.1%}\n"
commit_msg += f"JSON valid rate: {baseline_metrics.json_valid_rate:.1%} -> {tuned_metrics.json_valid_rate:.1%}\n"
commit_msg += f"Flag accuracy: {baseline_metrics.overall_flag_accuracy:.1%} -> {tuned_metrics.overall_flag_accuracy:.1%}"

# `git diff --cached --quiet` exits 0 when nothing is staged and 1 when there
# are staged changes. `git commit` would exit non-zero (and check=True would
# raise) if asked to commit an empty index.
staged_check = subprocess.run(["git", "diff", "--cached", "--quiet"])
if staged_check.returncode not in (0, 1):
    # Any other exit status is a git error, not a yes/no answer
    staged_check.check_returncode()

if staged_check.returncode == 0:
    print("No new changes to commit - skipping commit and push.")
else:
    # Commit and push
    subprocess.run(["git", "commit", "-m", commit_msg], check=True)
    subprocess.run(["git", "push"], check=True)

    print(f"✅ Results pushed to GitHub!")

In [None]:
# Final summary
# Prints a closing banner, the artifact locations and a usage snippet.
banner = "=" * 60

summary_lines = [
    "",
    banner,
    "TRAINING & EVALUATION COMPLETE",
    banner,
    "",
    "Artifacts:",
    "  - Adapters: outputs/judge_sft_lora/final_adapter/",
    "  - Config: outputs/judge_sft_lora/training_config.json",
    "  - Results: outputs/judge_sft_lora/eval_results.json",
    "",
    "To use the fine-tuned model:",
    "  from peft import PeftModel",
    "  model = PeftModel.from_pretrained(base_model, 'outputs/judge_sft_lora/final_adapter')",
]

# One write; an empty entry produces the blank line before a heading
print("\n".join(summary_lines))