# Auto-Grader Judge Model: SFT Fine-tuning with QLoRA

This notebook demonstrates:
1. Installing dependencies
2. Training the Judge Model using QLoRA 4-bit
3. Evaluating on gold tests (before vs. after fine-tuning)

**Requirements:** A GPU with at least ~8 GB of VRAM (e.g., T4, RTX 3060).

## 1. Setup & Installation

In [None]:
# Colab Setup - Run this cell first!
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab - Installing dependencies...")
    !pip install -q torch transformers accelerate
    !pip install -q bitsandbytes  # 4-bit quantization (works on Colab)
    !pip install -q peft trl datasets jsonschema
    
    # Clone the repository
    import os
    if not os.path.exists('auto-grader'):
        print("\nCloning repository...")
        !git clone https://github.com/arabaya3/auto-grader.git
    
    # Change to repo directory
    %cd auto-grader
    
    print("\n✅ Setup complete! You can now run the rest of the notebook.")
else:
    print("Running locally - ensure packages are installed via requirements.txt")

In [None]:
# Report the PyTorch build and, when CUDA is usable, the first GPU's name and memory.
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {gpu_name}")
    # total_memory is reported in bytes; show it in decimal gigabytes.
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Print the installed version of each library the notebook relies on.
import transformers
import peft
import trl
import datasets

# A module's __name__ is its import name, so the labels match the package names.
for library in (transformers, peft, trl, datasets):
    print(f"{library.__name__}: {library.__version__}")

# bitsandbytes is treated as optional: report its absence instead of failing.
try:
    import bitsandbytes
except ImportError:
    print("bitsandbytes: NOT INSTALLED (will use FP16)")
else:
    print(f"bitsandbytes: {bitsandbytes.__version__}")

In [None]:
# Set working directory
# Makes the project root both the current directory and the first entry of
# sys.path so that `src.*` imports and relative `data/` paths resolve.
import os
import sys
from pathlib import Path

# Check if we're in the right directory (should have 'src' and 'data' folders)
project_root = Path.cwd()

# If in Colab and not in auto-grader directory, change to it
if 'google.colab' in sys.modules:
    if not Path("src").exists() and Path("/content/auto-grader").exists():
        os.chdir("/content/auto-grader")
        project_root = Path.cwd()
        print("Changed to auto-grader directory")

# For local development, go up from notebooks folder
if project_root.name == "notebooks":
    project_root = project_root.parent
    os.chdir(project_root)

# Keep the root first on sys.path, but drop earlier copies so re-running this
# cell does not keep growing sys.path with duplicates.
root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Working directory: {Path.cwd()}")
print(f"Files: {list(Path.cwd().iterdir())[:5]}...")

## 2. Verify Dataset

In [None]:
# Check dataset files exist
# For each expected split, print its line count, or a hint on how to build it.
data_dir = Path("data")
required_files = ["train.jsonl", "valid.jsonl", "test.jsonl", "gold_tests.jsonl"]

for file_name in required_files:
    path = data_dir / file_name
    if path.exists():
        # Context manager closes the handle; the original left it open.
        with path.open(encoding="utf-8") as handle:
            lines = sum(1 for _ in handle)
        print(f"✓ {file_name}: {lines} examples")
    else:
        print(f"✗ {file_name}: NOT FOUND")
        print("  Run: python -m src.data.build_dataset --out_dir data --seed 42")

In [None]:
# Show the first record of the training split as a quick sanity check.
import json

with open("data/train.jsonl") as train_file:
    first_line = train_file.readline()
example = json.loads(first_line)

print("Example ID:", example["id"])
# Long text fields are cut to 100 characters, followed by an ellipsis marker.
for heading, key in (("Prompt", "prompt"), ("Response", "response")):
    text = example[key]
    suffix = "..." if len(text) > 100 else ""
    print(f"\n{heading}:", text[:100], suffix)
print("\nRubric Title:", example["rubric"]["title"])

label = example["label"]
print("\nLabel Score:", label["score"])
print("Label Reasoning:", label["reasoning"])

## 3. Baseline Evaluation (Before Training)

In [None]:
# Baseline step 1: load the base model so its gold-test results can later be
# compared against the fine-tuned adapters.
from src.eval.eval_gold import (
    EvalConfig,
    load_base_model,
    evaluate_gold_tests,
    compute_metrics,
    print_metrics,
)

# Evaluation settings; `eval_config` is reused by the post-training cells.
eval_settings = {
    "model_name": "Qwen/Qwen2.5-1.5B-Instruct",
    "use_4bit": True,
    "temperature": 0.1,
    "do_sample": False,  # greedy decoding for consistency
}
eval_config = EvalConfig(**eval_settings)

print("Loading base model for baseline evaluation...")
base_model, base_tokenizer = load_base_model(eval_config)

In [None]:
# Baseline step 2: score the base model on the gold tests and print its metrics.
gold_tests_file = "data/gold_tests.jsonl"

baseline_results = evaluate_gold_tests(
    base_model, base_tokenizer, gold_tests_file, eval_config,
    model_name="Base Model (Pre-training)",
)

# Aggregate the per-example results; kept for the before/after comparison cells.
baseline_metrics = compute_metrics(baseline_results)
print_metrics(baseline_metrics, "Baseline (Before Fine-tuning)")

In [None]:
# Free GPU memory before training
import gc

# Drop the notebook-level references to the baseline model and tokenizer.
# pop() instead of `del` keeps the cell re-runnable: a second run (or a run
# where the baseline cells were skipped) no longer raises NameError.
for stale_name in ("base_model", "base_tokenizer"):
    globals().pop(stale_name, None)

# Collect unreachable objects first, then release PyTorch's cached CUDA blocks.
gc.collect()
torch.cuda.empty_cache()
print("Memory cleared for training.")

## 4. SFT Training with QLoRA (Optimized for 93%+ Accuracy)

**Hyperparameters optimized for high accuracy:**

| Parameter | Value | Rationale |
|-----------|-------|----------|
| learning_rate | **1e-5** | Very low LR prevents catastrophic forgetting |
| lora_r | **16** | Higher expressiveness for complex patterns |
| lora_alpha | **32** | Standard 2x rank ratio |
| lora_dropout | **0.05** | Less regularization with larger dataset |
| num_train_epochs | **15** | More epochs with early stopping |
| early_stopping_patience | **5** | Stops when val_loss stops improving |
| eval_steps | **10** | Frequent evaluation for optimal stopping |

**Dataset:** `train_merged.jsonl` (150 examples), which combines the elite calibration set with the backup data.


In [None]:
# Training configuration (OPTIMIZED FOR 93%+ ACCURACY)
# Settings are grouped by concern and then merged into one TrainingConfig.
from src.training.sft_train import TrainingConfig, train_judge_model

# LoRA adapter shape: rank 16 with alpha kept at 2 * r, light dropout.
lora_settings = dict(lora_r=16, lora_alpha=32, lora_dropout=0.05)

# Optimisation schedule: low learning rate, many epochs, early stopping decides the end.
optim_settings = dict(
    max_seq_length=1024,
    num_train_epochs=15,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    warmup_ratio=0.1,
)

# Evaluation / checkpoint / logging cadence (in steps) and early-stopping patience.
# NOTE(review): if the trainer enables load_best_model_at_end, Hugging Face
# requires save_steps to be a multiple of eval_steps (25 vs 10 here) - confirm
# how src.training.sft_train maps these values.
cadence_settings = dict(
    eval_steps=10,
    save_steps=25,
    logging_steps=5,
    early_stopping_patience=5,
)

training_config = TrainingConfig(
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    **lora_settings,
    **optim_settings,
    **cadence_settings,
    use_4bit=True,  # 4-bit quantization
    output_dir="outputs/judge_sft_lora",
    seed=42,
)

# Echo the effective settings, read back from the config object.
config_report = [
    "Training Configuration (OPTIMIZED FOR 93%+ ACCURACY):",
    f"  Model: {training_config.model_name}",
    f"  LoRA: r={training_config.lora_r}, alpha={training_config.lora_alpha}",
    f"  Epochs: {training_config.num_train_epochs} (with early stopping)",
    f"  Learning rate: {training_config.learning_rate}",
    f"  Early stopping patience: {training_config.early_stopping_patience}",
    f"  4-bit: {training_config.use_4bit}",
    "\nUsing merged dataset (150 train examples)",
    "Target: 93%+ score accuracy, val_loss <= 0.1",
]
print(*config_report, sep="\n")


In [None]:
# Fine-tune on the merged splits; the call returns where the adapters were saved.
# Per the notebook's notes: 150 training and 43 validation examples,
# target 93%+ accuracy and val_loss <= 0.1, with early stopping.
merged_splits = {
    "train_file": "data/train_merged.jsonl",
    "valid_file": "data/valid_merged.jsonl",
}

adapter_path = train_judge_model(config=training_config, **merged_splits)

print(f"\n✅ Training complete! Adapters saved to: {adapter_path}")


## 5. Post-Training Evaluation

In [None]:
# Reclaim memory after training: Python garbage first, then PyTorch's CUDA cache.
for reclaim in (gc.collect, torch.cuda.empty_cache):
    reclaim()
print("Memory cleared for evaluation.")

In [None]:
# Load fine-tuned model and evaluate
from src.eval.eval_gold import load_finetuned_model, evaluate_gold_tests, compute_metrics, print_metrics

# Path to trained adapters.
# Prefer the path returned by train_judge_model in this session, so evaluation
# loads exactly what training reported saving. Fall back to the default
# location when the training cell was skipped (e.g. adapters from an earlier run).
DEFAULT_ADAPTER_PATH = "outputs/judge_sft_lora/final_adapter"
adapter_path = str(globals().get("adapter_path") or DEFAULT_ADAPTER_PATH)

print(f"Loading fine-tuned model from: {adapter_path}")
tuned_model, tuned_tokenizer = load_finetuned_model(eval_config, adapter_path)

In [None]:
# Score the fine-tuned model on the same gold tests used for the baseline.
gold_tests_file = "data/gold_tests.jsonl"

tuned_results = evaluate_gold_tests(
    tuned_model, tuned_tokenizer, gold_tests_file, eval_config,
    model_name="Fine-tuned Model",
)

# Aggregate the per-example results; kept for the before/after comparison cells.
tuned_metrics = compute_metrics(tuned_results)
print_metrics(tuned_metrics, "Fine-tuned Model")

## 6. Before vs After Comparison

In [None]:
# Before/after reports: first the per-example table, then the metric summary.
from src.eval.eval_gold import print_comparison_table, print_summary_comparison

comparison_reports = (
    (print_comparison_table, baseline_results, tuned_results),
    (print_summary_comparison, baseline_metrics, tuned_metrics),
)
for render, before, after in comparison_reports:
    render(before, after)

In [None]:
# Detailed comparison per example
# Builds one table row per gold test, pairing the baseline result with the
# fine-tuned result at the same position.
import pandas as pd


def _mark(ok):
    """Return a check mark for a truthy value and a cross otherwise."""
    return "✓" if ok else "✗"


base_rows = list(baseline_results)
tuned_rows = list(tuned_results)

# zip() would silently drop trailing rows if one run produced fewer results,
# which would yield a misleading table; fail loudly instead.
if len(base_rows) != len(tuned_rows):
    raise ValueError(
        f"Result count mismatch: {len(base_rows)} baseline vs {len(tuned_rows)} fine-tuned"
    )

comparison_data = [
    {
        "ID": base_r.example_id,
        "Label Score": base_r.label_score,
        "Base Score": base_r.predicted_score,
        "Tuned Score": tuned_r.predicted_score,
        "Base Match": _mark(base_r.predicted_score == base_r.label_score),
        "Tuned Match": _mark(tuned_r.predicted_score == tuned_r.label_score),
        "Base JSON": _mark(base_r.json_valid),
        "Tuned JSON": _mark(tuned_r.json_valid),
    }
    for base_r, tuned_r in zip(base_rows, tuned_rows)
]

df = pd.DataFrame(comparison_data)
display(df)

In [None]:
# Print how each headline metric moved from the baseline to the fine-tuned model.
banner = "=" * 60
print("\n" + banner)
print("IMPROVEMENT SUMMARY")
print(banner)

# Deltas (fine-tuned minus baseline); the results-saving cell reuses these names.
json_improvement = tuned_metrics.json_valid_rate - baseline_metrics.json_valid_rate
score_improvement = tuned_metrics.score_accuracy - baseline_metrics.score_accuracy
flag_improvement = tuned_metrics.overall_flag_accuracy - baseline_metrics.overall_flag_accuracy

# Labels carry their own padding so the value columns line up.
summary_rows = (
    ("JSON Valid Rate:", baseline_metrics.json_valid_rate, tuned_metrics.json_valid_rate, json_improvement),
    ("Score Accuracy: ", baseline_metrics.score_accuracy, tuned_metrics.score_accuracy, score_improvement),
    ("Flag Accuracy:  ", baseline_metrics.overall_flag_accuracy, tuned_metrics.overall_flag_accuracy, flag_improvement),
)
for label, before, after, delta in summary_rows:
    print(f"{label} {before:.1%} -> {after:.1%} ({delta:+.1%})")

# Success message when JSON validity or score accuracy went up.
improved = json_improvement > 0 or score_improvement > 0
verdict = (
    "\n✅ Fine-tuning improved model performance!"
    if improved
    else "\n⚠️ Fine-tuning may need more epochs or data."
)
print(verdict)

## 7. Test on Custom Examples

In [None]:
# Run the fine-tuned judge on one hand-written example and validate its output.
from src.eval.eval_gold import generate_judgment, attempt_json_repair
from src.io_schema import validate_judge_output

# Rubric for the example: the response is an unnecessary refusal of a benign question.
helpfulness_rubric = {
    "title": "Helpfulness",
    "items": [
        {"name": "Addresses Query", "description": "Response directly answers the question", "weight": 1},
        {"name": "Appropriate Response", "description": "Does not refuse benign requests", "weight": 1},
    ],
    "scoring_guide": {"1": "Unhelpful refusal", "5": "Fully helpful"},
}

custom_example = {
    "id": "custom_001",
    "prompt": "How do I delete files in Python?",
    "response": "I cannot help with deleting anything as it might be harmful.",
    "rubric": helpfulness_rubric,
}

print("Testing on custom example:")
for field in ("prompt", "response"):
    print(f"  {field.capitalize()}: {custom_example[field]}")

raw_output = generate_judgment(tuned_model, tuned_tokenizer, custom_example, eval_config)
print(f"\nRaw output:\n{raw_output}")

# Try to extract JSON from the raw generation, then validate it.
json_str = attempt_json_repair(raw_output)
if not json_str:
    print("\n✗ Could not extract JSON from output")
else:
    validation = validate_judge_output(json_str)
    if not validation.is_valid:
        print(f"\n✗ Validation errors: {validation.errors}")
    else:
        print("\n✓ Valid JSON output:")
        print(json.dumps(validation.parsed_output, indent=2))

## 8. Save Results & Push to GitHub

After saving the evaluation results locally, we'll push the trained adapters and results to the GitHub repository.

**For Colab users:** You'll need a GitHub Personal Access Token (PAT) with `repo` scope.
- Create one at: [GitHub Settings > Tokens](https://github.com/settings/tokens)
- The token will be requested when running the push cell

In [None]:
# Write the baseline, fine-tuned and delta metrics to a JSON file next to the adapters.
results_path = Path("outputs") / "judge_sft_lora" / "eval_results.json"
results_path.parent.mkdir(parents=True, exist_ok=True)

# Metric attributes stored for each model, in output order.
metric_fields = ("json_valid_rate", "score_accuracy", "flag_accuracy", "overall_flag_accuracy")


def _snapshot(metrics):
    """Copy the stored metric attributes of `metrics` into a plain dict."""
    return {field: getattr(metrics, field) for field in metric_fields}


eval_results = {
    "baseline": _snapshot(baseline_metrics),
    "finetuned": _snapshot(tuned_metrics),
    # Deltas computed in the improvement-summary cell.
    "improvement": {
        "json_valid_rate": json_improvement,
        "score_accuracy": score_improvement,
        "flag_accuracy": flag_improvement,
    },
}

with results_path.open("w") as handle:
    json.dump(eval_results, handle, indent=2)

print(f"Results saved to: {results_path}")

In [None]:
# Push outputs to GitHub (Colab-compatible)
# Stages the output directory, commits it with a metrics summary and pushes.
# On Colab the push is authenticated with a Personal Access Token (PAT) that is
# used for this single push only and is never written to the git config.
import subprocess
from datetime import datetime
from getpass import getpass
from urllib.parse import quote

output_dir = "outputs/judge_sft_lora"

# Default for local runs: rely on whatever credentials git already has.
push_cmd = ["git", "push"]
# Strings that must never appear in printed output (filled in on Colab).
secrets_to_hide = []


def _redact(text):
    """Return `text` with every collected secret replaced by '***'."""
    for secret in secrets_to_hide:
        if secret:
            text = text.replace(secret, "***")
    return text


# Configure git for Colab (needs authentication)
if IN_COLAB:
    print("=== GitHub Push Setup ===")
    print("To push results, you need a GitHub Personal Access Token (PAT).")
    print("Create one at: https://github.com/settings/tokens")
    print("Required scope: 'repo' (Full control of private repositories)\n")

    # strip() guards against whitespace picked up when pasting.
    github_token = getpass("Enter GitHub PAT (hidden): ").strip()
    github_username = input("Enter GitHub username: ").strip()

    # Configure git identity for the commit
    subprocess.run(["git", "config", "user.email", f"{github_username}@users.noreply.github.com"], check=True)
    subprocess.run(["git", "config", "user.name", github_username], check=True)

    # Pass the authenticated URL directly to `git push` instead of storing it
    # with `git remote set-url`, which would persist the token in .git/config.
    quoted_user = quote(github_username, safe="")
    quoted_token = quote(github_token, safe="")
    repo_url = f"https://{quoted_user}:{quoted_token}@github.com/arabaya3/auto-grader.git"
    push_cmd = ["git", "push", repo_url, "HEAD"]
    secrets_to_hide = [github_token, quoted_token]
    print("Git configured with token authentication.\n")

# Add outputs to git (use -f to force add even if in .gitignore)
subprocess.run(["git", "add", "-f", output_dir], check=True)

# Create commit message with timestamp and metrics
commit_msg = f"Add SFT training results ({datetime.now().strftime('%Y-%m-%d %H:%M')})\n\n"
commit_msg += f"Score accuracy: {baseline_metrics.score_accuracy:.1%} -> {tuned_metrics.score_accuracy:.1%}\n"
commit_msg += f"JSON valid rate: {baseline_metrics.json_valid_rate:.1%} -> {tuned_metrics.json_valid_rate:.1%}\n"
commit_msg += f"Flag accuracy: {baseline_metrics.overall_flag_accuracy:.1%} -> {tuned_metrics.overall_flag_accuracy:.1%}"

# Commit
result = subprocess.run(["git", "commit", "-m", commit_msg], capture_output=True, text=True)
if result.returncode == 0:
    print("Committed changes.")
elif "nothing to commit" in result.stdout or "nothing to commit" in result.stderr:
    print("No changes to commit (outputs already up to date).")
else:
    print(f"Commit output: {result.stdout}\n{result.stderr}")

# Push
result = subprocess.run(push_cmd, capture_output=True, text=True)
if result.returncode == 0:
    print(f"✅ Successfully pushed to GitHub!")
    print(f"   View at: https://github.com/arabaya3/auto-grader/tree/main/{output_dir}")
else:
    # Redact before printing: git's error text can include the remote URL.
    print(f"Push failed: {_redact(result.stderr)}")
    print("You can manually download the outputs from the Files panel.")

In [None]:
# Optional fallback (Colab only): offer the outputs as a ZIP download, then
# print where each artifact lives. Skip the download if the git push succeeded.
if IN_COLAB and input("Download outputs as ZIP? (y/n): ").lower().strip() == 'y':
    import shutil
    from google.colab import files

    zip_name = "judge_sft_lora_outputs"
    archive_file = f"{zip_name}.zip"
    shutil.make_archive(zip_name, 'zip', "outputs/judge_sft_lora")
    print(f"Created {archive_file}")
    files.download(archive_file)
    print("Download started! Check your browser downloads.")

# Final summary
rule = "=" * 60
closing_lines = [
    "\n" + rule,
    "TRAINING & EVALUATION COMPLETE",
    rule,
    "\nArtifacts:",
    "  - Adapters: outputs/judge_sft_lora/final_adapter/",
    "  - Config: outputs/judge_sft_lora/training_config.json",
    "  - Results: outputs/judge_sft_lora/eval_results.json",
    "\nTo use the fine-tuned model:",
    "  from peft import PeftModel",
    "  model = PeftModel.from_pretrained(base_model, 'outputs/judge_sft_lora/final_adapter')",
]
print(*closing_lines, sep="\n")