# 🧠 Track B – SFT Fine-tuning Demo

Fine-tune `Qwen/Qwen2.5-Coder-1.5B` with LoRA on a synthetic Python coding dataset, then compare pass@1 before and after fine-tuning.

---

## 📦 Dataset: [`archit11/track_b_sft`](https://huggingface.co/datasets/archit11/track_b_sft)

| Field | Detail |
|-------|--------|
| **Source** | [verl](https://github.com/volcengine/verl) Python library (AST-extracted functions) |
| **Train** | 514 examples |
| **Test** | 46 examples |
| **Gold responses** | Generated by `Qwen3-Coder-30B-A3B-Instruct` via vLLM |
| **License** | Apache 2.0 |

### Task Categories

| Category | Description |
|----------|-------------|
| `docstring` | Write a Google-style docstring for a real function |
| `explain` | Explain what a function does and how |
| `bugfix` | Identify and fix a deterministically injected bug |
| `complete` | Complete a function body given its signature + docstring |
| `unit_test` | Write pytest tests with assertions |

---

## 🤖 Model: [`archit11/track_b_sft_merged`](https://huggingface.co/archit11/track_b_sft_merged)

| Field | Detail |
|-------|--------|
| **Base** | `Qwen/Qwen2.5-Coder-1.5B` |
| **Method** | LoRA (r=16, alpha=32) → merged |
| **Epochs** | 3 |
| **LR** | 2e-4, cosine schedule |
| **Hardware** | T4 GPU, fp16 |
| **Training time** | ~56 seconds |

---

## 📊 Results (pass@1 Reproduced Below)

| Metric | Baseline | Post-SFT | Δ |
|--------|----------|----------|---|
| **pass@1** | 0.565 | **0.804** | **+0.239 ↑** |
| **pass@3** | 0.783 | 0.848 | +0.065 ↑ |
| **style score** | 0.874 | 0.848 | −0.026 |

| Category | Baseline | Post-SFT | Δ |
|----------|----------|----------|---|
| bugfix | 0.17 | — | — |
| complete | 0.45 | — | — |
| docstring | 0.40 | — | — |
| explain | 1.00 | 1.00 | 0.00 |
| unit_test | 1.00 | — | — |

> ⚡ **Make sure Runtime → Change runtime type → T4 GPU is selected before running.**

In [None]:
# Cell 1 – Install dependencies
# transformers, peft and trl are pinned to exact versions; datasets, accelerate
# and huggingface_hub are left unpinned, so their versions are whatever pip
# resolves at install time. `-q` keeps the installer output quiet.
!pip install -q transformers==5.2.0 peft==0.18.1 trl==0.28.0 datasets accelerate huggingface_hub

In [None]:
# Cell 2 – Imports & config
import ast, json, os, time
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
import inspect

# Hugging Face identifiers: the checkpoint to fine-tune and the SFT dataset.
BASE_MODEL = "Qwen/Qwen2.5-Coder-1.5B"
DATASET_REPO = "archit11/track_b_sft"

# Local directory the training cell saves the model and tokenizer into.
OUTPUT_DIR = "/content/track_b_sft"

# Training hyperparameters (consumed by the SFT configuration further down).
EPOCHS = 3        # number of training epochs
BATCH_SIZE = 4    # per-device train batch size
GRAD_ACCUM = 4    # gradient accumulation steps
LR = 2e-4         # learning rate
MAX_LEN = 1024    # sequence-length cap for training examples

# Prefer the GPU whenever torch can see one; otherwise run on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"✓ Device: {DEVICE}")
print(f"✓ Base model: {BASE_MODEL}")
print(f"✓ Dataset: {DATASET_REPO}")

In [None]:
# Cell 3 – Fetch the train/test splits from the Hugging Face Hub
print(f"Loading {DATASET_REPO} from Hugging Face...")
hf_ds = load_dataset(DATASET_REPO)

# Materialise both splits as plain Python lists so later cells can index and
# iterate them directly.
train_data, test_data = list(hf_ds["train"]), list(hf_ds["test"])
print(f"✓ Train: {len(train_data)} examples")
print(f"✓ Test:  {len(test_data)} examples")

# Preview the first training record; long fields are cut to 120 characters.
print("\nSample example:")
ex = train_data[0]
print(f"  Category:    {ex['category']}")
print(f"  Instruction: {ex['instruction'][:120]}...")
print(f"  Response:    {ex['response'][:120]}...")

In [None]:
# Cell 4 – Evaluation helpers
def load_model_and_tokenizer(model_path, is_lora=False, base=BASE_MODEL):
    """Load a causal LM in fp16 together with its tokenizer, ready for inference.

    Args:
        model_path: Hub id or local directory. The tokenizer is always read
            from here; when ``is_lora`` is true the LoRA adapter is read from
            here as well.
        is_lora: If true, the weights are loaded from ``base`` and the adapter
            found at ``model_path`` is merged into them.
        base: Checkpoint that supplies the weights when ``is_lora`` is true.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Padding reuses the end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token

    weights_source = base if is_lora else model_path
    model = AutoModelForCausalLM.from_pretrained(
        weights_source,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map="auto",
    )

    if is_lora:
        # Attach the adapter, then fold it into the base weights so the caller
        # receives a plain (non-PEFT) model.
        adapter_wrapped = PeftModel.from_pretrained(model, model_path)
        model = adapter_wrapped.merge_and_unload()

    model.eval()
    return model, tokenizer

def generate(model, tokenizer, prompt, max_new=256):
    """Greedy-decode a completion for ``prompt`` and return only the new text.

    Args:
        model: Causal LM exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        prompt: Full prompt string; it is truncated to 768 tokens.
        max_new: Maximum number of tokens to generate.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    # Place the inputs on the device the model actually lives on. The model is
    # loaded with device_map="auto", so its placement is decided by the loader
    # and is not guaranteed to match the notebook-level DEVICE string.
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=768
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new,
            do_sample=False,  # deterministic greedy decoding
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt tokens; keep only what follows.
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

def check_pass(response, category):
    r = response.strip()
    if not r or len(r) < 10:
        return False
    if category == "docstring":
        return '"""' in r or "'''" in r
    if category == "complete":
        try: ast.parse(r); return True
        except: return "def " in r or "return" in r
    if category == "bugfix":
        return any(w in r.lower() for w in ["fix", "bug", "error", "change", "replace", "correct"])
    if category == "explain":
        return len(r.split()) >= 20
    if category == "unit_test":
        return "def test_" in r and "assert" in r
    return True

def evaluate(model, tokenizer, test_data, tag):
    """Run every test example through the model and report pass@1.

    Each example's instruction is wrapped in a ChatML user turn, a completion
    is produced with ``generate``, and the completion is scored with
    ``check_pass``. Progress and a per-category summary are printed.

    Args:
        model: Model passed through to ``generate``.
        tokenizer: Tokenizer passed through to ``generate``.
        test_data: Sequence of dicts with ``"instruction"`` and ``"category"``.
        tag: Label for this run, printed in the header and echoed in the result.

    Returns:
        ``{"tag": tag, "pass@1": float, "by_category": {cat: {"n": int, "p": int}}}``
        where ``n`` is the number of examples and ``p`` the number that passed.
        For an empty ``test_data`` the pass rate is reported as ``0.0``.
    """
    print(f"\n{'='*55}")
    print(f"  Evaluating: {tag}  ({len(test_data)} examples)")
    print(f"{'='*55}")
    results, t0 = [], time.time()
    for i, ex in enumerate(test_data, start=1):
        prompt = (f"<|im_start|>user\n{ex['instruction']}<|im_end|>\n"
                  f"<|im_start|>assistant\n")
        resp = generate(model, tokenizer, prompt)
        ok = check_pass(resp, ex["category"])
        results.append({"category": ex["category"], "pass": ok})
        print(f"  [{i:2d}/{len(test_data)}] {ex['category']:12s} {'✓' if ok else '✗'}")

    total = len(results)
    n_passed = sum(r["pass"] for r in results)
    # An empty test set has no meaningful pass rate; report 0.0 instead of
    # failing with ZeroDivisionError.
    pass_at_1 = n_passed / total if total else 0.0

    # Tally examples seen (n) and passed (p) per category.
    by_cat = {}
    for r in results:
        stats = by_cat.setdefault(r["category"], {"n": 0, "p": 0})
        stats["n"] += 1
        stats["p"] += r["pass"]

    print(f"\n  pass@1: {pass_at_1:.3f}  ({n_passed}/{total})")
    print(f"  Wall time: {time.time()-t0:.1f}s")
    print(f"\n  Per-category:")
    for cat, v in sorted(by_cat.items()):
        # One filled block per passing example, one hollow block per failure.
        bar = '█' * v['p'] + '░' * (v['n'] - v['p'])
        print(f"    {cat:12s}  {bar}  {v['p']}/{v['n']}")
    return {"tag": tag, "pass@1": pass_at_1, "by_category": by_cat}

# Printed once the cell has executed, i.e. after the helper functions above exist.
print("✓ Helpers defined")

In [None]:
# Cell 5 – Score the untouched base model (the "before" number)
print("Loading base model for baseline evaluation...")
model, tokenizer = load_model_and_tokenizer(BASE_MODEL)
baseline_results = evaluate(model, tokenizer, test_data, tag="baseline")

# Drop this cell's reference to the base model and release cached CUDA memory
# before the training cell loads its own copy.
del model
torch.cuda.empty_cache()
print("\n✓ GPU memory freed")

In [None]:
# Cell 6 – LoRA supervised fine-tuning
print("Loading model for fine-tuning...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Pad with the end-of-sequence token, on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Render each instruction/response pair as one ChatML string: a user turn
# followed by the assistant turn holding the gold response.
train_dataset = Dataset.from_list([
    {
        "text": (
            f"<|im_start|>user\n{record['instruction']}<|im_end|>\n"
            f"<|im_start|>assistant\n{record['response']}<|im_end|>"
        )
    }
    for record in train_data
])
print(f"✓ {len(train_dataset)} training examples formatted")

# LoRA adapters on the attention and MLP projection layers.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# SFTConfig's accepted keyword names differ between trl versions, so inspect
# the installed signature before choosing the sequence-length argument.
_sft_params = set(inspect.signature(SFTConfig.__init__).parameters)
_sft_kwargs = {
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": EPOCHS,
    "per_device_train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACCUM,
    "learning_rate": LR,
    "fp16": True,
    "logging_steps": 10,
    "save_strategy": "no",
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    "max_grad_norm": 0.3,
    "report_to": "none",
    "dataloader_num_workers": 0,
    "dataset_text_field": "text",
}
# Set whichever length-cap name this trl install accepts, preferring
# `max_length`; if neither is accepted the cap is simply not passed.
_length_key = next(
    (key for key in ("max_length", "max_seq_length") if key in _sft_params),
    None,
)
if _length_key is not None:
    _sft_kwargs[_length_key] = MAX_LEN

trainer = SFTTrainer(
    model=model,
    args=SFTConfig(**_sft_kwargs),
    train_dataset=train_dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
)

print("\nStarting training...")
t0 = time.time()
trainer.train()
print(f"\n✓ Training complete in {time.time()-t0:.1f}s")

# Persist the trained model and the tokenizer side by side in OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✓ Model saved to {OUTPUT_DIR}")

In [None]:
# Cell 7 – Post-SFT evaluation
import gc

# The SFTTrainer built in the training cell was constructed with `model`, so
# deleting the `model` name alone leaves the weights reachable through
# `trainer` and nothing is actually released. Drop both names, force a
# collection, then return cached CUDA memory.
# globals().pop(..., None) is used instead of `del` so that re-running this
# cell (when the names are already gone) does not raise NameError.
for _name in ("trainer", "model"):
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()

print("Loading fine-tuned model (merging LoRA)...")
# OUTPUT_DIR holds the saved adapter; is_lora=True loads the base weights and
# merges the adapter into them before evaluation.
ft_model, ft_tokenizer = load_model_and_tokenizer(OUTPUT_DIR, is_lora=True)
postsft_results = evaluate(ft_model, ft_tokenizer, test_data, "post_sft")

In [None]:
# Cell 8 – Final comparison
# Overall pass@1 before (b) and after (a) fine-tuning, and their difference.
b = baseline_results["pass@1"]
a = postsft_results["pass@1"]
delta = a - b

# Three-way verdict: an unchanged score must not be reported as a regression.
if delta > 0:
    verdict = "✓ IMPROVED"
elif delta < 0:
    verdict = "✗ REGRESSED"
else:
    verdict = "= UNCHANGED"

print("\n" + "="*55)
print("  FINAL COMPARISON")
print("="*55)
print(f"  Baseline  pass@1 : {b:.3f}")
print(f"  Post-SFT  pass@1 : {a:.3f}")
print(f"  Delta            : {delta:+.3f}  {verdict}")
print()
print(f"  {'Category':<12}  {'Baseline':>8}  {'Post-SFT':>8}  {'Delta':>7}")
print(f"  {'-'*12}  {'-'*8}  {'-'*8}  {'-'*7}")

# Per-category pass rates over the union of categories seen in either run.
all_cats = set(baseline_results["by_category"]) | set(postsft_results["by_category"])
for cat in sorted(all_cats):
    # A category absent from one run is shown as 0/1, i.e. a 0.00 pass rate,
    # which also keeps the division below safe.
    bv = baseline_results["by_category"].get(cat, {"p": 0, "n": 1})
    av = postsft_results["by_category"].get(cat, {"p": 0, "n": 1})
    bd, ad = bv["p"]/bv["n"], av["p"]/av["n"]
    print(f"  {cat:<12}  {bd:>8.2f}  {ad:>8.2f}  {ad-bd:>+7.2f}")
print("="*55)