# Track B – SFT Output Comparison

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/archit-spec/assesment-jp/blob/main/eval_track_b.ipynb)

Side-by-side output comparison of **baseline** (`Qwen/Qwen2.5-Coder-1.5B`) vs **fine-tuned** (`archit11/track_b_sft_merged`) on 5 test samples.

**Dataset:** [`archit11/track_b_sft`](https://huggingface.co/datasets/archit11/track_b_sft)

### Install dependencies
```bash
pip install transformers datasets torch accelerate pycodestyle
```

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Hub ids of the two checkpoints being compared and of the evaluation dataset.
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B"
FT_MODEL_ID   = "archit11/track_b_sft_merged"
DATASET_ID    = "archit11/track_b_sft"
# Upper bound on how many test rows are evaluated.
NUM_SAMPLES   = 5
# Generation budget per sample; passed as max_new_tokens in generate().
MAX_NEW_TOKENS = 256

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Take the first NUM_SAMPLES rows of the test split (fewer if the split is smaller).
ds = load_dataset(DATASET_ID, split="test")
samples = ds.select(range(min(NUM_SAMPLES, len(ds))))
print(f"Loaded {len(samples)} samples")

def load_model(model_id):
    """Load a causal language model and its tokenizer.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to the
        notebook-level ``device`` and switched to eval mode. If the tokenizer
        has no pad token, its EOS token is used as the pad token.
    """
    print(f"Loading {model_id}...")
    # NOTE: trust_remote_code=True lets the model repository run its own
    # Python code at load time; only use it with repositories you trust.
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # Half precision only pays off on GPU. On CPU it is slow and, on older
    # PyTorch builds, fails with "not implemented for 'Half'" errors.
    dtype = torch.float16 if device == "cuda" else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=dtype, trust_remote_code=True
    )
    model = model.to(device).eval()
    return model, tok

def generate(model, tok, instruction):
    """Greedy-decode a completion for a single user instruction.

    Args:
        model: Model object exposing ``generate``.
        tok: Tokenizer used to build the prompt and decode the output.
        instruction: User message text.

    Returns:
        The newly generated text only (prompt tokens removed, special tokens
        skipped), with surrounding whitespace stripped.
    """
    messages = [{"role": "user", "content": instruction}]
    try:
        text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Best-effort fallback when the tokenizer cannot apply a chat template:
        # build the ChatML-style prompt by hand. `except Exception` (rather
        # than a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
        text = f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
    # Prompts longer than 1024 tokens are truncated by the tokenizer.
    inputs = tok(text, return_tensors="pt", truncation=True, max_length=1024).to(device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS,
                             do_sample=False, pad_token_id=tok.eos_token_id)
    # Slice off the prompt so only the continuation is decoded.
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

import gc

# Baseline pass: generate for every sample, then release the model before the
# second checkpoint is loaded.
base_model, tok = load_model(BASE_MODEL_ID)
base_outputs = [generate(base_model, tok, row["instruction"]) for row in samples]
del base_model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Fine-tuned pass: same procedure with the SFT checkpoint.
ft_model, tok = load_model(FT_MODEL_ID)
ft_outputs = [generate(ft_model, tok, row["instruction"]) for row in samples]
del ft_model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Side-by-side report; instruction and outputs are clipped to keep it readable.
SEP = "=" * 70
for number, (row, base_text, ft_text) in enumerate(
    zip(samples, base_outputs, ft_outputs), start=1
):
    print(f"\n{SEP}")
    print(f"SAMPLE {number} | category: {row.get('category', 'n/a')}")
    print(SEP)
    print(f"INSTRUCTION:\n{row['instruction'][:300]}")
    print("\n--- BASE MODEL ---")
    print(base_text[:400])
    print("\n--- FINE-TUNED ---")
    print(ft_text[:400])
print(f"\n{SEP}")

### Known Results (pass@1 / pass@3)

| Metric | Baseline | Fine-Tuned | Δ (percentage points) |
|--------|----------|------------|---|
| pass@1 | 0.565 | **0.804** | +23.9 pp ↑ |
| pass@3 | 0.783 | **0.848** | +6.5 pp ↑ |

## Style Analysis – PEP8 & Docstring Quality

In [None]:
import ast, re, subprocess, sys, tempfile, os

def check_pep8(code: str) -> dict:
    """Run pycodestyle on a code string and summarise the result.

    Args:
        code: Python source text to check.

    Returns:
        A dict with two keys:
        ``violations`` is the number of lines pycodestyle reported, or ``-1``
        when the check could not be run (for example pycodestyle is not
        installed); ``details`` holds up to five report lines, or the error
        text when ``violations`` is ``-1``.
    """
    # Snippets usually arrive stripped. Without a final newline pycodestyle
    # reports W292 on every input, which reflects this harness rather than
    # the code being judged.
    if code and not code.endswith("\n"):
        code += "\n"
    fname = None
    try:
        # Explicit UTF-8 so non-ASCII model output cannot fail on the locale
        # encoding; the write sits inside the try so the file is always removed.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            fname = f.name
            f.write(code)
        r = subprocess.run(
            [sys.executable, "-m", "pycodestyle", "--max-line-length=100", fname],
            capture_output=True, text=True
        )
        lines = [line for line in r.stdout.strip().splitlines() if line]
        if r.returncode != 0 and not lines:
            # A failing exit status with nothing on stdout means the tool
            # itself did not run; do not report that as "0 violations".
            reason = r.stderr.strip().splitlines() or [
                f"pycodestyle exited with status {r.returncode}"
            ]
            return {"violations": -1, "details": reason[:5]}
        return {"violations": len(lines), "details": lines[:5]}
    except Exception as e:
        return {"violations": -1, "details": [str(e)]}
    finally:
        if fname is not None:
            os.unlink(fname)

def check_docstring(text: str) -> dict:
    """Score *text* against five simple docstring heuristics.

    The checks are: a triple-quote delimiter is present; a triple-quote is
    present and the stripped text is longer than 40 characters; an
    arguments marker, a returns marker, and an example marker each appear.

    Returns:
        A dict with ``score`` (fraction of checks passed, rounded to two
        decimals) followed by one boolean per check.
    """
    marker_patterns = {
        "args": r"Args:|Parameters:|:param ",
        "returns": r"Returns:|:returns:|Return:",
        "examples": r"Example[s]?:|>>>",
    }
    quoted = any(delim in text for delim in ('"""', "'''"))
    flags = {
        "triple_quotes": quoted,
        "summary": quoted and len(text.strip()) > 40,
    }
    for key, pattern in marker_patterns.items():
        flags[key] = re.search(pattern, text) is not None
    # "score" goes first so the key order seen by callers is unchanged.
    return {"score": round(sum(flags.values()) / len(flags), 2), **flags}

def extract_code(text: str) -> str:
    """Return the first fenced code block in *text*, stripped.

    A fence may be bare or tagged ``python``. When no complete fenced block
    is found, the whole text is returned stripped instead.
    """
    fence = re.search(r"```(?:python)?\s*\n(.*?)```", text, re.DOTALL)
    if fence is None:
        return text.strip()
    return fence.group(1).strip()

SEP = "=" * 70
print(f"{'Sample':<8} {'Model':<12} {'PEP8 violations':>16} {'Docstring score':>15}")
print("-" * 55)

# One table row per (sample, model); PEP8 runs on the extracted code, while the
# docstring heuristics run on the full model output.
for number, (_, base_text, ft_text) in enumerate(
    zip(samples, base_outputs, ft_outputs), start=1
):
    for label, output in (("Base", base_text), ("Fine-Tuned", ft_text)):
        pep8 = check_pep8(extract_code(output))
        docstr = check_docstring(output)
        count = pep8["violations"]
        viols = count if count >= 0 else "n/a"
        print(f"  [{number}]   {label:<12} {str(viols):>16} {docstr['score']:>15.2f}")
        # Show at most two report lines, keeping only the text after the
        # second colon of each line.
        for detail in pep8["details"][:2]:
            print(f"           └ {detail.split(':',2)[-1].strip()[:60]}")

print(f"\n{SEP}")
print("DOCSTRING DETAIL (Fine-Tuned only)")
print(SEP)
for number, ft_text in enumerate(ft_outputs[:len(samples)], start=1):
    report = check_docstring(ft_text)
    marks = {k: ("✓" if v else "✗") for k, v in report.items() if k != "score"}
    print(f"  [{number}] score={report['score']:.2f}  " + "  ".join(f"{k}:{v}" for k,v in marks.items()))