# 🧠 Track C – Embedding Fine-tuning Demo

Fine-tune `Qwen/Qwen3-Embedding-0.6B` on a synthetic text→code retrieval dataset and measure improvement on MRR@10, nDCG@10, and Recall@10.

---

## 📦 Dataset: [`archit11/code-embedding-dataset`](https://huggingface.co/datasets/archit11/code-embedding-dataset)

| Field | Detail |
|-------|--------|
| **Source** | [juspay/hyperswitch](https://github.com/juspay/hyperswitch) Rust codebase |
| **Train** | 1,736 (query, code) pairs |
| **Test** | 24 held-out retrieval examples |
| **Pair format** | `sentence_0` = natural-language query, `sentence_1` = code snippet |
| **License** | MIT |

### Data Card Summary

| Field | Detail |
|-------|--------|
| **Generation model** | `qwen/qwen3.5-397b-a17b` via OpenRouter |
| **Method** | LLM-generated NL descriptions + query variants per Rust file |
| **Code format** | `// PATH: ...\n// MODULE: ...\n// SYMBOL: ...\n<code>` |
| **Query types** | Functional ("how to X"), structural ("Y struct definition"), conceptual ("what does Z do") |

---

## 🤖 Model: [`archit11/assesment_qwen3_embedding_06b_e3`](https://huggingface.co/archit11/assesment_qwen3_embedding_06b_e3)

| Field | Detail |
|-------|--------|
| **Base** | `Qwen/Qwen3-Embedding-0.6B` |
| **Method** | Full fine-tune via `sentence-transformers` |
| **Loss** | `MultipleNegativesRankingLoss` (scale=20, cos_sim) |
| **Epochs** | ~2.3 (500 steps) for the uploaded checkpoint; the notebook below trains for `EPOCHS = 2` |
| **Final train loss** | 0.0380 |
| **Pooling** | Last-token, 1024-dim, cosine similarity |
| **Hardware** | T4 GPU |

---

## 📊 Results (Reproduced Below)

| Metric | Baseline (`Qwen3-Embedding-0.6B`) | Fine-tuned | Δ |
|--------|----------------------------------|------------|---|
| **MRR@10** | — | — | — |
| **nDCG@10** | — | — | — |
| **Recall@10** | — | — | — |

> ⚡ **Make sure Runtime → Change runtime type → T4 GPU is selected before running.**

> 📌 The table above is a placeholder: the measured values are printed by Cell 8 (final comparison) and saved to `/content/track_c_metrics.json`.

In [None]:
# Cell 1 – Install dependencies
!pip install -q sentence-transformers==3.4.1 datasets huggingface_hub
# sentence-transformers 3.x ships MultipleNegativesRankingLoss + SentenceTransformerTrainer
print("✓ Dependencies installed")

In [None]:
# Cell 2 – Imports & config
import math, time, json
import numpy as np
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

# --- Hub identifiers ---------------------------------------------------
BASE_MODEL = "Qwen/Qwen3-Embedding-0.6B"
FINETUNED_HF = "archit11/assesment_qwen3_embedding_06b_e3"
DATASET_REPO = "archit11/code-embedding-dataset"

# --- Local output location for the fine-tuned model --------------------
OUTPUT_DIR = "/content/track_c_embedding"

# --- Optimisation hyper-parameters -------------------------------------
EPOCHS = 2
BATCH_SIZE = 8  # T4-safe (0.6B model)
LR = 2e-5
WARMUP_RATIO = 0.1

# --- Compute device: CUDA when torch can see a GPU, CPU otherwise ------
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Echo the settings that matter most for reproducing the run
print(
    f"✓ Device : {DEVICE}",
    f"✓ Base   : {BASE_MODEL}",
    f"✓ Dataset: {DATASET_REPO}",
    sep="\n",
)

In [None]:
# Cell 3 – Fetch the query/code pairs from the Hugging Face Hub
print(f"Loading {DATASET_REPO} ...")
ds = load_dataset(DATASET_REPO)

# The 'train' split holds the pairs: sentence_0 is the query, sentence_1 the code
train_ds = ds["train"]
print(f"✓ Train : {len(train_ds)} pairs")
print(f"  Columns: {train_ds.column_names}")

# Retrieval test set: the published 'test' split when the dataset has one;
# otherwise the final 24 training rows are carved off and held out.
if "test" not in ds:
    n_rows = len(train_ds)
    cut = n_rows - 24
    test_ds = train_ds.select(range(cut, n_rows))
    train_ds = train_ds.select(range(cut))
    print(f"✓ Test  : {len(test_ds)} examples (held-out from train)")
else:
    test_ds = ds["test"]
    print(f"✓ Test  : {len(test_ds)} examples")

# Show the first training pair, each field truncated to 120 characters
ex = train_ds[0]
query_preview = ex["sentence_0"][:120]
code_preview = str(ex["sentence_1"])[:120]
print("\nSample pair:")
print(f"  Query : {query_preview}")
print(f"  Code  : {code_preview}...")

In [None]:
# Cell 4 – Evaluation helpers (MRR@k, nDCG@k, Recall@k)

def build_retrieval_corpus(test_data):
    """
    Turn a sequence of test examples into a one-positive-per-query benchmark.

    Example ``i`` yields query id ``q{i}`` and document id ``d{i}``; the
    document of the same example is the query's only relevant document, and
    every other example's document acts as a distractor.

    Field selection per example:
      - query: the first element of 'queries' when that field is a non-empty
        list, the 'queries' value itself when it is truthy but not a list,
        otherwise 'sentence_0'
      - code:  'anchor' when that key is present, otherwise 'sentence_1'
        (empty string if neither exists)

    Returns: (queries, corpus, relevant) where
      queries  maps qid -> query text
      corpus   maps did -> code text
      relevant maps qid -> set containing the single relevant did
    """
    queries, corpus, relevant = {}, {}, {}

    for idx, row in enumerate(test_data):
        query_id, doc_id = f"q{idx}", f"d{idx}"

        # Resolve the query text, preferring the optional 'queries' field
        candidates = row["queries"] if "queries" in row else None
        if not candidates:
            query_text = row["sentence_0"]
        elif isinstance(candidates, list):
            query_text = candidates[0]
        else:
            query_text = candidates

        # Resolve the code text, preferring the optional 'anchor' field
        if "anchor" in row:
            code_text = row["anchor"]
        else:
            code_text = row.get("sentence_1", "")

        queries[query_id] = query_text
        corpus[doc_id] = code_text
        relevant[query_id] = {doc_id}

    return queries, corpus, relevant


def _l2_normalise(embeddings):
    """Return *embeddings* as a float32 2-D array whose rows have unit L2 norm."""
    arr = np.asarray(embeddings, dtype=np.float32)
    norms = np.linalg.norm(arr, axis=1, keepdims=True)
    # Guard against all-zero rows so they stay zero instead of becoming NaN
    return arr / np.maximum(norms, 1e-12)


def compute_retrieval_metrics(model, queries, corpus, relevant, k=10, batch_size=32):
    """
    Encode queries and corpus, rank documents by cosine similarity and
    compute MRR, nDCG and Recall at cutoff ``k``.

    Args:
        model: Encoder exposing ``encode(texts, ...)`` and, optionally,
            ``encode_query`` / ``encode_document``. Each call must return one
            embedding row per input text.
        queries: Mapping qid -> query text.
        corpus: Mapping did -> document text.
        relevant: Mapping qid -> collection of relevant dids. A query that is
            missing here, or has no relevant documents, scores 0 on every
            metric instead of raising.
        k: Ranking cutoff applied to all three metrics.
        batch_size: Batch size forwarded to the encoder.

    Returns:
        Dict with keys "MRR@10", "nDCG@10", "Recall@10", "n_queries",
        "n_docs" and "k". The metric keys keep their "@10" names for
        backward compatibility even when ``k`` differs from 10; the "k"
        entry records the cutoff that was actually used. Empty query or
        corpus input yields 0.0 for every metric.
    """
    qids = list(queries)
    dids = list(corpus)

    # Nothing to rank: report zeros rather than NaN or a shape error
    if not qids or not dids:
        return {
            "MRR@10": 0.0,
            "nDCG@10": 0.0,
            "Recall@10": 0.0,
            "n_queries": len(qids),
            "n_docs": len(dids),
            "k": k,
        }

    qtexts = [queries[q] for q in qids]
    dtexts = [corpus[d] for d in dids]

    # Use encode_query / encode_document if available (Qwen3 asymmetric encoding)
    if hasattr(model, "encode_query"):
        q_embs = model.encode_query(qtexts, batch_size=batch_size, show_progress_bar=False)
        d_embs = model.encode_document(dtexts, batch_size=batch_size, show_progress_bar=False)
    else:
        q_embs = model.encode(qtexts, batch_size=batch_size, show_progress_bar=False,
                              normalize_embeddings=True)
        d_embs = model.encode(dtexts, batch_size=batch_size, show_progress_bar=False,
                              normalize_embeddings=True)

    # Normalise explicitly: the encode_query path does not request
    # normalisation, and a plain dot product is only a cosine similarity
    # when both sides are unit length.
    q_embs = _l2_normalise(q_embs)
    d_embs = _l2_normalise(d_embs)

    # Cosine similarity matrix [n_queries x n_docs]
    sims = q_embs @ d_embs.T

    mrr_scores, ndcg_scores, recall_scores = [], [], []

    for qi, qid in enumerate(qids):
        rel_dids = set(relevant.get(qid, ()))
        # Stable sort keeps tied documents in corpus order, so results are
        # reproducible across runs and platforms.
        ranked = np.argsort(-sims[qi], kind="stable")[:k]

        # 1-based ranks (within the top-k) at which a relevant doc appears
        hit_ranks = [rank for rank, di in enumerate(ranked, 1) if dids[di] in rel_dids]

        # MRR@k: reciprocal rank of the first relevant hit, 0 if none
        mrr_scores.append(1.0 / hit_ranks[0] if hit_ranks else 0.0)

        # nDCG@k with binary gains; the ideal ranking places all relevant
        # documents (capped at k) at the top.
        dcg = sum(1.0 / math.log2(rank + 1) for rank in hit_ranks)
        idcg = sum(1.0 / math.log2(rank + 1)
                   for rank in range(1, min(len(rel_dids), k) + 1))
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0.0)

        # Recall@k: fraction of relevant documents retrieved in the top-k
        recall_scores.append(len(hit_ranks) / len(rel_dids) if rel_dids else 0.0)

    return {
        "MRR@10": float(np.mean(mrr_scores)),
        "nDCG@10": float(np.mean(ndcg_scores)),
        "Recall@10": float(np.mean(recall_scores)),
        "n_queries": len(qids),
        "n_docs": len(dids),
        "k": k,
    }


def evaluate_model(model, test_data, tag, k=10):
    """
    Run the retrieval benchmark for one model and print a short report.

    Builds the query/corpus/relevance mappings from ``test_data``, computes
    the retrieval metrics at cutoff ``k``, prints them together with the wall
    time of the metric computation, and returns the metrics dict with an
    extra "tag" entry set to ``tag``.
    """
    banner = "=" * 55
    print(f"\n{banner}")
    print(f"  Evaluating: {tag}")
    print(banner)

    queries, corpus, relevant = build_retrieval_corpus(test_data)

    # Time only the encode + ranking step, not the corpus construction
    started = time.time()
    metrics = compute_retrieval_metrics(model, queries, corpus, relevant, k=k)
    elapsed = time.time() - started

    # Labels carry the requested cutoff; values are read from the fixed
    # "@10" keys that compute_retrieval_metrics returns.
    report_rows = (
        (f"MRR@{k}    ", "MRR@10"),
        (f"nDCG@{k}   ", "nDCG@10"),
        (f"Recall@{k} ", "Recall@10"),
    )
    for label, key in report_rows:
        print(f"  {label}: {metrics[key]:.4f}")
    print(f"  Corpus    : {metrics['n_queries']} queries x {metrics['n_docs']} docs")
    print(f"  Wall time : {elapsed:.1f}s")

    metrics["tag"] = tag
    return metrics


print("✓ Evaluation helpers defined")

In [None]:
# Cell 5 – Baseline: score the untouched Qwen3-Embedding-0.6B checkpoint
print(f"Loading base model: {BASE_MODEL} ...")
base_model = SentenceTransformer(BASE_MODEL, trust_remote_code=True)
base_model.to(DEVICE)

# Metrics of the base model on the held-out test set; kept for the final comparison
baseline_metrics = evaluate_model(
    model=base_model,
    test_data=test_ds,
    tag="baseline (Qwen3-Embedding-0.6B)",
)

# Drop the only reference to the model and release cached CUDA blocks so the
# fine-tuning run has the memory available.
del base_model
torch.cuda.empty_cache()
print("\n✓ GPU memory freed")

In [None]:
# Cell 6 – Fine-tune with MultipleNegativesRankingLoss
#
# MultipleNegativesRankingLoss uses the other pairs in the same batch as
# negatives for each query (in-batch negatives) – no explicit negatives needed.
# The model learns: a query embedding should be close to the embedding of its
# paired code and far from the other codes in the batch.

print(f"Loading model for fine-tuning: {BASE_MODEL} ...")
ft_model = SentenceTransformer(BASE_MODEL, trust_remote_code=True)
ft_model.to(DEVICE)

# Imported here, next to their only use, so this cell does not depend on
# names the imports cell does not provide.
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.util import cos_sim

# Loss: cosine similarity scaled by 20 before the softmax over in-batch candidates
train_loss = losses.MultipleNegativesRankingLoss(
    model=ft_model,
    scale=20.0,
    similarity_fct=cos_sim,
)

# Training arguments
training_args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LR,
    warmup_ratio=WARMUP_RATIO,
    # Mixed precision needs a CUDA device; enable it only when one is in use
    # so the CPU fallback selected via DEVICE still runs.
    fp16=(DEVICE == "cuda"),
    # Keep identical texts out of the same batch: with in-batch negatives a
    # duplicated code snippet would be scored as a negative for its own query.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    logging_steps=50,
    save_strategy="no",
    report_to="none",
    # Only relevant when several training datasets are passed; a single
    # dataset is used here.
    multi_dataset_batch_sampler="round_robin",
)

# Trainer
trainer = SentenceTransformerTrainer(
    model=ft_model,
    args=training_args,
    train_dataset=train_ds,
    loss=train_loss,
)

print(f"✓ Training on {len(train_ds)} pairs, {EPOCHS} epochs, batch={BATCH_SIZE}")
print("  Loss: MultipleNegativesRankingLoss (scale=20, in-batch negatives)")
print("\nStarting training...")
t0 = time.time()
trainer.train()
print(f"\n✓ Training complete in {time.time()-t0:.1f}s")

# Save the fine-tuned weights and sentence-transformers config locally
ft_model.save(OUTPUT_DIR)
print(f"✓ Model saved to {OUTPUT_DIR}")

In [None]:
# Cell 7 – Score the freshly fine-tuned model on the same test set
postsft_metrics = evaluate_model(model=ft_model, test_data=test_ds, tag="post fine-tuning")

# Reference point: the checkpoint already published on the Hub. This step is
# best-effort – any failure is reported and recorded as hf_metrics = None so
# the rest of the notebook can still run.
print(f"\nLoading uploaded HF model: {FINETUNED_HF} ...")
try:
    hf_model = SentenceTransformer(FINETUNED_HF, trust_remote_code=True)
    hf_model.to(DEVICE)
    hf_tag = f"HF model ({FINETUNED_HF})"
    hf_metrics = evaluate_model(model=hf_model, test_data=test_ds, tag=hf_tag)
    # Release the reference model before the next cell loads another one
    del hf_model
    torch.cuda.empty_cache()
except Exception as err:
    print(f"  (Could not load HF model: {err})")
    hf_metrics = None

In [None]:
# Cell 8 – Final comparison & failure analysis
b = baseline_metrics
a = postsft_metrics

print("\n" + "="*60)
print("  FINAL COMPARISON")
print("="*60)
print(f"  {'Metric':<12}  {'Baseline':>10}  {'Post-SFT':>10}  {'Delta':>8}")
print(f"  {'-'*12}  {'-'*10}  {'-'*10}  {'-'*8}")
for metric in ["MRR@10", "nDCG@10", "Recall@10"]:
    bv = b[metric]
    av = a[metric]
    dv = av - bv
    arrow = "↑" if dv > 0 else ("↓" if dv < 0 else "=")
    print(f"  {metric:<12}  {bv:>10.4f}  {av:>10.4f}  {dv:>+7.4f} {arrow}")
print("="*60)

# Save metrics JSON (the Hub model's metrics are included only if it was evaluated)
results = {"baseline": b, "post_sft": a}
if hf_metrics:
    results["hf_model"] = hf_metrics
# Explicit encoding so the file does not depend on the platform default
with open("/content/track_c_metrics.json", "w", encoding="utf-8") as metrics_file:
    json.dump(results, metrics_file, indent=2)
print("\n✓ Metrics saved to /content/track_c_metrics.json")

# ---- Failure Analysis ----
print("\n" + "="*60)
print("  FAILURE ANALYSIS (queries where baseline rank > 5)")
print("="*60)


def _encode_unit(model, texts, is_query):
    """Encode *texts* with *model* and return float32 rows of unit L2 norm.

    Uses encode_query / encode_document when the model provides them and
    plain encode otherwise. The rows are normalised explicitly so that the
    dot products computed below are cosine similarities on either path.
    """
    if hasattr(model, "encode_query"):
        encode = model.encode_query if is_query else model.encode_document
        embs = encode(texts, show_progress_bar=False)
    else:
        embs = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
    embs = np.asarray(embs, dtype=np.float32)
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    return embs / np.maximum(norms, 1e-12)


queries, corpus, relevant = build_retrieval_corpus(test_ds)
qids   = list(queries.keys())
dids   = list(corpus.keys())
qtexts = [queries[q] for q in qids]
dtexts = [corpus[d]  for d in dids]

# The baseline model was deleted after Cell 5, so reload it to get
# per-query ranks for the comparison.
base_model2 = SentenceTransformer(BASE_MODEL, trust_remote_code=True)
base_model2.to(DEVICE)
bq = _encode_unit(base_model2, qtexts, is_query=True)
bd = _encode_unit(base_model2, dtexts, is_query=False)

# Free the baseline copy before encoding with the fine-tuned model
del base_model2
torch.cuda.empty_cache()

aq = _encode_unit(ft_model, qtexts, is_query=True)
ad = _encode_unit(ft_model, dtexts, is_query=False)

# Cosine similarity matrices [n_queries x n_docs]
b_sims = np.dot(bq, bd.T)
a_sims = np.dot(aq, ad.T)

failures = []
for qi, qid in enumerate(qids):
    rel_dids = relevant[qid]
    b_ranked = np.argsort(-b_sims[qi])
    a_ranked = np.argsort(-a_sims[qi])

    # 1-based rank of the first relevant document; 999 marks "not found"
    b_rank = next((r+1 for r, di in enumerate(b_ranked) if dids[di] in rel_dids), 999)
    a_rank = next((r+1 for r, di in enumerate(a_ranked) if dids[di] in rel_dids), 999)

    # A "hard" query is one the baseline ranked below position 5
    if b_rank > 5:
        failures.append({
            "query":    queries[qid],
            "b_rank":   b_rank,
            "a_rank":   a_rank,
            "improved": a_rank < b_rank,
        })

print(f"  Found {len(failures)} hard queries (baseline rank > 5)")
# Report at most the first 15 hard queries
for i, failure in enumerate(failures[:15], 1):
    if failure["improved"]:
        arrow = "↑ improved"
    elif failure["a_rank"] > failure["b_rank"]:
        arrow = "↓ worse"
    else:
        arrow = "= same"
    print(f"  [{i:2d}] Baseline rank {failure['b_rank']:3d} → FT rank {failure['a_rank']:3d}  {arrow}")
    print(f"       Query: {failure['query'][:90]}")

print("\n✓ Track C complete!")

In [None]:
# Cell 9 – [Optional] Upload fine-tuned model to Hugging Face
# Uncomment the lines below to push the model trained in this notebook.
# Prefer the interactive login prompt over pasting a token into the cell, so
# the credential is not saved or shared together with the notebook.

# from huggingface_hub import login
# login()  # prompts for the access token instead of hard-coding it
# ft_model.push_to_hub("YOUR_HF_USERNAME/track_c_embedding_finetuned")
# print("✓ Model pushed to Hugging Face Hub")

print("Skipping upload (uncomment above to push to HF Hub)")