# Track C: Self-Contained Embedding Fine-Tuning (HF Dataset)\n
\n
This notebook is fully self-contained. It: \n
1. Loads retrieval data directly from a Hugging Face dataset repo (`train.jsonl` / `test.jsonl`)\n
2. Converts rows into text→code pairs\n
3. Fine-tunes an embedding model with in-batch negatives\n
4. Evaluates **Accuracy@1**, **MRR@10**, **nDCG@10**, **Recall@10** before and after training

In [None]:
# Uncomment if needed:\n
# !pip install -q sentence-transformers datasets huggingface_hub numpy torch\n
import json
import math\n
import random\n
from pathlib import Path\n
from typing import Dict, List\n
\n
import numpy as np\n
import torch\n
from torch.utils.data import DataLoader\n
from sentence_transformers import SentenceTransformer, InputExample, losses\n
from huggingface_hub import hf_hub_download\n

## Config

In [None]:
# --- Data source -----------------------------------------------------------
# Hugging Face dataset repo that provides the two JSONL splits named below.
HF_DATASET_REPO = "archit11/assesment_embeddings_new"
HF_TRAIN_FILE = "train.jsonl"
HF_TEST_FILE = "test.jsonl"

# --- Model -----------------------------------------------------------------
# Candidate encoders: BAAI/bge-small-en-v1.5, intfloat/e5-small-v2, thenlper/gte-small
MODEL_NAME = "BAAI/bge-small-en-v1.5"

# --- Training / evaluation knobs -------------------------------------------
SEED = 42
EPOCHS = 4
BATCH_SIZE = 32
LR = 2e-5
WARMUP_RATIO = 0.1    # fraction of all optimizer steps spent warming up
TOP_K = 10            # cut-off used by the @k retrieval metrics
MAX_TRAIN_PAIRS = 0   # 0 = keep every training pair
MAX_TEST_PAIRS = 0    # 0 = keep every test pair

# --- Output location (created eagerly so later cells can write into it) ----
OUTPUT_DIR = Path("results/track_c_notebook")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed the Python, NumPy and PyTorch generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Device:", device)

## Load Dataset From HF

In [None]:
def load_jsonl_from_hf(repo_id: str, filename: str) -> List[dict]:
    """Fetch one JSONL file from a Hugging Face dataset repo and parse it.

    Args:
        repo_id: Dataset repository identifier passed to ``hf_hub_download``.
        filename: Name of the JSONL file inside that repository.

    Returns:
        One parsed JSON value per non-blank line, in file order.
    """
    cached_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=filename)
    with open(cached_file, "r", encoding="utf-8") as handle:
        # Strip each line first so whitespace-only lines are dropped, not parsed.
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]
\n
def _clean_text(value) -> str:
    """Return ``value`` as stripped text, mapping ``None`` (JSON null) to ""."""
    # str(None) would yield the literal text "None", which is truthy and would
    # otherwise slip through the emptiness checks as if it were real content.
    return "" if value is None else str(value).strip()


def to_query_code_pairs(rows: List[dict]) -> List[dict]:
    """Flatten raw dataset rows into ``{"query", "code", "meta"}`` records.

    Two row layouts are recognised:

    * ``{"query": ..., "code": ...}`` yields at most one pair.
    * ``{"anchor": ..., "queries": [...]}`` yields one pair per query, all
      sharing the anchor as their code.

    Values are converted to text and stripped. A pair is skipped when its
    query or code is missing, null, or blank.

    Args:
        rows: Parsed dataset rows (dicts).

    Returns:
        A list of dicts with keys ``"query"``, ``"code"`` and ``"meta"``;
        ``"meta"`` is the original row object the pair came from.
    """
    out = []
    for r in rows:
        if "query" in r and "code" in r:
            q = _clean_text(r["query"])
            c = _clean_text(r["code"])
            if q and c:
                out.append({"query": q, "code": c, "meta": r})
            # A row carrying both keys is never re-read as the anchor layout.
            continue

        anchor = _clean_text(r.get("anchor"))
        if not anchor:
            continue

        queries = r.get("queries") or []  # tolerate an explicit null
        if isinstance(queries, str):
            # A bare string is one query, not a sequence of characters.
            queries = [queries]
        for q in queries:
            q = _clean_text(q)
            if q:
                out.append({"query": q, "code": anchor, "meta": r})
    return out
\n
# Download and parse both splits (train first, then test).
train_raw, test_raw = (
    load_jsonl_from_hf(HF_DATASET_REPO, split_file)
    for split_file in (HF_TRAIN_FILE, HF_TEST_FILE)
)

# Flatten the raw rows into query/code pairs.
train_pairs, test_pairs = (
    to_query_code_pairs(split_rows) for split_rows in (train_raw, test_raw)
)

# A positive cap keeps only the leading pairs; 0 (or less) keeps everything.
train_pairs = train_pairs[:MAX_TRAIN_PAIRS] if MAX_TRAIN_PAIRS > 0 else train_pairs
test_pairs = test_pairs[:MAX_TEST_PAIRS] if MAX_TEST_PAIRS > 0 else test_pairs

print(f"Train pairs: {len(train_pairs)}")
print(f"Test pairs:  {len(test_pairs)}")

## Helpers: Formatting + Metrics

In [None]:
def format_query(query: str, model_name: str) -> str:
    """Strip ``query`` and add the query-side prefix chosen from ``model_name``.

    The model name is matched case-insensitively by substring: names
    containing "e5" get a ``query: `` prefix, names containing "bge" get a
    retrieval instruction prefix, and any other name gets the stripped query
    unchanged. "e5" wins when both substrings are present.
    """
    text = query.strip()
    family = model_name.lower()
    is_e5 = "e5" in family

    if not is_e5 and "bge" not in family:
        return text
    if is_e5:
        return f"query: {text}"
    return f"Represent this sentence for searching relevant code: {text}"
\n
def format_code(code: str, model_name: str) -> str:
    """Strip ``code`` and add the document-side prefix chosen from ``model_name``.

    Only model names containing "e5" (case-insensitive) get a prefix,
    ``passage: ``; every other name gets the stripped code unchanged.
    """
    snippet = code.strip()
    return f"passage: {snippet}" if "e5" in model_name.lower() else snippet
\n
def _unit_rows(emb):
    """Scale each row of a 2-D embedding array to unit L2 norm."""
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    # An all-zero row has norm 0; dividing by it would fill the similarity
    # matrix with NaN. Dividing by 1 instead leaves that row at zero.
    return emb / np.where(norms == 0, 1.0, norms)


def compute_metrics(model: SentenceTransformer, pairs: List[dict], model_name: str, k: int = 10) -> Dict[str, float]:
    """Score text-to-code retrieval of ``model`` on ``pairs``.

    Every distinct formatted code string in ``pairs`` becomes one corpus
    entry. Each query is ranked against the whole corpus by cosine
    similarity, and the code it was paired with is its single relevant item.

    Args:
        model: Encoder whose ``encode(texts, convert_to_numpy=True,
            show_progress_bar=True)`` returns one embedding row per text.
        pairs: Dicts with ``"query"`` and ``"code"`` keys.
        model_name: Forwarded to ``format_query`` / ``format_code`` to pick
            the text prefixes.
        k: Number of top-ranked codes considered per query.

    Returns:
        A dict with keys ``"Accuracy@1"``, ``"MRR@{k}"``, ``"nDCG@{k}"`` and
        ``"Recall@{k}"``, each averaged over all queries and rounded to four
        decimals.

    Raises:
        ValueError: If ``pairs`` is empty, since there is nothing to average.
    """
    if not pairs:
        raise ValueError("compute_metrics needs at least one query/code pair")

    # Build the de-duplicated code corpus and the per-query target index in a
    # single pass; corpus indices follow first-seen order.
    code_to_idx: Dict[str, int] = {}
    codes: List[str] = []
    queries: List[str] = []
    target_idx: List[int] = []
    for item in pairs:
        code = format_code(item["code"], model_name)
        idx = code_to_idx.setdefault(code, len(codes))
        if idx == len(codes):  # first time this code is seen
            codes.append(code)
        queries.append(format_query(item["query"], model_name))
        target_idx.append(idx)

    q_emb = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)
    c_emb = model.encode(codes, convert_to_numpy=True, show_progress_bar=True)

    # Cosine similarity: dot product of unit-length rows.
    sim = np.dot(_unit_rows(q_emb), _unit_rows(c_emb).T)

    targets = np.asarray(target_idx)
    # Descending similarity order per query, truncated to the top k.
    ranked = np.argsort(sim, axis=1)[:, ::-1][:, :k]

    # Each row of ``ranked`` holds distinct corpus indices, so a row contains
    # the target at most once; argmax then gives its 0-based position.
    hits = ranked == targets[:, None]
    found = hits.any(axis=1)
    ranks = hits.argmax(axis=1) + 1  # only meaningful where ``found`` is True

    acc1 = ranked[:, 0] == targets
    mrr = np.where(found, 1.0 / ranks, 0.0)
    # With one relevant item the ideal DCG is 1, so nDCG equals the DCG term.
    ndcg = np.where(found, 1.0 / np.log2(ranks + 1), 0.0)

    return {
        "Accuracy@1": round(float(np.mean(acc1)), 4),
        f"MRR@{k}": round(float(np.mean(mrr)), 4),
        f"nDCG@{k}": round(float(np.mean(ndcg)), 4),
        f"Recall@{k}": round(float(np.mean(found)), 4),
    }

## Baseline

In [None]:
# Score the untouched pretrained encoder on the test pairs; these numbers are
# the reference point for the fine-tuned model.
baseline_model = SentenceTransformer(MODEL_NAME, device=device)
baseline_metrics = compute_metrics(
    baseline_model,
    test_pairs,
    MODEL_NAME,
    k=TOP_K,
)
for _line in ("Baseline metrics:", baseline_metrics):
    print(_line)

## Fine-Tune

In [None]:
def build_train_examples(pairs: List[dict], model_name: str):
    """Wrap each query/code pair in an ``InputExample`` of two formatted texts.

    The first text is the formatted query and the second is the formatted
    code, both produced with the prefixes selected by ``model_name``.
    """
    return [
        InputExample(
            texts=[
                format_query(pair['query'], model_name),
                format_code(pair['code'], model_name),
            ]
        )
        for pair in pairs
    ]
\n
# Training data: one (query, code) example per pair, reshuffled every epoch.
# NOTE(review): one code snippet can be paired with several queries, so a
# shuffled batch may contain the same snippet more than once. With an
# in-batch-negatives loss that would score a correct match as a negative.
# Consider duplicate-free batching; confirm against the dataset first.
train_examples = build_train_examples(train_pairs, MODEL_NAME)
train_loader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# Fresh copy of the base encoder, so the baseline model is left untouched.
ft_model = SentenceTransformer(MODEL_NAME, device=device)
train_loss = losses.MultipleNegativesRankingLoss(ft_model)

# Warm up for WARMUP_RATIO of all optimizer steps (batches per epoch * epochs).
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

ft_model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    optimizer_params={"lr": LR},
    output_path=str(OUTPUT_DIR / "model"),
    show_progress_bar=True,
)

## Post-Training Accuracy Test

In [None]:
# Reload the fine-tuned weights from disk so the evaluation covers exactly
# what was saved, then score them on the same test pairs as the baseline.
model_path = OUTPUT_DIR / "model"
post_model = SentenceTransformer(str(model_path), device=device)
post_metrics = compute_metrics(
    post_model,
    test_pairs,
    MODEL_NAME,
    k=TOP_K,
)

for _line in ("Post-training metrics:", post_metrics):
    print(_line)

## Compare Baseline vs Fine-Tuned

In [None]:
def _relative_change(baseline, delta):
    """Return ``delta`` as a percentage of ``baseline``, or None if undefined.

    A zero baseline has no meaningful relative change. ``None`` is returned
    rather than infinity so the value serialises to JSON ``null`` instead of
    the non-standard ``Infinity`` token.
    """
    if baseline == 0:
        return None
    return round((delta / baseline) * 100.0, 2)


# Compare every metric before and after fine-tuning.
keys = ["Accuracy@1", f"MRR@{TOP_K}", f"nDCG@{TOP_K}", f"Recall@{TOP_K}"]
report = {}
for key in keys:
    b = baseline_metrics[key]
    p = post_metrics[key]
    d = round(p - b, 4)
    report[key] = {"baseline": b, "post": p, "delta": d, "pct": _relative_change(b, d)}

for key, v in report.items():
    pct_text = "n/a" if v["pct"] is None else f"{v['pct']}%"
    print(f"{key:<10} baseline={v['baseline']:.4f}  post={v['post']:.4f}  delta={v['delta']:+.4f}  pct={pct_text}")

# Persist the full comparison next to the saved model.
metrics_out = {
    "model": MODEL_NAME,
    "dataset_repo": HF_DATASET_REPO,
    "train_pairs": len(train_pairs),
    "test_pairs": len(test_pairs),
    "baseline": baseline_metrics,
    "post_training": post_metrics,
    "improvement": report,
}
with open(OUTPUT_DIR / "metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics_out, f, indent=2)
print("Saved:", OUTPUT_DIR / "metrics.json")