In [1]:
# ---- Simple config flags (pretrained only for now) ----
DO_TUNE_HYPERS = False       # keep OFF for this run
DO_FINE_TUNE_DENSE = False   # keep OFF for this run

# Retrieval defaults (sensible + fast)
BM25_K1, BM25_B = 1.2, 0.75
ANN_EFSEARCH = 128
FUSE_ALPHA = 0.5
CE_TOP_FOR_CE = 50  # reduce to 20 if you want extra speed

# Models
BI_ENCODER_BASE = "sentence-transformers/all-MiniLM-L6-v2"
CE_RERANKER     = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Seeds (simple, non-invasive)
import os, random, numpy as np, torch
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
assert torch.cuda.is_available(), "No GPU detected. In Colab: Runtime → Change runtime type → GPU (T4)."

Device: cuda


In [2]:
from google.colab import drive
from pathlib import Path

drive.mount('/content/drive', force_remount=False)

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
ART = ROOT / "artifacts"
EMB_DIR = ART / "embeddings"
IDX_DIR = ROOT / "indices" / "faiss_hnsw"
RUNS   = ROOT / "runs"
REPTS  = ROOT / "reports"
EVALQ  = ROOT / "eval" / "qrels"

for p in [EMB_DIR, IDX_DIR, RUNS, REPTS]:
    p.mkdir(parents=True, exist_ok=True)

print("Project root:", ROOT)

Mounted at /content/drive
Project root: /content/drive/MyDrive/finance-rag-microservice


In [3]:
# Minimal, versionless installs — play nice with Colab's preinstalled stack
!pip -q install -U sentence-transformers transformers faiss-cpu rank-bm25 tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import json, pandas as pd

# From NB1:
PASSAGES = ART / "passages.parquet"
assert PASSAGES.exists(), f"Missing {PASSAGES}. Run Notebook 01 first."

passages_df = pd.read_parquet(PASSAGES)
with open(EVALQ / "fiqa_queries_test.json") as f: queries_test = json.load(f)
with open(EVALQ / "fiqa_qrels_test.json")  as f: qrels_test   = json.load(f)

print("Passages:", len(passages_df))
print("Test queries:", len(queries_test), "Test qrels pairs:", sum(len(v) for v in qrels_test.values()))

Passages: 59018
Test queries: 648 Test qrels pairs: 1706


In [7]:
from rank_bm25 import BM25Okapi

passage_texts  = passages_df["passage"].tolist()
passage_docids = passages_df["doc_id"].tolist()
passage_tokens = [p.split() for p in passage_texts]

bm25_default = BM25Okapi(passage_tokens, k1=BM25_K1, b=BM25_B)
print("BM25 ready:", len(passage_tokens), "passages")

BM25 ready: 59018 passages


In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer

EMB_FILE = EMB_DIR / "passage_emb_allminilm.npy"

if EMB_FILE.exists():
    passage_emb = np.load(EMB_FILE)
    print("Loaded embeddings:", passage_emb.shape)
else:
    encoder = SentenceTransformer(BI_ENCODER_BASE, device=device)
    passage_emb = encoder.encode(
        passage_texts,
        batch_size=256,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True  # cosine via inner product
    ).astype("float32")
    np.save(EMB_FILE, passage_emb)
    print("Encoded & saved embeddings:", passage_emb.shape)

Loaded embeddings: (59018, 384)


In [9]:
import faiss, numpy as np

INDEX_PATH = IDX_DIR / "faiss_hnsw_ip.index"
IDS_PATH   = IDX_DIR / "passage_ids.npy"

def build_hnsw(emb, M=32, efC=200):
    d = emb.shape[1]
    index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efConstruction = efC
    index.add(emb)
    return index

if INDEX_PATH.exists() and IDS_PATH.exists():
    index = faiss.read_index(str(INDEX_PATH))
    passage_ids = np.load(IDS_PATH)
    print("Loaded FAISS index:", index.ntotal)
else:
    index = build_hnsw(passage_emb, M=32, efC=200)
    passage_ids = np.arange(len(passage_emb), dtype=np.int64)
    faiss.write_index(index, str(INDEX_PATH))
    np.save(IDS_PATH, passage_ids)
    print("Built & saved FAISS index:", index.ntotal)

Loaded FAISS index: 59018


In [10]:
# One-time: map doc_id -> list of passage row indices (fast doc lookup)
from collections import defaultdict
DOC2PIs = defaultdict(list)
for i, did in enumerate(passage_docids):
    DOC2PIs[did].append(i)

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from math import log2
import torch, numpy as np

q_encoder = SentenceTransformer(BI_ENCODER_BASE, device=device)

ce_tokenizer = AutoTokenizer.from_pretrained(CE_RERANKER)
ce_model = AutoModelForSequenceClassification.from_pretrained(CE_RERANKER).to(device).eval()

def bm25_scores_for_query(q_text, bm25_obj):
    return bm25_obj.get_scores(q_text.split())

def bm25_doc_aggregate(scores, top_passages=2000, top_docs=500):
    k = min(top_passages, len(scores))
    top_idx = np.argpartition(scores, -k)[-k:]
    doc2score, doc_best_pi = {}, {}
    for pi in top_idx:
        did = passage_docids[pi]
        s = float(scores[pi])
        if (did not in doc2score) or (s > doc2score[did]):
            doc2score[did] = s
            doc_best_pi[did] = int(pi)
    ranked = sorted(doc2score.items(), key=lambda x: x[1], reverse=True)[:top_docs]
    return ranked, doc_best_pi

def ann_search_with_doc_map(query, topk=200, efSearch=ANN_EFSEARCH):
    index.hnsw.efSearch = efSearch
    q = q_encoder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    scores, idx = index.search(q.astype("float32"), topk)
    idx = idx[0]; scores = scores[0]
    pairs = [(int(i), float(s)) for i, s in zip(idx, scores) if i != -1]
    doc_best = {}
    for pi, s in pairs:
        did = passage_docids[pi]
        if (did not in doc_best) or (s > doc_best[did][1]):
            doc_best[did] = (int(pi), float(s))
    return pairs, doc_best

def fuse_minmax_docs(bm25_ranked, bm25_best_pi, ann_doc_best, alpha=FUSE_ALPHA):
    b_dict = {d: s for d, s in bm25_ranked}
    if b_dict:
        b_vals = np.fromiter(b_dict.values(), dtype=np.float32)
        bmin, bmax = float(b_vals.min()), float(b_vals.max())
    else:
        bmin, bmax = 0.0, 1.0
    b_norm = {d: (s - bmin) / (bmax - bmin + 1e-9) for d, s in b_dict.items()}

    a_dict = {d: s for d, (pi, s) in ann_doc_best.items()}
    if a_dict:
        a_vals = np.fromiter(a_dict.values(), dtype=np.float32)
        amin, amax = float(a_vals.min()), float(a_vals.max())
    else:
        amin, amax = 0.0, 1.0
    a_norm = {d: (s - amin) / (amax - amin + 1e-9) for d, s in a_dict.items()}

    docs = set(b_dict) | set(a_dict)
    fused, doc_best_pi = [], {}
    for d in docs:
        bn = b_norm.get(d, 0.0)
        an = a_norm.get(d, 0.0)
        fused.append((d, FUSE_ALPHA * bn + (1 - FUSE_ALPHA) * an))
        doc_best_pi[d] = ann_doc_best[d][0] if d in ann_doc_best else bm25_best_pi.get(d)
    fused.sort(key=lambda x: x[1], reverse=True)
    return fused, doc_best_pi

@torch.no_grad()
def rerank_with_ce_fast(query, fused_docs, doc_best_pi, top_for_ce=CE_TOP_FOR_CE, max_len=256):
    cand = fused_docs[:top_for_ce]
    ids, pairs = [], []
    for d, _ in cand:
        pi = doc_best_pi.get(d)
        if pi is None:
            continue
        ids.append(d)
        pairs.append((query, passage_texts[pi]))
    if not pairs:
        return [d for d, _ in fused_docs[:100]]

    batch = ce_tokenizer.batch_encode_plus(
        pairs, padding=True, truncation=True, max_length=max_len, return_tensors="pt"
    ).to(device)
    scores = ce_model(**batch).logits.squeeze(-1).detach().float().cpu().numpy()
    order = scores.argsort()[::-1]
    return [ids[i] for i in order]

# Metrics (same definitions as NB1)
def ndcg_at_k(qrels_for_q, ranked, k=10):
    rels = [1 if (doc in qrels_for_q and qrels_for_q[doc] > 0) else 0 for doc in ranked[:k]]
    dcg = sum(rel / log2(i+2) for i, rel in enumerate(rels))
    ideal = [1] * min(k, sum(1 for r in qrels_for_q.values() if r > 0))
    idcg = sum(rel / log2(i+2) for i, rel in enumerate(ideal))
    return (dcg / idcg) if idcg > 0 else 0.0

def mrr_at_k(qrels_for_q, ranked, k=10):
    for i, doc in enumerate(ranked[:k], start=1):
        if doc in qrels_for_q and qrels_for_q[doc] > 0:
            return 1.0 / i
    return 0.0

def recall_at_k(qrels_for_q, ranked, k=100):
    rel_docs = {d for d, r in qrels_for_q.items() if r > 0}
    if not rel_docs:
        return None
    found = sum(1 for d in ranked[:k] if d in rel_docs)
    return found / len(rel_docs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [5]:
import json

def eval_on_test_and_write(run_path, report_path,
                           k1=BM25_K1, b=BM25_B,
                           efSearch=ANN_EFSEARCH,
                           alpha=FUSE_ALPHA,
                           top_for_ce=CE_TOP_FOR_CE):

    bm25_obj = BM25Okapi(passage_tokens, k1=k1, b=b)

    # Evaluation loop + collect final ranking per query
    qid_to_final = {}
    n, ndcg_sum, mrr_sum, recall_sum, recall_cnt = 0, 0.0, 0.0, 0.0, 0

    for qid, qtext in tqdm(queries_test.items(), desc="Evaluating (test, pretrained)"):
        # BM25 once
        p_scores = bm25_scores_for_query(qtext, bm25_obj)
        bm25_docs, bm25_best_pi = bm25_doc_aggregate(p_scores, top_passages=2000, top_docs=500)
        # ANN + map to docs
        ann_pass, ann_doc_best = ann_search_with_doc_map(qtext, topk=200, efSearch=efSearch)
        # Fusion + choose best passage per doc
        fused_docs, doc_best_pi = fuse_minmax_docs(bm25_docs, bm25_best_pi, ann_doc_best, alpha=alpha)
        # CE rerank
        final_docs = rerank_with_ce_fast(qtext, fused_docs, doc_best_pi, top_for_ce=top_for_ce)[:1000]
        qid_to_final[qid] = final_docs

        # Metrics
        rels = qrels_test.get(qid, {})
        if not final_docs:
            continue
        n += 1
        ndcg_sum += ndcg_at_k(rels, final_docs, k=10)
        mrr_sum  += mrr_at_k(rels,  final_docs, k=10)
        r = recall_at_k(rels, final_docs, k=100)
        if r is not None:
            recall_sum += r
            recall_cnt += 1

    metrics = {
        "n_evaluated_queries": n,
        "nDCG@10": round(ndcg_sum / n if n else 0.0, 4),
        "MRR@10":  round(mrr_sum / n if n else 0.0, 4),
        "Recall@100": round(recall_sum / recall_cnt if recall_cnt else 0.0, 4),
        "config": {"bm25": {"k1": k1, "b": b},
                   "ann": {"efSearch": efSearch},
                   "fusion": {"alpha": alpha},
                   "ce": {"top_for_ce": top_for_ce}}
    }

    # Write TREC run
    with open(run_path, "w", encoding="utf-8") as out:
        for qid, doclist in qid_to_final.items():
            for rank, doc_id in enumerate(doclist, start=1):
                out.write(f"{qid} Q0 {doc_id} {rank} {1.0 - rank/100000:.6f} hybrid_ce\n")

    # Write report
    report = {
        "dataset": "BEIR FiQA (test split)",
        "metrics": metrics,
        "run_file": str(run_path),
        "counts": {
            "passages": int(len(passages_df)),
            "test_queries": int(len(queries_test)),
            "test_qrels_pairs": int(sum(len(v) for v in qrels_test.values())),
        }
    }
    Path(report_path).write_text(json.dumps(report, indent=2))
    return metrics

run_path = RUNS / "hybrid_ce_pretrained_test.trec"
rep_path = REPTS / "ir_hybrid_ce_pretrained_test.json"

metrics = eval_on_test_and_write(run_path, rep_path)
print("TEST (Hybrid+CE pretrained):", metrics)
print("Run:", run_path)
print("Report:", rep_path)

NameError: name 'BM25Okapi' is not defined

In [11]:
# 9-setup) Load TRAIN/DEV splits saved in NB1
from pathlib import Path
import json

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
EVALQ = ROOT / "eval" / "qrels"

# Load split artifacts produced in NB1 Section D
with open(EVALQ / "fiqa_queries_train.json") as f: queries_train = json.load(f)
with open(EVALQ / "fiqa_qrels_train.json")  as f: qrels_train   = json.load(f)
with open(EVALQ / "fiqa_queries_dev.json")   as f: queries_dev   = json.load(f)
with open(EVALQ / "fiqa_qrels_dev.json")     as f: qrels_dev     = json.load(f)

print("Loaded splits:")
print(" - train:", len(queries_train), "queries |", sum(len(v) for v in qrels_train.values()), "qrels")
print(" - dev  :", len(queries_dev),   "queries |", sum(len(v) for v in qrels_dev.values()), "qrels")

Loaded splits:
 - train: 5500 queries | 14166 qrels
 - dev  : 500 queries | 1238 qrels


In [12]:
# 9A) Tuning grids (small, safe) + sampling caps to control runtime
from itertools import product
import random, json
import pandas as pd

# Grids — widen later if desired
BM25_GRID = [{"k1": 1.2, "b": 0.75}]               # keep BM25 fixed for now (works well)
ANN_GRID  = [{"efSearch": 64}, {"efSearch": 128}]   # HNSW search effort
FUSE_GRID = [{"alpha": 0.4}, {"alpha": 0.5}, {"alpha": 0.6}, {"alpha": 0.7}]  # weight for BM25
CE_GRID   = [{"top_for_ce": 20}, {"top_for_ce": 50}]                           # CE budget

# Evaluate on a subset of TRAIN to keep it fast
TUNE_TRAIN_MAX = 400   # ~subsample of train queries (adjust if you want)
DEV_EVAL_MAX   = None  # use all dev queries (None) or cap with int

# Build sampled dicts
def sample_queries(qdict, n=None, seed=42):
    if not n or n >= len(qdict): return qdict
    rng = random.Random(seed)
    keys = list(qdict.keys())
    rng.shuffle(keys)
    keys = keys[:n]
    return {k: qdict[k] for k in keys}

train_q_sample = sample_queries(queries_train, TUNE_TRAIN_MAX)
dev_q_sample   = sample_queries(queries_dev,   DEV_EVAL_MAX)

print("Tuning train subset:", len(train_q_sample), "dev size:", len(dev_q_sample))

Tuning train subset: 400 dev size: 500


In [13]:
# 9B) Split evaluator using the fast helpers you already have
from math import log2
from tqdm import tqdm

def eval_split(queries, qrels, bm25_params, ann_params, fuse_params, ce_params):
    k1, b = bm25_params["k1"], bm25_params["b"]
    efS = ann_params["efSearch"]
    alpha = fuse_params["alpha"]
    top_ce = ce_params["top_for_ce"]

    bm25_obj = BM25Okapi(passage_tokens, k1=k1, b=b)

    n, ndcg_sum, mrr_sum, recall_sum, recall_cnt = 0, 0.0, 0.0, 0.0, 0
    for qid, qtext in tqdm(queries.items(), desc="Evaluating split"):
        # 1) BM25 once -> doc agg
        p_scores = bm25_scores_for_query(qtext, bm25_obj)
        bm25_docs, bm25_best_pi = bm25_doc_aggregate(p_scores, top_passages=2000, top_docs=500)
        # 2) ANN + doc best
        _, ann_doc_best = ann_search_with_doc_map(qtext, topk=200, efSearch=efS)
        # 3) Fuse + choose per-doc best passage
        fused_docs, doc_best_pi = fuse_minmax_docs(bm25_docs, bm25_best_pi, ann_doc_best, alpha=alpha)
        # 4) CE rerank
        final_docs = rerank_with_ce_fast(qtext, fused_docs, doc_best_pi, top_for_ce=top_ce)

        rels = qrels.get(qid, {})
        if not final_docs:
            continue
        n += 1
        # nDCG@10
        rels10 = [1 if (d in rels and rels[d] > 0) else 0 for d in final_docs[:10]]
        dcg = sum(r / log2(i+2) for i, r in enumerate(rels10))
        idcg = sum(1 / log2(i+2) for i in range(min(10, sum(1 for v in rels.values() if v > 0))))
        ndcg_sum += (dcg / idcg) if idcg > 0 else 0.0
        # MRR@10
        mrr = 0.0
        for i, d in enumerate(final_docs[:10], start=1):
            if d in rels and rels[d] > 0:
                mrr = 1.0 / i; break
        mrr_sum += mrr
        # Recall@100
        rel_docs = {d for d, r in rels.items() if r > 0}
        if rel_docs:
            found = sum(1 for d in final_docs[:100] if d in rel_docs)
            recall_sum += found / len(rel_docs)
            recall_cnt += 1

    return {
        "n_evaluated_queries": n,
        "nDCG@10": round(ndcg_sum / n if n else 0.0, 4),
        "MRR@10":  round(mrr_sum / n if n else 0.0, 4),
        "Recall@100": round(recall_sum / recall_cnt if recall_cnt else 0.0, 4),
        "bm25": bm25_params, "ann": ann_params, "fusion": fuse_params, "ce": ce_params
    }

In [None]:
# 9C) Train-grid → pick top → Dev confirm → choose best
results_train = []
for bm25, ann, fuse, ce in product(BM25_GRID, ANN_GRID, FUSE_GRID, CE_GRID):
    m = eval_split(train_q_sample, qrels_train, bm25, ann, fuse, ce)
    results_train.append(m)

df_train = pd.DataFrame(results_train).sort_values(["nDCG@10","Recall@100","MRR@10"], ascending=False)
display(df_train.head(10))
(TRAIN_TUNE_PATH := REPTS / "tune_train_leaderboard.csv").write_text(df_train.to_csv(index=False))
print("Saved:", TRAIN_TUNE_PATH)

Evaluating split: 100%|██████████| 400/400 [02:54<00:00,  2.29it/s]
Evaluating split: 100%|██████████| 400/400 [02:45<00:00,  2.41it/s]
Evaluating split: 100%|██████████| 400/400 [02:18<00:00,  2.90it/s]
Evaluating split: 100%|██████████| 400/400 [02:42<00:00,  2.47it/s]
Evaluating split: 100%|██████████| 400/400 [02:27<00:00,  2.70it/s]
Evaluating split: 100%|██████████| 400/400 [02:56<00:00,  2.27it/s]
Evaluating split: 100%|██████████| 400/400 [02:31<00:00,  2.63it/s]
Evaluating split: 100%|██████████| 400/400 [02:43<00:00,  2.45it/s]
Evaluating split: 100%|██████████| 400/400 [02:18<00:00,  2.89it/s]
Evaluating split: 100%|██████████| 400/400 [02:49<00:00,  2.36it/s]
Evaluating split: 100%|██████████| 400/400 [02:28<00:00,  2.69it/s]
Evaluating split: 100%|██████████| 400/400 [03:01<00:00,  2.20it/s]
Evaluating split: 100%|██████████| 400/400 [02:36<00:00,  2.55it/s]
Evaluating split: 100%|██████████| 400/400 [02:56<00:00,  2.26it/s]
Evaluating split: 100%|██████████| 400/400 [02:3

Unnamed: 0,n_evaluated_queries,nDCG@10,MRR@10,Recall@100,bm25,ann,fusion,ce
0,400,0.3688,0.4483,0.4899,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.4},{'top_for_ce': 20}
2,400,0.3688,0.4483,0.4899,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.5},{'top_for_ce': 20}
4,400,0.3688,0.4483,0.4899,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.6},{'top_for_ce': 20}
6,400,0.3688,0.4483,0.4899,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.7},{'top_for_ce': 20}
8,400,0.368,0.4469,0.4895,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.4},{'top_for_ce': 20}
10,400,0.368,0.4469,0.4895,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.5},{'top_for_ce': 20}
12,400,0.368,0.4469,0.4895,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.6},{'top_for_ce': 20}
14,400,0.368,0.4469,0.4895,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.7},{'top_for_ce': 20}
9,400,0.3661,0.4394,0.5945,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.4},{'top_for_ce': 50}
11,400,0.3661,0.4394,0.5945,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.5},{'top_for_ce': 50}


Saved: /content/drive/MyDrive/finance-rag-microservice/reports/tune_train_leaderboard.csv


In [None]:
# 9D) Validate the best few on DEV and select final tuned config
topK = len(df_train)   # evaluate ALL train combos on dev (still small)
cand = df_train.head(topK).to_dict(orient="records")

results_dev = []
for row in cand:
    cfg = {"bm25": row["bm25"], "ann": row["ann"], "fusion": row["fusion"], "ce": row["ce"]}
    m = eval_split(dev_q_sample, qrels_dev, cfg["bm25"], cfg["ann"], cfg["fusion"], cfg["ce"])
    m["cfg"] = cfg
    results_dev.append(m)

df_dev = pd.DataFrame(results_dev).sort_values(
    ["Recall@100", "nDCG@10", "MRR@10"], ascending=False
)
display(df_dev)
(DEV_TUNE_PATH := REPTS / "tune_dev_leaderboard.csv").write_text(df_dev.to_csv(index=False))
print("Saved:", DEV_TUNE_PATH)

best_cfg = df_dev.iloc[0]["cfg"]
print("Chosen tuned config:", best_cfg)
(SELECTION_PATH := REPTS / "retrieval_selection.json").write_text(json.dumps(best_cfg, indent=2))

Evaluating split: 100%|██████████| 500/500 [02:54<00:00,  2.87it/s]
Evaluating split: 100%|██████████| 500/500 [03:06<00:00,  2.68it/s]
Evaluating split: 100%|██████████| 500/500 [03:02<00:00,  2.74it/s]
Evaluating split: 100%|██████████| 500/500 [03:18<00:00,  2.52it/s]
Evaluating split: 100%|██████████| 500/500 [03:18<00:00,  2.52it/s]
Evaluating split: 100%|██████████| 500/500 [03:02<00:00,  2.73it/s]
Evaluating split: 100%|██████████| 500/500 [03:16<00:00,  2.54it/s]
Evaluating split: 100%|██████████| 500/500 [03:13<00:00,  2.58it/s]
Evaluating split: 100%|██████████| 500/500 [03:27<00:00,  2.41it/s]
Evaluating split: 100%|██████████| 500/500 [03:48<00:00,  2.19it/s]
Evaluating split: 100%|██████████| 500/500 [03:42<00:00,  2.25it/s]
Evaluating split: 100%|██████████| 500/500 [03:39<00:00,  2.28it/s]
Evaluating split: 100%|██████████| 500/500 [03:49<00:00,  2.18it/s]
Evaluating split: 100%|██████████| 500/500 [03:37<00:00,  2.30it/s]
Evaluating split: 100%|██████████| 500/500 [03:4

Unnamed: 0,n_evaluated_queries,nDCG@10,MRR@10,Recall@100,bm25,ann,fusion,ce,cfg
12,500,0.3821,0.4563,0.6117,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.4},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
13,500,0.3821,0.4563,0.6117,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.5},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
14,500,0.3821,0.4563,0.6117,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.6},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
15,500,0.3821,0.4563,0.6117,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 64},{'alpha': 0.7},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
8,500,0.3834,0.4569,0.6079,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.4},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
9,500,0.3834,0.4569,0.6079,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.5},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
10,500,0.3834,0.4569,0.6079,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.6},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
11,500,0.3834,0.4569,0.6079,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.7},{'top_for_ce': 50},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
4,500,0.3841,0.4576,0.5047,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.4},{'top_for_ce': 20},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."
5,500,0.3841,0.4576,0.5047,"{'k1': 1.2, 'b': 0.75}",{'efSearch': 128},{'alpha': 0.5},{'top_for_ce': 20},"{'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSe..."


Saved: /content/drive/MyDrive/finance-rag-microservice/reports/tune_dev_leaderboard.csv
Chosen tuned config: {'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSearch': 64}, 'fusion': {'alpha': 0.4}, 'ce': {'top_for_ce': 50}}


155

In [15]:
# 9E (standalone) — load tuned config from Drive (if present) and evaluate on TEST

from pathlib import Path
import json

# Ensure ROOT, paths
try:
    ROOT  # already defined earlier
except NameError:
    ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")

REPTS = ROOT / "reports"
RUNS  = ROOT / "runs"
REPTS.mkdir(parents=True, exist_ok=True)
RUNS.mkdir(parents=True, exist_ok=True)

# Load TEST queries/qrels (and passages) if not in memory
try:
    queries_test; qrels_test; passages_df
except NameError:
    import pandas as pd
    EVALQ = ROOT / "eval" / "qrels"
    passages_df = pd.read_parquet(ROOT / "artifacts" / "passages.parquet")
    with open(EVALQ / "fiqa_queries_test.json") as f: queries_test = json.load(f)
    with open(EVALQ / "fiqa_qrels_test.json")  as f: qrels_test   = json.load(f)

# Get tuned config if available; else sensible fallback (dev-chosen style)
sel_path = REPTS / "retrieval_selection.json"
if sel_path.exists():
    best_cfg = json.loads(sel_path.read_text())
    run_path = RUNS  / "hybrid_ce_tuned_test.trec"
    rep_path = REPTS / "ir_hybrid_ce_tuned_test.json"
    print("Loaded tuned config from:", sel_path)
else:
    best_cfg = {"bm25":{"k1":1.2,"b":0.75},
                "ann":{"efSearch":64},
                "fusion":{"alpha":0.4},
                "ce":{"top_for_ce":50}}
    run_path = RUNS  / "hybrid_ce_preselected_test.trec"
    rep_path = REPTS / "ir_hybrid_ce_preselected_test.json"
    print("No tuned config found; using fallback:", best_cfg)

# Require Section 8 utilities once (for eval_on_test_and_write)
if "eval_on_test_and_write" not in globals():
    raise RuntimeError(
        "Missing utilities. Please run Section 8 (retrieval utilities + eval_on_test_and_write) once, then re-run this cell."
    )

# Evaluate on TEST with the loaded config
metrics = eval_on_test_and_write(
    run_path=run_path,
    report_path=rep_path,
    k1=best_cfg["bm25"]["k1"], b=best_cfg["bm25"]["b"],
    efSearch=best_cfg["ann"]["efSearch"],
    alpha=best_cfg["fusion"]["alpha"],
    top_for_ce=best_cfg["ce"]["top_for_ce"],
)
print("TEST metrics:", metrics)
print("Run:", run_path)
print("Report:", rep_path)

Loaded tuned config from: /content/drive/MyDrive/finance-rag-microservice/reports/retrieval_selection.json


Evaluating (test, pretrained): 100%|██████████| 648/648 [04:30<00:00,  2.39it/s]

TEST metrics: {'n_evaluated_queries': 648, 'nDCG@10': 0.3719, 'MRR@10': 0.4476, 'Recall@100': 0.5951, 'config': {'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSearch': 64}, 'fusion': {'alpha': 0.4}, 'ce': {'top_for_ce': 50}}}
Run: /content/drive/MyDrive/finance-rag-microservice/runs/hybrid_ce_tuned_test.trec
Report: /content/drive/MyDrive/finance-rag-microservice/reports/ir_hybrid_ce_tuned_test.json





In [17]:
# 10A (FAST) — Build train pairs via ANN best-passage selection (no BM25 full scan)

from pathlib import Path
import json, numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample

# Paths
ROOT  = Path("/content/drive/MyDrive/finance-rag-microservice")
EVALQ = ROOT / "eval" / "qrels"
EMB_DIR = ROOT / "artifacts" / "embeddings"
EMB_FILE = EMB_DIR / "passage_emb_allminilm.npy"

# 1) Load tuned config (optional; not strictly needed for this step)
SEL = ROOT / "reports" / "retrieval_selection.json"
if SEL.exists():
    best_cfg = json.loads(SEL.read_text())
    print("Loaded tuned config:", best_cfg)
else:
    best_cfg = {"bm25":{"k1":1.2,"b":0.75}, "ann":{"efSearch":64}, "fusion":{"alpha":0.4}, "ce":{"top_for_ce":50}}
    print("No tuned config found; using fallback:", best_cfg)

# 2) Ensure splits are loaded
try:
    queries_train; qrels_train
except NameError:
    with open(EVALQ / "fiqa_queries_train.json") as f: queries_train = json.load(f)
    with open(EVALQ / "fiqa_qrels_train.json")  as f: qrels_train   = json.load(f)
print("Train queries:", len(queries_train), "| Train qrels pairs:", sum(len(v) for v in qrels_train.values()))

# 3) Ensure passages + mapping are loaded
try:
    passages_df; passage_texts; passage_docids; DOC2PIs
except NameError:
    import pandas as pd
    passages_df = pd.read_parquet(ROOT / "artifacts" / "passages.parquet")
    passage_texts  = passages_df["passage"].tolist()
    passage_docids = passages_df["doc_id"].tolist()
    from collections import defaultdict
    DOC2PIs = defaultdict(list)
    for i, did in enumerate(passage_docids):
        DOC2PIs[did].append(i)

# 4) Load (or create) passage embeddings (pretrained)
if 'passage_emb' in globals() and isinstance(passage_emb, np.ndarray):
    pass
else:
    assert EMB_FILE.exists(), f"Missing {EMB_FILE}. Encode passages once (Section 5) before running Step 10."
    passage_emb = np.load(EMB_FILE)  # shape: (num_passages, dim)
print("Passage embeddings:", passage_emb.shape)

# 5) Ensure we have a query encoder
try:
    q_encoder
except NameError:
    q_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=("cuda" if torch.cuda.is_available() else "cpu"))

# 6) Build training pairs: for each (query, relevant doc), choose the doc’s best passage by cosine sim
train_examples = []
qids = list(queries_train.keys())
for qid in tqdm(qids, desc="Selecting best passages for train pairs"):
    qtext = queries_train.get(qid)
    if not qtext:
        continue
    qvec = q_encoder.encode([qtext], normalize_embeddings=True, convert_to_numpy=True)[0].astype("float32")  # (d,)
    rels = qrels_train.get(qid, {})
    for doc_id, rel in rels.items():
        if rel <= 0:
            continue
        pis = DOC2PIs.get(doc_id, [])
        if not pis:
            continue
        # cosine on normalized vectors = dot product
        cand = passage_emb[pis]                                  # (m, d)
        sims = cand @ qvec                                       # (m,)
        best_pi = pis[int(sims.argmax())]
        train_examples.append(InputExample(texts=[qtext, passage_texts[best_pi]]))

print("Fine-tune train pairs:", len(train_examples))

Loaded tuned config: {'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSearch': 64}, 'fusion': {'alpha': 0.4}, 'ce': {'top_for_ce': 50}}
Train queries: 5500 | Train qrels pairs: 14166
Passage embeddings: (59018, 384)


Selecting best passages for train pairs: 100%|██████████| 5500/5500 [00:35<00:00, 156.70it/s]

Fine-tune train pairs: 14131





In [19]:
# 10B — Lightweight fine-tune (1 epoch) with version-safe fit args
import torch, inspect, math
from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"
FT_EPOCHS = 1
FT_BATCH  = 64            # lower to 32 if you hit OOM
FT_MAX_STEPS = 1200       # safety cap

ft_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
train_loader = DataLoader(train_examples, shuffle=True, batch_size=FT_BATCH, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(ft_model)

steps_per_epoch = min(len(train_loader), FT_MAX_STEPS)
total_steps = steps_per_epoch * FT_EPOCHS
warmup_steps = max(1, int(0.05 * total_steps))   # 5% warmup

print(f"Training on {len(train_examples)} pairs | batch={FT_BATCH} | "
      f"steps/epoch={steps_per_epoch} | warmup_steps={warmup_steps}")

# Build kwargs based on the installed sentence-transformers signature
fit_sig = inspect.signature(ft_model.fit)
fit_kwargs = {
    "train_objectives": [(train_loader, train_loss)],
    "epochs": FT_EPOCHS,
    "steps_per_epoch": steps_per_epoch,
    "warmup_steps": warmup_steps,
    "show_progress_bar": True,
}
# Some versions support use_amp; add it only if available
if "use_amp" in fit_sig.parameters:
    fit_kwargs["use_amp"] = True

ft_model.fit(**fit_kwargs)

FT_DIR = ROOT / "artifacts" / "bi_encoder_finetuned"
ft_model.save(str(FT_DIR))
print("Saved fine-tuned model:", FT_DIR)

Training on 14131 pairs | batch=64 | steps/epoch=220 | warmup_steps=11


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mat3841[0m ([33mat3841-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Saved fine-tuned model: /content/drive/MyDrive/finance-rag-microservice/artifacts/bi_encoder_finetuned


In [20]:
# 10C — Re-embed passages and rebuild FAISS (HNSW)
# put these two lines at the TOP of 10C, before saving new embeddings
from shutil import copy2
PRETRAIN_EMB_BACKUP = EMB_DIR / "passage_emb_pretrained_backup.npy"
if (EMB_FILE.exists()) and (not PRETRAIN_EMB_BACKUP.exists()):
    copy2(EMB_FILE, PRETRAIN_EMB_BACKUP)
    print("Backed up pretrained embeddings →", PRETRAIN_EMB_BACKUP)


import numpy as np, faiss

EMB_DIR   = ROOT / "artifacts" / "embeddings"
EMB_DIR.mkdir(parents=True, exist_ok=True)
EMB_FILE  = EMB_DIR / "passage_emb_allminilm.npy"

# Encode passages with the fine-tuned model
passage_emb_ft = ft_model.encode(
    passage_texts,
    batch_size=256,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True
).astype("float32")
np.save(EMB_FILE, passage_emb_ft)
print("Saved embeddings:", EMB_FILE, passage_emb_ft.shape)

# Rebuild FAISS index
IDX_DIR   = ROOT / "indices" / "faiss_hnsw"
INDEX_PATH= IDX_DIR / "faiss_hnsw_ip.index"
IDS_PATH  = IDX_DIR / "passage_ids.npy"
IDX_DIR.mkdir(parents=True, exist_ok=True)

d = passage_emb_ft.shape[1]
index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
index.hnsw.efConstruction = 200
index.add(passage_emb_ft)
faiss.write_index(index, str(INDEX_PATH))
np.save(IDS_PATH, np.arange(len(passage_emb_ft), dtype=np.int64))
print("FAISS rebuilt:", index.ntotal)

# IMPORTANT: use the fine-tuned encoder for queries as well
q_encoder = ft_model
print("Query encoder switched to fine-tuned model.")

Backed up pretrained embeddings → /content/drive/MyDrive/finance-rag-microservice/artifacts/embeddings/passage_emb_pretrained_backup.npy


Batches:   0%|          | 0/231 [00:00<?, ?it/s]

Saved embeddings: /content/drive/MyDrive/finance-rag-microservice/artifacts/embeddings/passage_emb_allminilm.npy (59018, 384)
FAISS rebuilt: 59018
Query encoder switched to fine-tuned model.


In [21]:
# 10D — Dev evaluation (fine-tuned)
# Uses eval_split() from step 9B utilities. If it's not in memory, re-run step 9B cell once.

m_dev_ft = eval_split(
    queries_dev, qrels_dev,
    best_cfg["bm25"], best_cfg["ann"], best_cfg["fusion"], best_cfg["ce"]
)
print("DEV (after fine-tune):", m_dev_ft)

Evaluating split: 100%|██████████| 500/500 [03:43<00:00,  2.24it/s]

DEV (after fine-tune): {'n_evaluated_queries': 500, 'nDCG@10': 0.3855, 'MRR@10': 0.457, 'Recall@100': 0.6259, 'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSearch': 64}, 'fusion': {'alpha': 0.4}, 'ce': {'top_for_ce': 50}}





In [22]:
# 10E — Final TEST with tuned config + fine-tuned encoder
from pathlib import Path
import json

RUNS  = ROOT / "runs"
REPTS = ROOT / "reports"
RUNS.mkdir(parents=True, exist_ok=True)
REPTS.mkdir(parents=True, exist_ok=True)

run_ft = RUNS / "hybrid_ce_finetuned_test.trec"
rep_ft = REPTS / "ir_hybrid_ce_finetuned_test.json"

m_test_ft = eval_on_test_and_write(
    run_path=run_ft, report_path=rep_ft,
    k1=best_cfg["bm25"]["k1"], b=best_cfg["bm25"]["b"],
    efSearch=best_cfg["ann"]["efSearch"],
    alpha=best_cfg["fusion"]["alpha"],
    top_for_ce=best_cfg["ce"]["top_for_ce"]
)
print("TEST (Hybrid+CE fine-tuned):", m_test_ft)
print("Run:", run_ft)
print("Report:", rep_ft)

# Optional: summary compare vs prior reports (if present)
summary = {}
def _safe_load(path):
    try:
        return json.loads(Path(path).read_text())
    except Exception:
        return None

bm25_report   = _safe_load(REPTS / "ir_bm25_test.json")
pretr_report  = _safe_load(REPTS / "ir_hybrid_ce_pretrained_test.json")
tuned_report  = _safe_load(REPTS / "ir_hybrid_ce_tuned_test.json")

if bm25_report:  summary["bm25_test"] = bm25_report.get("metrics")
if pretr_report: summary["hybrid_ce_pretrained_test"] = pretr_report.get("metrics")
if tuned_report: summary["hybrid_ce_tuned_test"] = tuned_report.get("metrics")
summary["hybrid_ce_finetuned_test"] = m_test_ft

sum_path = REPTS / "retrieval_summary.json"
sum_path.write_text(json.dumps(summary, indent=2))
print("Saved summary →", sum_path)

Evaluating (test, pretrained): 100%|██████████| 648/648 [04:56<00:00,  2.18it/s]


TEST (Hybrid+CE fine-tuned): {'n_evaluated_queries': 648, 'nDCG@10': 0.3794, 'MRR@10': 0.4526, 'Recall@100': 0.6247, 'config': {'bm25': {'k1': 1.2, 'b': 0.75}, 'ann': {'efSearch': 64}, 'fusion': {'alpha': 0.4}, 'ce': {'top_for_ce': 50}}}
Run: /content/drive/MyDrive/finance-rag-microservice/runs/hybrid_ce_finetuned_test.trec
Report: /content/drive/MyDrive/finance-rag-microservice/reports/ir_hybrid_ce_finetuned_test.json
Saved summary → /content/drive/MyDrive/finance-rag-microservice/reports/retrieval_summary.json


In [23]:
# Freeze final retriever choice & knobs for NB3
from pathlib import Path
import json

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
FINAL = {
  "bi_encoder_model_path": "artifacts/bi_encoder_finetuned",  # use fine-tuned model
  "bm25": {"k1": 1.2, "b": 0.75},
  "ann": {"efSearch": 64, "index_path": "indices/faiss_hnsw/faiss_hnsw_ip.index"},
  "fusion": {"alpha": 0.4},
  "ce": {"model": "cross-encoder/ms-marco-MiniLM-L-6-v2", "top_for_ce": 50},
  "embeddings": {"file": "artifacts/embeddings/passage_emb_allminilm.npy"},
  "passages": {"file": "artifacts/passages.parquet"},
  "splits": {
    "queries_test": "eval/qrels/fiqa_queries_test.json",
    "qrels_test":   "eval/qrels/fiqa_qrels_test.json"
  },
  "metrics": json.loads((ROOT/"reports/ir_hybrid_ce_finetuned_test.json").read_text())["metrics"]
}
out = ROOT / "reports" / "retrieval_final_selection.json"
out.write_text(json.dumps(FINAL, indent=2))
print("Wrote:", out)

Wrote: /content/drive/MyDrive/finance-rag-microservice/reports/retrieval_final_selection.json
