In [None]:
# Mount Drive so later notebooks pick up saved artifacts
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path

PROJECT_NAME = "finance-rag-microservice"
ROOT = Path("/content/drive/MyDrive") / PROJECT_NAME

# Re-running this cell must not clobber stubs that were edited on Drive.
# Set to True to regenerate every stub below from the templates in this cell.
OVERWRITE_STUBS = False

# Create the exact structure from the assignment
paths = [
    ROOT / "app",                      # FastAPI or Streamlit demo
    ROOT / "cfg",                      # dataset.yaml, retrieval.yaml, generation.yaml
    ROOT / "data",                     # raw dataset downloads here
    ROOT / "indices" / "bm25",
    ROOT / "indices" / "faiss_hnsw",
    ROOT / "artifacts",                # passages.parquet, embeddings later
    ROOT / "runs",                     # bm25.trec, hybrid_ce.trec
    ROOT / "eval" / "qrels",           # qrels/test etc.
    ROOT / "notebooks",                # 01_build_index.ipynb etc. (you'll save the Colab as this)
    ROOT / "reports",                  # ir_*.json, ragas.json, latency.csv...
    ROOT / "docs",                     # prompt_library.md, system_overview.png
    ROOT / "tests",                    # test_retrieval.py, test_grounding.py
]

for p in paths:
    p.mkdir(parents=True, exist_ok=True)


def _write_stub(path: Path, content: str) -> bool:
    """Write a placeholder file unless it already exists.

    Returns True when the file was written and False when an existing file
    was left untouched. OVERWRITE_STUBS forces the write.
    """
    if path.exists() and not OVERWRITE_STUBS:
        return False
    path.write_text(content, encoding="utf-8")
    return True


# Small placeholder files the assignment expects, plus the baseline cfg.
# (These are minimal stubs; you can expand later.)
stubs = [
    (ROOT / "README.md", "# Finance RAG Microservice\n\nSee notebooks for setup.\n"),
    (ROOT / "metrics.md", "# Metrics\n\n(Consolidated tables will be added here.)\n"),
    (
        ROOT / "Makefile",
        "install:\n\tpip install -r requirements.txt\n\n"
        "run-api:\n\tuvicorn app.main:app --reload --port 8000\n",
    ),
    (
        ROOT / "Dockerfile",
        "FROM python:3.11-slim\nWORKDIR /app\nCOPY requirements.txt .\n"
        "RUN pip install --no-cache-dir -r requirements.txt\nCOPY . .\n"
        "CMD [\"uvicorn\", \"app.main:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8000\"]\n",
    ),
    (
        ROOT / "requirements.txt",
        "\n".join([
            # minimal for this notebook
            "numpy>=1.26,<2",
            "pandas>=2.2",
            "tqdm>=4.66",
            "pyarrow>=15",
            "rank-bm25>=0.2.2",
            # future notebooks (kept here for reproducibility)
            "faiss-cpu>=1.8.0",                 # FAISS HNSW (NB02)
            "sentence-transformers>=3.0.1",     # MiniLM embeddings (NB02)
            "transformers>=4.42.0",             # Flan-T5 (NB03)
            "accelerate>=0.33.0",
            "ragas>=0.1.14",                    # RAG metrics (NB03)
            "fastapi>=0.112.0", "uvicorn>=0.30.0",  # API demo (NB05)
        ]),
    ),
    # Baseline cfg (edit later if needed)
    (
        ROOT / "cfg" / "dataset.yaml",
        "name: beir-fiqa-2018\nuse_full_corpus: true\nchunk_tokens: 512\nchunk_stride: 128\nseed: 42\nrag_eval_n: 600\n",
    ),
    (
        ROOT / "cfg" / "retrieval.yaml",
        "bm25:\n  k1: 1.2\n  b: 0.75\n  top_k_candidates: 1000\nhybrid:\n  bm25_top: 500\n  ann_top: 100\n"
        "rerank:\n  top_for_ce: 50\n  model: cross-encoder/ms-marco-MiniLM-L-6-v2\n",
    ),
    (
        ROOT / "cfg" / "generation.yaml",
        "model: google/flan-t5-base\nk_contexts: 4\nmax_new_tokens: 256\ntemperature: 0.1\nabstain_threshold: 0.2\n",
    ),
]

kept = [str(path.relative_to(ROOT)) for path, content in stubs if not _write_stub(path, content)]
if kept:
    print("Kept existing files (set OVERWRITE_STUBS = True to regenerate):", ", ".join(kept))

print("✅ Repo scaffold created at:", ROOT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Repo scaffold created at: /content/drive/MyDrive/finance-rag-microservice


In [None]:
!pip -q install datasets rank-bm25 pandas tqdm pyarrow

In [None]:
# Robust .zip download (reuses if present) — BEIR hosts fiqa.zip (not .tar.gz)
import urllib.request, os, zipfile, shutil
from pathlib import Path

# Where the FiQA archive comes from, and where it is stored/unpacked under the project root.
URL = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip"
DATA_DIR = ROOT / "data"
ZIP_PATH, EXTRACT_DIR = DATA_DIR / "fiqa.zip", DATA_DIR / "fiqa"

def download_if_needed(url: str, dest: Path, min_bytes: int = 1_000_000):
    """Download `url` to `dest` unless a large-enough copy is already there.

    The payload is first written to a sibling ``<name>.part`` file and only
    moved onto `dest` after it passes the size check, so an interrupted or
    truncated download is never left behind to be mistaken for a valid
    archive on the next run.

    Args:
        url: Location to fetch.
        dest: Final path of the downloaded file; parent directories are created.
        min_bytes: Minimum size (in bytes) for a file to count as complete.

    Raises:
        RuntimeError: If the downloaded file is smaller than `min_bytes`.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists() and dest.stat().st_size >= min_bytes:
        print(f"Found existing zip: {dest} ({dest.stat().st_size/1_048_576:.1f} MB)")
        return
    print(f"Downloading:\n  {url}\n  → {dest}")
    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        size = partial.stat().st_size
        if size < min_bytes:
            raise RuntimeError(f"Download seems incomplete (size={size} bytes).")
        # Promote the finished download in one step (overwrites a stale dest).
        partial.replace(dest)
    finally:
        # No-op after a successful replace; removes leftovers after any failure.
        partial.unlink(missing_ok=True)

def safe_extract_zip(src: Path, out_dir: Path, stale_dir: "Path | None" = None):
    """Extract the zip at `src` into `out_dir`, refusing members that escape it.

    Every member is validated before anything on disk is touched, so a
    rejected or unreadable archive leaves a previous extraction intact.

    Args:
        src: Path of the zip archive.
        out_dir: Directory to extract into (created if missing).
        stale_dir: Directory from a previous extraction to delete before
            extracting. Defaults to the notebook-level EXTRACT_DIR.

    Raises:
        ValueError: If a member would be written outside `out_dir`.
    """
    if stale_dir is None:
        stale_dir = EXTRACT_DIR
    out_root = out_dir.resolve()

    with zipfile.ZipFile(src) as z:
        # Path traversal guard. Compare path components, not string prefixes:
        # a prefix test would accept a sibling such as "<out_dir>-evil/".
        for m in z.infolist():
            extracted_path = (out_root / m.filename).resolve()
            if extracted_path != out_root and out_root not in extracted_path.parents:
                raise ValueError("Blocked path traversal during extract.")

        # Clean any previous extraction to avoid mixed states
        if stale_dir.exists():
            shutil.rmtree(stale_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        z.extractall(out_dir)

# Step 1: fetch the archive, or reuse the copy already on Drive.
download_if_needed(URL, ZIP_PATH)

# Step 2: unpack it under data/.
print(f"Extracting {ZIP_PATH.name} → {DATA_DIR} ...")
safe_extract_zip(ZIP_PATH, DATA_DIR)

# The archive may unpack into a differently named top-level folder; if data/fiqa
# is absent, take the first directory whose name starts with "fiqa" and rename it.
if not EXTRACT_DIR.exists():
    inner_dir = next(
        (entry for entry in DATA_DIR.iterdir()
         if entry.is_dir() and entry.name.lower().startswith("fiqa")),
        None,
    )
    if inner_dir is not None:
        inner_dir.rename(EXTRACT_DIR)

# Step 3: fail fast when any file the later cells read is missing.
expected = [
    EXTRACT_DIR.joinpath("corpus.jsonl"),
    EXTRACT_DIR.joinpath("queries.jsonl"),
    EXTRACT_DIR.joinpath("qrels", "test.tsv"),
]
first_missing = next((path for path in expected if not path.exists()), None)
if first_missing is not None:
    raise FileNotFoundError(f"Missing expected file: {first_missing}")
print("✅ FiQA files present:", [str(p) for p in expected])

Found existing zip: /content/drive/MyDrive/finance-rag-microservice/data/fiqa.zip (17.1 MB)
Extracting fiqa.zip → /content/drive/MyDrive/finance-rag-microservice/data ...
✅ FiQA files present: ['/content/drive/MyDrive/finance-rag-microservice/data/fiqa/corpus.jsonl', '/content/drive/MyDrive/finance-rag-microservice/data/fiqa/queries.jsonl', '/content/drive/MyDrive/finance-rag-microservice/data/fiqa/qrels/test.tsv']


In [10]:
# D) Load FiQA once, save split artifacts (train/dev/test) for later use
#    This does NOT run BM25; it just prepares data.

import json, csv
from pathlib import Path

# EXTRACT_DIR and ROOT come from earlier cells:
# EXTRACT_DIR = ROOT / "data" / "fiqa"
# ROOT = /content/drive/MyDrive/finance-rag-microservice

# 1) Full document set: doc id (as str) -> title / text / metadata.
#    Missing or null fields are normalised to "" (or {} for metadata).
with open(EXTRACT_DIR / "corpus.jsonl", "r", encoding="utf-8") as corpus_file:
    corpus = {
        str(doc["_id"]): {
            "title": doc.get("title", "") or "",
            "text": doc.get("text", "") or "",
            "metadata": doc.get("metadata", {}) or {},
        }
        for doc in map(json.loads, corpus_file)
    }

# 2) Every query regardless of split: query id (as str) -> query text.
#    Split-specific subsets are derived from the qrels further down.
with open(EXTRACT_DIR / "queries.jsonl", "r", encoding="utf-8") as queries_file:
    all_queries = {
        str(query["_id"]): query["text"]
        for query in map(json.loads, queries_file)
    }

# 3) Helper to read qrels TSV (handles header and 3- or 4-column format)
def _is_header(row):
    return bool(row) and row[0].strip().lower().startswith(("query","qid"))

def _load_qrels_file(tsv_path: Path):
    """Parse a tab-separated qrels file into ``{qid: {docid: relevance}}``.

    Accepted row layouts:
      * 3 columns: qid, docid, score
      * 4+ columns: qid, <ignored>, docid, ..., score (score is the last column)

    Blank rows, header rows, rows with fewer than 3 columns and rows whose
    score cannot be parsed as a number are skipped. Scores are truncated to
    int; a later row for the same (qid, docid) pair overwrites an earlier one.
    """
    judgments = {}
    with open(tsv_path, "r", encoding="utf-8", newline="") as handle:
        for fields in csv.reader(handle, delimiter="\t"):
            is_blank = not any(cell.strip() for cell in fields)
            if is_blank or _is_header(fields):
                continue
            width = len(fields)
            if width < 3:
                continue
            # The doc id sits in column 1 for the 3-column layout, column 2 otherwise.
            doc_col = 1 if width == 3 else 2
            qid, did, score = fields[0], fields[doc_col], fields[-1]
            try:
                rel = int(float(score))
            except ValueError:
                continue
            if qid not in judgments:
                judgments[qid] = {}
            judgments[qid][did] = rel
    return judgments

# Relevance judgments for each split shipped with the dataset.
qrels_dir = EXTRACT_DIR / "qrels"
qrels_train, qrels_dev, qrels_test = (
    _load_qrels_file(qrels_dir / filename)
    for filename in ("train.tsv", "dev.tsv", "test.tsv")
)


def _queries_for(split_qrels):
    """Keep only the queries that are judged in `split_qrels` and have text."""
    return {qid: all_queries[qid] for qid in split_qrels if qid in all_queries}


# 4) Split-filtered queries (a query belongs to a split when that split's qrels mention it)
queries_train = _queries_for(qrels_train)
queries_dev = _queries_for(qrels_dev)
queries_test = _queries_for(qrels_test)


def _count_pairs(qdict):
    """Total number of (query, doc) judgments in a qrels mapping."""
    return sum(map(len, qdict.values()))


print("Docs:", len(corpus))
print("Train queries:", len(queries_train), "qrels:", _count_pairs(qrels_train))
print("Dev   queries:", len(queries_dev),   "qrels:", _count_pairs(qrels_dev))
print("Test  queries:", len(queries_test),  "qrels:", _count_pairs(qrels_test))

# 5) Persist everything as JSON under eval/qrels/ (the scaffold's evaluation folder)
outdir = ROOT / "eval" / "qrels"
outdir.mkdir(parents=True, exist_ok=True)

json_outputs = {
    "fiqa_queries_train.json": queries_train,
    "fiqa_queries_dev.json": queries_dev,
    "fiqa_queries_test.json": queries_test,
    "fiqa_qrels_train.json": qrels_train,
    "fiqa_qrels_dev.json": qrels_dev,
    "fiqa_qrels_test.json": qrels_test,
    # Optional: the unfiltered id -> text mapping, for convenience later
    "fiqa_queries_all.json": all_queries,
}
for filename, payload in json_outputs.items():
    (outdir / filename).write_text(json.dumps(payload))


def _split_stats(split_queries, split_qrels):
    """Manifest entry for one split: query count and judgment count."""
    return {"queries": len(split_queries), "qrels_pairs": _count_pairs(split_qrels)}


# 6) Small manifest for quick reference in README/reports
manifest = {
    "docs": len(corpus),
    "splits": {
        "train": _split_stats(queries_train, qrels_train),
        "dev": _split_stats(queries_dev, qrels_dev),
        "test": _split_stats(queries_test, qrels_test),
    },
}
manifest_path = ROOT / "reports" / "fiqa_splits_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
print("Saved split JSONs →", outdir)
print("Saved manifest   →", manifest_path)

Docs: 57638
Train queries: 5500 qrels: 14166
Dev   queries: 500 qrels: 1238
Test  queries: 648 qrels: 1706
Saved split JSONs → /content/drive/MyDrive/finance-rag-microservice/eval/qrels
Saved manifest   → /content/drive/MyDrive/finance-rag-microservice/reports/fiqa_splits_manifest.json


In [11]:
from tqdm import tqdm
import pandas as pd

CHUNK_TOKENS = 512   # window length, in whitespace tokens
CHUNK_STRIDE = 128   # tokens shared by consecutive windows (the step is win - stride)

def tokenize_simple(s: str):  # simple & fast
    """Split `s` on runs of whitespace; no lowercasing or punctuation handling."""
    return s.split()

def chunk_tokens(tokens, win=CHUNK_TOKENS, stride=CHUNK_STRIDE):
    """Yield overlapping windows of at most `win` tokens.

    Consecutive windows start ``win - stride`` tokens apart, so `stride` is the
    overlap between them. The last window is cut short at the end of `tokens`,
    and an empty input yields nothing.

    Args:
        tokens: Sequence to slice (anything supporting len() and slicing).
        win: Maximum window length; must be positive.
        stride: Overlap between consecutive windows; must be smaller than `win`.

    Raises:
        ValueError: If `win` is not positive or `stride >= win`. Without this
            check the window start would never advance. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if win <= 0:
        raise ValueError(f"win must be positive, got {win}")
    if stride >= win:
        raise ValueError(f"stride ({stride}) must be smaller than win ({win})")
    step = win - stride
    i, n = 0, len(tokens)
    while i < n:
        j = min(i + win, n)
        yield tokens[i:j]
        if j >= n:
            # This window already reached the end; a further one would only repeat its tail.
            break
        i += step

# One row per passage window; documents with no tokens contribute no rows.
rows = []
for doc_id, doc in tqdm(corpus.items(), desc="Chunking docs"):
    title, body = doc.get("title") or "", doc.get("text") or ""
    doc_tokens = tokenize_simple("\n".join((title, body)).strip())
    rows.extend(
        {"doc_id": doc_id, "passage": " ".join(window)}
        for window in chunk_tokens(doc_tokens)
    )

# Persist the passages so later notebooks can reuse them without re-chunking.
passages_parquet = ROOT / "artifacts" / "passages.parquet"
passages_df = pd.DataFrame(rows)
passages_df.to_parquet(passages_parquet, index=False)
print("Saved passages:", passages_parquet, "(", len(passages_df), "rows )")

Chunking docs: 100%|██████████| 57638/57638 [00:01<00:00, 40255.87it/s]


Saved passages: /content/drive/MyDrive/finance-rag-microservice/artifacts/passages.parquet ( 59018 rows )


In [12]:
# BM25-3) Build BM25 index
from rank_bm25 import BM25Okapi

# Row i of passages_df corresponds to passage_doc_ids[i] and passage_tokens[i];
# the index is built over the whitespace tokens of each passage.
passage_doc_ids = passages_df["doc_id"].tolist()
passage_tokens = [text.split() for text in passages_df["passage"]]

bm25 = BM25Okapi(passage_tokens)
print("BM25 index over passages ready.")

BM25 index over passages ready.


In [13]:
# BM25-4) Retrieval (passage scores → max-pool per doc) and write TREC run
import numpy as np
from tqdm import tqdm

def bm25_doc_run(query_text, top_passages=2000, top_docs=1000):
    """Rank documents for one query by their best-scoring passage.

    The query is split on whitespace and scored against every passage with the
    module-level `bm25` index. The `top_passages` highest-scoring passages are
    kept, each document takes the maximum score among its kept passages, and
    the `top_docs` best documents are returned.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score.
    """
    passage_scores = bm25.get_scores(query_text.split())  # one score per passage
    keep = min(top_passages, len(passage_scores))
    best_by_doc = {}
    # argpartition selects the `keep` largest scores without sorting all of them.
    for pidx in np.argpartition(passage_scores, -keep)[-keep:]:
        doc = passage_doc_ids[pidx]
        value = float(passage_scores[pidx])
        previous = best_by_doc.get(doc)
        if previous is None or value > previous:
            best_by_doc[doc] = value
    ranking = list(best_by_doc.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[:top_docs]

# Write a TREC-format run for the test split: "qid Q0 docid rank score tag".
# The queries come from `queries_test` (built from the test qrels earlier in
# this notebook); the bare name `queries` is not defined in any cell, so using
# it only worked with leftover kernel state.
run_path = ROOT / "runs" / "bm25_test.trec"
with open(run_path, "w", encoding="utf-8") as out:
    for qid, qtext in tqdm(queries_test.items(), desc="Retrieving (BM25→doc) [test]"):
        ranked = bm25_doc_run(qtext, top_passages=2000, top_docs=1000)
        for rank, (doc_id, score) in enumerate(ranked, start=1):
            out.write(f"{qid} Q0 {doc_id} {rank} {score:.6f} bm25_passagemax\n")
print("Wrote run:", run_path)

Retrieving (BM25→doc) [test]: 100%|██████████| 648/648 [03:30<00:00,  3.08it/s]

Wrote run: /content/drive/MyDrive/finance-rag-microservice/runs/bm25_test.trec





In [14]:
# BM25-5) Metrics + report
from math import log2
import json

# Read the run back as qid -> doc ids in file order (the file is written in rank order).
# Each line must have exactly six whitespace-separated fields.
qid_to_ranked = {}
with open(run_path, "r", encoding="utf-8") as run_file:
    for run_line in run_file:
        qid, _q0, docid, _rank, _score, _tag = run_line.split()
        if qid not in qid_to_ranked:
            qid_to_ranked[qid] = []
        qid_to_ranked[qid].append(docid)

def ndcg_at_k(qrels_for_q, ranked, k=10):
    """nDCG@k with binary gains (a judgment > 0 counts as gain 1).

    Args:
        qrels_for_q: Mapping docid -> relevance for a single query.
        ranked: Doc ids in rank order.
        k: Cut-off depth.

    Returns:
        DCG@k divided by the ideal DCG@k, or 0.0 when the query has no
        relevant documents (or k is 0).
    """
    def discount(position):
        # position is 0-based, so rank 1 has discount 1 / log2(2) = 1.
        return 1 / log2(position + 2)

    dcg = sum(
        discount(pos)
        for pos, doc in enumerate(ranked[:k])
        if doc in qrels_for_q and qrels_for_q[doc] > 0
    )
    n_relevant = sum(1 for grade in qrels_for_q.values() if grade > 0)
    idcg = sum(discount(pos) for pos in range(min(k, n_relevant)))
    if idcg > 0:
        return dcg / idcg
    return 0.0

def mrr_at_k(qrels_for_q, ranked, k=10):
    """Reciprocal rank of the first relevant document within the top k.

    A document is relevant when its judgment is > 0. Returns 0.0 when none of
    the first `k` ranked documents is relevant.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, doc in enumerate(ranked[:k], start=1)
        if doc in qrels_for_q and qrels_for_q[doc] > 0
    )
    return next(reciprocal_ranks, 0.0)

def recall_at_k(qrels_for_q, ranked, k=100):
    """Fraction of the query's relevant documents found in the top k.

    Returns None (rather than 0) when the query has no document judged > 0,
    so callers can leave such queries out of the average.
    """
    relevant = {doc for doc, grade in qrels_for_q.items() if grade > 0}
    if len(relevant) == 0:
        return None
    retrieved_relevant = [doc for doc in ranked[:k] if doc in relevant]
    return len(retrieved_relevant) / len(relevant)

# Average the per-query metrics over the test split. The judgments and queries
# come from `qrels_test` / `queries_test` built earlier in this notebook; the
# bare names `qrels` and `queries` are not defined in any cell.
# Queries with no retrieved documents are skipped, not scored as 0.
n, ndcg_sum, mrr_sum, recall_sum, recall_cnt = 0, 0.0, 0.0, 0.0, 0
for qid, rels in qrels_test.items():
    ranked = qid_to_ranked.get(qid, [])
    if not ranked:
        continue
    n += 1
    ndcg_sum += ndcg_at_k(rels, ranked, k=10)
    mrr_sum  += mrr_at_k(rels, ranked, k=10)
    r = recall_at_k(rels, ranked, k=100)
    if r is not None:
        # recall is undefined for queries without relevant docs; average over the rest
        recall_sum += r
        recall_cnt += 1

metrics = {
    "n_evaluated_queries": n,
    "nDCG@10": round(ndcg_sum / n if n else 0.0, 4),
    "MRR@10":  round(mrr_sum / n if n else 0.0, 4),
    "Recall@100": round(recall_sum / recall_cnt if recall_cnt else 0.0, 4),
}
print(metrics)

report = {
    "dataset": "BEIR FiQA (test split)",
    # Taken from the constants used for chunking so the report cannot drift from them.
    "chunking": {"tokens": CHUNK_TOKENS, "stride": CHUNK_STRIDE},
    "bm25": "rank-bm25 over passages; doc score = max passage score",
    "run_file": str(run_path),
    "counts": {
        "passages": int(len(passages_df)),
        "test_queries": int(len(queries_test)),
        "test_qrels_pairs": int(sum(len(v) for v in qrels_test.values())),
    },
    "metrics": metrics,
}
rep_path = ROOT / "reports" / "ir_bm25_test.json"
rep_path.write_text(json.dumps(report, indent=2))
print("Saved:", rep_path)

{'n_evaluated_queries': 648, 'nDCG@10': 0.1365, 'MRR@10': 0.1697, 'Recall@100': 0.3203}
Saved: /content/drive/MyDrive/finance-rag-microservice/reports/ir_bm25_test.json


In [15]:
# Final recap of where this notebook's outputs were written.
print("✅ BM25 baseline (test) complete.")
print("Artifacts:")
for label, location in (
    (" - passages:", ROOT / "artifacts" / "passages.parquet"),
    (" - run file:", ROOT / "runs" / "bm25_test.trec"),
    (" - report:  ", ROOT / "reports" / "ir_bm25_test.json"),
):
    print(label, str(location))

✅ BM25 baseline (test) complete.
Artifacts:
 - passages: /content/drive/MyDrive/finance-rag-microservice/artifacts/passages.parquet
 - run file: /content/drive/MyDrive/finance-rag-microservice/runs/bm25_test.trec
 - report:   /content/drive/MyDrive/finance-rag-microservice/reports/ir_bm25_test.json
