In [1]:
# Section 0 — Config (REPLACE)

USE_SUBSET = False          # True for faster iteration; False for full 648
SUBSET_SIZE = 600

GEN_MODEL = "google/flan-t5-large"   # fallback: "google/flan-t5-base"
ALLOW_EXTERNAL_KNOWLEDGE = True      # allow clearly-labeled fallback beyond context

# Retrieval → context packing limits
TOP_DOCS_FOR_CONTEXT = 12            # how many top docs we consider
MAX_CONTEXT_TOKENS   = 1400          # token budget for packed context

# Generation decoding
MIN_NEW_TOKENS = 32                  # force at least ~1–2 sentences
MAX_NEW_TOKENS = 96
NUM_BEAMS      = 4

# Seeds
import os, random, numpy as np, torch
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
print("Device:", "cuda" if torch.cuda.is_available() else "cpu")

Device: cuda


In [2]:
# Minimal, versionless installs (plays nice with Colab)
!pip -q install -U transformers sentencepiece accelerate faiss-cpu sentence-transformers rank-bm25 tqdm pandas pyarrow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following

In [3]:
from google.colab import drive
from pathlib import Path

drive.mount('/content/drive', force_remount=False)

ROOT   = Path("/content/drive/MyDrive/finance-rag-microservice")
ART    = ROOT / "artifacts"
EVALQ  = ROOT / "eval" / "qrels"
RUNS   = ROOT / "runs"
REPTS  = ROOT / "reports"
IDXDIR = ROOT / "indices" / "faiss_hnsw"

for p in [RUNS, REPTS]:
    p.mkdir(parents=True, exist_ok=True)

print("Project root:", ROOT)

Mounted at /content/drive
Project root: /content/drive/MyDrive/finance-rag-microservice


In [4]:
import json, sys
SEL_FINAL = REPTS / "retrieval_final_selection.json"
assert SEL_FINAL.exists(), f"Missing {SEL_FINAL}. Finish NB2 first."

final_cfg = json.loads(SEL_FINAL.read_text())
print(json.dumps(final_cfg, indent=2))

{
  "bi_encoder_model_path": "artifacts/bi_encoder_finetuned",
  "bm25": {
    "k1": 1.2,
    "b": 0.75
  },
  "ann": {
    "efSearch": 64,
    "index_path": "indices/faiss_hnsw/faiss_hnsw_ip.index"
  },
  "fusion": {
    "alpha": 0.4
  },
  "ce": {
    "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "top_for_ce": 50
  },
  "embeddings": {
    "file": "artifacts/embeddings/passage_emb_allminilm.npy"
  },
  "passages": {
    "file": "artifacts/passages.parquet"
  },
  "splits": {
    "queries_test": "eval/qrels/fiqa_queries_test.json",
    "qrels_test": "eval/qrels/fiqa_qrels_test.json"
  },
  "metrics": {
    "n_evaluated_queries": 648,
    "nDCG@10": 0.3794,
    "MRR@10": 0.4526,
    "Recall@100": 0.6247,
    "config": {
      "bm25": {
        "k1": 1.2,
        "b": 0.75
      },
      "ann": {
        "efSearch": 64
      },
      "fusion": {
        "alpha": 0.4
      },
      "ce": {
        "top_for_ce": 50
      }
    }
  }
}


In [5]:
import pandas as pd, json, random

passages_df = pd.read_parquet(ART / "passages.parquet")
passage_texts  = passages_df["passage"].tolist()
passage_docids = passages_df["doc_id"].tolist()

with open(EVALQ / "fiqa_queries_test.json") as f: queries_test = json.load(f)
with open(EVALQ / "fiqa_qrels_test.json")  as f: qrels_test   = json.load(f)

# optional subset for faster iteration
if USE_SUBSET:
    keys = list(queries_test.keys())
    random.Random(SEED).shuffle(keys)
    keys = keys[:SUBSET_SIZE]
    queries = {k: queries_test[k] for k in keys}
else:
    queries = queries_test

print("Passages:", len(passages_df))
print("Test queries used:", len(queries))

Passages: 59018
Test queries used: 648


In [6]:
# FAISS index
import faiss, numpy as np
INDEX_PATH = ROOT / final_cfg["ann"]["index_path"]
assert INDEX_PATH.exists(), f"Missing FAISS index at {INDEX_PATH}"
index = faiss.read_index(str(INDEX_PATH))
print("FAISS index loaded:", index.ntotal)

# Map: doc_id -> list of passage indices
from collections import defaultdict
DOC2PIs = defaultdict(list)
for i, did in enumerate(passage_docids):
    DOC2PIs[did].append(i)

# BM25 over passages (for consistency with NB2 fusion & best-passage selection)
from rank_bm25 import BM25Okapi
passage_tokens = [p.split() for p in passage_texts]
bm25 = BM25Okapi(passage_tokens, k1=final_cfg["bm25"]["k1"], b=final_cfg["bm25"]["b"])
print("BM25 ready.")

# Fine-tuned query encoder
from sentence_transformers import SentenceTransformer
device = "cuda" if torch.cuda.is_available() else "cpu"
q_encoder = SentenceTransformer(str(ROOT / final_cfg["bi_encoder_model_path"]), device=device)
print("Query encoder:", final_cfg["bi_encoder_model_path"])

# Cross-encoder reranker
from transformers import AutoTokenizer, AutoModelForSequenceClassification
ce_name = final_cfg["ce"]["model"]
ce_tokenizer = AutoTokenizer.from_pretrained(ce_name)
ce_model = AutoModelForSequenceClassification.from_pretrained(ce_name).to(device).eval()
print("Cross-encoder:", ce_name)

FAISS index loaded: 59018
BM25 ready.
Query encoder: artifacts/bi_encoder_finetuned


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2


In [7]:
from math import log2
import numpy as np
import torch
from tqdm import tqdm

def bm25_scores_for_query(q_text, bm25_obj):
    return bm25_obj.get_scores(q_text.split())

def bm25_doc_aggregate(scores, top_passages=2000, top_docs=500):
    k = min(top_passages, len(scores))
    top_idx = np.argpartition(scores, -k)[-k:]
    doc2score, doc_best_pi = {}, {}
    for pi in top_idx:
        did = passage_docids[pi]
        s = float(scores[pi])
        if (did not in doc2score) or (s > doc2score[did]):
            doc2score[did] = s
            doc_best_pi[did] = int(pi)
    ranked = sorted(doc2score.items(), key=lambda x: x[1], reverse=True)[:top_docs]
    return ranked, doc_best_pi

def ann_search_with_doc_map(query, topk=200, efSearch=None):
    if efSearch is None:
        efSearch = final_cfg["ann"]["efSearch"]
    # cosine via inner product on normalized vectors
    q = q_encoder.encode([query], normalize_embeddings=True, convert_to_numpy=True).astype("float32")
    # HNSW efSearch set if available
    try:
        index.hnsw.efSearch = efSearch
    except Exception:
        pass
    scores, idx = index.search(q, topk)
    idx = idx[0]; scores = scores[0]
    pairs = [(int(i), float(s)) for i, s in zip(idx, scores) if i != -1]
    doc_best = {}
    for pi, s in pairs:
        did = passage_docids[pi]
        if (did not in doc_best) or (s > doc_best[did][1]):
            doc_best[did] = (int(pi), float(s))
    return pairs, doc_best  # list[(pi,score)], dict[doc]->(best_pi,score)

def fuse_minmax_docs(bm25_ranked, bm25_best_pi, ann_doc_best, alpha=None):
    if alpha is None:
        alpha = final_cfg["fusion"]["alpha"]
    b_dict = {d: s for d, s in bm25_ranked}
    if b_dict:
        b_vals = np.fromiter(b_dict.values(), dtype=np.float32)
        bmin, bmax = float(b_vals.min()), float(b_vals.max())
    else:
        bmin, bmax = 0.0, 1.0
    b_norm = {d: (s - bmin) / (bmax - bmin + 1e-9) for d, s in b_dict.items()}

    a_dict = {d: s for d, (pi, s) in ann_doc_best.items()}
    if a_dict:
        a_vals = np.fromiter(a_dict.values(), dtype=np.float32)
        amin, amax = float(a_vals.min()), float(a_vals.max())
    else:
        amin, amax = 0.0, 1.0
    a_norm = {d: (s - amin) / (amax - amin + 1e-9) for d, s in a_dict.items()}

    docs = set(b_dict) | set(a_dict)
    fused, doc_best_pi = [], {}
    for d in docs:
        bn = b_norm.get(d, 0.0)
        an = a_norm.get(d, 0.0)
        fused.append((d, alpha * bn + (1 - alpha) * an))
        doc_best_pi[d] = ann_doc_best[d][0] if d in ann_doc_best else bm25_best_pi.get(d)
    fused.sort(key=lambda x: x[1], reverse=True)
    return fused, doc_best_pi

@torch.no_grad()
def rerank_with_ce_fast(query, fused_docs, doc_best_pi, top_for_ce=None, max_len=256):
    if top_for_ce is None:
        top_for_ce = final_cfg["ce"]["top_for_ce"]
    cand = fused_docs[:top_for_ce]
    ids, pairs = [], []
    for d, _ in cand:
        pi = doc_best_pi.get(d)
        if pi is None:
            continue
        ids.append(d)
        pairs.append((query, passage_texts[pi]))
    if not pairs:
        return [d for d, _ in fused_docs[:100]]

    batch = ce_tokenizer.batch_encode_plus(
        pairs, padding=True, truncation=True, max_length=max_len, return_tensors="pt"
    ).to(device)
    scores = ce_model(**batch).logits.squeeze(-1).detach().float().cpu().numpy()
    order = scores.argsort()[::-1]
    return [ids[i] for i in order]

In [8]:
# Section 6 — Generator (REPLACE)
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

tok = AutoTokenizer.from_pretrained(GEN_MODEL)
if torch.cuda.is_available():
    gen_model = AutoModelForSeq2SeqLM.from_pretrained(
        GEN_MODEL, torch_dtype=torch.float16, device_map="auto"
    )
else:
    gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)

# NOTE: no device=... when using device_map="auto"
gen = pipeline("text2text-generation", model=gen_model, tokenizer=tok)

def make_rag_prompt(context, question):
    if not ALLOW_EXTERNAL_KNOWLEDGE:
        return (
            "You are a finance assistant. Use ONLY the context to answer in 1–3 sentences. "
            "If none of the context is relevant, say: I don't know. "
            "Include bracketed citations like [DOC:ID] next to claims.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n"
            "Answer:"
        )
    else:
        return (
            "You are a finance assistant. Prefer the context. If the context is insufficient, you may use general knowledge, "
            "but start that part with: 'Based on general knowledge,' and avoid fabricating numbers. "
            "If context contradicts general knowledge, prefer the context. Include [DOC:ID] citations when you use the context. "
            "Answer in 1–3 sentences.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n"
            "Answer:"
        )

def flan_generate(prompt, max_new_tokens=MAX_NEW_TOKENS, beams=NUM_BEAMS, min_new_tokens=MIN_NEW_TOKENS):
    return gen(
        prompt,
        max_new_tokens=max_new_tokens,
        min_new_tokens=min_new_tokens,   # enforce non-trivial answers
        do_sample=False,
        num_beams=beams,
        early_stopping=True,
        no_repeat_ngram_size=2,
        length_penalty=1.05,             # slight bias toward full sentences
    )[0]["generated_text"]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


In [22]:
# Override prompt: no "I don't know" instruction
def make_rag_prompt(context, question):
    if not ALLOW_EXTERNAL_KNOWLEDGE:
        # strict context preference, but no abstain instruction
        return (
            "You are a finance assistant. Use ONLY the context to answer in 1–3 sentences. "
            "Include bracketed citations like [DOC:ID] next to claims.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n"
            "Answer:"
        )
    else:
        # allow labeled fallback to general knowledge (still no 'I don't know')
        return (
            "You are a finance assistant. Prefer the context. If the context is insufficient, you may use general knowledge, "
            "and start that part with: 'Based on general knowledge,' and avoid fabricating numbers. "
            "If context contradicts general knowledge, prefer the context. Include [DOC:ID] citations when you use the context. "
            "Answer in 1–3 sentences.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n"
            "Answer:"
        )


In [23]:
# Section 7a — CE-based passage selection (ADD/REPLACE)
@torch.no_grad()
def select_passages_ce(query, final_docs, bm25_scores,
                       per_doc=3, max_total=18, max_len=256, max_context_tokens=MAX_CONTEXT_TOKENS):
    """
    For each top doc, score its passages with the cross-encoder, keep 'per_doc' best,
    then take the global top 'max_total' passages while staying under the token budget.
    """
    candidates = []
    for d in final_docs[:TOP_DOCS_FOR_CONTEXT]:
        pis = DOC2PIs.get(d, [])
        if not pis:
            continue
        # light prefilter by BM25 to keep CE cheap
        top_pis = sorted(pis, key=lambda i: bm25_scores[i], reverse=True)[:10] if bm25_scores is not None else pis[:10]
        pairs = [(query, passage_texts[pi]) for pi in top_pis]
        batch = ce_tokenizer.batch_encode_plus(pairs, padding=True, truncation=True, max_length=max_len, return_tensors="pt").to(device)
        scores = ce_model(**batch).logits.squeeze(-1).detach().float().cpu().numpy()
        order = scores.argsort()[::-1]
        for i in order[:per_doc]:
            candidates.append((d, top_pis[i], float(scores[i])))

    # global sort & pack into token budget
    candidates.sort(key=lambda x: x[2], reverse=True)
    parts, cites, used = [], [], 0
    for d, pi, _ in candidates[:max_total]:
        snippet = passage_texts[pi].strip().replace("\n", " ")
        tagged  = f"{snippet} [DOC:{d}]\n"
        tokens  = len(tok.encode(tagged))
        if used + tokens > max_context_tokens:
            break
        parts.append(tagged)
        cites.append({"doc_id": d, "passage_idx": int(pi)})
        used += tokens
    return "".join(parts), cites

In [24]:
# # Section 7 — Generate with CE-picked passages & min length (REPLACE)
# import json
# from tqdm import tqdm

# pred_path = RUNS / ("rag_predictions_test_cepass_minlen.jsonl" if not USE_SUBSET
#                     else "rag_predictions_test_subset_cepass_minlen.jsonl")
# if pred_path.exists():
#     pred_path.unlink()

# with open(pred_path, "w", encoding="utf-8") as out:
#     for qid, qtext in tqdm(queries.items(), desc="RAG generating (CE passages + minlen)"):
#         # 1) retrieve (same as NB2)
#         p_scores = bm25_scores_for_query(qtext, bm25)
#         bm25_docs, bm25_best_pi = bm25_doc_aggregate(p_scores, top_passages=2000, top_docs=500)
#         _, ann_doc_best = ann_search_with_doc_map(qtext, topk=200, efSearch=final_cfg["ann"]["efSearch"])
#         fused_docs, doc_best_pi = fuse_minmax_docs(bm25_docs, bm25_best_pi, ann_doc_best, alpha=final_cfg["fusion"]["alpha"])
#         final_docs = rerank_with_ce_fast(qtext, fused_docs, doc_best_pi, top_for_ce=final_cfg["ce"]["top_for_ce"])

#         # 2) build richer context with multiple CE-ranked passages
#         context, citations = select_passages_ce(qtext, final_docs, bm25_scores=p_scores, per_doc=3, max_total=18)

#         # 3) generate
#         prompt = make_rag_prompt(context, qtext)
#         answer = flan_generate(prompt)

#         out.write(json.dumps({
#             "qid": qid,
#             "question": qtext,
#             "answer": answer,
#             "doc_ids": final_docs[:TOP_DOCS_FOR_CONTEXT],
#             "citations": citations,
#             "context_chars": len(context),
#             "mode": "knowledge_plus_context" if ALLOW_EXTERNAL_KNOWLEDGE else "context_only"
#         }) + "\n")

# print("Saved predictions →", pred_path)

In [25]:
# Section 7 — Generate (no-IDK prompt) → new output file
import json
from tqdm import tqdm

pred_path = RUNS / ("rag_predictions_test_cepass_minlen_noidk.jsonl" if not USE_SUBSET
                    else "rag_predictions_test_subset_cepass_minlen_noidk.jsonl")
if pred_path.exists():
    pred_path.unlink()

with open(pred_path, "w", encoding="utf-8") as out:
    for qid, qtext in tqdm(queries.items(), desc="RAG generating (no-IDK prompt)"):
        # 1) retrieve
        p_scores = bm25_scores_for_query(qtext, bm25)
        bm25_docs, bm25_best_pi = bm25_doc_aggregate(p_scores, top_passages=2000, top_docs=500)
        _, ann_doc_best = ann_search_with_doc_map(qtext, topk=200, efSearch=final_cfg["ann"]["efSearch"])
        fused_docs, doc_best_pi = fuse_minmax_docs(bm25_docs, bm25_best_pi, ann_doc_best, alpha=final_cfg["fusion"]["alpha"])
        final_docs = rerank_with_ce_fast(qtext, fused_docs, doc_best_pi, top_for_ce=final_cfg["ce"]["top_for_ce"])

        # 2) richer context (CE-picked passages)
        context, citations = select_passages_ce(qtext, final_docs, bm25_scores=p_scores, per_doc=3, max_total=18)

        # 3) generate
        prompt = make_rag_prompt(context, qtext)
        answer = flan_generate(prompt)

        out.write(json.dumps({
            "qid": qid,
            "question": qtext,
            "answer": answer,
            "doc_ids": final_docs[:TOP_DOCS_FOR_CONTEXT],
            "citations": citations,
            "context_chars": len(context),
            "mode": ("knowledge_plus_context" if ALLOW_EXTERNAL_KNOWLEDGE else "context_only"),
            "prompt_variant": "no_idk"
        }) + "\n")

print("Saved predictions →", pred_path)


RAG generating (no-IDK prompt): 100%|██████████| 648/648 [35:49<00:00,  3.32s/it]

Saved predictions → /content/drive/MyDrive/finance-rag-microservice/runs/rag_predictions_test_cepass_minlen_noidk.jsonl





In [26]:
# Bridge for eval — use the no-IDK predictions file & distinct report name
from pathlib import Path
ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")

pred_path = ROOT / "runs" / ("rag_predictions_test_cepass_minlen_noidk.jsonl" if not USE_SUBSET
                             else "rag_predictions_test_subset_cepass_minlen_noidk.jsonl")
rep_path  = ROOT / "reports" / ("rag_eval_test_cepass_minlen_noidk.json" if not USE_SUBSET
                                else "rag_eval_test_subset_cepass_minlen_noidk.json")

print("Pred file for eval:", pred_path, "| exists:", pred_path.exists())


Pred file for eval: /content/drive/MyDrive/finance-rag-microservice/runs/rag_predictions_test_cepass_minlen_noidk.jsonl | exists: True


In [27]:
# # Bridge: tell Section 8 which predictions file to evaluate (ADD just before your eval)
# from pathlib import Path
# ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
# pred_path = ROOT / "runs" / ("rag_predictions_test_cepass_minlen.jsonl" if not USE_SUBSET
#                              else "rag_predictions_test_subset_cepass_minlen.jsonl")
# print("Pred file for eval:", pred_path, "| exists:", pred_path.exists())

# # (Optional) distinct report name
# rep_path = ROOT / "reports" / ("rag_eval_test_cepass_minlen.json" if not USE_SUBSET
#                                else "rag_eval_test_subset_cepass_minlen.json")

In [28]:
# Optional: different eval report name for CE-passages run
rep_path = ROOT / "reports" / ("rag_eval_test_cepass.json" if not USE_SUBSET
                               else "rag_eval_test_subset_cepass.json")


import json, re, string
from collections import Counter

# Load predictions
preds = []
with open(pred_path, "r", encoding="utf-8") as f:
    for line in f:
        preds.append(json.loads(line))

def support_at_k(pred, qrels, k=TOP_DOCS_FOR_CONTEXT):
    """1 if any cited doc (or top-k doc) is relevant in qrels."""
    qid = pred["qid"]
    rels = qrels.get(qid, {})
    if not rels:
        return 0.0
    doc_ids = pred["doc_ids"][:k]
    return 1.0 if any((d in rels and rels[d] > 0) for d in doc_ids) else 0.0

def evidence_recall_at_k(pred, qrels, k=TOP_DOCS_FOR_CONTEXT):
    """Fraction of relevant doc_ids present in top-k doc_ids."""
    qid = pred["qid"]
    rels = qrels.get(qid, {})
    rel_docs = {d for d, r in rels.items() if r > 0}
    if not rel_docs:
        return None
    doc_ids = set(pred["doc_ids"][:k])
    return len(rel_docs & doc_ids) / len(rel_docs)

# simple tokenizer
PUNCT = set(string.punctuation)
def simple_tokens(s):
    s = s.lower()
    s = "".join(ch for ch in s if ch not in PUNCT)
    return [t for t in s.split() if t]

STOP = set("""
a an the and or but if while to for of in on at by from with as is are was were be been being this that these those
""".split())

def context_containment(pred, passages_map):
    """% of answer tokens that appear in the context (ignoring stopwords)."""
    ans = pred["answer"]
    toks = [t for t in simple_tokens(ans) if t not in STOP]
    if not toks:
        return 0.0
    # Build context text from cited passages
    ctx_parts = []
    for c in pred["citations"]:
        pi = c["passage_idx"]
        ctx_parts.append(passages_map[pi])
    ctx_text = " ".join(ctx_parts).lower()
    ctx_set = set(simple_tokens(ctx_text))
    hit = sum(1 for t in toks if t in ctx_set)
    return hit / len(toks)

# Build index: passage_idx -> text
passages_map = {i: passage_texts[i] for i in range(len(passage_texts))}

# Compute metrics
sup, evr, cct = [], [], []
for p in preds:
    sup.append(support_at_k(p, qrels_test))
    r = evidence_recall_at_k(p, qrels_test)
    if r is not None: evr.append(r)
    cct.append(context_containment(p, passages_map))

report = {
    "n_predictions": len(preds),
    "Support@{k}".format(k=TOP_DOCS_FOR_CONTEXT): round(float(np.mean(sup)), 4) if sup else None,
    "EvidenceRecall@{k}".format(k=TOP_DOCS_FOR_CONTEXT): round(float(np.mean(evr)), 4) if evr else None,
    "ContextContainment": round(float(np.mean(cct)), 4) if cct else None,
    "notes": {
        "support_at_k": "Any top-k doc is relevant per qrels",
        "evidence_recall_at_k": "Fraction of relevant doc_ids present in top-k retrieved docs",
        "context_containment": "Share of answer tokens found in concatenated cited passages"
    }
}
print(json.dumps(report, indent=2))

rep_path = REPTS / ("rag_eval_test.json" if not USE_SUBSET else "rag_eval_test_subset.json")
rep_path.write_text(json.dumps(report, indent=2))
print("Saved eval →", rep_path)

{
  "n_predictions": 648,
  "Support@12": 0.6852,
  "EvidenceRecall@12": 0.4771,
  "ContextContainment": 0.9139,
  "notes": {
    "support_at_k": "Any top-k doc is relevant per qrels",
    "evidence_recall_at_k": "Fraction of relevant doc_ids present in top-k retrieved docs",
    "context_containment": "Share of answer tokens found in concatenated cited passages"
  }
}
Saved eval → /content/drive/MyDrive/finance-rag-microservice/reports/rag_eval_test.json


In [29]:
rag_cfg = {
  "retrieval": final_cfg,
  "generation": {
    "model": GEN_MODEL,
    "max_new_tokens": MAX_NEW_TOKENS,
    "min_new_tokens": MIN_NEW_TOKENS,
    "num_beams": NUM_BEAMS,
    "max_context_tokens": MAX_CONTEXT_TOKENS,
    "top_docs_for_context": TOP_DOCS_FOR_CONTEXT,
    "prompt_style": "context-only, abstain with 'I don't know'"
  },
  "run": {
    "use_subset": USE_SUBSET,
    "subset_size": SUBSET_SIZE if USE_SUBSET else None
  }
}
cfg_path = REPTS / "rag_config.json"
cfg_path.write_text(json.dumps(rag_cfg, indent=2))
print("Saved:", cfg_path)

Saved: /content/drive/MyDrive/finance-rag-microservice/reports/rag_config.json


In [14]:
from pathlib import Path
import json
ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
print((ROOT/"reports/rag_eval_test.json").read_text())


{
  "n_predictions": 648,
  "Support@12": 0.6852,
  "EvidenceRecall@12": 0.4771,
  "ContextContainment": 0.9205,
  "notes": {
    "support_at_k": "Any top-k doc is relevant per qrels",
    "evidence_recall_at_k": "Fraction of relevant doc_ids present in top-k retrieved docs",
    "context_containment": "Share of answer tokens found in concatenated cited passages"
  }
}


In [30]:
from pathlib import Path
import json

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
pred_path = ROOT / "runs" / "rag_predictions_test_cepass_minlen_noidk.jsonl"  # or *_subset.jsonl

rows = []
with open(pred_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 5: break
        rows.append(json.loads(line))

for r in rows:
    print("\nQID:", r["qid"])
    print("Q:", r["question"])
    print("A:", r["answer"])
    print("Doc IDs (top):", r["doc_ids"][:5])
    print("Citations:", r["citations"][:3], "| Context chars:", r["context_chars"])



QID: 8
Q: How to deposit a cheque issued to an associate in my business into my business account?
A: Based on general knowledge, and a state-issued ""dba"" certificate (from the county clerk's office) as well as an Employer ID Number (EIN) issued by the IRS
Doc IDs (top): ['65404', '564553', '580624', '316359', '508754']
Citations: [{'doc_id': '65404', 'passage_idx': 6465}, {'doc_id': '564553', 'passage_idx': 55527}, {'doc_id': '580624', 'passage_idx': 57092}] | Context chars: 5780

QID: 15
Q: Can I send a money order from USPS as a business?
A: Based on general knowledge, but it's an antiquated system. So much pollution, so much waste, and for what? So that you can get junk mail and businesses can continue to send you paper bills
Doc IDs (top): ['325273', '224000', '420483', '229251', '204288']
Citations: [{'doc_id': '325273', 'passage_idx': 32243}, {'doc_id': '224000', 'passage_idx': 22090}, {'doc_id': '420483', 'passage_idx': 41366}] | Context chars: 5571

QID: 18
Q: 1 EIN doing bu

In [31]:
from pathlib import Path
import json, statistics as stats

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
pred_path = ROOT / "runs" / "rag_predictions_test_cepass_minlen_noidk.jsonl"  # or *_subset.jsonl
qrels = json.loads((ROOT / "eval" / "qrels" / "fiqa_qrels_test.json").read_text())

preds = [json.loads(l) for l in open(pred_path, "r", encoding="utf-8")]

def support_ok(p, k=10):
    rels = qrels.get(p["qid"], {})
    return any((d in rels and rels[d] > 0) for d in p["doc_ids"][:k])

abstain = sum(1 for p in preds if "i don't know" in p["answer"].lower()) / len(preds)
avg_len = stats.mean(len(p["answer"].split()) for p in preds)
avg_ctx = stats.mean(p["context_chars"] for p in preds)
misses  = [p for p in preds if not support_ok(p)]

print({
    "abstain_rate": round(abstain, 4),
    "avg_answer_tokens": round(avg_len, 1),
    "avg_context_chars": round(avg_ctx, 1),
    "n_predictions": len(preds),
    "n_support_misses": len(misses),
})

for p in misses[:3]:
    print("\nMISS QID:", p["qid"])
    print("Q:", p["question"])
    print("A:", p["answer"][:200])
    print("Doc IDs:", p["doc_ids"][:5])

{'abstain_rate': 0.0031, 'avg_answer_tokens': 30.6, 'avg_context_chars': 5137.7, 'n_predictions': 648, 'n_support_misses': 221}

MISS QID: 18
Q: 1 EIN doing business under multiple business names
A: According to the IRS Publication 1635, Understanding your EIN (PDF), under ""What is an EINT?"" on page 2: Caution: An EIND is for use in connection with your business activities only
Doc IDs: ['377152', '348480', '357520', '431685', '203820']

MISS QID: 34
Q: 401k Transfer After Business Closure
A: In general, it's best to transfer the account to an IRA after separation from the company to avoid the issues both of my esteemed colleagues have referenced.
Doc IDs: ['458917', '551545', '411354', '591168', '483268']

MISS QID: 42
Q: What are the ins/outs of writing equipment purchases off as business expenses in a home based business?
A: If you start a sole-proprietorship in your own name there are all sorts of things you can write off as long as there is reasonable expectation of profit.
Doc 

In [17]:
from pathlib import Path
import json

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
print((ROOT / "reports" / "retri eval_final_selection.json".replace(" ", "")).read_text())  # or the exact path:
print((ROOT / "reports" / "retrieval_final_selection.json").read_text())
print((ROOT / "reports" / "rag_config.json").read_text())


{
  "bi_encoder_model_path": "artifacts/bi_encoder_finetuned",
  "bm25": {
    "k1": 1.2,
    "b": 0.75
  },
  "ann": {
    "efSearch": 64,
    "index_path": "indices/faiss_hnsw/faiss_hnsw_ip.index"
  },
  "fusion": {
    "alpha": 0.4
  },
  "ce": {
    "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "top_for_ce": 50
  },
  "embeddings": {
    "file": "artifacts/embeddings/passage_emb_allminilm.npy"
  },
  "passages": {
    "file": "artifacts/passages.parquet"
  },
  "splits": {
    "queries_test": "eval/qrels/fiqa_queries_test.json",
    "qrels_test": "eval/qrels/fiqa_qrels_test.json"
  },
  "metrics": {
    "n_evaluated_queries": 648,
    "nDCG@10": 0.3794,
    "MRR@10": 0.4526,
    "Recall@100": 0.6247,
    "config": {
      "bm25": {
        "k1": 1.2,
        "b": 0.75
      },
      "ann": {
        "efSearch": 64
      },
      "fusion": {
        "alpha": 0.4
      },
      "ce": {
        "top_for_ce": 50
      }
    }
  }
}
{
  "bi_encoder_model_path": "artifacts/bi_enco

In [32]:
# Save eval for the current run (no-IDK prompt + CE-picked passages)
from pathlib import Path
import json, numpy as np, statistics as stats, string

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
pred_path = ROOT / "runs" / "rag_predictions_test_cepass_minlen_noidk.jsonl"
rep_path  = ROOT / "reports" / "rag_eval_test_cepass_minlen_noidk.json"
assert pred_path.exists(), f"Missing predictions: {pred_path}"

# load data
preds = [json.loads(l) for l in open(pred_path, "r", encoding="utf-8")]
qrels_test = json.loads((ROOT / "eval" / "qrels" / "fiqa_qrels_test.json").read_text())

# helper funcs (same as earlier)
def support_at_k(pred, qrels, k):
    rels = qrels.get(pred["qid"], {})
    if not rels: return 0.0
    doc_ids = pred["doc_ids"][:k]
    return 1.0 if any((d in rels and rels[d] > 0) for d in doc_ids) else 0.0

def evidence_recall_at_k(pred, qrels, k):
    rels = qrels.get(pred["qid"], {})
    rel_docs = {d for d, r in rels.items() if r > 0}
    if not rel_docs: return None
    doc_ids = set(pred["doc_ids"][:k])
    return len(rel_docs & doc_ids) / len(rel_docs)

PUNCT = set(string.punctuation)
def simple_tokens(s):
    s = s.lower()
    s = "".join(ch for ch in s if ch not in PUNCT)
    return [t for t in s.split() if t]
STOP = set("a an the and or but if while to for of in on at by from with as is are was were be been being this that these those".split())

# Build index passage_idx -> text for cited passages
import pandas as pd
passages_df = pd.read_parquet(ROOT / "artifacts" / "passages.parquet")
passage_texts = passages_df["passage"].tolist()

def context_containment(pred):
    ans = pred["answer"]
    toks = [t for t in simple_tokens(ans) if t not in STOP]
    if not toks: return 0.0
    ctx = " ".join(passage_texts[c["passage_idx"]] for c in pred["citations"]).lower()
    ctxset = set(simple_tokens(ctx))
    return sum(1 for t in toks if t in ctxset) / len(toks)

# use the same k you retrieved/packed with
K = 12

sup = [support_at_k(p, qrels_test, K) for p in preds]
evr = [x for p in preds if (x:=evidence_recall_at_k(p, qrels_test, K)) is not None]
cct = [context_containment(p) for p in preds]

report = {
    "n_predictions": len(preds),
    f"Support@{K}": round(float(np.mean(sup)), 4),
    f"EvidenceRecall@{K}": round(float(np.mean(evr)), 4) if evr else None,
    "ContextContainment": round(float(np.mean(cct)), 4) if cct else None,
}

rep_path.write_text(json.dumps(report, indent=2))
print("Saved eval →", rep_path)
print(json.dumps(report, indent=2))


Saved eval → /content/drive/MyDrive/finance-rag-microservice/reports/rag_eval_test_cepass_minlen_noidk.json
{
  "n_predictions": 648,
  "Support@12": 0.6852,
  "EvidenceRecall@12": 0.4771,
  "ContextContainment": 0.9139
}


In [33]:
# Save a manifest capturing all knobs for this run
from pathlib import Path
import json, time

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
SEL_FINAL = ROOT / "reports" / "retrieval_final_selection.json"
final_cfg = json.loads(SEL_FINAL.read_text())

manifest = {
  "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
  "predictions_file": "runs/rag_predictions_test_cepass_minlen_noidk.jsonl",
  "eval_file": "reports/rag_eval_test_cepass_minlen_noidk.json",
  "prompt_variant": "no_idk",
  "mode": "knowledge_plus_context",   # matches your current setting
  "retrieval": final_cfg,             # frozen in NB2
  "context_selection": {
    "selector": "cross_encoder_top_passages",
    "per_doc": 3,
    "max_total": 18,
    "top_docs_for_context": 12,
    "max_context_tokens": 1400
  },
  "generation": {
    "model": "google/flan-t5-large",
    "min_new_tokens": 24,
    "max_new_tokens": 96,
    "num_beams": 4,
    "length_penalty": 1.05,
    "no_repeat_ngram_size": 2,
    "allow_external_knowledge": True
  }
}

man_path = ROOT / "reports" / "rag_run_manifest_noidk.json"
man_path.write_text(json.dumps(manifest, indent=2))
print("Saved manifest →", man_path)


Saved manifest → /content/drive/MyDrive/finance-rag-microservice/reports/rag_run_manifest_noidk.json


In [34]:
# Point NB4 and the README to this run by default
from pathlib import Path
import json

ROOT = Path("/content/drive/MyDrive/finance-rag-microservice")
best_ptr = {
    "manifest": "reports/rag_run_manifest_noidk.json",
    "predictions": "runs/rag_predictions_test_cepass_minlen_noidk.jsonl",
    "eval": "reports/rag_eval_test_cepass_minlen_noidk.json"
}
best_path = ROOT / "reports" / "rag_best_run.json"
best_path.write_text(json.dumps(best_ptr, indent=2))
print("Wrote pointer →", best_path)


Wrote pointer → /content/drive/MyDrive/finance-rag-microservice/reports/rag_best_run.json
