In [1]:
!pip install --upgrade pip
!pip install pandas faiss-cpu rank-bm25 fastembed langchain-community groq datasets ragas langchain-groq


Collecting pip
  Using cached pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.2-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
C:\Users\anish\anaconda3\python.exe -m pip install --upgrade pip




In [2]:
import os, warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Silence tokenizer parallelism and pin each numeric thread pool to a single
# thread; the intent is to avoid kernel crashes while the notebook runs.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
})


In [3]:
### config (hard-coded to your saved artifacts)

In [4]:
from pathlib import Path

# ----- Saved artifacts -----
# The mapping parquet holds id/text/meta rows in the same order as the FAISS index.
FAISS_INDEX_PATH = Path("faiss_medqa_bge_small.index")
MAPPING_PATH = Path("medqa_chunk_mapping.parquet")

# ----- Query-side retrieval settings (must match how the index was built) -----
EMBED_MODEL = "BAAI/bge-small-en-v1.5"
# When True, query vectors are L2-normalised before the inner-product search.
USE_COSINE_IP = True
# When True, BGE_QUERY_PREFIX is prepended to every query before embedding.
USE_BGE_QUERY_INSTRUCTION = True
BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "

# ----- Hybrid retrieval: candidate pools and rank fusion -----
BM25_POOL_K = 100   # BM25 candidates considered before fusion
FAISS_POOL_K = 100  # FAISS candidates considered before fusion
RRF_K = 60          # reciprocal-rank-fusion constant (higher = flatter)
FINAL_TOP_K = 3     # number of fused contexts sent to the LLM

# ----- Evaluation questions -----
EVAL_SELECTION_CSV = "medquad_selected_questions.csv"
ANSWER_FIELD = "answer"  # reference-answer column used when the CSV has no 'gold' column
N_EVAL = 3               # number of questions to run; lower it if rate limits bite

# LLM (only this changes for experiments)
from groq import Groq
import os

# Never commit an API key to the notebook: read it from the GROQ_API_KEY
# environment variable. Falls back to an empty string (the previous value)
# when the variable is unset, so nothing else in the notebook changes.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GEN_MODEL    = "llama-3.3-70b-versatile"   # generation model used for the RAG answers

# When True, the full system/user prompt is printed for every question.
PRINT_PROMPTS = False

# RAGAS (optional)
RUN_RAGAS_EVAL   = True
RAGAS_JUDGE_MODEL = "llama-3.1-8b-instant"   # LLM used by RAGAS as the judge
RAGAS_EMBED_MODEL = EMBED_MODEL              # FastEmbed via LangChain

# Outputs
OUT_DIR = "."
ANSWERS_CSV = f"{OUT_DIR}/hybrid_rag_answers.csv"


In [5]:
### load questions (same CSV)

In [6]:
import pandas as pd

# Read the question selection file used for this run.
sel = pd.read_csv(EVAL_SELECTION_CSV)

# Pick the reference-answer column: 'gold' wins, otherwise the configured ANSWER_FIELD.
if "gold" in sel.columns:
    gold_col = "gold"
elif ANSWER_FIELD in sel.columns:
    gold_col = ANSWER_FIELD
else:
    gold_col = None
assert gold_col is not None, f"Selection file must contain either 'gold' or '{ANSWER_FIELD}'"

# Keep only the question and its reference answer; cap at N_EVAL rows when it is an int.
eval_df = sel[["question", gold_col]].copy()
if isinstance(N_EVAL, int):
    eval_df = eval_df.head(N_EVAL)

print(f"Hybrid RAG questions: {len(eval_df)}")
print(eval_df["question"].to_string(index=False))


Hybrid RAG questions: 3
              Do you have information about X-Rays
What are the symptoms of Alpha-ketoglutarate de...
What are the treatments for GLUT1 deficiency sy...


In [7]:
### load FAISS index + mapping parquet (no re-indexing)

In [8]:

import faiss, json

# Both artifacts must already exist on disk; nothing is re-indexed here.
for required_path in (FAISS_INDEX_PATH, MAPPING_PATH):
    assert required_path.exists(), f"Missing {required_path}"

index = faiss.read_index(str(FAISS_INDEX_PATH))
try:
    # Best effort: keep FAISS single-threaded to avoid kernel churn.
    faiss.omp_set_num_threads(1)
except Exception:
    pass

m = pd.read_parquet(MAPPING_PATH)

# Accepted column names, in priority order, for each logical field.
ID_CANDS    = ["id","chunk_id","doc_id"]
TEXT_CANDS  = ["text","chunk_text","content","passage","body","doc"]
META_CANDS  = ["meta","metadata"]


def _first_present(candidates):
    """Return the first candidate that is a column of the mapping frame, or None."""
    for name in candidates:
        if name in m.columns:
            return name
    return None


id_col   = _first_present(ID_CANDS)
text_col = _first_present(TEXT_CANDS)
meta_col = _first_present(META_CANDS)
assert id_col and text_col, f"Mapping parquet must have id/text. Found: {list(m.columns)}"

chunk_ids   = m[id_col].astype(str).tolist()
chunk_texts = m[text_col].astype(str).tolist()

if meta_col:
    raw_meta = m[meta_col].tolist()
    try:
        # String entries are decoded as JSON; anything else is passed through unchanged.
        chunk_meta = [json.loads(x) if isinstance(x, str) else x for x in raw_meta]
    except Exception:
        # If any entry fails to decode, keep the whole column as stored.
        chunk_meta = raw_meta
else:
    chunk_meta = [{} for _ in chunk_ids]

print("FAISS ntotal:", index.ntotal, "| mapping rows:", len(chunk_ids))
assert index.ntotal == len(chunk_ids), "Index/mapping size mismatch — they must be built together."



FAISS ntotal: 18559 | mapping rows: 18559


In [9]:
### build BM25 over the same chunk_texts

In [10]:
import re
from rank_bm25 import BM25Okapi

def simple_tokenize(text: str):
    """Lower-case `text` and split it into runs of ASCII letters, digits and apostrophes.

    Falsy input (None, empty string) yields an empty token list.
    """
    lowered = text.lower() if text else ""
    return re.findall(r"[A-Za-z0-9']+", lowered)

# Tokenise every chunk (same list, same order as the FAISS mapping) and build the BM25 index.
tokenized_docs = list(map(simple_tokenize, chunk_texts))
bm25 = BM25Okapi(tokenized_docs)
print(f"BM25 built over {len(tokenized_docs)} docs (same corpus as FAISS).")


BM25 built over 18559 docs (same corpus as FAISS).


In [11]:
### FastEmbed query encoder (same embed model as build)

In [12]:
import numpy as np
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

# Query encoder used by dense_retrieve_faiss(); instantiated with EMBED_MODEL, which the
# config cell says must be the same model the FAISS index was built with.
q_embedder = FastEmbedEmbeddings(model_name=EMBED_MODEL)

def _format_query(q: str) -> str:
    """Return the text to embed for `q`: prefixed with BGE_QUERY_PREFIX when
    USE_BGE_QUERY_INSTRUCTION is enabled, otherwise `q` unchanged."""
    if not USE_BGE_QUERY_INSTRUCTION:
        return q
    return BGE_QUERY_PREFIX + q

def dense_retrieve_faiss(query: str, k: int = FAISS_POOL_K):
    """Embed `query` and return up to `k` FAISS hits as (row_index, similarity) pairs.

    The query is formatted with _format_query(), embedded as a float32 row
    vector, and L2-normalised when USE_COSINE_IP is set. Hits come back in the
    order FAISS returns them; negative row indices are dropped (presumably the
    placeholder FAISS emits when fewer than `k` results exist — confirm).
    """
    embedding = q_embedder.embed_query(_format_query(query))
    q_vec = np.asarray(embedding, dtype="float32")[None, :]
    if USE_COSINE_IP:
        # Small epsilon guards against division by zero for an all-zero vector.
        q_vec /= (np.linalg.norm(q_vec, axis=1, keepdims=True) + 1e-12)

    sims, idxs = index.search(q_vec, k)
    hits = []
    for row_idx, sim in zip(idxs[0].tolist(), sims[0].tolist()):
        if row_idx >= 0:
            hits.append((row_idx, float(sim)))
    return hits

def sparse_retrieve_bm25(query: str, k: int = BM25_POOL_K):
    """Return up to `k` BM25 hits for `query` as (row_index, score) pairs, best first.

    Only chunks with a strictly positive BM25 score are returned. Previously the
    pool was always padded to `k` entries, so chunks sharing no term with the
    query (score 0) were included in arbitrary index order and then received
    rank credit during RRF fusion. The result may therefore hold fewer than `k`
    items, and is empty when the query has no tokens or `k` is not positive.

    Ties keep ascending row-index order, as the previous full sort did.
    """
    import heapq  # local import: top-k selection without sorting the whole corpus

    if k <= 0:
        return []
    q_tokens = simple_tokenize(query)
    if not q_tokens:
        return []

    scores = bm25.get_scores(q_tokens)
    matching = (i for i in range(len(scores)) if scores[i] > 0)
    top_idx = heapq.nlargest(k, matching, key=lambda i: scores[i])
    return [(i, float(scores[i])) for i in top_idx]


In [13]:
### Hybrid fusion (RRF) + prompt + Groq chat

In [14]:
from typing import List, Dict
from groq import Groq

# Groq API client created with GROQ_API_KEY from the config cell; groq_chat() sends
# its chat-completion requests through this object.
client = Groq(api_key=GROQ_API_KEY)

def rrf_fuse(sparse_list, dense_list, k_rrf=RRF_K, final_top_k=FINAL_TOP_K):
    """Combine two ranked lists with Reciprocal Rank Fusion.

    Each input is a list of (idx, score) pairs already ordered best-first by its
    own retriever. Raw scores are ignored: an item at 1-based rank r contributes
    1 / (k_rrf + r), and contributions from both lists are summed per idx.

    Returns the `final_top_k` best (idx, fused_score) pairs, highest fused score first.
    """
    fused_scores = {}
    # Sparse list first, then dense, so tie order among equal fused scores is unchanged.
    for ranking in (sparse_list, dense_list):
        for position, (doc_idx, _raw_score) in enumerate(ranking, start=1):
            fused_scores[doc_idx] = fused_scores.get(doc_idx, 0.0) + 1.0 / (k_rrf + position)

    ordered = sorted(fused_scores.items(), key=lambda pair: pair[1], reverse=True)
    return ordered[:final_top_k]

def build_rag_messages(question: str, contexts: List[str], print_prompt: bool = False) -> List[Dict]:
    """Build the chat messages (system + user) for one RAG question.

    Args:
        question: The question to answer.
        contexts: Retrieved passages; each is labelled "[Context N]" (1-based) in the prompt.
        print_prompt: When True, also print the full system and user text.

    Returns:
        A two-element list of {"role", "content"} dicts: the system message, then the user message.
    """
    system_txt = (
        "You are a concise, evidence-focused medical assistant. "
        "Use the provided context passages to answer accurately. "
        "If the context does not contain the answer, say you don't know."
    )

    labelled = [f"[Context {n}]\n{passage}" for n, passage in enumerate(contexts, start=1)]
    ctx_block = "\n\n".join(labelled)

    user_txt = (
        f"Question: {question}\n\n"
        f"Context Passages:\n{ctx_block}\n\n"
        "Instructions: Answer in 2–4 sentences. Cite which Context numbers support your statements (e.g., [1], [2]). "
        "If insufficient evidence, say 'I don't know based on the given context.'"
    )

    if print_prompt:
        rule = "="*88
        print("\n"+rule)
        print("[RAG PROMPT]")
        print("\n[SYSTEM]\n"+system_txt)
        print("\n[USER]\n"+user_txt)
        print(rule)

    return [
        {"role":"system","content":system_txt},
        {"role":"user","content":user_txt},
    ]

def groq_chat(messages, model=GEN_MODEL, temperature=0.0, max_tokens=512, top_p=1.0) -> str:
    """Send `messages` to the Groq chat-completions endpoint and return the reply text.

    Args:
        messages: List of {"role", "content"} dicts, e.g. from build_rag_messages().
        model: Groq model name; defaults to GEN_MODEL.
        temperature, max_tokens, top_p: Sampling parameters forwarded unchanged.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the API returns no content for that choice.
    """
    r = client.chat.completions.create(
        model=model, temperature=temperature, max_tokens=max_tokens, top_p=top_p, messages=messages
    )
    # message.content can be None; guard so .strip() cannot raise AttributeError.
    content = r.choices[0].message.content
    return (content or "").strip()


In [15]:
### Run HYBRID RAG; save answers

In [16]:
# Retrieval settings recorded next to every answer so the CSV is self-describing.
run_settings = {
    "final_top_k": FINAL_TOP_K,
    "bm25_pool_k": BM25_POOL_K,
    "faiss_pool_k": FAISS_POOL_K,
    "rrf_k": RRF_K,
}

rows = []
for _, record in eval_df.iterrows():
    question  = str(record["question"]).strip()
    reference = str(record[gold_col]).strip()

    # Candidate pools from both retrievers.
    sparse_pool = sparse_retrieve_bm25(question, k=BM25_POOL_K)
    dense_pool  = dense_retrieve_faiss(question, k=FAISS_POOL_K)

    # Fuse by reciprocal rank and look up the text of the winning chunks.
    fused = rrf_fuse(sparse_pool, dense_pool, k_rrf=RRF_K, final_top_k=FINAL_TOP_K)
    contexts = [chunk_texts[idx] for idx, _fused_score in fused]

    # Build the prompt and generate the answer.
    msgs = build_rag_messages(question, contexts, print_prompt=PRINT_PROMPTS)
    answer = groq_chat(msgs, model=GEN_MODEL, temperature=0.0, max_tokens=512, top_p=1.0)

    rows.append({
        "question": question,
        "answer": answer,
        "contexts": contexts,         # list[str] for RAGAS
        "ground_truth": reference,    # column name RAGAS expects; the source CSV is not renamed
        **run_settings,
    })

hybrid_df = pd.DataFrame(rows)
hybrid_df.to_csv(ANSWERS_CSV, index=False)
print(f"Saved HYBRID answers to: {ANSWERS_CSV}")
hybrid_df.head(2)


Saved HYBRID answers to: ./hybrid_rag_answers.csv


Unnamed: 0,question,answer,contexts,ground_truth,final_top_k,bm25_pool_k,faiss_pool_k,rrf_k
0,Do you have information about X-Rays,X-rays are a type of electromagnetic radiation...,[Summary : X-rays are a type of radiation call...,Summary : X-rays are a type of radiation calle...,3,100,100,60
1,What are the symptoms of Alpha-ketoglutarate d...,The symptoms of Alpha-ketoglutarate dehydrogen...,[What are the signs and symptoms of Alpha-keto...,What are the signs and symptoms of Alpha-ketog...,3,100,100,60


In [17]:
### RAGAS evaluation (FastEmbed + ChatGroq)

In [18]:
# Optional RAGAS scoring of the answers in hybrid_df. Any failure in this cell is
# reported as a warning instead of aborting the notebook.
if RUN_RAGAS_EVAL:
    try:
        import json
        import time

        from datasets import Dataset
        from ragas import evaluate
        try:
            from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, answer_correctness
            METRICS = [faithfulness, answer_relevancy, context_precision, context_recall, answer_correctness]
        except Exception:
            # Fall back to the four metrics when answer_correctness cannot be imported.
            from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
            METRICS = [faithfulness, answer_relevancy, context_precision, context_recall]

        from langchain_groq import ChatGroq
        from langchain_community.embeddings.fastembed import FastEmbedEmbeddings as LC_FastEmbedEmbeddings

        ragas_llm = ChatGroq(
            groq_api_key=GROQ_API_KEY, model_name=RAGAS_JUDGE_MODEL,
            temperature=0.0, max_retries=6, request_timeout=60
        )
        ragas_embeddings = LC_FastEmbedEmbeddings(model_name=RAGAS_EMBED_MODEL)

        ds = Dataset.from_pandas(hybrid_df[["question","answer","contexts","ground_truth"]].copy())

        # run one metric at a time (more robust with rate limits)
        max_attempts = 5
        scores = {}
        for metric in METRICS:
            for attempt in range(max_attempts):
                try:
                    rep = evaluate(dataset=ds, metrics=[metric], llm=ragas_llm, embeddings=ragas_embeddings,
                                   is_async=False, raise_exceptions=False)
                    key = next(iter(rep.keys()))
                    try:
                        scores[key] = float(rep[key])
                    except Exception:
                        # Not a bare `except:` — Ctrl-C / SystemExit must still propagate.
                        scores[key] = float(str(rep[key]))
                    break
                except Exception as e:
                    print(f"[RAGAS] retry {attempt+1}/{max_attempts} due to: {e}")
                    # Linear back-off; no point sleeping after the last attempt.
                    if attempt < max_attempts - 1:
                        time.sleep(30 + attempt*20)
            else:
                # All attempts failed: record the metric under its name (not its repr).
                scores[getattr(metric, "name", str(metric))] = None

        out_json = f"{OUT_DIR}/hybrid_ragas_metrics.json"
        with open(out_json, "w", encoding="utf-8") as f:
            json.dump(scores, f, indent=2)
        print("\n=== RAGAS (Hybrid) ===")
        for k, v in scores.items():
            if isinstance(v, float):
                print(f"{k}: {v:.3f}")
            else:
                print(f"{k}: {v}")
        print("Saved:", out_json)

    except Exception as e:
        print("[RAGAS WARNING] Skipped due to error:", e)



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]


=== RAGAS (Hybrid) ===
faithfulness: 0.875
answer_relevancy: 0.526
context_precision: 1.000
context_recall: 0.816
answer_correctness: 0.584
Saved: ./hybrid_ragas_metrics.json
