In [1]:
!pip install -U ragas==0.1.9 datasets==2.20.0 pyarrow==17.0.0
!pip install -U langchain==0.3.2 langchain-community==0.3.2 langchain-huggingface==0.1.0
!pip install -U tokenizers==0.19.1 sentence-transformers==3.0.1
!pip install -U nest_asyncio
# optional but recommended on Windows to avoid MKL/OpenMP clashes:
!pip uninstall -y intel-openmp || true


Collecting langchain==0.3.2
  Using cached langchain-0.3.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community==0.3.2
  Using cached langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-huggingface==0.1.0
  Using cached langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.3.2)
  Using cached langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain==0.3.2)
  Using cached tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.

The conflict is caused by:
    The user requested langchain==0.3.2
    langchain-community 0.3.2 depends on langchain<0.4.0 and >=0.3.3

To fix this you could try to:
1. loosen the range of package versions you've specified
2. remove package versions to allow

ERROR: Cannot install langchain-community==0.3.2 and langchain==0.3.2 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts






In [2]:
import os, platform
# Threading / duplicate-runtime guards for OpenMP and tokenizers (mainly for Windows).
# setdefault only fills in a value when the variable is not already set, so anything
# exported by the user before starting the kernel wins.
for _env_name, _env_value in (
    ("KMP_DUPLICATE_LIB_OK", "TRUE"),   # avoids OpenMP hard crash
    ("OMP_NUM_THREADS", "1"),           # prevents oversubscription
):
    os.environ.setdefault(_env_name, _env_value)
# Kept as the last expression so the cell still displays the effective value.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")     # quieter tokenizers


'false'

In [3]:
# ===== Config =====
# Everything a reader might want to tune lives in this cell.
import os

DATA_PATH   = "medqa_cleaned.csv"  # your cleaned CSV (unchanged)
ANSWER_FIELD = "answer"                         # <- ground truth column in your CSV (DO NOT RENAME)
TEXT_FIELD   = "answer"                         # <- what to index for retrieval (use "source_text" if you have it)
# NOTE(review): with TEXT_FIELD == ANSWER_FIELD the retriever indexes the ground-truth
# text itself, so retrieved contexts can contain the gold answer verbatim. That would
# inflate context_precision / context_recall — confirm this is intended.

TOP_K = 3
N_EVAL = 3  # same 3 questions as prompting (use selection CSV below)
PRINT_PROMPTS = True

GEN_MODEL   = "openai/gpt-oss-20b"
# Never hard-code credentials in a notebook: read the key from the environment.
# Falls back to "" (the previous literal value) when GROQ_API_KEY is not exported.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Use the EXACT same selection file from the prompt run (support both 'gold' or 'answer' inside it)
EVAL_SELECTION_CSV = "medquad_selected_questions.csv"

#RAGAS_EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
SEED = 42


In [4]:
import pandas as pd

# Load the QA corpus; `df` is the frame every later cell reads from.
df = pd.read_csv(DATA_PATH)

# sanity checks (no column renames here)
assert "question" in df.columns, "CSV must have a 'question' column"
assert ANSWER_FIELD in df.columns, f"CSV must have the ground-truth column '{ANSWER_FIELD}'"
assert TEXT_FIELD   in df.columns, f"CSV must have the text-to-index column '{TEXT_FIELD}'"

# stable IDs without altering source columns
if "doc_id" not in df.columns:
    df = df.copy()
    df["doc_id"] = [f"doc_{i}" for i in range(len(df))]

# light normalization (safe)
# fillna("") must come BEFORE astype(str): otherwise missing cells become the literal
# string "nan" and end up indexed / evaluated as if they were real text.
# dict.fromkeys de-duplicates the column list (TEXT_FIELD may equal ANSWER_FIELD).
for _col in dict.fromkeys(["question", ANSWER_FIELD, TEXT_FIELD]):
    df[_col] = df[_col].fillna("").astype(str).str.strip()

print("Loaded rows:", len(df), "| columns:", list(df.columns))
df.head(3)


Loaded rows: 16018 | columns: ['question', 'answer', 'qtype', 'doc_id']


Unnamed: 0,question,answer,qtype,doc_id
0,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,susceptibility,doc_0
1,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...,symptoms,doc_1
2,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,susceptibility,doc_2


## Build a BM25 index over `TEXT_FIELD` (the source `answer` column itself is not modified)

In [5]:
!pip install rank_bm25 



In [6]:
import re
from rank_bm25 import BM25Okapi

def simple_tokenize(text: str):
    """Lower-case `text` and split it into runs of ASCII letters, digits and apostrophes.

    Falsy input (None, "") yields an empty token list.
    """
    if not text:
        return []
    lowered = text.lower()
    return re.findall(r"[A-Za-z0-9']+", lowered)

# Parallel lists: doc_ids[i] identifies documents[i]; bm25_retrieve relies on this alignment.
doc_ids   = df["doc_id"].tolist()
documents = df[TEXT_FIELD].tolist()   # <- whatever you chose to index

# Tokenize every document once, then build the BM25 index over the token lists.
tokenized_docs = list(map(simple_tokenize, documents))
bm25 = BM25Okapi(tokenized_docs)
print(f"BM25 built over {len(tokenized_docs)} docs (TEXT_FIELD='{TEXT_FIELD}').")

def bm25_retrieve(query: str, k: int = 3):
    """Return the `k` best-scoring documents for `query` from the module-level BM25 index.

    Each hit is a `(doc_id, document_text, score)` tuple, ordered by descending score;
    the score is converted to a plain Python float.
    """
    scores = bm25.get_scores(simple_tokenize(query))
    # Stable sort on (position, score) pairs: ties keep their original corpus order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [(doc_ids[pos], documents[pos], float(score)) for pos, score in ranked[:k]]


BM25 built over 16018 docs (TEXT_FIELD='answer').


## Use the same 3 questions as the prompting run (the selection file may store the reference text under either `gold` or `answer`)

In [7]:
import os

sel = pd.read_csv(EVAL_SELECTION_CSV)

# Locate the column holding the reference text: prefer 'gold', else the configured answer column.
if "gold" in sel.columns:
    sel_gold_col = "gold"
elif ANSWER_FIELD in sel.columns:
    sel_gold_col = ANSWER_FIELD
else:
    sel_gold_col = None
assert sel_gold_col is not None, f"Selection file must contain either 'gold' or '{ANSWER_FIELD}'"

# Keep only the two columns the evaluation needs, capped at N_EVAL rows.
eval_df = sel.loc[:, ["question", sel_gold_col]].copy()
n_keep = min(N_EVAL, len(eval_df))
eval_df = eval_df.head(n_keep)  # keep same 3 if that's what you saved
print("Evaluating SAME questions used in prompting:", len(eval_df))
print(eval_df["question"].to_string(index=False))


Evaluating SAME questions used in prompting: 3
              Do you have information about X-Rays
What are the symptoms of Alpha-ketoglutarate de...
What are the treatments for GLUT1 deficiency sy...


### Prompt builder + Groq call (unchanged, prints full prompt)

In [8]:
from groq import Groq

# Pass None instead of an empty string so the Groq client falls back to the
# GROQ_API_KEY environment variable (and fails fast if no key is available at all),
# rather than sending requests with an empty key.
client = Groq(api_key=GROQ_API_KEY or None)

def build_rag_messages(question: str, contexts: list[str], print_prompt=True):
    """Assemble the chat messages (system + user) for one RAG question.

    Args:
        question: The question to answer.
        contexts: Retrieved passages; they are numbered "[Context 1]", "[Context 2]", ...
            in the order given.
        print_prompt: When true, the full system and user prompt are echoed to stdout.

    Returns:
        A two-element list of `{"role": ..., "content": ...}` dicts (system, then user).
    """
    system_txt = " ".join([
        "You are a concise, evidence-focused medical assistant.",
        "Use the provided context passages to answer accurately.",
        "If the context does not contain the answer, say you don't know.",
    ])

    numbered = []
    for number, passage in enumerate(contexts, start=1):
        numbered.append(f"[Context {number}]\n{passage}")
    ctx_block = "\n\n".join(numbered)

    user_txt = "".join([
        f"Question: {question}\n\n",
        f"Context Passages:\n{ctx_block}\n\n",
        "Instructions: Answer in 2–4 sentences. Cite which Context numbers support your statements (e.g., [1], [2]). ",
        "If insufficient evidence, say 'I don't know based on the given context.'",
    ])

    if print_prompt:
        rule = "=" * 88
        print("\n" + rule)
        print("[RAG PROMPT]")
        print(f"\n[SYSTEM]\n{system_txt}")
        print(f"\n[USER]\n{user_txt}")
        print(rule)

    return [
        {"role": "system", "content": system_txt},
        {"role": "user", "content": user_txt},
    ]

def groq_chat(messages, model=GEN_MODEL, temperature=0.0, max_tokens=512, top_p=1.0):
    """Send `messages` to the Groq chat-completions endpoint and return the reply text.

    Args:
        messages: Chat messages as a list of `{"role", "content"}` dicts.
        model: Model identifier (defaults to GEN_MODEL as it was when this cell ran).
        temperature, max_tokens, top_p: Sampling parameters forwarded unchanged.

    Returns:
        The first choice's message content with surrounding whitespace removed;
        an empty string when the API returns no content for that choice.
    """
    r = client.chat.completions.create(
        model=model, temperature=temperature, max_tokens=max_tokens, top_p=top_p, messages=messages
    )
    # `content` is optional in the response schema; guard so a content-less reply
    # yields "" instead of raising AttributeError on None.strip().
    content = r.choices[0].message.content
    return (content or "").strip()


### Run RAG on those questions (use original answer column as ground truth)

In [9]:
def _run_rag_example(row):
    """Retrieve, prompt and generate for one selection row; return a RAGAS-shaped record."""
    question = str(row["question"]).strip()
    reference = str(row[sel_gold_col]).strip()  # keep whatever the selection file provided

    # Only the passage text is passed on; doc ids and BM25 scores are discarded here.
    passages = [passage for _doc_id, passage, _score in bm25_retrieve(question, k=TOP_K)]

    prompt = build_rag_messages(question, passages, print_prompt=PRINT_PROMPTS)
    reply = groq_chat(prompt, model=GEN_MODEL, temperature=0.0, max_tokens=512, top_p=1.0)

    return {
        "question": question,
        "answer": reply,
        "contexts": passages,
        "ground_truth": reference,   # RAGAS expects this name; we DO NOT rename the CSV column itself
    }


# One record per evaluation question, processed sequentially in selection order.
rows = [_run_rag_example(row) for _, row in eval_df.iterrows()]

rag_results_df = pd.DataFrame(rows)
rag_results_df.head(2)



[RAG PROMPT]

[SYSTEM]
You are a concise, evidence-focused medical assistant. Use the provided context passages to answer accurately. If the context does not contain the answer, say you don't know.

[USER]
Question: Do you have information about X-Rays

Context Passages:
[Context 1]
Summary : X-rays are a type of radiation called electromagnetic waves. X-ray imaging creates pictures of the inside of your body. The images show the parts of your body in different shades of black and white. This is because different tissues absorb different amounts of radiation. Calcium in bones absorbs x-rays the most, so bones look white. Fat and other soft tissues absorb less, and look gray. Air absorbs the least, so lungs look black. The most familiar use of x-rays is checking for broken bones, but x-rays are also used in other ways. For example, chest x-rays can spot pneumonia. Mammograms use x-rays to look for breast cancer. When you have an x-ray, you may wear a lead apron to protect certain parts

Unnamed: 0,question,answer,contexts,ground_truth
0,Do you have information about X-Rays,"Yes, X‑rays are a form of electromagnetic radi...",[Summary : X-rays are a type of radiation call...,Summary : X-rays are a type of radiation calle...
1,What are the symptoms of Alpha-ketoglutarate d...,Alpha‑ketoglutarate dehydrogenase deficiency i...,[What are the signs and symptoms of Alpha-keto...,What are the signs and symptoms of Alpha-ketog...


In [10]:
# Show the full retrieved contexts for the first question (column selected by name).
print(rag_results_df["contexts"].iloc[0])

["Summary : X-rays are a type of radiation called electromagnetic waves. X-ray imaging creates pictures of the inside of your body. The images show the parts of your body in different shades of black and white. This is because different tissues absorb different amounts of radiation. Calcium in bones absorbs x-rays the most, so bones look white. Fat and other soft tissues absorb less, and look gray. Air absorbs the least, so lungs look black. The most familiar use of x-rays is checking for broken bones, but x-rays are also used in other ways. For example, chest x-rays can spot pneumonia. Mammograms use x-rays to look for breast cancer. When you have an x-ray, you may wear a lead apron to protect certain parts of your body. The amount of radiation you get from an x-ray is small. For example, a chest x-ray gives out a radiation dose similar to the amount of radiation you're naturally exposed to from the environment over 10 days.", "An Underdiagnosed Disease Experts believe that Paget's di

In [11]:
# Show the full reference answer for the first question (column selected by name).
print(rag_results_df["ground_truth"].iloc[0])

Summary : X-rays are a type of radiation called electromagnetic waves. X-ray imaging creates pictures of the inside of your body. The images show the parts of your body in different shades of black and white. This is because different tissues absorb different amounts of radiation. Calcium in bones absorbs x-rays the most, so bones look white. Fat and other soft tissues absorb less, and look gray. Air absorbs the least, so lungs look black. The most familiar use of x-rays is checking for broken bones, but x-rays are also used in other ways. For example, chest x-rays can spot pneumonia. Mammograms use x-rays to look for breast cancer. When you have an x-ray, you may wear a lead apron to protect certain parts of your body. The amount of radiation you get from an x-ray is small. For example, a chest x-ray gives out a radiation dose similar to the amount of radiation you're naturally exposed to from the environment over 10 days.


## RAGAS eval (unchanged API; uses ground_truth from the rows we built)

In [12]:
import sys, subprocess
# Install the same pinned release requested in the first install cell. An unpinned
# `-U` upgrade here could pull in newer langchain-core releases and silently break
# the pinned langchain / langchain-community versions.
subprocess.check_call([sys.executable, "-m", "pip", "install", "langchain-huggingface==0.1.0"])


0

In [13]:
# --- SAFER RAGAS EVAL with FastEmbed (sequential, retries, robust) ---

# (Optional) tame threading to avoid native-lib crashes on Windows/Anaconda
import os, time, math
# Force single-threaded native math libraries and quiet tokenizers. Unlike setdefault,
# these assignments overwrite whatever value is already present in the environment.
# NOTE(review): libraries that were already imported may have read these variables
# earlier — confirm the settings take effect at this point in the notebook.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "VECLIB_MAXIMUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
})

import pandas as pd
from datasets import Dataset
from ragas import evaluate

# --- metrics (answer_correctness may not exist on older ragas) ---
# The four core metrics are required: let a failure here surface instead of hiding it.
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# Only the optional metric is guarded, and only against ImportError, so unrelated
# errors raised while importing ragas are not swallowed.
try:
    from ragas.metrics import answer_correctness
    HAVE_CORR = True
except ImportError:
    HAVE_CORR = False

# --- LLM judge via Groq (keep it cheap/fast) ---
from langchain_groq import ChatGroq
# Settings for the judge model, collected in one place and expanded into the constructor.
_judge_settings = {
    "groq_api_key": GROQ_API_KEY,
    "model_name": "llama-3.1-8b-instant",
    "temperature": 0.0,
    "max_retries": 6,        # help with transient errors
    "request_timeout": 60,   # avoid hanging
}
ragas_llm = ChatGroq(**_judge_settings)

# --- Embeddings: FastEmbed (fallback to TF-IDF if not available) ---
def make_embeddings():
    """Return the embeddings object handed to RAGAS, preferring FastEmbed.

    First tries `FastEmbedEmbeddings` with the "BAAI/bge-small-en-v1.5" model. If the
    import or construction raises, a warning is printed and a TF-IDF based embeddings
    object is returned instead, fitted on the contexts, answers and ground truths
    found in the module-level `rag_results_df`.
    """
    try:
        from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
        return FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")
    except Exception as e:
        print("[WARN] FastEmbed unavailable -> falling back to TF-IDF:", e)
        from sklearn.feature_extraction.text import TfidfVectorizer
        from langchain_core.embeddings import Embeddings

        class TfidfEmbeddings(Embeddings):
            """Minimal LangChain `Embeddings` implementation backed by a fitted TF-IDF vectorizer."""

            def __init__(self, corpus, max_features=4096):
                vectorizer = TfidfVectorizer(max_features=max_features)
                vectorizer.fit(corpus)
                self.vec = vectorizer

            def embed_documents(self, texts):
                matrix = self.vec.transform(texts)
                return matrix.toarray().tolist()

            def embed_query(self, text):
                row = self.vec.transform([text]).toarray()[0]
                return row.tolist()

        # Gather every text of the current batch: contexts first, then answers, then references.
        raw_texts = []
        for ctxs in rag_results_df["contexts"]:
            raw_texts.extend(ctxs if isinstance(ctxs, list) else [str(ctxs)])
        raw_texts.extend(rag_results_df["answer"].astype(str).tolist())
        raw_texts.extend(rag_results_df["ground_truth"].astype(str).tolist())

        # De-duplicate the stripped texts and drop the empty string.
        unique_texts = {str(t).strip() for t in raw_texts}
        unique_texts.discard("")
        return TfidfEmbeddings(list(unique_texts), max_features=4096)

# Build the embeddings object once; the same instance is passed to every evaluate() call below.
ragas_embeddings = make_embeddings()

# --- Sanitize the dataframe to avoid bad rows ---
def _as_list_str(x):
    if isinstance(x, list):
        return [str(t) for t in x if str(t).strip()]
    if pd.isna(x) or str(x).strip() == "":
        return []
    return [str(x)]

# Work on a copy so rag_results_df stays exactly as the RAG loop produced it.
_df = rag_results_df.copy()
# fillna("") has to run BEFORE astype(str): after the cast a missing value is already
# the literal text "nan"/"None", fillna has nothing left to fill, and the row would
# slip through the non-empty filter below.
for _col in ("question", "answer", "ground_truth"):
    _df[_col] = _df[_col].fillna("").astype(str).str.strip()
_df["contexts"]      = _df["contexts"].apply(_as_list_str)

# drop rows with empty question/answer/contexts or ground_truth (they break some metrics)
mask_ok = _df["question"].ne("") & _df["answer"].ne("") & _df["ground_truth"].ne("") & _df["contexts"].apply(len).gt(0)
bad = (~mask_ok).sum()
if bad:
    print(f"[INFO] Dropping {bad} invalid rows before RAGAS.")
_df = _df[mask_ok].reset_index(drop=True)

# Optional row cap to stay under rate limits (429s); None evaluates every remaining row.
MAX_ROWS = None  # set e.g. 20 if you still hit rate limits
_df = _df if MAX_ROWS is None else _df.head(MAX_ROWS).copy()

# Hand RAGAS only the four columns it reads.
_RAGAS_COLUMNS = ["question", "answer", "contexts", "ground_truth"]
ragas_ds = Dataset.from_pandas(_df[_RAGAS_COLUMNS])

# --- Evaluate metrics one-by-one with polite retries (helps with Groq 429) ---
def eval_metric_with_backoff(metric, tries=5, wait_seconds=70):
    """Evaluate a single RAGAS metric over `ragas_ds`, retrying when `evaluate` raises.

    Args:
        metric: The RAGAS metric object to evaluate.
        tries: Maximum number of attempts.
        wait_seconds: Fixed pause between two consecutive attempts.

    Returns:
        Whatever `evaluate(...)` returns for this one metric.

    Raises:
        RuntimeError: If every attempt raised; the last error is chained as the cause.
    """
    last_err = None
    for t in range(1, tries + 1):
        try:
            return evaluate(
                dataset=ragas_ds,
                metrics=[metric],
                llm=ragas_llm,
                embeddings=ragas_embeddings,
                is_async=False,          # avoid heavy async executor
                raise_exceptions=False,  # don’t crash on inner errors
            )
        except Exception as e:
            last_err = e
            # No attempt follows the last one, so do not announce a retry or
            # sleep for wait_seconds before giving up.
            if t == tries:
                break
            print(f"[RAGAS] Retry {t}/{tries} after {wait_seconds}s due to: {e}")
            time.sleep(wait_seconds)
    raise RuntimeError(f"RAGAS metric failed after {tries} retries") from last_err

# Metrics to run; answer_correctness is appended only when its import succeeded.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall] + (
    [answer_correctness] if HAVE_CORR else []
)


def _coerce_score(value):
    """Return `value` as a float when possible (directly, then via str()); otherwise unchanged."""
    try:
        return float(value)
    except Exception:
        pass
    # some versions return non-floats; coerce if possible
    try:
        return float(str(value))
    except Exception:
        return value


# One evaluate() run per metric; each report is keyed by the metric's name.
scores = {}
for metric in metrics:
    report = eval_metric_with_backoff(metric)
    name = next(iter(report.keys()))
    scores[name] = _coerce_score(report[name])

# Report the per-metric averages in a fixed order.
print("\n=== RAGAS METRICS (averages) ===")
order = ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]
if HAVE_CORR:
    order.append("answer_correctness")
for k in order:
    if k in scores:
        v = scores[k]
        if isinstance(v, float) and math.isnan(v):
            # evaluate() runs with raise_exceptions=False, so a failed metric can come
            # back as NaN; report it explicitly instead of printing "nan" as a score.
            print(f"{k}: N/A (metric evaluation failed)")
        else:
            print(f"{k}: {v:.3f}" if isinstance(v, float) else f"{k}: {v}")

# Derived hallucination = 1 - faithfulness (only meaningful for a real, non-NaN score)
faith = scores.get("faithfulness")
if isinstance(faith, float) and not math.isnan(faith):
    print(f"hallucination (1 - faithfulness): {1.0 - faith:.3f}")
else:
    print("hallucination (1 - faithfulness): N/A")



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness

For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]


=== RAGAS METRICS (averages) ===
faithfulness: 0.894
answer_relevancy: 0.565
context_precision: 1.000
context_recall: 0.982
answer_correctness: 0.697
hallucination (1 - faithfulness): 0.106
