In [1]:
!pip install --upgrade pip
!pip install groq pandas


Collecting pip
  Using cached pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.2-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
C:\Users\anish\anaconda3\python.exe -m pip install --upgrade pip




In [2]:
### Config (edit paths + API key)

In [3]:
# === Paths / common config ===
EVAL_SELECTION_CSV = "medquad_selected_questions.csv"  # <-- your questions CSV
ANSWER_FIELD       = "answer"                                       # auto-detects 'gold' or this one
N_EVAL             = 3                                              # set to an int, or None for all rows

# === LLMs ===
GROQ_API_KEY = ""     # <-- hard-code your key
GEN_MODEL    = "qwen/qwen3-32b"         # <-- generation moqwen3-32bdel (ZERO-SHOT)
JUDGE_MODEL  = "llama-3.1-8b-instant"         # judge model (fast/cheap)

# === Generation knobs (keep fixed) ===
TEMPERATURE = 0.0
MAX_TOKENS  = 256
TOP_P       = 1.0

# Print the exact prompt sent to the model?
PRINT_ZERO_SHOT_PROMPTS = True

# Outputs
RESULTS_DIR = "."


In [4]:
### Load Questions

In [5]:
import pandas as pd

sel = pd.read_csv(EVAL_SELECTION_CSV)

# figure out which column holds the gold/reference text
gold_col = "gold" if "gold" in sel.columns else (ANSWER_FIELD if ANSWER_FIELD in sel.columns else None)
assert gold_col is not None, f"Selection file must contain either 'gold' or '{ANSWER_FIELD}'"

eval_df = sel[["question", gold_col]].copy()
eval_df.columns = ["question", "gold"]   # internal rename for convenience

if isinstance(N_EVAL, int):
    eval_df = eval_df.head(N_EVAL)

print("ZERO-SHOT evaluation questions:", len(eval_df))
print(eval_df["question"].to_string(index=False))


ZERO-SHOT evaluation questions: 3
              Do you have information about X-Rays
What are the symptoms of Alpha-ketoglutarate de...
What are the treatments for GLUT1 deficiency sy...


In [6]:
### Groq client + chat helper

In [7]:
from typing import List, Dict
from groq import Groq

client = Groq(api_key=GROQ_API_KEY)

def chat_messages(model: str, messages: List[Dict], temperature: float = 0.0, max_tokens: int = 256, top_p: float = 1.0) -> str:
    r = client.chat.completions.create(
        model=model, temperature=temperature, max_tokens=max_tokens, top_p=top_p,
        messages=messages
    )
    return r.choices[0].message.content.strip()


In [8]:
### ZERO-SHOT prompt builder (static; prints the exact prompt)

In [9]:
def build_zero_shot_messages(question: str, print_prompt: bool = False) -> List[Dict]:
    """
    ZERO-SHOT: single Q, no examples. Static template; only 'question' changes.
    """
    system_msg = (
        "You are a concise, evidence-focused medical assistant. "
        "Answer briefly (2–4 sentences) and avoid speculation. If unsure, say you don't know."
    )
    user_msg = f"Question: {question}\n\nRespond in 2–4 sentences. Be factual and precise."
    messages = [{"role":"system","content":system_msg},
                {"role":"user","content":user_msg}]
    if print_prompt:
        print("\n" + "="*88)
        print("[ZERO-SHOT PROMPT]")
        print("\n[SYSTEM]\n" + system_msg)
        print("\n[USER]\n" + user_msg)
        print("="*88)
    return messages


In [10]:
### Normalizer

In [11]:
import re, unicodedata

def normalize_text(t: str) -> str:
    t = (t or "").strip()
    t = unicodedata.normalize("NFKC", t)
    t = re.sub(r"\s+", " ", t)
    return t


In [12]:
### Run ZERO-SHOT generation + save answers

In [13]:
import pandas as pd

def run_zero_shot(print_prompts: bool = False) -> pd.DataFrame:
    rows = []
    for _, r in eval_df.iterrows():
        q   = normalize_text(str(r["question"]))
        gold= normalize_text(str(r["gold"]))

        msgs = build_zero_shot_messages(q, print_prompt=print_prompts)
        ans  = chat_messages(GEN_MODEL, msgs, temperature=TEMPERATURE, max_tokens=MAX_TOKENS, top_p=TOP_P)

        rows.append({
            "question": q,
            "gold": gold,
            "strategy": "zero-shot",
            "answer": normalize_text(ans),
        })

    df_zs = pd.DataFrame(rows)
    out_csv = f"{RESULTS_DIR}/medquad_zero_shot_answers.csv"
    df_zs.to_csv(out_csv, index=False)
    print(f"[ZERO-SHOT] Saved answers to: {out_csv}  (rows={len(df_zs)})")
    return df_zs

zero_shot_df = run_zero_shot(print_prompts=PRINT_ZERO_SHOT_PROMPTS)
zero_shot_df.head(2)



[ZERO-SHOT PROMPT]

[SYSTEM]
You are a concise, evidence-focused medical assistant. Answer briefly (2–4 sentences) and avoid speculation. If unsure, say you don't know.

[USER]
Question: Do you have information about X-Rays

Respond in 2–4 sentences. Be factual and precise.

[ZERO-SHOT PROMPT]

[SYSTEM]
You are a concise, evidence-focused medical assistant. Answer briefly (2–4 sentences) and avoid speculation. If unsure, say you don't know.

[USER]
Question: What are the symptoms of Alpha-ketoglutarate dehydrogenase deficiency ?

Respond in 2–4 sentences. Be factual and precise.

[ZERO-SHOT PROMPT]

[SYSTEM]
You are a concise, evidence-focused medical assistant. Answer briefly (2–4 sentences) and avoid speculation. If unsure, say you don't know.

[USER]
Question: What are the treatments for GLUT1 deficiency syndrome ?

Respond in 2–4 sentences. Be factual and precise.
[ZERO-SHOT] Saved answers to: ./medquad_zero_shot_answers.csv  (rows=3)


Unnamed: 0,question,gold,strategy,answer
0,Do you have information about X-Rays,Summary : X-rays are a type of radiation calle...,zero-shot,"<think> Okay, the user is asking about X-Rays...."
1,What are the symptoms of Alpha-ketoglutarate d...,What are the signs and symptoms of Alpha-ketog...,zero-shot,"<think> Okay, I need to answer the user's ques..."


In [14]:
### LLM-as-judge metrics (Faithfulness, Hallucination, Relevance, Correctness)

In [15]:
import re, pandas as pd

JUDGE_TEMPERATURE = 0.0
JUDGE_MAX_TOKENS  = 64
JUDGE_TOP_P       = 1.0

def _extract_float(txt: str) -> float:
    m = re.search(r"\d*\.?\d+(?:[eE][-+]?\d+)?", txt or "")
    try: x = float(m.group(0)) if m else 0.0
    except: x = 0.0
    return max(0.0, min(1.0, x))

def chat_judge(system_prompt: str, user_prompt: str, model: str = JUDGE_MODEL) -> str:
    r = client.chat.completions.create(
        model=model, temperature=JUDGE_TEMPERATURE, max_tokens=JUDGE_MAX_TOKENS, top_p=JUDGE_TOP_P,
        messages=[{"role":"system","content":system_prompt},
                  {"role":"user","content":user_prompt}]
    )
    return r.choices[0].message.content.strip()

def entail_prob(premise: str, claim: str) -> float:
    sys = ("You are an evaluator. Given a PREMISE (evidence) and a CLAIM (one sentence), "
           "return ONLY a number in [0,1] = probability that PREMISE ENTAILS CLAIM.")
    usr = f"PREMISE:\n{premise}\n\nCLAIM:\n{claim}\n\nOutput only a number in [0,1]."
    return _extract_float(chat_judge(sys, usr))

def split_sents(t: str):
    t = (t or "").strip()
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', t) if s.strip()]

def faithfulness_verbose(answer: str, gold_reference: str, thresh: float = 0.5):
    sents = split_sents(answer)
    if not sents:
        return 0.0, 1.0, pd.DataFrame(columns=["sentence","entail_prob","supported"])
    rows, ok = [], 0
    for s in sents:
        p = entail_prob(gold_reference, s)
        sup = p >= thresh
        ok += int(sup)
        rows.append({"sentence": s, "entail_prob": round(p,3), "supported": sup})
    f = ok/len(sents)
    return f, 1.0-f, pd.DataFrame(rows)

def answer_correctness_llm(gold_answer: str, model_answer: str) -> float:
    e1 = entail_prob(gold_answer, model_answer)
    e2 = entail_prob(model_answer, gold_answer)
    return 0.5*(e1+e2)

def answer_relevance(q: str, a: str) -> float:
    sys = (
        "You are an evaluator. Rate how well the ANSWER addresses the QUESTION.\n"
        "- 1.0 = Directly answers, accurate and focused.\n"
        "- 0.7 = Mostly answers with minor gaps/irrelevance.\n"
        "- 0.4 = Partial answer; noticeable gaps or off-topic parts.\n"
        "- 0.0 = Does not answer or off-topic.\n"
        "Return ONLY a number in [0,1]."
    )
    usr = f"QUESTION:\n{q}\n\nANSWER:\n{a}\n\nScore:"
    return _extract_float(chat_judge(sys, usr))

def band(x: float) -> str:
    return "Excellent" if x>=0.90 else "Good" if x>=0.75 else "Borderline" if x>=0.60 else "Poor"

def score_zero_shot(df_answers: pd.DataFrame, faith_thresh: float = 0.5) -> pd.DataFrame:
    rows = []
    for _, r in df_answers.iterrows():
        q, a, g = r["question"], r["answer"], r["gold"]
        faith, halluc, _df = faithfulness_verbose(a, g, thresh=faith_thresh)
        relev = answer_relevance(q, a)
        corr  = answer_correctness_llm(g, a)
        rows.append({
            "question": q,
            "strategy": "zero-shot",
            "faithfulness": round(faith, 3),
            "hallucination_rate": round(halluc, 3),
            "answer_relevance": round(relev, 3),
            "answer_correctness": round(corr, 3),
            "faith_band": band(faith),
            "relevance_band": band(relev),
            "correctness_band": band(corr),
        })
    return pd.DataFrame(rows)

zs_scored = score_zero_shot(zero_shot_df, faith_thresh=0.5)
zs_scored.to_csv(f"{RESULTS_DIR}/medquad_zero_shot_scores.csv", index=False)
print("Saved ZERO-SHOT per-item scores to:", f"{RESULTS_DIR}/medquad_zero_shot_scores.csv")

print("\n=== ZERO-SHOT Summary (averages) ===")
print(zs_scored[["faithfulness","hallucination_rate","answer_relevance","answer_correctness"]].mean().round(3))


Saved ZERO-SHOT per-item scores to: ./medquad_zero_shot_scores.csv

=== ZERO-SHOT Summary (averages) ===
faithfulness          0.735
hallucination_rate    0.265
answer_relevance      0.833
answer_correctness    0.630
dtype: float64


In [16]:
### 