In [53]:
import os, re, time, hashlib, heapq
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm


In [None]:
# Better to do manually for now
# from dotenv import load_dotenv
# load_dotenv("secret.env")

# GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "").strip()

# import google.generativeai as genai
# genai.configure(api_key=GOOGLE_API_KEY)


In [None]:
import os, google.generativeai as genai

GOOGLE_API_KEY = "REMOVED FOR SECURITY REASONS"  

os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY  
genai.configure(api_key=GOOGLE_API_KEY)

model = genai.GenerativeModel(os.getenv("GEMINI_MODEL_ID", "gemini-2.5-flash"))


In [64]:
from datasets import load_dataset

# dataset: https://huggingface.co/datasets/curaihealth/medical_questions_pairs
split_name = "train" 

ds = load_dataset("curaihealth/medical_questions_pairs", split=split_name, streaming=True)
first_example = next(iter(ds))
print("Example keys:", list(first_example.keys()))

Q_KEYS = ["question", "prompt", "query", "input", "instruction", "question_text", "q"]
A_KEYS = ["response", "answer", "reference", "output", "completion", "answer_text", "a"]

def pick_key(d, candidates):
    for k in d.keys():
        kl = k.lower()
        if any(tag in kl for tag in candidates):
            return k
    return None

Q_COL = pick_key(first_example, Q_KEYS)
A_COL = pick_key(first_example, A_KEYS)
assert Q_COL and A_COL, f"Could not infer question/answer keys from example keys: {list(first_example.keys())}"

print("Detected mapping: ", Q_COL, "/", A_COL)


Example keys: ['dr_id', 'question_1', 'question_2', 'label']
Detected mapping:  question_1 / label


In [57]:
TARGET_N = 150
mode = "quick"           
MAX_SCAN = 200_000       

def clean_text(x):
    return "" if x is None else str(x).strip()

def write_clean(rows, out_path):
    df = pd.DataFrame(rows, columns=["question", "response"])
    df["question"] = df["question"].astype(str).str.strip()
    df["response"] = df["response"].astype(str).str.strip()
    df = df[df["question"].ne("")].dropna(subset=["question"]).drop_duplicates(subset=["question"])
    assert len(df) > 0, "No rows left after cleaning."
    df.to_csv(out_path, index=False)
    return df

rows = []

if mode == "quick":
    for i, ex in enumerate(ds):
        q = clean_text(ex.get(Q_COL, ""))
        a = clean_text(ex.get(A_COL, ""))
        if q:
            rows.append((q, a))
        if len(rows) >= TARGET_N:
            break

elif mode == "stable_hash":
    heap = []
    scanned = 0
    for ex in ds:
        q = clean_text(ex.get(Q_COL, ""))
        a = clean_text(ex.get(A_COL, ""))
        if not q:
            continue
        h = int(hashlib.md5(q.encode("utf-8")).hexdigest(), 16)
        item = (-h, (q, a))
        if len(heap) < TARGET_N:
            heapq.heappush(heap, item)
        else:
            if item[0] > heap[0][0]:
                heapq.heapreplace(heap, item)
        scanned += 1
        if scanned >= MAX_SCAN:
            break
    rows = [pair for _, pair in sorted(heap, key=lambda t: -t[0])]

else:
    raise ValueError("mode must be 'quick' or 'stable_hash'.")

SRC_PATH = DATA / "source_clean.csv"
clean_df = write_clean(rows, SRC_PATH)
print(f"Wrote {len(clean_df)} rows to", SRC_PATH)
clean_df.head(3)


Wrote 75 rows to c:\Users\valle\OneDrive\Desktop\ML & DL Research Project\RTHH\Phase I\data\source_clean.csv


Unnamed: 0,question,response
0,After how many hour from drinking an antibioti...,1
2,Am I over weight (192.9) for my age (39)?,1
4,Aspirin allergy - is it worth getting a bracelet?,0


In [58]:
df = pd.read_csv(SRC_PATH)
C_QUESTION, C_LABEL = "question", "response"

def stable_subset(data: pd.DataFrame, n=150, text_col=C_QUESTION):
    tmp = data.copy()
    tmp["_h"] = tmp[text_col].astype(str).apply(lambda s: hashlib.md5(s.encode("utf-8")).hexdigest())
    tmp = tmp.sort_values("_h", kind="mergesort").drop(columns=["_h"])
    return tmp.head(n).reset_index(drop=True)

subset = df if len(df) <= 150 else stable_subset(df, 150)
print("Subset shape:", subset.shape)

labels_fp = DATA / "labels_gemini_base.csv"
subset[[C_QUESTION, C_LABEL]].to_csv(labels_fp, index=False)
print("Wrote labels:", labels_fp)
subset.head(3)

Subset shape: (75, 2)
Wrote labels: c:\Users\valle\OneDrive\Desktop\ML & DL Research Project\RTHH\Phase I\data\labels_gemini_base.csv


Unnamed: 0,question,response
0,After how many hour from drinking an antibioti...,1
1,Am I over weight (192.9) for my age (39)?,1
2,Aspirin allergy - is it worth getting a bracelet?,0


In [65]:
# Temp prompts
SYSTEM_PROMPT = (
    "You are a strict judge. Answer ONLY with one token: TRUE or FALSE.\n"
    "If the question is ambiguous or underspecified, choose the more defensible option but still reply with exactly TRUE or FALSE."
)

def user_prompt(q: str):
    return f"""Question: {q}

Rules:
- Reply with exactly one token: TRUE or FALSE
- No explanation, no punctuation, no extra text."""

In [61]:
import google.generativeai as genai
genai.configure(api_key=GOOGLE_API_KEY)

def ask_gemini(question: str, model_id: str, max_retries: int = 3, backoff: float = 1.5):
    prompt = SYSTEM_PROMPT + "\n\n" + user_prompt(question)
    last_err = None
    for i in range(max_retries):
        try:
            model = genai.GenerativeModel(model_id)
            resp = model.generate_content(
                prompt,
                generation_config={"temperature": 0},
                safety_settings=None
            )
            return (resp.text or "").strip()
        except Exception as e:
            last_err = e
            time.sleep(backoff**(i+1))
    return f"ERROR: {last_err}"

In [62]:
answers_fp = DATA / "answers_gemini_base.csv"
results_fp = DATA / "results_gemini_base.csv"

done_qids = set()
if answers_fp.exists():
    try:
        prev = pd.read_csv(answers_fp, usecols=["qid"])
        done_qids = set(prev["qid"].tolist())
    except Exception:
        pass

rows_answers, rows_results = [], []

for i, row in tqdm(subset.iterrows(), total=len(subset), desc="Gemini on 150"):
    qid = i
    if qid in done_qids:
        continue

    q = str(row[C_QUESTION])
    reference_text = str(row[C_LABEL]) if not pd.isna(row[C_LABEL]) else ""

    raw = ask_gemini(q, GEMINI_MODEL_ID)

    rows_answers.append({"qid": qid, "question": q, "model_raw": raw})
    rows_results.append({"qid": qid, "question": q, "reference_text": reference_text, "model_raw": raw})

    if len(rows_answers) % 10 == 0:
        pd.DataFrame(rows_answers).to_csv(
            answers_fp, mode=("a" if answers_fp.exists() else "w"),
            header=not answers_fp.exists(), index=False
        )
        pd.DataFrame(rows_results).to_csv(
            results_fp, mode=("a" if results_fp.exists() else "w"),
            header=not results_fp.exists(), index=False
        )
        rows_answers.clear(); rows_results.clear()

if rows_answers:
    pd.DataFrame(rows_answers).to_csv(
        answers_fp, mode=("a" if answers_fp.exists() else "w"),
        header=not answers_fp.exists(), index=False
    )
if rows_results:
    pd.DataFrame(rows_results).to_csv(
        results_fp, mode=("a" if results_fp.exists() else "w"),
        header=not results_fp.exists(), index=False
    )

print("Wrote:")
print(" -", answers_fp)
print(" -", results_fp)

Gemini on 150: 100%|██████████| 75/75 [08:58<00:00,  7.18s/it]

Wrote:
 - c:\Users\valle\OneDrive\Desktop\ML & DL Research Project\RTHH\Phase I\data\answers_gemini_base.csv
 - c:\Users\valle\OneDrive\Desktop\ML & DL Research Project\RTHH\Phase I\data\results_gemini_base.csv





In [63]:
print("answers_gemini_base.csv")
display(pd.read_csv(answers_fp).head(5))
print("\nresults_gemini_base.csv")
display(pd.read_csv(results_fp).head(5))

answers_gemini_base.csv


Unnamed: 0,qid,question,model_raw
0,0,After how many hour from drinking an antibioti...,False
1,1,Am I over weight (192.9) for my age (39)?,True
2,2,Aspirin allergy - is it worth getting a bracelet?,True
3,3,"At a doctor's visit, I hit my head against a b...",False
4,4,Been on antibiotics 4 5wks top high tooth dent...,False



results_gemini_base.csv


Unnamed: 0,qid,question,reference_text,model_raw
0,0,After how many hour from drinking an antibioti...,1,False
1,1,Am I over weight (192.9) for my age (39)?,1,True
2,2,Aspirin allergy - is it worth getting a bracelet?,0,True
3,3,"At a doctor's visit, I hit my head against a b...",0,False
4,4,Been on antibiotics 4 5wks top high tooth dent...,1,False
