In [18]:
import pandas as pd

# Load each CSV into a separate DataFrame
df_jama = pd.read_csv("jama_formatted_questions.csv")
df_medbullets = pd.read_csv("medbullets_op4.csv")
df_medxpert = pd.read_csv("MedXpertQA.csv")
df_mmlu = pd.read_csv("professional_medicine_mmlu.csv")

# Optional: print shape or preview
print("JAMA:", df_jama.shape)
print("MedBullets:", df_medbullets.shape)
print("MedXpertQA:", df_medxpert.shape)
print("MMLU:", df_mmlu.shape)


JAMA: (1034, 19)
MedBullets: (298, 12)
MedXpertQA: (2450, 8)
MMLU: (272, 6)


In [4]:
import pandas as pd

# Standardize into two-column DataFrames
jama_df = pd.DataFrame({
    "question": df_jama["numbered_question"],
    "answer": df_jama["answer_idx"].combine_first(df_jama["answer"]),
    "data_source": "jama"

})

medbullets_df = pd.DataFrame({
    "question": df_medbullets["actual_question"],
    "answer": df_medbullets["answer_idx"].combine_first(df_medbullets["answer"]),
    "data_source": "medbullets"

})

mmlu_df = pd.DataFrame({
    "question": df_mmlu["numbered_question"],
    "answer": df_mmlu["answer"],
    "data_source": "mmlu"
})

medxpert_df = pd.DataFrame({
    "question": df_medxpert["actual_question"],
    "answer": df_medxpert["label"],
    "data_source": "medxpert"
})

# Concatenate all into one DataFrame
merge_df = pd.concat([jama_df, medbullets_df, mmlu_df, medxpert_df], ignore_index=True)

# Preview result
print(merge_df.shape)
merge_df.head(700)


(4054, 3)


Unnamed: 0,question,answer,data_source
0,1. A man in his 30s with AIDS presented with a...,D,jama
1,1. An 80-year-old man with stage II bladder ca...,C,jama
2,1. A 31-year-old man presented with left cervi...,D,jama
3,1. A 53-year-old woman with a history of stage...,C,jama
4,1. A 33-year-old man with no prior ocular prob...,B,jama
...,...,...,...
695,1. A 70-year old man with a history of soft co...,C,jama
696,1. A 40-year-old man of Scandinavian descent p...,B,jama
697,1. A girl in her teens with a history of postc...,B,jama
698,1. A 27-year-old man presented to the clinic f...,A,jama


In [3]:
import sys
!{sys.executable} -m pip install openai


Collecting openai
  Downloading openai-1.75.0-py3-none-any.whl.metadata (25 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.9.0-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting sniffio (from openai)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.8-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.75.0-py3-none-any.whl (646 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.

In [3]:
import openai
import pandas as pd
import csv
import os
from tqdm import tqdm


openai.api_key = "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"  # Replace with your actual API key
client = openai.OpenAI(
    api_key= "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" #Set the API key. See the how-to guide for further instructions
)

In [10]:
def format_centaur_question(text):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": (
                    "You are a medical NLP assistant. Your job is to format clinical vignettes with numbered sentences followed by a multiple-choice question "
                    "into three clear sections:\n\n"
                    "Step 1: Read excerpt — just the numbered sentences\n"
                    "Step 2: Answer QA Details — the diagnostic question and options\n"
                    "Step 3: Determine sentence relevance — repeat the numbered sentences exactly as in Step 1.\n\n"
                    "Do not modify the content, just separate it into these three steps."
                )},
                {"role": "user", "content": text}
            ],
            max_tokens=2048,
            temperature=0.2,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error: {e}"


In [18]:
from pathlib import Path
import time


# Load and slice the full dataframe
sample_df = merge_df.head(4054).copy()

# Checkpoint path for saving progress
checkpoint_path = Path("centaur_checkpoint.csv")

# If checkpoint exists, load it
if checkpoint_path.exists():
    processed_df = pd.read_csv(checkpoint_path)
    start_index = len(processed_df)
    print(f"🔁 Resuming from row {start_index}")
else:
    processed_df = pd.DataFrame(columns=sample_df.columns.tolist() + ["centaur_question"])
    start_index = 0
    print("🚀 Starting fresh")

# Iterate from last saved row
for idx in range(start_index, len(sample_df)):
    row = sample_df.iloc[idx]
    print(f"\n--- Processing Row {idx+1}/{len(sample_df)} ---")
    
    result = format_centaur_question(row["question"])
    print(result)

    # Add result to the row
    new_row = row.copy()
    new_row["centaur_question"] = result

    # Convert to single-row DataFrame and concatenate
    new_df = pd.DataFrame([new_row])
    processed_df = pd.concat([processed_df, new_df], ignore_index=True)

    # Save checkpoint
    processed_df.to_csv(checkpoint_path, index=False)
    time.sleep(1.5)

print("\n✅ All rows processed and saved to 'centaur_checkpoint.csv'")

🔁 Resuming from row 4054

✅ All rows processed and saved to 'centaur_checkpoint.csv'


In [19]:
processed_df.head()

Unnamed: 0,question,answer,data_source,centaur_question,sentence_number
0,1. A man in his 30s with AIDS presented with a...,D,jama,### Step 1: Read excerpt\n1. A man in his 30s ...,6
1,1. An 80-year-old man with stage II bladder ca...,C,jama,### Step 1: Read excerpt\n1. An 80-year-old ma...,13
2,1. A 31-year-old man presented with left cervi...,D,jama,### Step 1: Read excerpt\n1. A 31-year-old man...,17
3,1. A 53-year-old woman with a history of stage...,C,jama,**Step 1: Read excerpt**\n\n1. A 53-year-old w...,6
4,1. A 33-year-old man with no prior ocular prob...,B,jama,**Step 1: Read excerpt**\n\n1. A 33-year-old m...,13


In [24]:
import pandas as pd
import re

processed_df["sentence_number"] = processed_df["question"].apply(count_numbered_sentences)
processed_df.to_csv(checkpoint_path, index=False)  # Save updated version

# --- Step 1: Filter rows with at least 3 numbered sentences ---
filtered_df = merge_df[merge_df["sentence_number"] > 2].copy()

# --- Step 2: Define sampling thresholds ---
LARGE_SAMPLE = 750
SMALL_SAMPLE = 250

# --- Step 3: Sample from each data source ---
sampled_dfs = []
for source in filtered_df["data_source"].unique():
    source_df = filtered_df[filtered_df["data_source"] == source]
    n_rows = len(source_df)

    if n_rows >= LARGE_SAMPLE:
        sampled = source_df.sample(n=LARGE_SAMPLE, random_state=42)
        print(f"✅ Sampled {LARGE_SAMPLE} from {source} (total {n_rows})")
    elif n_rows >= SMALL_SAMPLE:
        sampled = source_df.sample(n=SMALL_SAMPLE, random_state=42)
        print(f"⚠️ Sampled {SMALL_SAMPLE} from {source} (total {n_rows})")
    else:
        print(f"❌ Skipped {source}: only {n_rows} rows (less than {SMALL_SAMPLE})")
        continue

    sampled_dfs.append(sampled)

# --- Step 4: Combine and shuffle ---
processed_df = pd.concat(sampled_dfs, ignore_index=True)
processed_df = processed_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n✅ Final Sample Size:", processed_df.shape[0])
print(processed_df["data_source"].value_counts())

# --- Step 5: Standardize answer labels (A–J) ---
def standardize_answer(ans):
    if pd.isna(ans):
        return None
    ans_str = str(ans).strip().upper()

    mapping = {
        "0": "A", "A": "A",
        "1": "B", "B": "B",
        "2": "C", "C": "C",
        "3": "D", "D": "D",
        "4": "E", "E": "E",
        "5": "F", "F": "F",
        "6": "G", "G": "G",
        "7": "H", "H": "H",
        "8": "I", "I": "I",
        "9": "J", "J": "J"
    }

    for key, val in mapping.items():
        if re.search(rf"\b{key}\b", ans_str):
            return val

    return ans_str  # fallback

processed_df["answer"] = processed_df["answer"].apply(standardize_answer)

# --- Step 6: Add unique ID column (e.g., ID0001, ID0002, ...) ---
processed_df.insert(0, "ID", [f"ID{i+1:04d}" for i in range(len(processed_df))])

# --- Step 7: Export processed dataset to CSV ---
output_file = "merged_2k_questions.csv"
processed_df.to_csv(output_file, index=False)
print(f"\n📁 Saved to: {output_file}")


✅ Sampled 750 from jama (total 1034)
⚠️ Sampled 250 from medbullets (total 296)
⚠️ Sampled 250 from mmlu (total 257)
✅ Sampled 750 from medxpert (total 1913)

✅ Final Sample Size: 2000
data_source
medxpert      750
jama          750
medbullets    250
mmlu          250
Name: count, dtype: int64

📁 Saved to: merged_2k_questions.csv


In [25]:
merge_df.head(700)


Unnamed: 0,question,answer,data_source,sentence_number
0,1. A man in his 30s with AIDS presented with a...,D,jama,6
1,1. An 80-year-old man with stage II bladder ca...,C,jama,13
2,1. A 31-year-old man presented with left cervi...,D,jama,17
3,1. A 53-year-old woman with a history of stage...,C,jama,6
4,1. A 33-year-old man with no prior ocular prob...,B,jama,13
...,...,...,...,...
695,1. A 70-year old man with a history of soft co...,C,jama,15
696,1. A 40-year-old man of Scandinavian descent p...,B,jama,16
697,1. A girl in her teens with a history of postc...,B,jama,20
698,1. A 27-year-old man presented to the clinic f...,A,jama,11


In [26]:
import re
def standardize_answer(ans):
    if pd.isna(ans):
        return None
    ans_str = str(ans).strip().upper()

    mapping = {
        "1": "A", "A": "A",
        "2": "B", "B": "B",
        "3": "C", "C": "C",
        "4": "D", "D": "D",
        "5": "E", "E": "E",
        "6": "F", "F": "F",
        "7": "G", "G": "G",
        "8": "H", "H": "H",
        "9": "I", "I": "I",
        "10": "J", "J": "J"
    }

    for key, value in mapping.items():
        if re.search(rf"\b{key}\b", ans_str):
            return value

    return ans_str  # fallback: return unchanged if no match

# Apply to the merge_df
merge_df["answer"] = merge_df["answer"].apply(standardize_answer)
merge_df.head(700)


Unnamed: 0,question,answer,data_source,sentence_number
0,1. A man in his 30s with AIDS presented with a...,D,jama,6
1,1. An 80-year-old man with stage II bladder ca...,C,jama,13
2,1. A 31-year-old man presented with left cervi...,D,jama,17
3,1. A 53-year-old woman with a history of stage...,C,jama,6
4,1. A 33-year-old man with no prior ocular prob...,B,jama,13
...,...,...,...,...
695,1. A 70-year old man with a history of soft co...,C,jama,15
696,1. A 40-year-old man of Scandinavian descent p...,B,jama,16
697,1. A girl in her teens with a history of postc...,B,jama,20
698,1. A 27-year-old man presented to the clinic f...,A,jama,11


In [27]:
import re

def count_numbered_sentences(text):
    if pd.isna(text):
        return 0
    lines = text.strip().split("\n")
    count = 0
    for line in lines:
        if re.match(r"^\d+\.\s", line.strip()):
            count += 1
        else:
            break  # stop at the first non-numbered line
    return count

# Apply to the question column
merge_df["sentence_number"] = merge_df["question"].apply(count_numbered_sentences)


In [28]:
# Save the updated DataFrame back to the CSV file
merge_df.to_csv("merged_4k_questions.csv", index=False)

In [29]:
import pandas as pd

# Step 1: Filter to only those with >2 numbered sentences
filtered_df = merge_df[merge_df["sentence_number"] > 2]

# Define thresholds
large_sample_size = 750
small_sample_size = 250

# Sample from each source
sampled_dfs = []
sources = filtered_df["data_source"].unique()

for source in sources:
    subset = filtered_df[filtered_df["data_source"] == source]
    source_size = len(subset)

    if source_size >= large_sample_size:
        sampled = subset.sample(n=large_sample_size, random_state=42)
        print(f"✅ Sampled 500 from {source} (total {source_size})")
    elif source_size >= small_sample_size:
        sampled = subset.sample(n=small_sample_size, random_state=42)
        print(f"⚠️ Sampled 250 from {source} (total {source_size})")
    else:
        print(f"❌ Skipped {source}: only {source_size} rows (less than 250)")
        continue

    sampled_dfs.append(sampled)

# Combine and shuffle
sampled_df = pd.concat(sampled_dfs).sample(frac=1, random_state=42).reset_index(drop=True)

# Summary
print("\n✅ Final Sample Size:", sampled_df.shape[0])
print(sampled_df["data_source"].value_counts())

import re
def standardize_answer(ans):
    if pd.isna(ans):
        return None
    ans_str = str(ans).strip().upper()

    mapping = {
        "0": "A", "A": "A",
        "1": "B", "B": "B",
        "2": "C", "C": "C",
        "3": "D", "D": "D",
        "4": "E", "E": "E",
        "5": "F", "F": "F",
        "6": "G", "G": "G",
        "7": "H", "H": "H",
        "8": "I", "I": "I",
        "9": "J", "J": "J"
    }

    for key, value in mapping.items():
        if re.search(rf"\b{key}\b", ans_str):
            return value

    return ans_str  # fallback: return unchanged if no match

# Apply to the merge_df
sampled_df["answer"] = sampled_df["answer"].apply(standardize_answer)
# merge_df.head(700)


✅ Sampled 500 from jama (total 1034)
⚠️ Sampled 250 from medbullets (total 296)
⚠️ Sampled 250 from mmlu (total 257)
✅ Sampled 500 from medxpert (total 1913)

✅ Final Sample Size: 2000
data_source
medxpert      750
jama          750
medbullets    250
mmlu          250
Name: count, dtype: int64


In [30]:
# Add ID column with zero-padded values (e.g., ID0001, ID0002, ...)
sampled_df.insert(0, "ID", ["ID{:04d}".format(i + 1) for i in range(len(sampled_df))])
sampled_df.to_csv("merged_2k_questions.csv", index=False)

In [31]:
# Filter to rows not in sampled_df and from medbullets only
not_sampled_medbullets = merge_df[
    (~merge_df["question"].isin(sampled_df["question"])) &
    (merge_df["data_source"] == "mmlu")
]

# Show 3 full rows
not_sampled_medbullets.head(10)


Unnamed: 0,question,answer,data_source,sentence_number
1337,1. Six healthy subjects participate in a study...,0,mmlu,2
1345,1. A 32-year-old male presents to the office w...,C,mmlu,2
1348,1. A 5-year-old boy is admitted to the hospita...,C,mmlu,10
1379,"1. During a study of renal glomeruli, a health...",C,mmlu,1
1396,1. A 37-year-old woman with right lower extrem...,A,mmlu,2
1405,1. A 23-year-old woman with bone marrow failur...,B,mmlu,2
1408,1. An epidemic involving 10 individuals of all...,A,mmlu,5
1429,1. A sexually active 20-year-old woman has had...,A,mmlu,4
1439,1. A 43-year-old female presents to the office...,A,mmlu,2
1440,1. A 64-year-old male presents to the emergenc...,C,mmlu,5


In [32]:
print("JAMA columns:", list(df_jama.columns))
print("MedBullets columns:", list(df_medbullets.columns))
print("MMLU columns:", list(df_mmlu.columns))
print("MedXpert columns:", list(df_medxpert.columns))


JAMA columns: ['link', 'question', 'opa', 'opb', 'opc', 'opd', 'diagnosis', 'answer_idx', 'answer', 'explanation', 'field', 'actual_question', 'id', 'formatted_question', 'gpt_direct_prediction', 'gpt_no_bullet_direct_prediction', 'gpto3_mini_no_bullet_direct_prediction', 'gpto3_reasoning', 'numbered_question']
MedBullets columns: ['link', 'question', 'opa', 'opb', 'opc', 'opd', 'answer_idx', 'answer', 'explanation', 'actual_question', 'number_sentences', 'bullet_question']
MMLU columns: ['question', 'choices', 'answer', 'actual_question', 'numbered_question', 'number_sentences']
MedXpert columns: ['id', 'question', 'options', 'label', 'medical_task', 'body_system', 'question_type', 'actual_question']


In [33]:
import re

def fix_question_formatting(text):
    # Patterns to detect question prompts (extendable)
    prompts = [
        "What Would You Do Next\?",
        "What Is Your Diagnosis\?",
        "What Is the Most Likely Diagnosis\?",
        "Which of the following is the most likely diagnosis\?",
        "What Is the Most Likely Cause\?"
    ]
    
    # Combine into one regex pattern
    pattern = r"(?<![\.\n])\s*(" + "|".join(prompts) + r")"

    # Substitute: ensure a period and line break before the question
    fixed_text = re.sub(pattern, r".\n\1", text)

    return fixed_text.strip()

def auto_split_context_question(row):
    # First, fix the format in the full question string
    text = fix_question_formatting(row["question"])

    # Then extract question and context as before
    match = re.search(r"(What|Which)[^\n]+?\?", text, flags=re.IGNORECASE)

    if match:
        q_start = match.start()
        context = text[:q_start].strip()
        question_prompt = text[q_start:match.end()].strip()
    else:
        # Fallback: last sentence
        sentences = re.split(r'(?<=[.?!])\s+', text)
        context = " ".join(sentences[:-1])
        question_prompt = sentences[-1]

    options = [
        f"A: {row['opa']}",
        f"B: {row['opb']}",
        f"C: {row['opc']}",
        f"D: {row['opd']}"
    ]
    question_full = question_prompt + "\n\n" + "\n".join(options)

    return pd.Series([context, question_full], index=["context", "question"])

jama_df = pd.concat([
    df_jama.apply(auto_split_context_question, axis=1),
    df_jama["answer_idx"].combine_first(df_jama["answer"]).rename("answer"),
    pd.Series(["jama"] * len(df_jama), name="data_source")
], axis=1)

# Optional preview
print("✅ JAMA processed:", jama_df.shape)
# Temporarily show full column contents
with pd.option_context("display.max_colwidth", None):
    display(jama_df.head())

✅ JAMA processed: (1034, 4)


Unnamed: 0,context,question,answer,data_source
0,"A man in his 30s with AIDS presented with acute-onset painful scattered umbilicated papulopustules and ovoid ulcerated plaques with elevated, pink borders on the face, trunk, and extremities (Figure, A). The patient also had a new-onset cough but was afebrile and denied other systemic symptoms. Due to his significant immunocompromise, the clinical presentation was highly suspicious for infection. For rapid bedside differentiation of multiple infectious etiologies, a Tzanck smear was performed by scraping the base of an ulcerated lesion and inner aspect of a pseudopustule and scraping its base with a #15 blade. These contents were placed on a glass slide, fixed, and stained with Wright-Giemsa and subsequently Papanicolaou staining to further characterize the changes seen.A, Clinical image demonstrating papulopustules and ovoid ulcerated plaques with elevated, pink borders on the elbows. B, Tzanck smear using Wright-Giemsa staining of specimen demonstrating ballooning of keratinocytes and peripheralization of nuclear material (original magnification ×20). .",What Is Your Diagnosis?\n\nA: Herpes simplex virus\nB: Histoplasmosis\nC: Molluscum contagiosum\nD: Mpox,D,jama
1,"An 80-year-old man with stage II bladder carcinoma (T2NXM0) and atrial fibrillation treated with apixaban presented to the emergency department with 1 week of fatigue and 2 days of dyspnea on exertion. One week prior to presentation, he received a fourth cycle of carboplatin/gemcitabine for bladder carcinoma with 6 mg of pegylated granulocyte colony-stimulating factor (G-CSF). The patient reported no anorexia, fever, melena, hematemesis, hematuria, cough, orthopnea, or peripheral edema.His vital signs were normal except for a heart rate of 103/min. His white blood cell count was 22 × 103/μL (reference, 4-11 × 103/μL), increased from 4.8 × 103/μL 8 days prior. His manual differential, which was previously normal, showed 18% bands (0%-10%), 2% metamyelocytes, 7% myelocytes, 7% promyelocytes, and 6% blasts. His hemoglobin level was 5.2 g/dL (reference, 13-17 g/dL), decreased from 7.4 g/dL, and platelets were 25 × 103/μL (reference, 150-420 × 103/μL), decreased from 268 × 103/μL 8 days prior. Ferritin was 1423 ng/mL (reference, 300-400 ng/mL). Mean corpuscular volume, prothrombin time, international normalized ratio, partial thromboplastin time, fibrinogen, haptoglobin, vitamin B12, and methylmalonic acid values were normal, and results of a direct antiglobulin test were negative. A computed tomography (CT) scan of his abdomen and pelvis was normal. He received 2 units of packed red blood cells and was admitted to the hospital. Flow cytometry identified a small population of CD34+/CD117+ cells (Figure).Left, Peripheral blood smear showing normocytic anemia with anisopoikilocytosis and leukocytosis with 6% to 8% blast forms. Right, Flow cytometry of peripheral blood demonstrating a small population of white blood cells that stained positive for CD34 and CD117, which are markers of immature myeloblasts.Esophagogastroduodenoscopy revealed 2 nonbleeding angioectasias in the stomach that were treated with argon plasma coagulation. Three days after admission, his white blood cell count was 27.7 × 103/μL with 4% peripheral blasts, hemoglobin was 7.3 g/dL, and platelet count had increased to 92 × 103/μL without a platelet transfusion.Repeat complete blood cell count with differential in 1 to 2 weeks.",What Would You Do Next?\n\nA: Perform a bone marrow biopsy\nB: Prescribe all-trans retinoic acid\nC: Repeat complete blood cell count with differential in 1 to 2 weeks\nD: Start cytoreductive therapy with hydroxyurea,C,jama
2,"A 31-year-old man presented with left cervical and left inguinal masses. He reported intermittent itching and night sweats for 2 years. He denied fever, weight loss, shortness of breath, rashes, diarrhea, and neurological symptoms. On a preemployment evaluation, the patient was told he had a high white blood cell count 2 years ago. On examination, there was left cervical and inguinal lymphadenopathy and no other organomegaly. Complete blood cell count and peripheral blood smear showed marked leukocytosis, with a white blood cell count of 22 340/μL, an absolute neutrophil count of 5360/μL, and 55% eosinophils with an absolute eosinophil count of 12 290/μL (to convert all to cells ×109/L, multiply by 0.001). Vitamin B12 was markedly elevated at more than 4000 pg/mL (to convert to pmol/L, multiply by 0.7378). The erythrocyte sedimentation rate was 5 mm/h. Lactate dehydrogenase was 180 U/L, and alkaline phosphatase was 81 U/L (to convert both to μkat/L, multiply by 0.0167). Evaluations for HIV and hepatitis B and C were all negative. Serum creatinine was 0.76 mg/dL (to convert to μmol/L, multiply by 88.4); alanine aminotransferase and aspartate aminotransferase were 11 U/L and 9.9 U/L, respectively (to convert to μkat/L, multiply by 0.0167); and total bilirubin was 0.35 mg/dL (to convert to μmol/L, multiply by 17.104). Bone marrow biopsy showed hypercellular marrow (cellularity of 100%), myeloid hyperplasia, increased eosinophils with some dysplasia, and a blast count of 2%. Positron emission tomographic–computed tomographic scan showed a left upper cervical lymph node of 2.6 cm and a left inguinal lymph node of 3.1 × 2.3 cm with an standardized uptake value max of 5.7 (Figure, A). Left inguinal lymph node biopsy showed partial involvement by atypical cells with high proliferation index (Ki-67 >95%) that were positive for CD3, CD4, CD8, BCL2, and TDT, suggestive of T-cell lymphoblastic lymphoma/leukemia (Figure, B).A, Positron emission tomographic–computed tomographic (PET/CT) scan of the head and neck at presentation showing a left upper cervical lymph node of 2.6 cm (arrowhead). B, Lymph node biopsy immunohistochemical stain with terminal deoxynucleotidyl transferase. The inset shows interphase fluorescence in situ hybridization for FIP1L1::PDGFRA rearrangement (positive). C, PET/CT 12 weeks after treatment initiation.Myeloid/lymphoid neoplasms with eosinophilia and tyrosine kinase gene fusions.",What Is Your Diagnosis?\n\nA: Kimura disease\nB: Classic Hodgkin lymphoma\nC: T-cell acute lymphoblastic lymphoma/leukemia\nD: Myeloid/lymphoid neoplasms with eosinophilia and tyrosine kinase gene fusions,D,jama
3,"A 53-year-old woman with a history of stage IVA1 (T4N1M0B2) mycosis fungoides presented with a new 1-month history of hyperpigmentation of the oral mucosa (Figure 1). Examination of the mouth revealed multiple coalescing painless nonpruritic black macules and patches on the tongue, roof of the mouth, and buccal mucosa. Examination of the skin was notable for erythematous and hyperpigmented patches covering 90% of the body surface area, consistent with her known mycosis fungoides. Other notable findings on examination were 1- to 2-cm lymphadenopathy in the bilateral inguinal folds and axillae. Review of systems was notable for fatigue. The patient had previously received 5 cycles of romidepsin with progression of disease, followed by 4 doses of pegylated liposomal doxorubicin hydrochloride, which was followed by partial response.Black macules and patches on the tongue, roof of the mouth, and buccal mucosa. .",What Is Your Diagnosis?\n\nA: Laugier-Hunziker syndrome\nB: Melanoma\nC: Medication adverse effect\nD: Oral involvement of mycosis fungoides,C,jama
4,"A 33-year-old man with no prior ocular problems presented to the emergency department in central Florida with a chief complaint of a “pulling and popping” sensation in his left eye that had occurred the previous night. Ophthalmology was consulted to evaluate for a conjunctival foreign body of the left eye. At the time of the examination, his symptoms had resolved; however, he had a photograph from a cellular phone taken during the episode (Figure 1). The photograph shows an irregular, serpiginous extension from beneath the plica semilunaris toward the corneal limbus with localized conjunctival hyperemia. He reported a similar sensation of movement in his left eye that occurred for 1 night about 5 years ago for which he visited an urgent care center where he was diagnosed with allergic conjunctivitis. He also reported recent swelling of the left side of his face with associated numbness and occasional swelling of his left hand, all of which resolved after a few days. He had immigrated from Nigeria 10 years prior, had not returned since, and was working as a traveling nurse. A slitlamp examination did not reveal any conjunctival hyperemia, foreign bodies, or other abnormalities like those shown in the photograph. His uncorrected visual acuity was 20/20, extraocular movements were full and without pain or abnormal sensation, and intraocular pressure was normal. His dilated fundus examination was unremarkable. A comprehensive blood cell count revealed mild elevation in the relative (but not absolute) eosinophil count (6.9% reference; 6.0% of white blood cells).Patient cellular phone photograph of the left eye shows an irregular serpiginous extension from beneath the plica semilunaris toward the corneal limbus with localized conjunctival hyperemia.Exploration of conjunctiva and removal of foreign body.",What Would You Do Next?\n\nA: Treatment with diethylcarbamazine\nB: Peripheral blood smear\nC: Serological testing for onchocerciasis\nD: Exploration of conjunctiva and removal of foreign body,B,jama


In [41]:
import pandas as pd
import re

# --- Split functions ---
def split_question_by_answer_choices(row):
    text = row["question"].strip()
    
    # If no explicit delimiter, return full text as context
    if "\nAnswer Choices:" not in text:
        return pd.Series([text, ""], index=["context", "question"])

    # Split at delimiter
    pre_question, choices = text.split("\nAnswer Choices:", 1)
    sentences = re.split(r'(?<=[.?!])\s+', pre_question.strip())

    if len(sentences) <= 1:
        context = ""
        question = pre_question.strip() + "\nAnswer Choices:" + choices.strip()
    else:
        context = " ".join(sentences[:-1])
        question_prompt = sentences[-1]
        question = question_prompt.strip() + "\nAnswer Choices:" + choices.strip()

    # Fallback: ensure both fields are filled
    if not context:
        context = text
    if not question:
        question = text

    return pd.Series([context, question], index=["context", "question"])

def split_mmlu_context_and_question(row):
    full_text = str(row["question"]).strip()
    choices = eval(row["choices"]) if isinstance(row["choices"], str) else row["choices"]

    # Split into sentences. Assume last sentence is the actual question.
    sentences = re.split(r'(?<=[.?!])\s+', full_text)
    if len(sentences) < 2:
        return pd.Series([full_text, ""])  # fallback

    context = " ".join(sentences[:-1])
    question_text = sentences[-1]

    # Format choices with A–J
    option_labels = list("ABCDEFGHIJ")
    formatted_choices = [
        f"{label}. {choice.strip()}" for label, choice in zip(option_labels, choices)
    ]
    formatted_question = question_text + "\n\n" + "\n\n".join(formatted_choices)

    return pd.Series([context, formatted_question], index=["context", "question"])

def medbullets_auto_split_context_question(row):
    # First, fix the format in the full question string
    text = fix_question_formatting(row["question"])

    # Then extract question and context as before
    match = re.search(r"(What|Which)[^\n]+?\?", text)

    if match:
        q_start = match.start()
        context = text[:q_start].strip()
        question_prompt = text[q_start:match.end()].strip()
    else:
        # Fallback: last sentence
        sentences = re.split(r'(?<=[.?!])\s+', text)
        context = " ".join(sentences[:-1])
        question_prompt = sentences[-1]

    options = [
        f"A: {row['opa']}",
        f"B: {row['opb']}",
        f"C: {row['opc']}",
        f"D: {row['opd']}"
    ]
    question_full = question_prompt + "\n\n" + "\n".join(options)

    return pd.Series([context, question_full], index=["context", "question"])


# --- Build harmonized LLM datasets ---
medxpert_df = pd.concat([
    df_medxpert.apply(split_question_by_answer_choices, axis=1),
    df_medxpert["label"].rename("answer"),
    pd.Series(["medxpert"] * len(df_medxpert), name="data_source")
], axis=1)

mmlu_df = pd.concat([
    df_mmlu.apply(split_mmlu_context_and_question, axis=1),
    df_mmlu["answer"].rename("answer"),
    pd.Series(["mmlu"] * len(df_mmlu), name="data_source")
], axis=1)


medbullets_df = pd.concat([
    df_medbullets.apply(medbullets_auto_split_context_question, axis=1),
    df_medbullets["answer_idx"].combine_first(df_jama["answer"]).rename("answer"),
    pd.Series(["medbullets"] * len(df_medbullets), name="data_source")
], axis=1)

# --- Final merge ---
merge_llm_df = pd.concat(
    [jama_df, medbullets_df, medxpert_df, mmlu_df],
    ignore_index=True
)[["context", "question", "answer", "data_source"]]

# Preview
print("✅ Merged LLM dataset shape:", merge_llm_df.shape)
# merge_llm_df.head()
with pd.option_context("display.max_colwidth", None):
    display(merge_llm_df[merge_llm_df["data_source"] == "medbullets"].head(400))

✅ Merged LLM dataset shape: (4790, 4)


Unnamed: 0,context,question,answer,data_source
1034,A 42-year-old woman is enrolled in a randomized controlled trial to study cardiac function in the setting of several different drugs. She is started on verapamil and instructed to exercise at 50% of her VO2 max while several cardiac parameters are being measured.,"During this experiment, which of the following represents the relative conduction speed through the heart from fastest to slowest?\n\nA: AV node > ventricles > atria > Purkinje fibers\nB: Purkinje fibers > ventricles > atria > AV node\nC: Purkinje fibers > atria > ventricles > AV node\nD: Purkinje fibers > AV node > ventricles > atria",C,medbullets
1035,"A 9-year-old girl presents to the emergency department with a fever and a change in her behavior. She presented with similar symptoms 6 weeks ago and was treated for an Escherchia coli infection. She also was treated for a urinary tract infection 10 weeks ago. Her mother says that last night her daughter felt ill, and her condition has been worsening. Her daughter experienced a severe headache and had a stiff neck. This morning she was minimally responsive, vomited several times, and produced a small amount of dark cloudy urine. The patient was born at 39 weeks and met all her developmental milestones. She is currently up to date on her vaccinations and did not have infections during early childhood. Her parents are divorced and her father has noted she does not seem to get sick when he takes care of her. Her temperature is 99.5°F (37.5°C), blood pressure is 60/35 mmHg, pulse is 190/min, respirations are 33/min, and oxygen saturation is 98% on room air. The patient is started on intravenous fluids, vasopressors, and broad-spectrum antibiotics.",Which of the following is the most appropriate underlying explanation for this patient's presentation?\n\nA: Gastroenteritis\nB: Intentional contamination\nC: Meningitis\nD: Urinary tract infection,B,medbullets
1036,"A 1-year-old girl is brought to a neurologist due to increasing seizure frequency over the past 2 months. She recently underwent a neurology evaluation which revealed hypsarrhythmia on electroencephalography (EEG) with a mix of slow waves, multifocal spikes, and asynchrony. Her parents have noticed the patient occasionally stiffens and spreads her arms at home. She was born at 38-weeks gestational age without complications. She has no other medical problems. Her medications consist of lamotrigine and valproic acid. Her temperature is 98.3°F (36.8°C), blood pressure is 90/75 mmHg, pulse is 94/min, and respirations are 22/min. Physical exam reveals innumerable hypopigmented macules on the skin and an irregularly shaped, thickened, and elevated plaque on the lower back.",Which of the following is most strongly associated with this patient's condition?\n\nA: Cardiac rhabdomyoma\nB: Glaucoma\nC: Optic glioma\nD: Polyostotic fibrous dysplasia,A,medbullets
1037,"A 17-year-old boy presents to his primary care physician with a chief concern of ""bad"" skin that has not improved despite home remedies. The patient has had lesions on his face that have persisted since he was 13 years of age. He has a diet high in refined carbohydrates and has gained 20 pounds since starting high school. Physical exam is notable for the findings in Figure A. The patient is started on benzoyl peroxide and topical retinoids. He returns 1 month later stating that his symptoms are roughly the same.",Which of the following is the most appropriate next step in management?\n\nA: Continue current therapy for 1 more month\nB: Dietary intervention\nC: Isoretinoin\nD: Topical antibiotics,D,medbullets
1038,"A 55-year-old woman is brought to the emergency department by her husband with a 1 hour history of an unremitting headache. The headache started suddenly while she was eating dinner and she says it feels like the “worst headache of my life.” An emergent CT scan of the head without contrast confirms the diagnosis, and a CT angiogram identifies the source of bleeding. The patient undergoes surgical management of her condition. On hospital day 3, she is found to be disoriented to person, place, and time. She also develops nausea and vomiting. Her medical problems consist of heart failure for which she takes furosemide, spironolactone, and metoprolol, which were continued at admission. Her temperature is 99.6°F (37.6°C), blood pressure is 100/60 mmHg, pulse is 112/min, and respirations are 16/min. Physical examination shows poor skin turgor. Capillary refill time is 4 seconds. Serum laboratory results are shown below: Na+: 120 mEq/L Cl-: 92 mEq/L K+: 3.9 mEq/L HCO3-: 26 mEq/L BUN: 32 mg/dL Creatinine: 1.0 mg/dL Serum osmolality is 265 mEq/L and urine osmolality is 340 mEq/L. Urine sodium is 44 mEq/L. .",Which of the following is the most likely diagnosis?\n\nA: Cerebral salt wasting\nB: Diuretic overuse\nC: Primary polydipsia\nD: Syndrome of inappropriate anti-diuretic hormone,A,medbullets
...,...,...,...,...
1327,"A 42-year-old man presents to the urgent care clinic with low back pain. He was working on a home improvement project the day prior to presentation when the pain started. He describes the pain as ""achy and sore."" It is not positional and does not radiate. He denies fevers, chills, paresthesias, and bowel or bladder incontinence. He has a history of a distal radius fracture 2 years ago from falling off a ladder. He drinks 3 alcoholic beverages weekly and denies illicit drug use. The patient’s temperature is 98.4°F (36.9°C), blood pressure is 124/80 mmHg, pulse is 90/min, and respirations are 16/min. His body mass index (BMI) is 22.4 kg/m^2. There is tenderness to palpation of his paravertebral lumbar region bilaterally. Perineal and dermatomal sensation is symmetric and intact. Strength is 5/5 to knee flexion/extension and ankle dorsiflexion/plantarflexion. Patellar and Achilles reflexes are 2+ bilaterally. Raising either leg while the patient is in the supine position does not elicit any pain. .",Which of the following is the most likely diagnosis?\n\nA: Osteoarthritis\nB: Vertebral compression fracture\nC: Lumbar strain\nD: Disc herniation,C,medbullets
1328,"A 77-year-old man presents to the emergency department acutely obtunded. The patient lives alone and was found unresponsive by his son. Generally, the patient manages his own finances, medications, and works part-time. He has not been responding to phone calls for the past 3 days. The patient is unable to offer a history. He has a past medical history of hypothyroidism, depression, and diabetes. His temperature is 88.0°F (31.1°C), blood pressure is 92/62 mmHg, pulse is 35/min, respirations are 9/min, and oxygen saturation is 92% on room air. The patient is cold to the touch and moves all extremities to painful stimuli. His pupils are reactive and sluggish, and he does not follow commands. There are no signs of trauma or skin infections. The patient is started on IV fluids and hydrocortisone, is externally warmed, and is started on a norepinephrine drip. An ECG is performed as seen in Figure A.",Which of the following is the most appropriate next step in management?\n\nA: Levothyroxine administration\nB: Free T4 level\nC: Thyroid stimulating hormone and free T4 level\nD: Triiodothyronine administration,A,medbullets
1329,"A 52-year-old G3P3 woman presents to clinic with a 2-year history of urinary incontinence. She has had frequent, involuntary loss of urine over the last 2 years but presented today because of 10 days of dysuria. She has been treated for 2 urinary tract infections over the past 6 months. She does not endorse any loss of urine with coughing or laughter. She has no chronic medical illnesses and no surgical history. Her temperature is 99.5°F (37.5°C), blood pressure is 120/80 mmHg, pulse is 92/min, and respirations are 15/min. Her BMI is 30 kg/m^2. On pelvic examination, the vagina is dry and atrophic with a 2-cm tender, palpable anterior vaginal mass. The mass does not change with Valsava maneuver. Her postvoid residual volume is 60 mL. .",Which of the following is the most likely diagnosis?\n\nA: Urethral diverticulum\nB: Overflow incontinence\nC: Stress urinary incontinence\nD: Pelvic organ prolapse,A,medbullets
1330,"A 48-year-old man presents to the emergency room with a 2-hour history of severe abdominal pain, nausea, and vomiting. He states that he has not passed gas or had a bowel movement in 4 days and his pain has worsened and become constant over the past 2 hours. His only medical history includes an appendectomy that he underwent as a child, and he takes no daily medications. His temperature is 38.5°C (101.3°F), blood pressure is 92/60 mmHg, pulse is 138/min, and respirations are 25/min. His pulse oximetry is 99% on room air.There are no cardiopulmonary abnormalities on auscultation. His abdomen is distended and tender in all quadrants, with guarding and rebound present. He also has increased bowel sounds throughout. Laboratory results are as follows:\n\nHemoglobin: 11 g/dL\nLeukocyte count: 16,500/mm^3 with normal differential\nPlatelets: 250,000/mm^3\n\nSerum:\nCreatinine: 1.0 mg/dL\nGlucose: 95 mg/dL\nLipase: 45 U/L\nTotal bilirubin: 0.8 mg/dL\nAlkaline phosphatase: 74 U/L\nAspartate aminotransferase (AST, GOT): 32 U/L\nAlanine aminotransferase (ALT, GPT): 45 U/L\nLactate: 7.0 mmol/L","Which of the following is the most appropriate next step in management?\n\nA: Supportive care, NPO, and intravenous fluids\nB: Urgent surgical intervention\nC: Nasogastric tube placement\nD: CT angiography of the abdomen and pelvis",B,medbullets


In [42]:
# Apply the standardization to the 'answer' column
merge_llm_df["answer"] = merge_llm_df["answer"].apply(standardize_answer)


In [36]:
merge_llm_df.to_csv("merged_llm_4k_questions.csv", index=False)

In [43]:
merge_llm_df[merge_llm_df["data_source"] == "medbullets"].to_csv("medbullets_only.csv", index=False)


In [46]:
# Clean and standardize the ground truth and predicted answers
df["answer_letter"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_letter"] = df["predicted_answer"].astype(str).str.strip().str.upper()

# Compare predictions to ground truth
df["match"] = df.apply(
    lambda row: "Correct" if row["predicted_letter"] == row["answer_letter"] else "Incorrect",
    axis=1
)

# Convert to binary for statistics
df["binary_match"] = df["match"].map({"Correct": 1, "Incorrect": 0})

# Compute overall accuracy
correct_count = df["binary_match"].sum()
total_count = df["binary_match"].notna().sum()
accuracy = correct_count / total_count if total_count > 0 else 0

print(f"Correct Predictions: {correct_count}")
print(f"Total Predictions Compared: {total_count}")
print(f"Overall Accuracy: {accuracy:.2%}")

# Compute per-data source accuracy and std deviation
if "data_source" in df.columns:
    for source in df["data_source"].dropna().unique():
        source_df = df[df["data_source"] == source]
        correct = source_df["binary_match"].sum()
        total = source_df["binary_match"].notna().sum()
        acc = correct / total if total > 0 else 0
        std = source_df["binary_match"].std(ddof=1) if total > 1 else float("nan")

        print(f"\nData Source: {source}")
        print(f"  Correct Predictions: {correct}")
        print(f"  Total Predictions: {total}")
        print(f"  Accuracy: {acc:.2%}")
        print(f"  Std Dev: {std:.4f}")
else:
    print("\nColumn 'data_source' not found in the dataset.")

Correct Predictions: 896
Total Predictions Compared: 2008
Overall Accuracy: 44.62%

Data Source: medbullets
  Correct Predictions: 160
  Total Predictions: 252
  Accuracy: 63.49%
  Std Dev: 0.4824

Data Source: medxpert
  Correct Predictions: 62
  Total Predictions: 754
  Accuracy: 8.22%
  Std Dev: 0.2749

Data Source: jama
  Correct Predictions: 450
  Total Predictions: 752
  Accuracy: 59.84%
  Std Dev: 0.4905

Data Source: mmlu
  Correct Predictions: 224
  Total Predictions: 250
  Accuracy: 89.60%
  Std Dev: 0.3059
