In [1]:
# import libraries
# !pip install -q datasets pandas

from datasets import load_dataset
import pandas as pd
import hashlib, pathlib, os

In [2]:
# Run configuration: source dataset identifier, subset size, and hashing seed
DATASET_NAME = "keivalya/MedQuad-MedicalQnADataset"
SAMPLE_SIZE = 150
SEED = 12345

# Output folder (created on demand, including any missing parent folders)
OUT_DIR = pathlib.Path("data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Export targets; the sample size is embedded in each file name
CSV_PATH = OUT_DIR.joinpath(f"medquestions_subset_{SAMPLE_SIZE}.csv")
JSONL_PATH = OUT_DIR.joinpath(f"medquestions_subset_{SAMPLE_SIZE}.jsonl")

# Echo the active settings so they are recorded in the cell output
for label, value in (
    ("Dataset name:", DATASET_NAME),
    ("Output folder:", OUT_DIR),
    ("Sample size:", SAMPLE_SIZE),
):
    print(label, value)

Dataset name: keivalya/MedQuad-MedicalQnADataset
Output folder: data
Sample size: 150


In [3]:
def _first_nonblank(record, keys):
    """Return the first non-blank, stripped text found under ``keys``, else ""."""
    for key in keys:
        value = record.get(key)
        if not value:
            # Missing, None, or otherwise falsy: try the next key variation.
            continue
        # Coerce non-string values to text instead of failing on .strip().
        text = value if isinstance(value, str) else str(value)
        text = text.strip()
        if text:
            return text
    return ""


def normalize_row(record):
    """
    Normalizes a dataset record so it always has 'question' and 'answer' fields.

    The question is looked up under 'question', 'Question', then 'query'; the
    answer under 'answer', 'Answer', then 'response'. For each field the first
    candidate whose value is non-blank after stripping whitespace is used, so a
    whitespace-only value under one key does not hide a populated alternate
    key. Non-string values are converted with str() before stripping.

    Args:
        record: Mapping-like object exposing ``.get(key)``.

    Returns:
        dict with exactly the keys 'question' and 'answer'; each value is a
        stripped string, or "" when no candidate key holds usable text.
    """
    return {
        "question": _first_nonblank(record, ("question", "Question", "query")),
        "answer": _first_nonblank(record, ("answer", "Answer", "response")),
    }

def stable_key(q: str, a: str, seed: int = SEED) -> int:
    """
    Derives a deterministic integer sort key from a question/answer pair.

    The seed, question, and answer are joined with "||", encoded as UTF-8
    (characters that cannot be encoded are dropped), and hashed with SHA-256.
    The first 8 bytes of the digest are read as an unsigned big-endian integer.
    """
    payload = f"{seed}||{q}||{a}"
    digest = hashlib.sha256(payload.encode("utf-8", "ignore")).digest()
    leading_bytes = digest[:8]
    return int.from_bytes(leading_bytes, byteorder="big", signed=False)

In [4]:
# Stream the train split, normalize every record, and keep only the pairs
# where both the question and the answer are non-empty.
print("Loading dataset from Hugging Face...")
dataset_stream = load_dataset(DATASET_NAME, split="train", streaming=True)

rows = [
    pair
    for pair in map(normalize_row, dataset_stream)
    if pair["question"] and pair["answer"]
]

print(f"Loaded {len(rows):,} total valid question–answer pairs.")

Loading dataset from Hugging Face...
Loaded 16,407 total valid question–answer pairs.


In [5]:
# Tag each row with its hash-derived key (stored in place under "_key")
for row in rows:
    row.update(_key=stable_key(row["question"], row["answer"], SEED))

# Order rows by that key and keep the first SAMPLE_SIZE of them
rows.sort(key=lambda row: row["_key"])
subset = rows[:SAMPLE_SIZE]

# Number the selected rows with zero-padded sequential question IDs
for position, row in enumerate(subset):
    row["qid"] = f"medquestions-{SAMPLE_SIZE}-{position:04d}"

print(f"Selected {len(subset)} stable subset rows.")

Selected 150 stable subset rows.


In [6]:
# Keep only the exported columns, in this order (drops the helper "_key" column)
export_columns = ["qid", "question", "answer"]
df = pd.DataFrame(subset).loc[:, export_columns]

# Write the same table twice: CSV without the index, and JSON Lines
# (one record per line, non-ASCII characters written unescaped)
df.to_csv(CSV_PATH, index=False)
df.to_json(JSONL_PATH, orient="records", lines=True, force_ascii=False)

print("Saved files:")
for label, path in (("  CSV:", CSV_PATH), ("  JSONL:", JSONL_PATH)):
    print(label, path)

Saved files:
  CSV: data\medquestions_subset_150.csv
  JSONL: data\medquestions_subset_150.jsonl


In [7]:
# Round-trip check: read the CSV back from disk, report its row count,
# and display the first five rows
df_check = pd.read_csv(CSV_PATH)
rows_read = df_check.shape[0]
print(f"Verified {rows_read} rows loaded from CSV.")
df_check.head()

Verified 150 rows loaded from CSV.


Unnamed: 0,qid,question,answer
0,medquestions-150-0000,What are the genetic changes related to famili...,Mutations in the APC gene cause both classic a...
1,medquestions-150-0001,What are the treatments for Noonan syndrome ?,These resources address the diagnosis or manag...
2,medquestions-150-0002,How to diagnose National Hormone and Pituitary...,CJD is usually diagnosed based on signs and sy...
3,medquestions-150-0003,Is Spastic diplegia cerebral palsy inherited ?,Is spastic diplegia cerebral palsy inherited? ...
4,medquestions-150-0004,Is restless legs syndrome inherited ?,The inheritance pattern of restless legs syndr...
