In [None]:
import json
import os
import random
import time

import google.generativeai as genai
import pandas as pd

from constants import GEMINI_KEY

# Gemini setup: authenticate the client, then build the model handle that
# generate_patient() sends its prompts to.
GEMINI_MODEL_NAME = "gemini-2.5-flash-preview-04-17"

genai.configure(api_key=GEMINI_KEY)
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

In [None]:
# Example records; one is picked at random per request and its "text" field
# is embedded in the prompt as inspiration.
with open("res/patient_data.json", "r", encoding="utf-8") as examples_file:
    examples = json.load(examples_file)

# ICD diagnosis table, plus the list of distinct non-null codes (in order of
# first appearance) from which secondary diagnoses are sampled.
icd_df = pd.read_csv("res/codes_icd_diagnosis.csv")
icd_codes = icd_df["icd_code"].dropna().drop_duplicates().tolist()

# Primary diagnoses the synthetic patients may receive: ICD code -> description.
PRIMARY_CODES = dict(
    M1710="Unilateral primary osteoarthritis, unspecified knee",
    M1711="Unilateral primary osteoarthritis, right knee",
    M1712="Unilateral primary osteoarthritis, left knee",
)

In [None]:
# Generate one synthetic patient
def generate_patient(primary_code: str, secondary_codes: list, base_example: dict) -> dict:
    """Ask Gemini for one synthetic discharge summary matching the given diagnoses.

    Args:
        primary_code: A key of ``PRIMARY_CODES``; its description is placed in
            the prompt as the primary diagnosis.
        secondary_codes: ICD codes to list as secondary diagnoses. Each code is
            paired with the ``long_title`` of its first matching row in
            ``icd_df``; codes with no matching row are omitted from the prompt.
        base_example: Dict whose optional ``"text"`` entry is embedded in the
            prompt as an inspirational example (empty string when absent).

    Returns:
        A dict with keys ``"discharge_summary"`` (the stripped model text),
        ``"primary_icd_code"`` and ``"secondary_icd_codes"`` (all input
        secondary codes joined with ``";"``).

    Raises:
        KeyError: If ``primary_code`` is not a key of ``PRIMARY_CODES``.

    Nothing is caught here, so any error raised by
    ``model.generate_content`` or by reading ``response.text`` propagates
    to the caller.
    """
    # Build a code -> title lookup in a single pass over the ICD table.
    # drop_duplicates keeps the first row per code, matching a "first match"
    # lookup.
    matching_rows = icd_df.loc[
        icd_df["icd_code"].isin(secondary_codes), ["icd_code", "long_title"]
    ]
    title_by_code = (
        matching_rows.drop_duplicates("icd_code")
        .set_index("icd_code")["long_title"]
        .to_dict()
    )

    # Pair every code with ITS OWN description. Zipping the full code list
    # against a filtered description list would shift the pairing as soon as
    # one code is missing from the table.
    secondary_block = "\n".join(
        f"- {code}: {title_by_code[code]}"
        for code in secondary_codes
        if code in title_by_code
    )

    example_text = base_example.get("text", "")

    prompt = f"""
You are a medical assistant. Generate a synthetic hospital discharge summary in English.

Please follow these guidelines:
- Do **not** include any real or fictional names of patients, doctors, hospitals, or specific locations.
- Ensure the summary is **fully anonymous**, using placeholders such as "Mr. ___", "Dr. ___", or "the patient".
- The content must be **original**, realistic, and **not copied** or paraphrased directly from the example.
- Ensure medical consistency with the diagnoses provided.
- Add variety in phrasing, structure, and style to avoid duplication across examples.

You may use this example as inspiration (but do not copy it):
---
{example_text}
---

Generate a **new** discharge summary. The patient has the following diagnoses:

Primary ICD code: {primary_code} = {PRIMARY_CODES[primary_code]}
Secondary ICD codes:
{secondary_block}

Return only the generated discharge summary.
"""

    response = model.generate_content(prompt)
    return {
        "discharge_summary": response.text.strip(),
        "primary_icd_code": primary_code,
        "secondary_icd_codes": ";".join(secondary_codes)
    }

In [None]:
# Generate N synthetic patients
synthetic_patients = []
n_patients = 100  # number of synthetic patients
request_delay_s = 10  # pause between requests due to limitation (gemini free version)
output_path = "results/synthetic_dataset_gemini.csv"

# Create the output directory before spending any API calls: a missing
# directory would otherwise only surface in to_csv() after the whole loop
# (requests plus sleeps) has already run, discarding every generated patient.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

primary_choices = list(PRIMARY_CODES.keys())  # loop-invariant

for i in range(n_patients):
    primary = random.choice(primary_choices)
    secondary = random.sample(icd_codes, k=random.randint(1, 3))
    example = random.choice(examples)

    try:
        patient = generate_patient(primary, secondary, example)
        synthetic_patients.append(patient)
    except Exception as e:
        # Best effort: report the failure and continue with the next patient.
        print(f"Error generating patient: {e}")

    # Throttle between consecutive requests; nothing follows the last one.
    if i < n_patients - 1:
        time.sleep(request_delay_s)


# Save to CSV

df = pd.DataFrame(synthetic_patients)
df.to_csv(output_path, index=False, encoding="utf-8")
print(f"Successfully generated {len(df)} synthetic patients.")


Successfully generated 100 synthetic patients.
