In [1]:
import pandas as pd

In [6]:
import numpy as np
import pandas as pd

def generate_dummy_hospital_data(n_rows=500, seed=42):
    """Generate a reproducible synthetic hospital case-level dataset.

    Every row is one case. All values are drawn from a single
    ``np.random.default_rng(seed)`` generator, so the same ``(n_rows, seed)``
    pair always yields the same frame.

    Args:
        n_rows: Number of rows to generate. Coerced with ``int()`` so values
            such as ``500.0`` are accepted.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        pandas.DataFrame with the columns ``Month``, ``Case_No``, ``DOB``
        (birth year only), ``Nationality``, ``Gender``, ``DoctorLicense``,
        ``DoctorName``, ``Doctor Type``, ``Doctor Status``, ``CMI Value``,
        ``Specialty``, ``Insurance/Payer``, ``InsurancePlanName``,
        ``Payer Mix``, ``Case type``, ``LOS``, ``Severity``, ``Surgical Mix``,
        ``Discharge Time``, ``Discharge Before 12PM`` and ``Revenue``.

    Raises:
        ValueError: If ``n_rows`` is negative.

    Notes:
        * ``Surgical Mix`` contains exactly ``round(0.40 * n_rows)`` "Surgery"
          rows, placed at random positions. The mix is fixed *before* CMI and
          revenue are derived so every row follows the same uplift rules.
        * Among rows that have a discharge time, the share flagged
          "Yes" in ``Discharge Before 12PM`` is a value drawn from
          ``discharge_before_noon_target`` (20-25%), up to rounding.
        * ``Doctor Status`` is sampled per row, independently of the doctor,
          so one doctor can appear with both statuses.
        * ``Case_No`` values are sampled with replacement and may repeat.
    """
    n_rows = int(n_rows)  # coerce once; everything below relies on an int
    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    rng = np.random.default_rng(seed)

    # ---- vocabularies and sampling weights -------------------------------
    case_types = ["OP", "IP", "DC"]
    case_type_p = [0.60, 0.30, 0.10]
    surgical_mix_vals = ["Surgery", "Medical"]
    surgery_share = 0.40  # exact global share of "Surgery" rows
    doctor_status_vals = ["Active", "Inactive"]
    doctor_status_p = [0.80, 0.20]
    genders = ["M", "F"]
    genders_p = [0.60, 0.40]
    # (low, high) bounds for the share of timed discharges that occur before noon
    discharge_before_noon_target = (0.20, 0.25)

    nationalities = ["UAE", "Oman", "India", "Indonesia", "Japan", "USA", "Netherlands",
                     "Denmark", "Sri Lanka", "Germany", "Philippines", "Bangladesh",
                     "Pakistan", "Egypt"]
    nat_p = np.array([0.15, 0.05, 0.20, 0.05, 0.03, 0.05, 0.03,
                      0.03, 0.03, 0.04, 0.10, 0.08, 0.08, 0.08])
    nat_p = nat_p / nat_p.sum()

    # (name, license, type, specialty)
    doctors = [
        ("Dr. Shawn Martin", "ABC123", "Consultant", "Emergency"),
        ("Dr. Lily James", "DEF232", "Specialist", "Cardiology"),
        ("Dr. Thomas Muller", "ABC234", "Specialist", "Nephrology"),
        ("Dr. Hana Suzuki", "FGH234", "Consultant", "General Surgery"),
        ("Dr. Omar Khalid", "QWE123", "GP", "Family Medicine"),
        ("Dr. Aisha Rahman", "QWE234", "Specialist", "Pulmonology"),
        ("Dr. Peter Novak", "VBN123", "Consultant", "Orthopaedics"),
        ("Dr. Priya Iyer", "HJK123", "Specialist", "Obstetrics & Gynecology"),
        ("Dr. Ahmed Zayed", "ABC456", "Specialist", "Gastroenterology"),
        ("Dr. Sofia Kim", "FGH123", "GP", "Internal Medicine"),
    ]
    doc_weights = np.array([0.14, 0.10, 0.10, 0.14, 0.06, 0.10, 0.14, 0.10, 0.06, 0.06])
    doc_weights = doc_weights / doc_weights.sum()

    payers = ["PureWin Insurance", "Goodhealth Insurance", "TotalHealth Insurance",
              "Nextcare Insurance", "BlueMedic Charity", "Self-Pay Corporate"]
    payer_p = np.array([0.40, 0.18, 0.15, 0.12, 0.08, 0.07])
    payer_p = payer_p / payer_p.sum()
    payer_plans = {
        "PureWin Insurance": ["Basic", "Advance", "Premium"],
        "Goodhealth Insurance": ["GH-1", "GH-2"],
        "TotalHealth Insurance": ["TH-Standard", "TH-Plus"],
        "Nextcare Insurance": ["NC-Blue", "NC-Gold"],
        "BlueMedic Charity": [np.nan],   # no plan name for these payers
        "Self-Pay Corporate": [np.nan],
    }
    payer_mix_map = {
        "PureWin Insurance": "Insurance", "Goodhealth Insurance": "Insurance",
        "TotalHealth Insurance": "Insurance", "Nextcare Insurance": "Insurance",
        "BlueMedic Charity": "Corporate", "Self-Pay Corporate": "Corporate",
    }

    # ---- demographics ----------------------------------------------------
    months = pd.date_range("2025-01-01", "2025-12-01", freq="MS")
    month_col = rng.choice(months.to_numpy(), size=n_rows)
    case_no = rng.integers(123456000, 123999999, size=n_rows)

    # 70% born 1980-2005, 30% born 1955-1979 (upper bounds are exclusive)
    younger = rng.choice([0, 1], size=n_rows, p=[0.30, 0.70]) == 1
    dob_years = np.where(younger,
                         rng.integers(1980, 2006, size=n_rows),
                         rng.integers(1955, 1980, size=n_rows))

    gender_col = rng.choice(genders, size=n_rows, p=genders_p)
    nationality = rng.choice(nationalities, size=n_rows, p=nat_p)

    # ---- doctor attributes: one weighted draw, then a table lookup --------
    doc_idx = rng.choice(len(doctors), size=n_rows, p=doc_weights)
    doc_table = np.array(doctors)
    doctor_name = doc_table[doc_idx, 0]
    doctor_license = doc_table[doc_idx, 1]
    doctor_type = doc_table[doc_idx, 2]
    specialty = doc_table[doc_idx, 3]

    doctor_status = rng.choice(doctor_status_vals, size=n_rows, p=doctor_status_p)
    case_type = rng.choice(case_types, size=n_rows, p=case_type_p)

    # ---- length of stay: OP = 0, DC = 1, IP = 2..7 days -------------------
    los = np.zeros(n_rows, dtype=int)
    ip_mask = case_type == "IP"
    los[case_type == "DC"] = 1
    ip_count = int(ip_mask.sum())
    if ip_count > 0:
        los[ip_mask] = rng.choice([2, 3, 4, 5, 6, 7], size=ip_count,
                                  p=[0.10, 0.25, 0.30, 0.20, 0.10, 0.05])

    # ---- surgical mix: exact share, random placement ----------------------
    # Fixing the count up front (instead of flipping rows after the frame is
    # built) keeps CMI and revenue consistent with the mix for every row and
    # avoids always adjusting the first rows of the frame.
    n_surgery = int(round(surgery_share * n_rows))
    surgical_mix = rng.permutation(
        np.repeat(np.array(surgical_mix_vals), [n_surgery, n_rows - n_surgery])
    )
    severity = rng.choice([1, 2, 3], size=n_rows, p=[0.35, 0.45, 0.20])

    # ---- case-mix index: clipped noise plus additive uplifts --------------
    base_cmi = np.clip(rng.normal(1.0, 0.1, size=n_rows), 0.75, 1.35)
    base_cmi += (surgical_mix == "Surgery") * 0.07
    base_cmi += (case_type == "IP") * 0.05
    base_cmi += (severity - 2) * 0.04
    cmi_val = np.round(base_cmi, 3)

    # ---- payer, plan and payer mix ----------------------------------------
    payer = rng.choice(payers, size=n_rows, p=payer_p)
    plan = np.array([rng.choice(payer_plans[p]) for p in payer], dtype=object)
    payer_mix = np.array([payer_mix_map[p] for p in payer], dtype=object)

    # ---- discharge time ----------------------------------------------------
    # Probability that a row records a discharge time depends on the case type.
    prob_has_time = np.where(case_type == "IP", 0.90,
                             np.where(case_type == "DC", 0.80, 0.30))
    timed_idx = np.flatnonzero(rng.random(n_rows) < prob_has_time)
    n_timed = timed_idx.size

    # Default every timed row to 12:00-21:59, then move exactly the target
    # share into 06:00-11:59. Sampling the whole 06:00-22:00 window first would
    # leave extra pre-noon rows and overshoot the target.
    minute_of_day = rng.integers(12 * 60, 22 * 60, size=n_timed)
    if n_timed > 0:
        target_share = rng.uniform(*discharge_before_noon_target)
        n_before = int(round(target_share * n_timed))
        early_pos = rng.choice(n_timed, size=n_before, replace=False)
        minute_of_day[early_pos] = rng.integers(6 * 60, 12 * 60, size=n_before)

    # Object arrays so time-of-day values and missing markers can coexist.
    discharge_time = np.array([pd.NaT] * n_rows, dtype=object)
    discharge_before_noon = np.array([np.nan] * n_rows, dtype=object)
    if n_timed > 0:
        stamps = pd.Timestamp("2000-01-01") + pd.to_timedelta(minute_of_day, unit="m")
        discharge_time[timed_idx] = stamps.time  # array of datetime.time
        discharge_before_noon[timed_idx] = np.where(minute_of_day < 12 * 60, "Yes", "No")

    # ---- revenue: base range by case type, then multiplicative factors ----
    base_rev = np.where(case_type == "OP", rng.uniform(100, 1500, n_rows),
               np.where(case_type == "DC", rng.uniform(1500, 6000, n_rows),
                        rng.uniform(4000, 80000, n_rows)))
    base_rev *= np.where(doctor_type == "Consultant", 1.35,
                         np.where(doctor_type == "Specialist", 1.10, 0.90))
    base_rev *= np.where(surgical_mix == "Surgery", 1.30, 1.00)
    base_rev *= np.where(payer_mix == "Corporate", 0.85, 1.00)
    base_rev *= np.where(payer == "PureWin Insurance", 1.08, 1.00)
    base_rev *= np.where(severity == 3, 1.15, np.where(severity == 1, 0.93, 1.00))
    base_rev *= rng.normal(1.0, 0.12, n_rows)
    revenue = np.maximum(0, base_rev).round(0)  # noise can push values below zero
    # Rows with an inactive doctor get a 5-15% revenue reduction.
    revenue *= np.where(doctor_status == "Inactive", rng.uniform(0.85, 0.95, n_rows), 1.0)
    revenue = revenue.round(0)

    # Numeric columns are emitted as float, matching the original schema.
    df = pd.DataFrame({
        "Month": pd.to_datetime(month_col).date,
        "Case_No": case_no.astype(float),
        "DOB": dob_years.astype(float),
        "Nationality": nationality,
        "Gender": gender_col,
        "DoctorLicense": doctor_license,
        "DoctorName": doctor_name,
        "Doctor Type": doctor_type,
        "Doctor Status": doctor_status,
        "CMI Value": cmi_val,
        "Specialty": specialty,
        "Insurance/Payer": payer,
        "InsurancePlanName": plan,
        "Payer Mix": payer_mix,
        "Case type": case_type,
        "LOS": los.astype(float),
        "Severity": severity.astype(float),
        "Surgical Mix": surgical_mix,
        "Discharge Time": list(discharge_time),
        "Discharge Before 12PM": discharge_before_noon,
        "Revenue": revenue.astype(float),
    })

    return df

In [16]:
# Destination of the generated CSV, relative to the working directory.
out_path = "dummy_hospital_generated.csv"

# Build the 500-row synthetic dataset with a fixed seed and persist it
# without the index column.
df = generate_dummy_hospital_data(n_rows=500, seed=42)
df.to_csv(path_or_buf=out_path, index=False)

print(f"saved: {out_path} | rows: {len(df)}")

saved: dummy_hospital_generated.csv | rows: 500
