In [8]:
from pathlib import Path

import numpy as np
import pandas as pd

# Simulate a longitudinal cognitive-testing cohort and save it as a CSV.
#
# Each participant gets baseline demographics, a person-specific linear rate
# of change per cognitive domain, and a baseline score per domain. Scores at
# each scheduled visit are baseline + slope * month + Gaussian noise. Visits
# are missed at random, with a miss probability that grows with time.
#
# NOTE: the order of the `rng` draws below determines the generated data for
# the fixed seed; reordering them changes the output file.
rng = np.random.default_rng(42)  # fixed seed -> reproducible dataset

# -----------------------
# Study design
# -----------------------
n_people = 250
visit_months = np.arange(0, 48, 6)   # 0–42 months, 6-monthly visits

rows = []

for i in range(n_people):
    pid = f"P{i:04d}"

    # -----------------------
    # Baseline demographics
    # -----------------------
    age = rng.normal(72, 6)
    # Truncated to whole years and limited to the range 8–20.
    education_years = int(np.clip(rng.normal(13, 2.5), 8, 20))
    sex = rng.choice(["M", "F"])

    # -----------------------
    # True cognitive decline rates
    # (points per month)
    # -----------------------
    memory_slope = rng.normal(-0.10, 0.07)
    attention_slope = rng.normal(-0.07, 0.05)
    language_slope = rng.normal(-0.05, 0.04)

    # Slight modifiers (older → faster decline, more education → slower).
    # The same shift is applied to all three domains, so compute it once.
    # The unrounded age is used here; only the stored column is rounded.
    age_effect = (age - 72) * -0.002
    edu_effect = (education_years - 13) * 0.003
    slope_shift = age_effect + edu_effect

    memory_slope += slope_shift
    attention_slope += slope_shift
    language_slope += slope_shift

    # -----------------------
    # Baseline test scores
    # -----------------------
    # Scores are not clipped to any instrument range.
    memory_base = rng.normal(28, 1.5)       # MMSE-like
    attention_base = rng.normal(90, 8)
    language_base = rng.normal(45, 6)

    # -----------------------
    # Longitudinal visits
    # -----------------------
    for m in visit_months:
        # Each visit is missed independently, with a probability that
        # increases over time (0.05 at month 0, +0.002 per month). A missed
        # visit only skips this row: the participant can still appear at
        # later visits, i.e. this is intermittent missingness rather than
        # permanent dropout.
        dropout_prob = 0.05 + 0.002 * m
        if rng.random() < dropout_prob:
            continue

        # Linear trajectory plus independent measurement noise per domain.
        memory = memory_base + memory_slope * m + rng.normal(0, 0.8)
        attention = attention_base + attention_slope * m + rng.normal(0, 3.0)
        language = language_base + language_slope * m + rng.normal(0, 2.0)

        rows.append({
            "participant_id": pid,
            "visit_month": m,
            "age_baseline": round(age, 1),
            "education_years": education_years,
            "sex": sex,
            "memory": memory,
            "attention": attention,
            "language": language,
        })

# One row per attended visit, ordered by participant and then time.
df = (
    pd.DataFrame(rows)
    .sort_values(["participant_id", "visit_month"])
    .reset_index(drop=True)
)

# -----------------------
# Save
# -----------------------
out_path = "../data/simulated/longitudinal_simulated.csv"
# to_csv does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

# Notebook display: preview of the first rows and the (rows, columns) shape.
df.head(), df.shape


(  participant_id  visit_month  age_baseline  education_years sex     memory  \
 0          P0000            0          73.8               10   M  28.895279   
 1          P0000            6          73.8               10   M  28.284866   
 2          P0000           18          73.8               10   M  27.309113   
 3          P0000           24          73.8               10   M  26.944527   
 4          P0000           30          73.8               10   M  27.079604   
 
    attention   language  
 0  89.803435  45.031254  
 1  83.810932  44.948232  
 2  83.671722  41.471946  
 3  81.860075  41.441075  
 4  83.302006  42.318522  ,
 (1800, 8))