In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Simulate a longitudinal cognitive dataset and write it to CSV.
#
# Each participant gets fixed baseline covariates plus person-specific linear
# trajectories for three scores (memory, attention, language), observed with
# Gaussian noise at 6-month visits over 3 years. Individual visits are skipped
# at random. The result is one row per (participant, recorded visit).
#
# NOTE: for a given seed, the generated values depend on the order of the
# rng draws below; reordering them changes the dataset.
rng = np.random.default_rng(42)

n_people = 200
visit_months = np.array([0, 6, 12, 18, 24, 30, 36])  # 3 years, 6-month visits
missing_visit_prob = 0.08  # chance that any single visit is not recorded

# Explicit column order, so the frame has its columns (and can be sorted)
# even if no rows were generated.
output_columns = [
    "participant_id",
    "visit_month",
    "age_baseline",
    "education_years",
    "sex",
    "memory",
    "attention",
    "language",
]

rows = []

for i in range(n_people):
    pid = f"P{i:04d}"

    # baseline features (fixed per person)
    age = rng.normal(72, 6)                       # around 72
    # clipped to [8, 20]; int() truncates toward zero rather than rounding
    education_years = int(np.clip(rng.normal(13, 2.5), 8, 20))
    sex = rng.choice(["M", "F"])

    # underlying rates of change (person-specific), in points per month;
    # the means are negative (decline), but an individual draw can be positive
    memory_slope = rng.normal(-0.12, 0.08)
    attention_slope = rng.normal(-0.08, 0.06)
    language_slope = rng.normal(-0.06, 0.05)

    # baseline score levels (person-specific)
    memory_base = rng.normal(28, 1.5)
    attention_base = rng.normal(90, 8)
    language_base = rng.normal(45, 6)

    for m in visit_months:
        # Intermittent missingness: each visit is skipped independently, and
        # later visits can still be recorded (this is not permanent dropout).
        if rng.random() < missing_visit_prob:
            continue

        # noisy observations around a true linear trajectory
        memory = memory_base + memory_slope * m + rng.normal(0, 0.8)
        attention = attention_base + attention_slope * m + rng.normal(0, 3.0)
        language = language_base + language_slope * m + rng.normal(0, 2.0)

        rows.append({
            "participant_id": pid,
            "visit_month": m,
            "age_baseline": age,
            "education_years": education_years,
            "sex": sex,
            "memory": memory,
            "attention": attention,
            "language": language,
        })

df = pd.DataFrame(rows, columns=output_columns).sort_values(
    ["participant_id", "visit_month"]
)

# Save to your simulated folder
out_path = "../data/simulated/longitudinal_simulated.csv"
# to_csv does not create missing directories, so make sure the target exists
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

df.head(), df.shape
