# Data Generation for ALF Risk Prediction

This notebook generates synthetic daily health data for residents in assisted living facilities (ALFs).

In [None]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Seed BOTH random number generators used below. Per-patient attributes
# (facility, gender, diagnosis) are drawn with the stdlib `random` module,
# while age and the daily measurements are drawn with NumPy. Seeding only
# NumPy would leave the stdlib-drawn columns different on every run.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Parameters
n_patients = 200
n_days = 30
facilities = ['F001', 'F002', 'F003']
diagnoses = ['diabetes', 'dementia', 'hypertension', 'COPD', 'arthritis']
genders = ['Male', 'Female']
start_date = datetime(2025, 6, 1)
output_path = Path('../data/synthetic_alf_data.csv')

# The calendar is identical for every patient, so format the date strings
# once instead of once per (patient, day) row.
date_labels = [
    (start_date + timedelta(days=day_offset)).strftime('%Y-%m-%d')
    for day_offset in range(n_days)
]

# Generate synthetic data: one row per patient per day.
data = []
for pid in range(1, n_patients + 1):
    # Static attributes: drawn once and repeated on each of the patient's rows.
    patient_id = f"P{pid:04d}"
    facility_id = random.choice(facilities)
    age = np.random.randint(65, 95)  # upper bound is exclusive -> ages 65..94
    gender = random.choice(genders)
    diagnosis = random.choice(diagnoses)

    for date_label in date_labels:
        # Daily measurements: independent normal draws (mean, std dev).
        # NOTE(review): values are not clipped to plausible ranges and the
        # temperature scale looks like Celsius -- confirm with consumers.
        heart_rate = np.random.normal(75, 10)
        bp_sys = np.random.normal(130, 15)
        bp_dia = np.random.normal(80, 10)
        temperature = np.random.normal(36.8, 0.5)

        # Beta(5, 2) already lies in [0, 1]; the clip is purely defensive.
        med_adherence = np.clip(np.random.beta(5, 2), 0, 1)

        # Incident probability is 5% when adherence is above 0.8, else 15%.
        # Exactly one uniform draw is consumed per row in either case.
        incident_prob = 0.05 if med_adherence > 0.8 else 0.15
        incident_next_day = int(np.random.rand() < incident_prob)

        data.append([
            patient_id, facility_id, date_label, age, gender, diagnosis,
            heart_rate, bp_sys, bp_dia, temperature, med_adherence,
            incident_next_day,
        ])

# Create DataFrame
df = pd.DataFrame(data, columns=[
    'patient_id', 'facility_id', 'date', 'age', 'gender', 'diagnosis',
    'heart_rate', 'blood_pressure_sys', 'blood_pressure_dia', 'temperature',
    'med_adherence', 'incident_next_day'
])

# Save dataset. Create the target directory first so a missing `../data`
# folder does not fail the run after all the data has been generated.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
df.head()

✅ Dataset has been generated and saved to `../data/synthetic_alf_data.csv`.