# ALF Risk Prediction: EDA & Modeling

This notebook explores the synthetic ALF resident data, engineers features from it, and trains a random forest classifier to predict next-day health incidents (the `incident_next_day` column).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

# Read the synthetic dataset and convert the `date` column to datetimes
# in a single chained expression, then preview the first rows.
df = pd.read_csv('../synthetic_alf_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
df.head()


## Basic Data Exploration

In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# A notebook cell only renders its LAST expression automatically, so the
# summary statistics have to be displayed explicitly -- as a bare mid-cell
# expression the describe() table was computed and then silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.describe())

# Class balance of the prediction target, as proportions (last expression,
# so it is rendered as the cell output).
df['incident_next_day'].value_counts(normalize=True)

## Feature Engineering

In [None]:
# Recreate vital sign deltas: change versus the same patient's previous row.
# Rows are ordered by patient and date first so that diff() compares each
# record with the chronologically preceding one for that patient.
df = df.sort_values(by=['patient_id', 'date'])
for diff_col, vital_col in {
    'heart_rate_diff': 'heart_rate',
    'bp_sys_diff': 'blood_pressure_sys',
    'bp_dia_diff': 'blood_pressure_dia',
}.items():
    # diff() is NaN on each patient's first row (no previous reading) and
    # wherever a reading is missing; those deltas are filled with 0.
    df[diff_col] = df.groupby('patient_id')[vital_col].diff().fillna(0)

# Bucket age by decade. pd.cut defaults to right-closed intervals, which
# would label age 70 as '60s' (and 80 as '70s', 90 as '80s') and leave age 60
# outside every bin. right=False makes the bins [60, 70), [70, 80), [80, 90),
# [90, 101); the upper edge of 101 keeps age 100 in the top bucket exactly as
# before. Ages outside 60-100 still map to NaN (no dummy column set).
df['age_group'] = pd.cut(
    df['age'],
    bins=[60, 70, 80, 90, 101],
    right=False,
    labels=['60s', '70s', '80s', '90s'],
)

# One-hot encode categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
df = pd.get_dummies(
    df,
    columns=['gender', 'diagnosis', 'age_group', 'facility_id'],
    drop_first=True,
)


## Train/Test Split

In [None]:
# Split on patient ids rather than on rows: 20% of the patients are held out,
# and every row of a given patient lands on the same side of the split.
unique_patients = df['patient_id'].unique()
train_p, test_p = train_test_split(unique_patients, test_size=0.2, random_state=42)

in_train = df['patient_id'].isin(train_p)
in_test = df['patient_id'].isin(test_p)
train_df = df[in_train]
test_df = df[in_test]

# Identifier, timestamp and the target itself are excluded from the features.
non_feature_cols = ['patient_id', 'date', 'incident_next_day']
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['incident_next_day']
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['incident_next_day']

# Fit the scaler on the training rows only, then apply it to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modeling

In [None]:
# Fit a 100-tree random forest on the scaled training features.
# class_weight='balanced' asks scikit-learn to reweight the classes;
# random_state pins the seed so reruns give the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is used as the score for ROC AUC
# (assumes the target has exactly two classes -- TODO confirm);
# predict() supplies the hard labels for the classification report.
test_proba = model.predict_proba(X_test_scaled)
y_proba = test_proba[:, 1]
y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
auc = roc_auc_score(y_test, y_proba)
print("ROC AUC Score:", auc)

## Feature Importance

In [None]:
# Pair every importance score with its feature name and rank them, highest
# first. Assumes the model was fit on columns in X_train's order.
importances = model.feature_importances_
features = X_train.columns
fi_df = pd.DataFrame({'feature': features, 'importance': importances})
fi_df = fi_df.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
top_features = fi_df.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title("Top 10 Feature Importances")
fig.tight_layout()
plt.show()