## Objective
This notebook trains and evaluates predictive models on three feature sets
(full with glucose, no glucose, and missingness-aware) to quantify how much
blood glucose measurements and missingness-aware features contribute to
predictive performance.

Key goals:
- Train baseline and comparative models
- Evaluate performance across feature configurations
- Assess whether missingness itself is predictive
- Establish modeling best practices for clinical datasets

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score
)

In [None]:
# Load the cleaned baseline table and pull out the label vector.
DATA_PATH = "../data/processed/clean_baseline.csv"
df = pd.read_csv(DATA_PATH)

TARGET_COLUMN = "target"  # <-- CHANGE THIS
# The name above is a placeholder: fail with an actionable message (file and
# available columns) if it was never updated to match the dataset.
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in {DATA_PATH}; "
        f"available columns: {list(df.columns)}"
    )

# to_numpy() is the pandas-recommended accessor and always returns an ndarray,
# which the positional fold indexing later in the notebook relies on.
y = df[TARGET_COLUMN].to_numpy()

In [None]:
# Load the three pre-built feature matrices in one pass; the unpacking order
# matches the order of the paths below.
X_full, X_no_glucose, X_missingness = (
    np.load(feature_path)
    for feature_path in (
        "../data/processed/X_full.npy",
        "../data/processed/X_no_glucose.npy",
        "../data/processed/X_missingness.npy",
    )
)

In [None]:
# Split every feature matrix and the labels in ONE call. train_test_split
# applies the same train/test row partition to all arrays it receives, so the
# three feature sets and y_train / y_test are aligned by construction rather
# than by repeating identical seed/stratify arguments in separate calls.
# It also raises if the arrays do not share the same number of rows.
(
    Xf_train, Xf_test,
    Xng_train, Xng_test,
    Xm_train, Xm_test,
    y_train, y_test,
) = train_test_split(
    X_full, X_no_glucose, X_missingness, y,
    test_size=0.2,      # hold out 20% of the rows for testing
    stratify=y,         # keep the class balance equal in both partitions
    random_state=42,    # fixed seed for a reproducible split
)

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place. ROC-AUC is computed from the second
    column of ``predict_proba``; accuracy, precision and recall are computed
    from the hard labels returned by ``predict``.

    Args:
        model: Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        dict with the keys ``"roc_auc"``, ``"accuracy"``, ``"precision"``
        and ``"recall"``.
    """
    model.fit(X_train, y_train)

    hard_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the ranking score for ROC-AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    metrics = {"roc_auc": roc_auc_score(y_test, positive_scores)}
    label_based_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in label_based_scorers:
        metrics[metric_name] = scorer(y_test, hard_labels)
    return metrics

In [None]:
# Logistic regression evaluated on each feature configuration; max_iter is
# raised to 1000 iterations.
lr = LogisticRegression(max_iter=1000)

# One row of metrics per feature set, evaluated in the order listed. The same
# estimator object is refitted for every configuration.
results_lr = pd.DataFrame.from_dict(
    {
        label: evaluate_model(lr, train_features, test_features, y_train, y_test)
        for label, train_features, test_features in (
            ("Full (with glucose)", Xf_train, Xf_test),
            ("No glucose", Xng_train, Xng_test),
            ("Missingness-aware", Xm_train, Xm_test),
        )
    },
    orient="index",
)

results_lr

In [None]:
# Random forest evaluated on each feature configuration.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=5,
    random_state=42,
)

# One row of metrics per feature set, evaluated in the order listed. The same
# estimator object is refitted for every configuration.
results_rf = pd.DataFrame.from_dict(
    {
        label: evaluate_model(rf, train_features, test_features, y_train, y_test)
        for label, train_features, test_features in (
            ("Full (with glucose)", Xf_train, Xf_test),
            ("No glucose", Xng_train, Xng_test),
            ("Missingness-aware", Xm_train, Xm_test),
        )
    },
    orient="index",
)

results_rf

In [None]:
def cross_val_auc(model, X, y, n_splits=5):
    """Estimate ROC-AUC with shuffled, stratified k-fold cross-validation.

    A fresh, unfitted copy of ``model`` is trained on every fold, so the
    estimator passed in is never mutated and no fitted state can carry over
    from one fold to the next.

    Args:
        model: scikit-learn compatible classifier exposing ``fit`` and
            ``predict_proba`` (it must be cloneable with
            ``sklearn.base.clone``).
        X: Feature matrix with one row per sample. pandas objects are
            converted to NumPy arrays before indexing.
        y: Labels, one per row of ``X``.
        n_splits: Number of folds.

    Returns:
        Tuple ``(mean, std)`` of the per-fold ROC-AUC scores, where ``std``
        is ``np.std`` with its default settings.
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    from sklearn.base import clone

    # Folds are selected with positional integer arrays. On pandas objects
    # ``obj[idx]`` is a label/column lookup, so convert those to plain arrays.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []

    for train_idx, test_idx in skf.split(X, y):
        # Clone per fold: same hyper-parameters, no fitted state.
        fold_model = clone(model)
        fold_model.fit(X[train_idx], y[train_idx])
        # Column 1 of predict_proba is used as the ranking score for ROC-AUC.
        y_prob = fold_model.predict_proba(X[test_idx])[:, 1]
        scores.append(roc_auc_score(y[test_idx], y_prob))

    return np.mean(scores), np.std(scores)

In [None]:
# Cross-validated ROC-AUC of the random forest on each feature configuration.
# Every value is the (mean, std) pair returned by cross_val_auc.
cv_results = {
    label: cross_val_auc(rf, features, y)
    for label, features in (
        ("Full (with glucose)", X_full),
        ("No glucose", X_no_glucose),
        ("Missingness-aware", X_missingness),
    )
}

# Columns are the feature sets; the two rows are the mean and the std.
pd.DataFrame(cv_results, index=["Mean ROC-AUC", "Std"])