## Objective
This notebook transforms the cleaned clinical dataset into a model-ready
feature set while preserving clinically meaningful missingness.

Key goals:
- Create explicit missingness indicators
- Engineer glucose-aware and glucose-agnostic features
- Compare imputation strategies
- Produce reproducible feature matrices for modeling

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Location of the cleaned baseline table, relative to this notebook.
DATA_PATH = "../data/processed/clean_baseline.csv"

# Read the whole CSV into memory; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Notebook display: (rows, columns) of the loaded frame.
df.shape

In [None]:
# Columns that are numeric in dtype but must not be treated as continuous
# features: row identifiers (none configured yet) and 0/1 flags.
identifier_cols = []  # add IDs if present
binary_flags = ["has_glucose_measurement"]

# Build the exclusion set once instead of concatenating lists per column.
_excluded_cols = set(binary_flags) | set(identifier_cols)

# Every numeric-dtype column of df, in frame order, minus the exclusions.
_numeric_frame = df.select_dtypes(include=np.number)
numeric_cols = [
    name for name in _numeric_frame.columns if name not in _excluded_cols
]

# Notebook display: number of numeric feature columns.
len(numeric_cols)

In [None]:
# Names of the 0/1 indicator columns added to df by this cell.
missingness_cols = []

for feature in numeric_cols:
    is_missing = df[feature].isna()

    # Only features with more than 5% missing rows get an indicator.
    if not (is_missing.mean() > 0.05):
        continue

    # Indicator is 1 where the original value is NaN, 0 otherwise.
    flag_name = f"{feature}_missing"
    df[flag_name] = is_missing.astype(int)
    missingness_cols.append(flag_name)

# Notebook display: number of indicators created.
len(missingness_cols)

In [None]:
# Numeric columns treated as glucose measurements.
#
# Matching is a case-insensitive substring test on "glu". A separate test for
# "glucose" is unnecessary: any name containing "glucose" also contains "glu".
# Lower-casing the name first means columns such as "Glucose" or "GLU_mean"
# are picked up as well.
#
# NOTE(review): "glu" also matches any other column whose name happens to
# contain those letters (e.g. a glutamine measurement) -- confirm against the
# dataset's actual column names.
glucose_cols = [col for col in numeric_cols if "glu" in col.lower()]

# Notebook display: the selected glucose columns.
glucose_cols

In [None]:
# Add a log(1 + x) version of every glucose column as "<name>_log".
# NaN inputs stay NaN in the transformed column.
for glucose_name in glucose_cols:
    raw_values = df[glucose_name]
    df[f"{glucose_name}_log"] = np.log1p(raw_values)

In [None]:
# Raw numeric columns, missingness indicators and binary flags.
_base_features = numeric_cols + missingness_cols + binary_flags

# Log-transformed glucose columns ("<col>_log") that exist in df. Without
# this step the engineered log features would never reach any feature matrix.
# Names already present in the base list are skipped so no column is selected
# twice.
glucose_log_cols = [
    log_name
    for log_name in (f"{col}_log" for col in glucose_cols)
    if log_name in df.columns and log_name not in _base_features
]

# Full (glucose-aware) feature set. The log features are appended last so the
# positions of the base features are unchanged.
features_full = _base_features + glucose_log_cols

In [None]:
# Glucose-agnostic feature set: keep a feature only if no glucose column name
# occurs anywhere inside its name. This removes the glucose columns themselves
# and anything derived from them by name (e.g. "<glucose col>_missing").
# A binary flag is dropped too if its name contains a glucose column name.
features_no_glucose = [
    name
    for name in features_full
    if all(glucose_name not in name for glucose_name in glucose_cols)
]

In [None]:
# Numeric columns with the glucose measurements removed (exact-name match).
_non_glucose_numeric = [
    name for name in numeric_cols if name not in glucose_cols
]

# Feature set without glucose values but with every missingness indicator
# and binary flag retained.
features_missingness_only = [
    *_non_glucose_numeric,
    *missingness_cols,
    *binary_flags,
]

In [None]:
imputer = SimpleImputer(strategy="median")


def _median_impute(columns):
    """Return ``df[columns]`` as an array with NaNs replaced by column medians.

    The shared ``imputer`` is refit on every call, so after the last call it
    holds the medians of the last feature set passed in.

    Raises:
        ValueError: if any requested column has no observed values. With the
            median strategy SimpleImputer discards such columns by default,
            which would leave the matrix with fewer columns than the feature
            list saved alongside it.
    """
    frame = df[columns]
    all_missing = frame.columns[frame.isna().all(axis=0)].tolist()
    if all_missing:
        raise ValueError(
            "Cannot median-impute columns with no observed values: "
            f"{all_missing}"
        )
    return imputer.fit_transform(frame)


# NOTE(review): the medians are computed over every row of df. If a
# train/test split is made downstream, held-out rows contribute to the
# imputation statistics -- confirm this is acceptable.
X_full = _median_impute(features_full)
X_no_glucose = _median_impute(features_no_glucose)
X_missingness = _median_impute(features_missingness_only)

In [None]:
# One scaler object, refit on each matrix in turn; after this cell it holds
# the statistics of the last matrix (X_missingness).
scaler = StandardScaler()

# Standardise every column of each imputed matrix, in the same order as the
# matrices were built.
X_full_scaled, X_no_glucose_scaled, X_missingness_scaled = (
    scaler.fit_transform(matrix)
    for matrix in (X_full, X_no_glucose, X_missingness)
)

In [None]:
# Scaled feature matrices, written as .npy files.
for _matrix_path, _matrix in (
    ("../data/processed/X_full.npy", X_full_scaled),
    ("../data/processed/X_no_glucose.npy", X_no_glucose_scaled),
    ("../data/processed/X_missingness.npy", X_missingness_scaled),
):
    np.save(_matrix_path, _matrix)

# Matching feature-name lists, one name per row, so matrix columns can be
# mapped back to feature names.
for _names_path, _names in (
    ("../data/processed/features_full.csv", features_full),
    ("../data/processed/features_no_glucose.csv", features_no_glucose),
    ("../data/processed/features_missingness.csv", features_missingness_only),
):
    pd.Series(_names).to_csv(_names_path, index=False)