# In [None]:
# --- 1. Imports ---
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import os

# --- 2. Load cleaned data ---
# Read the cleaned NHANES table from its Feather file.
df = pd.read_feather("data/processed/cleaned_nhanes.feather")

# --- 3. Define features and targets ---
# The three target columns are predicted jointly; every remaining column
# is treated as an input feature.
target_cols = ['vitamin_d', 'hdl_cholesterol', 'a1c']
y = df[target_cols]
X = df.drop(labels=target_cols, axis=1)

# --- 4. Train/test split ---
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 5. Feature type separation ---
# Select every numeric dtype, not only the 64-bit ones: a column that lands
# in neither list is silently dropped by ColumnTransformer's default
# remainder='drop', so int32/float32/uint8 columns would otherwise vanish
# from the model without warning. Timedeltas are excluded explicitly because
# NumPy's timedelta64 subclasses its integer type and could otherwise be
# matched as "number".
num_features = X.select_dtypes(include='number', exclude='timedelta').columns.tolist()
# Text-like columns that will be one-hot encoded.
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# --- 6. ColumnTransformer pipeline ---
# Numeric columns: fill missing values with the column mean, then
# standardise to zero mean / unit variance.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps prediction from raising
# on categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features),
])

# --- 7. Full modeling pipeline ---
# Preprocessing and the regressor live in a single Pipeline so that the
# imputers, scaler and encoder are fitted on the training data only.
# MultiOutputRegressor wraps the forest to handle the multiple targets.
model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    (
        'regressor',
        MultiOutputRegressor(estimator=RandomForestRegressor(random_state=42)),
    ),
])

# --- 8. Fit model ---
model.fit(X_train, y_train)

# --- 9. Evaluate ---
# y_pred is indexed as [row, target]; target i corresponds to target_cols[i]
# because y_train's columns were selected in that order.
y_pred = model.predict(X_test)

for i, col in enumerate(target_cols):
    y_true_col = y_test[col]
    y_pred_col = y_pred[:, i]
    # RMSE is computed as sqrt(MSE) rather than via
    # mean_squared_error(..., squared=False): that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
    rmse = np.sqrt(mean_squared_error(y_true_col, y_pred_col))
    print(f"--- {col.upper()} ---")
    print(f"R²:  {r2_score(y_true_col, y_pred_col):.3f}")
    print(f"RMSE: {rmse:.3f}\n")

# --- 10. Save model ---
# Persist the whole fitted pipeline (preprocessing + regressor) as one
# artifact so the same transformations are applied at prediction time.
model_path = "models/baseline_multioutput_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)
print(f"Model saved to '{model_path}'")
