# Import Library

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from scipy.stats import pearsonr
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Function Library

# Data Loading

In [None]:
X = np.load('landmark_features.npy')
y = pd.read_csv('../data/BMI/cleaned_data.csv')['bmi'].values

In [32]:
df = pd.read_csv('landmark_features.csv')
X = df.drop(columns=['bmi', 'name']).values  # 'name' only if you included it
y = df['bmi'].values

In [33]:
print(X.shape)
print(y.shape)

(3961, 136)
(3961,)


# Modeling

## Split data

In [34]:
scaler = StandardScaler()

In [35]:
X_scaled = scaler.fit_transform(X)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, 
    y, 
    test_size=0.2, 
    random_state=42
)

## Random Forest

In [37]:
rf = RandomForestRegressor(n_estimators= 100, random_state= 42)
rf.fit(X_train, y_train)

In [38]:
y_pred = rf.predict(X_test)
r, _ = pearsonr(y_test, y_pred)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))
print("Pearson r: ", r)

MAE: 6.162648654575444
R² Score: 0.055669050402192166
Pearson r:  0.27394943754022927


# SVR

In [18]:
from sklearn.svm import SVR
svr = SVR(C=1.0, epsilon=0.2)
svr.fit(X_train, y_train)

In [39]:
y_pred = rf.predict(X_test)
r, _ = pearsonr(y_test, y_pred)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))
print("Pearson r: ", r)

MAE: 6.162648654575444
R² Score: 0.055669050402192166
Pearson r:  0.27394943754022927


# XG Boost

In [22]:
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

In [40]:
y_pred = rf.predict(X_test)
r, _ = pearsonr(y_test, y_pred)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))
print("Pearson r: ", r)

MAE: 6.162648654575444
R² Score: 0.055669050402192166
Pearson r:  0.27394943754022927


## Neural Network

In [42]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y.reshape(-1, 1)).ravel()

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, 
    y_scaled, 
    test_size=0.2, 
    random_state=42
)

In [53]:
# Train MLP
mlp = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=42)
mlp.fit(X_train, y_train)

In [54]:
y_pred_scaled = mlp.predict(X_test)
y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
y_true = scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()

In [55]:
r, _ = pearsonr(y_true, y_pred)
print("MAE:", mean_absolute_error(y_true, y_pred))
print("R² Score:", r2_score(y_true, y_pred))
print("Pearson r:", r)

MAE: 5.679331342972026
R² Score: 0.17797493782001494
Pearson r: 0.43047546741502174


In [60]:
# Define parameter grid
param_grid = {
    'hidden_layer_sizes': [(64,), (64, 64), (128,), (128, 64)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],  # L2 penalty
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [500]
}

# Set up GridSearchCV
mlp = MLPRegressor(random_state=42)
grid = GridSearchCV(mlp, param_grid, scoring='r2', cv=3, n_jobs=-1, verbose=1)

# Run grid search
grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [61]:
# Best model
best_mlp = grid.best_estimator_
print("Best MLP:", grid.best_params_)

Best MLP: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.001, 'max_iter': 500}


In [62]:
y_pred_scaled = best_mlp.predict(X_test)
y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
y_true = scaler.inverse_transform(y_test.reshape(-1, 1)).ravel()

In [63]:
r, _ = pearsonr(y_true, y_pred)
print("MAE:", mean_absolute_error(y_true, y_pred))
print("R² Score:", r2_score(y_true, y_pred))
print("Pearson r:", r)


MAE: 5.510246921180058
R² Score: 0.229096274194604
Pearson r: 0.4816657414332032


In [64]:
import joblib

# Save model to disk
joblib.dump(best_mlp, "mlp_landmark_model.pkl")

['mlp_landmark_model.pkl']

In [65]:
joblib.dump(scaler, "mlp_scaler.pkl")

['mlp_scaler.pkl']