# Face-to-BMI
***
## Machine Learning II

By Amulya Jayanti | Halleluya Mengesha | Hira Stanley | Sami Naeem | Vaishnavi Kokadwar
  
*May, 2025*
***

# PART 2 - Models

## c) VGGNet - Third Best Model

### i) Finetuning: CNN + Regressor

In [1]:
import os
import warnings
import absl.logging
from PIL import Image
import numpy as np
import pandas as pd
import tensorflow as tf

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
warnings.filterwarnings("ignore")
absl.logging.set_verbosity(absl.logging.ERROR)

print("Current Working Directory:", os.getcwd())



Current Working Directory: /Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project


In [None]:
# Correct path based on nested folder
base_dir = '/Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project/BMI/Data'
csv_path = os.path.join(base_dir, 'data.csv')

# Load CSV
df = pd.read_csv(csv_path)

# Create full image paths
df['image_path'] = df['name'].apply(lambda x: os.path.join(base_dir, x))

# Clean
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
# Correct image folder path
image_dir = '/Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project/BMI/Data/Images'

# Update the full image paths
df['image_path'] = df['name'].apply(lambda x: os.path.join(image_dir, x.strip()))

# Check if the images now exist
df['image_exists'] = df['image_path'].apply(os.path.exists)

# Keep only available images
df_valid = df[df['image_exists']].reset_index(drop=True)

In [None]:
from tensorflow.keras.applications import VGG19
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import preprocess_input  # works for VGG19 too
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
import numpy as np

# --- Split train/test using is_training column ---
train_df = df_valid[df_valid['is_training'] == 1]
test_df  = df_valid[df_valid['is_training'] == 0]

# --- Data Augmentation (Train) ---
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=10,
    width_shift_range=0.05,
    height_shift_range=0.05,
    zoom_range=0.1,
    horizontal_flip=True
)

# --- No Augmentation (Test) ---
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

train_gen = train_datagen.flow_from_dataframe(
    train_df, x_col='image_path', y_col='bmi',
    target_size=(224, 224), class_mode='raw', batch_size=32
)

test_gen = test_datagen.flow_from_dataframe(
    test_df, x_col='image_path', y_col='bmi',
    target_size=(224, 224), class_mode='raw', batch_size=32,
    shuffle=False
)

# --- VGG19 base + custom regression head ---
base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Unfreeze top 8 conv layers
for layer in base_model.layers[:-8]:
    layer.trainable = False

x = Flatten()(base_model.output)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='linear')(x)

model = Model(inputs=base_model.input, outputs=output)
model.compile(optimizer=Adam(1e-4), loss='mean_absolute_error')

# --- Callbacks ---
early_stop = EarlyStopping(patience=4, restore_best_weights=True)
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2,
                                min_lr=1e-6, verbose=1)

# --- Train the model ---
model.fit(train_gen, validation_data=test_gen,
          epochs=15,
          callbacks=[early_stop, lr_schedule])

# --- Evaluate on test set ---
y_true = test_df['bmi'].values
y_pred = model.predict(test_gen).flatten()
genders = test_df['gender'].values

mae = mean_absolute_error(y_true, y_pred)
r_all, _ = pearsonr(y_true, y_pred)
r_male, _ = pearsonr(y_true[genders == 'Male'], y_pred[genders == 'Male'])
r_female, _ = pearsonr(y_true[genders == 'Female'], y_pred[genders == 'Female'])

print("\n✅ VGG19 Fine-Tuned (Top 8 Layers + Augmentation + LR Scheduler):")
print(f"📈 Pearson r (Overall): {r_all:.3f}")
print(f"👦 Pearson r (Male):    {r_male:.3f}")
print(f"👧 Pearson r (Female):  {r_female:.3f}")
print(f"📏 Mean Absolute Error: {mae:.2f}")

# --- Save the model ---
model.save('/Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project/vgg19_ftd_aug_lrs_bmi_model.keras', save_format="keras")


Found 3210 validated image filenames.
Found 752 validated image filenames.


2025-05-24 19:28:21.534327: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-05-24 19:28:21.534492: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-24 19:28:21.534500: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1748132901.534994  909474 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1748132901.535056  909474 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/15


2025-05-24 19:28:23.195117: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 684ms/step - loss: 14.7560 - val_loss: 6.8651 - learning_rate: 1.0000e-04
Epoch 2/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 694ms/step - loss: 6.8199 - val_loss: 6.3878 - learning_rate: 1.0000e-04
Epoch 3/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 688ms/step - loss: 5.9607 - val_loss: 5.5083 - learning_rate: 1.0000e-04
Epoch 4/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 692ms/step - loss: 5.4618 - val_loss: 6.0912 - learning_rate: 1.0000e-04
Epoch 5/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 606ms/step - loss: 5.3138
Epoch 5: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 685ms/step - loss: 5.3135 - val_loss: 6.0960 - learning_rate: 1.0000e-04
Epoch 6/15
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 681ms/ste




✅ VGG19 Fine-Tuned (Top 8 Layers + Augmentation + LR Scheduler):
📈 Pearson r (Overall): 0.634
👦 Pearson r (Male):    0.677
👧 Pearson r (Female):  0.584
📏 Mean Absolute Error: 5.00


In [16]:
# Convert CNN to .tflite

import tensorflow as tf

print("🚀 Starting TFLite conversion...")

# Set working directory (optional if you're already there)
os.chdir("/Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project")
print("📁 Current Working Directory:", os.getcwd())

keras_path = "vgg19_finetuned_aug_lrsched_bmi_model.keras"
assert os.path.exists(keras_path), f"❌ Model not found: {keras_path}"
print("✅ Found model:", keras_path)

# --- Load the model (compile=False is safe since you're not retraining) ---
model = tf.keras.models.load_model(keras_path, compile=False)

# --- Convert to TFLite ---
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Optional: Post-training quantization to reduce file size and enable edge deployment
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Convert
tflite_model = converter.convert()

# Save to file
tflite_path = "vgg19_finetuned_aug_lrsched_bmi_model_quant.tflite"
with open(tflite_path, "wb") as f:
    f.write(tflite_model)

print(f"✅ Saved TFLite model to {tflite_path}")


🚀 Starting TFLite conversion...
📁 Current Working Directory: /Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project
✅ Found model: vgg19_finetuned_aug_lrsched_bmi_model.keras
INFO:tensorflow:Assets written to: /var/folders/qd/9k8sr40n2mq8fbk6jyf7q9tw0000gn/T/tmpw7ribk2_/assets


INFO:tensorflow:Assets written to: /var/folders/qd/9k8sr40n2mq8fbk6jyf7q9tw0000gn/T/tmpw7ribk2_/assets


Saved artifact at '/var/folders/qd/9k8sr40n2mq8fbk6jyf7q9tw0000gn/T/tmpw7ribk2_'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name='input_layer_4')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  13399268512: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13478674272: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14501662560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14501658688: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14359819360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14359820240: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14188864960: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14188865136: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14188942784: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14188942608: TensorSpec(shape=(), dtype=tf.resource, name=None)
  143329654

W0000 00:00:1748138178.356070 1017195 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1748138178.356091 1017195 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.


In [21]:
# Convert to .pkl

import pickle

pkl_path = "vgg19_finetuned_aug_lrsched_bmi_model_quant.pkl"
with open(pkl_path, "wb") as f:
    pickle.dump(tflite_model, f)
print(f"✅ Pickled TFLite bytes to {pkl_path}")

✅ Pickled TFLite bytes to vgg19_finetuned_aug_lrsched_bmi_model_quant.pkl


### ii) Feature Extraction + Regressor

**Step 1: Extraction - Using Best Fine-Tuned VGG19**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input  # works for VGG19 too

from tqdm import tqdm  # ← progress bar

# --- Load the fine-tuned model ---
model = tf.keras.models.load_model(
    '/Users/halleluyamengesha/Desktop/UChicago/_Quarter_3/Machine Learning II/Project/vgg19_finetuned_aug_lrsched_bmi_model.keras'
)

# --- Extract the penultimate layer (Dense 256) as the feature output ---
feature_model = Model(inputs=model.input, outputs=model.get_layer(index=-2).output)

# --- Preprocessing function ---
def preprocess_image(img_path):
    img = Image.open(img_path).convert("RGB").resize((224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array)

# --- Loop through df_valid for feature extraction ---
features = []
labels = []
image_paths = []

print(f"🔄 Extracting features from {len(df_valid)} images...\n")

for i, row in tqdm(df_valid.iterrows(), total=len(df_valid), desc="Extracting"):
    try:
        img_array = preprocess_image(row['image_path'])
        feat_vec = feature_model.predict(img_array, verbose=0)[0]  # 256-d
        features.append(feat_vec)
        labels.append(row['bmi'])
        image_paths.append(row['image_path'])

        # Optional: log every 100 samples
        if i % 100 == 0 and i > 0:
            print(f"✅ Processed {i} images...")

    except Exception as e:
        print(f"⚠️ Error on {row['image_path']}: {e}")

# --- Save features and labels ---
features_array = np.array(features)
labels_array = np.array(labels)

# Optionally save to disk
np.save("vgg19_features.npy", features_array)
np.save("vgg19_labels.npy", labels_array)

# Or create a DataFrame if you prefer
df_features = pd.DataFrame(features_array)
df_features['bmi'] = labels_array
df_features['image_path'] = image_paths
df_features.to_csv("vgg19_features_labeled.csv", index=False)

print("\n✅ Feature extraction complete.")
print(f"🔢 Features shape: {features_array.shape}")


🔄 Extracting features from 3962 images...



Extracting:   3%|▎         | 103/3962 [00:07<04:42, 13.68it/s]

✅ Processed 100 images...


Extracting:   5%|▌         | 203/3962 [00:15<04:37, 13.57it/s]

✅ Processed 200 images...


Extracting:   8%|▊         | 303/3962 [00:22<04:33, 13.37it/s]

✅ Processed 300 images...


Extracting:  10%|█         | 403/3962 [00:30<04:14, 13.99it/s]

✅ Processed 400 images...


Extracting:  13%|█▎        | 503/3962 [00:37<04:16, 13.49it/s]

✅ Processed 500 images...


Extracting:  15%|█▌        | 603/3962 [00:44<04:06, 13.65it/s]

✅ Processed 600 images...


Extracting:  18%|█▊        | 703/3962 [00:52<04:04, 13.31it/s]

✅ Processed 700 images...


Extracting:  20%|██        | 803/3962 [00:59<04:07, 12.75it/s]

✅ Processed 800 images...


Extracting:  23%|██▎       | 903/3962 [01:06<03:43, 13.71it/s]

✅ Processed 900 images...


Extracting:  25%|██▌       | 1003/3962 [01:14<03:34, 13.79it/s]

✅ Processed 1000 images...


Extracting:  28%|██▊       | 1103/3962 [01:21<03:33, 13.37it/s]

✅ Processed 1100 images...


Extracting:  30%|███       | 1203/3962 [01:29<03:23, 13.56it/s]

✅ Processed 1200 images...


Extracting:  33%|███▎      | 1303/3962 [01:36<03:14, 13.64it/s]

✅ Processed 1300 images...


Extracting:  35%|███▌      | 1403/3962 [01:43<03:04, 13.88it/s]

✅ Processed 1400 images...


Extracting:  38%|███▊      | 1503/3962 [01:51<03:02, 13.48it/s]

✅ Processed 1500 images...


Extracting:  40%|████      | 1603/3962 [01:58<02:55, 13.46it/s]

✅ Processed 1600 images...


Extracting:  43%|████▎     | 1703/3962 [02:06<02:47, 13.51it/s]

✅ Processed 1700 images...


Extracting:  46%|████▌     | 1803/3962 [02:13<02:36, 13.76it/s]

✅ Processed 1800 images...


Extracting:  48%|████▊     | 1903/3962 [02:21<02:29, 13.75it/s]

✅ Processed 1900 images...


Extracting:  51%|█████     | 2003/3962 [02:28<02:25, 13.48it/s]

✅ Processed 2000 images...


Extracting:  53%|█████▎    | 2103/3962 [02:36<02:17, 13.55it/s]

✅ Processed 2100 images...


Extracting:  56%|█████▌    | 2203/3962 [02:43<02:13, 13.17it/s]

✅ Processed 2200 images...


Extracting:  58%|█████▊    | 2303/3962 [02:50<02:00, 13.77it/s]

✅ Processed 2300 images...


Extracting:  61%|██████    | 2403/3962 [02:58<01:54, 13.64it/s]

✅ Processed 2400 images...


Extracting:  63%|██████▎   | 2503/3962 [03:05<01:46, 13.67it/s]

✅ Processed 2500 images...


Extracting:  66%|██████▌   | 2603/3962 [03:13<01:40, 13.47it/s]

✅ Processed 2600 images...


Extracting:  68%|██████▊   | 2703/3962 [03:20<01:41, 12.39it/s]

✅ Processed 2700 images...


Extracting:  71%|███████   | 2803/3962 [03:28<01:25, 13.54it/s]

✅ Processed 2800 images...


Extracting:  73%|███████▎  | 2903/3962 [03:35<01:17, 13.63it/s]

✅ Processed 2900 images...


Extracting:  76%|███████▌  | 3003/3962 [03:43<01:12, 13.23it/s]

✅ Processed 3000 images...


Extracting:  78%|███████▊  | 3103/3962 [03:50<01:02, 13.71it/s]

✅ Processed 3100 images...


Extracting:  81%|████████  | 3203/3962 [03:57<00:54, 13.86it/s]

✅ Processed 3200 images...


Extracting:  83%|████████▎ | 3303/3962 [04:05<00:47, 13.81it/s]

✅ Processed 3300 images...


Extracting:  86%|████████▌ | 3403/3962 [04:12<00:41, 13.45it/s]

✅ Processed 3400 images...


Extracting:  88%|████████▊ | 3503/3962 [04:20<00:33, 13.67it/s]

✅ Processed 3500 images...


Extracting:  91%|█████████ | 3603/3962 [04:27<00:27, 13.03it/s]

✅ Processed 3600 images...


Extracting:  93%|█████████▎| 3703/3962 [04:35<00:19, 13.43it/s]

✅ Processed 3700 images...


Extracting:  96%|█████████▌| 3803/3962 [04:42<00:11, 13.85it/s]

✅ Processed 3800 images...


Extracting:  99%|█████████▊| 3903/3962 [04:50<00:04, 13.57it/s]

✅ Processed 3900 images...


Extracting: 100%|██████████| 3962/3962 [04:54<00:00, 13.45it/s]



✅ Feature extraction complete.
🔢 Features shape: (3962, 256)


In [17]:
from tensorflow.keras.models import Model

print("🚀 Starting feature extractor conversion...")

# Load full model
full_model = tf.keras.models.load_model("vgg19_finetuned_aug_lrsched_bmi_model.keras")

# Extract Dense(256) output layer
feature_model = Model(inputs=full_model.input, outputs=full_model.get_layer(index=-2).output)

# Convert to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(feature_model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Save to disk
with open("vgg19_feature_extractor.tflite", "wb") as f:
    f.write(tflite_model)

print("✅ Saved feature extractor as vgg19_feature_extractor.tflite")

🚀 Starting feature extractor conversion...
INFO:tensorflow:Assets written to: /var/folders/qd/9k8sr40n2mq8fbk6jyf7q9tw0000gn/T/tmp7fimkf5t/assets


INFO:tensorflow:Assets written to: /var/folders/qd/9k8sr40n2mq8fbk6jyf7q9tw0000gn/T/tmp7fimkf5t/assets


Saved artifact at '/var/folders/qd/9k8sr40n2mq8fbk6jyf7q9tw0000gn/T/tmp7fimkf5t'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name='input_layer_4')
Output Type:
  TensorSpec(shape=(None, 256), dtype=tf.float32, name=None)
Captures:
  14561128112: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14561127056: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14561196864: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14561196160: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14561155904: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14561154848: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14218340128: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14218339776: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14218180032: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14218179328: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1421818

W0000 00:00:1748138218.074144 1017195 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1748138218.074156 1017195 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.


**Step 2: Regression Models**

**a) Individual Models**

In [19]:
df_features = pd.read_csv("vgg19_features_labeled.csv")

# Extract filename from path
df_features["file"] = df_features["image_path"].apply(lambda p: os.path.basename(p))
df_valid["file"] = df_valid["image_path"].apply(lambda p: os.path.basename(p))

# Merge
df_features = df_features.merge(df_valid[['file', 'gender', 'is_training']], on='file', how='left')

# Check for nulls
print(df_features["is_training"].isna().sum(), "rows with missing is_training")

# Sanity check split counts
print("Train:", sum(df_features["is_training"] == 1))
print("Test:", sum(df_features["is_training"] == 0))


0 rows with missing is_training
Train: 3210
Test: 752


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

import joblib
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# --- Load features ---
df_features = pd.read_csv("vgg19_features_labeled.csv")

# --- Merge gender and is_training from df_valid ---
df_features["file"] = df_features["image_path"].apply(os.path.basename)
df_valid["file"] = df_valid["image_path"].apply(os.path.basename)
df_features = df_features.merge(df_valid[["file", "gender", "is_training"]], on="file", how="left")

# --- Check merge worked ---
assert df_features["is_training"].notna().all(), "❌ Some rows are missing is_training after merge."
assert df_features["gender"].notna().all(), "❌ Some rows are missing gender after merge."

# --- Prepare features/labels ---
X = df_features.drop(columns=["bmi", "image_path", "file", "gender", "is_training"]).values
y = df_features["bmi"].values
genders = df_features["gender"].values
split = df_features["is_training"].astype(bool).values  # True = train

X_train, X_test = X[split], X[~split]
y_train, y_test = y[split], y[~split]
g_train, g_test = genders[split], genders[~split]

# --- Regressors to compare ---
regressors = {
    "Ridge": Ridge(alpha=1.0),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(C=10, epsilon=1.0),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "MLP": MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42),
    "XGB": xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM": lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "CatBoost": CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=0, random_state=42)
}

os.makedirs("models", exist_ok=True)

# --- Train and evaluate ---
for name, model in regressors.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Save the trained model
    model_path = f"models/vgg19_{name.replace(' ', '_').lower()}_model.pkl"
    joblib.dump(model, model_path)
    print(f"💾 Saved {name} model to {model_path}")

    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    r_all, _ = pearsonr(y_test, y_pred)

    # Gender-specific Pearson r
    try:
        r_male, _ = pearsonr(y_test[g_test == "Male"], y_pred[g_test == "Male"])
        r_female, _ = pearsonr(y_test[g_test == "Female"], y_pred[g_test == "Female"])
    except Exception as e:
        r_male, r_female = np.nan, np.nan
        print(f"⚠️ Gender-specific correlation failed for {name}: {e}")

    print(f"\n✅ {name}")
    print(f"📏 MAE: {mae:.2f}")
    print(f"📈 Pearson r (Overall): {r_all:.3f}")
    print(f"👦 Pearson r (Male):    {r_male:.3f}")
    print(f"👧 Pearson r (Female):  {r_female:.3f}")

💾 Saved Ridge model to models/vgg19_ridge_model.pkl

✅ Ridge
📏 MAE: 5.15
📈 Pearson r (Overall): 0.633
👦 Pearson r (Male):    0.675
👧 Pearson r (Female):  0.577
💾 Saved Random Forest model to models/vgg19_random_forest_model.pkl

✅ Random Forest
📏 MAE: 4.99
📈 Pearson r (Overall): 0.647
👦 Pearson r (Male):    0.695
👧 Pearson r (Female):  0.584
💾 Saved SVR model to models/vgg19_svr_model.pkl

✅ SVR
📏 MAE: 5.00
📈 Pearson r (Overall): 0.649
👦 Pearson r (Male):    0.697
👧 Pearson r (Female):  0.584
💾 Saved KNN model to models/vgg19_knn_model.pkl

✅ KNN
📏 MAE: 5.20
📈 Pearson r (Overall): 0.626
👦 Pearson r (Male):    0.668
👧 Pearson r (Female):  0.572
💾 Saved MLP model to models/vgg19_mlp_model.pkl

✅ MLP
📏 MAE: 4.98
📈 Pearson r (Overall): 0.649
👦 Pearson r (Male):    0.696
👧 Pearson r (Female):  0.586
💾 Saved XGB model to models/vgg19_xgb_model.pkl

✅ XGB
📏 MAE: 5.08
📈 Pearson r (Overall): 0.635
👦 Pearson r (Male):    0.675
👧 Pearson r (Female):  0.582
[LightGBM] [Info] Auto-choosing col-wise

**b) Ensemble Model**

In [None]:
from sklearn.ensemble import StackingRegressor
from tqdm import tqdm

# --- Define base learners ---
base_learners = [
    ("svr", SVR(C=10, epsilon=1.0)),
    ("mlp", MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)),
    ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("catboost", CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=0, random_state=42))
]

# --- Meta-learner (Ridge is good for numeric target with small # of features) ---
meta_learner = Ridge(alpha=1.0)

# --- Build stacking ensemble (let sklearn train everything) ---
ensemble = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1
)

# --- Fit ensemble with progress tracking ---
print("\n⏳ Training stacking ensemble...")
with tqdm(total=1, desc="Ensemble Training", bar_format="{desc}: {bar} {percentage:3.0f}%") as pbar:
    ensemble.fit(X_train, y_train)
    pbar.update(1)

# --- Predict and evaluate ---
y_pred = ensemble.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r_all, _ = pearsonr(y_test, y_pred)
r_male, _ = pearsonr(y_test[g_test == "Male"], y_pred[g_test == "Male"])
r_female, _ = pearsonr(y_test[g_test == "Female"], y_pred[g_test == "Female"])

# --- Report ---
print("\n✅ Ensemble Stacking Regressor")
print(f"📏 MAE: {mae:.2f}")
print(f"📈 Pearson r (Overall): {r_all:.3f}")
print(f"👦 Pearson r (Male):    {r_male:.3f}")
print(f"👧 Pearson r (Female):  {r_female:.3f}")



⏳ Training stacking ensemble...


Ensemble Training: ██████████ 100%



✅ Ensemble Stacking Regressor
📏 MAE: 4.99
📈 Pearson r (Overall): 0.649
👦 Pearson r (Male):    0.699
👧 Pearson r (Female):  0.583
