In [16]:
import sys
!{sys.executable} -m pip install keras tensorflow --upgrade
!{sys.executable} -m pip install keras_tuner

import os
import joblib
import numpy as np
import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras
from keras import backend as K
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [17]:
# Step 2: Data Loading and Preprocessing
# Load the training data, apply robust scaling based on EDA, and split into train/test sets. The scaler is saved for future inference.
DATA_PATH = "../data/raw/training_data.npz"
SCALER_DIR = "../data/processed/scalers"
MODEL_DIR = "../src/models"
FIGURE_DIR = "../src/visualization/plots"
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(FIGURE_DIR, exist_ok=True)

def load_and_preprocess_data(path=DATA_PATH, test_size=0.2, random_state=42, scaler_type='standard'):
    data = np.load(path)
    X, y = data['X'], data['y']
    print(f"Loaded dataset: X shape {X.shape}, y shape {y.shape}")
    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Separate opt_flag (last column)
    X_train_prefix = X_train[:, :-1]
    X_test_prefix = X_test[:, :-1]
    opt_flag_train = X_train[:, -1].reshape(-1, 1)
    opt_flag_test = X_test[:, -1].reshape(-1, 1)
    # Choose scaler based on EDA
    if scaler_type == 'minmax':
        scaler = MinMaxScaler()
    elif scaler_type == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_prefix)
    X_test_scaled = scaler.transform(X_test_prefix)
    # Reattach opt_flag
    X_train_final = np.hstack([X_train_scaled, opt_flag_train])
    X_test_final = np.hstack([X_test_scaled, opt_flag_test])
    # Save scaler for later inference
    os.makedirs(SCALER_DIR, exist_ok=True)
    joblib.dump(scaler, os.path.join(SCALER_DIR, "feature_scaler.pkl"))
    print(f"Train set: {X_train_final.shape}, Test set: {X_test_final.shape}")
    return X_train_final, X_test_final, y_train, y_test

X_train, X_test, y_train, y_test = load_and_preprocess_data(scaler_type='standard')

Loaded dataset: X shape (50000, 20), y shape (50000,)
Train set: (40000, 20), Test set: (10000, 20)


In [18]:
# Step 3: Training Metrics Plotting
# This function plots and saves Loss, MAE, and RMSE curves for each model during training.
def plot_training_metrics(history, title, filename):
    plt.figure(figsize=(15, 4))
    # Loss plot
    plt.subplot(1, 3, 1)
    plt.plot(history.history["loss"], label="Train Loss")
    plt.plot(history.history["val_loss"], label="Val Loss")
    plt.title(f"{title} - Loss")
    plt.xlabel("Epoch")
    plt.ylabel("MSE Loss")
    plt.legend()
    # MAE plot
    plt.subplot(1, 3, 2)
    plt.plot(history.history["mae"], label="Train MAE")
    plt.plot(history.history["val_mae"], label="Val MAE")
    plt.title(f"{title} - MAE")
    plt.xlabel("Epoch")
    plt.ylabel("Mean Absolute Error")
    plt.legend()
    # RMSE plot (computed from loss)
    train_rmse = np.sqrt(history.history["loss"])
    val_rmse = np.sqrt(history.history["val_loss"])
    plt.subplot(1, 3, 3)
    plt.plot(train_rmse, label="Train RMSE")
    plt.plot(val_rmse, label="Val RMSE")
    plt.title(f"{title} - RMSE")
    plt.xlabel("Epoch")
    plt.ylabel("Root MSE")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURE_DIR, filename))
    plt.close()

In [19]:
# Step 4: Large MLP Model (Hyperband Tuning)
# This model uses Keras Tuner's Hyperband to optimize architecture and learning rate. It serves as a strong baseline for tabular option pricing.
def build_large_mlp(hp):
    model = keras.Sequential()
    input_dim = X_train.shape[1]
    model.add(layers.Input(shape=(input_dim,)))
    num_layers = hp.Int("num_layers", min_value=2, max_value=6, step=1)
    for i in range(num_layers):
        units = hp.Int(f"units_{i}", min_value=32, max_value=512, step=32)
        activation = hp.Choice("activation", ["relu", "tanh"])
        model.add(layers.Dense(units=units, activation=activation))
    model.add(layers.Dense(1, activation="linear"))
    optimizer_choice = hp.Choice("optimizer", ["adam", "sgd"])
    learning_rate = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    if optimizer_choice == "adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
    return model

tuner = kt.Hyperband(
    build_large_mlp,
    objective="val_mae",
    max_epochs=40,
    factor=3,
    directory="tuner_logs",
    project_name="large_mlp_tuning"
    )
stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)
tuner.search(
    X_train, y_train,
    validation_data=(X_test, y_test),
    callbacks=[stop_early],
    batch_size=kt.HyperParameters().Int("batch_size", min_value=32, max_value=256, step=32),
    epochs=40
    )
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model_large = tuner.hypermodel.build(best_hps)
history_large = model_large.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=40,
    batch_size=64,
    verbose=1
    )
y_pred_large = model_large.predict(X_test).flatten()
mae_large = mean_absolute_error(y_test, y_pred_large)
rmse_large = np.sqrt(mean_squared_error(y_test, y_pred_large))
print(f"Large MLP Evaluation: MAE={mae_large:.4f}, RMSE={rmse_large:.4f}")
plot_training_metrics(history_large, "Large MLP (Tuned)", "training_large_tuned.png")
model_large.save(os.path.join(MODEL_DIR, "mlp_large_tuned.h5"))

Reloading Tuner from tuner_logs\large_mlp_tuning\tuner0.json
Epoch 1/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 137.0176 - mae: 8.8824 - val_loss: 92.4278 - val_mae: 6.6603
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 137.0176 - mae: 8.8824 - val_loss: 92.4278 - val_mae: 6.6603
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 949us/step - loss: 81.7370 - mae: 6.0119 - val_loss: 54.1210 - val_mae: 4.9909
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 949us/step - loss: 81.7370 - mae: 6.0119 - val_loss: 54.1210 - val_mae: 4.9909
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 947us/step - loss: 21.0402 - mae: 2.7185 - val_loss: 3.3074 - val_mae: 1.2200
Epoch 4/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 947us/step - loss: 21.0402 - mae: 2.7185 - val_loss: 3.3074 - val_mae: 1.2200
Epoch 4

Reloading Tuner from tuner_logs\large_mlp_tuning\tuner0.json
Epoch 1/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 137.0176 - mae: 8.8824 - val_loss: 92.4278 - val_mae: 6.6603
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 137.0176 - mae: 8.8824 - val_loss: 92.4278 - val_mae: 6.6603
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 949us/step - loss: 81.7370 - mae: 6.0119 - val_loss: 54.1210 - val_mae: 4.9909
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 949us/step - loss: 81.7370 - mae: 6.0119 - val_loss: 54.1210 - val_mae: 4.9909
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 947us/step - loss: 21.0402 - mae: 2.7185 - val_loss: 3.3074 - val_mae: 1.2200
Epoch 4/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 947us/step - loss: 21.0402 - mae: 2.7185 - val_loss: 3.3074 - val_mae: 1.2200
Epoch 4



# Neural Network Training Workflow
- Robust preprocessing based on EDA and scaling analysis
- Multiple NN architectures: Large MLP (Hyperband), Residual MLP, Advanced MLP (Bayesian Optimization)
- Hyperparameter tuning for each model
- Evaluation using RMSE, MAE, Loss; metrics plotted and saved
- Each trained model is saved for future inference
- Ensemble model combines predictions for improved accuracy

In [20]:
# Feature Engineering: Add polynomial and interaction features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_fe = poly.fit_transform(X_train)
X_test_fe = poly.transform(X_test)

print(f"Feature engineered shapes: X_train_fe {X_train_fe.shape}, X_test_fe {X_test_fe.shape}")

Feature engineered shapes: X_train_fe (40000, 230), X_test_fe (10000, 230)


In [21]:
# Step 6: Advanced MLP (Bayesian Optimization)
# This model uses Bayesian Optimization to search for optimal architecture and training parameters, aiming for best-in-class tabular regression.
from keras_tuner.tuners import BayesianOptimization

def build_advanced_mlp(hp):
    model = keras.Sequential()
    input_dim = X_train.shape[1]
    model.add(layers.Input(shape=(input_dim,)))
    num_layers = hp.Int("num_layers", 2, 6)
    for i in range(num_layers):
        units = hp.Int(f"units_{i}", 32, 512, step=32)
        activation = hp.Choice(f"activation_{i}", ["relu", "tanh"])
        model.add(layers.Dense(units=units, activation=activation))
    model.add(layers.Dense(1, activation="linear"))
    optimizer_choice = hp.Choice("optimizer", ["adam", "sgd", "rmsprop"])
    learning_rate = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    if optimizer_choice == "adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_choice == "sgd":
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
    return model

bayes_tuner = BayesianOptimization(
    build_advanced_mlp,
    objective="val_mae",
    max_trials=20,
    directory="tuner_logs",
    project_name="large_mlp_bayes_tuning"
    )
stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=7)
bayes_tuner.search(
    X_train, y_train,
    validation_data=(X_test, y_test),
    callbacks=[stop_early],
    batch_size=64,
    epochs=40
    )
best_bayes_hps = bayes_tuner.get_best_hyperparameters(num_trials=1)[0]
advanced_model = bayes_tuner.hypermodel.build(best_bayes_hps)
history_adv = advanced_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=40,
    batch_size=64,
    verbose=1
    )
y_pred_adv = advanced_model.predict(X_test).flatten()
mae_adv = mean_absolute_error(y_test, y_pred_adv)
rmse_adv = np.sqrt(mean_squared_error(y_test, y_pred_adv))
print(f"Advanced MLP (Bayesian) Evaluation: MAE={mae_adv:.4f}, RMSE={rmse_adv:.4f}")
plot_training_metrics(history_adv, "Advanced MLP (Bayesian)", "training_large_advanced_tuned.png")
advanced_model.save(os.path.join(MODEL_DIR, "mlp_large_advanced_tuned.h5"))

Reloading Tuner from tuner_logs\large_mlp_bayes_tuning\tuner0.json
Epoch 1/40
Epoch 1/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 71.7113 - mae: 5.5656 - val_loss: 4.6096 - val_mae: 1.4736
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 71.7113 - mae: 5.5656 - val_loss: 4.6096 - val_mae: 1.4736
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 971us/step - loss: 2.7609 - mae: 1.0896 - val_loss: 2.0525 - val_mae: 0.9176
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 971us/step - loss: 2.7609 - mae: 1.0896 - val_loss: 2.0525 - val_mae: 0.9176
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 981us/step - loss: 1.7429 - mae: 0.8246 - val_loss: 1.8845 - val_mae: 0.8732
Epoch 4/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 981us/step - loss: 1.7429 - mae: 0.8246 - val_loss: 1.8845 - val_mae: 0.8732


Reloading Tuner from tuner_logs\large_mlp_bayes_tuning\tuner0.json
Epoch 1/40
Epoch 1/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 71.7113 - mae: 5.5656 - val_loss: 4.6096 - val_mae: 1.4736
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 71.7113 - mae: 5.5656 - val_loss: 4.6096 - val_mae: 1.4736
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 971us/step - loss: 2.7609 - mae: 1.0896 - val_loss: 2.0525 - val_mae: 0.9176
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 971us/step - loss: 2.7609 - mae: 1.0896 - val_loss: 2.0525 - val_mae: 0.9176
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 981us/step - loss: 1.7429 - mae: 0.8246 - val_loss: 1.8845 - val_mae: 0.8732
Epoch 4/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 981us/step - loss: 1.7429 - mae: 0.8246 - val_loss: 1.8845 - val_mae: 0.8732




## Advanced Model Improvements
Let's try more advanced deep learning techniques: residual connections, dropout, batch normalization, and learning rate scheduling.

In [23]:
# Step 5: Residual MLP Model
# This model introduces residual connections, dropout, and batch normalization for improved generalization and deeper learning.
from tensorflow.keras import regularizers

def build_residual_mlp(input_dim, n_layers=4, units=128, dropout_rate=0.2, l2_reg=1e-4):
    inputs = keras.Input(shape=(input_dim,))
    x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(inputs)
    for i in range(n_layers):
        shortcut = x
        x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.Add()([x, shortcut])  # Residual connection
    outputs = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(), loss='mse', metrics=['mae'])
    return model

res_mlp = build_residual_mlp(X_train.shape[1], n_layers=3, units=256, dropout_rate=0.3, l2_reg=1e-3)
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)
history_res = res_mlp.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=40,
    batch_size=64,
    callbacks=[lr_scheduler],
    verbose=1
    )
res_pred = res_mlp.predict(X_test).flatten()
mae_res = mean_absolute_error(y_test, res_pred)
rmse_res = np.sqrt(mean_squared_error(y_test, res_pred))
print(f"Residual MLP Evaluation: MAE={mae_res:.4f}, RMSE={rmse_res:.4f}")
plot_training_metrics(history_res, "Residual MLP", "training_residual_mlp.png")
res_mlp.save(os.path.join(MODEL_DIR, "mlp_large_residual_tuned.h5"))

Epoch 1/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 76.0410 - mae: 5.8400 - val_loss: 15.4982 - val_mae: 2.8000 - learning_rate: 0.0010
Epoch 2/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.3556 - mae: 2.6920 - val_loss: 6.5562 - val_mae: 1.7460 - learning_rate: 0.0010
Epoch 3/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 11.1529 - mae: 2.3747 - val_loss: 3.5976 - val_mae: 1.1456 - learning_rate: 0.0010
Epoch 4/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9.9112 - mae: 2.2530 - val_loss: 4.2431 - val_mae: 1.2711 - learning_rate: 0.0010
Epoch 5/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9.1976 - mae: 2.1566 - val_loss: 2.3209 - val_mae: 0.7822 - learning_rate: 0.0010
Epoch 6/40
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8.2547 - mae: 2.0485 -



In [24]:
# Step 7: Ensemble Model
# The ensemble averages predictions from all trained models to further improve accuracy and robustness.
ensemble_preds = (y_pred_large + res_pred + y_pred_adv) / 3
mae_ensemble = mean_absolute_error(y_test, ensemble_preds)
rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_preds))
print(f"Ensemble Model Evaluation: MAE={mae_ensemble:.4f}, RMSE={rmse_ensemble:.4f}")

Ensemble Model Evaluation: MAE=0.5744, RMSE=1.0644


# Summary and Next Steps
- Data was robustly preprocessed based on EDA.
- Three advanced neural network models were trained and tuned: Large MLP (Hyperband), Residual MLP, and Advanced MLP (Bayesian Optimization).
- Each model was evaluated using RMSE and MAE, with training curves plotted and saved.
- An ensemble model was created for improved accuracy.
- All models are saved for future inference.

**Next steps:**
- Explore deeper architectures, regularization, and uncertainty estimation (MC Dropout, quantile regression).
- Try feature engineering, PCA, and stacking ensembles for further gains.
- Analyze model errors and refine training data or architecture as needed.

In [26]:
# Deeper Residual Network with Multi-Level Skip Connections and More Regularization
from tensorflow.keras.layers import GaussianNoise
from tensorflow.keras.optimizers import AdamW, Nadam

def build_deep_residual_mlp(input_dim, n_layers=6, units=128, dropout_rate=0.3, l1_reg=1e-4, l2_reg=1e-4, noise_std=0.05):
    inputs = keras.Input(shape=(input_dim,))
    x = GaussianNoise(noise_std)(inputs)
    x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))(x)
    skip = x
    for i in range(n_layers):
        x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout_rate)(x)
        if i % 2 == 1:
            x = layers.Add()([x, skip])  # Multi-level skip connection
    outputs = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=AdamW(), loss='mse', metrics=['mae', 'mse'])
    return model

# Train deeper residual model
advanced_res_mlp = build_deep_residual_mlp(X_train_fe.shape[1], n_layers=6, units=256, dropout_rate=0.3, l1_reg=1e-4, l2_reg=1e-3, noise_std=0.05)
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)

history_adv_res = advanced_res_mlp.fit(
    X_train_fe, y_train,
    validation_data=(X_test_fe, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[lr_scheduler],
    verbose=1
)

# Evaluate
adv_res_pred = advanced_res_mlp.predict(X_test_fe).flatten()
mae_adv_res = mean_absolute_error(y_test, adv_res_pred)
rmse_adv_res = np.sqrt(mean_squared_error(y_test, adv_res_pred))
r2_adv_res = r2_score(y_test, adv_res_pred)

print(f"\nDeeper Residual MLP Evaluation:\n  MAE: {mae_adv_res:.4f}\n  RMSE: {rmse_adv_res:.4f}\n  R²: {r2_adv_res:.4f}")
advanced_res_mlp.save(os.path.join(MODEL_DIR, "mlp_large_deep_residual_tuned.h5"))
plot_training_metrics(history_adv_res, "Deep Residual MLP", "training_deep_residual_mlp.png")

Epoch 1/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 86.7469 - mae: 6.1939 - mse: 82.4059 - val_loss: 31.0816 - val_mae: 3.5045 - val_mse: 26.7075 - learning_rate: 0.0010
Epoch 2/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 20.5677 - mae: 2.7786 - mse: 16.2067 - val_loss: 10.9460 - val_mae: 1.6049 - val_mse: 6.6035 - learning_rate: 0.0010
Epoch 3/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 15.2482 - mae: 2.3033 - mse: 10.9304 - val_loss: 7.3173 - val_mae: 1.0927 - val_mse: 3.0277 - learning_rate: 0.0010
Epoch 4/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 14.4865 - mae: 2.1818 - mse: 10.2171 - val_loss: 7.4064 - val_mae: 1.2364 - val_mse: 3.1495 - learning_rate: 0.0010
Epoch 5/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 11.5014 - mae: 1.9784 - mse: 7.2981 - val_loss: 6.0669 - val_ma




Deeper Residual MLP Evaluation:
  MAE: 0.6659
  RMSE: 1.1447
  R²: 0.9935


In [27]:
# Cyclical Learning Rate Callback
from tensorflow.keras.callbacks import Callback

class CyclicLR(Callback):
    def __init__(self, base_lr=1e-4, max_lr=1e-2, step_size=2000):
        super().__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.iterations = 0
    def on_train_batch_begin(self, batch, logs=None):
        cycle = np.floor(1 + self.iterations / (2 * self.step_size))
        x = np.abs(self.iterations / self.step_size - 2 * cycle + 1)
        lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, (1 - x))
        K.set_value(self.model.optimizer.lr, lr)
        self.iterations += 1

# Example usage:
# clr = CyclicLR(base_lr=1e-4, max_lr=1e-2, step_size=1000)
# history = model.fit(..., callbacks=[clr])


In [28]:
# Feature Selection: Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, svd_solver='full')  # retain 95% variance
X_train_pca = pca.fit_transform(X_train_fe)
X_test_pca = pca.transform(X_test_fe)

print(f"PCA reduced shape: X_train_pca {X_train_pca.shape}, X_test_pca {X_test_pca.shape}")


PCA reduced shape: X_train_pca (40000, 101), X_test_pca (10000, 101)


In [29]:
# Ensemble Stacking: Combine Multiple Models
from sklearn.linear_model import LinearRegression

# Assume you have predictions from several models
# y_pred_adv, res_pred, adv_res_pred (from previous cells)

stacked_preds = np.vstack([
    y_pred_adv,
    res_pred,
    adv_res_pred
]).T

stacker = LinearRegression()
stacker.fit(stacked_preds, y_test)
ensemble_stacked = stacker.predict(stacked_preds)

mae_stacked = mean_absolute_error(y_test, ensemble_stacked)
rmse_stacked = np.sqrt(mean_squared_error(y_test, ensemble_stacked))
r2_stacked = r2_score(y_test, ensemble_stacked)

print(f"\nStacked Ensemble Evaluation:\n  MAE: {mae_stacked:.4f}\n  RMSE: {rmse_stacked:.4f}\n  R²: {r2_stacked:.4f}")


Stacked Ensemble Evaluation:
  MAE: 0.5756
  RMSE: 1.0077
  R²: 0.9949


### Advanced Deep Learning Improvements
We'll now apply:
- MC Dropout for uncertainty estimation
- Quantile regression for predictive intervals
- Mixup data augmentation for tabular data
- Learning rate warmup/scheduling

In [30]:
# MC Dropout for Uncertainty Estimation
from tensorflow.keras import Model

def mc_dropout_predict(model, X, n_iter=100):
    # Enable dropout at inference
    f = Model(model.input, model.output)
    preds = [f(X, training=True).numpy().flatten() for _ in range(n_iter)]
    preds = np.array(preds)
    mean_pred = preds.mean(axis=0)
    std_pred = preds.std(axis=0)
    return mean_pred, std_pred

# Example usage:
mean_pred, std_pred = mc_dropout_predict(advanced_res_mlp, X_test_fe, n_iter=100)
print(f"MC Dropout mean prediction: {mean_pred[:5]}")
print(f"MC Dropout std (uncertainty): {std_pred[:5]}")

MC Dropout mean prediction: [ 0.32455605 29.407522    1.2473353   0.40297022  0.15261444]
MC Dropout std (uncertainty): [0.5068752  0.7433825  0.67587054 0.34708363 0.38833657]


In [31]:
# Quantile Regression for Predictive Intervals
# Use only tensorflow.keras for compatibility
from tensorflow import keras
from tensorflow.keras import layers, backend as K

# Quantile loss function for Keras
# K.maximum and K.mean are from tensorflow.keras.backend

def quantile_loss(q):
    def loss(y_true, y_pred):
        e = y_true - y_pred
        return K.mean(K.maximum(q * e, (q - 1) * e), axis=-1)
    return loss

# Build quantile model (e.g., 0.1, 0.5, 0.9)
def build_quantile_mlp(input_dim, quantile):
    inputs = keras.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss=quantile_loss(quantile))
    return model

# Train quantile models
q_low, q_med, q_high = 0.1, 0.5, 0.9
model_low = build_quantile_mlp(X_train_fe.shape[1], q_low)
model_med = build_quantile_mlp(X_train_fe.shape[1], q_med)
model_high = build_quantile_mlp(X_train_fe.shape[1], q_high)

model_low.fit(X_train_fe, y_train, epochs=20, batch_size=64, verbose=0)
model_med.fit(X_train_fe, y_train, epochs=20, batch_size=64, verbose=0)
model_high.fit(X_train_fe, y_train, epochs=20, batch_size=64, verbose=0)

# Predict intervals
pred_low = model_low.predict(X_test_fe).flatten()
pred_med = model_med.predict(X_test_fe).flatten()
pred_high = model_high.predict(X_test_fe).flatten()

print(f"Quantile interval example: low={pred_low[:5]}, median={pred_med[:5]}, high={pred_high[:5]}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 411us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 412us/step
Quantile interval example: low=[-3.7380363e-05  1.7551762e+01  9.4530261e-01 -3.7380363e-05
 -3.7380363e-05], median=[2.2215044e-04 2.8520277e+01 8.9656967e-01 2.2215044e-04 2.2215044e-04], high=[ 0.10687125 32.447674    0.9467553   0.0888381  -0.13649786]


In [32]:
# Mixup Data Augmentation for Tabular Data
import numpy as np

def mixup(X, y, alpha=0.2):
    '''Mixup augmentation for tabular data.'''
    n_samples = X.shape[0]
    lam = np.random.beta(alpha, alpha, n_samples)
    idx = np.random.permutation(n_samples)
    X_mix = lam[:, None] * X + (1 - lam)[:, None] * X[idx]
    y_mix = lam * y + (1 - lam) * y[idx]
    return X_mix, y_mix

# Example usage:
X_train_mix, y_train_mix = mixup(X_train_fe, y_train, alpha=0.2)
print(f"Mixup sample shapes: {X_train_mix.shape}, {y_train_mix.shape}")
# Train model on mixup data
mixup_model = build_deep_residual_mlp(X_train_mix.shape[1], n_layers=6, units=256)
history_mixup = mixup_model.fit(X_train_mix, y_train_mix, validation_data=(X_test_fe, y_test), epochs=30, batch_size=64, verbose=1)


Mixup sample shapes: (40000, 230), (40000,)
Epoch 1/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 89.8291 - mae: 6.6961 - mse: 87.1499 - val_loss: 32.4558 - val_mae: 3.7826 - val_mse: 29.7592
Epoch 2/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 33.2316 - mae: 3.7542 - mse: 30.5304 - val_loss: 9.1704 - val_mae: 1.7785 - val_mse: 6.4664
Epoch 3/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 26.0108 - mae: 3.2429 - mse: 23.3053 - val_loss: 9.5135 - val_mae: 1.7839 - val_mse: 6.8049
Epoch 4/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 22.3727 - mae: 2.9739 - mse: 19.6619 - val_loss: 6.2404 - val_mae: 1.2138 - val_mse: 3.5274
Epoch 5/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 20.3124 - mae: 2.8177 - mse: 17.5965 - val_loss: 5.9862 - val_mae: 1.2007 - val_mse: 3.2690
Epoch 6/30
[1m625/625[

In [33]:
# Evaluate Mixup Model Performance
y_pred_mixup = mixup_model.predict(X_test_fe).flatten()
mae_mixup = mean_absolute_error(y_test, y_pred_mixup)
rmse_mixup = np.sqrt(mean_squared_error(y_test, y_pred_mixup))
r2_mixup = r2_score(y_test, y_pred_mixup)

print(f"\nMixup Model Evaluation:\n  MAE: {mae_mixup:.4f}\n  RMSE: {rmse_mixup:.4f}\n  R²: {r2_mixup:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step

Mixup Model Evaluation:
  MAE: 0.9922
  RMSE: 1.3890
  R²: 0.9904
