In [2]:
import sys
!{sys.executable} -m pip install keras tensorflow --upgrade
!{sys.executable} -m pip install keras_tuner

import os
import joblib
import numpy as np
import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras
from keras import backend as K
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [3]:
DATA_PATH = "../data/raw/training_data.npz"
SCALER_DIR = "../data/processed/scalers"
MODEL_DIR = "../src/models"
FIGURE_DIR = "../src/visualization/plots"
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(FIGURE_DIR, exist_ok=True)

In [4]:
def load_dataset(path=DATA_PATH):
    """Load dataset from NPZ file."""
    data = np.load(path)
    X, y = data["X"], data["y"]
    print(f"Loaded dataset: X shape {X.shape}, y shape {y.shape}")
    return X, y

X, y = load_dataset()

Loaded dataset: X shape (22680000, 25), y shape (22680000,)


In [5]:
def preprocess_data(X, y, test_size=0.2, random_state=42):
    """
    Splits into train/test and normalizes features.
    Prefix prices + numeric features are scaled.
    opt_flag is kept as-is (categorical 0/1).
    """
    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Separate opt_flag (last column)
    X_train_prefix = X_train[:, :-1]
    X_test_prefix = X_test[:, :-1]

    opt_flag_train = X_train[:, -1].reshape(-1, 1)
    opt_flag_test = X_test[:, -1].reshape(-1, 1)

    # Scale everything except opt_flag
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_prefix)
    X_test_scaled = scaler.transform(X_test_prefix)

    # Reattach opt_flag
    X_train_final = np.hstack([X_train_scaled, opt_flag_train])
    X_test_final = np.hstack([X_test_scaled, opt_flag_test])

    # Save scaler for later inference
    os.makedirs(SCALER_DIR, exist_ok=True)
    joblib.dump(scaler, os.path.join(SCALER_DIR, "feature_scaler.pkl"))

    print(f"Train set: {X_train_final.shape}, Test set: {X_test_final.shape}")
    return X_train_final, X_test_final, y_train, y_test

X_train, X_test, y_train, y_test = preprocess_data(X, y)

Train set: (18144000, 25), Test set: (4536000, 25)


In [6]:
input_dim = X_train.shape[1]
input_dim

25

In [7]:
def plot_training(history, title, filename):
    """Save training curves for loss + MAE + RMSE."""
    plt.figure(figsize=(15, 4))

    # Loss plot
    plt.subplot(1, 3, 1)
    plt.plot(history.history["loss"], label="Train Loss")
    plt.plot(history.history["val_loss"], label="Val Loss")
    plt.title(f"{title} - Loss")
    plt.xlabel("Epoch")
    plt.ylabel("MSE Loss")
    plt.legend()

    # MAE plot
    plt.subplot(1, 3, 2)
    plt.plot(history.history["mae"], label="Train MAE")
    plt.plot(history.history["val_mae"], label="Val MAE")
    plt.title(f"{title} - MAE")
    plt.xlabel("Epoch")
    plt.ylabel("Mean Absolute Error")
    plt.legend()

    # RMSE plot (computed from loss)
    train_rmse = np.sqrt(history.history["loss"])
    val_rmse = np.sqrt(history.history["val_loss"])
    plt.subplot(1, 3, 3)
    plt.plot(train_rmse, label="Train RMSE")
    plt.plot(val_rmse, label="Val RMSE")
    plt.title(f"{title} - RMSE")
    plt.xlabel("Epoch")
    plt.ylabel("Root MSE")
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURE_DIR, filename))
    plt.close()


In [8]:
def build_quick_mlp(input_dim):
    """Small, quick baseline MLP."""
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="linear")
    ])
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


def evaluate_model(model, X_test, y_test):
    """Compute MAE and RMSE on test set."""
    y_pred = model.predict(X_test)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))
    rmse = np.sqrt(np.mean((y_test - y_pred.flatten()) ** 2))
    print(f"Evaluation on Test Set -> MAE: {mae:.4f}, RMSE: {rmse:.4f}")
    return mae, rmse

quick_mlp = build_quick_mlp(input_dim)
history_quick = quick_mlp.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1
)
quick_mlp.save(os.path.join(MODEL_DIR, "mlp_quick.h5"))
plot_training(history_quick, "Quick MLP", "training_quick_run_2.png")
evaluate_model(quick_mlp, X_test, y_test)

Epoch 1/10
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 705us/step - loss: 96.1570 - mae: 4.4488 - val_loss: 94.6974 - val_mae: 4.3128
Epoch 2/10
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 705us/step - loss: 96.1570 - mae: 4.4488 - val_loss: 94.6974 - val_mae: 4.3128
Epoch 2/10
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 677us/step - loss: 94.2414 - mae: 4.3172 - val_loss: 94.2153 - val_mae: 4.2751
Epoch 3/10
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 677us/step - loss: 94.2414 - mae: 4.3172 - val_loss: 94.2153 - val_mae: 4.2751
Epoch 3/10
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 704us/step - loss: 93.9700 - mae: 4.2890 - val_loss: 94.3808 - val_mae: 4.3060
Epoch 4/10
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 704us/step - loss: 93.9700 - mae: 4.2890 - val_loss: 94.3808 - val_mae: 4.3060
Epoch 4/10
[1m2



[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 377us/step
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 377us/step
Evaluation on Test Set -> MAE: 4.2367, RMSE: 9.6985
Evaluation on Test Set -> MAE: 4.2367, RMSE: 9.6985


(np.float64(4.236670574655875), np.float64(9.698543360537787))

In [9]:
def build_large_mlp(hp):
    model = keras.Sequential()
    input_dim = X_train.shape[1]

    # Input layer
    model.add(layers.Input(shape=(input_dim,)))

    # Number of hidden layers
    num_layers = hp.Int("num_layers", min_value=2, max_value=6, step=1)

    for i in range(num_layers):
        units = hp.Int(f"units_{i}", min_value=32, max_value=512, step=32)
        activation = hp.Choice("activation", ["relu", "tanh"])
        model.add(layers.Dense(units=units, activation=activation))

    # Output layer
    model.add(layers.Dense(1, activation="linear"))

    # Optimizer
    optimizer_choice = hp.Choice("optimizer", ["adam", "sgd"])
    learning_rate = hp.Float("lr", 1e-4, 1e-2, sampling="log")

    if optimizer_choice == "adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss="mse",
        metrics=["mae", "mse", "accuracy"]  # include accuracy for monitoring
    )
    return model


In [10]:
# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Tuner
tuner = kt.Hyperband(
    build_large_mlp,
    objective="val_mae",
    max_epochs=50,  # upper bound for epochs
    factor=3,
    directory="tuner_logs",
    project_name="large_mlp_tuning"
)

stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)
tuner.search(
    X_train, y_train,
    validation_data=(X_test, y_test),
    callbacks=[stop_early],
    batch_size=kt.HyperParameters().Int("batch_size", min_value=32, max_value=256, step=32),
    epochs=50
)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
for hp in best_hps.values.keys():
    print(f"  - {hp}: {best_hps.get(hp)}")

Reloading Tuner from tuner_logs\large_mlp_tuning\tuner0.json
  - num_layers: 3
  - units_0: 96
  - activation: relu
  - units_1: 192
  - optimizer: sgd
  - lr: 0.0005822751269209618
  - units_2: 352
  - units_3: 320
  - units_4: 384
  - units_5: 352
  - tuner/epochs: 6
  - tuner/initial_epoch: 0
  - tuner/bracket: 2
  - tuner/round: 0
  - num_layers: 3
  - units_0: 96
  - activation: relu
  - units_1: 192
  - optimizer: sgd
  - lr: 0.0005822751269209618
  - units_2: 352
  - units_3: 320
  - units_4: 384
  - units_5: 352
  - tuner/epochs: 6
  - tuner/initial_epoch: 0
  - tuner/bracket: 2
  - tuner/round: 0


In [11]:
model = tuner.hypermodel.build(best_hps)

batch_size = 64
epochs = 50


history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1
)

Epoch 1/50
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 1ms/step - accuracy: 0.3077 - loss: 97.4116 - mae: 4.4431 - mse: 97.4116 - val_accuracy: 0.3284 - val_loss: 94.9298 - val_mae: 4.2647 - val_mse: 94.9298
Epoch 2/50
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 1ms/step - accuracy: 0.3077 - loss: 97.4116 - mae: 4.4431 - mse: 97.4116 - val_accuracy: 0.3284 - val_loss: 94.9298 - val_mae: 4.2647 - val_mse: 94.9298
Epoch 2/50
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 1ms/step - accuracy: 0.3010 - loss: 95.3363 - mae: 4.3945 - mse: 95.3363 - val_accuracy: 0.2511 - val_loss: 94.9959 - val_mae: 4.4549 - val_mse: 94.9959
Epoch 3/50
[1m283500/283500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 1ms/step - accuracy: 0.3010 - loss: 95.3363 - mae: 4.3945 - mse: 95.3363 - val_accuracy: 0.2511 - val_loss: 94.9959 - val_mae: 4.4549 - val_mse: 94.9959
Epoch 3/50
[1m283500/283500[0m [32m━━━━━━━━━━

In [12]:
y_pred = model.predict(X_test).flatten()
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
mae = np.mean(np.abs(y_test - y_pred))
acc = np.mean(np.isclose(np.round(y_test), np.round(y_pred)))  # crude regression accuracy

print(f"\nFinal Model Evaluation:\n  MAE: {mae:.4f}\n  RMSE: {rmse:.4f}\n  Accuracy: {acc:.4f}")
model.save(os.path.join(MODEL_DIR, "mlp_large_tuned.h5"))
plot_training(history, "Large MLP (Tuned)", "training_large_tuned.png")

[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 437us/step
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 437us/step





Final Model Evaluation:
  MAE: 4.5012
  RMSE: 9.7467
  Accuracy: 0.2132


Not the best numbers, next few steps - 
1. Baseline from mean of y_train -> check wif the NN is beating this
2. Try different batch size and epochs

In [13]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).flatten()

y_baseline = np.full_like(y_test_scaled, fill_value=np.mean(y_train_scaled))
baseline_mae = mean_absolute_error(y_test_scaled, y_baseline)
baseline_rmse = np.sqrt(mean_squared_error(y_test_scaled, y_baseline))
baseline_r2 = r2_score(y_test_scaled, y_baseline)

print(f"\nBaseline (mean predictor):\n"
      f"  MAE: {baseline_mae:.4f}\n"
      f"  RMSE: {baseline_rmse:.4f}\n"
      f"  R²: {baseline_r2:.4f}")


Baseline (mean predictor):
  MAE: 0.7519
  RMSE: 1.0001
  R²: -0.0000


In [14]:
model = tuner.hypermodel.build(best_hps)

In [15]:
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_test_scaled, y_test_scaled),
    epochs=25,
    batch_size=128,
    verbose=1,
)

Epoch 1/25
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 1ms/step - accuracy: 5.5115e-08 - loss: 0.4700 - mae: 0.3875 - mse: 0.4700 - val_accuracy: 0.0000e+00 - val_loss: 0.3378 - val_mae: 0.2744 - val_mse: 0.3378
Epoch 2/25
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 1ms/step - accuracy: 5.5115e-08 - loss: 0.4700 - mae: 0.3875 - mse: 0.4700 - val_accuracy: 0.0000e+00 - val_loss: 0.3378 - val_mae: 0.2744 - val_mse: 0.3378
Epoch 2/25
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 1ms/step - accuracy: 1.1023e-07 - loss: 0.3323 - mae: 0.2679 - mse: 0.3323 - val_accuracy: 0.0000e+00 - val_loss: 0.3324 - val_mae: 0.2637 - val_mse: 0.3324
Epoch 3/25
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 1ms/step - accuracy: 1.1023e-07 - loss: 0.3323 - mae: 0.2679 - mse: 0.3323 - val_accuracy: 0.0000e+00 - val_loss: 0.3324 - val_mae: 0.2637 - val_mse: 0.3324
Epoch 3/25
[1m141750/141750[0m

In [16]:
y_pred_scaled = model.predict(X_test_scaled).flatten()
mae = mean_absolute_error(y_test_scaled, y_pred_scaled)
rmse = np.sqrt(mean_squared_error(y_test_scaled, y_pred_scaled))
r2 = r2_score(y_test_scaled, y_pred_scaled)

print(f"\nFinal Model Evaluation (scaled):\n"
      f"  MAE: {mae:.4f}\n"
      f"  RMSE: {rmse:.4f}\n"
      f"  R²: {r2:.4f}")

[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 424us/step
[1m141750/141750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 424us/step

Final Model Evaluation (scaled):
  MAE: 0.2509
  RMSE: 0.5731
  R²: 0.6716

Final Model Evaluation (scaled):
  MAE: 0.2509
  RMSE: 0.5731
  R²: 0.6716


- Model seems to be learning from data, with drops in RMSE and MAE
- Still improvements to be made
- Explore feature engineering, more complex models regularization, CV?

## Further Improvements: Feature Engineering & Advanced Tuning
We'll now try to improve the large MLP by adding feature engineering and using more advanced hyperparameter search methods.

In [17]:
# Feature Engineering: Add polynomial and interaction features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_fe = poly.fit_transform(X_train)
X_test_fe = poly.transform(X_test)

print(f"Feature engineered shapes: X_train_fe {X_train_fe.shape}, X_test_fe {X_test_fe.shape}")

Feature engineered shapes: X_train_fe (18144000, 350), X_test_fe (4536000, 350)


In [18]:
# Advanced Hyperparameter Tuning: Bayesian Optimization
from keras_tuner.tuners import BayesianOptimization

def build_advanced_mlp(hp):
    model = keras.Sequential()
    input_dim = X_train_fe.shape[1]
    model.add(layers.Input(shape=(input_dim,)))
    num_layers = hp.Int("num_layers", 2, 6)
    for i in range(num_layers):
        units = hp.Int(f"units_{i}", 32, 512, step=32)
        activation = hp.Choice(f"activation_{i}", ["relu", "tanh"])
        model.add(layers.Dense(units=units, activation=activation))
    model.add(layers.Dense(1, activation="linear"))
    optimizer_choice = hp.Choice("optimizer", ["adam", "sgd", "rmsprop"])
    learning_rate = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    if optimizer_choice == "adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_choice == "sgd":
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="mse", metrics=["mae", "mse"])
    return model

bayes_tuner = BayesianOptimization(
    build_advanced_mlp,
    objective="val_mae",
    max_trials=20,
    directory="tuner_logs",
    project_name="large_mlp_bayes_tuning"
)

stop_early = keras.callbacks.EarlyStopping(monitor="val_loss", patience=7)
bayes_tuner.search(
    X_train_fe, y_train,
    validation_data=(X_test_fe, y_test),
    callbacks=[stop_early],
    batch_size=64,
    epochs=40
)
best_bayes_hps = bayes_tuner.get_best_hyperparameters(num_trials=1)[0]
for hp in best_bayes_hps.values.keys():
    print(f"  - {hp}: {best_bayes_hps.get(hp)}")

Reloading Tuner from tuner_logs\large_mlp_bayes_tuning\tuner0.json
  - num_layers: 3
  - units_0: 64
  - activation_0: relu
  - units_1: 352
  - activation_1: relu
  - optimizer: adam
  - lr: 0.0016580818738343003
  - units_2: 160
  - activation_2: tanh
  - units_3: 64
  - activation_3: tanh
  - units_4: 64
  - activation_4: relu
  - num_layers: 3
  - units_0: 64
  - activation_0: relu
  - units_1: 352
  - activation_1: relu
  - optimizer: adam
  - lr: 0.0016580818738343003
  - units_2: 160
  - activation_2: tanh
  - units_3: 64
  - activation_3: tanh
  - units_4: 64
  - activation_4: relu


In [19]:
# Train and evaluate improved model
# Fix: Reduce sample size further to avoid MemoryError
max_samples = 5000  # Lower this if you still get MemoryError
X_train_fe_small = X_train_fe[:max_samples]
y_train_small = y_train[:max_samples]
# Optional: Use PCA to reduce feature dimensionality if needed
# from sklearn.decomposition import PCA
# pca = PCA(n_components=100)  # or set to retain 95% variance
# X_train_fe_small = pca.fit_transform(X_train_fe_small)
# X_test_fe_pca = pca.transform(X_test_fe)
advanced_model = bayes_tuner.hypermodel.build(best_bayes_hps)
history_adv = advanced_model.fit(
    X_train_fe_small, y_train_small,
    validation_data=(X_test_fe, y_test),
    epochs=40,
    batch_size=64,
    verbose=1
)

# Evaluate
y_pred_adv = advanced_model.predict(X_test_fe).flatten()
mae_adv = mean_absolute_error(y_test, y_pred_adv)
rmse_adv = np.sqrt(mean_squared_error(y_test, y_pred_adv))
r2_adv = r2_score(y_test, y_pred_adv)

print(f"\nImproved Model Evaluation:\n  MAE: {mae_adv:.4f}\n  RMSE: {rmse_adv:.4f}\n  R²: {r2_adv:.4f}")
advanced_model.save(os.path.join(MODEL_DIR, "mlp_large_advanced_tuned.h5"))
plot_training(history_adv, "Large MLP (Advanced)", "training_large_advanced_tuned.png")

Epoch 1/40
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 1s/step - loss: 253.3592 - mae: 11.2618 - mse: 253.3592 - val_loss: 228.3281 - val_mae: 9.7021 - val_mse: 228.3281
Epoch 2/40
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 1s/step - loss: 253.3592 - mae: 11.2618 - mse: 253.3592 - val_loss: 228.3281 - val_mae: 9.7021 - val_mse: 228.3281
Epoch 2/40
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 579ms/step - loss: 194.8995 - mae: 8.7755 - mse: 194.8995 - val_loss: 207.9048 - val_mae: 9.1993 - val_mse: 207.9048
Epoch 3/40
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 579ms/step - loss: 194.8995 - mae: 8.7755 - mse: 194.8995 - val_loss: 207.9048 - val_mae: 9.1993 - val_mse: 207.9048
Epoch 3/40
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 570ms/step - loss: 171.0040 - mae: 7.7339 - mse: 171.0040 - val_loss: 200.5439 - val_mae: 8.1629 - val_mse: 200.5439
Epoch 4/40
[1m79/79[0m [32m━━━━━━━━━




Improved Model Evaluation:
  MAE: 6.2850
  RMSE: 13.1318
  R²: 0.3983


## Advanced Model Improvements
Let's try more advanced deep learning techniques: residual connections, dropout, batch normalization, and learning rate scheduling.

In [20]:
# Residual MLP with Dropout and BatchNorm
from tensorflow.keras import regularizers

def build_residual_mlp(input_dim, n_layers=4, units=128, dropout_rate=0.2, l2_reg=1e-4):
    inputs = keras.Input(shape=(input_dim,))
    x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(inputs)
    for i in range(n_layers):
        shortcut = x
        x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_reg))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.Add()([x, shortcut])  # Residual connection
    outputs = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(), loss='mse', metrics=['mae', 'mse'])
    return model

res_mlp = build_residual_mlp(X_train_fe.shape[1], n_layers=3, units=256, dropout_rate=0.3, l2_reg=1e-3)
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)

history_res = res_mlp.fit(
    X_train_fe, y_train,
    validation_data=(X_test_fe, y_test),
    epochs=40,
    batch_size=64,
    callbacks=[lr_scheduler],
    verbose=1
)

# Evaluate
res_pred = res_mlp.predict(X_test_fe).flatten()
mae_res = mean_absolute_error(y_test, res_pred)
rmse_res = np.sqrt(mean_squared_error(y_test, res_pred))
r2_res = r2_score(y_test, res_pred)

print(f"\nResidual MLP Evaluation:\n  MAE: {mae_res:.4f}\n  RMSE: {rmse_res:.4f}\n  R²: {r2_res:.4f}")
res_mlp.save(os.path.join(MODEL_DIR, "mlp_large_residual_tuned.h5"))
plot_training(history_res, "Residual MLP", "training_residual_mlp.png")

MemoryError: Unable to allocate 23.7 GiB for an array with shape (18144000, 350) and data type float32

In [None]:
# Ensemble: Average predictions from best models
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict with advanced and residual models
ensemble_preds = (y_pred_adv + res_pred) / 2
mae_ensemble = mean_absolute_error(y_test, ensemble_preds)
rmse_ensemble = np.sqrt(mean_squared_error(y_test, ensemble_preds))
r2_ensemble = r2_score(y_test, ensemble_preds)

print(f"\nEnsemble Model Evaluation:\n  MAE: {mae_ensemble:.4f}\n  RMSE: {rmse_ensemble:.4f}\n  R²: {r2_ensemble:.4f}")


Ensemble Model Evaluation:
  MAE: 5.9232
  RMSE: 9.6243
  R²: 0.4447


## Next-Level Model Improvements
We'll now try several advanced deep learning and ML techniques to further boost performance:
- Deeper residual networks with multi-level skip connections
- More regularization: Dropout, L1/L2, Gaussian noise
- Advanced optimizers: AdamW, Nadam, Lookahead
- Learning rate warmup and cyclical schedules
- Feature selection/dimensionality reduction (PCA)
- Ensemble stacking (combine multiple models)
- Data augmentation (if feasible for tabular data)
- Model uncertainty estimation (MC Dropout, quantile regression)

In [None]:
# Deeper Residual Network with Multi-Level Skip Connections and More Regularization
from tensorflow.keras.layers import GaussianNoise
from tensorflow.keras.optimizers import AdamW, Nadam

def build_deep_residual_mlp(input_dim, n_layers=6, units=128, dropout_rate=0.3, l1_reg=1e-4, l2_reg=1e-4, noise_std=0.05):
    inputs = keras.Input(shape=(input_dim,))
    x = GaussianNoise(noise_std)(inputs)
    x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))(x)
    skip = x
    for i in range(n_layers):
        x = layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(dropout_rate)(x)
        if i % 2 == 1:
            x = layers.Add()([x, skip])  # Multi-level skip connection
    outputs = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=AdamW(), loss='mse', metrics=['mae', 'mse'])
    return model

# Train deeper residual model
advanced_res_mlp = build_deep_residual_mlp(X_train_fe.shape[1], n_layers=6, units=256, dropout_rate=0.3, l1_reg=1e-4, l2_reg=1e-3, noise_std=0.05)
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)

history_adv_res = advanced_res_mlp.fit(
    X_train_fe, y_train,
    validation_data=(X_test_fe, y_test),
    epochs=50,
    batch_size=64,
    callbacks=[lr_scheduler],
    verbose=1
)

# Evaluate
adv_res_pred = advanced_res_mlp.predict(X_test_fe).flatten()
mae_adv_res = mean_absolute_error(y_test, adv_res_pred)
rmse_adv_res = np.sqrt(mean_squared_error(y_test, adv_res_pred))
r2_adv_res = r2_score(y_test, adv_res_pred)

print(f"\nDeeper Residual MLP Evaluation:\n  MAE: {mae_adv_res:.4f}\n  RMSE: {rmse_adv_res:.4f}\n  R²: {r2_adv_res:.4f}")
advanced_res_mlp.save(os.path.join(MODEL_DIR, "mlp_large_deep_residual_tuned.h5"))
plot_training(history_adv_res, "Deep Residual MLP", "training_deep_residual_mlp.png")

Epoch 1/50
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - loss: 99.8228 - mae: 6.0948 - mse: 95.7783 - val_loss: 95.8885 - val_mae: 5.9922 - val_mse: 92.3501 - learning_rate: 0.0010
Epoch 2/50
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - loss: 94.7495 - mae: 5.9057 - mse: 92.1526 - val_loss: 94.2381 - val_mae: 5.9298 - val_mse: 92.2835 - learning_rate: 0.0010
Epoch 3/50
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - loss: 93.3804 - mae: 5.8713 - mse: 91.7001 - val_loss: 93.5348 - val_mae: 5.8081 - val_mse: 92.0518 - learning_rate: 0.0010
Epoch 4/50
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - loss: 92.7688 - mae: 5.8654 - mse: 91.4322 - val_loss: 93.1828 - val_mae: 5.8160 - val_mse: 91.8870 - learning_rate: 0.0010
Epoch 5/50
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - loss: 92.4510 - mae: 5.8504 - mse: 91.2488 - val_




Deeper Residual MLP Evaluation:
  MAE: 5.8221
  RMSE: 9.5634
  R²: 0.4517


In [None]:
# Cyclical Learning Rate Callback
from tensorflow.keras.callbacks import Callback

class CyclicLR(Callback):
    def __init__(self, base_lr=1e-4, max_lr=1e-2, step_size=2000):
        super().__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.iterations = 0
    def on_train_batch_begin(self, batch, logs=None):
        cycle = np.floor(1 + self.iterations / (2 * self.step_size))
        x = np.abs(self.iterations / self.step_size - 2 * cycle + 1)
        lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, (1 - x))
        K.set_value(self.model.optimizer.lr, lr)
        self.iterations += 1

# Example usage:
# clr = CyclicLR(base_lr=1e-4, max_lr=1e-2, step_size=1000)
# history = model.fit(..., callbacks=[clr])


In [None]:
# Feature Selection: Principal Component Analysis (PCA)
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95, svd_solver='full')  # retain 95% variance
X_train_pca = pca.fit_transform(X_train_fe)
X_test_pca = pca.transform(X_test_fe)

print(f"PCA reduced shape: X_train_pca {X_train_pca.shape}, X_test_pca {X_test_pca.shape}")


PCA reduced shape: X_train_pca (360000, 38), X_test_pca (90000, 38)


In [None]:
# Ensemble Stacking: Combine Multiple Models
from sklearn.linear_model import LinearRegression

# Assume you have predictions from several models
# y_pred_adv, res_pred, adv_res_pred (from previous cells)

stacked_preds = np.vstack([
    y_pred_adv,
    res_pred,
    adv_res_pred
]).T

stacker = LinearRegression()
stacker.fit(stacked_preds, y_test)
ensemble_stacked = stacker.predict(stacked_preds)

mae_stacked = mean_absolute_error(y_test, ensemble_stacked)
rmse_stacked = np.sqrt(mean_squared_error(y_test, ensemble_stacked))
r2_stacked = r2_score(y_test, ensemble_stacked)

print(f"\nStacked Ensemble Evaluation:\n  MAE: {mae_stacked:.4f}\n  RMSE: {rmse_stacked:.4f}\n  R²: {r2_stacked:.4f}")


Stacked Ensemble Evaluation:
  MAE: 5.8465
  RMSE: 9.5624
  R²: 0.4518


### Advanced Deep Learning Improvements
We'll now apply:
- MC Dropout for uncertainty estimation
- Quantile regression for predictive intervals
- Mixup data augmentation for tabular data
- Learning rate warmup/scheduling

In [None]:
# MC Dropout for Uncertainty Estimation
from tensorflow.keras import Model

def mc_dropout_predict(model, X, n_iter=100):
    # Enable dropout at inference
    f = Model(model.input, model.output)
    preds = [f(X, training=True).numpy().flatten() for _ in range(n_iter)]
    preds = np.array(preds)
    mean_pred = preds.mean(axis=0)
    std_pred = preds.std(axis=0)
    return mean_pred, std_pred

# Example usage:
mean_pred, std_pred = mc_dropout_predict(advanced_res_mlp, X_test_fe, n_iter=100)
print(f"MC Dropout mean prediction: {mean_pred[:5]}")
print(f"MC Dropout std (uncertainty): {std_pred[:5]}")

MC Dropout mean prediction: [0.06772503 0.17885922 7.824484   2.438419   0.09934095]
MC Dropout std (uncertainty): [0.22354934 0.2279192  0.34892038 0.3731246  0.2561742 ]


In [None]:
# Quantile Regression for Predictive Intervals
# Use only tensorflow.keras for compatibility
from tensorflow import keras
from tensorflow.keras import layers, backend as K

# Quantile loss function for Keras
# K.maximum and K.mean are from tensorflow.keras.backend

def quantile_loss(q):
    def loss(y_true, y_pred):
        e = y_true - y_pred
        return K.mean(K.maximum(q * e, (q - 1) * e), axis=-1)
    return loss

# Build quantile model (e.g., 0.1, 0.5, 0.9)
def build_quantile_mlp(input_dim, quantile):
    inputs = keras.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(1, activation='linear')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss=quantile_loss(quantile))
    return model

# Train quantile models
q_low, q_med, q_high = 0.1, 0.5, 0.9
model_low = build_quantile_mlp(X_train_fe.shape[1], q_low)
model_med = build_quantile_mlp(X_train_fe.shape[1], q_med)
model_high = build_quantile_mlp(X_train_fe.shape[1], q_high)

model_low.fit(X_train_fe, y_train, epochs=20, batch_size=64, verbose=0)
model_med.fit(X_train_fe, y_train, epochs=20, batch_size=64, verbose=0)
model_high.fit(X_train_fe, y_train, epochs=20, batch_size=64, verbose=0)

# Predict intervals
pred_low = model_low.predict(X_test_fe).flatten()
pred_med = model_med.predict(X_test_fe).flatten()
pred_high = model_high.predict(X_test_fe).flatten()

print(f"Quantile interval example: low={pred_low[:5]}, median={pred_med[:5]}, high={pred_high[:5]}")

[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 395us/step
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 395us/step
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 391us/step
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 391us/step
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 384us/step
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 384us/step
Quantile interval example: low=[1.869812e-06 1.869812e-06 1.869812e-06 1.869812e-06 1.869812e-06], median=[-1.5293405e-04 -1.5293405e-04  4.4251113e+00 -1.5293405e-04
 -1.5293405e-04], high=[1.2684676e-03 1.2684676e-03 1.9968006e+01 9.7142715e+00 1.2684676e-03]
Quantile interval example: low=[1.869812e-06 1.869812e-06 1.869812e-06 1.869812e-06 1.869812e-06], median=[-1.5293405e-04 -1.5293405e-04  4.4251113e+00 -1.5293405e-04
 -1.5293405e-04], high=[1.2684676e-03 1.2684676e-03 1.9968006e+01 9.7142715e+00 1.2684676e-03]


In [None]:
# Mixup Data Augmentation for Tabular Data
import numpy as np

def mixup(X, y, alpha=0.2):
    '''Mixup augmentation for tabular data.'''
    n_samples = X.shape[0]
    lam = np.random.beta(alpha, alpha, n_samples)
    idx = np.random.permutation(n_samples)
    X_mix = lam[:, None] * X + (1 - lam)[:, None] * X[idx]
    y_mix = lam * y + (1 - lam) * y[idx]
    return X_mix, y_mix

# Example usage:
X_train_mix, y_train_mix = mixup(X_train_fe, y_train, alpha=0.2)
print(f"Mixup sample shapes: {X_train_mix.shape}, {y_train_mix.shape}")
# Train model on mixup data
mixup_model = build_deep_residual_mlp(X_train_mix.shape[1], n_layers=6, units=256)
history_mixup = mixup_model.fit(X_train_mix, y_train_mix, validation_data=(X_test_fe, y_test), epochs=30, batch_size=64, verbose=1)


Mixup sample shapes: (360000, 135), (360000,)
Epoch 1/30
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - loss: 85.3159 - mae: 5.7766 - mse: 82.7681 - val_loss: 95.4293 - val_mae: 6.0405 - val_mse: 93.0159
Epoch 2/30
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - loss: 85.3159 - mae: 5.7766 - mse: 82.7681 - val_loss: 95.4293 - val_mae: 6.0405 - val_mse: 93.0159
Epoch 2/30
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - loss: 81.4110 - mae: 5.5964 - mse: 79.4167 - val_loss: 97.4179 - val_mae: 6.1284 - val_mse: 95.8057
Epoch 3/30
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - loss: 81.4110 - mae: 5.5964 - mse: 79.4167 - val_loss: 97.4179 - val_mae: 6.1284 - val_mse: 95.8057
Epoch 3/30
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - loss: 80.3429 - mae: 5.5628 - mse: 78.9570 - val_loss: 94.6987 - val_mae: 6.0738 - val_mse: 93.4098

In [None]:
# Evaluate Mixup Model Performance
y_pred_mixup = mixup_model.predict(X_test_fe).flatten()
mae_mixup = mean_absolute_error(y_test, y_pred_mixup)
rmse_mixup = np.sqrt(mean_squared_error(y_test, y_pred_mixup))
r2_mixup = r2_score(y_test, y_pred_mixup)

print(f"\nMixup Model Evaluation:\n  MAE: {mae_mixup:.4f}\n  RMSE: {rmse_mixup:.4f}\n  R²: {r2_mixup:.4f}")

[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 716us/step

Mixup Model Evaluation:
  MAE: 5.9384
  RMSE: 9.6652
  R²: 0.4400
