In [17]:
import sys
!{sys.executable} -m pip install keras tensorflow --upgrade
!{sys.executable} -m pip install keras_tuner

import os
import joblib
import numpy as np
import tensorflow as tf
import keras_tuner as kt
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Collecting keras_tuner
  Using cached keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras_tuner)
  Using cached kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Using cached keras_tuner-1.4.7-py3-none-any.whl (129 kB)
Using cached kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [5]:
DATA_PATH = "../data/raw/training_data.npz"
SCALER_DIR = "../data/processed/scalers"
MODEL_DIR = "../src/models"
FIGURE_DIR = "../src/visualization/plots"
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(FIGURE_DIR, exist_ok=True)

In [6]:
def load_dataset(path=DATA_PATH):
    """Load dataset from NPZ file."""
    data = np.load(path)
    X, y = data["X"], data["y"]
    print(f"Loaded dataset: X shape {X.shape}, y shape {y.shape}")
    return X, y

X, y = load_dataset()

Loaded dataset: X shape (450000, 15), y shape (450000,)


In [7]:
def preprocess_data(X, y, test_size=0.2, random_state=42):
    """
    Splits into train/test and normalizes features.
    Prefix prices + numeric features are scaled.
    opt_flag is kept as-is (categorical 0/1).
    """
    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Separate opt_flag (last column)
    X_train_prefix = X_train[:, :-1]
    X_test_prefix = X_test[:, :-1]

    opt_flag_train = X_train[:, -1].reshape(-1, 1)
    opt_flag_test = X_test[:, -1].reshape(-1, 1)

    # Scale everything except opt_flag
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_prefix)
    X_test_scaled = scaler.transform(X_test_prefix)

    # Reattach opt_flag
    X_train_final = np.hstack([X_train_scaled, opt_flag_train])
    X_test_final = np.hstack([X_test_scaled, opt_flag_test])

    # Save scaler for later inference
    os.makedirs(SCALER_DIR, exist_ok=True)
    joblib.dump(scaler, os.path.join(SCALER_DIR, "feature_scaler.pkl"))

    print(f"Train set: {X_train_final.shape}, Test set: {X_test_final.shape}")
    return X_train_final, X_test_final, y_train, y_test

X_train, X_test, y_train, y_test = preprocess_data(X, y)

Train set: (360000, 15), Test set: (90000, 15)


In [9]:
input_dim = X_train.shape[1]
input_dim

15

In [13]:
def plot_training(history, title, filename):
    """Save training curves for loss + MAE + RMSE."""
    plt.figure(figsize=(15, 4))

    # Loss plot
    plt.subplot(1, 3, 1)
    plt.plot(history.history["loss"], label="Train Loss")
    plt.plot(history.history["val_loss"], label="Val Loss")
    plt.title(f"{title} - Loss")
    plt.xlabel("Epoch")
    plt.ylabel("MSE Loss")
    plt.legend()

    # MAE plot
    plt.subplot(1, 3, 2)
    plt.plot(history.history["mae"], label="Train MAE")
    plt.plot(history.history["val_mae"], label="Val MAE")
    plt.title(f"{title} - MAE")
    plt.xlabel("Epoch")
    plt.ylabel("Mean Absolute Error")
    plt.legend()

    # RMSE plot (computed from loss)
    train_rmse = np.sqrt(history.history["loss"])
    val_rmse = np.sqrt(history.history["val_loss"])
    plt.subplot(1, 3, 3)
    plt.plot(train_rmse, label="Train RMSE")
    plt.plot(val_rmse, label="Val RMSE")
    plt.title(f"{title} - RMSE")
    plt.xlabel("Epoch")
    plt.ylabel("Root MSE")
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURE_DIR, filename))
    plt.close()


In [14]:
def build_quick_mlp(input_dim):
    """Small, quick baseline MLP."""
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="linear")
    ])
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    return model


def evaluate_model(model, X_test, y_test):
    """Compute MAE and RMSE on test set."""
    y_pred = model.predict(X_test)
    mae = np.mean(np.abs(y_test - y_pred.flatten()))
    rmse = np.sqrt(np.mean((y_test - y_pred.flatten()) ** 2))
    print(f"🎯 Evaluation on Test Set -> MAE: {mae:.4f}, RMSE: {rmse:.4f}")
    return mae, rmse

quick_mlp = build_quick_mlp(input_dim)
history_quick = quick_mlp.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1
)
quick_mlp.save(os.path.join(MODEL_DIR, "mlp_quick.h5"))
plot_training(history_quick, "Quick MLP", "training_quick_run_2.png")
evaluate_model(quick_mlp, X_test, y_test)

Epoch 1/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 761us/step - loss: 95.4658 - mae: 6.0945 - val_loss: 91.7508 - val_mae: 5.9288
Epoch 2/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 742us/step - loss: 91.3731 - mae: 5.8502 - val_loss: 91.5185 - val_mae: 5.9363
Epoch 3/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 735us/step - loss: 91.1271 - mae: 5.8367 - val_loss: 91.7755 - val_mae: 5.9679
Epoch 4/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 760us/step - loss: 91.0568 - mae: 5.8293 - val_loss: 92.8673 - val_mae: 6.0017
Epoch 5/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 740us/step - loss: 91.0015 - mae: 5.8252 - val_loss: 91.1833 - val_mae: 5.8462
Epoch 6/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 740us/step - loss: 90.8985 - mae: 5.8194 - val_loss: 92.1096 - val_mae: 5.9459
Epoch 7/10
[1m5625/5625[0m [32m━━━━━━━━━━━━━━━━━━



[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 369us/step
🎯 Evaluation on Test Set -> MAE: 5.8318, RMSE: 9.5534


(np.float64(5.831829638353275), np.float64(9.553417440466928))

In [19]:
def build_large_mlp(hp, input_dim):
    """Hyperparameter-tunable large MLP."""
    model = keras.Sequential()
    model.add(keras.Input(shape=(input_dim,)))

    # Number of layers
    for i in range(hp.Int("num_layers", 2, 6)):
        units = hp.Int(f"units_{i}", 32, 512, step=32)
        activation = hp.Choice("activation", ["relu", "tanh"])
        model.add(keras.layers.Dense(units, activation=activation))

    optimizer_choice = hp.Choice("optimizer", ["adam", "sgd"])
    lr = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    
    if optimizer_choice == "adam":
        optimizer = keras.optimizers.Adam(learning_rate=lr)
    else:
        optimizer = keras.optimizers.SGD(learning_rate=lr)

    model.add(keras.layers.Dense(1, activation="linear"))
    model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
    return model


In [20]:
# Tuner setup
tuner = kt.Hyperband(
    lambda hp: build_large_mlp(hp, input_dim=X_train.shape[1]),
    objective="val_mae",
    max_epochs=30,
    factor=3,
    directory="tuner_dir",
    project_name="large_mlp"
)

stop_early = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)

# Run the hyperparameter search
tuner.search(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,                # will be tuned by Hyperband
    batch_size=kt.HyperParameters().Int("batch_size", min_value=32, max_value=256, step=32),
    callbacks=[stop_early],
    verbose=1
)

# Best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("\nBest Hyperparameters found:")
for param, value in best_hps.values.items():
    print(f"  {param}: {value}")

# Build best model
model = tuner.hypermodel.build(best_hps)

Trial 84 Complete [00h 01m 45s]
val_mae: 5.781039714813232

Best val_mae So Far: 5.71572732925415
Total elapsed time: 01h 21m 49s

✅ Best Hyperparameters found:
  num_layers: 6
  units_0: 352
  activation: relu
  units_1: 224
  optimizer: adam
  lr: 0.0004816603466007292
  units_2: 448
  units_3: 192
  units_4: 224
  units_5: 32
  tuner/epochs: 10
  tuner/initial_epoch: 0
  tuner/bracket: 1
  tuner/round: 0


In [21]:
# Train final model with best params
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=best_hps.get("max_epochs", 50),
    batch_size=best_hps.get("batch_size", 64),
    verbose=1
)

# Evaluate final model
y_pred = model.predict(X_test)
rmse = np.sqrt(np.mean((y_test - y_pred.flatten())**2))
mae = np.mean(np.abs(y_test - y_pred.flatten()))
print(f"\n🎯 Final Model Evaluation:\n  MAE: {mae:.4f}\n  RMSE: {rmse:.4f}")

model.save(os.path.join(MODEL_DIR, "mlp_large_tuned.h5"))
plot_training(history, "Large MLP (Tuned)", "training_large_tuned.png")

TypeError: HyperParameters.get() takes 2 positional arguments but 3 were given