In [1]:
import pandas as pd

# Load the dataset
file_path = "../data/clean_FeatEng.csv"
df_cleaned = pd.read_csv(file_path)

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Select relevant features
features = ["demand-forecast", "wind-forecast", "solar-forecast", "temperature-forecast", "day-ahead-auction-price"]
target = "day-ahead-auction-price"

# Scale data
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_cleaned[features])

# Convert data into sequences
def create_sequences(data, seq_length=24):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, -1])  # Predict price
    return np.array(X), np.array(y)

SEQ_LENGTH = 24  # Use past 24 hours to predict next price
X, y = create_sequences(df_scaled, SEQ_LENGTH)

# Train-test split
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


2025-03-13 23:37:19.049719: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Train shape: (31817, 24, 5), Test shape: (7955, 24, 5)


# Transformer V0

    A simpler approach based on self-attention.
    Can replace LSTMs and Bidirectional LSTMs.

In [3]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --- Positional Encoding Function ---
def positional_encoding(sequence_length, d_model):
    """
    Computes the positional encoding for a given sequence length and model dimension.
    """
    angle_rads = np.arange(sequence_length)[:, np.newaxis] / np.power(
        10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model)
    )
    # Apply sine to even indices in the array; cosine to odd indices
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]  # shape: (1, sequence_length, d_model)
    return tf.cast(pos_encoding, dtype=tf.float32)

# --- Transformer Encoder Block ---
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """
    A single Transformer encoder block with multi-head attention and a feed-forward network.
    """
    # Multi-head self-attention layer
    x = MultiHeadAttention(num_heads=num_heads, key_dim=head_size, dropout=dropout)(inputs, inputs)
    x = Dropout(dropout)(x)
    x = LayerNormalization(epsilon=1e-6)(x + inputs)  # Add residual connection
    
    # Feed-forward network
    x_ff = Dense(ff_dim, activation="relu")(x)
    x_ff = Dropout(dropout)(x_ff)
    x_ff = Dense(inputs.shape[-1])(x_ff)
    x = LayerNormalization(epsilon=1e-6)(x + x_ff)  # Residual connection
    return x

# --- Build Transformer Model ---
def build_transformer_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0.1, mlp_dropout=0.1):
    """
    Builds a Transformer-based model for time-series forecasting.
    """
    inputs = Input(shape=input_shape)
    
    # Add positional encoding
    pos_encoding = positional_encoding(input_shape[0], input_shape[1])
    x = inputs + pos_encoding
    
    # Stacking multiple Transformer encoder blocks
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    
    # Global average pooling over the time dimension
    x = GlobalAveragePooling1D()(x)
    
    # MLP head for regression
    for units in mlp_units:
        x = Dense(units, activation="relu")(x)
        x = Dropout(mlp_dropout)(x)
    
    outputs = Dense(1)(x)
    
    model = Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mae")
    return model

# --- Assume X_train, y_train, X_test, y_test, SEQ_LENGTH, and scaler are defined ---
# For example, SEQ_LENGTH is the number of past hours used for prediction.
input_shape = (SEQ_LENGTH, X_train.shape[2])

# Build the model: Adjust hyperparameters as needed
transformer_model = build_transformer_model(
    input_shape=input_shape,
    head_size=64,
    num_heads=4,
    ff_dim=128,
    num_transformer_blocks=2,
    mlp_units=[64],
    dropout=0.1,
    mlp_dropout=0.1
)

transformer_model.summary()

# --- Train the Transformer Model ---
history = transformer_model.fit(
    X_train, y_train, 
    validation_data=(X_test, y_test),
    epochs=50, 
    batch_size=32, 
    verbose=1
)

# --- Evaluate the Model ---
y_pred = transformer_model.predict(X_test)

# Reverse scaling for evaluation:
# Here we assume that the scaler was fit on your features such that the target (price) is in the last column.
# We concatenate the predictions with the other features from the last time step to reverse the scaling.
y_pred_rescaled = scaler.inverse_transform(
    np.hstack((X_test[:, -1, :-1], y_pred.reshape(-1, 1)))
)[:, -1]
y_test_rescaled = scaler.inverse_transform(
    np.hstack((X_test[:, -1, :-1], y_test.reshape(-1, 1)))
)[:, -1]

mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
rmse = np.sqrt(mean_squared_error(y_test_rescaled, y_pred_rescaled))

print(f"Transformer Model - MAE: {mae:.2f}, RMSE: {rmse:.2f}")


Epoch 1/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - loss: 0.0975 - val_loss: 0.0614
Epoch 2/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 0.0527 - val_loss: 0.0555
Epoch 3/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 0.0408 - val_loss: 0.0348
Epoch 4/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - loss: 0.0364 - val_loss: 0.0338
Epoch 5/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - loss: 0.0332 - val_loss: 0.0320
Epoch 6/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 0.0317 - val_loss: 0.0367
Epoch 7/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - loss: 0.0310 - val_loss: 0.0311
Epoch 8/50
[1m995/995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - loss: 0.0300 - val_loss: 0.0338
Epoch 9/50
[1m995/995[

In [4]:
print(f"Transformer Model - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

Transformer Model - MAE: 10.50, RMSE: 16.87


| Model                                            | MAE | RMSE |
| :----------------------------------------------- | :------------------ | :------------------- |
| Historical Average                               | 30.37               | 40.85                |
| Initial Linear Regression                        | 17.7                | 23.81                |
| Improved Linear Regression (Feature Engineering) | 16.24               | 21.42                |
| XGBoost (Default Settings)                       | 15.12               | 20.06                |
| Tuned XGBoost (Hyperparameter Search)            | 14.86               | 19.73                |
| Stacked Model (XGBoost + LightGBM + Ridge)       | 15.33               | 20.39                |
| lightGBM (tuned)                                 | 15.93               | 20.81                |
| Stacked Model with tuned LGBM                    | 15.47               | 20.41                |
| LSTMs                                            | 11.74               | 15.19                |
| CNN-LSTM                                         | 12.27               | 17.58                |
| **BiLSTM**                                       | **9.82**            | **14.61**            |
| Transformer Model                                | 10.5                | 16.87                |