In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/renewable-price/clean_FeatEng.csv
/kaggle/input/fe-full-dataset-smoothed-outliers/FE_full_dataset_smoothed_outliers.csv
/kaggle/input/fe-full-dataset/FE_full_dataset.csv


In [18]:
import pandas as pd

# Load the dataset
file_path = "/kaggle/input/fe-full-dataset/FE_full_dataset.csv" # "/kaggle/input/renewable-price/clean_FeatEng.csv" 
df_cleaned = pd.read_csv(file_path)

# Transformer V1. MAE: 0.61, RMSE: 6.91

# Prepare Data

In [26]:
from sklearn.preprocessing import MinMaxScaler

# Scale data
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_cleaned[features])

# Convert data into sequences
def create_sequences(data, seq_length=24):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, -1])  # Predict price
    return np.array(X), np.array(y)

SEQ_LENGTH = 24  # Use past 24 hours to predict next price
X, y = create_sequences(df_scaled, SEQ_LENGTH)

# Train-test split
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (32008, 24, 19), Test shape: (8003, 24, 19)


In [None]:
# Normalize features
scaler_x = StandardScaler()
scaler_y = StandardScaler()

df[features] = scaler_x.fit_transform(df[features])
df[target] = scaler_y.fit_transform(df[[target]])

# Create sequences for training
def create_sequences(data, target_col, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data.iloc[i : i + seq_length][features].values)
        y.append(data.iloc[i + seq_length][target_col])
    return np.array(X), np.array(y)

X, y = create_sequences(df, target, SEQ_LEN)
X_train, X_test = X[:int(0.8 * len(X))], X[int(0.8 * len(X)):]
y_train, y_test = y[:int(0.8 * len(y))], y[int(0.8 * len(y)):]

# Define Transformer

In [37]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --- Positional Encoding Function ---
def positional_encoding(sequence_length, d_model):
    """
    Computes the positional encoding for a given sequence length and model dimension.
    """
    angle_rads = np.arange(sequence_length)[:, np.newaxis] / np.power(
        10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model)
    )
    # Apply sine to even indices in the array; cosine to odd indices
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]  # shape: (1, sequence_length, d_model)
    return tf.cast(pos_encoding, dtype=tf.float32)

# --- Transformer Encoder Block ---
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """
    A single Transformer encoder block with multi-head attention and a feed-forward network.
    """
    # Multi-head self-attention layer
    x = MultiHeadAttention(num_heads=num_heads, key_dim=head_size, dropout=dropout)(inputs, inputs)
    x = Dropout(dropout)(x)
    x = LayerNormalization(epsilon=1e-6)(x + inputs)  # Add residual connection
    
    # Feed-forward network
    x_ff = Dense(ff_dim, activation="relu")(x)
    x_ff = Dropout(dropout)(x_ff)
    x_ff = Dense(inputs.shape[-1])(x_ff)
    x = LayerNormalization(epsilon=1e-6)(x + x_ff)  # Residual connection
    return x

# --- Build Transformer Model ---
def build_transformer_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0.1, mlp_dropout=0.1):
    """
    Builds a Transformer-based model for time-series forecasting.
    """
    inputs = Input(shape=input_shape)
    
    # Add positional encoding
    pos_encoding = positional_encoding(input_shape[0], input_shape[1])
    x = inputs + pos_encoding
    
    # Stacking multiple Transformer encoder blocks
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    
    # Global average pooling over the time dimension
    x = GlobalAveragePooling1D()(x)
    
    # MLP head for regression
    for units in mlp_units:
        x = Dense(units, activation="relu")(x)
        x = Dropout(mlp_dropout)(x)
    
    outputs = Dense(1)(x)
    
    model = Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mae")
    return model

# Train and Evaluate Model

In [27]:
# --- Assume X_train, y_train, X_test, y_test, SEQ_LENGTH, and scaler are defined ---
# For example, SEQ_LENGTH is the number of past hours used for prediction.
input_shape = (SEQ_LENGTH, X_train.shape[2])

# Build the model: Adjust hyperparameters as needed
transformer_model = build_transformer_model(
    input_shape=input_shape,
    head_size=64,
    num_heads=4,
    ff_dim=128,
    num_transformer_blocks=2,
    mlp_units=[64],
    dropout=0.1,
    mlp_dropout=0.1
)

transformer_model.summary()

# --- Train the Transformer Model ---
history = transformer_model.fit(
    X_train, y_train, 
    validation_data=(X_test, y_test),
    epochs=50, 
    batch_size=32, 
    verbose=1
)

# --- Evaluate the Model ---
y_pred = transformer_model.predict(X_test)

# Reverse scaling for evaluation:
# Here we assume that the scaler was fit on your features such that the target (price) is in the last column.
# We concatenate the predictions with the other features from the last time step to reverse the scaling.
y_pred_rescaled = scaler.inverse_transform(
    np.hstack((X_test[:, -1, :-1], y_pred.reshape(-1, 1)))
)[:, -1]
y_test_rescaled = scaler.inverse_transform(
    np.hstack((X_test[:, -1, :-1], y_test.reshape(-1, 1)))
)[:, -1]

mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
rmse = np.sqrt(mean_squared_error(y_test_rescaled, y_pred_rescaled))

print(f"Transformer Model - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

Epoch 1/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - loss: 0.0375 - val_loss: 0.0043
Epoch 2/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.0039 - val_loss: 0.0022
Epoch 3/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.0023 - val_loss: 0.0020
Epoch 4/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0020 - val_loss: 0.0019
Epoch 5/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0018 - val_loss: 0.0018
Epoch 6/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0018 - val_loss: 0.0022
Epoch 7/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0018 - val_loss: 0.0022
Epoch 8/50
[1m1001/1001[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0017 - val_loss: 0.0019
Epoch 9/50
[1m1001/10


* MAE measures the average magnitude of errors in a set of predictions, without considering their direction (positive or negative). It treats all errors equally.
* RMSE gives a higher weight to larger errors because it squares the residuals (errors) before averaging. This means that large errors will disproportionately increase the RMSE.

**Reason for a Low MAE and High RMSE**:
- Outliers or Large Errors: If there are a few predictions where the model's error is much larger than the rest, these outliers will increase the RMSE significantly, but the MAE will not be affected as much because it doesn't square the errors.

| Model                                            | Mae   | Rmse  |
| :----------------------------------------------- | :---- | :---- |
| Historical Average                               | 30.37 | 40.85 |
| Initial Linear Regression                        | 17.7  | 23.81 |
| Improved Linear Regression (Feature Engineering) | 16.24 | 21.42 |
| XGBoost (Default Settings)                       | 15.12 | 20.06 |
| Tuned XGBoost (Hyperparameter Search)            | 14.86 | 19.73 |
| Stacked Model (XGBoost + LightGBM + Ridge)       | 15.33 | 20.39 |
| lightGBM (tuned)                                 | 15.93 | 20.81 |
| Stacked Model with tuned LGBM                    | 15.47 | 20.41 |
| LSTMs                                            | 11.74 | 15.19 |
| CNN-LSTM                                         | 12.27 | 17.58 |
| BiLSTM                                           | 9.82  | 14.61 |
| Transformer Model                                | 10.5  | 16.87 |
| **Improved Transformer (Feature Engineering)**   | **0.61**  | **6.91**  |


In [40]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error


# ✅ Load and preprocess dataset
df = pd.read_csv("/kaggle/input/fe-full-dataset-smoothed-outliers/FE_full_dataset_smoothed_outliers.csv", parse_dates=["contract-delivery"])
df = df.sort_values("contract-delivery")

# Feature selection (using previous engineered features)
features = ['demand-forecast', 'temperature-normal',
       'temperature-forecast', 'solar-forecast',
       'wind-forecast', 'hour', 'day_of_week',
       'month', 'is_weekend', 'hour_sin', 'hour_cos', 'day_of_week_sin',
       'day_of_week_cos', 'month_sin', 'month_cos', 'wind_volatility',
       'solar_volatility', 'wind_solar_interaction',
       'demand_to_renewable_ratio']
target = "day-ahead-auction-price"

In [41]:
from sklearn.preprocessing import MinMaxScaler

# Scale data
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_cleaned[features])

# Convert data into sequences
def create_sequences(data, seq_length=24):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, -1])  # Predict price
    return np.array(X), np.array(y)

SEQ_LENGTH = 24  # Use past 24 hours to predict next price
X, y = create_sequences(df_scaled, SEQ_LENGTH)

# Train-test split
split = int(len(X) * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (34655, 24, 19), Test shape: (8664, 24, 19)


In [42]:
# --- Assume X_train, y_train, X_test, y_test, SEQ_LENGTH, and scaler are defined ---
# For example, SEQ_LENGTH is the number of past hours used for prediction.
input_shape = (SEQ_LENGTH, X_train.shape[2])

# Build the model: Adjust hyperparameters as needed
transformer_model = build_transformer_model(
    input_shape=input_shape,
    head_size=64,
    num_heads=4,
    ff_dim=128,
    num_transformer_blocks=2,
    mlp_units=[64],
    dropout=0.1,
    mlp_dropout=0.1
)

transformer_model.summary()

In [43]:
# --- Train the Transformer Model ---
history = transformer_model.fit(
    X_train, y_train, 
    validation_data=(X_test, y_test),
    epochs=50, 
    batch_size=32, 
    verbose=1
)

# --- Evaluate the Model ---
y_pred = transformer_model.predict(X_test)

# Reverse scaling for evaluation:
# Here we assume that the scaler was fit on your features such that the target (price) is in the last column.
# We concatenate the predictions with the other features from the last time step to reverse the scaling.
y_pred_rescaled = scaler.inverse_transform(
    np.hstack((X_test[:, -1, :-1], y_pred.reshape(-1, 1)))
)[:, -1]
y_test_rescaled = scaler.inverse_transform(
    np.hstack((X_test[:, -1, :-1], y_test.reshape(-1, 1)))
)[:, -1]

mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
rmse = np.sqrt(mean_squared_error(y_test_rescaled, y_pred_rescaled))

print(f"Transformer Model - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

Epoch 1/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - loss: 0.0242 - val_loss: 0.0051
Epoch 2/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0047 - val_loss: 0.0049
Epoch 3/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0047 - val_loss: 0.0049
Epoch 4/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0046 - val_loss: 0.0050
Epoch 5/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0063 - val_loss: 0.0052
Epoch 6/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0047 - val_loss: 0.0051
Epoch 7/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0047 - val_loss: 0.0050
Epoch 8/50
[1m1083/1083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0047 - val_loss: 0.0050
Epoch 9/50
[1m1083/108