# Forecasting Electricity Prices in Poland (RCE)

## Import Libraries

In [41]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

#pd.set_option("display.max_columns", 200)
#pd.set_option("display.width", 120)


## Loading Data

In [5]:
# Step 1: use RAW urls 
train_urls = [
    "https://raw.githubusercontent.com/anlan-chen/ML2-Final-Project/main/Data/Data/Training/GUI_ENERGY_PRICES_201912312300-202012312300.csv",
    "https://raw.githubusercontent.com/anlan-chen/ML2-Final-Project/main/Data/Data/Training/GUI_ENERGY_PRICES_202012312300-202112312300.csv",
    "https://raw.githubusercontent.com/anlan-chen/ML2-Final-Project/main/Data/Data/Training/GUI_ENERGY_PRICES_202112312300-202212312300.csv",
    "https://raw.githubusercontent.com/anlan-chen/ML2-Final-Project/main/Data/Data/Training/GUI_ENERGY_PRICES_202312312300-202412312300.csv",
]

test_urls = [
    "https://raw.githubusercontent.com/anlan-chen/ML2-Final-Project/main/Data/Data/Testing/GUI_ENERGY_PRICES_202412312300-202512312300.csv",
]

# Step 2: load training data
train_list = []
for url in train_urls:
    df = pd.read_csv(url)
    df["source_url"] = url
    train_list.append(df)

train_df = pd.concat(train_list, ignore_index=True)

print("train_df shape:", train_df.shape)
print("train_df columns:", train_df.columns.tolist())
display(train_df.head())

# Step 3: load testing data
test_list = []
for url in test_urls:
    df = pd.read_csv(url)
    df["source_url"] = url
    test_list.append(df)

test_df = pd.concat(test_list, ignore_index=True)

print("test_df shape:", test_df.shape)
print("test_df columns:", test_df.columns.tolist())
display(test_df.head())


train_df shape: (35088, 7)
train_df columns: ['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)', 'Intraday Period (CET/CEST)', 'Intraday Price (EUR/MWh)', 'source_url']


Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh),Intraday Period (CET/CEST),Intraday Price (EUR/MWh),source_url
0,01/01/2020 00:00:00 - 01/01/2020 01:00:00,BZN|PL,Without Sequence,34.95,,,https://raw.githubusercontent.com/anlan-chen/M...
1,01/01/2020 01:00:00 - 01/01/2020 02:00:00,BZN|PL,Without Sequence,34.67,,,https://raw.githubusercontent.com/anlan-chen/M...
2,01/01/2020 02:00:00 - 01/01/2020 03:00:00,BZN|PL,Without Sequence,33.3,,,https://raw.githubusercontent.com/anlan-chen/M...
3,01/01/2020 03:00:00 - 01/01/2020 04:00:00,BZN|PL,Without Sequence,31.91,,,https://raw.githubusercontent.com/anlan-chen/M...
4,01/01/2020 04:00:00 - 01/01/2020 05:00:00,BZN|PL,Without Sequence,27.54,,,https://raw.githubusercontent.com/anlan-chen/M...


test_df shape: (35040, 7)
test_df columns: ['MTU (CET/CEST)', 'Area', 'Sequence', 'Day-ahead Price (EUR/MWh)', 'Intraday Period (CET/CEST)', 'Intraday Price (EUR/MWh)', 'source_url']


Unnamed: 0,MTU (CET/CEST),Area,Sequence,Day-ahead Price (EUR/MWh),Intraday Period (CET/CEST),Intraday Price (EUR/MWh),source_url
0,01/01/2025 00:00:00 - 01/01/2025 00:15:00,BZN|PL,Without Sequence,15.41,,,https://raw.githubusercontent.com/anlan-chen/M...
1,01/01/2025 00:15:00 - 01/01/2025 00:30:00,BZN|PL,Without Sequence,15.41,,,https://raw.githubusercontent.com/anlan-chen/M...
2,01/01/2025 00:30:00 - 01/01/2025 00:45:00,BZN|PL,Without Sequence,15.41,,,https://raw.githubusercontent.com/anlan-chen/M...
3,01/01/2025 00:45:00 - 01/01/2025 01:00:00,BZN|PL,Without Sequence,15.41,,,https://raw.githubusercontent.com/anlan-chen/M...
4,01/01/2025 01:00:00 - 01/01/2025 01:15:00,BZN|PL,Without Sequence,2.19,,,https://raw.githubusercontent.com/anlan-chen/M...


In [6]:
# Step 2: extract start_time and end_time from MTU
def extract_times_from_mtu(mtu):
    # Example: "01/01/2020 00:00:00 - 01/01/2020 01:00:00"
    left, right = mtu.split(" - ")
    start = pd.to_datetime(left, dayfirst=True, errors="coerce")
    end = pd.to_datetime(right, dayfirst=True, errors="coerce")
    return start, end

# training
train_times = train_df["MTU (CET/CEST)"].astype(str).apply(extract_times_from_mtu)
train_df["start_time"] = train_times.apply(lambda x: x[0])
train_df["end_time"]   = train_times.apply(lambda x: x[1])

# testing
test_times = test_df["MTU (CET/CEST)"].astype(str).apply(extract_times_from_mtu)
test_df["start_time"] = test_times.apply(lambda x: x[0])
test_df["end_time"]   = test_times.apply(lambda x: x[1])

# Step 3: keep only needed columns for now
y_col = "Day-ahead Price (EUR/MWh)"

train_clean = train_df[["start_time", "end_time", "Area", "Sequence", y_col]].copy()
test_clean  = test_df[["start_time", "end_time", "Area", "Sequence", y_col]].copy()

# Step 4: basic cleaning
train_clean = train_clean.dropna(subset=["start_time", y_col]).sort_values("start_time")
test_clean  = test_clean.dropna(subset=["start_time", y_col]).sort_values("start_time")

train_clean[y_col] = pd.to_numeric(train_clean[y_col], errors="coerce")
test_clean[y_col]  = pd.to_numeric(test_clean[y_col], errors="coerce")

print("train_clean shape:", train_clean.shape)
print("test_clean shape:", test_clean.shape)

print("train time range:", train_clean["start_time"].min(), "-", train_clean["start_time"].max())
print("test  time range:", test_clean["start_time"].min(), "-", test_clean["start_time"].max())


train_clean shape: (35072, 5)
test_clean shape: (35030, 5)
train time range: 2020-01-01 00:00:00 - 2024-12-31 23:00:00
test  time range: 2025-01-01 00:00:00 - 2025-12-31 23:45:00


In [7]:
# Step 5: check time step frequency
train_diff = train_clean["start_time"].diff().value_counts().head(5)
test_diff  = test_clean["start_time"].diff().value_counts().head(5)

print("Most common train time diffs:")
print(train_diff)

print("\nMost common test time diffs:")
print(test_diff)


Most common train time diffs:
start_time
0 days 01:00:00      35062
0 days 04:00:00          4
0 days 02:00:00          4
365 days 01:00:00        1
Name: count, dtype: int64

Most common test time diffs:
start_time
0 days 00:15:00    35027
0 days 01:45:00        1
0 days 01:15:00        1
Name: count, dtype: int64


In [9]:
# Step 6: set index and resample to 15 minutes
# For training: hourly - 15min by forward fill
train_15m = (
    train_clean.set_index("start_time")
              .sort_index()
              .resample("15min")
              .ffill()
              .reset_index()
)

# For testing: already 15min, but we enforce consistent grid
test_15m = (
    test_clean.set_index("start_time")
             .sort_index()
             .resample("15min")
             .asfreq()   # Because the Day-ahead Price in the test set represents the true observed values used for evaluation, 
                         # it should not be filled·. Otherwise, the evaluation results would be distorted. 
             .reset_index()
)

print("train_15m shape:", train_15m.shape)
print("test_15m shape:", test_15m.shape)

display(train_15m.head())
display(test_15m.head())


train_15m shape: (175389, 5)
test_15m shape: (35040, 5)


Unnamed: 0,start_time,end_time,Area,Sequence,Day-ahead Price (EUR/MWh)
0,2020-01-01 00:00:00,2020-01-01 01:00:00,BZN|PL,Without Sequence,34.95
1,2020-01-01 00:15:00,2020-01-01 01:00:00,BZN|PL,Without Sequence,34.95
2,2020-01-01 00:30:00,2020-01-01 01:00:00,BZN|PL,Without Sequence,34.95
3,2020-01-01 00:45:00,2020-01-01 01:00:00,BZN|PL,Without Sequence,34.95
4,2020-01-01 01:00:00,2020-01-01 02:00:00,BZN|PL,Without Sequence,34.67


Unnamed: 0,start_time,end_time,Area,Sequence,Day-ahead Price (EUR/MWh)
0,2025-01-01 00:00:00,2025-01-01 00:15:00,BZN|PL,Without Sequence,15.41
1,2025-01-01 00:15:00,2025-01-01 00:30:00,BZN|PL,Without Sequence,15.41
2,2025-01-01 00:30:00,2025-01-01 00:45:00,BZN|PL,Without Sequence,15.41
3,2025-01-01 00:45:00,2025-01-01 01:00:00,BZN|PL,Without Sequence,15.41
4,2025-01-01 01:00:00,2025-01-01 01:15:00,BZN|PL,Without Sequence,2.19


In [10]:
# Step 5.1: define target and copy data
target_col = "Day-ahead Price (EUR/MWh)"

train_fe = train_15m.copy()
test_fe  = test_15m.copy()


In [11]:
# Step 5.2: lag features (15min resolution)
lag_list = [1, 2, 4, 96, 672] # 15min、30min、1h、1 day、1 week

for lag in lag_list:
    train_fe[f"lag_{lag}"] = train_fe[target_col].shift(lag)
    test_fe[f"lag_{lag}"]  = test_fe[target_col].shift(lag)


In [12]:
# Step 5.3: rolling statistics
rolling_windows = [96, 672]

for w in rolling_windows:
    train_fe[f"roll_mean_{w}"] = train_fe[target_col].shift(1).rolling(w).mean()
    train_fe[f"roll_std_{w}"]  = train_fe[target_col].shift(1).rolling(w).std()

    test_fe[f"roll_mean_{w}"] = test_fe[target_col].shift(1).rolling(w).mean()
    test_fe[f"roll_std_{w}"]  = test_fe[target_col].shift(1).rolling(w).std()


In [13]:
# Step 5.4: calendar features
for df in [train_fe, test_fe]:
    df["hour"] = df["start_time"].dt.hour
    df["minute"] = df["start_time"].dt.minute
    df["dayofweek"] = df["start_time"].dt.dayofweek
    df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)


In [14]:
# Step 5.5: drop missing values caused by lags/rolling
train_fe = train_fe.dropna().reset_index(drop=True)
test_fe  = test_fe.dropna().reset_index(drop=True)

print("train_fe shape:", train_fe.shape)
print("test_fe shape:", test_fe.shape)


train_fe shape: (174621, 18)
test_fe shape: (33012, 18)


## Step 6：Prepare X / y and Baseline Model

To provide a reference point for model performance, a naive baseline model is constructed. The baseline forecast uses the electricity price observed one hour earlier as the prediction for the current time step. This approach reflects a simple persistence assumption, where short-term electricity prices are expected to exhibit limited changes.

In [15]:
# Step 6.1: define feature columns
feature_cols = [
    "lag_1",
    "lag_4",
    "lag_96",
    "lag_672",
    "roll_mean_96",
    "roll_std_96",
    "roll_mean_672",
    "roll_std_672",
    "hour",
    "minute",
    "dayofweek",
    "is_weekend"
]

target_col = "Day-ahead Price (EUR/MWh)"

print("Number of features:", len(feature_cols))
print(feature_cols)


Number of features: 12
['lag_1', 'lag_4', 'lag_96', 'lag_672', 'roll_mean_96', 'roll_std_96', 'roll_mean_672', 'roll_std_672', 'hour', 'minute', 'dayofweek', 'is_weekend']


In [16]:
# Step 6.2: split into X and y
X_train = train_fe[feature_cols]
y_train = train_fe[target_col]

X_test = test_fe[feature_cols]
y_test = test_fe[target_col]

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (174621, 12)
X_test shape: (33012, 12)


In [18]:
# Step 6.3: naive baseline using lag_4 (1 hour)
y_pred_naive = X_test["lag_4"]

mae_naive = mean_absolute_error(y_test, y_pred_naive)
rmse_naive = mean_squared_error(y_test, y_pred_naive, squared=False)

print("Naive baseline results:")
print("MAE:", mae_naive)
print("RMSE:", rmse_naive)


Naive baseline results:
MAE: 16.298627468799225
RMSE: 27.52143615423511




For the naive baseline, the MAE is approximately 16.3 EUR/MWh and the RMSE is approximately 27.5 EUR/MWh. These results indicate that while short-term price persistence exists, the baseline struggles to capture sharp price fluctuations and peak events. Consequently, this benchmark provides a meaningful reference against which the performance of Gradient Boosting models can be assessed.

## Step 7：LightGBM

In [20]:
# Step 7.1: prepare LightGBM datasets
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test  = lgb.Dataset(X_test, y_test)


In [21]:
# Step 7.2: LightGBM parameters (initial)
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": ["l1", "l2"],
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}


In [22]:
# Step 7.3: train LightGBM
gbm = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_train],
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)


Training until validation scores don't improve for 50 rounds


In [23]:
# Step 7.4: predict and evaluate
y_pred_lgb = gbm.predict(X_test, num_iteration=gbm.best_iteration)

mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
rmse_lgb = mean_squared_error(y_test, y_pred_lgb, squared=False)

print("LightGBM results:")
print("MAE:", mae_lgb)
print("RMSE:", rmse_lgb)


LightGBM results:
MAE: 5.587601668186966
RMSE: 12.822283770553215




Compared to the naive one-hour persistence benchmark, LightGBM achieves a substantial reduction in both MAE and RMSE, indicating that the model captures temporal dependencies and seasonality beyond simple price persistence.

## Step 8：CatBoost (compare)

In [26]:
# Step 8.1: train CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    loss_function="RMSE",
    verbose=False
)

cat_model.fit(X_train, y_train)


<catboost.core.CatBoostRegressor at 0x23d2534c4a0>

In [27]:
# Step 8.2: predict and evaluate
y_pred_cat = cat_model.predict(X_test)

mae_cat = mean_absolute_error(y_test, y_pred_cat)
rmse_cat = mean_squared_error(y_test, y_pred_cat, squared=False)

print("CatBoost results:")
print("MAE:", mae_cat)
print("RMSE:", rmse_cat)


CatBoost results:
MAE: 5.4589376937031195
RMSE: 12.617467671823402




Both LightGBM and CatBoost significantly outperform the naive benchmark. CatBoost achieves an MAE of 5.46 EUR/MWh and an RMSE of 12.62 EUR/MWh, which is slightly better than LightGBM. The close performance of the two models indicates that Gradient Boosting methods are well suited for short-term electricity price forecasting.

## Step 9：Hyperparameter Tuning

In [28]:
# Step 9.1: create validation split from training data (last 20%)
split_idx = int(len(X_train) * 0.8)

X_tr = X_train.iloc[:split_idx]
y_tr = y_train.iloc[:split_idx]

X_val = X_train.iloc[split_idx:]
y_val = y_train.iloc[split_idx:]

print("Train part:", X_tr.shape)
print("Validation part:", X_val.shape)


Train part: (139696, 12)
Validation part: (34925, 12)


In [39]:
# Step 9.2: LightGBM tuning
lgb_train_tune = lgb.Dataset(X_tr, y_tr)
lgb_valid_tune = lgb.Dataset(X_val, y_val, reference=lgb_train_tune)

tuned_params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": ["l1", "l2"],
    "learning_rate": 0.05,
    "num_leaves": 63,
    "min_child_samples": 20,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

gbm_tuned = lgb.train(
    tuned_params,
    lgb_train_tune,
    num_boost_round=4000,
    valid_sets=[lgb_valid_tune],
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[300]	valid_0's l1: 4.40456	valid_0's l2: 118.405


In [40]:
# Step 9.3: predict with tuned LightGBM
y_pred_lgb_tuned = gbm_tuned.predict(
    X_test, num_iteration=gbm_tuned.best_iteration
)

mae_lgb_tuned = mean_absolute_error(y_test, y_pred_lgb_tuned)
rmse_lgb_tuned = mean_squared_error(y_test, y_pred_lgb_tuned, squared=False)

print("Tuned LightGBM results:")
print("MAE:", mae_lgb_tuned)
print("RMSE:", rmse_lgb_tuned)


Tuned LightGBM results:
MAE: 7.085077734945378
RMSE: 15.003122197646409




Hyperparameter tuning using a time-based validation split does not lead to further improvements over the initial LightGBM model. In particular, lower learning rates result in underfitting and worse predictive performance on the test set. Therefore, the untuned LightGBM model is selected as the final model.

In [42]:
# Step 9.4: catboost tuning
def eval_cat(params, X_tr, y_tr, X_val, y_val):
    model = CatBoostRegressor(
        loss_function="RMSE",
        random_seed=42,
        verbose=False,
        **params
    )
    model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        use_best_model=True,
        early_stopping_rounds=100
    )
    pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, pred)
    rmse = mean_squared_error(y_val, pred, squared=False)
    return mae, rmse, model

In [44]:
param_grid = [
    {"iterations": 4000, "learning_rate": 0.03, "depth": 6, "l2_leaf_reg": 3},
    {"iterations": 4000, "learning_rate": 0.03, "depth": 8, "l2_leaf_reg": 3},
    {"iterations": 4000, "learning_rate": 0.05, "depth": 6, "l2_leaf_reg": 3},
    {"iterations": 4000, "learning_rate": 0.05, "depth": 8, "l2_leaf_reg": 3},
    {"iterations": 4000, "learning_rate": 0.03, "depth": 8, "l2_leaf_reg": 10},
    {"iterations": 4000, "learning_rate": 0.05, "depth": 8, "l2_leaf_reg": 10},
]

In [45]:
best = None

for i, params in enumerate(param_grid, 1):
    mae, rmse, model = eval_cat(params, X_tr, y_tr, X_val, y_val)
    print(f"[{i}] params={params} | val MAE={mae:.4f} | val RMSE={rmse:.4f}")
    if (best is None) or (rmse < best["rmse"]):
        best = {"params": params, "mae": mae, "rmse": rmse, "model": model}

print("\nBest on validation:")
print(best["params"])
print("val MAE:", best["mae"])
print("val RMSE:", best["rmse"])




[1] params={'iterations': 4000, 'learning_rate': 0.03, 'depth': 6, 'l2_leaf_reg': 3} | val MAE=4.2208 | val RMSE=10.7615




[2] params={'iterations': 4000, 'learning_rate': 0.03, 'depth': 8, 'l2_leaf_reg': 3} | val MAE=4.4107 | val RMSE=10.8843




[3] params={'iterations': 4000, 'learning_rate': 0.05, 'depth': 6, 'l2_leaf_reg': 3} | val MAE=4.2637 | val RMSE=10.8151




[4] params={'iterations': 4000, 'learning_rate': 0.05, 'depth': 8, 'l2_leaf_reg': 3} | val MAE=4.3509 | val RMSE=10.7783




[5] params={'iterations': 4000, 'learning_rate': 0.03, 'depth': 8, 'l2_leaf_reg': 10} | val MAE=4.3635 | val RMSE=10.8235
[6] params={'iterations': 4000, 'learning_rate': 0.05, 'depth': 8, 'l2_leaf_reg': 10} | val MAE=4.3695 | val RMSE=10.7792

Best on validation:
{'iterations': 4000, 'learning_rate': 0.03, 'depth': 6, 'l2_leaf_reg': 3}
val MAE: 4.220811797019566
val RMSE: 10.761537448515694




In [46]:
best_params = best["params"]

cat_best = CatBoostRegressor(
    loss_function="RMSE",
    random_seed=42,
    verbose=False,
    **best_params
)

cat_best.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),    
    use_best_model=True,
    early_stopping_rounds=100
)

y_pred_cat_best = cat_best.predict(X_test)
mae_cat_best = mean_absolute_error(y_test, y_pred_cat_best)
rmse_cat_best = mean_squared_error(y_test, y_pred_cat_best, squared=False)

print("Tuned CatBoost results (test):")
print("MAE:", mae_cat_best)
print("RMSE:", rmse_cat_best)


Tuned CatBoost results (test):
MAE: 5.438818729162264
RMSE: 12.631427617001442




Both LightGBM and CatBoost substantially outperform the naive benchmark. CatBoost tuning leads to a marginal improvement in MAE, while RMSE remains largely unchanged. Overall, the close performance across models confirms the robustness of Gradient Boosting methods for short-term electricity price forecasting.

## Step 10: 2025 Results Analysis (Peak / Trough / Underestimation)

In [67]:
# Put all predictions you have into a dict 
pred_dict = {
    "naive_lag_4": y_pred_naive.values,   
    "lgb": np.array(y_pred_lgb),
    "Cat": np.array(y_pred_cat),
}

# Compute MAE/RMSE on 2025 test set and select the best by RMSE 
score_rows = []
for name, pred in pred_dict.items():
    mae = mean_absolute_error(y_test, pred)
    rmse = mean_squared_error(y_test, pred, squared=False)
    score_rows.append([name, mae, rmse])

score_df = pd.DataFrame(score_rows, columns=["model", "MAE", "RMSE"]).sort_values(["RMSE", "MAE"])
display(score_df)

best_model = score_df.iloc[0]["model"]
print("Best model:", best_model)

y_pred_best = pred_dict[best_model]



Unnamed: 0,model,MAE,RMSE
2,Cat,5.458938,12.617468
1,lgb,5.587602,12.822284
0,naive_lag_4,16.298627,27.521436


Best model: Cat


In [68]:
# Step 10.2: build results dataframe (2025)

results_2025 = test_fe.copy()
results_2025["y_true"] = y_test.values
results_2025["y_pred"] = y_pred_best

# Error = pred - true (negative => underestimation)
results_2025["error"] = results_2025["y_pred"] - results_2025["y_true"]
results_2025["abs_error"] = results_2025["error"].abs()

display(results_2025.head())
print("results_2025 shape:", results_2025.shape)


Unnamed: 0,start_time,end_time,Area,Sequence,Day-ahead Price (EUR/MWh),lag_1,lag_2,lag_4,lag_96,lag_672,...,roll_mean_672,roll_std_672,hour,minute,dayofweek,is_weekend,y_true,y_pred,error,abs_error
0,2025-01-08 00:00:00,2025-01-08 00:15:00,BZN|PL,Without Sequence,71.82,50.24,50.24,50.24,24.27,15.41,...,74.949286,46.804171,0,0,2,0,71.82,44.452715,-27.367285,27.367285
1,2025-01-08 00:15:00,2025-01-08 00:30:00,BZN|PL,Without Sequence,71.82,71.82,50.24,50.24,24.27,15.41,...,75.033229,46.74778,0,15,2,0,71.82,71.070764,-0.749236,0.749236
2,2025-01-08 00:30:00,2025-01-08 00:45:00,BZN|PL,Without Sequence,71.82,71.82,71.82,50.24,24.27,15.41,...,75.117173,46.691169,0,30,2,0,71.82,71.061146,-0.758854,0.758854
3,2025-01-08 00:45:00,2025-01-08 01:00:00,BZN|PL,Without Sequence,71.82,71.82,71.82,50.24,24.27,15.41,...,75.201116,46.634338,0,45,2,0,71.82,71.07929,-0.74071,0.74071
4,2025-01-08 01:00:00,2025-01-08 01:15:00,BZN|PL,Without Sequence,65.79,71.82,71.82,71.82,26.11,2.19,...,75.28506,46.577287,1,0,2,0,65.79,68.926332,3.136332,3.136332


results_2025 shape: (33012, 22)


In [69]:
# Step 10.3: define Peak / Trough by quantiles (top 10% / bottom 10%)

q90 = results_2025["y_true"].quantile(0.9)
q10 = results_2025["y_true"].quantile(0.1)

peak_df = results_2025[results_2025["y_true"] >= q90]
trough_df = results_2025[results_2025["y_true"] <= q10]

print("Peak points:", peak_df.shape[0])
print("Trough points:", trough_df.shape[0])
print("q90 threshold:", q90)
print("q10 threshold:", q10)


Peak points: 3303
Trough points: 3302
q90 threshold: 155.9
q10 threshold: 45.8970000000001


In [70]:
# Step 10.4: Peak / Trough error metrics + direction (under/over)

mae_all = mean_absolute_error(results_2025["y_true"], results_2025["y_pred"])
rmse_all = mean_squared_error(results_2025["y_true"], results_2025["y_pred"], squared=False)

mae_peak = mean_absolute_error(peak_df["y_true"], peak_df["y_pred"])
rmse_peak = mean_squared_error(peak_df["y_true"], peak_df["y_pred"], squared=False)

mae_trough = mean_absolute_error(trough_df["y_true"], trough_df["y_pred"])
rmse_trough = mean_squared_error(trough_df["y_true"], trough_df["y_pred"], squared=False)

mean_err_peak = peak_df["error"].mean()
mean_err_trough = trough_df["error"].mean()

under_peak_ratio = np.mean(peak_df["error"] < 0)
over_peak_ratio  = np.mean(peak_df["error"] > 0)

under_trough_ratio = np.mean(trough_df["error"] < 0)
over_trough_ratio  = np.mean(trough_df["error"] > 0)

print("=== Overall ===")
print("MAE:", mae_all)
print("RMSE:", rmse_all)

print("\n=== Peak (top 10%) ===")
print("MAE:", mae_peak)
print("RMSE:", rmse_peak)
print("Mean error (pred-true):", mean_err_peak)
print("Under ratio:", under_peak_ratio)
print("Over ratio:", over_peak_ratio)

print("\n=== Trough (bottom 10%) ===")
print("MAE:", mae_trough)
print("RMSE:", rmse_trough)
print("Mean error (pred-true):", mean_err_trough)
print("Under ratio:", under_trough_ratio)
print("Over ratio:", over_trough_ratio)


=== Overall ===
MAE: 5.4589376937031195
RMSE: 12.617467671823402

=== Peak (top 10%) ===
MAE: 14.224755526460864
RMSE: 26.340678220776255
Mean error (pred-true): -4.89219602015419
Under ratio: 0.5749318801089919
Over ratio: 0.4250681198910082

=== Trough (bottom 10%) ===
MAE: 8.113757416139155
RMSE: 17.02178588474489
Mean error (pred-true): 4.574863244268218
Under ratio: 0.4339794064203513
Over ratio: 0.5660205935796487




Based on the 2025 test set, the CatBoost model achieves strong overall performance, with an MAE of 5.46 EUR/MWh and an RMSE of 12.62 EUR/MWh.

Prediction errors increase substantially during peak price periods. For the top 10% of prices, the MAE rises to 14.22 EUR/MWh, and the model underestimates prices in approximately 57% of observations. The negative mean error further indicates a systematic tendency to underestimate extreme price spikes.

In contrast, during trough periods corresponding to the bottom 10% of prices, the MAE is lower at 8.11 EUR/MWh. In these low-price regimes, the model more frequently overestimates prices, as reflected by a positive mean error and an overestimation ratio of about 57%.

Overall, the results suggest a mean-reverting behavior of the model, where extreme prices are pulled toward average levels. This pattern reflects the inherent difficulty of predicting rare and abrupt price movements using historical price information alone.

In [66]:
# Step 10.5: hourly MAE profile (when does the model make larger errors?)

hourly_mae = (
    results_2025
    .groupby("hour")
    .apply(lambda x: mean_absolute_error(x["y_true"], x["y_pred"]))
    .sort_index()
)

display(hourly_mae)


  .apply(lambda x: mean_absolute_error(x["y_true"], x["y_pred"]))


hour
0     3.445534
1     2.337893
2     1.734926
3     1.321302
4     1.908908
5     2.988791
6     5.495177
7     6.499629
8     6.383481
9     7.058154
10    5.965488
11    4.555186
12    4.604615
13    5.444663
14    6.867079
15    7.898285
16    7.854797
17    7.598779
18    7.952118
19    9.060906
20    9.980750
21    6.323650
22    4.182912
23    3.520019
dtype: float64

The hourly MAE profile reveals clear intraday patterns in prediction errors. Errors are lowest during night and early morning hours, when electricity demand and price volatility are relatively low. Prediction errors increase during morning hours and reach their highest levels during the late afternoon and evening peak (approximately 15:00–20:00), reflecting periods of rapid demand changes and heightened market volatility. After the evening peak, errors decline again as prices stabilize.

This intraday error pattern is consistent with the underlying dynamics of electricity markets and highlights that forecasting performance deteriorates during periods of structural stress and sharp price movements.

In [59]:
# Step 10.6: show the largest errors (top 20) to inspect peaks/spikes

top_errors = results_2025.sort_values("abs_error", ascending=False).head(20)[
    ["start_time", "y_true", "y_pred", "error", "abs_error", "hour", "dayofweek", "is_weekend"]
]
display(top_errors)


Unnamed: 0,start_time,y_true,y_pred,error,abs_error,hour,dayofweek,is_weekend
16105,2025-07-01 20:00:00,471.29,282.019732,-189.270268,189.270268,20,1,0
13321,2025-06-02 20:00:00,427.76,258.28706,-169.47294,169.47294,20,0,0
16205,2025-07-02 21:00:00,197.2,366.529577,169.329577,169.329577,21,2,0
22725,2025-09-08 19:00:00,446.8,288.337552,-158.462448,158.462448,19,0,0
24069,2025-09-22 19:00:00,374.57,217.329487,-157.240513,157.240513,19,0,0
29808,2025-11-28 15:00:00,117.77,273.556545,155.786545,155.786545,15,4,0
11401,2025-05-13 20:00:00,354.28,198.733814,-155.546186,155.546186,20,1,0
23785,2025-09-19 20:00:00,170.62,320.999122,150.379122,150.379122,20,4,0
22825,2025-09-09 20:00:00,222.24,372.587312,150.347312,150.347312,20,1,0
29500,2025-11-25 10:00:00,401.49,253.203336,-148.286664,148.286664,10,1,0


Inspection of the largest absolute prediction errors reveals that extreme errors are highly concentrated during late afternoon and evening hours, particularly around the daily peak load period. Most of these cases correspond to extreme price spikes, where the model substantially underestimates the true electricity price.

A smaller number of large errors occur when prices drop sharply after a high-price period, leading to temporary overestimation due to the lagged structure of the features. In addition, the presence of extreme negative prices further highlights the limitations of models based solely on historical price information.

Overall, the largest prediction errors are associated with rare and abrupt market events that are difficult to anticipate without additional exogenous variables.