In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# %% load packages
import locale
import sys
import os
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import optuna
import requests
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
import random
from sqlalchemy import create_engine,inspect
from pathlib import Path
import urllib.parse
import pyarrow
from calendar import day_abbr
import calendar
from typing import Tuple, Union, Dict, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from pygam import LinearGAM
from datetime import datetime




In [10]:
from srs.utils.tutor_utils import prepare_dataset_tensor, forecasting_study,\
  plot_daily_profile,plot_hour_comparison, build_multiwindow_experts, tune_ewa_eta, \
  ewa_aggregate_forecasts, compute_error_table, tune_expert_window, \
  run_expert_window_test, build_regression_matrix, SimpleMLP, train_mlp, \
  prepare_train_test_tensors, build_mlp_rolling_forecasts, tune_mlp_hyperparameters, \
  DST_trafo, prepare_dataset_tensor_modified

from srs.utils.our_utils import run_forecast_step
from srs.collect_data.setup import setup_seed, get_device
from srs.collect_data.entsoe_data import create_entsoe_engine, get_tables, get_spec, \
  get_market_divisions,get_map_codes,get_map_codes_starting_with, get_resolution_codes, \
    prepare_generation, prepare_load,prepare_price, prepare_unavailability, \
    prepare_filling_rate_hydro, prepare_physical_flow, prepare_installed_capacity
from srs.collect_data.datastream_data import create_datastream_engine, get_tables, \
  prepare_datastream
from srs.collect_data.dwd_mosmix_data import fetch_region_weather, prepare_weather
from srs.collect_data.merge_data import merge_datasets, build_training_dataset



In [40]:
# Report the shape of each prepared object, one line per object.
# NOTE(review): data_t, train_t, train_dates and price_t must already exist in
# the kernel; this cell does not create them.
for label, obj in (
    ("data_t", data_t),
    ("train_t", train_t),
    ("train_dates", train_dates),
    ("price_t", price_t),
):
    print(f"{label} shape: {obj.shape}")

data_t shape: torch.Size([2193, 24, 10])
train_t shape: torch.Size([1463, 24, 10])
train_dates shape: (1463,)
price_t shape: torch.Size([2193, 24])


In [42]:
# Inspect which entries the regression-matrix dict exposes
# (recorded output: 'regmat', 'index_dict', 'dep_indices').
reg_data.keys()

dict_keys(['regmat', 'index_dict', 'dep_indices'])

In [51]:

# Show the dependent-variable indices; the recorded output lists 24 integers
# (0, 8, ..., 184). Presumably one regmat column per hourly price series — TODO confirm.
reg_data["dep_indices"]

[0,
 8,
 16,
 24,
 32,
 40,
 48,
 56,
 64,
 72,
 80,
 88,
 96,
 104,
 112,
 120,
 128,
 136,
 144,
 152,
 160,
 168,
 176,
 184]

In [None]:
'''
  training interval:
  2019 - 365 days
  2020 - 366 days
  2021 - 365 days
  2022 - 365 days
  
  testing interval:
  2023 - 365 days
  2024 - 366 days
  
  
  The reason I mentioned LightGBM and GAM earlier is that one of the alternative methodologies is to obtain preliminary predictions from GAM, LightGBM, or any other model, and to use these predictions as inputs to the input layer of an MLP that produces the final predictions.
That is why I need to be consistent for now with 
'''

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --------------------------------------------------------------
# Experiment calendar: first day used, first test day, last test day
# --------------------------------------------------------------
INIT_DATE_EXPERIMENTS = '2019-01-01'
INIT_TEST_DATE = '2023-01-01'
FINAL_DATE_EXPERIMENTS = '2024-12-31'

# Number of test days; the +1 counts the final date itself.
_test_span = pd.Timestamp(FINAL_DATE_EXPERIMENTS) - pd.Timestamp(INIT_TEST_DATE)
n_days_test = _test_span.days + 1

# Load every bidding zone and trim all per-zone arrays to the experiment
# window [INIT_DATE_EXPERIMENTS, FINAL_DATE_EXPERIMENTS] (both inclusive).
# The data directory is resolved two levels above the current working directory.
repo_root = Path.cwd().parents[1]
mapcodes = ["NO1","NO2","NO3","NO4","NO5"]
maps_dict = {}

for code in mapcodes:
    csv_path = repo_root / "data" / f"data_{code}.csv"
    # Raw frame is kept only so later cells can read the column names.
    df = pd.read_csv(csv_path, parse_dates=["time_utc"])

    # NOTE(review): the original note here asks to switch to "Alex's" loader.
    # The rolling-MLP cell further down uses prepare_dataset_tensor_modified;
    # confirm whether this call should use it too. If train_dates only covers
    # the training days, the get_loc lookups below raise KeyError.
    data_t, train_t, train_dates, price_t = prepare_dataset_tensor(
        csv_path,
        tz="CET",
        seed=42,
        test_days=n_days_test,
        dtype=torch.float64,
    )

    # Locate the window boundaries in the sorted daily calendar
    # (guards against date shifts after the time-zone change).
    train_dates_series  = pd.DatetimeIndex(sorted(train_dates))
    id_init_exp         = train_dates_series.get_loc(pd.Timestamp(INIT_DATE_EXPERIMENTS))
    # Not used below, but the lookup fails early if the test start date is missing.
    id_init_test_period = train_dates_series.get_loc(pd.Timestamp(INIT_TEST_DATE))
    id_end_exp          = train_dates_series.get_loc(pd.Timestamp(FINAL_DATE_EXPERIMENTS))
    window = slice(id_init_exp, id_end_exp + 1)

    data_t = data_t[window, :, :]
    train_dates = pd.Series(train_dates[window])
    # Fix: trim the prices with the same slice. Downstream code indexes price_t
    # with positions computed from the trimmed dates, so an untrimmed price_t
    # is misaligned whenever the window does not start at row 0.
    price_t = price_t[window]

    maps_dict[code] = {
        "df": df,
        "data_t": data_t,
        "train_t": train_t,   # NOTE(review): left untrimmed, as in the original
        "train_dates": train_dates,
        "price_t": price_t,
    }

# Per-zone evaluation scaffold for the 24-hour GAM benchmark.
# NOTE(review): no forecasting step is present in this cell. forecasts_zone is
# allocated as NaN and never filled before the RMSE is computed, so the
# reported RMSE is NaN until the rolling forecast loop is added.
gam24_by_zone = {}
rmse_by_zone  = {}

for z in mapcodes:
    print(f"\n--- {z} ---")
    price_S         = maps_dict[z]["price_t"]
    data_array      = maps_dict[z]["data_t"]
    full_dates      = maps_dict[z]["train_dates"] # <- changed from _all_ days to train_dates based on Alex spot
    feature_names   = maps_dict[z]["df"].columns[1:]
    full_date_series= pd.DatetimeIndex(sorted(full_dates))

    # evaluation days: INIT_TEST_DATE .. FINAL_DATE_EXPERIMENTS, both inclusive
    train_start_idx = full_date_series.get_loc(pd.Timestamp(INIT_DATE_EXPERIMENTS))
    id_init_eval = full_date_series.get_loc(pd.Timestamp(INIT_TEST_DATE))
    id_end_eval = full_date_series.get_loc(pd.Timestamp(FINAL_DATE_EXPERIMENTS))
    eval_start_idx = id_init_eval
    eval_end_idx  = id_end_eval
    N_s = (eval_end_idx - eval_start_idx) + 1
    full_dates = pd.to_datetime(full_dates)

    # Feature configuration (weekday dummies, price lags, day-ahead lag, series
    # per day). Defined here but not consumed anywhere in this cell as written.
    WD             = [1,6,7]
    PRICE_S_LAGS   = [1,2,7]
    DA_LAG         = [0]
    S              = 24
    #D             = 730

    # tensors to collect forecasts for THIS zone
    forecasts_zone = torch.full((N_s, S, 1), float("nan"),
                                dtype=torch.float64, device=device)

    #   shape: (N_s, S)
    true_vals = price_S[eval_start_idx : eval_end_idx + 1].to(device)

    # Make unpopulated forecasts explicit instead of silently reporting NaN.
    n_missing = int(torch.isnan(forecasts_zone).sum().item())
    if n_missing:
        print(f"WARNING: {n_missing} of {forecasts_zone.numel()} forecast values "
              f"for zone {z} are NaN; the RMSE below is not meaningful.")

    # compute RMSE
    diff = forecasts_zone[:, :, 0] - true_vals
    rmse = torch.sqrt((diff**2).mean()).item()

    print(f"Zone {z} GAM-24h RMSE: {rmse:.4f}")

    gam24_by_zone[z] = forecasts_zone[:, :, 0].cpu()
    rmse_by_zone[z]  = rmse

In [None]:
# 12. Tutorial 4. Reg Matrix and simple MLP Benchmark
#
# Trains one MLP for a single evaluation day on the NO1 dataset and reports
# RMSE / MAE over that day's 24-hour price vector.

# 7. Prepare the NO1 tensors; 2*365 days are passed as test_days.
# NOTE(review): this path is relative to the working directory and lower-case,
# while the zone loops read repo_root / "data" / "data_NO1.csv" — confirm.
no1_csv_path = "./data/data_no1.csv"
data_t, train_t, train_dates, price_t = prepare_dataset_tensor(
    no1_csv_path,
    tz="CET",
    seed=42,
    test_days=2*365,
    dtype=torch.float64,
)
print(train_t.shape, price_t.shape)

# Fix: take the regressor names from this file's own header (header-only read)
# rather than from whatever `df` a previously executed cell left in the kernel.
no1_feature_names = pd.read_csv(no1_csv_path, nrows=0).columns[1:]  # all columns except the first (time_utc)

# 1) Build regression matrix for the evaluation block
reg_data = build_regression_matrix(
    dat_eval = train_t.cpu().numpy(),
    days_eval= pd.to_datetime(train_dates),
    reg_names= no1_feature_names,
)

# 2) Prepare tensors for the FIRST evaluation day
tensors = prepare_train_test_tensors(
    regmat_df   = reg_data["regmat"],
    dep_indices = reg_data["dep_indices"],
    D           = 730,                      # window
    eval_start_row = reg_data["regmat"].shape[0] - 730,   # begin_eval
    device      = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# 3) Train the MLP and get prediction vs. true
pred, true = train_mlp(
    tensors,
    hidden_dim   = 50,
    lr           = 0.001,
    weight_decay = 0.001,
    batch_size   = 32,
    epochs       = 60,
)

# Fix: Tensor.numpy() only works on CPU tensors that do not require grad, so
# detach and move to the CPU first. This is a no-op if they already are.
print("Predicted price (all 24 series) :", pred.detach().cpu().numpy())
print("True price       (all 24 series) :", true.detach().cpu().numpy())

# scalar metrics for that single-day, 24-hour vector
rmse = torch.sqrt(((pred - true) ** 2).mean()).item()
mae  = (pred - true).abs().mean().item()
print(f"MLP 1-day RMSE: {rmse:.3f}  MAE: {mae:.3f}")

In [None]:
# 13. Tutorial 5. MLP  — rolling-window expert + Optuna tuning
#
# Depends on train_t, train_dates and df left in the kernel by earlier cells.

# ---- 0) Regression matrix built from train_t, with NaN rows dropped ----
reg_data = build_regression_matrix(
    dat_eval=train_t.cpu().numpy(),
    days_eval=pd.to_datetime(train_dates),
    reg_names=df.columns[1:],   # every column but the first (time_utc)
)
dep_idx = reg_data["dep_indices"]
reg_df = reg_data["regmat"].dropna().reset_index(drop=True)

# ---- 1) Optuna search over the last 730 rows of reg_df ----
# NOTE(review): the tuning block here and the test block in step 2 are the
# same 730 rows, so the reported test error is not out-of-sample with respect
# to the hyperparameters — confirm this is intended.
eval_start = reg_df.shape[0] - 730          # first eval row
best_params, study = tune_mlp_hyperparameters(
    reg_df,
    dep_idx,
    n_trials=40,
    eval_window=(eval_start, 730),
)
print("best params:", best_params)

# ---- 2) Rolling forecasts over the 730-row test block ----
test_horizon = 730
test_start = reg_df.shape[0] - test_horizon
rolling_kwargs = dict(
    window=best_params["D"],
    horizon=test_horizon,
    start_row=test_start,
    hidden_dim=best_params["hidden"],
    lr=best_params["lr"],
    weight_decay=best_params["wd"],
)
preds_mlp, trues_mlp = build_mlp_rolling_forecasts(reg_df, dep_idx, **rolling_kwargs)

In [None]:
# -------------------------------------------------------------
# 3)  Append to forecast_all and compute error table
# forecast_all and model_names are not created in this cell; they must already
# exist in the kernel from an earlier step.
mlp_chan  = preds_mlp.unsqueeze(2)            # (N,24,1)

# Fix: make the cell safe to re-run. Without the guard every execution appends
# another MLP channel and another "MLP" label. To refresh an existing MLP
# channel after re-training, rebuild forecast_all upstream first.
if "MLP" not in model_names:
    forecast_all = torch.cat([forecast_all, mlp_chan], dim=2)
    model_names  = model_names + ["MLP"]

err_table_with_mlp = compute_error_table(forecast_all, model_names)
print(err_table_with_mlp)


### MLP with rolling window and without Optuna

In [None]:
# Experiment calendar shared by all zones: first day used, first test day,
# last test day.
INIT_DATE_EXPERIMENTS, INIT_TEST_DATE, FINAL_DATE_EXPERIMENTS = (
    '2019-01-01',
    '2023-01-01',
    '2024-12-31',
)

# Fixed MLP settings for the rolling-window run (no Optuna search here).
WINDOW_DAYS = 730
HIDDEN_DIM = 50
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.001
EPOCHS = 60
BATCH_SIZE = 32

# Pick the compute device, then load and trim the data of every zone.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

repo_root = Path.cwd().parents[1]
mapcodes = ["NO1", "NO2", "NO3", "NO4", "NO5"]
zone_data = {}

for code in mapcodes:
    csv_path = repo_root / "data" / f"data_{code}.csv"
    df_raw = pd.read_csv(csv_path, parse_dates=["time_utc"])

    # Test period length in days, counting both end points.
    test_span = pd.Timestamp(FINAL_DATE_EXPERIMENTS) - pd.Timestamp(INIT_TEST_DATE)
    data_t, train_t, train_dates, price_t = prepare_dataset_tensor_modified(
        csv_path,
        tz="CET",
        seed=42,
        test_days=test_span.days + 1,
        dtype=torch.float64,
    )

    # Positions of the experiment window inside the sorted calendar.
    # NOTE(review): the positions come from the sorted dates but are applied
    # to train_dates / data_t as returned — assumes those are already in
    # chronological order; confirm.
    idx = pd.DatetimeIndex(sorted(train_dates))
    start_i = idx.get_loc(pd.Timestamp(INIT_DATE_EXPERIMENTS))
    end_i = idx.get_loc(pd.Timestamp(FINAL_DATE_EXPERIMENTS))
    keep = slice(start_i, end_i + 1)

    data_t = data_t[keep]
    dates_t = pd.Series(train_dates[keep])

    zone_data[code] = {
        "df": df_raw,
        "tensor": data_t,
        "dates": dates_t,
        "price_t": price_t[keep],
    }

# Rolling-window MLP for every zone: build the regression matrix, forecast
# the test period day by day, and collect the RMSE and the predictions.
rmse_mlp_by_zone = {}
preds_mlp_by_zone = {}

for code in mapcodes:
    print(f"\n==== Zone {code} ====")
    zone = zone_data[code]

    reg_data = build_regression_matrix(
        dat_eval=zone["tensor"].cpu().numpy(),
        days_eval=pd.to_datetime(zone["dates"]),
        reg_names=zone["df"].columns[1:],
    )
    dep_idx = reg_data["dep_indices"]
    reg_df = reg_data["regmat"].dropna().reset_index(drop=True)

    # Re-align the calendar with reg_df by discarding as many leading dates
    # as dropna() removed rows.
    # NOTE(review): this assumes the dropped rows are all at the start — confirm.
    n_dropped = len(zone["dates"]) - len(reg_df)
    all_dates = pd.DatetimeIndex(sorted(zone["dates"].iloc[n_dropped:]))
    test_start_row = all_dates.get_loc(pd.Timestamp(INIT_TEST_DATE))
    test_end_row = all_dates.get_loc(pd.Timestamp(FINAL_DATE_EXPERIMENTS))
    horizon = test_end_row - test_start_row + 1   # both end points included

    preds, trues = build_mlp_rolling_forecasts(
        regmat_df=reg_df.astype("float32"),
        dep_indices=dep_idx,
        window=WINDOW_DAYS,
        horizon=horizon,
        start_row=test_start_row,
        hidden_dim=HIDDEN_DIM,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        device=device,
    )

    # Root mean squared error over all elements of the forecast tensor.
    squared_error = (preds - trues) ** 2
    rmse = torch.sqrt(squared_error.mean()).item()
    preds_mlp_by_zone[code] = preds
    rmse_mlp_by_zone[code] = rmse

    print(f"RMSE 2023-24: {rmse:7.3f}")

# Summary table, one line per zone.
print("\n===== Rolling-MLP RMSE ( NOK / MWh ) =====")
for zone_code, zone_rmse in rmse_mlp_by_zone.items():
    print(f"{zone_code}:  {zone_rmse:7.3f}")


==== Zone NO1 ====
RMSE 2023-24:  21.057

==== Zone NO2 ====
RMSE 2023-24:  24.377

==== Zone NO3 ====
RMSE 2023-24:  15.700

==== Zone NO4 ====
RMSE 2023-24:  11.912

==== Zone NO5 ====
RMSE 2023-24:  17.403