# Setup

In [1]:
# Enable IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Reload the extension as well, so re-running this cell re-initialises it.
%reload_ext autoreload
# Autoreload mode 2 (see the IPython autoreload documentation for the mode table).
%autoreload 2

# Colab (presumably Google Colab): uncomment the lines below to install the
# extra packages this notebook imports (pygam, optuna).
#%pip install pygam
#%pip install optuna

In [2]:
# %% load packages
import locale
import sys
import os
import time
import math
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import optuna
import requests
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
import random
from sqlalchemy import create_engine,inspect
from pathlib import Path
import urllib.parse
import pyarrow
from calendar import day_abbr
import calendar
from typing import Tuple, Union, Dict, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from pygam import LinearGAM
from datetime import datetime
from typing import List, Dict, Any

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Colab: uncomment the lines below to clone and install the project there
#!git clone https://github.com/aamaguay/DLiE_forecast.git
#%cd DLiE_forecast
#!pip install -e .

import random

# One seed shared by every random number generator used in this notebook.
SEED = 42

# Python's stdlib RNG and NumPy's legacy global RNG.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

# Kept as the cell's final expression so the returned torch Generator is
# still echoed as the cell output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x1602afc30>

In [4]:
from srs.utils.tutor_utils import prepare_dataset_tensor, forecasting_study,\
  plot_daily_profile,plot_hour_comparison, build_multiwindow_experts, tune_ewa_eta, \
  ewa_aggregate_forecasts, compute_error_table, tune_expert_window, \
  run_expert_window_test, prepare_train_test_tensors, \
  DST_trafo, prepare_dataset_tensor_modified, build_regression_matrix_modified, \
  build_regression_matrix_modified_2, build_regression_matrix_reordered_dtleak, \
  build_regression_matrix_reordered_dtleak_seasonal, \
  build_regression_matrix_reordered_dtleak_seasonal_h_t_h_deltaP

from srs.models.mlp import SimpleMLP, train_mlp, build_mlp_rolling_forecasts, \
  tune_mlp_hyperparameters, DeepMLP, build_mlp_rolling_forecasts_weighted_loss, \
  build_mlp_rolling_forecasts_weighted_data, \
  build_mlp_rolling_forecasts_weighted_data_modifed, build_mlp_rolling_forecasts_weighted_data_delta
  

from srs.utils.our_utils import run_forecast_step, compute_hourly_delta, rolling_std_lag, \
  compute_hourly_delta, flatten_to_hour_rows, daily_anchor, reconstruct_level
from srs.collect_data.setup import setup_seed, get_device

from srs.collect_data.entsoe_data import create_entsoe_engine, get_tables, get_spec, \
  get_market_divisions,get_map_codes,get_map_codes_starting_with, get_resolution_codes, \
    prepare_generation, prepare_load,prepare_price, prepare_unavailability, \
    prepare_filling_rate_hydro, prepare_physical_flow, prepare_installed_capacity
    
from srs.collect_data.datastream_data import create_datastream_engine, get_tables, \
  prepare_datastream
from srs.collect_data.dwd_mosmix_data import fetch_region_weather, prepare_weather
from srs.collect_data.merge_data import merge_datasets, build_training_dataset

from srs.utils.utils_final import build_regression_matrix

# Start

### Using tutor's original code

In [None]:
# ---- compute device: use CUDA when available, otherwise the CPU ----
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ---- experiment calendar, shared by all zones ----
INIT_DATE_EXPERIMENTS = "2019-01-01"
INIT_TEST_DATE = "2023-01-01"
FINAL_DATE_EXPERIMENTS = "2024-12-31"

# ---- regressor groups handed to the regression-matrix builder ----
DA_NAMES = [
    "Load_DA",
    "Solar_DA",
    "WindOn_DA",
    "WindOff_DA",
]
FUEL_NAMES = ["Coal", "NGas", "Oil", "EUA"]
SEASONAL_NAMES = []  # disabled; candidates: ["D_t", "W_t", "A_t"]
SEASONAL_SIN_COS_NAMES = [
    "W_t_sin",
    "W_t_cos",
    "A_t_sin",
    "A_t_cos",
]
VOL_NAMES = ["vol_24h_lag1", "volpct_24h_lag1"]

# ---- target definition and lag structure ----
TARGET_MODE = "hour"  # "hour" (Δt=1 h) or "day" (Δt=24 h)
LAG_UNIT = "h"  # "h" or "d"; unit in which DIFF_PRICE_LAGS is interpreted
DIFF_PRICE_LAGS = [1, 2, 7]
DA_LAG = [0]
FUEL_LAGS = [2]
PRICE_LAGS = [0]  # only referenced by a commented-out argument further down
WD = [0]

# ---- MLP / rolling-window hyperparameters ----
WINDOW_DAYS = 730
HIDDEN_DIM = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0
EPOCHS = 60
BATCH_SIZE = 64
ALPHA = 0  # e.g. 0.002; time-decaying weight for MSE loss or X and y

# The per-zone CSV files are read from <cwd>/../../data/data_<zone>.csv.
repo_root = Path.cwd().parents[1]
mapcodes = [
    "NO1",
    "NO2",
    "NO3",
    "NO4",
    "NO5",
]
zone_data = dict()

# Wall-clock start; the forecasting loop below reports the elapsed minutes.
start_time = time.time()

# Values that do not depend on the zone are computed once, outside the loop.
first_day = pd.Timestamp(INIT_DATE_EXPERIMENTS)
last_day = pd.Timestamp(FINAL_DATE_EXPERIMENTS)
n_test_days = (last_day - pd.Timestamp(INIT_TEST_DATE)).days + 1

for code in mapcodes:
    csv_path = repo_root.joinpath("data", f"data_{code}.csv")
    df_raw = pd.read_csv(csv_path, parse_dates=["time_utc"])

    data_t, train_t, train_dates, price_t = prepare_dataset_tensor_modified(
        csv_path,
        tz="CET",
        seed=42,
        test_days=n_test_days,
        dtype=torch.float64,
    )

    # Positions of the first and last experiment day in the sorted dates.
    idx = pd.DatetimeIndex(sorted(train_dates))
    start_i = idx.get_loc(first_day)
    end_i = idx.get_loc(last_day)
    keep_rows = slice(start_i, end_i + 1)  # inclusive of the last day

    # Crop tensor, dates and price series to the experiment period.
    data_t = data_t[keep_rows]
    dates_t = pd.Series(train_dates[keep_rows])

    zone_data[code] = {
        "df": df_raw,
        "tensor": data_t,
        "dates": dates_t,
        "price_t": price_t[keep_rows],
    }

# rolling-window MLP per zone
# For every zone: build the regression matrix, drop incomplete rows, locate
# the test period in the remaining rows, run the rolling MLP forecast and
# print the mean train/test RMSE. Results are collected in `results_by_zone`.
results_by_zone = {}

for code in mapcodes:
    zone = zone_data[code]

    reg_data = build_regression_matrix(
        dat_eval = zone["tensor"].cpu().numpy(),
        days_eval= pd.to_datetime(zone["dates"]),
        reg_names= zone["df"].columns[1:],   # every CSV column except the first
        wd = WD,
        #price_lags = PRICE_LAGS,
        da_lags = DA_LAG,
        fuel_lags = FUEL_LAGS,
        da_names = DA_NAMES,
        fuel_names = FUEL_NAMES,
        seasonal_names = SEASONAL_NAMES,
        seasonal_sin_cos_names = SEASONAL_SIN_COS_NAMES,
        vol_names = VOL_NAMES,
        diff_price_lags = DIFF_PRICE_LAGS,
        target_mode = TARGET_MODE,
        lag_unit = LAG_UNIT
    )

    # Same rows as `regmat.dropna()`, but the boolean mask is kept so the
    # surviving rows can be matched to their own dates.
    regmat_full   = reg_data["regmat"]
    complete_rows = regmat_full.notna().all(axis=1).to_numpy()
    reg_df   = regmat_full.loc[complete_rows].reset_index(drop=True)
    dep_idx  = reg_data["dep_indices"]

    # Date of every row that survived the NaN filter.
    # The previous version always took the *last* len(reg_df) dates, which is
    # only correct when the dropped rows are all at the start; rows dropped in
    # the middle would shift the test window without any error.
    # NOTE(review): assumes regmat row i belongs to zone["dates"][i] whenever
    # both have the same length - confirm against build_regression_matrix.
    zone_dates = pd.DatetimeIndex(pd.to_datetime(zone["dates"]))
    if len(regmat_full) == len(zone_dates):
        all_dates = zone_dates[complete_rows]
    else:
        # Lengths differ: keep the original tail alignment.
        all_dates = pd.DatetimeIndex(sorted(zone["dates"]
                                            .iloc[len(zone["dates"])
                                                  - len(reg_df):]))
    if not all_dates.is_monotonic_increasing:
        raise ValueError(f"Zone {code}: dates are not in chronological order")

    # First row on/after the test start and last row on/before the final date.
    # Identical to get_loc when both days are present, but does not raise a
    # KeyError if one of them was removed by the NaN filter.
    test_start_row = int(all_dates.searchsorted(pd.Timestamp(INIT_TEST_DATE),
                                                side="left"))
    test_end_row   = int(all_dates.searchsorted(pd.Timestamp(FINAL_DATE_EXPERIMENTS),
                                                side="right")) - 1
    horizon        = test_end_row - test_start_row + 1
    if horizon <= 0:
        raise ValueError(f"Zone {code}: no rows left inside the test period")

    # NOTE(review): an identical call later in this notebook was recorded
    # failing with "TypeError: build_mlp_rolling_forecasts_weighted_data() got
    # an unexpected keyword argument 'price_series_flat'". Check the current
    # signature (and the number of returned values) in srs.models.mlp.
    (preds, trues,
     diff_train_rmse, diff_test_rmse,
     lvl_train_rmse,  lvl_test_rmse) = build_mlp_rolling_forecasts_weighted_data(
        regmat_df   = reg_df.astype("float32"),
        dep_indices = dep_idx,
        window      = WINDOW_DAYS,
        horizon     = horizon,
        start_row   = test_start_row,
        hidden_dim  = HIDDEN_DIM,
        lr          = LEARNING_RATE,
        weight_decay= WEIGHT_DECAY,
        batch_size  = BATCH_SIZE,
        epochs      = EPOCHS,
        alpha       = ALPHA,
        price_series_flat = zone["price_t"].cpu().numpy().ravel(),
        device      = device,
    )

    results_by_zone[code] = {
        "preds":       preds,
        "trues":       trues,
        "diff_train_rmse": diff_train_rmse,
        "diff_test_rmse":   diff_test_rmse,
        "lvl_train_rmse": lvl_train_rmse,
        "lvl_test_rmse": lvl_test_rmse
    }

    # train vs test rmse summary per region
    print(f"--- Zone {code} ---")
    print(f"delta_P Train RMSE: {np.mean(diff_train_rmse):.3f}")
    print(f"delta_P Test  RMSE: {np.mean(diff_test_rmse):.3f}")
    print(f"Actual_P Train RMSE: {np.mean(lvl_train_rmse):.3f}")
    print(f"Actual_P Test  RMSE: {np.mean(lvl_test_rmse):.3f}")

# Total wall-clock time since `start_time` (set before the data-loading loop).
end_time = time.time()
duration = (end_time - start_time)/60
print(f"ellapsed time: {duration:.2f} minutes")

In [8]:
# Reference only: per-zone RMSE summary in the same format the forecasting
# cells print, apparently pasted from an earlier run (TODO confirm which
# configuration produced it). The string is not used by any code.
'''
--- Zone NO1 ---
delta_P Train RMSE: 9.353
delta_P Test  RMSE: 5.692
Actual_P Train RMSE: 9.353
Actual_P Test  RMSE: 5.692
--- Zone NO2 ---
delta_P Train RMSE: 10.800
delta_P Test  RMSE: 6.482
Actual_P Train RMSE: 10.800
Actual_P Test  RMSE: 6.482
--- Zone NO3 ---
delta_P Train RMSE: 5.426
delta_P Test  RMSE: 3.792
Actual_P Train RMSE: 5.426
Actual_P Test  RMSE: 3.792
--- Zone NO4 ---
delta_P Train RMSE: 4.421
delta_P Test  RMSE: 3.338
Actual_P Train RMSE: 4.421
Actual_P Test  RMSE: 3.338
--- Zone NO5 ---
delta_P Train RMSE: 8.798
delta_P Test  RMSE: 5.069
Actual_P Train RMSE: 8.798
Actual_P Test  RMSE: 5.069
ellapsed time: 55.43 minutes
'''

'\n--- Zone NO1 ---\ndelta_P Train RMSE: 9.353\ndelta_P Test  RMSE: 5.692\nP Train RMSE: 807.937\nP Test  RMSE: 2057.823\n--- Zone NO2 ---\ndelta_P Train RMSE: 10.800\ndelta_P Test  RMSE: 6.482\nP Train RMSE: 941.786\nP Test  RMSE: 2273.225\n--- Zone NO3 ---\ndelta_P Train RMSE: 5.426\ndelta_P Test  RMSE: 3.792\nP Train RMSE: 369.047\nP Test  RMSE: 1511.378\n'

In [None]:
# Quick structural look at the last `reg_data` produced by the forecasting
# loop. Each value is fetched lazily, so the lines are evaluated and printed
# one after another in the order listed.
_reg_data_report = (
    ("reg_data keys", lambda: reg_data.keys()),
    ("regmat shape", lambda: reg_data['regmat'].shape),
    ("reg_data['index_dict'].keys()", lambda: reg_data['index_dict'].keys()),
    ("len(reg_data['dep_indices'])", lambda: len(reg_data['dep_indices'])),
)
for _label, _fetch in _reg_data_report:
    print(f"{_label}: {_fetch()}")

In [18]:
# Summary statistics of `reg_df`, i.e. the NaN-free regression matrix left
# behind by the most recently executed forecasting loop (its last zone).
reg_df.describe()

Unnamed: 0,del_price_s0,del_price_lag1h_s0,del_price_lag2h_s0,del_price_lag7h_s0,Load_DA_lag_0_s0,Solar_DA_lag_0_s0,WindOn_DA_lag_0_s0,WindOff_DA_lag_0_s0,sin_hour_s0,cos_hour_s0,...,Coal_lag_2,NGas_lag_2,Oil_lag_2,EUA_lag_2,W_t_sin,W_t_cos,A_t_sin,A_t_cos,vol_24h_lag1,volpct_24h_lag1
count,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,...,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0,1981.0
mean,-3.680747,-3.680747,-2.698869,4.613473,1794.54475,0.0,0.0,0.0,0.0,1.0,...,143.684866,49.20884,81.125983,57.698284,0.000297,-0.002156,-0.046261,-0.000268,10.323341,0.710531
std,8.236958,8.236958,8.175153,14.089255,276.516947,0.0,0.0,0.0,0.0,0.0,...,92.200677,49.373025,19.98018,24.167436,0.707323,0.707245,0.700365,0.712638,16.083611,11.354238
min,-137.46,-137.46,-144.54,-187.72,959.43,0.0,0.0,0.0,0.0,1.0,...,41.603549,3.535,20.989196,15.24,-0.974928,-0.900969,-0.999991,-0.999963,0.009325,0.003517
25%,-3.81,-3.81,-2.41,0.04,1601.04,0.0,0.0,0.0,0.0,1.0,...,67.027872,15.769,70.104365,28.59,-0.781831,-0.900969,-0.729558,-0.720667,1.496226,0.027388
50%,-1.24,-1.24,-0.71,0.95,1773.71,0.0,0.0,0.0,0.0,1.0,...,126.048117,33.785,84.899104,63.55,0.0,-0.222521,-0.103102,0.004304,4.801358,0.054434
75%,-0.37,-0.37,-0.12,5.11,1983.53,0.0,0.0,0.0,0.0,1.0,...,163.976067,59.0,92.096922,79.64,0.781831,0.62349,0.655156,0.714673,12.33278,0.111922
max,18.48,18.48,40.71,162.58,2608.26,0.0,0.0,0.0,0.0,1.0,...,485.485485,310.5,139.510547,97.59,0.974928,1.0,0.999991,1.0,130.831596,385.72095


In [19]:
# ---- compute device: use CUDA when available, otherwise the CPU ----
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ---- experiment calendar, shared by all zones ----
INIT_DATE_EXPERIMENTS = "2019-01-01"
INIT_TEST_DATE = "2023-01-01"
FINAL_DATE_EXPERIMENTS = "2024-12-31"

# ---- regressor groups handed to the regression-matrix builder ----
# This run keeps only two day-ahead regressors.
DA_NAMES = [
    "Load_DA",
    "WindOn_DA",
]
FUEL_NAMES = ["Coal", "NGas", "Oil", "EUA"]
SEASONAL_NAMES = []  # disabled; candidates: ["D_t", "W_t", "A_t"]
SEASONAL_SIN_COS_NAMES = [
    "W_t_sin",
    "W_t_cos",
    "A_t_sin",
    "A_t_cos",
]
VOL_NAMES = ["vol_24h_lag1", "volpct_24h_lag1"]

# ---- target definition and lag structure ----
TARGET_MODE = "hour"  # "hour" (Δt=1 h) or "day" (Δt=24 h)
LAG_UNIT = "h"  # "h" or "d"; unit in which DIFF_PRICE_LAGS is interpreted
DIFF_PRICE_LAGS = [1, 2, 7]
DA_LAG = [0]
FUEL_LAGS = [2]
PRICE_LAGS = [0]  # only referenced by a commented-out argument further down
WD = [0]

# ---- MLP / rolling-window hyperparameters ----
WINDOW_DAYS = 730
HIDDEN_DIM = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0
EPOCHS = 60
BATCH_SIZE = 64
ALPHA = 0  # e.g. 0.002; time-decaying weight for MSE loss or X and y

# The per-zone CSV files are read from <cwd>/../../data/data_<zone>.csv.
repo_root = Path.cwd().parents[1]
mapcodes = [
    "NO1",
    "NO2",
    "NO3",
    "NO4",
    "NO5",
]
zone_data = dict()

# Wall-clock start; the forecasting loop below reports the elapsed minutes.
start_time = time.time()

# Zone-independent values, computed once instead of on every iteration.
experiment_bounds = (
    pd.Timestamp(INIT_DATE_EXPERIMENTS),
    pd.Timestamp(FINAL_DATE_EXPERIMENTS),
)
n_test_days = (experiment_bounds[1] - pd.Timestamp(INIT_TEST_DATE)).days + 1

for code in mapcodes:
    csv_path = repo_root.joinpath("data", f"data_{code}.csv")
    df_raw = pd.read_csv(csv_path, parse_dates=["time_utc"])

    data_t, train_t, train_dates, price_t = prepare_dataset_tensor_modified(
        csv_path,
        tz="CET",
        seed=42,
        test_days=n_test_days,
        dtype=torch.float64,
    )

    # Positions of the first and last experiment day in the sorted dates.
    idx = pd.DatetimeIndex(sorted(train_dates))
    start_i = idx.get_loc(experiment_bounds[0])
    end_i = idx.get_loc(experiment_bounds[1])
    keep_rows = slice(start_i, end_i + 1)  # inclusive of the last day

    # Crop tensor, dates and price series to the experiment period.
    data_t = data_t[keep_rows]
    dates_t = pd.Series(train_dates[keep_rows])

    zone_data[code] = {
        "df": df_raw,
        "tensor": data_t,
        "dates": dates_t,
        "price_t": price_t[keep_rows],
    }

# rolling-window MLP per zone
# For every zone: build the regression matrix, drop incomplete rows, locate
# the test period in the remaining rows, run the rolling MLP forecast and
# print the mean train/test RMSE. Results are collected in `results_by_zone`.
results_by_zone = {}

for code in mapcodes:
    zone = zone_data[code]

    reg_data = build_regression_matrix(
        dat_eval = zone["tensor"].cpu().numpy(),
        days_eval= pd.to_datetime(zone["dates"]),
        reg_names= zone["df"].columns[1:],   # every CSV column except the first
        wd = WD,
        #price_lags = PRICE_LAGS,
        da_lags = DA_LAG,
        fuel_lags = FUEL_LAGS,
        da_names = DA_NAMES,
        fuel_names = FUEL_NAMES,
        seasonal_names = SEASONAL_NAMES,
        seasonal_sin_cos_names = SEASONAL_SIN_COS_NAMES,
        vol_names = VOL_NAMES,
        diff_price_lags = DIFF_PRICE_LAGS,
        target_mode = TARGET_MODE,
        lag_unit = LAG_UNIT
    )

    # Same rows as `regmat.dropna()`, but the boolean mask is kept so the
    # surviving rows can be matched to their own dates.
    regmat_full   = reg_data["regmat"]
    complete_rows = regmat_full.notna().all(axis=1).to_numpy()
    reg_df   = regmat_full.loc[complete_rows].reset_index(drop=True)
    dep_idx  = reg_data["dep_indices"]

    # Date of every row that survived the NaN filter.
    # The previous version always took the *last* len(reg_df) dates, which is
    # only correct when the dropped rows are all at the start; rows dropped in
    # the middle would shift the test window without any error.
    # NOTE(review): assumes regmat row i belongs to zone["dates"][i] whenever
    # both have the same length - confirm against build_regression_matrix.
    zone_dates = pd.DatetimeIndex(pd.to_datetime(zone["dates"]))
    if len(regmat_full) == len(zone_dates):
        all_dates = zone_dates[complete_rows]
    else:
        # Lengths differ: keep the original tail alignment.
        all_dates = pd.DatetimeIndex(sorted(zone["dates"]
                                            .iloc[len(zone["dates"])
                                                  - len(reg_df):]))
    if not all_dates.is_monotonic_increasing:
        raise ValueError(f"Zone {code}: dates are not in chronological order")

    # First row on/after the test start and last row on/before the final date.
    # Identical to get_loc when both days are present, but does not raise a
    # KeyError if one of them was removed by the NaN filter.
    test_start_row = int(all_dates.searchsorted(pd.Timestamp(INIT_TEST_DATE),
                                                side="left"))
    test_end_row   = int(all_dates.searchsorted(pd.Timestamp(FINAL_DATE_EXPERIMENTS),
                                                side="right")) - 1
    horizon        = test_end_row - test_start_row + 1
    if horizon <= 0:
        raise ValueError(f"Zone {code}: no rows left inside the test period")

    # NOTE(review): the recorded run of this cell failed on this call with
    # "TypeError: build_mlp_rolling_forecasts_weighted_data() got an unexpected
    # keyword argument 'price_series_flat'". Check the current signature (and
    # the number of returned values) in srs.models.mlp before re-running.
    (preds, trues,
     diff_train_rmse, diff_test_rmse,
     lvl_train_rmse,  lvl_test_rmse) = build_mlp_rolling_forecasts_weighted_data(
        regmat_df   = reg_df.astype("float32"),
        dep_indices = dep_idx,
        window      = WINDOW_DAYS,
        horizon     = horizon,
        start_row   = test_start_row,
        hidden_dim  = HIDDEN_DIM,
        lr          = LEARNING_RATE,
        weight_decay= WEIGHT_DECAY,
        batch_size  = BATCH_SIZE,
        epochs      = EPOCHS,
        alpha       = ALPHA,
        price_series_flat = zone["price_t"].cpu().numpy().ravel(),
        device      = device,
    )

    results_by_zone[code] = {
        "preds":       preds,
        "trues":       trues,
        "diff_train_rmse": diff_train_rmse,
        "diff_test_rmse":   diff_test_rmse,
        "lvl_train_rmse": lvl_train_rmse,
        "lvl_test_rmse": lvl_test_rmse
    }

    # train vs test rmse summary per region
    print(f"--- Zone {code} ---")
    print(f"delta_P Train RMSE: {np.mean(diff_train_rmse):.3f}")
    print(f"delta_P Test  RMSE: {np.mean(diff_test_rmse):.3f}")
    print(f"Actual_P Train RMSE: {np.mean(lvl_train_rmse):.3f}")
    print(f"Actual_P Test  RMSE: {np.mean(lvl_test_rmse):.3f}")

# Total wall-clock time since `start_time` (set before the data-loading loop).
end_time = time.time()
duration = (end_time - start_time)/60
print(f"ellapsed time: {duration:.2f} minutes")

TypeError: build_mlp_rolling_forecasts_weighted_data() got an unexpected keyword argument 'price_series_flat'