In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# %% load packages
import locale
import sys
import os
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import optuna
import requests
import torch
import random
from sqlalchemy import create_engine,inspect
from pathlib import Path
import urllib.parse
import pyarrow
from calendar import day_abbr
import calendar
from typing import Tuple, Union, Dict, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from pygam import LinearGAM
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from srs.utils.tutor_utils import prepare_dataset_tensor, forecasting_study,\
  plot_daily_profile,plot_hour_comparison, build_multiwindow_experts, tune_ewa_eta, \
  ewa_aggregate_forecasts, compute_error_table, tune_expert_window, \
  run_expert_window_test, build_regression_matrix, SimpleMLP, train_mlp, \
  prepare_train_test_tensors, build_mlp_rolling_forecasts, tune_mlp_hyperparameters, \
  DST_trafo

from srs.utils.our_utils import run_forecast_step
from srs.collect_data.setup import setup_seed, get_device
from srs.collect_data.entsoe_data import create_entsoe_engine, get_tables, get_spec, \
  get_market_divisions,get_map_codes,get_map_codes_starting_with, get_resolution_codes, \
    prepare_generation, prepare_load,prepare_price, prepare_unavailability, \
    prepare_filling_rate_hydro, prepare_physical_flow, prepare_installed_capacity
from srs.collect_data.datastream_data import create_datastream_engine, get_tables, \
  prepare_datastream
from srs.collect_data.dwd_mosmix_data import fetch_region_weather, prepare_weather
from srs.collect_data.merge_data import merge_datasets, build_training_dataset



### gam_24h and gam_1h fitting for no1

In [None]:
# Transform merged dataset using DST_trafo and prepare training data for the
# NO1 zone, then allocate the container that collects the rolling forecasts.

# Length of the evaluation (validation) period in days. Defined first so the
# same value drives the train/test split, `begin_eval` and `N_s`.
length_eval = 2 * 365

repo_root = Path.cwd().parents[1]
no1_csv = repo_root / "data" / "data_no1.csv"

data_no1 = pd.read_csv(no1_csv)
data_t_no1, train_t_no1, train_dates, price_t_no1 = prepare_dataset_tensor(
    no1_csv,
    tz="CET",
    seed=42,
    test_days=length_eval,
    dtype=torch.float64,
)

# Short aliases used by the forecasting cell below.
data_array = data_t_no1
price_S = price_t_no1
dates_S = train_dates

# Calendar-day index used to locate the training-window anchors.
# `dates_idx` was referenced without ever being defined, which raised a
# NameError on a fresh kernel. It is built from the first CSV column, which is
# treated as the timestamp column because every other column is passed on as
# a regressor (`data_no1.columns[1:]`).
# NOTE(review): assumes that column parses to tz-naive timestamps - confirm.
dates_idx = pd.DatetimeIndex(
    pd.to_datetime(data_no1.iloc[:, 0]).dt.normalize().unique()
).sort_values()
train_start_idx = dates_idx.get_loc(pd.Timestamp("2019-01-01"))
train_end_idx = dates_idx.get_loc(pd.Timestamp("2023-12-31"))

# Settings handed to run_forecast_step.
D = 730                   # NOTE(review): presumably the training-window length in days - confirm
S = 24                    # forecast values per day (second axis of `forecasts`)
WD = [1, 6, 7]            # weekday codes (presumably used for weekday dummies - confirm)
PRICE_S_LAGS = [1, 2, 7]  # price lags (presumably in days - confirm)
da_lag = [0]

# The first observation in the evaluation period
begin_eval = data_array.shape[0] - length_eval

# Number of forecast steps: one per evaluation day.
N_s = length_eval

# NOTE(review): the forecasting and export cells below fill only slots 0 and 1
# and treat them as "gam_24h" / "gam_1h", which does not match these labels.
# Only the number of labels is used here (it sizes the last tensor axis).
model_names = [
    "true",
    "expert_ext",
    "linar_gam",
    "light_gbm"
]
n_models = len(model_names)

# 3D tensor to hold forecasts, shaped (evaluation day, hour, model) and
# NaN-initialised so unfilled entries are easy to spot.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
forecasts = torch.full((N_s, S, n_models), float('nan'), dtype=torch.float64, device=device)

In [None]:

# Run one forecasting step per evaluation day in a thread pool and collect the
# two GAM forecasts into `forecasts` (slot 0: gam_24h, slot 1: gam_per_hour).
init_time = datetime.now()
failed_steps = []  # steps whose worker raised; their rows stay NaN in `forecasts`

with ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(
            run_forecast_step,
            n,
            price_S,
            data_array,
            begin_eval,
            D,
            dates_S,
            WD,
            PRICE_S_LAGS,
            da_lag,
            data_no1.columns[1:],  # reg_names
            data_no1.columns[1:]   # data_columns
        )
        for n in range(N_s)
    ]
    # futures[i] was submitted for step i; keep the mapping so a failure can
    # be attributed to its step even though the worker never returned `n`.
    step_of = {fut: step for step, fut in enumerate(futures)}

    for future in as_completed(futures):
        try:
            n, gam_24h, gam_per_hour = future.result()
            forecasts[n, :, 0] = torch.tensor(gam_24h, dtype=forecasts.dtype, device=forecasts.device)
            forecasts[n, :, 1] = torch.tensor(gam_per_hour, dtype=forecasts.dtype, device=forecasts.device)
            # TODO: fill the remaining model slots (true price, expert, LightGBM)
            # once run_forecast_step returns them.
        except Exception as e:
            # Best effort: keep going, but record which step is missing.
            step = step_of[future]
            failed_steps.append(step)
            print(f"Thread crashed at step {step}: {e}")

# End timing
end_time = datetime.now()
duration_minutes = (end_time - init_time).total_seconds() / 60
print(f"\nParallel training duration (threaded): {duration_minutes:.2f} minutes")

if failed_steps:
    print(
        f"WARNING: {len(failed_steps)} of {N_s} forecast steps failed and remain NaN: "
        f"{sorted(failed_steps)}"
    )

In [None]:
# Export the forecasts in long format, one row per (sample, hour) pair.
# (v1 = version 1; bump the suffix as the number of experiments increases)
fc = forecasts.cpu().numpy()
N_s, S, n_models = fc.shape

# Flattened row i belongs to sample i // S and hour i % S, which matches the
# row-major flattening of each (N_s, S) forecast slice below.
samples, hours = np.divmod(np.arange(N_s * S), S)

data = {"sample": samples, "hours": hours}

# Column name -> index along the model axis of `fc`.
for name, m in {"gam_24h": 0, "gam_1h": 1}.items():
    data[name] = fc[..., m].ravel()

df = pd.DataFrame(data)
out_path = repo_root / "data" / "forecasts_gam24h_gam_1h_v1.csv"
df.to_csv(out_path, index=False)
print(f"Saved forecasts with columns: {df.columns.tolist()}")

In [None]:
# print(data_array.shape )
# print(price_S.shape )
# print(dates_S.shape )

# print(data_t_no1.shape)
# print(price_t_no1.shape)
# print(train_dates.shape)
# print(train_t_no1.shape)

### gam_24h fitting for all no1-no5 regions separately.

In [None]:
'''
  eval/test periods
  2023 - 365 days
  2024 - 366 days
'''

In [None]:
# Select the compute device: GPU when CUDA is available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

repo_root = Path.cwd().parents[1]
mapcodes = ["NO1", "NO2", "NO3", "NO4", "NO5"]

# Hold-out length in days passed to prepare_dataset_tensor:
# 2023 (365 days) + 2024 (366 days, leap year).
TEST_DAYS = 2 * 365 + 1

# Per-zone inputs, keyed by bidding-zone code. Each entry holds the raw
# DataFrame (with `time_utc` parsed as datetimes) together with the four
# objects returned by prepare_dataset_tensor for the same CSV file.
maps_dict = {}

for code in mapcodes:
    csv_path = repo_root / "data" / f"data_{code}.csv"
    df = pd.read_csv(csv_path, parse_dates=["time_utc"])
    data_t, train_t, train_dates, price_t = prepare_dataset_tensor(
        csv_path,
        tz="CET",
        seed=42,
        test_days=TEST_DAYS,
        dtype=torch.float64,
    )

    maps_dict[code] = {
        "df": df,
        "data_t": data_t,
        "train_t": train_t,
        "train_dates": train_dates,
        "price_t": price_t,
    }

# Fit the GAM-24h model for every zone separately and score it on the calendar
# year that follows the initial training window.
gam24_by_zone = {}  # zone code -> CPU tensor of forecasts, shape (N_s, S)
rmse_by_zone = {}   # zone code -> RMSE over the whole evaluation period

# Model configuration handed to run_forecast_step. It does not depend on the
# zone, so it is set once instead of on every loop iteration.
WD = [1, 6, 7]            # weekday codes (presumably used for weekday dummies - confirm)
PRICE_S_LAGS = [1, 2, 7]  # price lags (presumably in days - confirm)
DA_LAG = [0]              # day-ahead lag
S = 24                    # forecast values per day

for z in mapcodes:
    print(f"\n--- {z} ---")
    zone = maps_dict[z]
    price_S = zone["price_t"]
    data_array = zone["data_t"]
    dates_S = zone["train_dates"]
    feature_names = zone["df"].columns[1:]

    # Sorted index of every calendar day present in the raw data (not just
    # the training days), used to locate the anchor dates.
    full_dates = pd.DatetimeIndex(
        sorted(zone["df"]["time_utc"].dt.normalize().unique())
    )
    train_start_idx = full_dates.get_loc(pd.Timestamp("2019-01-01"))
    train_end_idx = full_dates.get_loc(pd.Timestamp("2023-12-31"))

    # Evaluation days: from the day after the training window to 31 December
    # of that same year (all of 2024 for the anchors above).
    eval_start_idx = train_end_idx + 1
    eval_year = full_dates[eval_start_idx].year
    eval_end_date = pd.Timestamp(f"{eval_year}-12-31")
    eval_end_idx = full_dates.get_loc(eval_end_date)
    N_s = eval_end_idx - eval_start_idx + 1

    # Forecasts for THIS zone, NaN-initialised so failed days stay visible.
    forecasts_zone = torch.full((N_s, S, 1), float("nan"),
                                dtype=torch.float64, device=device)

    failed_steps = []
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                run_forecast_step,
                n,
                price_S,
                data_array,
                train_start_idx,
                train_end_idx,
                full_dates,
                WD,
                PRICE_S_LAGS,
                DA_LAG,
                feature_names,   # reg_names
            )
            for n in range(N_s)
        ]
        # futures[i] was submitted for step i; keep the mapping so a failure
        # can be attributed to its day even though the worker returned nothing.
        step_of = {fut: step for step, fut in enumerate(futures)}

        for fut in as_completed(futures):
            try:
                n, gam24 = fut.result()
                forecasts_zone[n, :, 0] = torch.tensor(gam24, dtype=forecasts_zone.dtype, device=device)
            except Exception as e:
                # Best effort: keep the other days, but record what is missing.
                step = step_of[fut]
                failed_steps.append(step)
                failed_day = full_dates[eval_start_idx + step].date()
                print(f"Thread crashed (zone {z}, step {step}, forecast day {failed_day}): {e}")

    if failed_steps:
        print(
            f"WARNING: {len(failed_steps)} of {N_s} forecast days failed for zone {z}; "
            "their forecasts stay NaN, so the RMSE below is NaN."
        )

    # Observed prices for the evaluation days, expected shape (N_s, S).
    # NOTE(review): assumes the rows of price_S line up with the day positions
    # of `full_dates` - confirm against prepare_dataset_tensor.
    true_vals = price_S[eval_start_idx : eval_end_idx + 1].to(device)
    assert true_vals.shape == forecasts_zone.shape[:2], (
        f"zone {z}: true prices have shape {tuple(true_vals.shape)}, "
        f"expected {(N_s, S)}"
    )

    # RMSE over all evaluation days and hours.
    diff = forecasts_zone[:, :, 0] - true_vals
    rmse = torch.sqrt((diff**2).mean()).item()

    print(f"Zone {z} GAM-24h RMSE: {rmse:.4f}")

    gam24_by_zone[z] = forecasts_zone[:, :, 0].cpu()
    rmse_by_zone[z] = rmse


--- NO1 ---
Loop   0: train 2019-01-01 00:00:00 -> 2023-12-31 00:00:00, forecast 2024-01-01 00:00:00
Loop   1: train 2019-01-01 00:00:00 -> 2024-01-01 00:00:00, forecast 2024-01-02 00:00:00
Loop   2: train 2019-01-01 00:00:00 -> 2024-01-02 00:00:00, forecast 2024-01-03 00:00:00
Loop   3: train 2019-01-01 00:00:00 -> 2024-01-03 00:00:00, forecast 2024-01-04 00:00:00
Loop   4: train 2019-01-01 00:00:00 -> 2024-01-04 00:00:00, forecast 2024-01-05 00:00:00
Loop   5: train 2019-01-01 00:00:00 -> 2024-01-05 00:00:00, forecast 2024-01-06 00:00:00
Loop   6: train 2019-01-01 00:00:00 -> 2024-01-06 00:00:00, forecast 2024-01-07 00:00:00
Loop   7: train 2019-01-01 00:00:00 -> 2024-01-07 00:00:00, forecast 2024-01-08 00:00:00
Loop   8: train 2019-01-01 00:00:00 -> 2024-01-08 00:00:00, forecast 2024-01-09 00:00:00
Loop  10: train 2019-01-01 00:00:00 -> 2024-01-10 00:00:00, forecast 2024-01-11 00:00:00
Loop   9: train 2019-01-01 00:00:00 -> 2024-01-09 00:00:00, forecast 2024-01-10 00:00:00
Loop  11