In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# %% load packages
import locale
import sys
import os
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import optuna
import requests
import torch
import random
from sqlalchemy import create_engine,inspect
from pathlib import Path
import urllib.parse
import pyarrow
from calendar import day_abbr
import calendar
from typing import Tuple, Union, Dict, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from pygam import LinearGAM
from datetime import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import sys
from pathlib import Path

# Make the project importable (`from srs... import ...`) without hardcoding a
# machine-specific absolute path: walk up from the working directory until a
# folder that contains the `srs` package is found. If none is found, fall back
# to the working directory itself.
_cwd = Path.cwd()
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "srs").is_dir()),
    _cwd,
)

# Only append once so re-running the cell does not grow sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print("Working directory:", os.getcwd())



Working directory: c:\Users\amaguaya\OneDrive - Kienzle Automotive GmbH\Desktop\DLiE\repo\DLiE_forecast_13_06_25\srs\notebooks


In [77]:
from srs.utils.tutor_utils import prepare_dataset_tensor, forecasting_study,\
  plot_daily_profile,plot_hour_comparison, build_multiwindow_experts, tune_ewa_eta, \
  ewa_aggregate_forecasts, compute_error_table, tune_expert_window, \
  run_expert_window_test, build_regression_matrix, SimpleMLP, train_mlp, \
  prepare_train_test_tensors, build_mlp_rolling_forecasts, tune_mlp_hyperparameters, \
  DST_trafo, prepare_dataset_tensor_modified

from srs.utils.our_utils import run_forecast_step, run_forecast_step_modified, prepare_data_forTraining, run_forecast_step_modified_JustTraining, get_holidays_dummy
from srs.collect_data.setup import setup_seed, get_device
from srs.collect_data.entsoe_data import create_entsoe_engine, get_tables, get_spec, \
  get_market_divisions,get_map_codes,get_map_codes_starting_with, get_resolution_codes, \
    prepare_generation, prepare_load,prepare_price, prepare_unavailability, \
    prepare_filling_rate_hydro, prepare_physical_flow, prepare_installed_capacity
from srs.collect_data.datastream_data import create_datastream_engine, get_tables, \
  prepare_datastream
from srs.collect_data.dwd_mosmix_data import fetch_region_weather, prepare_weather
from srs.collect_data.merge_data import merge_datasets, build_training_dataset

from srs.models.light_gbm import forecast_lgbm_whole_sample_optuna_selectBestOptions


### models_24h: fitted separately for each of the NO1–NO5 regions

In [None]:
# Reference note only (a bare string, no logic): number of days in each
# evaluation/test year. Together they give the 731 test days used below.
'''
  eval/test periods
  2023 - 365 days
  2024 - 366 days
'''


## EXPERIMENT USING DEFAULT HYPERPARAMETER VALUES

In [307]:
# Select the compute device (GPU when available, otherwise CPU).
from srs.utils.our_utils import run_forecast_step_modified_JustTraining


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **************************************
# define dates for training and evaluation
# **************************************
INIT_DATE_EXPERIMENTS = '2019-01-01'
INIT_TEST_DATE = '2023-01-01'
FINAL_DATE_EXPERIMENTS = '2024-12-31'
# Both end points belong to the test period, hence the +1
# (2023-01-01 .. 2024-12-31 -> 731 days).
n_days_test = (pd.to_datetime(FINAL_DATE_EXPERIMENTS) - pd.to_datetime(INIT_TEST_DATE)).days + 1

# Assumes the notebook runs from <repo>/srs/notebooks, so two levels up is the repo root.
repo_root = Path.cwd().parents[1]
mapcodes = ["NO1","NO2","NO3","NO4","NO5"]
maps_dict = {}

# holidays dataset
holidays_df = pd.read_csv(repo_root / "data" / "holidays_2000_2030.csv", parse_dates=["Date"])

for code in mapcodes:
    csv_path = repo_root / "data" / f"data_{code}.csv"
    # Raw frame; kept in maps_dict and used later for its column names.
    df = pd.read_csv(csv_path, parse_dates=["time_utc"])
    data_t, train_t, train_dates, price_t = prepare_dataset_tensor_modified(
        csv_path,
        tz="CET",
        seed=42,
        test_days=n_days_test,  # e.g. 1*365 for one year of test data, and so on
        dtype=torch.float64,
    )

    # Locate the experiment window on the (sorted) date axis.
    # NOTE(review): the positions come from the sorted dates but are applied to
    # data_t / train_dates in their original order; this presumes the helper
    # already returns them chronologically - confirm.
    train_dates_series = pd.DatetimeIndex(sorted(train_dates))
    id_init_exp = train_dates_series.get_loc(pd.Timestamp(INIT_DATE_EXPERIMENTS))
    id_init_test_period = train_dates_series.get_loc(pd.Timestamp(INIT_TEST_DATE))
    id_end_exp = train_dates_series.get_loc(pd.Timestamp(FINAL_DATE_EXPERIMENTS))

    # Keep only the days inside [INIT_DATE_EXPERIMENTS, FINAL_DATE_EXPERIMENTS].
    data_t = data_t[id_init_exp:(id_end_exp+1), :,:]
    train_dates = pd.Series(train_dates[id_init_exp:(id_end_exp+1)])

    # Number of test days, computed from positions on the same (unsliced) date
    # axis. The previous formula mixed the sliced tensor length with an
    # unsliced position and was only right when id_init_exp == 0.
    print(f"days for test data ..{id_end_exp - id_init_test_period + 1}")

    maps_dict[code] = {
        "df": df,
        "data_t": data_t,
        "train_t": train_t,
        "train_dates": train_dates,
        "price_t": price_t
    }

# Result containers, filled per bidding zone by the forecasting loop.
preds_by_zone, true_values_by_zone, rmse_by_zone = {}, {}, {}

# Models to train for every zone.
LS_MODELS_TRAIN = [
    'gam_24hAhead',
    'lgbm_24hAhead_defaultHyper',
]

# Variables that get a smooth spline; per the original note, the remaining
# variables use a linear spline.
LS_VAR_APPLY_SMOOTH_SPLINE = [
    "Price_lag_1",
    "Load_DA_lag_0",
    "WindOn_DA_lag_0",
    "NGas_lag_2",
    "Oil_lag_2",
    "EUA_lag_2",
]

# When True, model outputs are treated as price changes and the lagged price
# is added back before scoring.
MODEL_USING_FIRST_DIFF = True

import traceback  # stdlib; prints the stack trace of a failed forecast step

# Worker threads: half of the logical cores, but at least one.
# os.cpu_count() may return None, so fall back to a small default.
N_WORKERS = max(1, (os.cpu_count() or 2) // 2)

for z in mapcodes:
    print(f"\n--- {z} ---")
    price_S       = maps_dict[z]["price_t"]
    data_array    = maps_dict[z]["data_t"]
    full_dates    = maps_dict[z]["train_dates"]  # per the original note: the modified prepare function returns all dates
    feature_names = maps_dict[z]["df"].columns[1:]  # every CSV column except the first
    full_date_series = pd.DatetimeIndex(sorted(full_dates))

    # Evaluation window: INIT_TEST_DATE .. FINAL_DATE_EXPERIMENTS, both inclusive.
    train_start_idx = full_date_series.get_loc(pd.Timestamp(INIT_DATE_EXPERIMENTS))
    id_init_eval = full_date_series.get_loc(pd.Timestamp(INIT_TEST_DATE))
    id_end_eval = full_date_series.get_loc(pd.Timestamp(FINAL_DATE_EXPERIMENTS))
    eval_start_idx = id_init_eval
    eval_end_idx  = id_end_eval
    N_s            = (eval_end_idx - eval_start_idx) + 1  # number of forecast days (inclusive range)
    full_dates = pd.to_datetime(full_dates)

    # Feature configuration handed to prepare_data_forTraining.
    WD             = [1,6,7]     # weekday dummies
    PRICE_S_LAGS   = [1,2,7]     # price lags
    DA_LAG         = [0]         # day-ahead variable lags
    S              = 24          # values per day (second axis of the forecast tensor)
    FUEL_LAGS      = [2]         # fuel price lags

    # prepare data
    all_vars_names, Xy_t = prepare_data_forTraining(
            data_array,
            full_dates,
            WD,
            PRICE_S_LAGS, DA_LAG, feature_names,
            FUEL_LAGS,
            full_dates,
            holidays_df,
            model_first_diff_price = MODEL_USING_FIRST_DIFF,
            dayahead=1, daybefore= 0.5)

    print(f"the models will use these features: {all_vars_names[1:]}")

    # Forecasts for THIS zone: (day, hour, model). NaN marks steps that did not
    # produce a result; they are masked out of the RMSE below.
    forecasts_zone = torch.full((N_s, S, len(LS_MODELS_TRAIN)), float("nan"),
                                dtype=torch.float64, device=device)

    # thread pool: one task per forecast day
    with ThreadPoolExecutor(N_WORKERS) as executor:
        # Map each future to its step so a failure can be attributed.
        futures = {
            executor.submit(
                run_forecast_step_modified_JustTraining,
                n,
                data_array      = Xy_t,
                train_start_idx = train_start_idx,
                train_end_idx   = id_init_eval - 1,
                full_dates      = full_dates,
                feature_names   = all_vars_names,   # reg_names
                ls_models       = LS_MODELS_TRAIN, # models to estimate
                apply_smoo_spline_over_varList = LS_VAR_APPLY_SMOOTH_SPLINE
            ): n
            for n in range(N_s)
        }

        for fut in as_completed(futures):
            step = futures[fut]
            try:
                n, ls_res_pred = fut.result()
                for idx, pred_vc in enumerate(ls_res_pred):
                    forecasts_zone[n, :, idx] = pred_vc.detach().to(
                        dtype=forecasts_zone.dtype, device=forecasts_zone.device
                    )
            except Exception as e:
                # Best effort: keep going, the failed day stays NaN.
                print(f"Thread crashed (forecast step {step}): {e}")
                traceback.print_exc()

    # NOTE(review): price_S comes straight from prepare_dataset_tensor_modified
    # and, unlike data_t, was not re-sliced to start at INIT_DATE_EXPERIMENTS.
    # The indices below line up with it only if the raw data starts on that
    # date - confirm.
    true_vals = price_S[eval_start_idx : eval_end_idx + 1].to(device)
    true_flat = true_vals.reshape(-1)
    true_values_by_zone[z] = true_flat.cpu()

    preds_tuple_ls = []
    rmse_tuple_ls = []
    for idx_, md_ in enumerate(LS_MODELS_TRAIN):
        forecasted_delta = forecasts_zone[:, :, idx_].reshape(-1)
        print(f"results of model.... {md_}")
        if MODEL_USING_FIRST_DIFF:
            # The model predicts a price change: add the lagged price back.
            # Column 1 of Xy_t is presumably 'Price_lag_1' (first entry after
            # the target in all_vars_names) - TODO confirm.
            last_price = (
                Xy_t[eval_start_idx : eval_end_idx + 1, :, 1]
                .reshape(-1)
                .to(forecasted_delta.device)  # avoid a CPU/GPU mismatch in the sum
            )
            pred_adj = forecasted_delta + last_price
            preds_tuple_ls.append((last_price, pred_adj))
        else:
            pred_adj = forecasted_delta
            preds_tuple_ls.append((pred_adj, pred_adj))

        # RMSE over the entries where both forecast and truth are available.
        diff = pred_adj - true_flat
        diff = diff[~torch.isnan(diff)]
        rmse = torch.sqrt((diff**2).mean()).item()
        rmse_tuple_ls.append(rmse)
        print(f"Zone {z}, model {md_}, RMSE: {rmse:.4f}")

    preds_by_zone[z] = preds_tuple_ls
    rmse_by_zone[z]  = rmse_tuple_ls
    

days for test data ..731
days for test data ..731
days for test data ..731
days for test data ..731
days for test data ..731

--- NO1 ---
the models will use these features: ['Price_lag_1', 'Price_lag_2', 'Price_lag_7', 'Load_DA_lag_0', 'WindOn_DA_lag_0', 'WD_1', 'WD_6', 'WD_7', 'Coal_lag_2', 'NGas_lag_2', 'Oil_lag_2', 'EUA_lag_2', 'pct_chg_Load_DA', 'lag168_Load_DA', 'lag1_price_2nd_diff', 'hour_7to9_17to18_dummy', 'sin_hour', 'cos_hour', 'volatility_24h_lg1', 'volatility_pct_24h_lg1', 'holiday_dummy']
Loop   0: train 2019-01-01 00:00:00 -> 2022-12-31 00:00:00, forecast 2023-01-01 00:00:00
  dat_slice shape: torch.Size([1462, 24, 22])  → flatten count = 35088
Loop   1: train 2019-03-03 00:00:00 -> 2023-01-01 00:00:00, forecast 2023-01-02 00:00:00
  dat_slice shape: torch.Size([1402, 24, 22])  → flatten count = 33648
Loop   2: train 2019-03-04 00:00:00 -> 2023-01-02 00:00:00, forecast 2023-01-03 00:00:00
  dat_slice shape: torch.Size([1402, 24, 22])  → flatten count = 33648
Loop   3: t

In [197]:
# compute RMSE
# diff = Xy_t[eval_start_idx : eval_end_idx + 1,:,1].reshape(-1) + forecasts_zone.reshape(-1) - true_vals.reshape(-1)
# rmse = torch.sqrt((diff**2).mean()).item()

# print(f"Zone {z} LBGM-24h RMSE: {rmse:.4f}")

# lgbm24_by_zone[z] = forecasts_zone[:, :, 0].cpu()
# rmse_by_zone[z]  = rmse

# Zone NO1 LGBM-24h RMSE: 8.0664, using the 7/8 am dummy
# 8.12 after introducing the volatility variable
# Zone NO1 LGBM-24h RMSE: 7.7831, using hourly cos/sin and just 2 volatility vars
# Zone NO1 LGBM-24h RMSE: 7.8722, including volatility of load
# Zone NO1 LGBM-24h RMSE: 7.5839, including some vars and the lag of the second difference

In [296]:
import plotly.graph_objects as go
import plotly.io as pio

# Force browser rendering
pio.renderers.default = 'browser'

HOURS_PER_DAY = 24
PLOT_WINDOW_HOURS = (24 * 7 * 9) + (2 * 24)  # last 9 weeks plus 2 days

# `z`, `N_s` and `preds_by_zone` are left over from the forecasting loop, so
# this cell plots the LAST zone that was processed.
zone_data_t = maps_dict[z]["data_t"]

# Actual prices: feature 0 of the data tensor, flattened to an hourly series.
flattened_series = zone_data_t[:, :, 0].reshape(-1).cpu().numpy()

# Hourly timestamps covering the whole experiment window.
start_datetime = pd.to_datetime(INIT_DATE_EXPERIMENTS)
end_datetime = pd.to_datetime(FINAL_DATE_EXPERIMENTS) + pd.Timedelta(hours=23)
datetime_index = pd.date_range(start=start_datetime, end=end_datetime, freq=pd.Timedelta(hours=1))

# Slice last N_s*24 points (the evaluation period)
flattened_series = flattened_series[-(N_s * HOURS_PER_DAY):]
datetime_index = datetime_index[-(N_s * HOURS_PER_DAY):]

# One forecast series per model. preds_by_zone[z] holds (base, adjusted) pairs
# in LS_MODELS_TRAIN order; `adjusted` is already back on the price level.
# Flattening forecasts_zone across all models at once would not match the
# length of the price series when more than one model is trained.
predicted_by_model = {
    model_name: adjusted.detach().cpu().numpy()[-(N_s * HOURS_PER_DAY):]
    for model_name, (_, adjusted) in zip(LS_MODELS_TRAIN, preds_by_zone[z])
}
# Kept for backward compatibility: forecasts of the first model.
predicted_values = predicted_by_model[LS_MODELS_TRAIN[0]]

# Create interactive Plotly figure
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=datetime_index[-PLOT_WINDOW_HOURS:],
    y=flattened_series[-PLOT_WINDOW_HOURS:],
    mode='lines',
    name='Actual Price',
    line=dict(width=2)
))

for model_name, model_forecast in predicted_by_model.items():
    fig.add_trace(go.Scatter(
        x=datetime_index[-PLOT_WINDOW_HOURS:],
        y=model_forecast[-PLOT_WINDOW_HOURS:],
        mode='lines',
        name=f'Forecasted Price ({model_name})',
        line=dict(dash='dash', width=2)
    ))

fig.update_layout(
    title="Electricity Price Forecast vs Actual (Hourly)",
    xaxis_title="Time",
    yaxis_title="Price",
    hovermode='x unified',
    template='plotly_white',
    height=500
)

# Open in browser
fig.show()