In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from optuna.storages import RDBStorage
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import multiprocessing
from tqdm import tqdm


from src.dataset import get_datasets
from src.feature import create_features, split
from src.prediction import init_prophet_model
from src.benchmark import extend_by_predictions_and_samples

Importing plotly failed. Interactive plots will not work.


In [None]:
# Columns kept from the merged dataset: the "Price" target followed by the raw
# generation and weather inputs.
COLUMNS = [
    "Price",
    "Hydro", "Pumped storage generation", "Solar",
    "Wind offshore", "Wind onshore",
    "temperature_2m", "precipitation", "wind_speed_100m", "direct_radiation",
]

# Number of rows per forecast window (24 consecutive hourly rows).
WINDOW_SIZE = 24

# Pool of features available for selection: every raw input (all of COLUMNS
# except the target), the calendar features, and two ma_* columns for each
# horizon from 1 to 14 days.
CANDIDATE_FEATURES = COLUMNS[1:] + ['hour', 'dayofweek', 'dayofyear']

for i in range(1, 15):
    CANDIDATE_FEATURES += [
        f'ma_{i}_days',
        f'ma_{i}_days_pumped_storage_generation',
    ]

# dict keys are unique and keep insertion order, so this drops any duplicate
# names without reordering the rest.
CANDIDATE_FEATURES = list(dict.fromkeys(CANDIDATE_FEATURES))

In [None]:
# Fixed feature subset that the objective function trains on in place of a
# per-trial feature search.
BEST_FEATURES = [
    # Generation and weather inputs
    "Pumped storage generation", "Solar", "Wind offshore",
    "temperature_2m", "wind_speed_100m",
    # Calendar features
    "hour", "dayofweek", "dayofyear",
    # ma_* columns, ordered by horizon in days
    "ma_1_days", "ma_1_days_pumped_storage_generation",
    "ma_2_days", "ma_2_days_pumped_storage_generation",
    "ma_3_days",
    "ma_4_days_pumped_storage_generation",
    "ma_5_days_pumped_storage_generation",
    "ma_6_days", "ma_7_days",
    "ma_8_days_pumped_storage_generation",
    "ma_9_days_pumped_storage_generation",
    "ma_10_days", "ma_11_days",
    "ma_12_days_pumped_storage_generation",
    "ma_13_days",
    "ma_14_days_pumped_storage_generation",
]


import sys

# Send everything printed from here on, errors included, to output.log.
#
# The file is opened line-buffered (buffering=1) so that each completed line
# is written immediately. With the default block buffering, text still sitting
# in the buffer when worker processes are forked is copied into every child
# and written again when that child flushes its streams on exit, which
# duplicates log lines. The encoding is pinned so the log does not depend on
# the locale of the machine running the notebook.
_log_file = open("output.log", "w", buffering=1, encoding="utf-8")
sys.stdout = _log_file  # Redirects print output to a file
sys.stderr = _log_file  # Redirect errors as well


print("All outputs will be saved to output.log")

# =============================================================================
# Precompute the Train/Test and Forecast Windows Outside the Objective
# =============================================================================
# One (X_train_df, X_predict_df, X_eval_df, y_actual) tuple per forecast window.
windowed_sets = []

# Number of rows (14 * 24) placed in each window's evaluation split. It does
# not depend on the window, so it is defined once instead of being reassigned
# on every loop iteration.
VALIDATION_SET_SIZE = 24 * 14

# Get and split datasets
# NOTE(review): `eval` shadows the builtin of the same name. The name is kept
# because other cells may refer to it; consider renaming it everywhere.
merged_df, _ = get_datasets()
train, eval, test, benchmark, _, _, _ = split(merged_df)

# Combine all available training data
training_set = pd.concat([train, eval, test])[COLUMNS]
benchmarking_set = benchmark[COLUMNS]

# Iterate through forecast windows for rolling prediction. The first window
# starts WINDOW_SIZE rows into the benchmark set, and each later window starts
# WINDOW_SIZE rows after the previous one.
for window_start in tqdm(range(WINDOW_SIZE, len(benchmarking_set) - WINDOW_SIZE, WINDOW_SIZE), desc="Preparing windowed datasets"):
    y_actual = benchmarking_set.iloc[window_start : window_start + WINDOW_SIZE]["Price"]
    # History available at forecast time: all training data plus the benchmark
    # rows that precede this window.
    dataset_extended = pd.concat((training_set, benchmarking_set.iloc[:window_start]))
    next_day = dataset_extended.index[-1] + pd.DateOffset(hours=1)

    # The window must start exactly one hour after the last known row;
    # otherwise there is a gap in the data and the window is skipped.
    if next_day != y_actual.index[0]:
        print(f"\nSkipping prediction for {next_day} due to missing entries.\n")
        continue

    dataset_extended_ps = extend_by_predictions_and_samples(dataset_extended, dataset_extended.index[-1])
    dataset_extended_features = create_features(dataset_extended_ps)

    # Split by position from the end of the feature frame:
    #   - last WINDOW_SIZE rows: excluded from both training and evaluation
    #     (presumably the rows appended for the forecast window - TODO confirm
    #     against extend_by_predictions_and_samples);
    #   - VALIDATION_SET_SIZE rows before those: evaluation split;
    #   - everything earlier: training split.
    X_train_df = dataset_extended_features.iloc[:-WINDOW_SIZE - VALIDATION_SET_SIZE]
    X_eval_df = dataset_extended_features.iloc[-WINDOW_SIZE - VALIDATION_SET_SIZE : -WINDOW_SIZE]
    # Rows to predict are looked up by the timestamps of the actual values.
    X_predict_df = dataset_extended_features.reindex(y_actual.index)
    windowed_sets.append((X_train_df, X_predict_df, X_eval_df, y_actual))

import os

# Connection string for the Optuna storage database. It can be supplied
# through the OPTUNA_DB_URL environment variable so that credentials do not
# have to live in the notebook; the literal below is the fallback used when
# the variable is not set.
DB_URL = os.environ.get(
    "OPTUNA_DB_URL",
    "postgresql://optuna_user:postgres@localhost/optuna_db",
)
# Number of worker processes started in parallel.
NUM_WORKERS = 30
# Overall trial budget; each worker's share is N_TRIALS_TOTAL // NUM_WORKERS.
N_TRIALS_TOTAL = 500

def objective(trial, worker_name, progress):
    """
    Score one Optuna trial for XGBoost as the mean RMSE over all forecast windows.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.
        worker_name: Label of the calling worker, shown in the progress bar.
        progress: Progress text shown next to the worker name.

    Returns:
        The mean of the per-window RMSE values, or ``float('inf')`` when
        ``windowed_sets`` yields no windows.
    """
    # Per-trial feature selection (one suggest_categorical per entry of
    # CANDIDATE_FEATURES) is disabled; the fixed BEST_FEATURES list is used.
    feature_cols = BEST_FEATURES

    # Hyperparameter search space. early_stopping_rounds is sampled with the
    # other parameters and handed to the XGBRegressor constructor; the
    # evaluation set it monitors is supplied in fit() below.
    xgb_params = dict(
        base_score=trial.suggest_float('base_score', 0.0, 1.0),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        max_depth=trial.suggest_int('max_depth', 3, 100),
        n_estimators=trial.suggest_int('n_estimators', 50, 1000),
        gamma=trial.suggest_float('gamma', 0, 5.0),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        max_delta_step=trial.suggest_int('max_delta_step', 0, 10),
        early_stopping_rounds=trial.suggest_int('early_stopping_rounds', 10, 250),
    )

    # Rolling forecast: a fresh model is fitted and scored for every window.
    window_rmses = []
    windows = tqdm(windowed_sets, desc=f"{worker_name} - {progress}")
    for train_df, predict_df, eval_df, actual in windows:
        regressor = xgb.XGBRegressor(
            objective='reg:squarederror',
            eval_metric='rmse',
            random_state=42,
            n_jobs=2,
            **xgb_params,
        )
        regressor.fit(
            train_df[feature_cols],
            train_df["Price"],
            eval_set=[(eval_df[feature_cols], eval_df["Price"])],
            verbose=False,
        )

        forecast = regressor.predict(predict_df[feature_cols])
        window_rmses.append(mean_squared_error(actual, forecast) ** 0.5)

    if not window_rmses:
        return float('inf')
    return np.mean(window_rmses)

def worker_process(study_name, n_trials=None):
    """
    Run a bounded number of trials of the shared study in the current process.

    The previous implementation looped with ``while True`` and relied on
    ``study.ask()`` returning ``None`` to stop. ``Study.ask()`` always returns
    a trial, so the loop only ended on an exception and the parent's
    ``join()`` never returned. Each worker now runs a fixed quota instead.

    Args:
        study_name: Name of the study to create or load from ``DB_URL``.
        n_trials: Maximum number of trials this worker runs. Defaults to
            ``N_TRIALS_TOTAL // NUM_WORKERS`` (at least 1), the same figure
            shown in the progress text.

    Raises:
        Exception: Whatever ``objective`` raises is re-raised after the trial
            has been reported as failed.
    """
    if n_trials is None:
        n_trials = max(1, N_TRIALS_TOTAL // NUM_WORKERS)

    # Each process opens its own storage connection.
    storage = RDBStorage(DB_URL)
    study = optuna.create_study(study_name=study_name, storage=storage, direction="minimize", load_if_exists=True)
    worker_name = multiprocessing.current_process().name

    for run in range(1, n_trials + 1):
        try:
            trial = study.ask()
        except Exception as e:
            print(f"Database error: {e}")
            break

        progress = f"Run {run} of {n_trials}"
        try:
            value = objective(trial, worker_name, progress)
        except Exception:
            # Report the failure so the trial does not stay in the RUNNING
            # state in the storage, then let the error surface as before.
            study.tell(trial, state=optuna.trial.TrialState.FAIL)
            raise
        study.tell(trial, value)

if __name__ == "__main__":
    # Create the shared study, or attach to it if it already exists.
    study_name = "XGB_hyperopt_params"
    study = optuna.create_study(
        direction="minimize",
        study_name=study_name,
        storage=DB_URL,
        load_if_exists=True,
    )

    # One worker process per slot; launch them all, then wait for all of them.
    processes = [
        multiprocessing.Process(target=worker_process, args=(study_name,))
        for _ in range(NUM_WORKERS)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # Report the best trial recorded in the study.
    best_trial = study.best_trial
    print(f"Best RMSE: {best_trial.value}")
    print("Best hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")


  0%|          | 0/30 [00:00<?, ?it/s]

Optuna Trials:   0%|          | 0/500 [00:00<?, ?it/s]

[I 2025-02-16 21:50:31,987] Trial 8 finished with value: 137.34579957043564 and parameters: {'base_score': 0.07550256199518257, 'learning_rate': 0.0021228831950554083, 'max_depth': 40, 'n_estimators': 335, 'gamma': 0.2623531092500475, 'min_child_weight': 3, 'subsample': 0.5393459596520509, 'colsample_bylevel': 0.5346442829742823, 'max_delta_step': 1, 'early_stopping_rounds': 123}. Best is trial 8 with value: 137.34579957043564.
[I 2025-02-16 21:50:34,580] Trial 3 finished with value: 135.36025937339815 and parameters: {'base_score': 0.48021377908335106, 'learning_rate': 0.0019409126308456031, 'max_depth': 50, 'n_estimators': 303, 'gamma': 1.9203122727100959, 'min_child_weight': 2, 'subsample': 0.994482939142921, 'colsample_bylevel': 0.5143726328994345, 'max_delta_step': 4, 'early_stopping_rounds': 62}. Best is trial 3 with value: 135.36025937339815.
[I 2025-02-16 21:51:05,223] Trial 10 finished with value: 135.85561569461046 and parameters: {'base_score': 0.19288989535520484, 'learning

In [None]:
# Fixed feature subset that the objective function passes to the Prophet
# model as regressors, in place of a per-trial feature search.
BEST_FEATURES = [
    "hour",
    # ma_* columns
    "ma_11_days",
    "ma_12_days", "ma_12_days_pumped_storage_generation",
    "ma_13_days_pumped_storage_generation",
    "ma_1_days",
    "ma_2_days", "ma_2_days_pumped_storage_generation",
    "ma_3_days",
    "ma_4_days_pumped_storage_generation",
    "ma_5_days", "ma_5_days_pumped_storage_generation",
    "ma_6_days", "ma_6_days_pumped_storage_generation",
    "ma_8_days", "ma_8_days_pumped_storage_generation",
    # Weather and generation inputs
    "precipitation",
    "Solar",
    "temperature_2m",
    "Wind offshore",
    "Wind onshore",
]

In [None]:

import sys

# Send everything printed from here on, errors included, to output.log.
#
# The file is opened line-buffered (buffering=1) so that each completed line
# is written immediately. With the default block buffering, text still sitting
# in the buffer when worker processes are forked is copied into every child
# and written again when that child flushes its streams on exit, which
# duplicates log lines. The encoding is pinned so the log does not depend on
# the locale of the machine running the notebook.
_log_file = open("output.log", "w", buffering=1, encoding="utf-8")
sys.stdout = _log_file  # Redirects print output to a file
sys.stderr = _log_file  # Redirect errors as well

import os
# Sets the TMPDIR environment variable for this process and its children.
# NOTE(review): the path is machine-specific - confirm it exists where this runs.
os.environ["TMPDIR"] = "/home/user/tmp"


print("All outputs will be saved to output.log")

# =============================================================================
# Precompute the Train/Test and Forecast Windows Outside the Objective
# =============================================================================
# One (X_train_df, X_predict_df, y_actual) tuple per forecast window.
windowed_sets = []

# Load the merged data and split it; the first three parts together form the
# training history, the fourth is the benchmark period that gets forecast.
merged_df, _ = get_datasets()
train, eval, test, benchmark, _, _, _ = split(merged_df)

training_set = pd.concat([train, eval, test])[COLUMNS]
benchmarking_set = benchmark[COLUMNS]

# Window start offsets: the first window starts WINDOW_SIZE rows into the
# benchmark set, each later one WINDOW_SIZE rows further on, stopping
# WINDOW_SIZE * 15 rows before the end.
window_starts = range(WINDOW_SIZE, len(benchmarking_set) - (WINDOW_SIZE * 15), WINDOW_SIZE)

for window_start in tqdm(window_starts, desc="Preparing windowed datasets"):
    window_end = window_start + WINDOW_SIZE
    y_actual = benchmarking_set["Price"].iloc[window_start:window_end]

    # History known at forecast time: training data plus earlier benchmark rows.
    dataset_extended = pd.concat([training_set, benchmarking_set.iloc[:window_start]])
    last_known = dataset_extended.index[-1]
    next_day = last_known + pd.DateOffset(hours=1)

    if next_day == y_actual.index[0]:
        # The window follows the history without a gap: build its features.
        dataset_extended_ps = extend_by_predictions_and_samples(dataset_extended, last_known)
        dataset_extended_features = create_features(dataset_extended_ps)

        # Train on everything except the last WINDOW_SIZE rows; rows to
        # predict are looked up by the timestamps of the actual values.
        X_train_df = dataset_extended_features.iloc[:-WINDOW_SIZE]
        X_predict_df = dataset_extended_features.reindex(y_actual.index)

        windowed_sets.append((X_train_df, X_predict_df, y_actual))
    else:
        print(f"\nSkipping prediction for {next_day} due to missing entries.\n")

import os

# Connection string for the Optuna storage database. It can be supplied
# through the OPTUNA_DB_URL environment variable so that credentials do not
# have to live in the notebook; the literal below is the fallback used when
# the variable is not set.
DB_URL = os.environ.get(
    "OPTUNA_DB_URL",
    "postgresql://optuna_user:postgres@localhost/optuna_db",
)
# Number of worker processes started in parallel.
NUM_WORKERS = 40
# Overall trial budget; each worker's share is N_TRIALS_TOTAL // NUM_WORKERS.
N_TRIALS_TOTAL = 80

def objective(trial, worker_name, progress):
    """
    Score one Optuna trial for Prophet as the mean RMSE over all forecast windows.

    Args:
        trial: Optuna trial used to sample the Prophet hyperparameters.
        worker_name: Label of the calling worker, used in progress messages.
        progress: Progress text appended to each progress message.

    Returns:
        The mean of the per-window RMSE values, or ``float('inf')`` when
        ``windowed_sets`` is empty.
    """
    # Per-trial feature selection (one suggest_categorical per entry of
    # CANDIDATE_FEATURES) is disabled; the fixed BEST_FEATURES list is used.
    regressors = BEST_FEATURES

    prophet_params = dict(
        # Trend flexibility
        changepoint_prior_scale=trial.suggest_float("changepoint_prior_scale", 0.001, 0.5, log=True),
        # Share of the history in which changepoints may be placed
        changepoint_range=trial.suggest_float("changepoint_range", 0.8, 0.95),
        seasonality_mode=trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"]),
        # Seasonality strength
        seasonality_prior_scale=trial.suggest_float("seasonality_prior_scale", 0.01, 10, log=True),
        # Fourier orders for the yearly, weekly and daily seasonalities
        yearly_seasonality=trial.suggest_int("yearly_seasonality", 5, 20),
        weekly_seasonality=trial.suggest_int("weekly_seasonality", 5, 20),
        daily_seasonality=trial.suggest_int("daily_seasonality", 5, 20),
        # Holiday effects regularization
        holidays_prior_scale=trial.suggest_float("holidays_prior_scale", 0.01, 10, log=True),
        # Number of changepoints
        n_changepoints=trial.suggest_int("n_changepoints", 10, 50),
        # Prediction interval width
        interval_width=trial.suggest_float("interval_width", 0.7, 0.95),
    )

    total_windows = len(windowed_sets)
    window_rmses = []
    for window_no, (train_df, predict_df, actual) in enumerate(windowed_sets, start=1):
        tqdm.write(f"{worker_name} processing window {window_no}/{total_windows} - {progress}")

        # Training frame in the ds / y layout: the index becomes the "ds"
        # column, "Price" becomes "y", and incomplete rows are dropped.
        train_frame = (
            train_df[["Price", *regressors]]
            .reset_index()
            .rename(columns={"index": "ds", "Price": "y"})
            .dropna()
        )
        # Prediction frame: regressors only, with the index as "ds".
        predict_frame = predict_df[regressors].reset_index()
        predict_frame = predict_frame.rename(columns={"index": "ds"})

        model = init_prophet_model(regressors, params=prophet_params)
        model.fit(train_frame)
        forecast = model.predict(predict_frame)["yhat"]
        window_rmses.append(mean_squared_error(actual, forecast) ** 0.5)

    if window_rmses:
        return np.mean(window_rmses)
    return float('inf')

def worker_process(study_name, n_trials=None):
    """
    Run a bounded number of trials of the shared study in the current process.

    The previous implementation looped with ``while True`` and relied on
    ``study.ask()`` returning ``None`` to stop. ``Study.ask()`` always returns
    a trial, so the loop only ended on an exception and the parent's
    ``join()`` never returned. Each worker now runs a fixed quota instead.

    Args:
        study_name: Name of the study to create or load from ``DB_URL``.
        n_trials: Maximum number of trials this worker runs. Defaults to
            ``N_TRIALS_TOTAL // NUM_WORKERS`` (at least 1), the same figure
            shown in the progress text.

    Raises:
        Exception: Whatever ``objective`` raises is re-raised after the trial
            has been reported as failed.
    """
    if n_trials is None:
        n_trials = max(1, N_TRIALS_TOTAL // NUM_WORKERS)

    # Each process opens its own storage connection.
    storage = RDBStorage(DB_URL)
    study = optuna.create_study(study_name=study_name, storage=storage, direction="minimize", load_if_exists=True)
    worker_name = multiprocessing.current_process().name

    for run in range(1, n_trials + 1):
        try:
            trial = study.ask()
        except Exception as e:
            print(f"Database error: {e}")
            break

        progress = f"Run {run} of {n_trials}"
        try:
            value = objective(trial, worker_name, progress)
        except Exception:
            # Report the failure so the trial does not stay in the RUNNING
            # state in the storage, then let the error surface as before.
            study.tell(trial, state=optuna.trial.TrialState.FAIL)
            raise
        study.tell(trial, value)

if __name__ == "__main__":
    # Create the shared study, or attach to it if it already exists.
    study_name = "prophet_hyperopt_params"
    study = optuna.create_study(
        direction="minimize",
        study_name=study_name,
        storage=DB_URL,
        load_if_exists=True,
    )

    # One worker process per slot; launch them all, then wait for all of them.
    processes = [
        multiprocessing.Process(target=worker_process, args=(study_name,))
        for _ in range(NUM_WORKERS)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

    # Report the best trial recorded in the study.
    best_trial = study.best_trial
    print(f"Best RMSE: {best_trial.value}")
    print("Best hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
