In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
from prophet import Prophet

from src.dataset import get_datasets
from src.feature import create_features, split
from src.prediction import init_prophet_model
from src.benchmark import extend_by_predictions_and_samples

Importing plotly failed. Interactive plots will not work.


In [2]:
COLUMNS = [
    "Price",
    "Hydro",
    "Pumped storage generation",
    "Solar",
    "Wind offshore",
    "Wind onshore",
    "temperature_2m",
    "precipitation",
    "wind_speed_100m",
    "direct_radiation",
]
WINDOW_SIZE = 24

CANDIDATE_FEATURES = [
    'Hydro',
    'Pumped storage generation', 'Solar',
    'Wind offshore', 'Wind onshore', 'temperature_2m', 'precipitation',
    'wind_speed_100m', 'direct_radiation', 'hour', 'dayofweek', 'dayofyear'
]

for i in range(1, 15):
    CANDIDATE_FEATURES.append(f'ma_{i}_days')
    CANDIDATE_FEATURES.append(f'ma_{i}_days_pumped_storage_generation')

# Remove duplicates (if any) while preserving order.
CANDIDATE_FEATURES = list(dict.fromkeys(CANDIDATE_FEATURES))

In [None]:
SELECTED_FEATURES = [
    "Pumped storage generation",
    "Solar",
    "Wind offshore",
    "temperature_2m",
    "wind_speed_100m",
    "hour",
    "dayofweek",
    "dayofyear",
    "ma_1_days",
    "ma_1_days_pumped_storage_generation",
    "ma_2_days",
    "ma_2_days_pumped_storage_generation",
    "ma_3_days",
    "ma_4_days_pumped_storage_generation",
    "ma_5_days_pumped_storage_generation",
    "ma_6_days",
    "ma_7_days",
    "ma_8_days_pumped_storage_generation",
    "ma_9_days_pumped_storage_generation",
    "ma_10_days",
    "ma_11_days",
    "ma_12_days_pumped_storage_generation",
    "ma_13_days",
    "ma_14_days_pumped_storage_generation",
]

windowed_sets = []

# =============================================================================
# Precompute the Train/Test and Forecast Windows Outside the Objective
# =============================================================================

# Get and split datasets
merged_df, _ = get_datasets()
train, eval, test, benchmark, _, _, _ = split(merged_df)

# Combine all available training data
training_set = pd.concat([train, eval, test])[COLUMNS]
# Benchmark dataset for rolling forecast
benchmarking_set = benchmark[COLUMNS]

# Iterate through forecast windows for rolling prediction
for window_start in range(24, len(benchmarking_set) - WINDOW_SIZE, WINDOW_SIZE):
    # Define the next forecast window
    y_actual = benchmarking_set.iloc[window_start : window_start + WINDOW_SIZE]["Price"]

    # Extend dataset with past data
    dataset_extended = pd.concat((training_set, benchmarking_set.iloc[:window_start]))
    next_day = dataset_extended.index[-1] + pd.DateOffset(hours=1)

    # Skip forecast if there is a missing data gap
    if next_day != y_actual.index[0]:
        print(
            f"\nSkipping prediction for {next_day} due to missing entries.\n"
            "--------------------------------------------------------------\n"
        )
        continue

    # Extend dataset by including predictions and sample adjustments
    dataset_extended_ps = extend_by_predictions_and_samples(
        dataset_extended, dataset_extended.index[-1]
    )

    # Create features for modeling
    dataset_extended_features = create_features(dataset_extended_ps)

   # Split for training (all but last 24 hours) and prediction (the next 24 hours)
    VALIDATION_SET_SIZE = 24 * 14
    X_train_df = dataset_extended_features.iloc[:-WINDOW_SIZE - VALIDATION_SET_SIZE]
    X_eval_df = dataset_extended_features.iloc[-WINDOW_SIZE - VALIDATION_SET_SIZE : -WINDOW_SIZE]
    X_predict_df = dataset_extended_features.reindex(y_actual.index)
    windowed_sets.append((X_train_df, X_predict_df, X_eval_df, y_actual))

def objective(trial):
    # --------------------------------------------------
    # 1. Feature Selection
    # --------------------------------------------------
     # Feature selection based on trial parameters
    # SELECTED_FEATURES = [
    #     feature for feature in CANDIDATE_FEATURES 
    #     if trial.suggest_categorical(feature, [True, False])
    # ]   
    
    rmse_scores = []

    # --------------------------------------------------
    # 2. Hyperparameter Search Space for XGBoost
    # --------------------------------------------------
    # Sample early_stopping_rounds separately so it can be passed to fit.
    params = {
        'base_score': trial.suggest_float('base_score', 0.0, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 100),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 10, 250)
    }

    # --------------------------------------------------
    # 3. Rolling Forecast Loop with Early Stopping
    # --------------------------------------------------
    # Loop over forecast windows (starting at index 24 to ensure an initial training period).
    for X_train_df, X_predict_df, X_eval_df, y_actual in windowed_sets:

        model = xgb.XGBRegressor(
            **params,
            objective='reg:squarederror',
            eval_metric='rmse',
            random_state=42,
            n_jobs=2
        )

        model.fit(X_train_df[SELECTED_FEATURES], X_train_df["Price"], eval_set=[(X_eval_df[SELECTED_FEATURES], X_eval_df["Price"])], verbose=True
        )

        preds = model.predict(X_predict_df[SELECTED_FEATURES])
        rmse = mean_squared_error(y_actual, preds)  ** 0.5
        rmse_scores.append(rmse)

    if len(rmse_scores) == 0:
        return float('inf')
    avg_rmse = np.mean(rmse_scores)
    return avg_rmse

# =============================================================================
# Run the Optuna Study
# =============================================================================
n_trials = 100
pbar = tqdm(total=n_trials, desc="Optuna Trials")

def progress_callback(study, trial):
    pbar.update(1)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials, n_jobs=2, callbacks=[progress_callback])

print("Best trial:")
best_trial = study.best_trial
print(f"  Average RMSE: {best_trial.value}")
print("  Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


Optuna Trials:   0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-02-14 20:35:47,831] A new study created in memory with name: no-name-1ebac0b2-1b42-41f8-b5f4-a7b407fcc3c4


[0]	validation_0-rmse:91.52104
[1]	validation_0-rmse:91.51774
[2]	validation_0-rmse:91.51443
[3]	validation_0-rmse:91.51113
[4]	validation_0-rmse:91.50782
[5]	validation_0-rmse:91.50452
[6]	validation_0-rmse:91.50122
[7]	validation_0-rmse:91.49791
[8]	validation_0-rmse:91.49461
[9]	validation_0-rmse:91.49131
[10]	validation_0-rmse:91.48800
[11]	validation_0-rmse:91.48471
[12]	validation_0-rmse:91.48141
[13]	validation_0-rmse:91.47810
[14]	validation_0-rmse:91.47480
[15]	validation_0-rmse:91.47150
[16]	validation_0-rmse:91.46819
[17]	validation_0-rmse:91.46489
[18]	validation_0-rmse:91.46160
[19]	validation_0-rmse:91.45832
[20]	validation_0-rmse:91.45501
[21]	validation_0-rmse:91.45171
[22]	validation_0-rmse:91.44840
[23]	validation_0-rmse:91.44510
[24]	validation_0-rmse:91.44180
[0]	validation_0-rmse:90.87039
[25]	validation_0-rmse:91.43852
[26]	validation_0-rmse:91.43521
[27]	validation_0-rmse:91.43191
[28]	validation_0-rmse:91.42860
[29]	validation_0-rmse:91.42530
[30]	validation_0-r

In [None]:
# =============================================================================
# Precompute the Train/Test and Forecast Windows Outside the Objective
# =============================================================================
windowed_sets = []
# Get and split datasets
merged_df, _ = get_datasets()
train, eval, test, benchmark, _, _, _ = split(merged_df)

# Combine all available training data
training_set = pd.concat([train, eval, test])[COLUMNS]
# Benchmark dataset for rolling forecast
benchmarking_set = benchmark[COLUMNS]

# Iterate through forecast windows for rolling prediction
for window_start in range(24, len(benchmarking_set) - WINDOW_SIZE, WINDOW_SIZE):
    # Define the next forecast window
    y_actual = benchmarking_set.iloc[window_start : window_start + WINDOW_SIZE]["Price"]

    # Extend dataset with past data
    dataset_extended = pd.concat((training_set, benchmarking_set.iloc[:window_start]))
    next_day = dataset_extended.index[-1] + pd.DateOffset(hours=1)

    # Skip forecast if there is a missing data gap
    if next_day != y_actual.index[0]:
        print(
            f"\nSkipping prediction for {next_day} due to missing entries.\n"
            "--------------------------------------------------------------\n"
        )
        continue

    # Extend dataset by including predictions and sample adjustments
    dataset_extended_ps = extend_by_predictions_and_samples(
        dataset_extended, dataset_extended.index[-1]
    )

    # Create features for modeling
    dataset_extended_features = create_features(dataset_extended_ps)

    # Split into training and prediction datasets
    X_train_df = dataset_extended_features.iloc[:-WINDOW_SIZE]
    X_predict_df = dataset_extended_features.reindex(y_actual.index)
    
    windowed_sets.append((X_train_df, X_predict_df, y_actual))
    

# =============================================================================
# Objective Function for Optuna
# =============================================================================

def objective(trial):
    """
    Objective function for Optuna hyperparameter tuning.
    Selects features and evaluates Prophet model on rolling forecast windows.
    """
    # Feature selection based on trial parameters
    SELECTED_FEATURES = [
        feature for feature in CANDIDATE_FEATURES 
        if trial.suggest_categorical(feature, [True, False])
    ]   
    
    rmse_scores = []

    # Iterate through rolling forecast windows
    for X_train_df, X_predict_df, y_actual in windowed_sets:
        
        # Prepare training data for Prophet model
        X_train_prophet = (
            X_train_df[["Price"] + SELECTED_FEATURES]
            .rename(columns={"Price": "y"})
            .reset_index()
            .rename(columns={"index": "ds"})
            .dropna()
        )
        prophet_X_predict = X_predict_df[SELECTED_FEATURES].reset_index().rename(columns={"index": "ds"})

        # Initialize and train Prophet model
        prophet_model = init_prophet_model(SELECTED_FEATURES)
        prophet_model.fit(X_train_prophet)

        # Generate forecast
        prophet_forecast = prophet_model.predict(prophet_X_predict)
        prophet_forecast = prophet_forecast["yhat"]

        # Compute RMSE for current forecast window
        rmse = mean_squared_error(y_actual, prophet_forecast) ** 0.5
        rmse_scores.append(rmse)

    # Return average RMSE over all rolling forecast windows
    if len(rmse_scores) == 0:
        return float('inf')
    avg_rmse = np.mean(rmse_scores)
    return avg_rmse


# =============================================================================
# Run the Optuna Study
# =============================================================================

n_trials = 1
pbar = tqdm(total=n_trials, desc="Optuna Trials")

def progress_callback(study, trial):
    """Callback function for tracking Optuna trials."""
    pbar.update(1)

# Create and optimize study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials, n_jobs=4, callbacks=[progress_callback])
pbar.close()

# Print best results
print("Best trial:")
best_trial = study.best_trial
print(f"  Average RMSE: {best_trial.value}")
print("  Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


Optuna Trials:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-02-14 20:24:33,005] A new study created in memory with name: no-name-ea3d135e-03b8-4bf7-91e9-77b63bab5ed3
20:24:42 - cmdstanpy - INFO - Chain [1] start processing
20:25:34 - cmdstanpy - INFO - Chain [1] done processing
20:25:44 - cmdstanpy - INFO - Chain [1] start processing
20:26:30 - cmdstanpy - INFO - Chain [1] done processing
20:26:39 - cmdstanpy - INFO - Chain [1] start processing
20:27:24 - cmdstanpy - INFO - Chain [1] done processing
20:27:33 - cmdstanpy - INFO - Chain [1] start processing
20:28:15 - cmdstanpy - INFO - Chain [1] done processing
20:28:24 - cmdstanpy - INFO - Chain [1] start processing
20:29:01 - cmdstanpy - INFO - Chain [1] done processing
20:29:10 - cmdstanpy - INFO - Chain [1] start processing
20:29:37 - cmdstanpy - INFO - Chain [1] done processing
20:29:37 - cmdstanpy - ERROR - Chain [1] error: terminated by signal 2 Unknown error: -2


KeyboardInterrupt: 

Optimization terminated abnormally. Falling back to Newton.


20:29:52 - cmdstanpy - INFO - Chain [1] start processing
20:35:20 - cmdstanpy - INFO - Chain [1] done processing
20:35:20 - cmdstanpy - ERROR - Chain [1] error: terminated by signal 2 Unknown error: -2
[W 2025-02-14 20:35:20,058] Trial 0 failed with parameters: {'Hydro': True, 'Pumped storage generation': True, 'Solar': False, 'Wind offshore': True, 'Wind onshore': True, 'temperature_2m': False, 'precipitation': False, 'wind_speed_100m': False, 'direct_radiation': False, 'hour': False, 'dayofweek': True, 'dayofyear': False, 'ma_1_days': True, 'ma_1_days_pumped_storage_generation': False, 'ma_2_days': False, 'ma_2_days_pumped_storage_generation': False, 'ma_3_days': False, 'ma_3_days_pumped_storage_generation': True, 'ma_4_days': True, 'ma_4_days_pumped_storage_generation': False, 'ma_5_days': True, 'ma_5_days_pumped_storage_generation': False, 'ma_6_days': False, 'ma_6_days_pumped_storage_generation': False, 'ma_7_days': True, 'ma_7_days_pumped_storage_generation': True, 'ma_8_days': F