In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

from src.dataset import get_datasets
from src.feature import create_features, split
from src.benchmark import extend_by_predictions_and_samples

# =============================================================================
# Prepare the Data and Candidate Features
# =============================================================================

data = create_features(get_datasets()[0])
data.index = pd.to_datetime(data.index)

best_features = [
    "Pumped storage generation", "Solar", "Wind offshore", "temperature_2m",
    "wind_speed_100m", "hour", "dayofweek", "dayofyear", "ma_1_days",
    "ma_1_days_pumped_storage_generation", "ma_2_days",
    "ma_2_days_pumped_storage_generation", "ma_3_days",
    "ma_4_days_pumped_storage_generation", "ma_5_days_pumped_storage_generation",
    "ma_6_days", "ma_7_days", "ma_8_days_pumped_storage_generation",
    "ma_9_days_pumped_storage_generation", "ma_10_days", "ma_11_days",
    "ma_12_days_pumped_storage_generation", "ma_13_days", "ma_14_days_pumped_storage_generation",
]

candidate_features = [
    'Hydro', 'Pumped storage generation', 'Solar', 'Wind offshore',
    'Wind onshore', 'temperature_2m', 'precipitation', 'wind_speed_100m',
    'direct_radiation', 'hour', 'dayofweek', 'dayofyear'
]

for i in range(1, 15):
    candidate_features.append(f'ma_{i}_days')
    candidate_features.append(f'ma_{i}_days_pumped_storage_generation')

# =============================================================================
# Objective Function for Optuna
# =============================================================================

def objective(trial):
    """
    Objective function for Optuna hyperparameter tuning of XGBoost model.
    """
    params = {
        'base_score': trial.suggest_float('base_score', 0.0, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 100),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'gamma': trial.suggest_float('gamma', 0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 10, 250)
    }

    merged_df, _ = get_datasets()
    train_df, eval_df, test_df, benchmark_df, _, _, _ = split(merged_df)

    training_set = pd.concat([train_df, eval_df, test_df]).sort_index()
    benchmarking_set = benchmark_df.sort_index()

    WINDOW_SIZE = 24
    rmse_scores = []

    for window_start in range(24, len(benchmarking_set) - WINDOW_SIZE, WINDOW_SIZE):
        y_actual = benchmarking_set.iloc[window_start: window_start + WINDOW_SIZE]["Price"]
        dataset_extended = pd.concat([training_set, benchmarking_set.iloc[:window_start]])
        next_timestamp = dataset_extended.index[-1] + pd.DateOffset(hours=1)
        if next_timestamp != y_actual.index[0]:
            continue

        dataset_extended_ps = extend_by_predictions_and_samples(dataset_extended, dataset_extended.index[-1])
        dataset_extended_features = create_features(dataset_extended_ps)

        if not set(y_actual.index).issubset(dataset_extended_features.index):
            continue

        selected_features = best_features

        train_data = dataset_extended_features.iloc[:-WINDOW_SIZE]
        split_idx = int(0.8 * len(train_data))
        X_train = train_data.iloc[:split_idx][selected_features]
        y_train = train_data.iloc[:split_idx]["Price"]
        X_eval = train_data.iloc[split_idx:][selected_features]
        y_eval = train_data.iloc[split_idx:]["Price"]

        X_predict = dataset_extended_features.reindex(y_actual.index)[selected_features]
        if X_predict.isnull().any().any():
            continue

        model = xgb.XGBRegressor(
            **params, objective='reg:squarederror', eval_metric='rmse',
            random_state=42, n_jobs=2
        )

        model.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], verbose=False)
        preds = model.predict(X_predict)
        rmse = mean_squared_error(y_actual, preds) ** 0.5
        rmse_scores.append(rmse)

    if len(rmse_scores) == 0:
        return float('inf')
    return np.mean(rmse_scores)

# =============================================================================
# Run the Optuna Study
# =============================================================================

n_trials = 100
pbar = tqdm(total=n_trials, desc="Optuna Trials")

def progress_callback(study, trial):
    """Callback function to track Optuna progress."""
    pbar.update(1)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials, n_jobs=2, callbacks=[progress_callback])

print("Best trial:")
best_trial = study.best_trial
print(f"  Average RMSE: {best_trial.value}")
print("  Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


Optuna Trials:   0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-02-10 18:02:54,324] A new study created in memory with name: no-name-3dc96b82-a3f6-48da-bfa9-cfac22eed736


DatetimeIndex(['2025-01-07 00:00:00', '2025-01-07 01:00:00',
               '2025-01-07 02:00:00', '2025-01-07 03:00:00',
               '2025-01-07 04:00:00', '2025-01-07 05:00:00',
               '2025-01-07 06:00:00', '2025-01-07 07:00:00',
               '2025-01-07 08:00:00', '2025-01-07 09:00:00',
               '2025-01-07 10:00:00', '2025-01-07 11:00:00',
               '2025-01-07 12:00:00', '2025-01-07 13:00:00',
               '2025-01-07 14:00:00', '2025-01-07 15:00:00',
               '2025-01-07 16:00:00', '2025-01-07 17:00:00',
               '2025-01-07 18:00:00', '2025-01-07 19:00:00',
               '2025-01-07 20:00:00', '2025-01-07 21:00:00',
               '2025-01-07 22:00:00', '2025-01-07 23:00:00'],
              dtype='datetime64[ns]', freq=None) 
##############################################
DatetimeIndex(['2018-10-14 23:00:00', '2018-10-15 00:00:00',
               '2018-10-15 01:00:00', '2018-10-15 02:00:00',
               '2018-10-15 03:00:00', '2018-10-1

[I 2025-02-10 18:03:22,279] Trial 0 finished with value: inf and parameters: {'base_score': 0.4023037496547617, 'learning_rate': 0.05185699894775817, 'max_depth': 37, 'n_estimators': 430, 'gamma': 4.999567530723389, 'min_child_weight': 2, 'subsample': 0.6106009796830527, 'colsample_bylevel': 0.7498447346062492, 'max_delta_step': 9, 'early_stopping_rounds': 154}. Best is trial 0 with value: inf.


DatetimeIndex(['2025-02-05 00:00:00', '2025-02-05 01:00:00',
               '2025-02-05 02:00:00', '2025-02-05 03:00:00',
               '2025-02-05 04:00:00', '2025-02-05 05:00:00',
               '2025-02-05 06:00:00', '2025-02-05 07:00:00',
               '2025-02-05 08:00:00', '2025-02-05 09:00:00',
               '2025-02-05 10:00:00', '2025-02-05 11:00:00',
               '2025-02-05 12:00:00', '2025-02-05 13:00:00',
               '2025-02-05 14:00:00', '2025-02-05 15:00:00',
               '2025-02-05 16:00:00', '2025-02-05 17:00:00',
               '2025-02-05 18:00:00', '2025-02-05 19:00:00',
               '2025-02-05 20:00:00', '2025-02-05 21:00:00',
               '2025-02-05 22:00:00', '2025-02-05 23:00:00'],
              dtype='datetime64[ns]', freq=None) 
##############################################
DatetimeIndex(['2018-10-14 23:00:00', '2018-10-15 00:00:00',
               '2018-10-15 01:00:00', '2018-10-15 02:00:00',
               '2018-10-15 03:00:00', '2018-10-1

[W 2025-02-10 18:03:34,810] Trial 1 failed with parameters: {'base_score': 0.8690630983701515, 'learning_rate': 0.03509740310006119, 'max_depth': 73, 'n_estimators': 300, 'gamma': 3.4317980118367766, 'min_child_weight': 1, 'subsample': 0.6173326801267456, 'colsample_bylevel': 0.8278007323873928, 'max_delta_step': 2, 'early_stopping_rounds': 130} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/boh/Library/Python/3.9/lib/python/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/zy/2cj896n535n1dkk6yxct9vhw0000gn/T/ipykernel_5614/3048886667.py", line 124, in objective
    dataset_extended_features = create_features(dataset_extended_ps)
  File "/Users/boh/code/LSDI-Project-/hand_in/src/feature.py", line 174, in create_features
    df.dropna(inplace=True)
  File "/Users/boh/Library/Python/3.9/lib/python/site-packages/pandas/core/frame.py", line 6433, in dropna
    resu

KeyboardInterrupt: 

In [6]:
import optuna
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from prophet import Prophet
import pandas as pd
import numpy as np

from src.prediction import init_prophet_model
from src.dataset import get_datasets
from src.feature import create_features, split
from src.benchmark import extend_by_predictions_and_samples

# =============================================================================
# Prepare the Data and Candidate Features
# =============================================================================

COLUMNS = [
    "Price",
    "Hydro",
    "Pumped storage generation",
    "Solar",
    "Wind offshore",
    "Wind onshore",
    "temperature_2m",
    "precipitation",
    "wind_speed_100m",
    "direct_radiation",
]

# Define candidate features.
candidate_features = [
    'Hydro',
    'Pumped storage generation',
    'Solar',
    'Wind offshore',
    'Wind onshore',
    'temperature_2m',
    'precipitation',
    'wind_speed_100m',
    'direct_radiation',
    'hour',
    'dayofweek',
    'dayofyear'
]

# Append moving-average features for various windows.
for i in range(1, 15):
    candidate_features.append(f'ma_{i}_days')
    candidate_features.append(f'ma_{i}_days_pumped_storage_generation')

# Remove duplicates (if any) while preserving order.
candidate_features = list(dict.fromkeys(candidate_features))

windowed_sets = []

WINDOW_SIZE = 24

# =============================================================================
# Precompute the Train/Test and Forecast Windows Outside the Objective
# =============================================================================

# Get and split datasets
merged_df, _ = get_datasets()
train, eval, test, benchmark, _, _, _ = split(merged_df)

# Combine all available training data
training_set = pd.concat([train, eval, test])[COLUMNS]
# Benchmark dataset for rolling forecast
benchmarking_set = benchmark[COLUMNS]

# Iterate through forecast windows for rolling prediction
for window_start in range(24, len(benchmarking_set) - WINDOW_SIZE, WINDOW_SIZE):
    # Define the next forecast window
    y_actual = benchmarking_set.iloc[window_start : window_start + WINDOW_SIZE]["Price"]

    # Extend dataset with past data
    dataset_extended = pd.concat((training_set, benchmarking_set.iloc[:window_start]))
    next_day = dataset_extended.index[-1] + pd.DateOffset(hours=1)

    # Skip forecast if there is a missing data gap
    if next_day != y_actual.index[0]:
        print(
            f"\nSkipping prediction for {next_day} due to missing entries.\n"
            "--------------------------------------------------------------\n"
        )
        continue

    # Extend dataset by including predictions and sample adjustments
    dataset_extended_ps = extend_by_predictions_and_samples(
        dataset_extended, dataset_extended.index[-1]
    )

    # Create features for modeling
    dataset_extended_features = create_features(dataset_extended_ps)

    # Split into training and prediction datasets
    X_train_df = dataset_extended_features.iloc[:-WINDOW_SIZE]
    X_predict_df = dataset_extended_features.reindex(y_actual.index)
    
    windowed_sets.append((X_train_df, X_predict_df, y_actual))
    

# =============================================================================
# Objective Function for Optuna
# =============================================================================

def objective(trial):
    """
    Objective function for Optuna hyperparameter tuning.
    Selects features and evaluates Prophet model on rolling forecast windows.
    """
    # Feature selection based on trial parameters
    SELECTED_FEATURES = []
    for feature in candidate_features:
        use_feature = trial.suggest_categorical(feature, [True, False])
        SELECTED_FEATURES.append(feature)
    
    # Ensure at least one feature is selected
    if len(SELECTED_FEATURES) == 0:
        SELECTED_FEATURES = candidate_features
    
    # Remove duplicates from feature selection
    SELECTED_FEATURES = list(dict.fromkeys(SELECTED_FEATURES))
    
    rmse_scores = []

    # Iterate through rolling forecast windows
    for X_train_df, X_predict_df, y_actual in windowed_sets:
        
        # Prepare training data for Prophet model
        X_train_prophet = (
            X_train_df[["Price"] + SELECTED_FEATURES]
            .rename(columns={"Price": "y"})
            .reset_index()
            .rename(columns={"index": "ds"})
            .dropna()
        )
        prophet_X_predict = X_predict_df[SELECTED_FEATURES].reset_index().rename(columns={"index": "ds"})

        # Initialize and train Prophet model
        prophet_model = init_prophet_model(SELECTED_FEATURES)
        prophet_model.fit(X_train_prophet)

        # Generate forecast
        prophet_forecast = prophet_model.predict(prophet_X_predict)
        prophet_forecast = prophet_forecast["yhat"]

        # Compute RMSE for current forecast window
        rmse = mean_squared_error(y_actual, prophet_forecast) ** 0.5
        rmse_scores.append(rmse)

    # Return average RMSE over all rolling forecast windows
    if len(rmse_scores) == 0:
        return float('inf')
    avg_rmse = np.mean(rmse_scores)
    return avg_rmse


# =============================================================================
# Run the Optuna Study
# =============================================================================

n_trials = 1
pbar = tqdm(total=n_trials, desc="Optuna Trials")

def progress_callback(study, trial):
    """Callback function for tracking Optuna trials."""
    pbar.update(1)

# Create and optimize study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials, n_jobs=-1, callbacks=[progress_callback])
pbar.close()

# Print best results
print("Best trial:")
best_trial = study.best_trial
print(f"  Average RMSE: {best_trial.value}")
print("  Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")


Optuna Trials:   0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-02-14 20:08:22,939] A new study created in memory with name: no-name-f0734101-e226-4b02-9c46-9acffd96be1f
20:08:34 - cmdstanpy - INFO - Chain [1] start processing
20:09:54 - cmdstanpy - INFO - Chain [1] done processing
20:10:06 - cmdstanpy - INFO - Chain [1] start processing
20:10:38 - cmdstanpy - INFO - Chain [1] done processing
20:10:38 - cmdstanpy - ERROR - Chain [1] error: terminated by signal 2 Unknown error: -2


KeyboardInterrupt: 

Optimization terminated abnormally. Falling back to Newton.
