# Regular version

In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the preprocessed dataset.
# 'date' is parsed to datetime so that the sort below is chronological; left as
# text it would be ordered lexicographically, which matches time order only for
# ISO-8601 style strings. When pandas cannot parse the column it hands it back
# unchanged (object dtype), in which case the sort behaves as it did before.
file_path = 'preprocessed_dataset.csv'
data = pd.read_csv(file_path, parse_dates=['date'])

# Print basic information about the dataset
print("Dataset shape:", data.shape)
print("\nFeatures:", data.columns.tolist())
print("\nData types:\n", data.dtypes)
print("\nData sample:\n", data.head())

# Ensure data is sorted by date. A stable sort keeps rows that share the same
# date in their original file order, so the positional, time-ordered splits
# made later in the notebook are identical from run to run.
data = data.sort_values('date', kind='mergesort')

# Split the data into features (X) and target variable (y).
# NOTE(review): the trial log further down lists columns such as
# 'price_boxcox', 'price_change_1d' and 'price_ewm_alpha_0.5'; they look like
# they are derived from spot_price itself -- confirm they only use values
# available before the prediction time, otherwise they leak the target.
X = data.drop(columns=['spot_price', 'date'])
y = data['spot_price']

# Print feature statistics
print("\nFeature statistics:")
print(X.describe())

# Print target variable statistics
print("\nTarget variable statistics:")
print(y.describe())

# TimeSeriesSplit for expanding window cross-validation: every fold trains on
# the rows that precede its test rows.
tscv = TimeSeriesSplit(n_splits=5)

# Initialize XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Store results of each fold
fold_results = []

# Perform expanding window cross-validation
for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model on the expanding training set
    model.fit(X_train, y_train)

    # Predict on the test set for this fold
    y_pred = model.predict(X_test)

    # Calculate RMSE (Root Mean Squared Error) for this fold as sqrt(MSE).
    # The `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is not used here; float()
    # keeps plain Python numbers in fold_results.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    # Store the RMSE result
    fold_results.append(rmse)

    print(f"\nFold {fold}:")
    print(f"Train set size: {len(X_train)}, Test set size: {len(X_test)}")
    print(f"RMSE: {rmse:.4f}")

# Output the RMSE results for each fold
print("\nRMSE for each fold:", [f"{value:.4f}" for value in fold_results])
print(f"Average RMSE across folds: {np.mean(fold_results):.4f}")

# Use the last 20% of the data for final testing
test_size = int(0.2 * len(X))
if test_size == 0:
    # With fewer than 5 rows int(0.2 * len(X)) is 0; slicing with `-0` would
    # then put every row in the test set and none in the training set.
    raise ValueError(
        f"Need at least 5 rows to hold out 20% for testing, got {len(X)}"
    )

# Split on an explicit position so the boundary does not rely on negative
# slicing; rows before split_at train the model, the rest are held out.
split_at = len(X) - test_size
X_train, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

# Train the final model
final_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Calculate and print the final RMSE as sqrt(MSE). The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
final_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f"\nFinal RMSE on test set: {final_rmse:.4f}")

# Print feature importance
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': final


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
RMSE for each fold: [np.float64(5.32152323490936), np.float64(0.6092211835903061), np.float64(0.16515569767849061), np.float64(1.3764184943064073), np.float64(2.486172096681458)]
Average RMSE across folds: 1.9916981414332047
Final RMSE: 1.01285968309676


# Hyperparameter optimization with Optuna

In [14]:
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
import warnings
from pprint import pprint

# Silence warnings for the rest of the session.
# NOTE: this is a blanket filter -- deprecation notices raised by the libraries
# used below are hidden as well.
warnings.filterwarnings("ignore")

# One-hot encoded columns, keyed by the categorical variable they belong to.
# Each group is later switched on or off as a whole during feature selection.
categorical_groups = dict(
    day_of_week=[f'day_of_week_{d}' for d in range(1, 7)],
    is_weekend=['is_weekend_1'],
    month=[f'month_{m}' for m in range(2, 13)],
    year=[f'year_{yr}' for yr in (2017, 2018)],
    season=[f'season_{name}' for name in ('Spring', 'Summer', 'Winter')],
)

# Define the objective function for Optuna
def objective(trial):
    """Return the mean expanding-window CV RMSE for one Optuna trial.

    The trial picks XGBoost hyperparameters and a feature subset: every group
    in ``categorical_groups`` is switched on or off as a whole, and every other
    column of ``X`` is switched on or off individually.

    Reads the notebook-level names ``X``, ``y`` and ``categorical_groups``.
    ``X`` and ``y`` are not defined in this cell, so the cell that builds them
    has to be executed first.

    NOTE(review): the folds are drawn from all rows of ``X``, which includes
    the last 20% that the baseline cell holds out for its final RMSE -- confirm
    whether tuning is meant to see those rows.

    Args:
        trial: object providing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical`` (an Optuna trial).

    Returns:
        The average RMSE over the 5 ``TimeSeriesSplit`` folds, or
        ``float('inf')`` when the trial selected no feature at all.
    """
    # Suggest values for hyperparameters
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 0, 5),
    }

    # Suggest whether to include each feature group or not
    selected_features = []
    for group, features in categorical_groups.items():
        if trial.suggest_categorical(f"group_{group}", [0, 1]) == 1:
            selected_features.extend(features)

    # Suggest whether to include each non-categorical feature or not.
    # A set gives constant-time membership checks instead of rebuilding a
    # flattened list for every column on every trial.
    grouped_columns = {col for cols in categorical_groups.values() for col in cols}
    for feature in X.columns:
        if feature in grouped_columns:
            continue
        if trial.suggest_categorical(f"feature_{feature}", [0, 1]) == 1:
            selected_features.append(feature)

    # If no feature is selected, return a high loss value
    if len(selected_features) == 0:
        return float('inf')

    # Select the features that are chosen by Optuna
    X_selected = X[selected_features]

    # TimeSeriesSplit for cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    rmse_list = []

    # Perform cross-validation with expanding window
    for train_index, test_index in tscv.split(X_selected):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the XGBoost model
        model = xgb.XGBRegressor(**param)
        model.fit(X_train, y_train)

        # Predict and evaluate the model. RMSE is computed as sqrt(MSE): the
        # `squared=False` keyword of mean_squared_error was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        rmse_list.append(rmse)

    # Return the average RMSE across the folds
    return sum(rmse_list) / len(rmse_list)

# Create the Optuna study.
# The sampler is given a fixed seed so that re-running the notebook proposes
# the same sequence of trials; without it every run explores different
# hyperparameters and the reported best trial cannot be reproduced. TPESampler
# is the sampler Optuna uses by default for single-objective studies, so only
# the seeding changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Wrap the optimization process in tqdm to track progress
n_trials = 50
with tqdm(total=n_trials, desc="Optimizing") as pbar:
    def callback(study, trial):
        """Advance the progress bar by one after each finished trial."""
        pbar.update(1)

    study.optimize(objective, n_trials=n_trials, callbacks=[callback])

# Report the winning trial: its score and the raw parameter dictionary
best_trial = study.best_trial
print("\nBest trial RMSE:", best_trial.value)
print("\nBest hyperparameters:")
pprint(best_trial.params)

# Rebuild the feature list the best trial used, in the same order the
# objective assembled it: whole one-hot groups first, then the individually
# toggled columns in X.columns order.
grouped_columns = {col for cols in categorical_groups.values() for col in cols}
non_categorical_features = [col for col in X.columns if col not in grouped_columns]

chosen = best_trial.params
selected_features = [
    col
    for group, cols in categorical_groups.items()
    if chosen[f"group_{group}"] == 1
    for col in cols
]
selected_features += [
    col for col in non_categorical_features if chosen[f"feature_{col}"] == 1
]
print("\nSelected features:", selected_features)


[I 2024-10-18 14:24:42,461] A new study created in memory with name: no-name-b88a42bc-2b11-469a-b09f-c03e918c6e51
Optimizing:   0%|          | 0/50 [00:00<?, ?it/s][I 2024-10-18 14:24:44,546] Trial 0 finished with value: 2.8786128840652245 and parameters: {'n_estimators': 344, 'max_depth': 9, 'learning_rate': 0.016571018032323873, 'subsample': 0.615944171054639, 'colsample_bytree': 0.8598693719553465, 'min_child_weight': 10, 'gamma': 2.057455352005647, 'lambda': 2.9324414503174867, 'group_day_of_week': 0, 'group_is_weekend': 0, 'group_month': 1, 'group_year': 1, 'group_season': 1, 'feature_volume_demand': 1, 'feature_volume_production': 1, 'feature_price_lag_1d': 1, 'feature_price_lag_7d': 1, 'feature_price_lag_30d': 1, 'feature_price_rolling_mean_7d': 1, 'feature_price_rolling_std_7d': 1, 'feature_price_rolling_mean_30d': 1, 'feature_price_rolling_std_30d': 1, 'feature_demand_production_ratio': 0, 'feature_demand_lag_1d': 1, 'feature_production_lag_1d': 1, 'feature_demand_rolling_mean


Best trial RMSE: 2.2199714224601763

Best hyperparameters:
{'colsample_bytree': 0.7968317635956264,
 'feature_Avg_Temp': 1,
 'feature_Avg_Wind': 1,
 'feature_Fill_Level': 1,
 'feature_Max_Gust': 1,
 'feature_Max_Temp': 1,
 'feature_Max_Wind': 1,
 'feature_Min_Temp': 0,
 'feature_Precipitation': 1,
 'feature_Snow_Depth': 1,
 'feature_coal_price': 0,
 'feature_crude_oil_price': 1,
 'feature_day': 1,
 'feature_demand_ewm_alpha_0.5': 1,
 'feature_demand_lag_1d': 1,
 'feature_demand_production_ratio': 0,
 'feature_demand_rolling_mean_30d': 0,
 'feature_demand_rolling_mean_7d': 1,
 'feature_demand_rolling_std_30d': 0,
 'feature_demand_rolling_std_7d': 1,
 'feature_hour': 0,
 'feature_month': 0,
 'feature_month_cos': 0,
 'feature_month_sin': 1,
 'feature_natural_gas_price': 0,
 'feature_price_anomaly': 0,
 'feature_price_boxcox': 1,
 'feature_price_change_1d': 0,
 'feature_price_ewm_alpha_0.5': 1,
 'feature_price_lag_1d': 0,
 'feature_price_lag_30d': 1,
 'feature_price_lag_7d': 1,
 'feature_


