In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [14]:
# load data from data_no2_boosted.csv
data = pd.read_csv('data_no2_boosted.csv')
data.head()

Unnamed: 0,date,spot_price,krs_temp_2m,krs_app_temp,stv_temp_2m,stv_app_temp,gas_price
0,2015-12-31 23:00:00,16.39,6.3,3.3,7.3,1.4,2.28
1,2016-01-01 00:00:00,16.04,6.1,3.4,6.9,0.9,2.28
2,2016-01-01 01:00:00,15.74,6.3,3.3,7.0,0.9,2.28
3,2016-01-01 02:00:00,15.57,6.5,3.0,7.4,1.1,2.28
4,2016-01-01 03:00:00,15.47,6.7,2.8,8.0,1.8,2.28


In [15]:
# Convert the date column to datetime format if it's not already
data['date'] = pd.to_datetime(data['date'])

# Extract time-based features
data['hour'] = data['date'].dt.hour
data['day_of_week'] = data['date'].dt.dayofweek  # Monday=0, Sunday=6
data['month'] = data['date'].dt.month
data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)  # 1 if weekend, else 0

# Display the updated DataFrame with new features
data.head()

Unnamed: 0,date,spot_price,krs_temp_2m,krs_app_temp,stv_temp_2m,stv_app_temp,gas_price,hour,day_of_week,month,is_weekend
0,2015-12-31 23:00:00,16.39,6.3,3.3,7.3,1.4,2.28,23,3,12,0
1,2016-01-01 00:00:00,16.04,6.1,3.4,6.9,0.9,2.28,0,4,1,0
2,2016-01-01 01:00:00,15.74,6.3,3.3,7.0,0.9,2.28,1,4,1,0
3,2016-01-01 02:00:00,15.57,6.5,3.0,7.4,1.1,2.28,2,4,1,0
4,2016-01-01 03:00:00,15.47,6.7,2.8,8.0,1.8,2.28,3,4,1,0


In [16]:
# Define features and target
features = ['krs_temp_2m', 'krs_app_temp', 'stv_temp_2m', 'stv_app_temp', 
            'gas_price', 'hour', 'day_of_week', 'month', 'is_weekend']
target = 'spot_price'


In [20]:
# Define rolling forecast parameters
n_days = 208  # Number of days to simulate
forecast_horizon = 24  # Forecast horizon in hours (next day)

# Initialize lists to store predictions and true values
forecast_results = []

# Start rolling forecast
for i in range(n_days):
    # Print progress
    print(f"Day {i+1} of {n_days}")

    # Define the end index for the current training data slice
    end_index = -(n_days - i) * forecast_horizon

    # Define the start index to include exactly 364 * 24 rows
    start_index = end_index - 364 * 24

    # Create training data subset (using only data up to the current day)
    train_data = data.iloc[start_index:end_index].copy()
    X_train = train_data[features]
    y_train = train_data[target]

    # Initialize and train the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, y_train)

    # Define the test set for the next day
    X_test = data[features].iloc[end_index:end_index + forecast_horizon]
    y_test = data[target].iloc[end_index:end_index + forecast_horizon]

    # Predict the next 24 hours
    y_pred = rf_model.predict(X_test)

    # Store the predictions and the actual values for this day
    forecast_results.append(pd.DataFrame({
        'ds': data['date'].iloc[end_index:end_index + forecast_horizon].values,
        'yhat': y_pred,
        'y': y_test.values
    }))

    # Expand the training data by adding the actual test data (simulating real-time data availability)
    data.loc[end_index:end_index + forecast_horizon - 1, target] = y_test

Day 1 of 208
Day 2 of 208
Day 3 of 208
Day 4 of 208
Day 5 of 208
Day 6 of 208
Day 7 of 208
Day 8 of 208
Day 9 of 208
Day 10 of 208
Day 11 of 208
Day 12 of 208
Day 13 of 208
Day 14 of 208
Day 15 of 208
Day 16 of 208
Day 17 of 208
Day 18 of 208
Day 19 of 208
Day 20 of 208
Day 21 of 208
Day 22 of 208
Day 23 of 208
Day 24 of 208
Day 25 of 208
Day 26 of 208
Day 27 of 208
Day 28 of 208
Day 29 of 208
Day 30 of 208
Day 31 of 208
Day 32 of 208
Day 33 of 208
Day 34 of 208
Day 35 of 208
Day 36 of 208
Day 37 of 208
Day 38 of 208
Day 39 of 208
Day 40 of 208
Day 41 of 208
Day 42 of 208
Day 43 of 208
Day 44 of 208
Day 45 of 208
Day 46 of 208
Day 47 of 208
Day 48 of 208
Day 49 of 208
Day 50 of 208
Day 51 of 208
Day 52 of 208
Day 53 of 208
Day 54 of 208
Day 55 of 208
Day 56 of 208
Day 57 of 208
Day 58 of 208
Day 59 of 208
Day 60 of 208
Day 61 of 208
Day 62 of 208
Day 63 of 208
Day 64 of 208
Day 65 of 208
Day 66 of 208
Day 67 of 208
Day 68 of 208
Day 69 of 208
Day 70 of 208
Day 71 of 208
Day 72 of 208
D

ValueError: Found array with 0 sample(s) (shape=(0, 9)) while a minimum of 1 is required by RandomForestRegressor.

In [21]:
# Concatenate all daily forecasts
forecast_df = pd.concat(forecast_results, ignore_index=True)

# Calculate final MAE and RMSE for the entire forecast period
mae = mean_absolute_error(forecast_df['y'], forecast_df['yhat'])
rmse = np.sqrt(mean_squared_error(forecast_df['y'], forecast_df['yhat']))

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 6.637227324879227
Root Mean Squared Error (RMSE): 9.601385577442691


Mean Absolute Error (MAE): 16.856599108573718
Root Mean Squared Error (RMSE): 19.1763193410751