In [419]:
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import itertools


### The Prophet Model(only)



In [None]:
energy_data = pd.read_csv('../../data/day_ahead_energy_prices.csv')
energy_data['Datetime'] = pd.to_datetime(energy_data['Datetime']).dt.tz_localize(None)
energy_data = energy_data.rename(columns={'Datetime': 'ds', 'hourly day-ahead energy price': 'y'})

split_index = int(len(energy_data) * 0.82)
mid_index = (len(energy_data) + split_index) // 2

train_data, validation_data, test_data = energy_data[:split_index], energy_data[split_index:mid_index], energy_data[mid_index:]

In [None]:
# Initialize Prophet model
model = Prophet()
model.fit(train_data) 

future = test_data[['ds']]
forecast = model.predict(future)

# Result_df
results_df = forecast.merge(test_data, on='ds', how='left')  # test_data has to include columns 'ds' and 'y'

# Root Mean Squared Error (RMSE)
mse = mean_squared_error(results_df['y'], results_df['yhat'])
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE) for the test dataset: {rmse:.2f}")

In [None]:
plt.figure(figsize=(16, 6))
plt.plot(results_df['ds'], results_df['y'], label='Actual Prices', color='blue', linewidth=2)
plt.plot(results_df['ds'], results_df['yhat'], label='Forecasted Prices', color='red', linestyle='dashed', linewidth=2)
plt.title("Actual vs Forecasted Energy Prices for Test Dataset")
plt.xlabel("Time")
plt.ylabel("Price")
plt.xticks(rotation=45)
plt.legend(loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

### Add Weather as regressor

In [None]:
# Load energy and weather data
energy_data = pd.read_csv('../../data/day_ahead_energy_prices.csv')
energy_data['Datetime'] = pd.to_datetime(energy_data['Datetime']).dt.tz_localize(None)  # Convert to datetime and remove timezone
energy_data = energy_data.rename(columns={'Datetime': 'ds', 'hourly day-ahead energy price': 'y'})  # Rename columns to match Prophet's expected column names

weather_data = pd.read_csv('../../data/germany_weather_average.csv')  # Adjust file path if needed
weather_data['date'] = pd.to_datetime(weather_data['date']).dt.tz_localize(None)
weather_data.rename(columns={'date': 'ds'}, inplace=True)

merged_data = pd.merge(energy_data, weather_data, on='ds', how='left')
train_data, validation_data, test_data = merged_data[:split_index], merged_data[split_index:mid_index], merged_data[mid_index:]

In [None]:
model = Prophet()

weather_columns = [col for col in weather_data.columns if col != 'ds'] 

for col in weather_columns:
    model.add_regressor(col)

model.fit(train_data) 
future = test_data[['ds'] + weather_columns]

# Generate forecast
forecast = model.predict(future)
results_df = forecast.merge(test_data, on='ds', how='left')

# Calculate RMSE
mse = mean_squared_error(results_df['y'], results_df['yhat'])
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

#### + Add Energy Market Mix

In [None]:
# Load data
energy_data = pd.read_csv('../../data/day_ahead_energy_prices.csv')
energy_data['Datetime'] = pd.to_datetime(energy_data['Datetime']).dt.tz_localize(None)  # Convert to datetime and remove timezone
energy_data = energy_data.rename(columns={'Datetime': 'ds', 'hourly day-ahead energy price': 'y'})  # Rename columns to match Prophet's expected column names

weather_data = pd.read_csv('../../data/germany_weather_average.csv')  # Adjust file path if needed
weather_data['date'] = pd.to_datetime(weather_data['date']).dt.tz_localize(None)
weather_data.rename(columns={'date': 'ds'}, inplace=True)

market_mix = pd.read_csv('../../data/hourly_market_metrics_cleaned.csv')
market_mix['Timestamp'] = pd.to_datetime(market_mix['Timestamp']).dt.tz_localize(None)
market_mix.rename(columns={'Timestamp': 'ds'}, inplace=True)

merged_data = pd.merge(energy_data, weather_data, on='ds', how='left')
merged_data = pd.merge(market_mix, merged_data, on= 'ds' , how= 'left')
train_data, validation_data, test_data = merged_data[:split_index], merged_data[split_index:mid_index], merged_data[mid_index:]


In [None]:
model = Prophet()

# Add all columns except 'ds' and 'y' as regressors
columns = [col for col in merged_data.columns if col not in ['ds', 'y']] 

for col in columns:
    model.add_regressor(col)

# Fit the model with training data
model.fit(train_data) 

future = test_data[['ds'] + columns] # not taking y
forecast = model.predict(future)

results_df = forecast[['ds', 'yhat']].merge(test_data[['ds', 'y']], on='ds', how='left')

# Calculate RMSE
mse = mean_squared_error(results_df['y'], results_df['yhat'])
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

In [None]:
# Plot forecast and actual values
plt.figure(figsize=(16, 6))
plt.plot(results_df['ds'], results_df['y'], label='Actual Prices', color='blue', linewidth=2)
plt.plot(results_df['ds'], results_df['yhat'], label='Forecasted Prices', color='red', linestyle='dashed', linewidth=2)
plt.title("Actual vs Forecasted Energy Prices for Test Dataset")
plt.xlabel("Time")
plt.ylabel("Price")
plt.xticks(rotation=45)
plt.legend(loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

### Create Moving Average Features


In [390]:
# Load data
energy_data = pd.read_csv('../../data/day_ahead_energy_prices.csv')
energy_data['Datetime'] = pd.to_datetime(energy_data['Datetime']).dt.tz_localize(None)  # Convert to datetime and remove timezone
energy_data = energy_data.rename(columns={'Datetime': 'ds', 'hourly day-ahead energy price': 'y'})  # Rename columns to match Prophet's expected column names

weather_data = pd.read_csv('../../data/germany_weather_average.csv')  # Adjust file path if needed
weather_data['date'] = pd.to_datetime(weather_data['date']).dt.tz_localize(None)
weather_data.rename(columns={'date': 'ds'}, inplace=True)

market_mix = pd.read_csv('../../data/hourly_market_mix_cleaned.csv')
market_mix['Timestamp'] = pd.to_datetime(market_mix['Timestamp']).dt.tz_localize(None)
market_mix.rename(columns={'Timestamp': 'ds'}, inplace=True)

merged_data = pd.merge(energy_data, weather_data, on='ds', how='left')
merged_data = pd.merge(market_mix, merged_data, on= 'ds' , how= 'left')

In [396]:
merged_data['weekofyear'].isnull().sum()

0

In [391]:
def create_features(df, include_target=True):
    df = df.copy()

    # Extract temporal features
    df['hour'] = df['ds'].dt.hour
    df['dayofweek'] = df['ds'].dt.dayofweek
    df['dayofyear'] = df['ds'].dt.dayofyear
    df['weekofyear'] = df['ds'].dt.isocalendar().week

    if include_target:
        # Moving averages for hours (if 'y' is available)
        for window in range(2, 24):  # Using range for hours from 2 to 23
            df[f'ma_{int(window)}_hours'] = df['y'].rolling(window=window).mean()

        # Moving averages for days (1 day, 3 days, 1 week)
        for window in [24, 72, 168]:  # Corresponding to 1 day, 3 days, 7 days
            df[f'ma_{int(window / 24)}_days'] = df['y'].rolling(window=window).mean()

        # Moving averages for months (assuming 30 days per month)
        for window in [1, 2, 3]:  # 1 month, 2 months, 3 months
            hours_in_month = window * 30 * 24
            df[f'ma_{int(window)}_month'] = df['y'].rolling(window=hours_in_month).mean()

        # Rolling averages for selected energy-related columns
        energy_columns = [col for col in merged_data.columns if col not in ['ds', 'y']]

        for window in [3, 6, 12]:  # 3, 6, and 12 hours
            for col in energy_columns:
                df[f'{col}_ma_{int(window)}_hours'] = df[col].rolling(window=window).mean()

        # Fill NaN values in moving averages with interpolation or medians
        for column in df.columns:
            if 'ma_' in column:  # Apply only to moving average columns
                df[column] = df[column].interpolate(method='linear').fillna(df[column].median())

    return df

merged_data = create_features(merged_data)
train_data, validation_data, test_data = merged_data[:split_index], merged_data[split_index:mid_index], merged_data[mid_index:]
energy_columns = [col for col in merged_data.columns if col not in ['ds', 'y']]



In [None]:
#  Regressors: ['dayofweek', 'dayofyear', 'ma_2_hours', 'ma_3_hours', 'ma_4_hours', 
# 'ma_8_hours', 'ma_9_hours', 'ma_10_hours', 'ma_11_hours',
#  'ma_20_hours', 'ma_21_hours', 'ma_22_hours', 'ma_23_hours', 'Hydro', 'Nuclear']

In [422]:
model = Prophet()

# Add time-based regressors
model.add_regressor('dayofweek')
model.add_regressor('dayofyear')

# Add moving average regressors
model.add_regressor('hour')
model.add_regressor('ma_2_hours') 
model.add_regressor('ma_3_hours')  
model.add_regressor('ma_4_hours')
model.add_regressor('ma_5_hours')
model.add_regressor('ma_6_hours')
model.add_regressor('ma_7_hours')
model.add_regressor('ma_8_hours')
model.add_regressor('ma_9_hours')

model.add_regressor('ma_1_days')
# model.add_regressor('ma_3_days')
model.add_regressor('ma_7_days')


model.add_regressor('Biomass') 
model.add_regressor('Hard Coal')
model.add_regressor('Hydro')
# model.add_regressor('Lignite')
# model.add_regressor('Natural Gas')
model.add_regressor('Nuclear')
# model.add_regressor('Other')
model.add_regressor('Pumped storage generation')
model.add_regressor('Solar')
model.add_regressor('Wind offshore')
model.add_regressor('Wind onshore')

# Add regressors for the provided columns
model.add_regressor('temperature_2m')
model.add_regressor('Precipitation (rain/snow)')
model.add_regressor('wind_speed_100m')
# model.add_regressor('direct_radiation')

# model.add_regressor('ma_10_hours')
# model.add_regressor('ma_11_hours')
# model.add_regressor('ma_12_hours')
# model.add_regressor('ma_13_hours')
# model.add_regressor('ma_14_hours')
# model.add_regressor('ma_15_hours')
# model.add_regressor('ma_16_hours')
# model.add_regressor('ma_17_hours')
# model.add_regressor('ma_18_hours')
# model.add_regressor('ma_19_hours')
# model.add_regressor('ma_20_hours')
# model.add_regressor('ma_21_hours')
# model.add_regressor('ma_22_hours')
# model.add_regressor('ma_23_hours')


model.fit(train_data)

future = test_data[['ds'] + energy_columns]
forecast = model.predict(future)

results_df = forecast.merge(test_data, on='ds', how='left')

# Calculate RMSE
mae = mean_absolute_error(results_df['y'], results_df['yhat'])
print(f"Mean Absolute Error (MAE): {mae:.2f}")

mse = mean_squared_error(results_df['y'], results_df['yhat'])
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


22:15:44 - cmdstanpy - INFO - Chain [1] start processing
22:16:05 - cmdstanpy - INFO - Chain [1] done processing


Mean Absolute Error (MAE): 4.21
Root Mean Squared Error (RMSE): 7.41


##### Find the best

In [421]:
# Define the list of all possible regressors
all_regressors = [
    'dayofweek', 'dayofyear', 'hour', 'ma_2_hours', 'ma_3_hours',
    'ma_4_hours', 'ma_5_hours', 'ma_6_hours', 'ma_7_hours', 'ma_8_hours', 'ma_9_hours', 'ma_10_hours', 'ma_11_hours', 
    'ma_12_hours', 'ma_13_hours', 'ma_14_hours', 'ma_15_hours', 'ma_16_hours', 'ma_17_hours', 'ma_18_hours', 'ma_19_hours',
    'ma_20_hours', 'ma_21_hours', 'ma_22_hours', 'ma_23_hours',
    
    'Biomass', 'Hard Coal', 'Hydro', 'Lignite', 'Natural Gas', 'Nuclear',
    'Other', 'Pumped storage generation', 'Solar', 'Wind offshore', 'Wind onshore', 
    
    'temperature_2m', 'Precipitation (rain/snow)',
    'wind_speed_100m', 'direct_radiation'
]

# Initialize variables to track the best RMSE and regressors
best_rmse = float('inf')
active_regressors = []

# Iterate over all regressors
for regressor in all_regressors:
    try:
        # Initialize a new Prophet model
        model = Prophet()

        # Add currently active regressors
        for active_regressor in active_regressors:
            model.add_regressor(active_regressor)

        # Temporarily add the new regressor
        model.add_regressor(regressor)

        # Fit the model with training data
        model.fit(train_data)

        # Prepare future dataframe with active regressors
        future = test_data[['ds'] + active_regressors + [regressor]]

        # Generate forecast
        forecast = model.predict(future)

        # Merge forecast with test data for evaluation
        results_df = forecast.merge(test_data, on='ds', how='left')

        # Calculate RMSE
        mse = mean_squared_error(results_df['y'], results_df['yhat'])
        rmse = np.sqrt(mse)

        print(f"Regressor: {regressor} | RMSE: {rmse:.2f}")

        # Check if the new regressor improves RMSE
        if rmse < best_rmse:
            best_rmse = rmse
            active_regressors.append(regressor)  # Keep this regressor active
            print(f"Regressor '{regressor}' improves RMSE. Keeping it active.")
        else:
            print(f"Regressor '{regressor}' does not improve RMSE. Discarding it.")

    except Exception as e:
        print(f"Error with regressor '{regressor}': {e}")

# Final list of active regressors
print(f"\nBest RMSE: {best_rmse:.2f}")
print(f"Active Regressors: {active_regressors}")


21:48:19 - cmdstanpy - INFO - Chain [1] start processing
21:49:22 - cmdstanpy - INFO - Chain [1] done processing


Regressor: dayofweek | RMSE: 105.82
Regressor 'dayofweek' improves RMSE. Keeping it active.


21:49:26 - cmdstanpy - INFO - Chain [1] start processing
21:51:37 - cmdstanpy - INFO - Chain [1] done processing


Regressor: dayofyear | RMSE: 95.48
Regressor 'dayofyear' improves RMSE. Keeping it active.


21:51:41 - cmdstanpy - INFO - Chain [1] start processing
21:53:24 - cmdstanpy - INFO - Chain [1] done processing


Regressor: hour | RMSE: 96.12
Regressor 'hour' does not improve RMSE. Discarding it.


21:53:30 - cmdstanpy - INFO - Chain [1] start processing
21:53:45 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_2_hours | RMSE: 9.74
Regressor 'ma_2_hours' improves RMSE. Keeping it active.


21:53:51 - cmdstanpy - INFO - Chain [1] start processing
21:54:17 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_3_hours | RMSE: 7.87
Regressor 'ma_3_hours' improves RMSE. Keeping it active.


21:54:20 - cmdstanpy - INFO - Chain [1] start processing
21:54:44 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_4_hours | RMSE: 7.57
Regressor 'ma_4_hours' improves RMSE. Keeping it active.


21:54:49 - cmdstanpy - INFO - Chain [1] start processing
21:55:14 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_5_hours | RMSE: 7.62
Regressor 'ma_5_hours' does not improve RMSE. Discarding it.


21:55:18 - cmdstanpy - INFO - Chain [1] start processing
21:55:42 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_6_hours | RMSE: 7.61
Regressor 'ma_6_hours' does not improve RMSE. Discarding it.


21:55:45 - cmdstanpy - INFO - Chain [1] start processing
21:56:11 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_7_hours | RMSE: 7.58
Regressor 'ma_7_hours' does not improve RMSE. Discarding it.


21:56:15 - cmdstanpy - INFO - Chain [1] start processing
21:56:38 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_8_hours | RMSE: 7.57
Regressor 'ma_8_hours' improves RMSE. Keeping it active.


21:56:43 - cmdstanpy - INFO - Chain [1] start processing
21:57:06 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_9_hours | RMSE: 7.50
Regressor 'ma_9_hours' improves RMSE. Keeping it active.


21:57:12 - cmdstanpy - INFO - Chain [1] start processing
21:57:32 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_10_hours | RMSE: 7.49
Regressor 'ma_10_hours' improves RMSE. Keeping it active.


21:57:36 - cmdstanpy - INFO - Chain [1] start processing
21:58:08 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_11_hours | RMSE: 7.48
Regressor 'ma_11_hours' improves RMSE. Keeping it active.


21:58:12 - cmdstanpy - INFO - Chain [1] start processing
21:58:44 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_14_hours | RMSE: 7.50
Regressor 'ma_14_hours' does not improve RMSE. Discarding it.


21:58:48 - cmdstanpy - INFO - Chain [1] start processing
21:59:13 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_15_hours | RMSE: 7.52
Regressor 'ma_15_hours' does not improve RMSE. Discarding it.


21:59:17 - cmdstanpy - INFO - Chain [1] start processing
21:59:44 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_16_hours | RMSE: 7.53
Regressor 'ma_16_hours' does not improve RMSE. Discarding it.


21:59:48 - cmdstanpy - INFO - Chain [1] start processing
22:00:10 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_17_hours | RMSE: 7.51
Regressor 'ma_17_hours' does not improve RMSE. Discarding it.


22:00:15 - cmdstanpy - INFO - Chain [1] start processing
22:00:35 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_18_hours | RMSE: 7.49
Regressor 'ma_18_hours' does not improve RMSE. Discarding it.


22:00:39 - cmdstanpy - INFO - Chain [1] start processing
22:00:59 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_19_hours | RMSE: 7.48
Regressor 'ma_19_hours' does not improve RMSE. Discarding it.


22:01:03 - cmdstanpy - INFO - Chain [1] start processing
22:01:28 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_20_hours | RMSE: 7.47
Regressor 'ma_20_hours' improves RMSE. Keeping it active.


22:01:33 - cmdstanpy - INFO - Chain [1] start processing
22:01:54 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_21_hours | RMSE: 7.35
Regressor 'ma_21_hours' improves RMSE. Keeping it active.


22:01:58 - cmdstanpy - INFO - Chain [1] start processing
22:02:30 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_22_hours | RMSE: 7.35
Regressor 'ma_22_hours' improves RMSE. Keeping it active.


22:02:35 - cmdstanpy - INFO - Chain [1] start processing
22:02:56 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_23_hours | RMSE: 7.33
Regressor 'ma_23_hours' improves RMSE. Keeping it active.


22:03:01 - cmdstanpy - INFO - Chain [1] start processing
22:03:27 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_12_hours | RMSE: 7.33
Regressor 'ma_12_hours' does not improve RMSE. Discarding it.


22:03:31 - cmdstanpy - INFO - Chain [1] start processing
22:03:50 - cmdstanpy - INFO - Chain [1] done processing


Regressor: ma_13_hours | RMSE: 7.33
Regressor 'ma_13_hours' does not improve RMSE. Discarding it.


22:03:54 - cmdstanpy - INFO - Chain [1] start processing
22:04:14 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Biomass | RMSE: 7.36
Regressor 'Biomass' does not improve RMSE. Discarding it.


22:04:18 - cmdstanpy - INFO - Chain [1] start processing
22:04:45 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Hard Coal | RMSE: 7.33
Regressor 'Hard Coal' does not improve RMSE. Discarding it.


22:04:49 - cmdstanpy - INFO - Chain [1] start processing
22:05:15 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Hydro | RMSE: 7.31
Regressor 'Hydro' improves RMSE. Keeping it active.


22:05:19 - cmdstanpy - INFO - Chain [1] start processing
22:05:54 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Lignite | RMSE: 7.32
Regressor 'Lignite' does not improve RMSE. Discarding it.


22:06:02 - cmdstanpy - INFO - Chain [1] start processing
22:06:26 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Natural Gas | RMSE: 7.35
Regressor 'Natural Gas' does not improve RMSE. Discarding it.


22:06:33 - cmdstanpy - INFO - Chain [1] start processing
22:07:01 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Nuclear | RMSE: 7.30
Regressor 'Nuclear' improves RMSE. Keeping it active.


22:07:08 - cmdstanpy - INFO - Chain [1] start processing
22:07:36 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Other | RMSE: 7.33
Regressor 'Other' does not improve RMSE. Discarding it.


22:07:40 - cmdstanpy - INFO - Chain [1] start processing
22:08:05 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Pumped storage generation | RMSE: 7.30
Regressor 'Pumped storage generation' does not improve RMSE. Discarding it.


22:08:09 - cmdstanpy - INFO - Chain [1] start processing
22:08:36 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Solar | RMSE: 7.31
Regressor 'Solar' does not improve RMSE. Discarding it.


22:08:40 - cmdstanpy - INFO - Chain [1] start processing
22:09:13 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Wind offshore | RMSE: 7.30
Regressor 'Wind offshore' does not improve RMSE. Discarding it.


22:09:19 - cmdstanpy - INFO - Chain [1] start processing
22:09:46 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Wind onshore | RMSE: 7.32
Regressor 'Wind onshore' does not improve RMSE. Discarding it.


22:09:50 - cmdstanpy - INFO - Chain [1] start processing
22:10:15 - cmdstanpy - INFO - Chain [1] done processing


Regressor: temperature_2m | RMSE: 7.31
Regressor 'temperature_2m' does not improve RMSE. Discarding it.


22:10:22 - cmdstanpy - INFO - Chain [1] start processing
22:10:49 - cmdstanpy - INFO - Chain [1] done processing


Regressor: Precipitation (rain/snow) | RMSE: 7.31
Regressor 'Precipitation (rain/snow)' does not improve RMSE. Discarding it.


22:10:56 - cmdstanpy - INFO - Chain [1] start processing
22:11:24 - cmdstanpy - INFO - Chain [1] done processing


Regressor: wind_speed_100m | RMSE: 7.34
Regressor 'wind_speed_100m' does not improve RMSE. Discarding it.


22:11:29 - cmdstanpy - INFO - Chain [1] start processing
22:12:00 - cmdstanpy - INFO - Chain [1] done processing


Regressor: direct_radiation | RMSE: 7.31
Regressor 'direct_radiation' does not improve RMSE. Discarding it.

Best RMSE: 7.30
Active Regressors: ['dayofweek', 'dayofyear', 'ma_2_hours', 'ma_3_hours', 'ma_4_hours', 'ma_8_hours', 'ma_9_hours', 'ma_10_hours', 'ma_11_hours', 'ma_20_hours', 'ma_21_hours', 'ma_22_hours', 'ma_23_hours', 'Hydro', 'Nuclear']


In [424]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Tüm regresörler
regressors = [
    'dayofweek', 'dayofyear', 'hour', 'ma_2_hours', 'ma_3_hours',
    'ma_4_hours', 'ma_5_hours', 'ma_6_hours', 'ma_7_hours', 'ma_8_hours', 'ma_9_hours', 'ma_10_hours', 'ma_11_hours', 
    'ma_12_hours', 'ma_13_hours', 'ma_14_hours', 'ma_15_hours', 'ma_16_hours', 'ma_17_hours', 'ma_18_hours', 'ma_19_hours',
    'ma_20_hours', 'ma_21_hours', 'ma_22_hours', 'ma_23_hours',
    
    'Biomass', 'Hard Coal', 'Hydro', 'Lignite', 'Natural Gas', 'Nuclear',
    'Other', 'Pumped storage generation', 'Solar', 'Wind offshore', 'Wind onshore', 
    
    'temperature_2m', 'Precipitation (rain/snow)',
    'wind_speed_100m', 'direct_radiation'
]
# Hedef değişken ve regresörler
X = train_data[regressors]
y = train_data['y']

# Feature selection için SelectKBest kullan
selector = SelectKBest(score_func=mutual_info_regression, k=15)
X_selected = selector.fit_transform(X, y)

# Seçilen özelliklerin adlarını al
selected_features = np.array(regressors)[selector.get_support()]
print("Seçilen Özellikler:", selected_features)

# Prophet modelini bu özelliklerle eğit
model = Prophet()
for feature in selected_features:
    model.add_regressor(feature)

# Modeli eğit
model.fit(train_data)

# Test verisi ile tahmin yap
future = test_data[['ds'] + list(selected_features)]
forecast = model.predict(future)

# Tahmin sonuçlarını birleştir
results_df = forecast.merge(test_data, on='ds', how='left')

# Performans metriklerini hesapla
mae = mean_absolute_error(results_df['y'], results_df['yhat'])
mse = mean_squared_error(results_df['y'], results_df['yhat'])
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Seçilen Özellikler: ['ma_2_hours' 'ma_3_hours' 'ma_4_hours' 'ma_5_hours' 'ma_6_hours'
 'ma_7_hours' 'ma_8_hours' 'ma_9_hours' 'ma_10_hours' 'ma_11_hours'
 'ma_12_hours' 'ma_13_hours' 'ma_14_hours' 'ma_15_hours' 'ma_16_hours']


22:20:59 - cmdstanpy - INFO - Chain [1] start processing
22:21:45 - cmdstanpy - INFO - Chain [1] done processing


Mean Absolute Error (MAE): 4.37
Root Mean Squared Error (RMSE): 7.53


In [423]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Tüm regresörler
regressors = [
    'dayofweek', 'dayofyear', 'hour', 'ma_2_hours', 'ma_3_hours',
    'ma_4_hours', 'ma_5_hours', 'ma_6_hours', 'ma_7_hours', 'ma_8_hours', 'ma_9_hours', 'ma_10_hours', 'ma_11_hours', 
    'ma_12_hours', 'ma_13_hours', 'ma_14_hours', 'ma_15_hours', 'ma_16_hours', 'ma_17_hours', 'ma_18_hours', 'ma_19_hours',
    'ma_20_hours', 'ma_21_hours', 'ma_22_hours', 'ma_23_hours',
    
    'Biomass', 'Hard Coal', 'Hydro', 'Lignite', 'Natural Gas', 'Nuclear',
    'Other', 'Pumped storage generation', 'Solar', 'Wind offshore', 'Wind onshore', 
    
    'temperature_2m', 'Precipitation (rain/snow)',
    'wind_speed_100m', 'direct_radiation'
]
# Listeyi optimize edecek bir fonksiyon
def optimize_regressors(train_data, test_data, regressors, num_features=15):
    best_rmse = float('inf')
    best_combination = None
    
    # Tüm kombinasyonları denemek yerine seçilebilir alt kümelere odaklanalım
    from itertools import combinations
    for subset in combinations(regressors, num_features):
        # Prophet modelini her bir alt küme için oluştur
        model = Prophet()
        for regressor in subset:
            model.add_regressor(regressor)
        
        # Modeli eğit
        model.fit(train_data)
        
        # Tahmin yap
        future = test_data[['ds'] + list(subset)]
        forecast = model.predict(future)
        results_df = forecast.merge(test_data, on='ds', how='left')
        
        # RMSE'yi hesapla
        mse = mean_squared_error(results_df['y'], results_df['yhat'])
        rmse = np.sqrt(mse)
        
        # En iyi kombinasyonu kontrol et
        if rmse < best_rmse:
            best_rmse = rmse
            best_combination = subset
    
    return best_combination, best_rmse

# En iyi kombinasyonu bul
best_features, best_rmse = optimize_regressors(train_data, test_data, regressors, num_features=15)

print("En iyi regresör kombinasyonu:", best_features)
print(f"En iyi RMSE: {best_rmse:.2f}")


22:19:15 - cmdstanpy - INFO - Chain [1] start processing
22:19:47 - cmdstanpy - INFO - Chain [1] done processing
22:19:52 - cmdstanpy - INFO - Chain [1] start processing


KeyboardInterrupt: 