In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split

# ---------------------------------------------------------------------------
# National electricity demand forecasting with a Random Forest.
#
# Pipeline: load CSV -> calendar / lag / rolling features -> chronological
# train/test split -> grid search with forward-chaining CV -> RMSE + plot.
#
# Every feature for a row is built only from *earlier* rows, and every
# evaluation split keeps the test period strictly after the training period,
# so the reported error is not inflated by look-ahead leakage.
# ---------------------------------------------------------------------------

# Load the dataset
file_path = 'continuous dataset.csv'  # Update the file path as needed
data = pd.read_csv(file_path)

# Convert the 'datetime' column to a datetime format
data['datetime'] = pd.to_datetime(data['datetime'], format='%d-%m-%Y %H:%M')

# shift()/rolling() below operate on row position, so the rows must be in
# chronological order. A stable sort keeps the file order for equal timestamps.
data = data.sort_values('datetime', kind='stable').reset_index(drop=True)

# Extracting date and time features
data['hour'] = data['datetime'].dt.hour
data['day_of_week'] = data['datetime'].dt.dayofweek
data['month'] = data['datetime'].dt.month

# Create lag features for 'nat_demand': the value 1..24 rows back.
# NOTE(review): this equals "the past 24 hours" only if rows are hourly and
# gap-free — confirm against the dataset.
n_lags = 24
data = data.assign(**{
    f'nat_demand_lag_{lag}': data['nat_demand'].shift(lag)
    for lag in range(1, n_lags + 1)
})

# Create rolling statistics features for 'nat_demand'.
# The series is shifted by one row first so the window covers the 24 rows
# *before* the current one; without the shift the window would contain the
# current row's nat_demand, i.e. the very value being predicted.
previous_demand = data['nat_demand'].shift(1)
data['rolling_mean_24h'] = previous_demand.rolling(window=24).mean()
data['rolling_std_24h'] = previous_demand.rolling(window=24).std()

# Drop rows with NaN values created due to lag and rolling features
# (this also drops any row that has a NaN in another column).
data = data.dropna()

# Prepare the data for modeling
X = data.drop(columns=['datetime', 'nat_demand'])
y = data['nat_demand']

# Split the data into training and testing sets.
# shuffle=False keeps the split chronological: the last 20% of rows form the
# test set. A shuffled split would place test-row targets inside the lag
# columns of neighbouring training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Train a Random Forest model with hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# TimeSeriesSplit validates each fold on data that comes after its training
# data, matching how the model is evaluated on the hold-out set. The number
# of fits is unchanged (54 candidates x 3 folds).
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=TimeSeriesSplit(n_splits=3),
                           n_jobs=-1,
                           verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred = best_model.predict(X_test)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error was deprecated and later removed from scikit-learn.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f'Root Mean Squared Error: {rmse}')

# Plot the actual vs predicted values (test rows are in chronological order)
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='Actual')
plt.plot(y_pred, label='Predicted')
plt.xlabel('Sample Index')
plt.ylabel('National Electricity Load')
plt.title('Actual vs Predicted National Electricity Load')
plt.legend()
plt.grid(True)
plt.show()


Fitting 3 folds for each of 54 candidates, totalling 162 fits
