# ⚡ Electricity Demand Forecasting

Welcome to the **Electricity Demand Forecasting** notebook! This notebook demonstrates a complete workflow for forecasting electricity demand using:
- **Naive Baseline**
- **XGBoost**
- **LSTM Neural Network**
- **Ensemble Model**

You'll learn how to load and preprocess data, train and evaluate models, and compare their performance visually. Each step is clearly marked and explained for easy navigation.

---

## Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import xgboost as xgb
import math
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import joblib
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import json
from datetime import datetime, timedelta
import warnings

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Ignore warnings and set random seed

In [2]:
# Silence the warning categories that would otherwise clutter notebook output.
# The filters are registered in the same order as before.
for _ignored_category in (FutureWarning, pd.errors.PerformanceWarning, RuntimeWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)
del _ignored_category  # do not leave the loop variable in the notebook namespace

# Seed NumPy and TensorFlow with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

## ElectricityDemandForecaster Class: Modular Implementation

The following cells define the `ElectricityDemandForecaster` class in a modular way. Each function is presented in its own cell, with explanations and usage tips. This structure makes the notebook easy to navigate, test, and extend.

### Class Initialization

Define the `ElectricityDemandForecaster` class and its constructor. This sets up the main attributes and prepares the environment for forecasting.

In [None]:
class ElectricityDemandForecaster:
    """
    Forecast electricity demand with a naive baseline, XGBoost, an LSTM and a
    weighted XGBoost/LSTM ensemble.

    Typical usage is ``ElectricityDemandForecaster(path).run_pipeline()``; the
    individual steps can also be called one by one in the order used by
    ``run_pipeline``. Results are accumulated in ``self.models``,
    ``self.predictions`` and ``self.metrics``, keyed by model name.
    """

    def __init__(self, data_path=None, forecast_horizon=24):
        """
        Args:
            data_path: Path of the CSV file read by ``load_data``.
            forecast_horizon: Number of rows the naive baseline looks back.
        """
        self.data_path = data_path
        self.forecast_horizon = forecast_horizon
        self.df = None
        self.train_df = None
        self.test_df = None
        self.feature_columns = None
        self.target_column = 'demand'
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        # Scalers are created by preprocess_data(); None until then.
        self.feature_scaler = None
        self.target_scaler = None
        self.models = {}
        self.predictions = {}
        self.metrics = {}
        os.makedirs('models', exist_ok=True)

    def load_data(self, data_path=None):
        """
        Read the CSV file into ``self.df`` and print a short summary.

        Args:
            data_path: Optional path overriding the one given at construction.

        Returns:
            The loaded DataFrame.

        Raises:
            ValueError: If no data path is available.
        """
        if data_path:
            self.data_path = data_path
        if not self.data_path:
            raise ValueError("No data path provided. Pass data_path to the constructor or to load_data().")
        print(f"Loading data from: {self.data_path}")
        self.df = pd.read_csv(self.data_path)
        missing_values = self.df.isnull().sum()
        print("\nMissing values in each column:")
        print(missing_values)
        print("\nDataset info:")
        print(f"Number of rows: {self.df.shape[0]}")
        print(f"Number of columns: {self.df.shape[1]}")
        print("\nColumn names:")
        print(self.df.columns.tolist())
        return self.df

    def preprocess_data(self):
        """
        Parse timestamps, add calendar features, forward-fill gaps and scale.

        Every column other than ``timestamp`` and the target is treated as a
        feature and standardised; the target is min-max scaled.

        NOTE(review): both scalers are fitted on the full dataset, i.e. before
        ``split_train_test`` runs, so test-set statistics influence the scaling.
        Also, forward fill leaves any NaNs at the very start of the data in
        place, and all feature columns are assumed to be numeric.

        Returns:
            The preprocessed DataFrame (also stored in ``self.df``).

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        # is_datetime64_any_dtype also accepts timezone-aware columns, which
        # np.issubdtype cannot interpret.
        if not pd.api.types.is_datetime64_any_dtype(self.df['timestamp']):
            self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])
        # Chronological order is required by every later step.
        self.df = self.df.sort_values('timestamp').reset_index(drop=True)
        # Feature engineering: calendar features derived from the timestamp.
        timestamps = self.df['timestamp'].dt
        self.df['hour'] = timestamps.hour
        self.df['dayofweek'] = timestamps.dayofweek
        self.df['month'] = timestamps.month
        self.df['is_weekend'] = self.df['dayofweek'].isin([5, 6]).astype(int)
        # fillna(method='ffill') is deprecated in pandas 2.x; ffill() is the
        # supported spelling.
        self.df = self.df.ffill()
        # Feature columns: everything except the timestamp and the target.
        excluded = ('timestamp', self.target_column)
        self.feature_columns = [col for col in self.df.columns if col not in excluded]
        # Scale features and target.
        self.feature_scaler = StandardScaler()
        self.target_scaler = MinMaxScaler()
        self.df[self.feature_columns] = self.feature_scaler.fit_transform(self.df[self.feature_columns])
        self.df[self.target_column] = self.target_scaler.fit_transform(self.df[[self.target_column]])
        return self.df

    def split_train_test(self, test_size=0.2):
        """
        Split the data chronologically: the last ``test_size`` fraction of the
        rows becomes the test set, the rest the training set.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.

        Raises:
            ValueError: If no data is available, or if ``test_size`` would
                leave the training or the test set empty.
        """
        if self.df is None:
            raise ValueError("Data not loaded or preprocessed.")
        n_rows = len(self.df)
        n_test = int(n_rows * test_size)
        # n_test == 0 would make iloc[:-0] return an empty training set and
        # iloc[-0:] the whole frame, so reject degenerate splits explicitly.
        if n_test < 1 or n_test >= n_rows:
            raise ValueError(
                f"test_size={test_size} gives {n_test} test rows out of {n_rows}; "
                "both the training and the test set must be non-empty."
            )
        split_at = n_rows - n_test
        self.train_df = self.df.iloc[:split_at]
        self.test_df = self.df.iloc[split_at:]
        self.X_train = self.train_df[self.feature_columns]
        self.y_train = self.train_df[self.target_column]
        self.X_test = self.test_df[self.feature_columns]
        self.y_test = self.test_df[self.target_column]
        return self.X_train, self.X_test, self.y_train, self.y_test

    def implement_naive_forecast(self):
        """
        Naive baseline: predict the target observed ``forecast_horizon`` rows
        earlier.

        The first ``forecast_horizon`` test rows take their prediction from the
        end of the training set. Only when the training set is too short (or
        missing) are the remaining gaps back-filled from later predictions.

        Returns:
            Series of predictions indexed like the test set.

        Raises:
            ValueError: If the data has not been split yet.
        """
        if self.test_df is None:
            raise ValueError("Data not split. Call split_train_test() first.")
        horizon = self.forecast_horizon
        y_true = self.test_df[self.target_column]
        # Use the tail of the training target as history so the first test
        # predictions do not have to be borrowed from the test set itself.
        n_history = 0
        series = y_true
        if self.train_df is not None:
            train_target = self.train_df[self.target_column]
            n_history = min(max(horizon, 0), len(train_target))
            if n_history > 0:
                history = train_target.iloc[len(train_target) - n_history:]
                series = pd.concat([history, y_true])
        y_pred = series.shift(horizon).iloc[n_history:].bfill()
        mae = mean_absolute_error(y_true, y_pred)
        rmse = math.sqrt(mean_squared_error(y_true, y_pred))
        mape = mean_absolute_percentage_error(y_true, y_pred)
        self.predictions['naive'] = y_pred.values
        self.metrics['naive'] = {'mae': mae, 'rmse': rmse, 'mape': mape}
        print(f"Naive Forecast - MAE: {mae:.4f}, RMSE: {rmse:.4f}, MAPE: {mape:.4f}")
        return y_pred

    def create_features_targets(self, data, lag_hours=24, window_size=7*24):
        """
        Build a supervised dataset with lagged and rolling target features.

        Adds ``lag_1`` .. ``lag_<lag_hours>`` plus ``rolling_mean`` and
        ``rolling_std`` over ``window_size`` rows, then drops every row that
        contains a NaN. The input frame is not modified.

        Returns:
            Tuple ``(X, y)`` of the feature frame and the target series.
        """
        df = data.copy()
        target = df[self.target_column]
        for lag in range(1, lag_hours + 1):
            df[f'lag_{lag}'] = target.shift(lag)
        rolling = target.rolling(window=window_size)
        df['rolling_mean'] = rolling.mean()
        df['rolling_std'] = rolling.std()
        df = df.dropna().reset_index(drop=True)
        feature_cols = [col for col in df.columns if col not in ['timestamp', self.target_column]]
        return df[feature_cols], df[self.target_column]

    def implement_xgboost(self):
        """
        Fit an XGBoost regressor on the training split and score it on the
        test split.

        Returns:
            Array of test-set predictions.

        Raises:
            ValueError: If the data has not been split yet.
        """
        if self.X_train is None or self.X_test is None:
            raise ValueError("Data not split. Call split_train_test() first.")
        X_train, y_train = self.X_train, self.y_train
        X_test, y_test = self.X_test, self.y_test
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = math.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)
        self.models['xgboost'] = model
        self.predictions['xgboost'] = y_pred
        self.metrics['xgboost'] = {'mae': mae, 'rmse': rmse, 'mape': mape}
        print(f"XGBoost - MAE: {mae:.4f}, RMSE: {rmse:.4f}, MAPE: {mape:.4f}")
        return y_pred

    def create_sequences(self, data, seq_length, target_index=0):
        """
        Slice a 2-D array into overlapping windows for sequence models.

        Args:
            data: 2-D array of shape ``(rows, columns)``.
            seq_length: Number of consecutive rows in each window.
            target_index: Column holding the value to predict; the target of a
                window is that column in the row right after the window.
                Defaults to 0 for backward compatibility.

        Returns:
            Tuple ``(X, y)``; for non-empty output ``X`` has shape
            ``(n, seq_length, columns)`` and ``y`` shape ``(n,)`` with
            ``n = rows - seq_length``.
        """
        n_windows = len(data) - seq_length
        X = [data[start:start + seq_length, :] for start in range(n_windows)]
        y = [data[start + seq_length, target_index] for start in range(n_windows)]
        return np.array(X), np.array(y)

    def implement_lstm(self, seq_length=24, epochs=30, batch_size=64):
        """
        Train a two-layer LSTM on windows of ``seq_length`` rows and score it
        on the test split.

        Each window contains all features plus the target (stacked as the last
        column); the model predicts the target of the row following the window.
        The test windows are seeded with the last ``seq_length`` training rows
        so that, when enough training rows exist, a prediction is produced for
        every test row.

        Returns:
            1-D array of test predictions.

        Raises:
            ValueError: If the data has not been split yet.
        """
        if self.X_train is None or self.X_test is None:
            raise ValueError("Data not split. Call split_train_test() first.")
        # Prepare data for LSTM: features first, target as the LAST column.
        train_data = np.hstack([self.X_train.values, self.y_train.values.reshape(-1, 1)])
        test_data = np.hstack([self.X_test.values, self.y_test.values.reshape(-1, 1)])
        # The target sits in the last column, so tell create_sequences to read
        # it from there instead of from column 0 (which is a feature).
        X_lstm, y_lstm = self.create_sequences(train_data, seq_length, target_index=-1)
        n_context = min(seq_length, len(train_data))
        context = train_data[len(train_data) - n_context:]
        X_test_lstm, y_test_lstm = self.create_sequences(
            np.vstack([context, test_data]), seq_length, target_index=-1
        )
        # Model
        model = Sequential([
            LSTM(64, input_shape=(seq_length, X_lstm.shape[2]), return_sequences=True),
            Dropout(0.2),
            LSTM(32),
            Dropout(0.2),
            Dense(1)
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mae')
        es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(X_lstm, y_lstm, validation_split=0.1, epochs=epochs, batch_size=batch_size, callbacks=[es], verbose=0)
        y_pred = model.predict(X_test_lstm).flatten()
        mae = mean_absolute_error(y_test_lstm, y_pred)
        rmse = math.sqrt(mean_squared_error(y_test_lstm, y_pred))
        mape = mean_absolute_percentage_error(y_test_lstm, y_pred)
        self.models['lstm'] = model
        self.predictions['lstm'] = y_pred
        self.metrics['lstm'] = {'mae': mae, 'rmse': rmse, 'mape': mape}
        print(f"LSTM - MAE: {mae:.4f}, RMSE: {rmse:.4f}, MAPE: {mape:.4f}")
        return y_pred

    def create_ensemble(self, weights_options=None):
        """
        Combine XGBoost and LSTM predictions with the best of several weights.

        Each ``(w_xgb, w_lstm)`` pair is scored by MAE on the test target and
        the pair with the lowest MAE is kept. If the two prediction arrays
        differ in length they are aligned on their last common rows, because a
        shorter sequence-model output covers the end of the test set.

        NOTE(review): the weights are selected on the same test data they are
        evaluated on, so the reported ensemble score is optimistic.

        Returns:
            Array of ensemble predictions for the best weight pair.

        Raises:
            ValueError: If XGBoost or LSTM predictions are missing, or if
                ``weights_options`` is empty.
        """
        if 'xgboost' not in self.predictions or 'lstm' not in self.predictions:
            raise ValueError("Run both XGBoost and LSTM models first.")
        xgb_pred = np.asarray(self.predictions['xgboost'])
        lstm_pred = np.asarray(self.predictions['lstm'])
        y_all = np.asarray(self.y_test)
        # Align everything on the shared tail; adding arrays of different
        # lengths would otherwise fail to broadcast.
        n_common = min(len(xgb_pred), len(lstm_pred), len(y_all))
        xgb_pred = xgb_pred[len(xgb_pred) - n_common:]
        lstm_pred = lstm_pred[len(lstm_pred) - n_common:]
        y_true = y_all[len(y_all) - n_common:]
        if weights_options is None:
            weights_options = [(0.5, 0.5), (0.6, 0.4), (0.4, 0.6)]
        best_mae = float('inf')
        best_weights = None
        best_pred = None
        for w_xgb, w_lstm in weights_options:
            ensemble_pred = w_xgb * xgb_pred + w_lstm * lstm_pred
            mae = mean_absolute_error(y_true, ensemble_pred)
            if mae < best_mae:
                best_mae = mae
                best_weights = (w_xgb, w_lstm)
                best_pred = ensemble_pred
        if best_pred is None:
            raise ValueError("weights_options must contain at least one (w_xgb, w_lstm) pair.")
        # Report the same metrics as the individual models so that the
        # comparison table has no gaps for the ensemble.
        rmse = math.sqrt(mean_squared_error(y_true, best_pred))
        mape = mean_absolute_percentage_error(y_true, best_pred)
        self.predictions['ensemble'] = best_pred
        self.metrics['ensemble'] = {'mae': best_mae, 'rmse': rmse, 'mape': mape, 'weights': best_weights}
        print(f"Ensemble - MAE: {best_mae:.4f}, Weights: {best_weights}")
        return best_pred

    def compare_models(self):
        """
        Print a table of the collected metrics and show a bar chart of MAE.

        Returns:
            DataFrame with one row per model (columns Model, MAE, RMSE, MAPE).
        """
        results = [
            {
                'Model': model_name,
                'MAE': metric['mae'],
                'RMSE': metric.get('rmse', None),
                'MAPE': metric.get('mape', None),
            }
            for model_name, metric in self.metrics.items()
        ]
        results_df = pd.DataFrame(results)
        print(results_df)
        plt.figure(figsize=(8, 4))
        sns.barplot(x='Model', y='MAE', data=results_df)
        plt.title('Model MAE Comparison')
        plt.show()
        return results_df

    def save_models(self, output_dir='models'):
        """
        Persist the trained models, the fitted scalers and the ensemble
        weights to ``output_dir``. Artifacts that do not exist yet are skipped.
        """
        os.makedirs(output_dir, exist_ok=True)
        if 'xgboost' in self.models:
            joblib.dump(self.models['xgboost'], os.path.join(output_dir, 'xgboost_demand_forecasting.pkl'))
        if 'lstm' in self.models:
            self.models['lstm'].save(os.path.join(output_dir, 'lstm_demand_forecasting.keras'))
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, 'target_scaler', None) is not None:
            joblib.dump(self.target_scaler, os.path.join(output_dir, 'target_scaler.pkl'))
        # The feature scaler is needed to transform new inputs at inference time.
        if getattr(self, 'feature_scaler', None) is not None:
            joblib.dump(self.feature_scaler, os.path.join(output_dir, 'feature_scaler.pkl'))
        if 'ensemble' in self.metrics:
            with open(os.path.join(output_dir, 'ensemble_weights.json'), 'w') as f:
                json.dump({'weights': self.metrics['ensemble']['weights']}, f)
        print(f"Models saved to {output_dir}")

    def save_results(self, output_dir='results'):
        """
        Write the metrics to ``model_metrics.json``, a comparison table to
        ``model_comparison.csv`` and a timestamped training log to
        ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, 'model_metrics.json'), 'w') as f:
            json.dump(self.metrics, f, indent=2)
        comparison_df = pd.DataFrame([{'model': k, **v} for k, v in self.metrics.items()])
        comparison_df.to_csv(os.path.join(output_dir, 'model_comparison.csv'), index=False)
        # Take the time once so the log content and its file name agree.
        now = datetime.now()
        log = {'timestamp': now.isoformat(), 'metrics': self.metrics}
        # Format the stamp outside the f-string: reusing the same quote
        # character inside an f-string is a syntax error before Python 3.12.
        stamp = now.strftime('%Y-%m-%d_%H-%M-%S')
        with open(os.path.join(output_dir, f'training_log_{stamp}.json'), 'w') as f:
            json.dump(log, f, indent=2)
        print(f"Results saved to {output_dir}")

    def get_best_model(self):
        """
        Return the name of the model with the lowest MAE.

        Raises:
            ValueError: If no metrics have been recorded yet.
        """
        if not self.metrics:
            raise ValueError("No metrics available. Run at least one model first.")
        best_model = min(self.metrics.items(), key=lambda x: x[1]['mae'])[0]
        print(f"Best model: {best_model}")
        return best_model

    def visualize_results(self, sample_days=7):
        """
        Plot actual versus predicted (scaled) demand for the first
        ``sample_days`` days of the test set, assuming 24 rows per day.

        Prediction arrays shorter than the test set are drawn aligned to the
        end of the test set, since they are missing leading rows.
        """
        plt.figure(figsize=(15, 6))
        y_all = np.asarray(self.y_test)
        # Never plot more points than the test set actually has.
        n = min(sample_days * 24, len(y_all))
        plt.plot(range(n), y_all[:n], label='Actual', color='black')
        for model_name, y_pred in self.predictions.items():
            y_pred = np.asarray(y_pred)
            offset = max(len(y_all) - len(y_pred), 0)
            end = min(n, offset + len(y_pred))
            if end <= offset:
                continue  # this model has no predictions inside the window
            plt.plot(range(offset, end), y_pred[:end - offset], label=model_name.capitalize())
        plt.legend()
        plt.title(f'Forecasting Results for {sample_days} Days')
        plt.xlabel('Hour')
        plt.ylabel('Scaled Demand')
        plt.show()

    def run_pipeline(self):
        """
        Run the complete forecasting pipeline: load, preprocess, split, fit
        and score every model, compare, plot, then save models and results.
        """
        self.load_data()
        self.preprocess_data()
        self.split_train_test()
        self.implement_naive_forecast()
        self.implement_xgboost()
        self.implement_lstm()
        self.create_ensemble()
        self.compare_models()
        self.visualize_results()
        self.save_models()
        self.save_results()