# Grid Forecasting: Missing Values
In the previous notebooks, analysis was done only with respect to Zone 1. Here, Time Series Regression is used to perform prediction for all zones simultaneously.
> Predictions are done to fill missing values (best estimates)

> As usual, three weeks of training is used.

## Pre-processing

### Defining Dependent Variable

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

# Loading the data
load_long = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Exploratory Code\load_history_long.csv").sort_values(by = "timestamp")
load_long.head()

Unnamed: 0,zone_id,year,month,day,hour,load,timestamp
0,1,2004,1,1,00:30,16853.0,2004-01-01 00:30:00
14850,10,2004,1,1,00:30,23339.0,2004-01-01 00:30:00
16500,11,2004,1,1,00:30,90700.0,2004-01-01 00:30:00
28050,18,2004,1,1,00:30,200946.0,2004-01-01 00:30:00
9900,7,2004,1,1,00:30,136233.0,2004-01-01 00:30:00


In [2]:
# Converting to Wide Format
load_wide = load_long.pivot_table(
    index='timestamp',      # The column to use as the index
    columns='zone_id',  # The column whose unique values will become the new column names
    values='load'    # The column to use for the values in the new DataFrame.
).sort_values(by="timestamp")

# Converting string to datetime
from datetime import datetime
load_wide.index = pd.to_datetime(load_wide.index)
load_wide.index[0]

# Segregating temporal information
load_wide['year'] = load_wide.index.year
load_wide['month'] = load_wide.index.month
load_wide['day'] = load_wide.index.day
load_wide['hour'] = load_wide.index.hour

load_wide.head()

zone_id,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,year,month,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01 00:30:00,16853.0,126259.0,136233.0,484.0,6829.0,133088.0,136233.0,3124.0,75243.0,23339.0,...,65970.0,28752.0,30645.0,200946.0,82298.0,79830.0,2004,1,1,0
2004-01-01 01:30:00,16450.0,123313.0,133055.0,457.0,6596.0,129909.0,133055.0,2956.0,67368.0,22100.0,...,64600.0,27851.0,30461.0,195835.0,79827.0,77429.0,2004,1,1,1
2004-01-01 02:30:00,16517.0,119192.0,128608.0,450.0,6525.0,125717.0,128608.0,2953.0,64050.0,21376.0,...,63843.0,27631.0,30197.0,194093.0,77728.0,75558.0,2004,1,1,2
2004-01-01 03:30:00,16873.0,117507.0,126791.0,448.0,6654.0,124162.0,126791.0,2914.0,63861.0,21335.0,...,64023.0,27986.0,30264.0,194708.0,76433.0,75709.0,2004,1,1,3
2004-01-01 04:30:00,17064.0,118343.0,127692.0,444.0,6977.0,125320.0,127692.0,3221.0,75852.0,21564.0,...,65679.0,29160.0,30907.0,202458.0,78172.0,77475.0,2004,1,1,4


In [3]:
# Log transformation on load values (no-scaling)
load_wide_log = load_wide
load_wide_log[list(range(1,21,1))] = load_wide_log[list(range(1,21,1))].apply(np.log)
load_wide_log[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,year,month,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01 00:30:00,9.732284,11.746091,11.822122,6.182085,8.828934,11.798766,11.822122,8.04687,11.228478,10.057881,...,11.096955,10.266463,10.330225,12.210791,11.318102,11.287655,2004,1,1,0
2004-01-01 01:30:00,9.708081,11.722481,11.798518,6.124683,8.794219,11.774589,11.798518,7.991592,11.117925,10.003333,...,11.07597,10.234624,10.324202,12.185028,11.287617,11.257117,2004,1,1,1


### Defining Feature Matrix

In [4]:
# Loading the temperature data
temperature = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Exploratory Code\weighted_temperature.csv")
feature_matrix = temperature[["timestamp","temp_weighted"]]
feature_matrix.set_index("timestamp", inplace=True)
feature_matrix.index = pd.to_datetime(feature_matrix.index)
feature_matrix[0:3]

Unnamed: 0_level_0,temp_weighted
timestamp,Unnamed: 1_level_1
2004-01-01 00:30:00,42.338937
2004-01-01 01:30:00,41.239284
2004-01-01 02:30:00,39.591442


In [5]:
# Adding temperature knots for PLR

T_H = 55  # Heating Threshold
T_C = 65  # Cooling Threshold

# Temporarily changing alias of df
mul_df = feature_matrix.copy()

# Construct the Heating Demand Knot: HDK = max(0, T_H - Temp)
## This captures load increase when temp is below T_H.
mul_df["HDK"] = np.where(
    mul_df["temp_weighted"] < T_H,  
    T_H - mul_df["temp_weighted"],  # Value if True: The positive difference
    0                               # Value if False: Zero
)

# Construct the Cooling Demand Knot; CDK = max(0, Temp - T_C)
## This captures load increase when temp is above T_C.
mul_df["CDK"] = np.where(
    mul_df["temp_weighted"] > T_C,  
    mul_df["temp_weighted"] - T_C,  # Value if True: The positive difference
    0                               # Value if False: Zero
)

# Reverting back to original alias
feature_matrix = mul_df.copy()
feature_matrix[0:3]

Unnamed: 0_level_0,temp_weighted,HDK,CDK
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-01-01 00:30:00,42.338937,12.661063,0.0
2004-01-01 01:30:00,41.239284,13.760716,0.0
2004-01-01 02:30:00,39.591442,15.408558,0.0


In [8]:
# Segregating temporal information
feature_matrix['year'] = feature_matrix.index.year
feature_matrix['month'] = feature_matrix.index.month
feature_matrix['day'] = feature_matrix.index.day
feature_matrix['hour'] = feature_matrix.index.hour
feature_matrix.sort_index(inplace=True)
feature_matrix.tail()

Unnamed: 0_level_0,temp_weighted,HDK,CDK,year,month,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-06-30 01:30:00,71.08037,0.0,6.08037,2008,6,30,1
2008-06-30 02:30:00,70.803812,0.0,5.803812,2008,6,30,2
2008-06-30 03:30:00,70.248102,0.0,5.248102,2008,6,30,3
2008-06-30 04:30:00,70.065118,0.0,5.065118,2008,6,30,4
2008-06-30 05:30:00,69.793713,0.0,4.793713,2008,6,30,5


### Filling Missing Values: Forecasting Only

#### Period 1: 6 Mar 2005 - 12 Mar 2005

In [31]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-03-06 00:30:00") & (feature_matrix.index > "2005-02-15 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-03-06 00:30:00") & (load_wide_log.index > "2005-02-15 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [32]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [33]:
# Defining feature space dedicated to period 1
period_1_X = feature_matrix.loc[(feature_matrix.index >= "2005-03-06 00:30:00") & (feature_matrix.index <= "2005-03-12 23:30:00"),:]

dp = DeterministicProcess(
    index=period_1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_1_X = period_1_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_1 = period_1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [34]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_1.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_1)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [35]:
# Undoing log transformation for original predictions
predictions_p1_unlogged = np.exp(predictions_set_1)
predictions_p1_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-03-06 00:30:00,19573.855749,173827.01079,187559.749964,558.678543,8517.48799,182421.636868,187559.749964,4057.558755,81995.192432,27316.708197,113176.573349,144932.416457,22506.783567,24937.175856,69802.09755,37023.97488,35551.080736,246525.568097,90432.935359,92277.429913
2005-03-06 01:30:00,18964.220939,168570.071205,181887.489076,500.07994,8089.64787,176739.728735,181887.489076,3887.782054,83098.672537,26154.435511,108078.010205,136906.907005,21475.841658,23839.287271,67425.680199,35567.199097,34460.328671,236786.482422,86921.514296,88997.134691
2005-03-06 02:30:00,19318.381886,168939.798844,182286.408903,480.600936,8191.173487,177213.239236,182286.408903,3899.400975,81782.003228,26087.960771,107971.726528,135931.229606,21379.92863,23930.93791,67568.221164,35931.099405,34707.98562,238773.497677,87804.932887,89053.731752
2005-03-06 03:30:00,19916.942625,172069.56378,185663.420763,481.40614,8480.443513,180633.538262,185663.420763,3982.265882,78541.457355,26432.971009,109443.173232,137312.071593,21645.601617,24073.089271,68149.707505,36314.552506,35135.282439,242798.803408,89222.798536,90684.302635
2005-03-06 04:30:00,21710.524934,181633.855009,195983.289907,528.866629,9385.545676,191088.591814,195983.289907,4269.915518,73634.919279,28093.432032,117155.014347,147170.550447,23077.391466,25800.599654,72117.7146,39163.255749,37306.130545,261915.979306,96423.405184,96119.156662


In [36]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p1_unlogged = np.exp(
    train_p1_y[-len(period_1_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p1_unlogged.index = period_1_X.index
last_cycle_naive_p1_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-03-06 00:30:00,18954.0,162388.0,175217.0,531.0,8530.0,170918.0,175217.0,4054.0,80850.0,25031.0,108347.0,142581.0,22016.0,25947.0,70602.0,36004.0,34078.0,245140.0,91067.0,85633.0
2005-03-06 01:30:00,19614.0,161693.0,174468.0,502.0,8437.0,170131.0,174468.0,4009.0,80241.0,24794.0,106572.0,138708.0,22100.0,25967.0,70621.0,36449.0,33875.0,247743.0,91186.0,85038.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 2: 20 Jun 2005 - 26 Jun 2005

In [39]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-06-20 00:30:00") & (feature_matrix.index > "2005-05-30 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-06-20 00:30:00") & (load_wide_log.index > "2005-05-30 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [40]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [41]:
# Defining feature space dedicated to period 1
period_2_X = feature_matrix.loc[(feature_matrix.index >= "2005-06-20 00:30:00") & (feature_matrix.index <= "2005-06-26 23:30:00"),:]

dp = DeterministicProcess(
    index=period_2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_2_X = period_2_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_2 = period_2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [42]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_2.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_2)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [43]:
# Undoing log transformation for original predictions
predictions_p2_unlogged = np.exp(predictions_set_1)
predictions_p2_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-06-20 00:30:00,12422.714966,140382.715772,151473.310983,357.222535,4945.417807,145348.999337,151473.310983,2416.164588,86992.626563,19018.145275,69354.149321,81468.949548,13929.737497,12394.921771,46497.954793,17775.17512,24279.860814,142577.222424,51756.157101,66823.855883
2005-06-20 01:30:00,11337.223784,132719.691548,143204.858123,325.129971,4417.942316,137085.049897,143204.858123,2219.052599,86716.004086,17831.582845,64196.796044,74244.259588,12571.430155,11036.884146,42973.149895,16086.22762,22653.721777,128758.393282,46574.359219,61881.063598
2005-06-20 02:30:00,10660.441104,128014.212094,138127.608122,308.247745,4108.390784,132020.22899,138127.608122,2112.014865,85209.589225,17107.758176,60982.944945,69609.128952,11739.035031,10156.31522,40540.925917,15008.601056,21575.649798,120012.000835,43320.733922,58924.191555
2005-06-20 03:30:00,10415.413434,126812.402679,136830.835409,307.462919,4019.238077,130710.086273,136830.835409,2098.515094,83194.839739,16921.278132,59985.911272,67861.42682,11523.772188,9806.433941,39529.613636,14619.044229,21152.868704,116905.647685,42214.014859,58318.943299
2005-06-20 04:30:00,10567.806318,129053.292173,139248.754186,322.157153,4127.04999,133066.213114,139248.754186,2171.070467,81517.26869,17261.272819,61121.79252,68852.926865,11928.150997,9957.389842,39980.089474,14884.337173,21370.131973,119178.987398,43133.385831,60042.873079


In [44]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p2_unlogged = np.exp(
    train_p1_y[-len(period_2_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p2_unlogged.index = period_2_X.index
last_cycle_naive_p2_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-06-20 00:30:00,15416.0,149278.0,161072.0,337.0,6698.0,155977.0,161072.0,2859.0,82992.0,21079.0,92005.0,117462.0,15390.0,20707.0,56611.0,26251.0,31975.0,185626.0,71916.0,71910.0
2005-06-20 01:30:00,14390.0,143795.0,155155.0,322.0,5938.0,149733.0,155155.0,2622.0,82551.0,20259.0,84773.0,106672.0,14315.0,18792.0,52290.0,24402.0,30406.0,170506.0,65670.0,67696.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 3: 10 Sep 2005 - 16 Sep 2005

In [45]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-09-10 00:30:00") & (feature_matrix.index > "2005-08-18 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-09-10 00:30:00") & (load_wide_log.index > "2005-08-18 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [46]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [47]:
# Defining feature space dedicated to period 1
period_3_X = feature_matrix.loc[(feature_matrix.index >= "2005-09-10 00:30:00") & (feature_matrix.index <= "2005-09-16 23:30:00"),:]

dp = DeterministicProcess(
    index=period_3_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()

period_3_X = period_3_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_3 = period_3_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [48]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_3.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_3)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [49]:
# Undoing log transformation for original predictions
predictions_p3_unlogged = np.exp(predictions_set_1)
predictions_p3_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-09-10 00:30:00,15041.288684,138340.878242,149270.072153,386.418057,5352.98032,143703.653261,149270.072153,2506.866996,64499.52894,24183.432952,82682.626977,102058.44782,16039.151434,15812.534424,57102.305396,22212.376643,29145.320261,157636.637267,62578.280386,70227.341341
2005-09-10 01:30:00,13621.243312,130338.587625,140635.607229,352.714786,4784.098608,135087.63465,140635.607229,2297.076153,64787.184824,22567.508227,75710.367741,91901.111766,14522.502408,14071.086404,52497.903458,20026.440804,26980.664468,142496.230676,56181.584438,64914.143656
2005-09-10 02:30:00,12766.049376,125530.634066,135447.838819,335.100759,4478.164155,129947.555675,135447.838819,2191.218655,64318.034412,21529.652738,71354.705938,85424.387346,13680.831639,12924.497969,49553.351771,18631.032169,25564.133059,133421.029066,52311.445607,62050.866611
2005-09-10 03:30:00,12469.207738,124289.551103,134108.72844,334.382787,4422.885272,128649.196737,134108.72844,2186.565917,63185.933272,21172.096388,69855.451007,82810.747229,13558.583188,12415.106557,48455.996837,18091.13727,24979.524776,130640.350162,51029.540799,61794.79067
2005-09-10 04:30:00,12669.086659,126446.825405,136436.447963,349.639532,4583.705051,130982.867788,136436.447963,2270.60084,61463.183899,21489.493518,71055.887649,83768.241025,14125.011134,12494.679371,49115.086018,18349.760334,25174.608923,133600.489548,52081.939795,63965.581125


In [50]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p3_unlogged = np.exp(
    train_p1_y[-len(period_3_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p3_unlogged.index = period_3_X.index
last_cycle_naive_p3_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-09-10 00:30:00,14051.0,142723.0,153999.0,401.0,6045.0,148768.0,153999.0,2550.0,95193.0,27228.0,96888.0,121831.0,14637.0,16886.0,57439.0,27612.0,26608.0,172283.0,62838.0,65611.0
2005-09-10 01:30:00,12366.0,133690.0,144251.0,342.0,5152.0,138841.0,144251.0,2252.0,94983.0,25381.0,86992.0,106760.0,13390.0,14886.0,53102.0,24180.0,24548.0,151473.0,55876.0,60123.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 4: 25 Dec 2005 - 31 Dec 2005

In [51]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-12-25 00:30:00") & (feature_matrix.index > "2005-12-02 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-12-25 00:30:00") & (load_wide_log.index > "2005-12-02 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [52]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [53]:
# Defining feature space dedicated to period 1
period_4_X = feature_matrix.loc[(feature_matrix.index >= "2005-12-25 00:30:00") & (feature_matrix.index <= "2005-12-31 23:30:00"),:]

dp = DeterministicProcess(
    index=period_4_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_4_X = period_4_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_4 = period_4_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [54]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_4.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_4)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [55]:
# Undoing log transformation for original predictions
predictions_p4_unlogged = np.exp(predictions_set_1)
predictions_p4_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-12-25 00:30:00,20138.966002,184804.153288,199404.028297,559.359454,9263.6717,194050.211212,199404.028297,4266.00427,66000.225901,25753.76008,107062.723288,141251.541882,21760.960747,23804.050049,65140.938784,34173.473607,33894.715947,239020.906958,85593.297937,91750.420591
2005-12-25 01:30:00,19511.925293,178542.779031,192647.990586,533.471214,8875.33289,187399.429627,192647.990586,4104.3945,68193.576434,24438.653141,101689.57602,132818.22796,20864.857765,22851.115506,63131.238699,33089.560257,32775.810883,232168.646697,82632.689625,88449.560705
2005-12-25 02:30:00,19566.065813,177019.198411,191004.041818,528.018111,8903.423482,185915.318319,191004.041818,4086.714852,68341.267412,23955.024911,100192.526134,130054.373382,20449.914732,22457.917869,62279.911303,33046.068572,32555.784819,232156.378859,81924.48941,87431.239494
2005-12-25 03:30:00,20187.125395,179653.710484,193846.681928,542.024944,9300.026499,188963.585084,193846.681928,4198.774556,66387.2474,24189.447259,102101.804269,132321.363918,20528.895455,22558.556805,62550.418771,33835.893978,33105.08056,237856.417638,83230.249589,88472.671326
2005-12-25 04:30:00,21130.05029,185098.663257,199721.791035,571.424147,9949.127874,195068.848216,199721.791035,4402.514103,62911.303475,24908.731799,106363.062381,138167.448361,21017.275201,22961.828257,63608.368102,35040.922883,34105.987496,246718.407664,85813.727445,90939.262115


In [56]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p4_unlogged = np.exp(
    train_p1_y[-len(period_4_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p4_unlogged.index = period_4_X.index
last_cycle_naive_p4_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-12-25 00:30:00,21429.0,179903.0,194116.0,537.0,10044.0,189947.0,194116.0,4456.0,80115.0,24745.0,109326.0,146826.0,23890.0,26423.0,74062.0,37538.0,35435.0,263232.0,96292.0,94372.0
2005-12-25 01:30:00,21124.0,174678.0,188478.0,524.0,9628.0,184306.0,188478.0,4274.0,79884.0,23819.0,105173.0,140610.0,23035.0,25054.0,71411.0,36092.0,33705.0,256316.0,92807.0,91423.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 5: 13 Feb 2006 - 19 Feb 2006

In [57]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-02-13 00:30:00") & (feature_matrix.index > "2006-01-20 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-02-13 00:30:00") & (load_wide_log.index > "2006-01-20 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [58]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [59]:
# Defining feature space dedicated to period 1
period_5_X = feature_matrix.loc[(feature_matrix.index >= "2006-02-13 00:30:00") & (feature_matrix.index <= "2006-02-19 23:30:00"),:]

dp = DeterministicProcess(
    index=period_5_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_5_X = period_5_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_5 = period_5_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [60]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_5.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_5)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [61]:
# Undoing log transformation for original predictions
predictions_p5_unlogged = np.exp(predictions_set_1)
predictions_p5_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-02-13 00:30:00,22171.131264,192679.976348,207902.14338,543.159375,9583.413177,202257.028788,207902.14338,4296.306443,60368.169583,29426.85346,115800.381409,150872.752606,24236.564139,29222.764918,75503.801857,39310.893684,37680.980129,274623.626369,102245.518768,99719.595068
2006-02-13 01:30:00,21913.093829,188292.162672,203167.658893,515.194116,9312.70797,197604.24394,203167.658893,4175.256608,59976.5449,28487.409712,112440.205845,145179.838253,23775.340859,29132.7532,75243.369821,39269.521153,37434.175262,273453.186154,102090.085547,98170.233437
2006-02-13 02:30:00,22275.4163,187888.051644,202731.597435,506.934833,9434.797347,197321.023557,202731.597435,4184.114623,58483.034292,28267.067857,112327.22832,144101.269091,23735.551799,29399.785597,75798.980404,39904.727424,37889.909101,278089.521786,103760.907065,98633.897147
2006-02-13 03:30:00,23509.801465,193035.874053,208286.089668,522.120888,10049.488761,203065.417325,208286.089668,4353.415153,55879.826792,29097.796303,116673.254596,149228.399236,24365.203728,30488.638762,77993.312186,41780.387473,39451.677731,291971.181733,108634.521192,101949.176734
2006-02-13 04:30:00,25670.197647,204143.888997,220271.653553,560.932593,11175.447723,215247.187422,220271.653553,4682.752106,52592.52551,31097.724431,125832.158767,160960.737602,25755.091311,32612.31302,82138.385936,45122.848942,42229.250006,316228.565751,117208.727691,108294.220481


In [62]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p5_unlogged = np.exp(
    train_p1_y[-len(period_5_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p5_unlogged.index = period_5_X.index
last_cycle_naive_p5_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-02-13 00:30:00,22108.0,173382.0,187079.0,510.0,8332.0,181713.0,187079.0,3964.0,64848.0,23557.0,93146.0,127480.0,23316.0,24632.0,70279.0,33329.0,36974.0,236617.0,92291.0,90099.0
2006-02-13 01:30:00,21988.0,173426.0,187126.0,502.0,8288.0,181713.0,187126.0,3798.0,65478.0,23188.0,91117.0,124229.0,22799.0,24574.0,70199.0,33797.0,36221.0,235005.0,91260.0,88589.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 6: 25 May 2006 - 31 May 2006

In [63]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-05-25 00:30:00") & (feature_matrix.index > "2006-05-02 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-05-25 00:30:00") & (load_wide_log.index > "2006-05-02 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [64]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [65]:
# Defining feature space dedicated to period 1
period_6_X = feature_matrix.loc[(feature_matrix.index >= "2006-05-25 00:30:00") & (feature_matrix.index <= "2006-05-31 23:30:00"),:]

dp = DeterministicProcess(
    index=period_6_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_6_X = period_6_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_6 = period_6_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [66]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_6.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_6)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [67]:
# Undoing log transformation for original predictions
predictions_p6_unlogged = np.exp(predictions_set_1)
predictions_p6_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-05-25 00:30:00,11805.002985,124534.657463,134373.168495,354.847753,4409.370495,129002.050724,134373.168495,2329.850342,70567.011329,17688.73735,64774.217457,79652.440789,13273.825433,11964.652665,43446.776458,16396.061853,23381.384737,127469.83076,48062.47762,61980.80476
2006-05-25 01:30:00,10958.095121,118574.996052,127942.655316,329.446643,3974.839012,122573.224621,127942.655316,2205.811126,68823.495178,16549.347502,59784.285499,72376.578697,12423.613382,10728.151219,40763.417968,14961.008854,21873.168967,116504.925519,43990.934609,58519.098365
2006-05-25 02:30:00,10577.020563,115557.00483,124686.204381,319.277746,3797.753893,119358.883362,124686.204381,2175.27313,67268.54395,15955.470182,57378.514055,68591.865569,12118.287751,10170.103811,39653.866064,14308.484988,21073.76808,111537.465579,42317.490918,57283.867566
2006-05-25 03:30:00,10733.471205,116293.642912,125481.008593,325.999745,3899.43738,120203.126593,125481.008593,2252.826477,65583.723576,16012.794504,57802.940803,68641.289348,12479.370163,10307.079414,40312.183691,14520.487112,21107.397085,113392.903612,43378.186722,58680.857441
2006-05-25 04:30:00,11359.595875,120510.985598,130031.504316,348.156896,4250.125006,124791.248875,130031.504316,2421.680213,63998.01197,16667.858428,60920.785335,72232.767031,13441.888391,11055.808465,42575.746782,15518.021567,21892.013046,121381.83142,46814.829054,62397.753081


In [68]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p6_unlogged = np.exp(
    train_p1_y[-len(period_6_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p6_unlogged.index = period_6_X.index
last_cycle_naive_p6_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-05-25 00:30:00,11268.0,128006.0,138119.0,346.0,4030.0,132037.0,138119.0,2319.0,80703.0,18125.0,62918.0,73969.0,12821.0,10846.0,41124.0,14890.0,23115.0,118544.0,44021.0,62178.0
2006-05-25 01:30:00,10635.0,123694.0,133467.0,331.0,3723.0,127418.0,133467.0,2224.0,69027.0,16771.0,58922.0,67396.0,12212.0,9960.0,38955.0,14050.0,21825.0,110073.0,41008.0,59775.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 7: 02 Aug 2006 - 08 Aug 2006

In [69]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-08-02 00:30:00") & (feature_matrix.index > "2006-07-11 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-08-02 00:30:00") & (load_wide_log.index > "2006-07-11 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [70]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [71]:
# Defining feature space dedicated to period 1
period_7_X = feature_matrix.loc[(feature_matrix.index >= "2006-08-02 00:30:00") & (feature_matrix.index <= "2006-08-08 23:30:00"),:]

dp = DeterministicProcess(
    index=period_7_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_7_X = period_7_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_7 = period_7_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [72]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_7.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_7)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [73]:
# Undoing log transformation for original predictions
predictions_p7_unlogged = np.exp(predictions_set_1)
predictions_p7_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-08-02 00:30:00,20319.593983,180514.378632,194775.436101,463.744855,8350.41959,188900.886247,194775.436101,3562.912697,81948.817099,32938.745523,129717.451038,170966.106116,17272.589488,23689.561977,63594.837643,32973.559192,37137.055038,225017.590524,83357.872449,85841.814971
2006-08-02 01:30:00,18006.804008,167738.157101,180989.871492,418.084936,7261.163845,175011.287809,180989.871492,3200.566344,77342.62197,30225.244552,116443.743868,150025.295578,15669.635155,20919.525156,57832.622757,29487.286446,34219.304183,200284.464718,74260.325351,78742.1096
2006-08-02 02:30:00,16379.939675,158803.297586,171349.140867,388.424809,6523.847801,165311.90846,171349.140867,2967.959022,73270.765497,28321.796351,107105.499645,135131.989658,14631.565468,18906.762761,53579.056189,27009.022565,32046.545074,182936.534727,67834.050199,74052.046775
2006-08-02 03:30:00,15476.42729,154339.998594,166533.22933,376.523053,6138.990675,160448.559057,166533.22933,2867.699956,71293.705656,27380.241387,102147.188367,126743.60495,14213.26765,17733.724273,51150.038133,25630.930775,30770.589242,173626.853273,64329.69619,72054.077529
2006-08-02 04:30:00,14880.554363,151588.353084,163564.181451,376.382373,5925.887263,157480.620926,163564.181451,2825.321914,73306.936812,26698.677863,97973.40498,119820.917262,14177.051048,16911.748063,49491.479851,24505.023597,29731.952382,167640.085482,61947.271981,71360.430845


In [74]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p7_unlogged = np.exp(
    train_p1_y[-len(period_7_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p7_unlogged.index = period_7_X.index
last_cycle_naive_p7_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-08-02 00:30:00,16349.0,161874.0,174662.0,402.0,6234.0,168108.0,174662.0,2730.0,93450.0,26556.0,99613.0,123715.0,15541.0,17521.0,55921.0,24984.0,31938.0,177341.0,66148.0,77538.0
2006-08-02 01:30:00,14609.0,154462.0,166665.0,376.0,5615.0,160077.0,166665.0,2502.0,93660.0,24888.0,92294.0,110378.0,14234.0,15618.0,51720.0,22532.0,29547.0,161397.0,60256.0,72783.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 8: 22 Nov 2006 - 28 Nov 2006

In [75]:
# Creating training data 1 (3 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-11-22 00:30:00") & (feature_matrix.index >= "2006-11-01 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-11-22 00:30:00") & (load_wide_log.index >= "2006-11-01 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')

In [76]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [77]:
# Defining feature space dedicated to period 1
period_8_X = feature_matrix.loc[(feature_matrix.index >= "2006-11-22 00:30:00") & (feature_matrix.index <= "2006-11-28 23:30:00"),:]

dp = DeterministicProcess(
    index=period_8_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_8_X = period_8_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_8 = period_8_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [78]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_8.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_8)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!


In [79]:
# Undoing log transformation for original predictions
predictions_p8_unlogged = np.exp(predictions_set_1)
predictions_p8_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-11-22 00:30:00,16896.128265,155776.95374,168083.656031,498.724159,6935.342175,162848.870939,168083.656031,3345.483572,66759.677723,22291.010241,93958.274947,111037.353126,18861.08172,20565.907609,59681.36518,26621.16061,32029.44332,196938.167536,75357.293605,81694.572479
2006-11-22 01:30:00,15484.889703,145895.049336,157421.07876,451.513589,6132.115232,152154.264993,157421.07876,3068.124455,71700.174363,20433.554641,85160.025819,98865.449686,17492.799924,19082.108158,56496.156209,24495.816712,30072.402807,180885.439985,69670.463389,76514.151611
2006-11-22 02:30:00,14638.248397,139434.12618,150449.734028,420.675567,5657.513248,145201.714032,150449.734028,2903.5904,73156.857076,19209.649517,79517.713181,90873.81682,16584.630733,18116.979645,54239.926708,23082.924927,28752.772836,170489.647907,66089.001722,73228.912401
2006-11-22 03:30:00,14390.574593,136958.975493,147779.034417,408.214659,5505.195638,142564.755682,147779.034417,2855.236059,70659.195359,18692.194506,77254.506532,87214.559424,16199.771426,17641.848111,53057.578418,22382.730602,28155.204772,166027.495028,64643.924844,72083.113173
2006-11-22 04:30:00,14882.346115,139150.906313,150144.118302,416.496936,5720.025424,144977.708121,150144.118302,2940.403765,65103.997685,18996.976079,78963.579925,88604.219841,16468.31091,17846.640574,53465.160725,22677.988864,28519.468769,169305.035347,66017.478015,73444.098189


In [80]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p8_unlogged = np.exp(
    train_p1_y[-len(period_8_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p8_unlogged.index = period_8_X.index
last_cycle_naive_p8_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-11-22 00:30:00,13684.0,135868.0,146602.0,475.0,4695.0,140563.0,146602.0,2757.0,35259.0,17730.0,71808.0,80241.0,16972.0,12639.0,49360.0,17784.0,25334.0,139865.0,58657.0,73176.0
2006-11-22 01:30:00,13530.0,133060.0,143573.0,461.0,4566.0,137626.0,143573.0,2698.0,55671.0,17137.0,68911.0,76321.0,16737.0,12242.0,48467.0,17557.0,24838.0,138431.0,58346.0,71566.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

### Performance Comparison
Performance is compared with actual load values as well as competition's benchmarked values for the missing 8 weeks. RMSE is used as a metric.

#### Reading Actual Values

In [81]:
# Reading actual load values
actual_load = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Dataset\GEFCom2012\GEFCOM2012_Data\Load\Load_solution.csv")
actual_load.drop(["id","weight"], axis=1, inplace=True)
actual_load[0:2]

Unnamed: 0,zone_id,year,month,day,h1,h2,h3,h4,h5,h6,...,h15,h16,h17,h18,h19,h20,h21,h22,h23,h24
0,1,2005,3,6,19964,19544,19390,19442,19755,20008,...,14535,13955,13712,14372,16392,18253,18355,17157,16089,15146
1,2,2005,3,6,162096,160890,160924,158962,163197,165197,...,151681,148210,149373,153728,171318,175893,175858,166342,155411,145988


In [82]:
# Converting the data into long-format

actual_load_long = actual_load.melt(
                id_vars=["zone_id","year","month","day"],
                value_vars=[f"h{i}" for i in range(1, 25)],
                var_name="hour",
                value_name="load"
                        )


actual_load_long.head()

Unnamed: 0,zone_id,year,month,day,hour,load
0,1,2005,3,6,h1,19964
1,2,2005,3,6,h1,162096
2,3,2005,3,6,h1,174901
3,4,2005,3,6,h1,528
4,5,2005,3,6,h1,9061


In [83]:
# Replacing hour values with interval mid-point

# Create a mapping from 'h1' to 'h24' → '00:30' to '23:30'
hour_map = {f"h{i}": f"{str(i-1).zfill(2)}:30" for i in range(1, 25)}

# Replace the values using .map()
actual_load_long["hour"] = actual_load_long["hour"].map(hour_map)

# Creating timestamps using existing information
actual_load_long["timestamp"] = pd.to_datetime(
    actual_load_long["year"].astype(str) + "-" +
    actual_load_long["month"].astype(str).str.zfill(2) + "-" +
    actual_load_long["day"].astype(str).str.zfill(2) + " " +
    actual_load_long["hour"]
)

actual_load_long.head()

Unnamed: 0,zone_id,year,month,day,hour,load,timestamp
0,1,2005,3,6,00:30,19964,2005-03-06 00:30:00
1,2,2005,3,6,00:30,162096,2005-03-06 00:30:00
2,3,2005,3,6,00:30,174901,2005-03-06 00:30:00
3,4,2005,3,6,00:30,528,2005-03-06 00:30:00
4,5,2005,3,6,00:30,9061,2005-03-06 00:30:00


In [84]:
# Pivoting to record zone by column
actual_load_long = pd.pivot_table(actual_load_long, index="timestamp", columns="zone_id", values="load")
actual_load_long[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-03-06 00:30:00,19964.0,162096.0,174901.0,528.0,9061.0,171157.0,174901.0,4091.0,61215.0,26459.0,...,140417.0,21302.0,27740.0,74218.0,40411.0,36845.0,268789.0,102241.0,89800.0,1719688.0
2005-03-06 01:30:00,19544.0,160890.0,173600.0,499.0,8697.0,169587.0,173600.0,3971.0,61131.0,25979.0,...,137418.0,20466.0,27713.0,73397.0,40408.0,36745.0,267273.0,101374.0,88325.0,1703132.0


#### Comparing Time Series Regression With Actual Load Performance

##### Period 1

In [85]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error
r2_scores_p1 = []
mape_scores_p1 = []
rmse_scores_p1 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p1_unlogged.index,i]),np.array(predictions_p1_unlogged.loc[:,predictions_p1_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p1_unlogged.index,i]),np.array(predictions_p1_unlogged.loc[:,predictions_p1_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p1_unlogged.index,i]),np.array(predictions_p1_unlogged.loc[:,predictions_p1_unlogged.columns[i-1]])))

    # adding scores to score list
    r2_scores_p1.append(r2)
    mape_scores_p1.append(mape)
    rmse_scores_p1.append(rmse)

print(f"Average R2 score for all zones, Period 1: {np.mean(r2_scores_p1)}")
print(f"Average MAPE score for all zones, Period 1: {np.mean(mape_scores_p1)}")
print(f"Average RMSE score for all zones, Period 1: {np.mean(rmse_scores_p1)}")

Average R2 score for all zones, Period 1: 0.7303278234836291
Average MAPE score for all zones, Period 1: 0.06680645708220345
Average RMSE score for all zones, Period 1: 6567.481090164481


In [86]:
# Zone 4 and 9 are showing some issues
r2_scores_p1

[0.8590556929495683,
 0.746826702587424,
 0.7468236502915753,
 0.17180775349141497,
 0.8542322314972606,
 0.768215904852062,
 0.7468236502915753,
 0.8243671421584903,
 -0.3592118568695588,
 0.7776760818525417,
 0.8639017828453819,
 0.851331167369741,
 0.7677845416105107,
 0.8152989199760524,
 0.8789442771600395,
 0.8753016271941388,
 0.8628401837590051,
 0.8967960259417455,
 0.8401273671546494,
 0.8176136235589654]

##### Period 2

In [87]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p2 = []
mape_scores_p2 = []
rmse_scores_p2 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p2_unlogged.index,i]),np.array(predictions_p2_unlogged.loc[:,predictions_p2_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p2_unlogged.index,i]),np.array(predictions_p2_unlogged.loc[:,predictions_p2_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p2_unlogged.index,i]),np.array(predictions_p2_unlogged.loc[:,predictions_p2_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p2.append(r2)
    mape_scores_p2.append(mape)
    rmse_scores_p2.append(rmse)

print(f"Average R2 score for all zones, Period 2: {np.mean(r2_scores_p2)}")
print(f"Average MAPE score for all zones, Period 2: {np.mean(mape_scores_p2)}")
print(f"Average RMSE score for all zones, Period 2: {np.mean(rmse_scores_p2)}")

Average R2 score for all zones, Period 2: 0.7160385872719668
Average MAPE score for all zones, Period 2: 0.08379027790899499
Average RMSE score for all zones, Period 2: 7605.118821888791


##### Period 3

In [88]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p3 = []
mape_scores_p3 = []
rmse_scores_p3 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p3_unlogged.index,i]),np.array(predictions_p3_unlogged.loc[:,predictions_p3_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p3_unlogged.index,i]),np.array(predictions_p3_unlogged.loc[:,predictions_p3_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p3_unlogged.index,i]),np.array(predictions_p3_unlogged.loc[:,predictions_p3_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p3.append(r2)
    mape_scores_p3.append(mape)
    rmse_scores_p3.append(rmse)

print(f"Average R2 score for all zones, Period 3: {np.mean(r2_scores_p3)}")
print(f"Average MAPE score for all zones, Period 3: {np.mean(mape_scores_p3)}")
print(f"Average RMSE score for all zones, Period 3: {np.mean(rmse_scores_p3)}")

Average R2 score for all zones, Period 3: 0.673254988524538
Average MAPE score for all zones, Period 3: 0.11071325061221231
Average RMSE score for all zones, Period 3: 8478.291241607498


##### Period 4

In [90]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p4 = []
mape_scores_p4 = []
rmse_scores_p4 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p4_unlogged.index,i]),np.array(predictions_p4_unlogged.loc[:,predictions_p4_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p4_unlogged.index,i]),np.array(predictions_p4_unlogged.loc[:,predictions_p4_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p4_unlogged.index,i]),np.array(predictions_p4_unlogged.loc[:,predictions_p4_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p4.append(r2)
    mape_scores_p4.append(mape)
    rmse_scores_p4.append(rmse)

print(f"Average R2 score for all zones, Period 4: {np.mean(r2_scores_p4)}")
print(f"Average MAPE score for all zones, Period 4: {np.mean(mape_scores_p4)}")
print(f"Average RMSE score for all zones, Period 4: {np.mean(rmse_scores_p4)}")

Average R2 score for all zones, Period 4: 0.26684885392847363
Average MAPE score for all zones, Period 4: 0.11115205662231745
Average RMSE score for all zones, Period 4: 9710.357003093688


##### Period 5

In [91]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p5 = []
mape_scores_p5 = []
rmse_scores_p5 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p5_unlogged.index,i]),np.array(predictions_p5_unlogged.loc[:,predictions_p5_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p5_unlogged.index,i]),np.array(predictions_p5_unlogged.loc[:,predictions_p5_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p5_unlogged.index,i]),np.array(predictions_p5_unlogged.loc[:,predictions_p5_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p5.append(r2)
    mape_scores_p5.append(mape)
    rmse_scores_p5.append(rmse)

print(f"Average R2 score for all zones, Period 5: {np.mean(r2_scores_p5)}")
print(f"Average MAPE score for all zones, Period 5: {np.mean(mape_scores_p5)}")
print(f"Average RMSE score for all zones, Period 5: {np.mean(rmse_scores_p5)}")

Average R2 score for all zones, Period 5: 0.7401441440246251
Average MAPE score for all zones, Period 5: 0.10551287623425287
Average RMSE score for all zones, Period 5: 8220.195829141936


##### Period 6

In [92]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p6 = []
mape_scores_p6 = []
rmse_scores_p6 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p6_unlogged.index,i]),np.array(predictions_p6_unlogged.loc[:,predictions_p6_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p6_unlogged.index,i]),np.array(predictions_p6_unlogged.loc[:,predictions_p6_unlogged.columns[i-1]]))
    mape = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p6_unlogged.index,i]),np.array(predictions_p6_unlogged.loc[:,predictions_p6_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p6.append(r2)
    mape_scores_p6.append(mape)
    rmse_scores_p6.append(rmse)

print(f"Average R2 score for all zones, Period 6: {np.mean(r2_scores_p6)}")
print(f"Average MAPE score for all zones, Period 6: {np.mean(mape_scores_p6)}")
print(f"Average RMSE score for all zones, Period 6: {np.mean(rmse_scores_p6)}")

Average R2 score for all zones, Period 6: 0.04822135771382995
Average MAPE score for all zones, Period 6: 19627.04868099009
Average RMSE score for all zones, Period 6: 8896.0860690228


##### Period 7

In [93]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p7 = []
mape_scores_p7 = []
rmse_scores_p7 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p7_unlogged.index,i]),np.array(predictions_p7_unlogged.loc[:,predictions_p7_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p7_unlogged.index,i]),np.array(predictions_p7_unlogged.loc[:,predictions_p7_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p7_unlogged.index,i]),np.array(predictions_p7_unlogged.loc[:,predictions_p7_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p7.append(r2)
    mape_scores_p7.append(mape)
    rmse_scores_p7.append(rmse)

print(f"Average R2 score for all zones, Period 7: {np.mean(r2_scores_p7)}")
print(f"Average MAPE score for all zones, Period 7: {np.mean(mape_scores_p7)}")
print(f"Average RMSE score for all zones, Period 7: {np.mean(rmse_scores_p7)}")

Average R2 score for all zones, Period 7: 0.8207859469427976
Average MAPE score for all zones, Period 7: 0.10033398804928809
Average RMSE score for all zones, Period 7: 10234.015057896868


##### Period 8

In [94]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p8 = []
mape_scores_p8 = []
rmse_scores_p8 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p8_unlogged.index,i]),np.array(predictions_p8_unlogged.loc[:,predictions_p8_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p8_unlogged.index,i]),np.array(predictions_p8_unlogged.loc[:,predictions_p8_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p8_unlogged.index,i]),np.array(predictions_p8_unlogged.loc[:,predictions_p8_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p8.append(r2)
    mape_scores_p8.append(mape)
    rmse_scores_p8.append(rmse)

print(f"Average R2 score for all zones, Period 8: {np.mean(r2_scores_p8)}")
print(f"Average MAPE score for all zones, Period 8: {np.mean(mape_scores_p8)}")
print(f"Average RMSE score for all zones, Period 8: {np.mean(rmse_scores_p8)}")

Average R2 score for all zones, Period 8: 0.1336288962910366
Average MAPE score for all zones, Period 8: 0.11859559347048769
Average RMSE score for all zones, Period 8: 10834.16533935219


###### Overall Performance on Average, across periods

In [95]:
# Metric averaged across 8 periods, complete grid performance

avg_r2_list = []
for i in [r2_scores_p1,r2_scores_p2,r2_scores_p3,r2_scores_p4,r2_scores_p5,r2_scores_p6,r2_scores_p7,r2_scores_p8]:
    avg_r = np.mean(i)        # averaged across all 20 zones, for a given period
    avg_r2_list.append(avg_r)

print(f"Average R2 score across all 8 periods, complete grid: {np.mean(avg_r2_list)}")

avg_mape_list = []
for j in [mape_scores_p1,mape_scores_p2,mape_scores_p3,mape_scores_p4,mape_scores_p5,mape_scores_p6,mape_scores_p7,mape_scores_p8]:
    avg_m = np.mean(j)        # averaged across all 20 zones, for a given period
    avg_mape_list.append(avg_m)

print(f"Average MAPE score across all 8 periods, complete grid: {np.mean(avg_mape_list)}")

avg_rmse_list = []
for k in [rmse_scores_p1,rmse_scores_p2,rmse_scores_p3,rmse_scores_p4,rmse_scores_p5,rmse_scores_p6,rmse_scores_p7,rmse_scores_p8]:
    avg_rm = np.mean(k)        # averaged across all 20 zones, for a given period
    avg_rmse_list.append(avg_rm)

print(f"Average RMSE score across all 8 periods, complete grid: {np.mean(avg_rmse_list)}")

Average R2 score across all 8 periods, complete grid: 0.5161563247726122
Average MAPE score across all 8 periods, complete grid: 2453.4681981862586
Average RMSE score across all 8 periods, complete grid: 8818.213806521031


#### Comparing Naive Prediction With Actual Load Values

##### Period 1

In [96]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p1 = []
mape_scores_p1 = []
rmse_scores_p1 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p1_unlogged.index,i]),np.array(last_cycle_naive_p1_unlogged.loc[:,last_cycle_naive_p1_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p1_unlogged.index,i]),np.array(last_cycle_naive_p1_unlogged.loc[:,last_cycle_naive_p1_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p1_unlogged.index,i]),np.array(last_cycle_naive_p1_unlogged.loc[:,last_cycle_naive_p1_unlogged.columns[i-1]])))

    # adding scores to score list
    r2_scores_p1.append(r2)
    mape_scores_p1.append(mape)
    rmse_scores_p1.append(rmse)

print(f"Average R2 score for all zones, Period 1: {np.mean(r2_scores_p1)}")
print(f"Average MAPE score for all zones, Period 1: {np.mean(mape_scores_p1)}")
print(f"Average RMSE score for all zones, Period 1: {np.mean(rmse_scores_p1)}")

Average R2 score for all zones, Period 1: -0.403443635119234
Average MAPE score for all zones, Period 1: 0.1751119096892755
Average RMSE score for all zones, Period 1: 15805.303177066051


In [78]:
# most scores are extremely poor
r2_scores_p1

[-0.8045817850316563,
 0.22020601071801205,
 0.22021134777983098,
 -0.21288627755387135,
 -0.08485266343471598,
 0.18635357864996405,
 0.22021134777983098,
 0.1067208111758482,
 -5.270399713829374,
 0.3043234098764367,
 0.1616939130051499,
 0.0384987959159101,
 -0.11459705037099832,
 -0.42820074834427,
 -0.39638026369890045,
 -0.41540187727136635,
 -0.712037845142111,
 -0.35927722881498303,
 -0.7219497766522796,
 -0.006526687141136245]

##### Period 2

In [97]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p2 = []
mape_scores_p2 = []
rmse_scores_p2 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p2_unlogged.index,i]),np.array(last_cycle_naive_p2_unlogged.loc[:,last_cycle_naive_p2_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p2_unlogged.index,i]),np.array(last_cycle_naive_p2_unlogged.loc[:,last_cycle_naive_p2_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p2_unlogged.index,i]),np.array(last_cycle_naive_p2_unlogged.loc[:,last_cycle_naive_p2_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p2.append(r2)
    mape_scores_p2.append(mape)
    rmse_scores_p2.append(rmse)

print(f"Average R2 score for all zones, Period 2: {np.mean(r2_scores_p2)}")
print(f"Average MAPE score for all zones, Period 2: {np.mean(mape_scores_p2)}")
print(f"Average RMSE score for all zones, Period 2: {np.mean(rmse_scores_p2)}")

Average R2 score for all zones, Period 2: -0.7604482328878115
Average MAPE score for all zones, Period 2: 0.24201497981319758
Average RMSE score for all zones, Period 2: 22595.193372002155


##### Period 3

In [98]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p3 = []
mape_scores_p3 = []
rmse_scores_p3 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p3_unlogged.index,i]),np.array(last_cycle_naive_p3_unlogged.loc[:,last_cycle_naive_p3_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p3_unlogged.index,i]),np.array(last_cycle_naive_p3_unlogged.loc[:,last_cycle_naive_p3_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p3_unlogged.index,i]),np.array(last_cycle_naive_p3_unlogged.loc[:,last_cycle_naive_p3_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p3.append(r2)
    mape_scores_p3.append(mape)
    rmse_scores_p3.append(rmse)

print(f"Average R2 score for all zones, Period 3: {np.mean(r2_scores_p3)}")
print(f"Average MAPE score for all zones, Period 3: {np.mean(mape_scores_p3)}")
print(f"Average RMSE score for all zones, Period 3: {np.mean(rmse_scores_p3)}")

Average R2 score for all zones, Period 3: 0.5358353871978386
Average MAPE score for all zones, Period 3: 0.12832544176716906
Average RMSE score for all zones, Period 3: 12729.16779995972


##### Period 4

In [99]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p4 = []
mape_scores_p4 = []
rmse_scores_p4 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p4_unlogged.index,i]),np.array(last_cycle_naive_p4_unlogged.loc[:,last_cycle_naive_p4_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p4_unlogged.index,i]),np.array(last_cycle_naive_p4_unlogged.loc[:,last_cycle_naive_p4_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p4_unlogged.index,i]),np.array(last_cycle_naive_p4_unlogged.loc[:,last_cycle_naive_p4_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p4.append(r2)
    mape_scores_p4.append(mape)
    rmse_scores_p4.append(rmse)

print(f"Average R2 score for all zones, Period 4: {np.mean(r2_scores_p4)}")
print(f"Average MAPE score for all zones, Period 4: {np.mean(mape_scores_p4)}")
print(f"Average RMSE score for all zones, Period 4: {np.mean(rmse_scores_p4)}")

Average R2 score for all zones, Period 4: -2.4401341249971926
Average MAPE score for all zones, Period 4: 0.2265497298883694
Average RMSE score for all zones, Period 4: 21212.871729678176


##### Period 5

In [100]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p5 = []
mape_scores_p5 = []
rmse_scores_p5 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p5_unlogged.index,i]),np.array(last_cycle_naive_p5_unlogged.loc[:,last_cycle_naive_p5_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p5_unlogged.index,i]),np.array(last_cycle_naive_p5_unlogged.loc[:,last_cycle_naive_p5_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p5_unlogged.index,i]),np.array(last_cycle_naive_p5_unlogged.loc[:,last_cycle_naive_p5_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p5.append(r2)
    mape_scores_p5.append(mape)
    rmse_scores_p5.append(rmse)

print(f"Average R2 score for all zones, Period 5: {np.mean(r2_scores_p5)}")
print(f"Average MAPE score for all zones, Period 5: {np.mean(mape_scores_p5)}")
print(f"Average RMSE score for all zones, Period 5: {np.mean(rmse_scores_p5)}")

Average R2 score for all zones, Period 5: -0.15845509256449225
Average MAPE score for all zones, Period 5: 0.2281669305082108
Average RMSE score for all zones, Period 5: 18991.150026154777


##### Period 6

In [101]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p6 = []
mape_scores_p6 = []
rmse_scores_p6 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p6_unlogged.index,i]),np.array(last_cycle_naive_p6_unlogged.loc[:,last_cycle_naive_p6_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p6_unlogged.index,i]),np.array(last_cycle_naive_p6_unlogged.loc[:,last_cycle_naive_p6_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p6_unlogged.index,i]),np.array(last_cycle_naive_p6_unlogged.loc[:,last_cycle_naive_p6_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p6.append(r2)
    mape_scores_p6.append(mape)
    rmse_scores_p6.append(rmse)

print(f"Average R2 score for all zones, Period 6: {np.mean(r2_scores_p6)}")
print(f"Average MAPE score for all zones, Period 6: {np.mean(mape_scores_p6)}")
print(f"Average RMSE score for all zones, Period 6: {np.mean(rmse_scores_p6)}")

Average R2 score for all zones, Period 6: -0.10384797999226474
Average MAPE score for all zones, Period 6: 0.19152103232021125
Average RMSE score for all zones, Period 6: 20614.317789472036


##### Period 7

In [102]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p7 = []
mape_scores_p7 = []
rmse_scores_p7 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p7_unlogged.index,i]),np.array(last_cycle_naive_p7_unlogged.loc[:,last_cycle_naive_p7_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p7_unlogged.index,i]),np.array(last_cycle_naive_p7_unlogged.loc[:,last_cycle_naive_p7_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p7_unlogged.index,i]),np.array(last_cycle_naive_p7_unlogged.loc[:,last_cycle_naive_p7_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p7.append(r2)
    mape_scores_p7.append(mape)
    rmse_scores_p7.append(rmse)

print(f"Average R2 score for all zones, Period 7: {np.mean(r2_scores_p7)}")
print(f"Average MAPE score for all zones, Period 7: {np.mean(mape_scores_p7)}")
print(f"Average RMSE score for all zones, Period 7: {np.mean(rmse_scores_p7)}")

Average R2 score for all zones, Period 7: 0.685169965264994
Average MAPE score for all zones, Period 7: 0.13441551148576594
Average RMSE score for all zones, Period 7: 14533.344010093784


##### Period 8

In [103]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p8 = []
mape_scores_p8 = []
rmse_scores_p8 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p8_unlogged.index,i]),np.array(last_cycle_naive_p8_unlogged.loc[:,last_cycle_naive_p8_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p8_unlogged.index,i]),np.array(last_cycle_naive_p8_unlogged.loc[:,last_cycle_naive_p8_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p8_unlogged.index,i]),np.array(last_cycle_naive_p8_unlogged.loc[:,last_cycle_naive_p8_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p8.append(r2)
    mape_scores_p8.append(mape)
    rmse_scores_p8.append(rmse)

print(f"Average R2 score for all zones, Period 8: {np.mean(r2_scores_p8)}")
print(f"Average MAPE score for all zones, Period 8: {np.mean(mape_scores_p8)}")
print(f"Average RMSE score for all zones, Period 8: {np.mean(rmse_scores_p8)}")

Average R2 score for all zones, Period 8: -0.5163474329789117
Average MAPE score for all zones, Period 8: 0.1574119195207866
Average RMSE score for all zones, Period 8: 13439.366006618406


###### Overall Performance on Average, across periods

In [104]:
# Metric averaged across 8 periods, complete grid performance

avg_r2_list = []
for i in [r2_scores_p1,r2_scores_p2,r2_scores_p3,r2_scores_p4,r2_scores_p5,r2_scores_p6,r2_scores_p7,r2_scores_p8]:
    avg_r = np.mean(i)        # averaged across all 20 zones, for a given period
    avg_r2_list.append(avg_r)

print(f"Average R2 score across all 8 periods, complete grid: {np.mean(avg_r2_list)}")

avg_mape_list = []
for j in [mape_scores_p1,mape_scores_p2,mape_scores_p3,mape_scores_p4,mape_scores_p5,mape_scores_p6,mape_scores_p7,mape_scores_p8]:
    avg_m = np.mean(j)        # averaged across all 20 zones, for a given period
    avg_mape_list.append(avg_m)

print(f"Average MAPE score across all 8 periods, complete grid: {np.mean(avg_mape_list)}")

avg_rmse_list = []
for k in [rmse_scores_p1,rmse_scores_p2,rmse_scores_p3,rmse_scores_p4,rmse_scores_p5,rmse_scores_p6,rmse_scores_p7,rmse_scores_p8]:
    avg_rm = np.mean(k)        # averaged across all 20 zones, for a given period
    avg_rmse_list.append(avg_rm)

print(f"Average RMSE score across all 8 periods, complete grid: {np.mean(avg_rmse_list)}")

Average R2 score across all 8 periods, complete grid: -0.39520889325963426
Average MAPE score across all 8 periods, complete grid: 0.18543968187412327
Average RMSE score across all 8 periods, complete grid: 17490.08923888064


> The error rate (MAPE) has decreased by approximately 47% relative to the original (naive) error rate, when time series regression is used.

> <b>The error rate (RMSE) has decreased by approximately 50% -- from 17,490 to 8818 -- relative to the original (naive) error rate, when time series regression is used.</b>

#### Comparing Competition's Benchmark Values With Actual Load Values

In [105]:
# Reading benchmark load

load_bench = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Dataset\GEFCom2012\GEFCOM2012_Data\Load\Load_benchmark.csv")
load_bench.head()

Unnamed: 0,id,zone_id,year,month,day,h1,h2,h3,h4,h5,...,h15,h16,h17,h18,h19,h20,h21,h22,h23,h24
0,1,1,2005,3,6,20505,19445,19373,18833,19962,...,14333,13606,14009,15133,16097,16949,16895,16322,14969,14016
1,2,2,2005,3,6,167016,163275,164618,164731,165840,...,145182,140364,141993,148293,151031,158234,162813,158044,148985,140408
2,3,3,2005,3,6,180211,176174,177624,177745,178942,...,156652,151453,153210,160008,162963,170735,175675,170530,160756,151501
3,4,4,2005,3,6,530,496,489,460,492,...,497,472,479,512,535,542,552,536,468,415
4,5,5,2005,3,6,8585,8346,8564,8638,8793,...,6835,6486,6679,7343,7723,8347,8536,7856,6816,5974


In [106]:
# Converting the data into long-format

load_bench = load_bench.melt(
                id_vars=["zone_id","year","month","day"],
                value_vars=[f"h{i}" for i in range(1, 25)],
                var_name="hour",
                value_name="load"
                        )


# Replacing hour values with interval mid-point

# Replace the values using .map()
load_bench["hour"] = load_bench["hour"].map(hour_map)

# Creating timestamps using existing information
load_bench["timestamp"] = pd.to_datetime(
    load_bench["year"].astype(str) + "-" +
    load_bench["month"].astype(str).str.zfill(2) + "-" +
    load_bench["day"].astype(str).str.zfill(2) + " " +
    load_bench["hour"]
)

load_bench.head()

Unnamed: 0,zone_id,year,month,day,hour,load,timestamp
0,1,2005,3,6,00:30,20505,2005-03-06 00:30:00
1,2,2005,3,6,00:30,167016,2005-03-06 00:30:00
2,3,2005,3,6,00:30,180211,2005-03-06 00:30:00
3,4,2005,3,6,00:30,530,2005-03-06 00:30:00
4,5,2005,3,6,00:30,8585,2005-03-06 00:30:00


In [107]:
# Pivoting to record zone by column
load_bench = pd.pivot_table(load_bench, index="timestamp", columns="zone_id", values="load")
load_bench[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-03-06 00:30:00,20505.0,167016.0,180211.0,530.0,8585.0,175595.0,180211.0,4053.0,70026.0,23703.0,...,144092.0,21404.0,25238.0,70867.0,36580.0,33450.0,243806.0,91796.0,86322.0,1695779.0
2005-03-06 01:30:00,19445.0,163275.0,176174.0,496.0,8346.0,171615.0,176174.0,3844.0,70123.0,22995.0,...,138805.0,20139.0,23888.0,67545.0,36716.0,32688.0,244966.0,86833.0,82373.0,1655145.0


##### Period 1

In [108]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p1 = []
mape_scores_p1 = []
rmse_scores_p1 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p1_unlogged.index,i]),np.array(load_bench.loc[predictions_p1_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p1_unlogged.index,i]),np.array(load_bench.loc[predictions_p1_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p1_unlogged.index,i]),np.array(load_bench.loc[predictions_p1_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p1.append(r2)
    mape_scores_p1.append(mape)
    rmse_scores_p1.append(rmse)

print(f"Average R2 score for all zones, Period 1: {np.mean(r2_scores_p1)}")
print(f"Average MAPE score for all zones, Period 1: {np.mean(mape_scores_p1)}")
print(f"Average RMSE score for all zones, Period 1: {np.mean(rmse_scores_p1)}")

Average R2 score for all zones, Period 1: 0.7843876695118762
Average MAPE score for all zones, Period 1: 0.05993021699568011
Average RMSE score for all zones, Period 1: 5698.5266936622


##### Period 2

In [109]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p2 = []
mape_scores_p2 = []
rmse_scores_p2 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p2_unlogged.index,i]),np.array(load_bench.loc[predictions_p2_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p2_unlogged.index,i]),np.array(load_bench.loc[predictions_p2_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p2_unlogged.index,i]),np.array(load_bench.loc[predictions_p2_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p2.append(r2)
    mape_scores_p2.append(mape)
    rmse_scores_p2.append(rmse)

print(f"Average R2 score for all zones, Period 2: {np.mean(r2_scores_p2)}")
print(f"Average MAPE score for all zones, Period 2: {np.mean(mape_scores_p2)}")
print(f"Average RMSE score for all zones, Period 2: {np.mean(rmse_scores_p2)}")

Average R2 score for all zones, Period 2: 0.7670102016469259
Average MAPE score for all zones, Period 2: 0.07011818431947027
Average RMSE score for all zones, Period 2: 6212.343813248906


##### Period 3

In [110]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p3 = []
mape_scores_p3 = []
rmse_scores_p3 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p3_unlogged.index,i]),np.array(load_bench.loc[predictions_p3_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p3_unlogged.index,i]),np.array(load_bench.loc[predictions_p3_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p3_unlogged.index,i]),np.array(load_bench.loc[predictions_p3_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p3.append(r2)
    mape_scores_p3.append(mape)
    rmse_scores_p3.append(rmse)

print(f"Average R2 score for all zones, Period 3: {np.mean(r2_scores_p3)}")
print(f"Average MAPE score for all zones, Period 3: {np.mean(mape_scores_p3)}")
print(f"Average RMSE score for all zones, Period 3: {np.mean(rmse_scores_p3)}")

Average R2 score for all zones, Period 3: 0.8408390776386794
Average MAPE score for all zones, Period 3: 0.06986351198380902
Average RMSE score for all zones, Period 3: 6695.382053205253


##### Period 4

In [93]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p4 = []
mape_scores_p4 = []
rmse_scores_p4 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p4_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p4_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p4_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p4.append(r2)
    mape_scores_p4.append(mape)
    rmse_scores_p4.append(rmse)

print(f"Average R2 score for all zones, Period 4: {np.mean(r2_scores_p4)}")
print(f"Average MAPE score for all zones, Period 4: {np.mean(mape_scores_p4)}")
print(f"Average RMSE score for all zones, Period 4: {np.mean(rmse_scores_p4)}")

Average R2 score for all zones, Period 4: 0.4436922037267463
Average MAPE score for all zones, Period 4: 0.10292999439807642
Average RMSE score for all zones, Period 4: 7866.08314636025


##### Period 5

In [94]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p5 = []
mape_scores_p5 = []
rmse_scores_p5 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p5_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p5_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p5_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p5.append(r2)
    mape_scores_p5.append(mape)
    rmse_scores_p5.append(rmse)

print(f"Average R2 score for all zones, Period 5: {np.mean(r2_scores_p5)}")
print(f"Average MAPE score for all zones, Period 5: {np.mean(mape_scores_p5)}")
print(f"Average RMSE score for all zones, Period 5: {np.mean(rmse_scores_p5)}")

Average R2 score for all zones, Period 5: 0.3605287919423327
Average MAPE score for all zones, Period 5: 0.10270817944634048
Average RMSE score for all zones, Period 5: 6441.805707461544


##### Period 6

In [95]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p6 = []
mape_scores_p6 = []
rmse_scores_p6 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p6_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p6_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p6_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p6.append(r2)
    mape_scores_p6.append(mape)
    rmse_scores_p6.append(rmse)

print(f"Average R2 score for all zones, Period 6: {np.mean(r2_scores_p6)}")
print(f"Average MAPE score for all zones, Period 6: {np.mean(mape_scores_p6)}")
print(f"Average RMSE score for all zones, Period 6: {np.mean(rmse_scores_p6)}")

Average R2 score for all zones, Period 6: 0.5734876577895001
Average MAPE score for all zones, Period 6: 0.11750389336898162
Average RMSE score for all zones, Period 6: 9385.092514092286


##### Period 7

In [96]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p7 = []
mape_scores_p7 = []
rmse_scores_p7 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p7_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p7_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p7_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p7.append(r2)
    mape_scores_p7.append(mape)
    rmse_scores_p7.append(rmse)

print(f"Average R2 score for all zones, Period 7: {np.mean(r2_scores_p7)}")
print(f"Average MAPE score for all zones, Period 7: {np.mean(mape_scores_p7)}")
print(f"Average RMSE score for all zones, Period 7: {np.mean(rmse_scores_p7)}")

Average R2 score for all zones, Period 7: 0.9163187319055831
Average MAPE score for all zones, Period 7: 0.07326753758533004
Average RMSE score for all zones, Period 7: 6190.742612735311


##### Period 8

In [97]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p8 = []
mape_scores_p8 = []
rmse_scores_p8 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p8_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p8_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p8_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p8.append(r2)
    mape_scores_p8.append(mape)
    rmse_scores_p8.append(rmse)

print(f"Average R2 score for all zones, Period 8: {np.mean(r2_scores_p8)}")
print(f"Average MAPE score for all zones, Period 8: {np.mean(mape_scores_p8)}")
print(f"Average RMSE score for all zones, Period 8: {np.mean(rmse_scores_p8)}")

Average R2 score for all zones, Period 8: 0.4237862559810378
Average MAPE score for all zones, Period 8: 0.09115063921173688
Average RMSE score for all zones, Period 8: 8393.01873850579


###### Overall Performance on Average, across periods

In [98]:
# Metric averaged across 8 periods, complete grid performance

avg_r2_list = []
for i in [r2_scores_p1,r2_scores_p2,r2_scores_p3,r2_scores_p4,r2_scores_p5,r2_scores_p6,r2_scores_p7,r2_scores_p8]:
    avg_r = np.mean(i)        # averaged across all 20 zones, for a given period
    avg_r2_list.append(avg_r)

print(f"Average R2 score across all 8 periods, complete grid: {np.mean(avg_r2_list)}")

avg_mape_list = []
for j in [mape_scores_p1,mape_scores_p2,mape_scores_p3,mape_scores_p4,mape_scores_p5,mape_scores_p6,mape_scores_p7,mape_scores_p8]:
    avg_m = np.mean(j)        # averaged across all 20 zones, for a given period
    avg_mape_list.append(avg_m)

print(f"Average MAPE score across all 8 periods, complete grid: {np.mean(avg_mape_list)}")

avg_rmse_list = []
for k in [rmse_scores_p1,rmse_scores_p2,rmse_scores_p3,rmse_scores_p4,rmse_scores_p5,rmse_scores_p6,rmse_scores_p7,rmse_scores_p8]:
    avg_rm = np.mean(k)        # averaged across all 20 zones, for a given period
    avg_rmse_list.append(avg_rm)

print(f"Average RMSE score across all 8 periods, complete grid: {np.mean(avg_rmse_list)}")

Average R2 score across all 8 periods, complete grid: 0.6387563237678352
Average MAPE score across all 8 periods, complete grid: 0.08593401966367811
Average RMSE score across all 8 periods, complete grid: 7110.374409908943


> !For benchmarked values, error reduced by 53.5% compared to naive method (MAPE)!

> !<b>For benchmarked values, error reduced by 59.3% compared to naive method (RMSE)</b>!

### Conclusion
Compared to competition benchmark, error i.e. rmse increased from 7110.374409908943 to 8818