# Grid Forecasting: Missing Values
In the previous notebooks, analysis was done only with respect to Zone 1. Here, Time Series Regression is used to perform prediction for all zones simultaneously.
> Predictions are done to fill missing values (best estimates)

## Pre-processing

### Defining Dependent Variable

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

# Loading the data
load_long = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Exploratory Code\load_history_long.csv").sort_values(by = "timestamp")
load_long.head()

Unnamed: 0,zone_id,year,month,day,hour,load,timestamp
0,1,2004,1,1,00:30,16853.0,2004-01-01 00:30:00
14850,10,2004,1,1,00:30,23339.0,2004-01-01 00:30:00
16500,11,2004,1,1,00:30,90700.0,2004-01-01 00:30:00
28050,18,2004,1,1,00:30,200946.0,2004-01-01 00:30:00
9900,7,2004,1,1,00:30,136233.0,2004-01-01 00:30:00


In [2]:
# Converting to Wide Format
load_wide = load_long.pivot_table(
    index='timestamp',      # The column to use as the index
    columns='zone_id',  # The column whose unique values will become the new column names
    values='load'    # The column to use for the values in the new DataFrame.
).sort_values(by="timestamp")

# Converting string to datetime
from datetime import datetime
load_wide.index = pd.to_datetime(load_wide.index)
load_wide.index[0]

# Segregating temporal information
load_wide['year'] = load_wide.index.year
load_wide['month'] = load_wide.index.month
load_wide['day'] = load_wide.index.day
load_wide['hour'] = load_wide.index.hour

load_wide.head()

zone_id,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,year,month,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01 00:30:00,16853.0,126259.0,136233.0,484.0,6829.0,133088.0,136233.0,3124.0,75243.0,23339.0,...,65970.0,28752.0,30645.0,200946.0,82298.0,79830.0,2004,1,1,0
2004-01-01 01:30:00,16450.0,123313.0,133055.0,457.0,6596.0,129909.0,133055.0,2956.0,67368.0,22100.0,...,64600.0,27851.0,30461.0,195835.0,79827.0,77429.0,2004,1,1,1
2004-01-01 02:30:00,16517.0,119192.0,128608.0,450.0,6525.0,125717.0,128608.0,2953.0,64050.0,21376.0,...,63843.0,27631.0,30197.0,194093.0,77728.0,75558.0,2004,1,1,2
2004-01-01 03:30:00,16873.0,117507.0,126791.0,448.0,6654.0,124162.0,126791.0,2914.0,63861.0,21335.0,...,64023.0,27986.0,30264.0,194708.0,76433.0,75709.0,2004,1,1,3
2004-01-01 04:30:00,17064.0,118343.0,127692.0,444.0,6977.0,125320.0,127692.0,3221.0,75852.0,21564.0,...,65679.0,29160.0,30907.0,202458.0,78172.0,77475.0,2004,1,1,4


In [3]:
# Log transformation on load values (no-scaling)
load_wide_log = load_wide
load_wide_log[list(range(1,21,1))] = load_wide_log[list(range(1,21,1))].apply(np.log)
load_wide_log[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,year,month,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01 00:30:00,9.732284,11.746091,11.822122,6.182085,8.828934,11.798766,11.822122,8.04687,11.228478,10.057881,...,11.096955,10.266463,10.330225,12.210791,11.318102,11.287655,2004,1,1,0
2004-01-01 01:30:00,9.708081,11.722481,11.798518,6.124683,8.794219,11.774589,11.798518,7.991592,11.117925,10.003333,...,11.07597,10.234624,10.324202,12.185028,11.287617,11.257117,2004,1,1,1


### Defining Feature Matrix

In [4]:
# Loading the temperature data
temperature = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Exploratory Code\weighted_temperature.csv")
feature_matrix = temperature[["timestamp","temp_weighted"]]
feature_matrix.set_index("timestamp", inplace=True)
feature_matrix.index = pd.to_datetime(feature_matrix.index)
feature_matrix[0:3]

Unnamed: 0_level_0,temp_weighted
timestamp,Unnamed: 1_level_1
2004-01-01 00:30:00,42.338937
2004-01-01 01:30:00,41.239284
2004-01-01 02:30:00,39.591442


In [5]:
# Adding temperature knots for PLR

T_H = 55  # Heating Threshold
T_C = 65  # Cooling Threshold

# Temporarily changing alias of df
mul_df = feature_matrix.copy()

# Construct the Heating Demand Knot: HDK = max(0, T_H - Temp)
## This captures load increase when temp is below T_H.
mul_df["HDK"] = np.where(
    mul_df["temp_weighted"] < T_H,  
    T_H - mul_df["temp_weighted"],  # Value if True: The positive difference
    0                               # Value if False: Zero
)

# Construct the Cooling Demand Knot; CDK = max(0, Temp - T_C)
## This captures load increase when temp is above T_C.
mul_df["CDK"] = np.where(
    mul_df["temp_weighted"] > T_C,  
    mul_df["temp_weighted"] - T_C,  # Value if True: The positive difference
    0                               # Value if False: Zero
)

# Reverting back to original alias
feature_matrix = mul_df.copy()
feature_matrix[0:3]

Unnamed: 0_level_0,temp_weighted,HDK,CDK
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-01-01 00:30:00,42.338937,12.661063,0.0
2004-01-01 01:30:00,41.239284,13.760716,0.0
2004-01-01 02:30:00,39.591442,15.408558,0.0


In [6]:
# Segregating temporal information
feature_matrix['year'] = feature_matrix.index.year
feature_matrix['month'] = feature_matrix.index.month
feature_matrix['day'] = feature_matrix.index.day
feature_matrix['hour'] = feature_matrix.index.hour
feature_matrix.sort_index(inplace=True)
feature_matrix.tail()

Unnamed: 0_level_0,temp_weighted,HDK,CDK,year,month,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-06-30 01:30:00,71.08037,0.0,6.08037,2008,6,30,1
2008-06-30 02:30:00,70.803812,0.0,5.803812,2008,6,30,2
2008-06-30 03:30:00,70.248102,0.0,5.248102,2008,6,30,3
2008-06-30 04:30:00,70.065118,0.0,5.065118,2008,6,30,4
2008-06-30 05:30:00,69.793713,0.0,4.793713,2008,6,30,5


### Filling Missing Values: Forecasting & Backcasting

#### Period 1: 6 Mar 2005 - 12 Mar 2005

In [7]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-03-06 00:30:00") & (feature_matrix.index > "2005-02-26 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-03-06 00:30:00") & (load_wide_log.index > "2005-02-26 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2005-03-21 00:30:00") & (feature_matrix.index > "2005-03-12 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2005-03-21 00:30:00") & (load_wide_log.index > "2005-03-12 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [8]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [9]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [10]:
# Defining feature space dedicated to period 1
period_1_X = feature_matrix.loc[(feature_matrix.index >= "2005-03-06 00:30:00") & (feature_matrix.index <= "2005-03-12 23:30:00"),:]

dp = DeterministicProcess(
    index=period_1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_1_X = period_1_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_1 = period_1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [11]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_1.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_1)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_1.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_1)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [12]:
# Mean Prediction
predictions_p1_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p1_mean_unlogged = np.exp(predictions_p1_mean)
predictions_p1_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-03-06 00:30:00,20802.333067,181539.662243,195881.735649,515.862557,8508.35248,190074.436355,195881.735649,3992.49197,72807.831556,28069.218974,110882.269044,137132.679286,22146.998363,27000.24385,71925.496205,37078.741315,37044.87494,253307.703203,94383.967478,95476.234489
2005-03-06 01:30:00,19973.21536,174698.825433,188500.429198,450.692592,8071.055353,182793.283295,188500.429198,3828.383179,76400.867021,26581.378142,105631.237415,129653.470417,21098.121718,25542.480837,69184.621704,35599.913346,35449.879414,242435.857767,89913.702226,91438.413253
2005-03-06 02:30:00,20286.386948,174443.284904,188224.67916,429.171659,8149.661463,182621.018169,188224.67916,3828.654856,74868.098771,26371.963173,105273.291919,128438.104849,20924.686692,25351.922036,69076.86044,35732.069786,35471.91834,243295.774857,90142.023854,91119.385068
2005-03-06 03:30:00,20662.735856,176178.134673,190096.551622,421.756983,8451.334979,184659.291178,190096.551622,3921.771939,73387.609394,26399.287914,106591.447579,130295.520387,21038.387735,25076.383225,68818.040702,36030.883806,35439.92073,245372.977014,90270.18381,91805.928109
2005-03-06 04:30:00,22609.260386,186350.156991,201072.167632,469.263193,9340.75461,195716.078376,201072.167632,4191.027546,65996.904569,28126.61181,114029.327321,139284.426934,22304.391334,26648.556982,72503.750295,38479.775617,37731.24695,263217.569879,97061.920752,97245.223276


In [13]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p1_unlogged = np.exp(
    train_p1_y[-len(period_1_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p1_unlogged.index = period_1_X.index
last_cycle_naive_p1_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-03-06 00:30:00,18954.0,162388.0,175217.0,531.0,8530.0,170918.0,175217.0,4054.0,80850.0,25031.0,108347.0,142581.0,22016.0,25947.0,70602.0,36004.0,34078.0,245140.0,91067.0,85633.0
2005-03-06 01:30:00,19614.0,161693.0,174468.0,502.0,8437.0,170131.0,174468.0,4009.0,80241.0,24794.0,106572.0,138708.0,22100.0,25967.0,70621.0,36449.0,33875.0,247743.0,91186.0,85038.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 2: 20 Jun 2005 - 26 Jun 2005

In [14]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-06-20 00:30:00") & (feature_matrix.index > "2005-06-11 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-06-20 00:30:00") & (load_wide_log.index > "2005-06-11 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2005-07-05 00:30:00") & (feature_matrix.index > "2005-06-26 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2005-07-05 00:30:00") & (load_wide_log.index > "2005-06-26 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [15]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [16]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [17]:
# Defining feature space dedicated to period 1
period_2_X = feature_matrix.loc[(feature_matrix.index >= "2005-06-20 00:30:00") & (feature_matrix.index <= "2005-06-26 23:30:00"),:]

dp = DeterministicProcess(
    index=period_2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_2_X = period_2_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_2 = period_2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [18]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_2.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_2)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_2.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_2)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [19]:
# Mean Prediction
predictions_p2_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p2_mean_unlogged = np.exp(predictions_p2_mean)
predictions_p2_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-06-20 00:30:00,13504.556262,145093.576382,156556.262647,364.7184,6399.274188,151544.063843,156556.262647,2820.022518,125845.887423,19634.518332,81572.032755,96813.494347,15378.6684,15011.125419,49762.189115,21303.391412,25921.193033,166304.302301,57560.478824,71558.947693
2005-06-20 01:30:00,12166.159439,136763.68289,147568.241105,331.09101,5633.966571,142406.17487,147568.241105,2554.699683,120142.150116,18668.054841,75407.283944,87942.671345,13804.25888,13454.856233,45829.46195,19253.107612,24178.050249,149243.626799,51590.230084,65921.862974
2005-06-20 02:30:00,11243.720139,130792.90194,141125.713069,312.356438,5118.225889,135882.762859,141125.713069,2387.431874,115225.41385,18002.037758,71080.668898,81617.380364,12775.167397,12315.339761,42855.204878,17802.707556,22890.700755,137299.678043,47368.96853,62234.54653
2005-06-20 03:30:00,10779.228656,127827.823412,137926.356024,309.396679,4861.607586,132643.605086,137926.356024,2323.762097,112096.880334,17729.62728,68981.33996,78316.168897,12383.135081,11687.981522,41214.766341,17068.020933,22196.412772,131249.214507,45223.644328,60865.643369
2005-06-20 04:30:00,10759.528174,127973.356905,138083.369424,321.644278,4847.569331,132781.047477,138083.369424,2359.858843,111749.64124,17849.845731,69123.22843,78035.514218,12633.899241,11567.128792,41001.476444,17048.046324,22108.162303,131016.86769,45129.283567,61817.719413


In [20]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p2_unlogged = np.exp(
    train_p1_y[-len(period_2_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p2_unlogged.index = period_2_X.index
last_cycle_naive_p2_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-06-20 00:30:00,15416.0,149278.0,161072.0,337.0,6698.0,155977.0,161072.0,2859.0,82992.0,21079.0,92005.0,117462.0,15390.0,20707.0,56611.0,26251.0,31975.0,185626.0,71916.0,71910.0
2005-06-20 01:30:00,14390.0,143795.0,155155.0,322.0,5938.0,149733.0,155155.0,2622.0,82551.0,20259.0,84773.0,106672.0,14315.0,18792.0,52290.0,24402.0,30406.0,170506.0,65670.0,67696.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 3: 10 Sep 2005 - 16 Sep 2005

In [21]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-09-10 00:30:00") & (feature_matrix.index > "2005-09-02 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-09-10 00:30:00") & (load_wide_log.index > "2005-09-02 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2005-09-23 00:30:00") & (feature_matrix.index > "2005-09-16 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2005-09-23 00:30:00") & (load_wide_log.index > "2005-09-16 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [22]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [23]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [24]:
# Defining feature space dedicated to period 1
period_3_X = feature_matrix.loc[(feature_matrix.index >= "2005-09-10 00:30:00") & (feature_matrix.index <= "2005-09-16 23:30:00"),:]

dp = DeterministicProcess(
    index=period_3_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_3_X = period_3_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_3 = period_3_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [25]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_3.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_3)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_3.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_3)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [26]:
# Mean Prediction
predictions_p3_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p3_mean_unlogged = np.exp(predictions_p3_mean)
predictions_p3_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-09-10 00:30:00,13425.883573,122054.244571,131696.672907,367.683934,5384.740671,127563.990687,131696.672907,2336.080305,84391.913014,21493.04538,79240.755312,103454.37989,13835.562835,16743.086052,55722.818008,23464.707087,25988.445722,152444.845747,60834.793698,59056.74683
2005-09-10 01:30:00,12097.596747,115132.261389,124227.886388,333.820135,4717.108618,119965.287451,124227.886388,2130.772417,84481.82083,19994.374166,72378.37139,92622.70831,12595.157847,14807.884063,51438.841048,21083.71275,24034.212123,137225.546876,54466.873981,54729.625066
2005-09-10 02:30:00,11341.586226,111282.36024,120073.878546,317.887531,4373.430353,115754.691852,120073.878546,2038.752907,83948.875889,19082.333836,68398.931059,86201.658655,11945.044228,13636.305811,48808.259913,19680.134495,22867.779837,128667.12454,50800.745741,52612.882715
2005-09-10 03:30:00,11125.530327,110700.758997,119446.364748,320.150629,4322.210727,115115.257107,119446.364748,2053.102612,83148.879114,18827.025341,67378.024184,84095.068609,11912.799052,13205.596076,47939.245034,19252.2352,22504.593792,126637.545707,49762.133316,52762.792599
2005-09-10 04:30:00,11386.478239,113187.670563,122129.770098,339.397518,4525.803454,117810.377822,122129.770098,2159.932774,82260.332876,19215.531463,69075.243495,85843.420987,12472.672747,13441.81527,48733.108391,19702.702721,22886.642892,130546.711354,51094.247811,55026.162764


In [27]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p3_unlogged = np.exp(
    train_p1_y[-len(period_3_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p3_unlogged.index = period_3_X.index
last_cycle_naive_p3_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-09-10 00:30:00,14051.0,142723.0,153999.0,401.0,6045.0,148768.0,153999.0,2550.0,95193.0,27228.0,96888.0,121831.0,14637.0,16886.0,57439.0,27612.0,26608.0,172283.0,62838.0,65611.0
2005-09-10 01:30:00,12366.0,133690.0,144251.0,342.0,5152.0,138841.0,144251.0,2252.0,94983.0,25381.0,86992.0,106760.0,13390.0,14886.0,53102.0,24180.0,24548.0,151473.0,55876.0,60123.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 4: 25 Dec 2005 - 31 Dec 2005

In [28]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2005-12-25 00:30:00") & (feature_matrix.index > "2005-12-16 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2005-12-25 00:30:00") & (load_wide_log.index > "2005-12-16 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2006-01-08 00:30:00") & (feature_matrix.index > "2005-12-31 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2006-01-08 00:30:00") & (load_wide_log.index > "2005-12-31 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [29]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [30]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [31]:
# Defining feature space dedicated to period 1
period_4_X = feature_matrix.loc[(feature_matrix.index >= "2005-12-25 00:30:00") & (feature_matrix.index <= "2005-12-31 23:30:00"),:]

dp = DeterministicProcess(
    index=period_4_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_4_X = period_4_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_4 = period_4_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [32]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_4.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_4)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_4.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_4)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [33]:
# Mean Prediction
predictions_p4_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p4_mean_unlogged = np.exp(predictions_p4_mean)
predictions_p4_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-12-25 00:30:00,20239.251798,156775.809325,169161.415787,540.197777,8869.077605,165624.292266,169161.415787,3976.485902,74087.670074,23768.841135,103095.729938,141329.893289,22823.746281,28408.316818,72894.234129,36177.679773,35785.84507,246025.511379,95467.692465,87854.138456
2005-12-25 01:30:00,19509.6933,151378.578832,163337.788075,509.114419,8390.598859,159766.268875,163337.788075,3795.568385,76093.432463,22732.694874,97804.916983,132436.58144,21768.51589,27111.095548,70603.97615,34745.192728,34613.241468,237535.080155,91883.497129,84526.143658
2005-12-25 02:30:00,19509.896884,149876.675409,161717.239493,498.459523,8338.802026,158218.300859,161717.239493,3751.567753,77907.052412,22411.091858,96196.929326,129108.586277,21275.748768,26534.674374,69695.608477,34390.762192,34341.861098,236494.527635,90961.852901,83412.782395
2005-12-25 03:30:00,20157.878083,152069.612446,164083.443467,507.323345,8669.180826,160736.211227,164083.443467,3832.634946,78666.171464,22747.410704,97914.912906,130792.818791,21341.884974,26585.597898,70094.682715,34919.812108,34868.59355,241771.72465,92369.750644,84384.315226
2005-12-25 04:30:00,21231.816343,157164.060258,169580.39266,532.043698,9278.781684,166420.035364,169580.39266,4006.390134,77770.369611,23575.903354,102072.654415,136183.041986,21867.420325,27031.464543,71400.868333,35942.376942,35906.386612,250868.27581,95241.31886,86948.976345


In [34]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p4_unlogged = np.exp(
    train_p1_y[-len(period_4_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p4_unlogged.index = period_4_X.index
last_cycle_naive_p4_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-12-25 00:30:00,21429.0,179903.0,194116.0,537.0,10044.0,189947.0,194116.0,4456.0,80115.0,24745.0,109326.0,146826.0,23890.0,26423.0,74062.0,37538.0,35435.0,263232.0,96292.0,94372.0
2005-12-25 01:30:00,21124.0,174678.0,188478.0,524.0,9628.0,184306.0,188478.0,4274.0,79884.0,23819.0,105173.0,140610.0,23035.0,25054.0,71411.0,36092.0,33705.0,256316.0,92807.0,91423.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 5: 13 Feb 2006 - 19 Feb 2006

In [35]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-02-13 00:30:00") & (feature_matrix.index > "2006-02-05 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-02-13 00:30:00") & (load_wide_log.index > "2006-02-05 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2006-02-26 00:30:00") & (feature_matrix.index > "2006-02-19 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2006-02-26 00:30:00") & (load_wide_log.index > "2006-02-19 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [36]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [37]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [38]:
# Defining feature space dedicated to period 1
period_5_X = feature_matrix.loc[(feature_matrix.index >= "2006-02-13 00:30:00") & (feature_matrix.index <= "2006-02-19 23:30:00"),:]

dp = DeterministicProcess(
    index=period_5_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_5_X = period_5_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_5 = period_5_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [39]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_5.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_5)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_5.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_5)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [40]:
# Mean Prediction
predictions_p5_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p5_mean_unlogged = np.exp(predictions_p5_mean)
predictions_p5_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-02-13 00:30:00,21452.619328,197400.547213,212995.585905,541.901326,9297.023787,206701.458471,212995.585905,4159.59917,66146.224903,30636.373837,116280.661018,151169.457921,24244.018647,26431.732481,70573.035123,37555.121103,38510.10796,267228.020762,93665.56768,99940.86827
2006-02-13 01:30:00,21387.695577,192799.542447,208031.109739,517.119122,9116.947953,201932.439743,208031.109739,4053.017084,66515.473801,29848.697381,113949.818906,147184.148454,23638.496945,26365.806545,70054.05145,37592.367586,38419.363954,266397.079082,93611.669198,97857.735656
2006-02-13 02:30:00,21911.045379,192310.5952,207503.558519,511.101791,9280.497945,201620.803547,207503.558519,4070.015655,66248.86807,29789.774908,114615.504914,147242.996711,23472.817509,26633.174636,70402.449263,38260.040994,39046.91912,271028.126436,95412.423331,97850.481783
2006-02-13 03:30:00,23231.116238,197204.736666,212784.379298,528.284457,9872.678682,207112.341531,212784.379298,4235.146199,64828.688262,30777.957896,119298.559187,152738.992882,24006.097963,27591.759832,72334.47672,39985.443242,40746.476557,283923.119026,100098.969084,100662.152143
2006-02-13 04:30:00,25359.121033,207617.962125,224020.311124,569.265829,10893.136007,218528.331212,224020.311124,4542.307608,62191.924455,32914.972882,128162.965154,163865.416221,25340.912617,29384.115435,76104.374125,42896.071298,43571.559195,305702.49321,107924.375714,106401.372144


In [41]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p5_unlogged = np.exp(
    train_p1_y[-len(period_5_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p5_unlogged.index = period_5_X.index
last_cycle_naive_p5_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-02-13 00:30:00,22108.0,173382.0,187079.0,510.0,8332.0,181713.0,187079.0,3964.0,64848.0,23557.0,93146.0,127480.0,23316.0,24632.0,70279.0,33329.0,36974.0,236617.0,92291.0,90099.0
2006-02-13 01:30:00,21988.0,173426.0,187126.0,502.0,8288.0,181713.0,187126.0,3798.0,65478.0,23188.0,91117.0,124229.0,22799.0,24574.0,70199.0,33797.0,36221.0,235005.0,91260.0,88589.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 6: 25 May 2006 - 31 May 2006

In [42]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-05-25 00:30:00") & (feature_matrix.index > "2006-05-17 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-05-25 00:30:00") & (load_wide_log.index > "2006-05-17 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2006-06-08 00:30:00") & (feature_matrix.index > "2006-05-31 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2006-06-08 00:30:00") & (load_wide_log.index > "2006-05-31 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [43]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [44]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [45]:
# Defining feature space dedicated to period 1
period_6_X = feature_matrix.loc[(feature_matrix.index >= "2006-05-25 00:30:00") & (feature_matrix.index <= "2006-05-31 23:30:00"),:]

dp = DeterministicProcess(
    index=period_6_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_6_X = period_6_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_6 = period_6_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [46]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_6.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_6)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_6.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_6)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [47]:
# Mean Prediction
predictions_p6_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p6_mean_unlogged = np.exp(predictions_p6_mean)
predictions_p6_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-05-25 00:30:00,10412.224387,99142.407845,106974.008513,322.115907,3644.838779,102840.689429,106974.008513,2187.398267,123708.090695,12961.052199,55301.429431,71778.446359,11910.238937,10358.310105,40682.313885,15092.070868,18252.368517,107611.985695,41315.395879,50953.0429
2006-05-25 01:30:00,8521.736062,81396.592418,87825.943855,277.443177,2704.281944,84097.806297,87825.943855,1938.026827,128844.640122,9859.075601,43157.850466,56388.289439,9915.103199,7688.012452,34147.715644,11822.373771,14105.47221,83145.83282,31993.427603,41876.726238
2006-05-25 02:30:00,7856.71337,75942.687704,81941.105141,261.989644,2414.944856,78325.582277,81941.105141,1861.323381,127149.461895,8921.827963,39072.461829,50664.68176,9261.40255,6757.383893,31785.473578,10664.794822,12763.45172,75008.272123,28890.042838,39096.880094
2006-05-25 03:30:00,7788.053065,75469.215693,81430.190685,264.758772,2406.418633,77840.776782,81430.190685,1896.75086,122308.55249,8785.184961,38419.74045,49454.198484,9361.916159,6588.527207,31589.980849,10512.493429,12491.911777,74278.061804,28677.383006,39291.660924
2006-05-25 04:30:00,8491.420505,82621.51131,89147.578486,289.587057,2774.00385,85383.603127,89147.578486,2062.411478,113977.810438,9866.106866,42608.49991,54136.234403,10435.097131,7399.862693,34235.082556,11668.711886,13806.239181,83445.481557,32260.519336,43639.831461


In [48]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p6_unlogged = np.exp(
    train_p1_y[-len(period_6_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p6_unlogged.index = period_6_X.index
last_cycle_naive_p6_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-05-25 00:30:00,11268.0,128006.0,138119.0,346.0,4030.0,132037.0,138119.0,2319.0,80703.0,18125.0,62918.0,73969.0,12821.0,10846.0,41124.0,14890.0,23115.0,118544.0,44021.0,62178.0
2006-05-25 01:30:00,10635.0,123694.0,133467.0,331.0,3723.0,127418.0,133467.0,2224.0,69027.0,16771.0,58922.0,67396.0,12212.0,9960.0,38955.0,14050.0,21825.0,110073.0,41008.0,59775.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 7: 02 Aug 2006 - 08 Aug 2006

In [49]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-08-02 00:30:00") & (feature_matrix.index > "2006-07-25 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-08-02 00:30:00") & (load_wide_log.index > "2006-07-25 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2006-08-16 00:30:00") & (feature_matrix.index > "2006-08-08 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2006-08-16 00:30:00") & (load_wide_log.index > "2006-08-08 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [50]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [51]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [52]:
# Defining feature space dedicated to period 1
period_7_X = feature_matrix.loc[(feature_matrix.index >= "2006-08-02 00:30:00") & (feature_matrix.index <= "2006-08-08 23:30:00"),:]

dp = DeterministicProcess(
    index=period_7_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_7_X = period_7_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_7 = period_7_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [53]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_7.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_7)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_7.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_7)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [54]:
# Mean Prediction
predictions_p7_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p7_mean_unlogged = np.exp(predictions_p7_mean)
predictions_p7_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-08-02 00:30:00,22263.173983,189471.784514,204440.480273,413.564144,7951.32119,197404.977538,204440.480273,3192.990675,102967.240815,34004.605391,123666.676419,152914.692524,18721.601404,23002.170301,73725.158485,30888.022328,38678.890407,227521.843538,90539.272036,95629.972864
2006-08-02 01:30:00,19692.746676,175310.881411,189160.802303,371.742922,6896.625839,182142.616152,189160.802303,2876.443971,97873.266129,31131.278528,110811.649694,134140.038793,16771.486267,19966.100599,66540.430593,27355.578331,35298.274875,200430.742033,79640.550664,86720.723267
2006-08-02 02:30:00,17898.544942,165566.116388,178646.14695,345.459776,6204.861247,171666.302158,178646.14695,2681.831628,93226.558341,29167.496922,101921.900085,120987.675559,15536.856513,17846.31329,61472.955832,24925.798435,32834.601625,181903.227119,72195.209674,80930.393775
2006-08-02 03:30:00,16921.895186,160873.125109,173582.372439,336.036383,5869.351473,166618.401142,173582.372439,2608.096218,91112.915405,28260.319227,97416.141548,113865.900566,15071.171779,16689.352788,58818.497689,23680.853115,31450.624043,172492.319884,68389.973071,78556.317186
2006-08-02 04:30:00,16314.11347,157439.808494,169877.795058,341.500828,5726.604923,163058.85851,169877.795058,2613.633062,91728.422019,27642.898757,94529.923261,109557.334266,15135.529493,16110.392445,57335.046658,23091.209048,30438.153476,167777.715565,66416.840823,77636.149418


In [55]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p7_unlogged = np.exp(
    train_p1_y[-len(period_7_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p7_unlogged.index = period_7_X.index
last_cycle_naive_p7_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-08-02 00:30:00,16349.0,161874.0,174662.0,402.0,6234.0,168108.0,174662.0,2730.0,93450.0,26556.0,99613.0,123715.0,15541.0,17521.0,55921.0,24984.0,31938.0,177341.0,66148.0,77538.0
2006-08-02 01:30:00,14609.0,154462.0,166665.0,376.0,5615.0,160077.0,166665.0,2502.0,93660.0,24888.0,92294.0,110378.0,14234.0,15618.0,51720.0,22532.0,29547.0,161397.0,60256.0,72783.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

#### Period 8: 22 Nov 2006 - 28 Nov 2006

In [56]:
# Creating training data 1 (1 weeks)
train_p1_X = feature_matrix.loc[(feature_matrix.index < "2006-11-22 00:30:00") & (feature_matrix.index > "2006-11-13 23:30:00"),:]
train_p1_y = load_wide_log.loc[(load_wide_log.index < "2006-11-22 00:30:00") & (load_wide_log.index > "2006-11-13 23:30:00"), list(range(1,21,1))]

# Creating training data 2 (1 weeks)
train_p2_X = feature_matrix.loc[(feature_matrix.index < "2006-12-06 00:30:00") & (feature_matrix.index > "2006-11-28 23:30:00"),:]
train_p2_y = load_wide_log.loc[(load_wide_log.index < "2006-12-06 00:30:00") & (load_wide_log.index > "2006-11-28 23:30:00"), list(range(1,21,1))]

# Adding periodicities
from statsmodels.tsa.deterministic import Fourier
from statsmodels.tsa.deterministic import DeterministicProcess
periodicity = Fourier(period=24, order=2) # daily cycle i.e. 24 hours, 2 harmonics

# Defining the sine wave (training set 1)
dp1 = DeterministicProcess(
    index=train_p1_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves1 = dp1.in_sample()

# Defining the sine wave (training set 2)
dp2 = DeterministicProcess(
    index=train_p2_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves2 = dp2.in_sample()

# Combining fourier terms with other variables
train_p1_X = train_p1_X.merge(waves1, left_index=True, right_index=True, how='left')
train_p2_X = train_p2_X.merge(waves2, left_index=True, right_index=True, how='left')

In [57]:
# Fitting Linear Regression Model (all zones) - training set 1 
from sklearn.linear_model import LinearRegression
X_train1 = train_p1_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_1 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p1_y.columns):
    
    # Extract the current target vector (y)
    y = train_p1_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train1, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_1[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [58]:
# Fitting Linear Regression Model (all zones) - training set 2 
from sklearn.linear_model import LinearRegression
X_train2 = train_p2_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

# Creating dictionary to store the fitted models
fitted_models_set_2 = {}

print("Starting model training...")

# Looping through each target column
for zone in list(train_p2_y.columns):
    
    # Extract the current target vector (y)
    y = train_p2_y[zone]
    
    # 1. Instantiate the model
    # A new model object is created for each iteration
    model = LinearRegression()
    
    # 2. Fit the model
    # Train the model using the common features (X) and the current target (y)
    model.fit(X_train2, y)
    
    # 3. Store the fitted model in the dictionary
    fitted_models_set_2[zone] = model
    
    print(f"  -> Finished fitting model for: {zone}")

print("Training complete!")

Starting model training...
  -> Finished fitting model for: 1
  -> Finished fitting model for: 2
  -> Finished fitting model for: 3
  -> Finished fitting model for: 4
  -> Finished fitting model for: 5
  -> Finished fitting model for: 6
  -> Finished fitting model for: 7
  -> Finished fitting model for: 8
  -> Finished fitting model for: 9
  -> Finished fitting model for: 10
  -> Finished fitting model for: 11
  -> Finished fitting model for: 12
  -> Finished fitting model for: 13
  -> Finished fitting model for: 14
  -> Finished fitting model for: 15
  -> Finished fitting model for: 16
  -> Finished fitting model for: 17
  -> Finished fitting model for: 18
  -> Finished fitting model for: 19
  -> Finished fitting model for: 20
Training complete!


In [59]:
# Defining feature space dedicated to period 1
period_8_X = feature_matrix.loc[(feature_matrix.index >= "2006-11-22 00:30:00") & (feature_matrix.index <= "2006-11-28 23:30:00"),:]

dp = DeterministicProcess(
    index=period_8_X.index,
    period=None,         # It's not defined so that frequency can be read from the index
    constant=False,      # defined later
    order=1,             # linear trend not required since seasonal diff made the series stationary
    seasonal=False,      # no seasonal dummies
    additional_terms=[periodicity], # 2 seperate waves will be generated
    drop=True            # if perfect collinearity exists, the terms can be dropped
)

waves = dp.in_sample()
period_8_X = period_8_X.merge(waves, left_index=True, right_index=True, how='left')

X_period_8 = period_8_X[['CDK', 'HDK', 'trend', 'sin(1,24)', 'cos(1,24)', 'sin(2,24)', 'cos(2,24)']]

In [60]:
# Predicting load value per zone - from training set 1 models

# Initializing an empty DataFrame with the correct index
predictions_set_1 = pd.DataFrame(index=X_period_8.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_1.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_8)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_1[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 1 complete!")


# Predicting load value per zone - from training set 2 models

# Initializing an empty DataFrame with the correct index
predictions_set_2 = pd.DataFrame(index=X_period_8.index)

# Loop through the dictionary items
for zone, model in fitted_models_set_2.items():
    
    # 1. Generate Predictions
    # This returns a NumPy array of predicted values
    predictions_array = model.predict(X_period_8)
    
    # 2. Assign the predictions array as a new column
    # The new column is named 'Predicted_Target_X'
    column_name = f'Zone_{zone}_pred'
    
    # Pandas should match the array to the DataFrame's existing index
    predictions_set_2[column_name] = predictions_array
    
    print(f"  -> Added column: {column_name}")

print("Prediction generation from training set 2 complete!")

  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> Added column: Zone_11_pred
  -> Added column: Zone_12_pred
  -> Added column: Zone_13_pred
  -> Added column: Zone_14_pred
  -> Added column: Zone_15_pred
  -> Added column: Zone_16_pred
  -> Added column: Zone_17_pred
  -> Added column: Zone_18_pred
  -> Added column: Zone_19_pred
  -> Added column: Zone_20_pred
Prediction generation from training set 1 complete!
  -> Added column: Zone_1_pred
  -> Added column: Zone_2_pred
  -> Added column: Zone_3_pred
  -> Added column: Zone_4_pred
  -> Added column: Zone_5_pred
  -> Added column: Zone_6_pred
  -> Added column: Zone_7_pred
  -> Added column: Zone_8_pred
  -> Added column: Zone_9_pred
  -> Added column: Zone_10_pred
  -> A

In [61]:
# Mean Prediction
predictions_p8_mean = (predictions_set_1+predictions_set_2)/2

# Undoing log transformation for original predictions
predictions_p8_mean_unlogged = np.exp(predictions_p8_mean)
predictions_p8_mean_unlogged.head()

Unnamed: 0_level_0,Zone_1_pred,Zone_2_pred,Zone_3_pred,Zone_4_pred,Zone_5_pred,Zone_6_pred,Zone_7_pred,Zone_8_pred,Zone_9_pred,Zone_10_pred,Zone_11_pred,Zone_12_pred,Zone_13_pred,Zone_14_pred,Zone_15_pred,Zone_16_pred,Zone_17_pred,Zone_18_pred,Zone_19_pred,Zone_20_pred
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-11-22 00:30:00,15783.489499,146172.840846,157720.744196,474.70834,6095.800222,152439.268248,157720.744196,3230.468424,81314.748203,18795.723026,82707.080006,94286.497431,18843.340671,17601.897266,57088.677932,23915.053434,28889.731494,183473.916523,71572.710025,80522.9537
2006-11-22 01:30:00,15002.66179,140229.302822,151307.672774,444.77373,5696.561111,146075.571198,151307.672774,3073.173672,82281.776169,17811.133407,77448.528427,87017.689959,17736.171334,16525.675036,54483.52668,22563.235799,27546.158176,173083.866452,67291.865322,76773.532601
2006-11-22 02:30:00,14854.50946,138550.61528,149496.385266,434.522447,5630.529077,144321.972268,149496.385266,3036.869961,80565.610026,17536.699215,75650.523675,84148.407029,17201.548234,15992.021349,53061.644731,21971.168881,27013.600327,169324.409739,65456.286025,75396.083844
2006-11-22 03:30:00,15284.070603,141073.379158,152218.468238,443.148044,5860.977658,147076.335155,152218.468238,3109.085192,77173.361617,17948.967176,77119.925459,85318.53719,17213.22473,15909.777511,52746.817116,22015.321007,27241.652279,171437.373496,65699.218702,76236.388094
2006-11-22 04:30:00,16337.557517,147752.122284,159424.858714,470.835385,6398.529197,154300.942447,159424.858714,3289.500665,73197.524071,19004.214232,81969.036468,90643.333522,17876.217383,16402.967952,53904.151798,22888.561901,28304.386777,180519.431997,68485.304135,79453.587607


In [62]:
# Best naive method found: Repeating last cycle (unlogged)
last_cycle_naive_p8_unlogged = np.exp(
    train_p1_y[-len(period_8_X):]
)

# Reset index (correcting timestamp)
last_cycle_naive_p8_unlogged.index = period_8_X.index
last_cycle_naive_p8_unlogged[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006-11-22 00:30:00,13684.0,135868.0,146602.0,475.0,4695.0,140563.0,146602.0,2757.0,35259.0,17730.0,71808.0,80241.0,16972.0,12639.0,49360.0,17784.0,25334.0,139865.0,58657.0,73176.0
2006-11-22 01:30:00,13530.0,133060.0,143573.0,461.0,4566.0,137626.0,143573.0,2698.0,55671.0,17137.0,68911.0,76321.0,16737.0,12242.0,48467.0,17557.0,24838.0,138431.0,58346.0,71566.0


For fair comparison across grids with different scales, r2_score and mape make the most sense.

### Performance Comparison
Performance is compared with actual load values as well as competition's benchmarked values for the missing 8 weeks. For fair comparison R2 score and MAPE are used as error metrics since they are not dependent on scale.

#### Reading Actual Values

In [63]:
# Reading actual load values
actual_load = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Dataset\GEFCom2012\GEFCOM2012_Data\Load\Load_solution.csv")
actual_load.drop(["id","weight"], axis=1, inplace=True)
actual_load[0:2]

Unnamed: 0,zone_id,year,month,day,h1,h2,h3,h4,h5,h6,...,h15,h16,h17,h18,h19,h20,h21,h22,h23,h24
0,1,2005,3,6,19964,19544,19390,19442,19755,20008,...,14535,13955,13712,14372,16392,18253,18355,17157,16089,15146
1,2,2005,3,6,162096,160890,160924,158962,163197,165197,...,151681,148210,149373,153728,171318,175893,175858,166342,155411,145988


In [64]:
# Converting the data into long-format

actual_load_long = actual_load.melt(
                id_vars=["zone_id","year","month","day"],
                value_vars=[f"h{i}" for i in range(1, 25)],
                var_name="hour",
                value_name="load"
                        )


actual_load_long.head()

Unnamed: 0,zone_id,year,month,day,hour,load
0,1,2005,3,6,h1,19964
1,2,2005,3,6,h1,162096
2,3,2005,3,6,h1,174901
3,4,2005,3,6,h1,528
4,5,2005,3,6,h1,9061


In [65]:
# Replacing hour values with interval mid-point

# Create a mapping from 'h1' to 'h24' → '00:30' to '23:30'
hour_map = {f"h{i}": f"{str(i-1).zfill(2)}:30" for i in range(1, 25)}

# Replace the values using .map()
actual_load_long["hour"] = actual_load_long["hour"].map(hour_map)

# Creating timestamps using existing information
actual_load_long["timestamp"] = pd.to_datetime(
    actual_load_long["year"].astype(str) + "-" +
    actual_load_long["month"].astype(str).str.zfill(2) + "-" +
    actual_load_long["day"].astype(str).str.zfill(2) + " " +
    actual_load_long["hour"]
)

actual_load_long.head()

Unnamed: 0,zone_id,year,month,day,hour,load,timestamp
0,1,2005,3,6,00:30,19964,2005-03-06 00:30:00
1,2,2005,3,6,00:30,162096,2005-03-06 00:30:00
2,3,2005,3,6,00:30,174901,2005-03-06 00:30:00
3,4,2005,3,6,00:30,528,2005-03-06 00:30:00
4,5,2005,3,6,00:30,9061,2005-03-06 00:30:00


In [66]:
# Pivoting to record zone by column
actual_load_long = pd.pivot_table(actual_load_long, index="timestamp", columns="zone_id", values="load")
actual_load_long[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-03-06 00:30:00,19964.0,162096.0,174901.0,528.0,9061.0,171157.0,174901.0,4091.0,61215.0,26459.0,...,140417.0,21302.0,27740.0,74218.0,40411.0,36845.0,268789.0,102241.0,89800.0,1719688.0
2005-03-06 01:30:00,19544.0,160890.0,173600.0,499.0,8697.0,169587.0,173600.0,3971.0,61131.0,25979.0,...,137418.0,20466.0,27713.0,73397.0,40408.0,36745.0,267273.0,101374.0,88325.0,1703132.0


#### Comparing Time Series Regression With Actual Load Performance

##### Period 1

In [67]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error
r2_scores_p1 = []
mape_scores_p1 = []
rmse_scores_p1 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p1_mean_unlogged.index,i]),np.array(predictions_p1_mean_unlogged.loc[:,predictions_p1_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p1_mean_unlogged.index,i]),np.array(predictions_p1_mean_unlogged.loc[:,predictions_p1_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p1_mean_unlogged.index,i]),np.array(predictions_p1_mean_unlogged.loc[:,predictions_p1_mean_unlogged.columns[i-1]])))

    # adding scores to score list
    r2_scores_p1.append(r2)
    mape_scores_p1.append(mape)
    rmse_scores_p1.append(rmse)

print(f"Average R2 score for all zones, Period 1: {np.mean(r2_scores_p1)}")
print(f"Average MAPE score for all zones, Period 1: {np.mean(mape_scores_p1)}")
print(f"Average RMSE score for all zones, Period 1: {np.mean(rmse_scores_p1)}")

Average R2 score for all zones, Period 1: 0.27518553894682396
Average MAPE score for all zones, Period 1: 0.09005387465758358
Average RMSE score for all zones, Period 1: 8107.365819103414


In [68]:
# Zone 4 and 9 are showing some issues
r2_scores_p1

[0.7190577847776094,
 0.7089290009511293,
 0.7089255538296835,
 -1.7480598408049048,
 0.8193506581691473,
 0.7282814152107118,
 0.7089255538296835,
 0.70486150613994,
 -6.374771929939902,
 0.7534339847720513,
 0.8205575838638297,
 0.7557335686032196,
 0.7103436330563887,
 0.7511700740147598,
 0.7992957229654462,
 0.8758137693492368,
 0.6642897826021197,
 0.8835970572915676,
 0.7576423817523699,
 0.7563335185023928]

##### Period 2

In [69]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p2 = []
mape_scores_p2 = []
rmse_scores_p2 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p2_mean_unlogged.index,i]),np.array(predictions_p2_mean_unlogged.loc[:,predictions_p2_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p2_mean_unlogged.index,i]),np.array(predictions_p2_mean_unlogged.loc[:,predictions_p2_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p2_mean_unlogged.index,i]),np.array(predictions_p2_mean_unlogged.loc[:,predictions_p2_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p2.append(r2)
    mape_scores_p2.append(mape)
    rmse_scores_p2.append(rmse)

print(f"Average R2 score for all zones, Period 2: {np.mean(r2_scores_p2)}")
print(f"Average MAPE score for all zones, Period 2: {np.mean(mape_scores_p2)}")
print(f"Average RMSE score for all zones, Period 2: {np.mean(rmse_scores_p2)}")

Average R2 score for all zones, Period 2: 0.1363698330839401
Average MAPE score for all zones, Period 2: 0.0979776554596269
Average RMSE score for all zones, Period 2: 8446.908942592518


##### Period 3

In [70]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p3 = []
mape_scores_p3 = []
rmse_scores_p3 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p3_mean_unlogged.index,i]),np.array(predictions_p3_mean_unlogged.loc[:,predictions_p3_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p3_mean_unlogged.index,i]),np.array(predictions_p3_mean_unlogged.loc[:,predictions_p3_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p3_mean_unlogged.index,i]),np.array(predictions_p3_mean_unlogged.loc[:,predictions_p3_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p3.append(r2)
    mape_scores_p3.append(mape)
    rmse_scores_p3.append(rmse)

print(f"Average R2 score for all zones, Period 3: {np.mean(r2_scores_p3)}")
print(f"Average MAPE score for all zones, Period 3: {np.mean(mape_scores_p3)}")
print(f"Average RMSE score for all zones, Period 3: {np.mean(rmse_scores_p3)}")

Average R2 score for all zones, Period 3: 0.7727762634560691
Average MAPE score for all zones, Period 3: 0.09350283854501222
Average RMSE score for all zones, Period 3: 8483.527856562328


##### Period 4

In [71]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p4 = []
mape_scores_p4 = []
rmse_scores_p4 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(predictions_p4_mean_unlogged.loc[:,predictions_p4_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(predictions_p4_mean_unlogged.loc[:,predictions_p4_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(predictions_p4_mean_unlogged.loc[:,predictions_p4_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p4.append(r2)
    mape_scores_p4.append(mape)
    rmse_scores_p4.append(rmse)

print(f"Average R2 score for all zones, Period 4: {np.mean(r2_scores_p4)}")
print(f"Average MAPE score for all zones, Period 4: {np.mean(mape_scores_p4)}")
print(f"Average RMSE score for all zones, Period 4: {np.mean(rmse_scores_p4)}")

Average R2 score for all zones, Period 4: 0.5375143207642463
Average MAPE score for all zones, Period 4: 0.09552146057797163
Average RMSE score for all zones, Period 4: 7303.21206914267


##### Period 5

In [72]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p5 = []
mape_scores_p5 = []
rmse_scores_p5 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(predictions_p5_mean_unlogged.loc[:,predictions_p5_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(predictions_p5_mean_unlogged.loc[:,predictions_p5_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(predictions_p5_mean_unlogged.loc[:,predictions_p5_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p5.append(r2)
    mape_scores_p5.append(mape)
    rmse_scores_p5.append(rmse)

print(f"Average R2 score for all zones, Period 5: {np.mean(r2_scores_p5)}")
print(f"Average MAPE score for all zones, Period 5: {np.mean(mape_scores_p5)}")
print(f"Average RMSE score for all zones, Period 5: {np.mean(rmse_scores_p5)}")

Average R2 score for all zones, Period 5: 0.8076961578726424
Average MAPE score for all zones, Period 5: 0.10418891102547367
Average RMSE score for all zones, Period 5: 6660.027118203421


##### Period 6

In [73]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p6 = []
mape_scores_p6 = []
rmse_scores_p6 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(predictions_p6_mean_unlogged.loc[:,predictions_p6_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(predictions_p6_mean_unlogged.loc[:,predictions_p6_mean_unlogged.columns[i-1]]))
    mape = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(predictions_p6_mean_unlogged.loc[:,predictions_p6_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p6.append(r2)
    mape_scores_p6.append(mape)
    rmse_scores_p6.append(rmse)

print(f"Average R2 score for all zones, Period 6: {np.mean(r2_scores_p6)}")
print(f"Average MAPE score for all zones, Period 6: {np.mean(mape_scores_p6)}")
print(f"Average RMSE score for all zones, Period 6: {np.mean(rmse_scores_p6)}")

Average R2 score for all zones, Period 6: 0.4714400388948642
Average MAPE score for all zones, Period 6: 14820.242318577893
Average RMSE score for all zones, Period 6: 5299.770766260277


##### Period 7

In [74]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p7 = []
mape_scores_p7 = []
rmse_scores_p7 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(predictions_p7_mean_unlogged.loc[:,predictions_p7_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(predictions_p7_mean_unlogged.loc[:,predictions_p7_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(predictions_p7_mean_unlogged.loc[:,predictions_p7_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p7.append(r2)
    mape_scores_p7.append(mape)
    rmse_scores_p7.append(rmse)

print(f"Average R2 score for all zones, Period 7: {np.mean(r2_scores_p7)}")
print(f"Average MAPE score for all zones, Period 7: {np.mean(mape_scores_p7)}")
print(f"Average RMSE score for all zones, Period 7: {np.mean(rmse_scores_p7)}")

Average R2 score for all zones, Period 7: 0.8580772662181951
Average MAPE score for all zones, Period 7: 0.09588009665710692
Average RMSE score for all zones, Period 7: 9258.506987863877


##### Period 8

In [75]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p8 = []
mape_scores_p8 = []
rmse_scores_p8 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(predictions_p8_mean_unlogged.loc[:,predictions_p8_mean_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(predictions_p8_mean_unlogged.loc[:,predictions_p8_mean_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(predictions_p8_mean_unlogged.loc[:,predictions_p8_mean_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p8.append(r2)
    mape_scores_p8.append(mape)
    rmse_scores_p8.append(rmse)

print(f"Average R2 score for all zones, Period 8: {np.mean(r2_scores_p8)}")
print(f"Average MAPE score for all zones, Period 8: {np.mean(mape_scores_p8)}")
print(f"Average RMSE score for all zones, Period 8: {np.mean(rmse_scores_p8)}")

Average R2 score for all zones, Period 8: 0.251919090432252
Average MAPE score for all zones, Period 8: 0.10834000744050616
Average RMSE score for all zones, Period 8: 10320.825613947552


###### Overall Performance on Average, across periods

In [76]:
# Metric averaged across 8 periods, complete grid performance

avg_r2_list = []
for i in [r2_scores_p1,r2_scores_p2,r2_scores_p3,r2_scores_p4,r2_scores_p5,r2_scores_p6,r2_scores_p7,r2_scores_p8]:
    avg_r = np.mean(i)        # averaged across all 20 zones, for a given period
    avg_r2_list.append(avg_r)

print(f"Average R2 score across all 8 periods, complete grid: {np.mean(avg_r2_list)}")

avg_mape_list = []
for j in [mape_scores_p1,mape_scores_p2,mape_scores_p3,mape_scores_p4,mape_scores_p5,mape_scores_p6,mape_scores_p7,mape_scores_p8]:
    avg_m = np.mean(j)        # averaged across all 20 zones, for a given period
    avg_mape_list.append(avg_m)

print(f"Average MAPE score across all 8 periods, complete grid: {np.mean(avg_mape_list)}")

avg_rmse_list = []
for k in [rmse_scores_p1,rmse_scores_p2,rmse_scores_p3,rmse_scores_p4,rmse_scores_p5,rmse_scores_p6,rmse_scores_p7,rmse_scores_p8]:
    avg_rm = np.mean(k)        # averaged across all 20 zones, for a given period
    avg_rmse_list.append(avg_rm)

print(f"Average RMSE score across all 8 periods, complete grid: {np.mean(avg_rmse_list)}")

Average R2 score across all 8 periods, complete grid: 0.5138723137086292
Average MAPE score across all 8 periods, complete grid: 1852.615972927782
Average RMSE score across all 8 periods, complete grid: 7985.018146709507


#### Comparing Naive Prediction With Actual Load Values

##### Period 1

In [77]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p1 = []
mape_scores_p1 = []
rmse_scores_p1 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p1_unlogged.index,i]),np.array(last_cycle_naive_p1_unlogged.loc[:,last_cycle_naive_p1_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p1_unlogged.index,i]),np.array(last_cycle_naive_p1_unlogged.loc[:,last_cycle_naive_p1_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p1_unlogged.index,i]),np.array(last_cycle_naive_p1_unlogged.loc[:,last_cycle_naive_p1_unlogged.columns[i-1]])))

    # adding scores to score list
    r2_scores_p1.append(r2)
    mape_scores_p1.append(mape)
    rmse_scores_p1.append(rmse)

print(f"Average R2 score for all zones, Period 1: {np.mean(r2_scores_p1)}")
print(f"Average MAPE score for all zones, Period 1: {np.mean(mape_scores_p1)}")
print(f"Average RMSE score for all zones, Period 1: {np.mean(rmse_scores_p1)}")

Average R2 score for all zones, Period 1: -0.403443635119234
Average MAPE score for all zones, Period 1: 0.1751119096892755
Average RMSE score for all zones, Period 1: 15805.303177066051


In [78]:
# most scores are extremely poor
r2_scores_p1

[-0.8045817850316563,
 0.22020601071801205,
 0.22021134777983098,
 -0.21288627755387135,
 -0.08485266343471598,
 0.18635357864996405,
 0.22021134777983098,
 0.1067208111758482,
 -5.270399713829374,
 0.3043234098764367,
 0.1616939130051499,
 0.0384987959159101,
 -0.11459705037099832,
 -0.42820074834427,
 -0.39638026369890045,
 -0.41540187727136635,
 -0.712037845142111,
 -0.35927722881498303,
 -0.7219497766522796,
 -0.006526687141136245]

##### Period 2

In [79]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p2 = []
mape_scores_p2 = []
rmse_scores_p2 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p2_unlogged.index,i]),np.array(last_cycle_naive_p2_unlogged.loc[:,last_cycle_naive_p2_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p2_unlogged.index,i]),np.array(last_cycle_naive_p2_unlogged.loc[:,last_cycle_naive_p2_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p2_unlogged.index,i]),np.array(last_cycle_naive_p2_unlogged.loc[:,last_cycle_naive_p2_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p2.append(r2)
    mape_scores_p2.append(mape)
    rmse_scores_p2.append(rmse)

print(f"Average R2 score for all zones, Period 2: {np.mean(r2_scores_p2)}")
print(f"Average MAPE score for all zones, Period 2: {np.mean(mape_scores_p2)}")
print(f"Average RMSE score for all zones, Period 2: {np.mean(rmse_scores_p2)}")

Average R2 score for all zones, Period 2: -0.7604482328878115
Average MAPE score for all zones, Period 2: 0.24201497981319758
Average RMSE score for all zones, Period 2: 22595.193372002155


##### Period 3

In [80]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p3 = []
mape_scores_p3 = []
rmse_scores_p3 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p3_unlogged.index,i]),np.array(last_cycle_naive_p3_unlogged.loc[:,last_cycle_naive_p3_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p3_unlogged.index,i]),np.array(last_cycle_naive_p3_unlogged.loc[:,last_cycle_naive_p3_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p3_unlogged.index,i]),np.array(last_cycle_naive_p3_unlogged.loc[:,last_cycle_naive_p3_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p3.append(r2)
    mape_scores_p3.append(mape)
    rmse_scores_p3.append(rmse)

print(f"Average R2 score for all zones, Period 3: {np.mean(r2_scores_p3)}")
print(f"Average MAPE score for all zones, Period 3: {np.mean(mape_scores_p3)}")
print(f"Average RMSE score for all zones, Period 3: {np.mean(rmse_scores_p3)}")

Average R2 score for all zones, Period 3: 0.5358353871978386
Average MAPE score for all zones, Period 3: 0.12832544176716906
Average RMSE score for all zones, Period 3: 12729.16779995972


##### Period 4

In [81]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p4 = []
mape_scores_p4 = []
rmse_scores_p4 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p4_unlogged.index,i]),np.array(last_cycle_naive_p4_unlogged.loc[:,last_cycle_naive_p4_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p4_unlogged.index,i]),np.array(last_cycle_naive_p4_unlogged.loc[:,last_cycle_naive_p4_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p4_unlogged.index,i]),np.array(last_cycle_naive_p4_unlogged.loc[:,last_cycle_naive_p4_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p4.append(r2)
    mape_scores_p4.append(mape)
    rmse_scores_p4.append(rmse)

print(f"Average R2 score for all zones, Period 4: {np.mean(r2_scores_p4)}")
print(f"Average MAPE score for all zones, Period 4: {np.mean(mape_scores_p4)}")
print(f"Average RMSE score for all zones, Period 4: {np.mean(rmse_scores_p4)}")

Average R2 score for all zones, Period 4: -2.4401341249971926
Average MAPE score for all zones, Period 4: 0.2265497298883694
Average RMSE score for all zones, Period 4: 21212.871729678176


##### Period 5

In [82]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p5 = []
mape_scores_p5 = []
rmse_scores_p5 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p5_unlogged.index,i]),np.array(last_cycle_naive_p5_unlogged.loc[:,last_cycle_naive_p5_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p5_unlogged.index,i]),np.array(last_cycle_naive_p5_unlogged.loc[:,last_cycle_naive_p5_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p5_unlogged.index,i]),np.array(last_cycle_naive_p5_unlogged.loc[:,last_cycle_naive_p5_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p5.append(r2)
    mape_scores_p5.append(mape)
    rmse_scores_p5.append(rmse)

print(f"Average R2 score for all zones, Period 5: {np.mean(r2_scores_p5)}")
print(f"Average MAPE score for all zones, Period 5: {np.mean(mape_scores_p5)}")
print(f"Average RMSE score for all zones, Period 5: {np.mean(rmse_scores_p5)}")

Average R2 score for all zones, Period 5: -0.15845509256449225
Average MAPE score for all zones, Period 5: 0.2281669305082108
Average RMSE score for all zones, Period 5: 18991.150026154777


##### Period 6

In [83]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p6 = []
mape_scores_p6 = []
rmse_scores_p6 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p6_unlogged.index,i]),np.array(last_cycle_naive_p6_unlogged.loc[:,last_cycle_naive_p6_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p6_unlogged.index,i]),np.array(last_cycle_naive_p6_unlogged.loc[:,last_cycle_naive_p6_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p6_unlogged.index,i]),np.array(last_cycle_naive_p6_unlogged.loc[:,last_cycle_naive_p6_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p6.append(r2)
    mape_scores_p6.append(mape)
    rmse_scores_p6.append(rmse)

print(f"Average R2 score for all zones, Period 6: {np.mean(r2_scores_p6)}")
print(f"Average MAPE score for all zones, Period 6: {np.mean(mape_scores_p6)}")
print(f"Average RMSE score for all zones, Period 6: {np.mean(rmse_scores_p6)}")

Average R2 score for all zones, Period 6: -0.10384797999226474
Average MAPE score for all zones, Period 6: 0.19152103232021125
Average RMSE score for all zones, Period 6: 20614.317789472036


##### Period 7

In [84]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p7 = []
mape_scores_p7 = []
rmse_scores_p7 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p7_unlogged.index,i]),np.array(last_cycle_naive_p7_unlogged.loc[:,last_cycle_naive_p7_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p7_unlogged.index,i]),np.array(last_cycle_naive_p7_unlogged.loc[:,last_cycle_naive_p7_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p7_unlogged.index,i]),np.array(last_cycle_naive_p7_unlogged.loc[:,last_cycle_naive_p7_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p7.append(r2)
    mape_scores_p7.append(mape)
    rmse_scores_p7.append(rmse)

print(f"Average R2 score for all zones, Period 7: {np.mean(r2_scores_p7)}")
print(f"Average MAPE score for all zones, Period 7: {np.mean(mape_scores_p7)}")
print(f"Average RMSE score for all zones, Period 7: {np.mean(rmse_scores_p7)}")

Average R2 score for all zones, Period 7: 0.685169965264994
Average MAPE score for all zones, Period 7: 0.13441551148576594
Average RMSE score for all zones, Period 7: 14533.344010093784


##### Period 8

In [85]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p8 = []
mape_scores_p8 = []
rmse_scores_p8 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[last_cycle_naive_p8_unlogged.index,i]),np.array(last_cycle_naive_p8_unlogged.loc[:,last_cycle_naive_p8_unlogged.columns[i-1]]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[last_cycle_naive_p8_unlogged.index,i]),np.array(last_cycle_naive_p8_unlogged.loc[:,last_cycle_naive_p8_unlogged.columns[i-1]]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[last_cycle_naive_p8_unlogged.index,i]),np.array(last_cycle_naive_p8_unlogged.loc[:,last_cycle_naive_p8_unlogged.columns[i-1]])))
    
    # adding scores to score list
    r2_scores_p8.append(r2)
    mape_scores_p8.append(mape)
    rmse_scores_p8.append(rmse)

print(f"Average R2 score for all zones, Period 8: {np.mean(r2_scores_p8)}")
print(f"Average MAPE score for all zones, Period 8: {np.mean(mape_scores_p8)}")
print(f"Average RMSE score for all zones, Period 8: {np.mean(rmse_scores_p8)}")

Average R2 score for all zones, Period 8: -0.5163474329789117
Average MAPE score for all zones, Period 8: 0.1574119195207866
Average RMSE score for all zones, Period 8: 13439.366006618406


###### Overall Performance on Average, across periods

In [86]:
# Metric averaged across 8 periods, complete grid performance

avg_r2_list = []
for i in [r2_scores_p1,r2_scores_p2,r2_scores_p3,r2_scores_p4,r2_scores_p5,r2_scores_p6,r2_scores_p7,r2_scores_p8]:
    avg_r = np.mean(i)        # averaged across all 20 zones, for a given period
    avg_r2_list.append(avg_r)

print(f"Average R2 score across all 8 periods, complete grid: {np.mean(avg_r2_list)}")

avg_mape_list = []
for j in [mape_scores_p1,mape_scores_p2,mape_scores_p3,mape_scores_p4,mape_scores_p5,mape_scores_p6,mape_scores_p7,mape_scores_p8]:
    avg_m = np.mean(j)        # averaged across all 20 zones, for a given period
    avg_mape_list.append(avg_m)

print(f"Average MAPE score across all 8 periods, complete grid: {np.mean(avg_mape_list)}")

avg_rmse_list = []
for k in [rmse_scores_p1,rmse_scores_p2,rmse_scores_p3,rmse_scores_p4,rmse_scores_p5,rmse_scores_p6,rmse_scores_p7,rmse_scores_p8]:
    avg_rm = np.mean(k)        # averaged across all 20 zones, for a given period
    avg_rmse_list.append(avg_rm)

print(f"Average RMSE score across all 8 periods, complete grid: {np.mean(avg_rmse_list)}")

Average R2 score across all 8 periods, complete grid: -0.39520889325963426
Average MAPE score across all 8 periods, complete grid: 0.18543968187412327
Average RMSE score across all 8 periods, complete grid: 17490.08923888064


> The error rate (MAPE) has decreased by approximately 47% relative to the original (naive) error rate, when time series regression is used.

> <b>The error rate (RMSE) has decreased by approximately 51.7% relative to the original (naive) error rate, when time series regression is used.</b>

#### Comparing Competition's Benchmark Values With Actual Load Values

In [87]:
# Reading benchmark load

load_bench = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 6 (TU Dresden)\Thesis Work\Dataset\GEFCom2012\GEFCOM2012_Data\Load\Load_benchmark.csv")
load_bench.head()

Unnamed: 0,id,zone_id,year,month,day,h1,h2,h3,h4,h5,...,h15,h16,h17,h18,h19,h20,h21,h22,h23,h24
0,1,1,2005,3,6,20505,19445,19373,18833,19962,...,14333,13606,14009,15133,16097,16949,16895,16322,14969,14016
1,2,2,2005,3,6,167016,163275,164618,164731,165840,...,145182,140364,141993,148293,151031,158234,162813,158044,148985,140408
2,3,3,2005,3,6,180211,176174,177624,177745,178942,...,156652,151453,153210,160008,162963,170735,175675,170530,160756,151501
3,4,4,2005,3,6,530,496,489,460,492,...,497,472,479,512,535,542,552,536,468,415
4,5,5,2005,3,6,8585,8346,8564,8638,8793,...,6835,6486,6679,7343,7723,8347,8536,7856,6816,5974


In [88]:
# Converting the data into long-format

load_bench = load_bench.melt(
                id_vars=["zone_id","year","month","day"],
                value_vars=[f"h{i}" for i in range(1, 25)],
                var_name="hour",
                value_name="load"
                        )


# Replacing hour values with interval mid-point

# Replace the values using .map()
load_bench["hour"] = load_bench["hour"].map(hour_map)

# Creating timestamps using existing information
load_bench["timestamp"] = pd.to_datetime(
    load_bench["year"].astype(str) + "-" +
    load_bench["month"].astype(str).str.zfill(2) + "-" +
    load_bench["day"].astype(str).str.zfill(2) + " " +
    load_bench["hour"]
)

load_bench.head()

Unnamed: 0,zone_id,year,month,day,hour,load,timestamp
0,1,2005,3,6,00:30,20505,2005-03-06 00:30:00
1,2,2005,3,6,00:30,167016,2005-03-06 00:30:00
2,3,2005,3,6,00:30,180211,2005-03-06 00:30:00
3,4,2005,3,6,00:30,530,2005-03-06 00:30:00
4,5,2005,3,6,00:30,8585,2005-03-06 00:30:00


In [89]:
# Pivoting to record zone by column
load_bench = pd.pivot_table(load_bench, index="timestamp", columns="zone_id", values="load")
load_bench[0:2]

zone_id,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-03-06 00:30:00,20505.0,167016.0,180211.0,530.0,8585.0,175595.0,180211.0,4053.0,70026.0,23703.0,...,144092.0,21404.0,25238.0,70867.0,36580.0,33450.0,243806.0,91796.0,86322.0,1695779.0
2005-03-06 01:30:00,19445.0,163275.0,176174.0,496.0,8346.0,171615.0,176174.0,3844.0,70123.0,22995.0,...,138805.0,20139.0,23888.0,67545.0,36716.0,32688.0,244966.0,86833.0,82373.0,1655145.0


##### Period 1

In [90]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p1 = []
mape_scores_p1 = []
rmse_scores_p1 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p1_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p1_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p1_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p1_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p1_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p1_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p1.append(r2)
    mape_scores_p1.append(mape)
    rmse_scores_p1.append(rmse)

print(f"Average R2 score for all zones, Period 1: {np.mean(r2_scores_p1)}")
print(f"Average MAPE score for all zones, Period 1: {np.mean(mape_scores_p1)}")
print(f"Average RMSE score for all zones, Period 1: {np.mean(rmse_scores_p1)}")

Average R2 score for all zones, Period 1: 0.7843876695118762
Average MAPE score for all zones, Period 1: 0.05993021699568011
Average RMSE score for all zones, Period 1: 5698.5266936622


##### Period 2

In [91]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p2 = []
mape_scores_p2 = []
rmse_scores_p2 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p2_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p2_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p2_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p2_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p2_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p2_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p2.append(r2)
    mape_scores_p2.append(mape)
    rmse_scores_p2.append(rmse)

print(f"Average R2 score for all zones, Period 2: {np.mean(r2_scores_p2)}")
print(f"Average MAPE score for all zones, Period 2: {np.mean(mape_scores_p2)}")
print(f"Average RMSE score for all zones, Period 2: {np.mean(rmse_scores_p2)}")

Average R2 score for all zones, Period 2: 0.7670102016469259
Average MAPE score for all zones, Period 2: 0.07011818431947027
Average RMSE score for all zones, Period 2: 6212.343813248906


##### Period 3

In [92]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p3 = []
mape_scores_p3 = []
rmse_scores_p3 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p3_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p3_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p3_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p3_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p3_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p3_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p3.append(r2)
    mape_scores_p3.append(mape)
    rmse_scores_p3.append(rmse)

print(f"Average R2 score for all zones, Period 3: {np.mean(r2_scores_p3)}")
print(f"Average MAPE score for all zones, Period 3: {np.mean(mape_scores_p3)}")
print(f"Average RMSE score for all zones, Period 3: {np.mean(rmse_scores_p3)}")

Average R2 score for all zones, Period 3: 0.8408390776386794
Average MAPE score for all zones, Period 3: 0.06986351198380902
Average RMSE score for all zones, Period 3: 6695.382053205253


##### Period 4

In [93]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p4 = []
mape_scores_p4 = []
rmse_scores_p4 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p4_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p4_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p4_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p4_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p4.append(r2)
    mape_scores_p4.append(mape)
    rmse_scores_p4.append(rmse)

print(f"Average R2 score for all zones, Period 4: {np.mean(r2_scores_p4)}")
print(f"Average MAPE score for all zones, Period 4: {np.mean(mape_scores_p4)}")
print(f"Average RMSE score for all zones, Period 4: {np.mean(rmse_scores_p4)}")

Average R2 score for all zones, Period 4: 0.4436922037267463
Average MAPE score for all zones, Period 4: 0.10292999439807642
Average RMSE score for all zones, Period 4: 7866.08314636025


##### Period 5

In [94]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p5 = []
mape_scores_p5 = []
rmse_scores_p5 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p5_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p5_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p5_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p5_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p5.append(r2)
    mape_scores_p5.append(mape)
    rmse_scores_p5.append(rmse)

print(f"Average R2 score for all zones, Period 5: {np.mean(r2_scores_p5)}")
print(f"Average MAPE score for all zones, Period 5: {np.mean(mape_scores_p5)}")
print(f"Average RMSE score for all zones, Period 5: {np.mean(rmse_scores_p5)}")

Average R2 score for all zones, Period 5: 0.3605287919423327
Average MAPE score for all zones, Period 5: 0.10270817944634048
Average RMSE score for all zones, Period 5: 6441.805707461544


##### Period 6

In [95]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p6 = []
mape_scores_p6 = []
rmse_scores_p6 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p6_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p6_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p6_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p6_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p6.append(r2)
    mape_scores_p6.append(mape)
    rmse_scores_p6.append(rmse)

print(f"Average R2 score for all zones, Period 6: {np.mean(r2_scores_p6)}")
print(f"Average MAPE score for all zones, Period 6: {np.mean(mape_scores_p6)}")
print(f"Average RMSE score for all zones, Period 6: {np.mean(rmse_scores_p6)}")

Average R2 score for all zones, Period 6: 0.5734876577895001
Average MAPE score for all zones, Period 6: 0.11750389336898162
Average RMSE score for all zones, Period 6: 9385.092514092286


##### Period 7

In [96]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p7 = []
mape_scores_p7 = []
rmse_scores_p7 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p7_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p7_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p7_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p7_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p7.append(r2)
    mape_scores_p7.append(mape)
    rmse_scores_p7.append(rmse)

print(f"Average R2 score for all zones, Period 7: {np.mean(r2_scores_p7)}")
print(f"Average MAPE score for all zones, Period 7: {np.mean(mape_scores_p7)}")
print(f"Average RMSE score for all zones, Period 7: {np.mean(rmse_scores_p7)}")

Average R2 score for all zones, Period 7: 0.9163187319055831
Average MAPE score for all zones, Period 7: 0.07326753758533004
Average RMSE score for all zones, Period 7: 6190.742612735311


##### Period 8

In [97]:
# Initiating empty lists
from sklearn.metrics import mean_absolute_percentage_error, r2_score
r2_scores_p8 = []
mape_scores_p8 = []
rmse_scores_p8 = []

for i in range(1,21,1):
    # calculating score metrics for each zone 'i'
    r2 = r2_score(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p8_mean_unlogged.index,i]))
    mape = mean_absolute_percentage_error(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p8_mean_unlogged.index,i]))
    rmse = np.sqrt(mean_squared_error(np.array(actual_load_long.loc[predictions_p8_mean_unlogged.index,i]),np.array(load_bench.loc[predictions_p8_mean_unlogged.index,i])))
    
    # adding scores to score list
    r2_scores_p8.append(r2)
    mape_scores_p8.append(mape)
    rmse_scores_p8.append(rmse)

print(f"Average R2 score for all zones, Period 8: {np.mean(r2_scores_p8)}")
print(f"Average MAPE score for all zones, Period 8: {np.mean(mape_scores_p8)}")
print(f"Average RMSE score for all zones, Period 8: {np.mean(rmse_scores_p8)}")

Average R2 score for all zones, Period 8: 0.4237862559810378
Average MAPE score for all zones, Period 8: 0.09115063921173688
Average RMSE score for all zones, Period 8: 8393.01873850579


###### Overall Performance on Average, across periods

In [98]:
# Metric averaged across 8 periods, complete grid performance

avg_r2_list = []
for i in [r2_scores_p1,r2_scores_p2,r2_scores_p3,r2_scores_p4,r2_scores_p5,r2_scores_p6,r2_scores_p7,r2_scores_p8]:
    avg_r = np.mean(i)        # averaged across all 20 zones, for a given period
    avg_r2_list.append(avg_r)

print(f"Average R2 score across all 8 periods, complete grid: {np.mean(avg_r2_list)}")

avg_mape_list = []
for j in [mape_scores_p1,mape_scores_p2,mape_scores_p3,mape_scores_p4,mape_scores_p5,mape_scores_p6,mape_scores_p7,mape_scores_p8]:
    avg_m = np.mean(j)        # averaged across all 20 zones, for a given period
    avg_mape_list.append(avg_m)

print(f"Average MAPE score across all 8 periods, complete grid: {np.mean(avg_mape_list)}")

avg_rmse_list = []
for k in [rmse_scores_p1,rmse_scores_p2,rmse_scores_p3,rmse_scores_p4,rmse_scores_p5,rmse_scores_p6,rmse_scores_p7,rmse_scores_p8]:
    avg_rm = np.mean(k)        # averaged across all 20 zones, for a given period
    avg_rmse_list.append(avg_rm)

print(f"Average RMSE score across all 8 periods, complete grid: {np.mean(avg_rmse_list)}")

Average R2 score across all 8 periods, complete grid: 0.6387563237678352
Average MAPE score across all 8 periods, complete grid: 0.08593401966367811
Average RMSE score across all 8 periods, complete grid: 7110.374409908943


> !For benchmarked values, error reduced by 53.5% compared to naive method (MAPE)!

> !<b>For benchmarked values, error reduced by 59.3% compared to naive method (RMSE)</b>!

### Conclusion
Compared to competition benchmark, error i.e. rmse increased from 7110.374409908943 to 8110