# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.utils import shuffle
from sklearn.svm import SVR

# Preprocessing

In [30]:
TARGET = "TOTAL_CONS"

# keep 1 year for testing
START_TEST_DATE = pd.to_datetime('2018-01-01')

END_TRAIN_DATE = START_TEST_DATE - pd.to_timedelta(1, 'h')
START_TRAIN_DATE = START_TEST_DATE - pd.DateOffset(months=30)

DAYS_FOR_INFERENCE = 1
DAYS_FORWARD_TO_SKIP = 0
DAYS_BACK_TO_SKIP = 0
START_STEP_FORWARD = 24 * DAYS_FORWARD_TO_SKIP + 1
LAST_STEP_FORWARD = 24 * (DAYS_FORWARD_TO_SKIP + DAYS_FOR_INFERENCE)
STEPS_BACKWARD_START = 24 * DAYS_BACK_TO_SKIP

In [31]:
df = pd.read_csv("/content/FINAL_DATASET_2.csv")
df.set_index(pd.to_datetime(df["Timestamp"]), inplace=True)
df.drop("Timestamp", axis=1, inplace=True)
df

Unnamed: 0_level_0,TOTAL_CONS,Weekend,Holiday,temp,humidity,hour,weekday,dayofyear
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-10-01 00:00:00,4390.054,0,0,16.12,79.0,0,4,274
2010-10-01 01:00:00,4046.071,0,0,15.19,77.0,1,4,274
2010-10-01 02:00:00,3885.451,0,0,14.65,82.0,2,4,274
2010-10-01 03:00:00,3808.100,0,0,14.03,71.0,3,4,274
2010-10-01 04:00:00,3782.623,0,0,13.29,77.0,4,4,274
...,...,...,...,...,...,...,...,...
2020-11-22 19:00:00,4281.942,1,0,10.74,54.0,19,6,327
2020-11-22 20:00:00,4091.488,1,0,10.15,51.0,20,6,327
2020-11-22 21:00:00,3738.827,1,0,9.81,51.0,21,6,327
2020-11-22 22:00:00,3461.113,1,0,9.67,54.0,22,6,327


In [32]:
TARGET_POS = np.where(df.columns == TARGET)[0][0]

train_val_df = df[(df.index >= START_TRAIN_DATE) & (df.index <= END_TRAIN_DATE)]
test_df = df[(df.index >= START_TEST_DATE) & (df.index <= END_TEST_DATE)]

scaler = MinMaxScaler()
train_val_scaled = scaler.fit_transform(train_val_df)
train_val_df_scaled = pd.DataFrame(train_val_scaled,
                                   columns=train_val_df.columns,
                                   index=train_val_df.index)
test_scaled = scaler.transform(test_df)
test_df_scaled = pd.DataFrame(test_scaled,
                              columns=test_df.columns,
                              index=test_df.index)

scaled_df = pd.concat([train_val_df_scaled, test_df_scaled], axis=0)
scaled_df

Unnamed: 0_level_0,TOTAL_CONS,Weekend,Holiday,temp,humidity,hour,weekday,dayofyear
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-07-01 00:00:00,0.269006,0.0,0.0,0.557942,0.827586,0.000000,0.333333,0.49589
2015-07-01 01:00:00,0.213217,0.0,0.0,0.553946,0.632184,0.043478,0.333333,0.49589
2015-07-01 02:00:00,0.179723,0.0,0.0,0.518482,0.586207,0.086957,0.333333,0.49589
2015-07-01 03:00:00,0.164580,0.0,0.0,0.524975,0.609195,0.130435,0.333333,0.49589
2015-07-01 04:00:00,0.161554,0.0,0.0,0.499001,0.678161,0.173913,0.333333,0.49589
...,...,...,...,...,...,...,...,...
2018-12-31 20:00:00,0.580972,0.0,1.0,0.232018,0.643678,0.869565,0.000000,0.99726
2018-12-31 21:00:00,0.469822,0.0,1.0,0.228022,0.505747,0.913043,0.000000,0.99726
2018-12-31 22:00:00,0.349390,0.0,1.0,0.229770,0.666667,0.956522,0.000000,0.99726
2018-12-31 23:00:00,0.310601,0.0,1.0,0.243257,0.735632,1.000000,0.000000,0.99726


# MAIN

In [33]:
step_results_dict = {}
iters = 0
while True:
  iters += 1
  if iters == 1:  
    startTestDate = pd.to_datetime(START_TEST_DATE)
  else:
    startTestDate += pd.DateOffset(days=7)

  endTestDate = startTestDate + pd.DateOffset(days=7)

  endTrainDate = startTestDate - pd.to_timedelta(1, 'h')
  startTrainDate = startTestDate - pd.DateOffset(months=30)
  

  if (startTestDate > scaled_df.index[-1] - pd.to_timedelta(LAST_STEP_FORWARD, 'h')):
    print(f'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - END - - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    break
  else:
    print(f'\n********************************* Inference from {startTestDate} to {endTestDate} **************************************************\n')
    mape_list = list()
    for step_forward in np.arange(START_STEP_FORWARD, LAST_STEP_FORWARD + 1, step=1):
    # for step_forward in np.arange(START_STEP_FORWARD, 3, step=1):
      scaled_df_copy = scaled_df.copy()  # pd.DataFrame()

      # *** shift Weather + Time data ***
      for col in scaled_df.drop(TARGET, axis=1).columns:
        scaled_df_copy[col + f"_(t+{step_forward})"] = scaled_df[col].shift(-step_forward)
        scaled_df_copy.drop(col, axis=1, inplace=True)

      # ---------- REFRAMING -----------------
      # *** shift Target ***
      scaled_df_copy[TARGET + f"_(t+{step_forward})"] = scaled_df[TARGET].shift(-step_forward)

      # *** shift Backsteps *** 
      if (step_forward == 1) or (step_forward == 2):
        scaled_df_copy[TARGET + "_(t-1)"] = scaled_df[TARGET].shift(1)
      else:
        for day_back in np.arange(DAYS_BACK_TO_SKIP + 1, DAYS_BACK_TO_SKIP + 11, step=1):
          step_back = 24 * day_back - step_forward
          scaled_df_copy[TARGET + f"_(t-{step_back})"] = scaled_df[TARGET].shift(step_back)
          
      refr_df = scaled_df_copy.drop(TARGET, axis=1).copy()
      refr_df.dropna(inplace=True)

      # split to train-val and test sets
      train_df_refr = refr_df[(refr_df.index >= startTrainDate) & (refr_df.index <= endTrainDate)]
      test_df_refr = refr_df[(refr_df.index >= startTestDate) & (refr_df.index <= endTestDate)]

      # shuffle train-val and test sets
      train_df_refr = shuffle(train_df_refr)

      # split to features and targets
      X_train_val_df = train_df_refr.drop(TARGET + f"_(t+{step_forward})", axis=1)
      y_train_val_df = train_df_refr[TARGET + f"_(t+{step_forward})"]

      X_test_df = test_df_refr.drop(TARGET + f"_(t+{step_forward})", axis=1)
      y_test_df = test_df_refr[TARGET + f"_(t+{step_forward})"]

      # XGBoost
      xgb = XGBRegressor(max_depth=6, eta=0.06)
      xgb.fit(X_train_val_df, y_train_val_df)
      y_week_step_preds_scaled = xgb.predict(X_test_df)

      y_week_step_preds = scaler.data_min_[TARGET_POS] + y_week_step_preds_scaled * (scaler.data_max_[TARGET_POS] - scaler.data_min_[TARGET_POS])
      y_week_step_test = scaler.data_min_[TARGET_POS] + y_test_df.to_numpy() * (scaler.data_max_[TARGET_POS] - scaler.data_min_[TARGET_POS])

      week_step_results = pd.DataFrame(
          {
            "real": y_week_step_test,
            "predictions": y_week_step_preds
          },
          index=y_test_df.index + pd.to_timedelta(step_forward, 'h')
      )

      week_step_results['abs_error'] = np.abs(week_step_results["real"] - week_step_results["predictions"])
      week_step_results['ape'] = np.where(week_step_results["real"] == 0, np.NaN, 100 * week_step_results['abs_error'] / week_step_results["real"])

      if iters == 1:
        step_results_dict[step_forward] = week_step_results
      else:
        step_results_dict[step_forward] = pd.concat([step_results_dict[step_forward], week_step_results], axis=0)

      step_mape = step_results_dict[step_forward]['ape'].mean()
      mape_list.append(step_mape)

    MAPE = np.mean(np.array(mape_list))
    print(f"\nMAPE = {MAPE}%")


********************************* Inference from 2018-01-01 00:00:00 to 2018-01-08 00:00:00 **************************************************


MAPE = 4.266029442417079%

********************************* Inference from 2018-01-08 00:00:00 to 2018-01-15 00:00:00 **************************************************


MAPE = 3.2923063576166043%

********************************* Inference from 2018-01-15 00:00:00 to 2018-01-22 00:00:00 **************************************************


MAPE = 3.1069043472027964%

********************************* Inference from 2018-01-22 00:00:00 to 2018-01-29 00:00:00 **************************************************


MAPE = 2.892690003299792%

********************************* Inference from 2018-01-29 00:00:00 to 2018-02-05 00:00:00 **************************************************


MAPE = 2.81962672689348%

********************************* Inference from 2018-02-05 00:00:00 to 2018-02-12 00:00:00 **********************************************

In [36]:
for key in step_results_dict.keys():
  MAPE = step_results_dict[key]['ape'].mean()
  print(f"step {key} --> MAPE = {MAPE}%")

step 1 --> MAPE = 2.399918065094685%
step 2 --> MAPE = 2.8574106732564135%
step 3 --> MAPE = 2.862671678145455%
step 4 --> MAPE = 2.847356719165865%
step 5 --> MAPE = 2.8348648109554317%
step 6 --> MAPE = 2.8513212986503733%
step 7 --> MAPE = 2.8472409730296366%
step 8 --> MAPE = 2.8555362942501787%
step 9 --> MAPE = 2.8616249706141486%
step 10 --> MAPE = 2.8466652750533763%
step 11 --> MAPE = 2.8464608821339064%
step 12 --> MAPE = 2.840650165798636%
step 13 --> MAPE = 2.842841500072666%
step 14 --> MAPE = 2.8396116777326053%
step 15 --> MAPE = 2.8275422240031927%
step 16 --> MAPE = 2.845502295991936%
step 17 --> MAPE = 2.83412290650547%
step 18 --> MAPE = 2.8376370016494423%
step 19 --> MAPE = 2.828212414936693%
step 20 --> MAPE = 2.835696761394311%
step 21 --> MAPE = 2.8221037317268833%
step 22 --> MAPE = 2.8354031126192147%
step 23 --> MAPE = 2.824525332502453%
step 24 --> MAPE = 2.812103607471226%


In [44]:
step = 1
step_df = step_results_dict[step]
step_df['ape_above_10_flag'] = np.where(step_df['ape'] >= 10., 1, 0)
step_df_grouped = step_df.groupby(by=step_results_dict[1].index.month).sum()
step_df_grouped['ape_above_10_(%)'] = 100 * step_df_grouped['ape_above_10_flag'] / (30 * 24)

In [46]:
30*24 / 100

7.2

In [45]:
step_df_grouped

Unnamed: 0_level_0,real,predictions,abs_error,ape,ape_above_10_flag,ape_above_10_(%)
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3960361.0,4000131.75,96642.727982,1812.307568,7,0.972222
2,3460459.0,3488864.5,76689.124811,1468.514397,2,0.277778
3,3444252.0,3498218.75,88341.425979,1863.339904,6,0.833333
4,2974280.0,3040086.75,85537.343385,2130.365756,28,3.888889
5,3114114.0,3158992.75,70183.130016,1655.170483,0,0.0
6,3373676.0,3399904.5,61535.197924,1311.411942,1,0.138889
7,3916224.0,3922625.5,72973.346984,1392.958454,1,0.138889
8,3731260.0,3744256.25,64654.666789,1309.108627,0,0.0
9,3260988.0,3291000.75,65297.055357,1434.516581,0,0.0
10,2996180.0,3092653.0,113264.70616,2826.395727,33,4.583333


In [50]:
step_df_grouped['ape_above_10_(%)'].mean()

1.1805555555555554