# __How far shall we forecast into the future?__

In order to find that out we will forecast for different horizon values.

In [1]:
import pandas as pd
import numpy as np
import os
import datetime as dt

import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

import warnings
warnings.simplefilter('ignore')

### __For that, we prepare the corresponding table__

In [2]:
file_path = '../../data/train.pkl'
df = pd.read_pickle(file_path)
df.drop(columns='Price', inplace=True)

df.head()

Unnamed: 0,CO2E_EMISSIONS_FACTOR
2009-07-01 04:00:00,0.991217
2009-07-01 04:05:00,0.0
2009-07-01 04:10:00,0.0
2009-07-01 04:15:00,0.991217
2009-07-01 04:20:00,1.025701


__We will have to perform add time and non-time features again.__

### __Dataframe preparation__

__Load demand table__

In [3]:
demand = pd.read_csv('../../big_data/demand.csv', index_col=-1, parse_dates=True)
demand.drop(columns=["SETTLEMENTDATE", "I", "INTERVENTION"], inplace=True)
demand = demand[(demand.index >= df.index.min()) & (demand.index <= df.index.max())]

assert demand.index.min() == df.index.min()
assert demand.index.max() == df.index.max()

demand.head()

Unnamed: 0_level_0,TOTALDEMAND,AVAILABLEGENERATION
start-of-interval,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-07-01 04:00:00,1004.32,3043.771
2009-07-01 04:05:00,1007.58,3043.694
2009-07-01 04:10:00,1019.33,3045.307
2009-07-01 04:15:00,1025.24,3045.436
2009-07-01 04:20:00,1050.5,3043.964


__Add demand coluomn__

In [4]:
demand = demand.groupby(demand.index).mean()

df["demand"] = demand.TOTALDEMAND
df["demand_capacity"] = demand.TOTALDEMAND/demand.AVAILABLEGENERATION

print(df.shape)
df.head()

(937968, 3)


Unnamed: 0,CO2E_EMISSIONS_FACTOR,demand,demand_capacity
2009-07-01 04:00:00,0.991217,1004.32,0.329959
2009-07-01 04:05:00,0.0,1007.58,0.331039
2009-07-01 04:10:00,0.0,1019.33,0.334722
2009-07-01 04:15:00,0.991217,1025.24,0.336648
2009-07-01 04:20:00,1.025701,1050.5,0.345109


__Load interconnector table__

In [5]:
interconnectors = pd.read_csv('../../big_data/interconnectors.csv', index_col=-1, parse_dates=True)
interconnectors.drop(columns=["SETTLEMENTDATE", "I", "INTERCONNECTORID"], inplace=True)
interconnectors = interconnectors[(interconnectors.index >= df.index.min()) & (interconnectors.index <= df.index.max())]

assert interconnectors.index.min() == df.index.min()
assert interconnectors.index.max() == df.index.max()

interconnectors.head()

AssertionError: 

__Add interconnector table__

In [6]:
interconnectors = interconnectors.groupby(interconnectors.index).mean()
print(interconnectors.shape)
df["interconnector"] = interconnectors.MWFLOW
df.head()

(920160, 1)


Unnamed: 0,CO2E_EMISSIONS_FACTOR,demand,demand_capacity,interconnector
2009-07-01 04:00:00,0.991217,1004.32,0.329959,
2009-07-01 04:05:00,0.0,1007.58,0.331039,
2009-07-01 04:10:00,0.0,1019.33,0.334722,
2009-07-01 04:15:00,0.991217,1025.24,0.336648,
2009-07-01 04:20:00,1.025701,1050.5,0.345109,


In [7]:
df.isna().any()

CO2E_EMISSIONS_FACTOR    False
demand                   False
demand_capacity          False
interconnector            True
dtype: bool

__Again, we apply dropna to just for the shorter time period of interconnectors.__

In [8]:
df.dropna(inplace=True)

In [9]:
df.isna().any()

CO2E_EMISSIONS_FACTOR    False
demand                   False
demand_capacity          False
interconnector           False
dtype: bool

### __Data preparation and training for horizon = 12 (1h forecast)__

#### __Data preparation__

In [10]:
def time_columns(df):
    
    df.loc[:,'minute'] = df.index.minute
    df['hour'] = df.index.hour
    df['weekday'] = np.where(df.index.weekday < 5, 0, 1)
    df['month'] = df.index.month
    df['year'] = df.index.year

    df['minute_sin'] = np.sin(2 * np.pi * df.loc[:,'minute']/60)
    df['minute_cos'] = np.cos(2 * np.pi * df.loc[:,'minute']/60)

    df['hour_sin'] = np.sin(2 * np.pi * df.loc[:,'hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df.loc[:,'hour']/24)
    
    df['month_sin'] = np.sin(2 * np.pi * df.loc[:,'month']/12)
    df['month_cos'] = np.cos(2 * np.pi * df.loc[:,'month']/12)
    
    df.drop(columns=['minute', 'hour', 'month'], inplace=True)
    
    return df

In [11]:
def lag_horizon(df, lag, horizon):
    '''
    Returns dataset with additional features defined by lag and modified target defined by horizon
    lag=integer of how far back time series should look
    horizon=integer of how far into the future the model shall predict; horizon=0 means prediciton 1 step into future
    '''
    for i in range(1,lag+1):
        df['lag{}'.format(i)] = df.CO2E_EMISSIONS_FACTOR.shift(i)
    
    for i in range(horizon+1,horizon+2):
        df['horizon{}'.format(i-1)] = df.CO2E_EMISSIONS_FACTOR[lag+i:].shift(-i+1)
        
    return df

In [12]:
df = time_columns(df)
df_1h = lag_horizon(df, 12, 12)
print(df_1h.columns)
df_1h.head()

Index(['CO2E_EMISSIONS_FACTOR', 'demand', 'demand_capacity', 'interconnector',
       'weekday', 'year', 'minute_sin', 'minute_cos', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
       'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'horizon12'],
      dtype='object')


Unnamed: 0,CO2E_EMISSIONS_FACTOR,demand,demand_capacity,interconnector,weekday,year,minute_sin,minute_cos,hour_sin,hour_cos,...,lag4,lag5,lag6,lag7,lag8,lag9,lag10,lag11,lag12,horizon12
2009-09-01 00:00:00,0.986067,1667.12,0.603199,302.86,0,2009,0.0,1.0,0.0,1.0,...,,,,,,,,,,
2009-09-01 00:05:00,0.97636,1657.52,0.599962,300.0,0,2009,0.5,0.8660254,0.0,1.0,...,,,,,,,,,,
2009-09-01 00:10:00,0.976889,1650.15,0.596877,290.52,0,2009,0.866025,0.5,0.0,1.0,...,,,,,,,,,,
2009-09-01 00:15:00,1.03278,1630.66,0.589438,260.75,0,2009,1.0,2.832769e-16,0.0,1.0,...,,,,,,,,,,
2009-09-01 00:20:00,0.975655,1628.96,0.587282,256.98,0,2009,0.866025,-0.5,0.0,1.0,...,0.986067,,,,,,,,,


In [13]:
df_1h.isna().any()

CO2E_EMISSIONS_FACTOR    False
demand                   False
demand_capacity          False
interconnector           False
weekday                  False
year                     False
minute_sin               False
minute_cos               False
hour_sin                 False
hour_cos                 False
month_sin                False
month_cos                False
lag1                      True
lag2                      True
lag3                      True
lag4                      True
lag5                      True
lag6                      True
lag7                      True
lag8                      True
lag9                      True
lag10                     True
lag11                     True
lag12                     True
horizon12                 True
dtype: bool

In [14]:
df_1h.dropna(inplace=True)
df_1h.isna().any()

CO2E_EMISSIONS_FACTOR    False
demand                   False
demand_capacity          False
interconnector           False
weekday                  False
year                     False
minute_sin               False
minute_cos               False
hour_sin                 False
hour_cos                 False
month_sin                False
month_cos                False
lag1                     False
lag2                     False
lag3                     False
lag4                     False
lag5                     False
lag6                     False
lag7                     False
lag8                     False
lag9                     False
lag10                    False
lag11                    False
lag12                    False
horizon12                False
dtype: bool

### __Training__

In [15]:
def train_validation_ts(df, relative_train, maximal_lag, horizon):
    '''
    Time series (ts) split function creates a train/test set under consideration of potential overlap between the two due to lag processing
    X_train, y_train, X_test, y_test = ...
    df=must contain target column as "target"; all other columns must be used as features
    percentage_train=how much of the total dataset shall be used for training; must be added between 0 - 1
    maximal_lag=out of all lag feature engineering, enter the maximal lag number
    '''
    k = int(df.shape[0] * relative_train)
    data_train = df.iloc[:k,:]
    #to avoid overlapping of train and test data, a gap of the maximal lag - 1 must be included between the two sets
    data_test = df.iloc[k+maximal_lag:,:]
    
    assert data_train.index.max() < data_test.index.min()
    
    #returns in the sequence X_train, y_train, X_test, y_test
    return (data_train.drop(columns=[f"horizon{horizon}","CO2E_EMISSIONS_FACTOR"], axis=1), data_train[f"horizon{horizon}"],
            data_test.drop(columns=[f"horizon{horizon}","CO2E_EMISSIONS_FACTOR"], axis=1), data_test[f"horizon{horizon}"])

In [16]:
def errors(model, X_train, y_train, X_test, y_test):

    train_mae = (sum(abs(y_train - model.predict(X_train)))/len(y_train))
    train_mape = (sum(abs((y_train - model.predict(X_train))/y_train)))*(100/len(y_train))
    train_smape = sum(abs(y_train - model.predict(X_train)))/sum(y_train + model.predict(X_train))

    test_mae = (sum(abs(y_test - model.predict(X_test)))/len(y_test))
    test_mape = (sum(abs((y_test - model.predict(X_test))/y_test)))*(100/len(y_test))
    test_smape = sum(abs(y_test - model.predict(X_test)))/sum(y_test + model.predict(X_test))

    print(f'train_MAE: {train_mae}')
    print(f'test_MAE: {test_mae}')
    
    print(f'train_MAPE: {train_mape}')
    print(f'test_MAPE: {test_mape}')
    
    print(f'train_SMAPE: {train_smape}')
    print(f'test_SMAPE: {test_smape}')

In [17]:
X_train_1h, y_train_1h, X_val_1h, y_val_1h = train_validation_ts(df_1h, 0.8, 12, 12)

model = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model.fit(X_train_1h, y_train_1h)

errors(model,X_train_1h, y_train_1h, X_val_1h, y_val_1h)

train_MAE: 0.23333682990532875
test_MAE: 0.24013341852479317
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.16578312136574902
test_SMAPE: 0.18145604440734112


## __Data preparation and training for horizon = 24 (2h forecast)__

In [18]:
df_2h = lag_horizon(df, 12, 24)
print(df_1h.columns)
df_2h.head()

Index(['CO2E_EMISSIONS_FACTOR', 'demand', 'demand_capacity', 'interconnector',
       'weekday', 'year', 'minute_sin', 'minute_cos', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
       'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'horizon12',
       'horizon24'],
      dtype='object')


Unnamed: 0,CO2E_EMISSIONS_FACTOR,demand,demand_capacity,interconnector,weekday,year,minute_sin,minute_cos,hour_sin,hour_cos,...,lag5,lag6,lag7,lag8,lag9,lag10,lag11,lag12,horizon12,horizon24
2009-09-01 02:05:00,0.973187,1350.13,0.50624,179.13,0,2009,0.5,0.8660254,0.5,0.866025,...,,,,,,,,,1.010055,
2009-09-01 02:10:00,0.973187,1359.79,0.509407,198.52,0,2009,0.866025,0.5,0.5,0.866025,...,,,,,,,,,1.02477,
2009-09-01 02:15:00,0.946617,1323.9,0.49742,227.6,0,2009,1.0,2.832769e-16,0.5,0.866025,...,,,,,,,,,0.984948,
2009-09-01 02:20:00,0.912643,1350.34,0.507537,208.13,0,2009,0.866025,-0.5,0.5,0.866025,...,,,,,,,,,0.947127,
2009-09-01 02:25:00,0.908305,1323.32,0.496485,240.26,0,2009,0.5,-0.8660254,0.5,0.866025,...,,,,,,,,,0.908518,


In [19]:
df_2h.dropna(inplace=True)
df_2h.isna().any()

CO2E_EMISSIONS_FACTOR    False
demand                   False
demand_capacity          False
interconnector           False
weekday                  False
year                     False
minute_sin               False
minute_cos               False
hour_sin                 False
hour_cos                 False
month_sin                False
month_cos                False
lag1                     False
lag2                     False
lag3                     False
lag4                     False
lag5                     False
lag6                     False
lag7                     False
lag8                     False
lag9                     False
lag10                    False
lag11                    False
lag12                    False
horizon12                False
horizon24                False
dtype: bool

In [22]:
X_train_2h, y_train_2h, X_val_2h, y_val_2h = train_validation_ts(df_2h, 0.8, 12, 24)

model1 = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model1.fit(X_train_2h, y_train_2h)

errors(model1,X_train_2h, y_train_2h, X_val_2h, y_val_2h)

train_MAE: 0.23553157581533404
test_MAE: 0.2428864894770213
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.16734495922048115
test_SMAPE: 0.18314553596105415


__Even forecasts of up to 24h into the future seem possible with a decent accuracy. But are they really? Let's apply out trained models onto our test set and have a closer look a the outcome.__