In [21]:
import pandas as pd
import numpy as np
import os
import datetime as dt

import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

import warnings
warnings.simplefilter('ignore')

### __data_prep__

In [22]:
file_path = '{}/nem-data/nemde_clean/df_DUID_CO2_mean.pkl'.format(os.environ['HOME'])
df = pd.read_pickle(file_path)
df = df.sort_index()
df = df[df.index >= dt.datetime(2009,9,1,0,0,0)] #minimum time of interconncector
print(df.shape)
df.head()

(1042896, 2)


Unnamed: 0,Price,CO2E_EMISSIONS_FACTOR
2009-09-01 00:00:00,21.64992,0.986067
2009-09-01 00:05:00,21.49293,0.97636
2009-09-01 00:10:00,21.4896,0.976889
2009-09-01 00:15:00,21.14641,1.03278
2009-09-01 00:20:00,20.94913,0.975655


In [1]:
def time_columns(df):
    
    df.loc[:,"minute"] = df.index.minute
    df["hour"] = df.index.hour
    df["weekday"] = np.where(df.index.weekday < 5, 0, 1)
    df["month"] = df.index.month
    df["year"] = df.index.year

    df['minute_sin'] = np.sin(2 * np.pi * df.loc[:,'minute']/60.0)
    df['minute_cos'] = np.cos(2 * np.pi * df.loc[:,'minute']/60.0)

    df['hour_sin'] = np.sin(2 * np.pi * df.loc[:,'hour']/24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df.loc[:,'hour']/24.0)
    
    df['weekday_sin'] = np.sin(2 * np.pi * df.loc[:,'weekday']/7.0)
    
    df['month_sin'] = np.sin(2 * np.pi * df.loc[:,'month']/12.0)
    df['month_cos'] = np.cos(2 * np.pi * df.loc[:,'month']/12.0)

    df.drop(columns=["Price", "minute", "weekday", "hour", "month"], inplace=True)
    
    return df

In [24]:
def lag_horizon(df, lag, horizon):
    '''
    Returns dataset with additional features defined by lag and modified target defined by horizon
    lag=integer of how far back time series should look
    horizon=integer of how far into the future the model shall predict; horizon=0 means prediciton 1 step into future
    '''
    for i in range(1,lag+1):
        df['lag{}'.format(i)] = df.CO2E_EMISSIONS_FACTOR.shift(i)
    
    for i in range(horizon+1,horizon+2):
        df['horizon{}'.format(i-1)] = df.CO2E_EMISSIONS_FACTOR[lag+i:].shift(-i+1)
        
    return df

In [25]:
demand = pd.read_csv('{}/nem-data/demand.csv'.format(os.environ['HOME']), index_col=-1, parse_dates=True)
demand.drop(columns=["SETTLEMENTDATE", "I", "INTERVENTION"], inplace=True)
demand = demand[(demand.index >= df.index.min()) & (demand.index <= df.index.max())]

assert demand.index.min() == df.index.min()
assert demand.index.max() == df.index.max()

demand.head()

Unnamed: 0_level_0,TOTALDEMAND,AVAILABLEGENERATION
start-of-interval,Unnamed: 1_level_1,Unnamed: 2_level_1
2009-09-01 00:00:00,1667.12,2763.79779
2009-09-01 00:05:00,1657.52,2762.70982
2009-09-01 00:10:00,1650.15,2764.64186
2009-09-01 00:15:00,1630.66,2766.46786
2009-09-01 00:20:00,1628.96,2773.72786


In [26]:
demand = demand.groupby(demand.index).mean()

df["demand"] = demand.TOTALDEMAND
df["demand_capacity"] = demand.TOTALDEMAND/demand.AVAILABLEGENERATION

print(df.shape)
df.head()

(1042896, 4)


Unnamed: 0,Price,CO2E_EMISSIONS_FACTOR,demand,demand_capacity
2009-09-01 00:00:00,21.64992,0.986067,1667.12,0.603199
2009-09-01 00:05:00,21.49293,0.97636,1657.52,0.599962
2009-09-01 00:10:00,21.4896,0.976889,1650.15,0.596877
2009-09-01 00:15:00,21.14641,1.03278,1630.66,0.589438
2009-09-01 00:20:00,20.94913,0.975655,1628.96,0.587282


In [27]:
interconnectors = pd.read_csv('{}/nem-data/interconnectors.csv'.format(os.environ['HOME']), index_col=-1, parse_dates=True)
interconnectors.drop(columns=["SETTLEMENTDATE", "I", "INTERCONNECTORID"], inplace=True)
interconnectors = interconnectors[(interconnectors.index >= df.index.min()) & (interconnectors.index <= df.index.max())]

assert interconnectors.index.min() == df.index.min()
assert interconnectors.index.max() == df.index.max()

interconnectors.head()

Unnamed: 0_level_0,MWFLOW
start-of-interval,Unnamed: 1_level_1
2009-09-01 00:00:00,302.86
2009-09-01 00:05:00,300.0
2009-09-01 00:10:00,290.52
2009-09-01 00:15:00,260.75
2009-09-01 00:20:00,256.98


In [28]:
print(interconnectors.shape)
print(interconnectors.index.min())
print(interconnectors.index.max())

(1084876, 1)
2009-09-01 00:00:00
2019-08-01 03:55:00


In [29]:
interconnectors = interconnectors.groupby(interconnectors.index).mean()
print(interconnectors.shape)
df["interconnector"] = interconnectors.MWFLOW
df.head()

(1042896, 1)


Unnamed: 0,Price,CO2E_EMISSIONS_FACTOR,demand,demand_capacity,interconnector
2009-09-01 00:00:00,21.64992,0.986067,1667.12,0.603199,302.86
2009-09-01 00:05:00,21.49293,0.97636,1657.52,0.599962,300.0
2009-09-01 00:10:00,21.4896,0.976889,1650.15,0.596877,290.52
2009-09-01 00:15:00,21.14641,1.03278,1630.66,0.589438,260.75
2009-09-01 00:20:00,20.94913,0.975655,1628.96,0.587282,256.98


In [30]:
df1 = time_columns(df)
df2 = lag_horizon(df1, 12, 12)
df2.shape

(1042896, 27)

In [31]:
df2.loc[:,["CO2E_EMISSIONS_FACTOR","horizon12"]].isna().any()

CO2E_EMISSIONS_FACTOR    True
horizon12                True
dtype: bool

In [32]:
df2.dropna(inplace=True)
df2.shape

(790500, 27)

In [33]:
df2.head()

Unnamed: 0,CO2E_EMISSIONS_FACTOR,demand,demand_capacity,interconnector,year,minute_sin,minute_cos,hour_sin,hour_cos,weekday_sin,...,lag4,lag5,lag6,lag7,lag8,lag9,lag10,lag11,lag12,horizon12
2009-09-01 02:05:00,0.973187,1350.13,0.50624,179.13,2009,0.5,0.8660254,0.5,0.866025,0.0,...,0.986067,1.013504,0.950691,1.081595,0.97636,0.97636,0.97636,1.154818,0.980592,1.010055
2009-09-01 02:10:00,0.973187,1359.79,0.509407,198.52,2009,0.866025,0.5,0.5,0.866025,0.0,...,0.544686,0.986067,1.013504,0.950691,1.081595,0.97636,0.97636,0.97636,1.154818,1.02477
2009-09-01 02:15:00,0.946617,1323.9,0.49742,227.6,2009,1.0,2.832769e-16,0.5,0.866025,0.0,...,0.976889,0.544686,0.986067,1.013504,0.950691,1.081595,0.97636,0.97636,0.97636,0.984948
2009-09-01 02:20:00,0.912643,1350.34,0.507537,208.13,2009,0.866025,-0.5,0.5,0.866025,0.0,...,0.912643,0.976889,0.544686,0.986067,1.013504,0.950691,1.081595,0.97636,0.97636,0.947127
2009-09-01 02:25:00,0.908305,1323.32,0.496485,240.26,2009,0.5,-0.8660254,0.5,0.866025,0.0,...,0.973187,0.912643,0.976889,0.544686,0.986067,1.013504,0.950691,1.081595,0.97636,0.908518


### __training 1h__

In [34]:
def rel_errors(model, X_train, y_train, X_test, y_test):

    train_mae = (sum(abs(y_train - model.predict(X_train)))/len(y_train))/y_train.mean()
    train_mape = (sum(abs(y_train - model.predict(X_train))*100)/len(y_train))/y_train.mean()

    test_mae = (sum(abs(y_test - model.predict(X_test)))/len(y_test))/y_test.mean()
    test_mape = (sum(abs(y_test - model.predict(X_test))*100)/len(y_test))/y_test.mean()

    print(f"rel_train_mae: {train_mae}")
    print(f"rel_test_mae: {test_mae}")
    print(f"rel_train_mape: {train_mape}")
    print(f"rel_test_mape: {test_mape}")

In [35]:
def train_test_ts(df, relative_train, maximal_lag, horizon):
    '''
    Time series (ts) split function creates a train/test set under consideration of potential overlap between the two due to lag processing
    X_train, y_train, X_test, y_test = ...
    df=must contain target column as "target"; all other columns must be used as features
    percentage_train=how much of the total dataset shall be used for training; must be added between 0 - 1
    maximal_lag=out of all lag feature engineering, enter the maximal lag number
    '''
    k = int(df.shape[0] * relative_train)
    data_train = df.iloc[:k,:]
    #to avoid overlapping of train and test data, a gap of the maximal lag - 1 must be included between the two sets
    data_test = df.iloc[k+maximal_lag:,:]
    
    assert data_train.index.max() < data_test.index.min()
    
    #returns in the sequence X_train, y_train, X_test, y_test
    return (data_train.drop(columns=[f"horizon{horizon}","CO2E_EMISSIONS_FACTOR"], axis=1), data_train[f"horizon{horizon}"],
            data_test.drop(columns=[f"horizon{horizon}","CO2E_EMISSIONS_FACTOR"], axis=1), data_test[f"horizon{horizon}"])

In [36]:
df_train_val = df2[df2.index < dt.datetime(2017,5,31,0,0,0)]
df_test_set = df2[df2.index >= dt.datetime(2017,5,31,0,0,0)]

X_train, y_train, X_test, y_test = train_test_ts(df_train_val, 0.8, 12, 12)

model = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model.fit(X_train, y_train)

rel_errors(model,X_train, y_train, X_test, y_test)

rel_train_mae: 0.3368824780946134
rel_test_mae: 0.32045831258814045
rel_train_mape: 33.68824780950654
rel_test_mape: 32.04583125882107


In [37]:
for i in range(12,13):
    print(i)

12


### __training 2h__

In [38]:
df3 = lag_horizon(df1, 12, 24)
df3.dropna(inplace=True)
print(df3.shape)

df_train_val1 = df3[df3.index < dt.datetime(2017,5,31,0,0,0)]
df_test_set1 = df3[df3.index >= dt.datetime(2017,5,31,0,0,0)]

X_train1, y_train1, X_test1, y_test1 = train_test_ts(df_train_val1, 0.8, 12, 12)

model1 = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model1.fit(X_train1, y_train1)

rel_errors(model1,X_train1, y_train1, X_test1, y_test1)

(790439, 28)
rel_train_mae: 0.33414619601140083
rel_test_mae: 0.31363533718842107
rel_train_mape: 33.41461960118428
rel_test_mape: 31.363533718848114
