## __Forecast of marginal CO2 emissions: effect of lagging and non-time series features__

In [2]:
import pandas as pd
import numpy as np
import os
import datetime as dt
import joblib

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [3]:
file_path = '../../big_data/df_clean_interconnectors.pkl'
df = pd.read_pickle(file_path)

__It is crucial for xgboost to not encounter any NaN values. Let's see:__

In [4]:
df.isna().any()

CO2E_EMISSIONS_FACTOR    False
weekday                  False
year                     False
minute_sin               False
minute_cos               False
hour_sin                 False
hour_cos                 False
month_sin                False
month_cos                False
lag1                     False
lag2                     False
lag3                     False
lag4                     False
lag5                     False
lag6                     False
lag7                     False
lag8                     False
lag9                     False
lag10                    False
lag11                    False
lag12                    False
horizon0                 False
demand                   False
demand_capacity          False
interconnector           False
dtype: bool

In [5]:
df.shape

(920160, 25)

In [6]:
df.head()

Unnamed: 0,CO2E_EMISSIONS_FACTOR,weekday,year,minute_sin,minute_cos,hour_sin,hour_cos,month_sin,month_cos,lag1,...,lag7,lag8,lag9,lag10,lag11,lag12,horizon0,demand,demand_capacity,interconnector
2009-09-01 00:00:00,0.986067,0,2009,0.0,1.0,0.0,1.0,-1.0,-1.83697e-16,1.03278,...,0.938327,0.912643,0.79831,0.980592,0.473708,0.971761,0.986067,1667.12,0.603199,302.86
2009-09-01 00:05:00,0.97636,0,2009,0.5,0.8660254,0.0,1.0,-1.0,-1.83697e-16,0.986067,...,0.908518,0.938327,0.912643,0.79831,0.980592,0.473708,0.97636,1657.52,0.599962,300.0
2009-09-01 00:10:00,0.976889,0,2009,0.866025,0.5,0.0,1.0,-1.0,-1.83697e-16,0.97636,...,0.971761,0.908518,0.938327,0.912643,0.79831,0.980592,0.976889,1650.15,0.596877,290.52
2009-09-01 00:15:00,1.03278,0,2009,1.0,2.832769e-16,0.0,1.0,-1.0,-1.83697e-16,0.976889,...,0.980592,0.971761,0.908518,0.938327,0.912643,0.79831,1.03278,1630.66,0.589438,260.75
2009-09-01 00:20:00,0.975655,0,2009,0.866025,-0.5,0.0,1.0,-1.0,-1.83697e-16,1.03278,...,0.903942,0.980592,0.971761,0.908518,0.938327,0.912643,0.975655,1628.96,0.587282,256.98


In [7]:
def train_validation_ts(df, relative_train, maximal_lag, horizon):
    '''
    Time series (ts) split function creates a train/test set under consideration of potential overlap between the two due to lag processing
    X_train, y_train, X_test, y_test = ...
    df=must contain target column as "target"; all other columns must be used as features
    percentage_train=how much of the total dataset shall be used for training; must be added between 0 - 1
    maximal_lag=out of all lag feature engineering, enter the maximal lag number
    '''
    k = int(df.shape[0] * relative_train)
    data_train = df.iloc[:k,:]
    #to avoid overlapping of train and test data, a gap of the maximal lag - 1 must be included between the two sets
    data_test = df.iloc[k+maximal_lag:,:]
    
    assert data_train.index.max() < data_test.index.min()
    
    #returns in the sequence X_train, y_train, X_test, y_test
    return (data_train.drop(columns=[f"horizon{horizon}","CO2E_EMISSIONS_FACTOR"], axis=1), data_train[f"horizon{horizon}"],
            data_test.drop(columns=[f"horizon{horizon}","CO2E_EMISSIONS_FACTOR"], axis=1), data_test[f"horizon{horizon}"])

In [8]:
def errors(model, X_train, y_train, X_test, y_test):

    train_mae = (sum(abs(y_train - model.predict(X_train)))/len(y_train))
    train_mape = (sum(abs((y_train - model.predict(X_train))/y_train)))*(100/len(y_train))
    train_smape = sum(abs(y_train - model.predict(X_train)))/sum(y_train + model.predict(X_train))

    test_mae = (sum(abs(y_test - model.predict(X_test)))/len(y_test))
    test_mape = (sum(abs((y_test - model.predict(X_test))/y_test)))*(100/len(y_test))
    test_smape = sum(abs(y_test - model.predict(X_test)))/sum(y_test + model.predict(X_test))

    print(f'train_MAE: {train_mae}')
    print(f'test_MAE: {test_mae}')
    
    print(f'train_MAPE: {train_mape}')
    print(f'test_MAPE: {test_mape}')
    
    print(f'train_SMAPE: {train_smape}')
    print(f'test_SMAPE: {test_smape}')

### __Training with no demand and interconncetors__

In [9]:
df1 = df.drop(columns=["demand", "demand_capacity", "interconnector"])

X_train1, y_train1, X_val1, y_val1 = train_validation_ts(df1, 0.8, 12, 0)

model1 = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model1.fit(X_train1, y_train1)

errors(model1,X_train1, y_train1, X_val1, y_val1)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


train_MAE: 0.19634647785729523
test_MAE: 0.2079189048272252
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.13949958875843474
test_SMAPE: 0.16148983467267983


### __Training with demand__

In [11]:
df2 = df.drop(columns="interconnector")

X_train2, y_train2, X_val2, y_val2 = train_validation_ts(df2, 0.8, 12, 0)

model2 = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model2.fit(X_train2, y_train2)

errors(model2, X_train2, y_train2, X_val2, y_val2)

train_MAE: 0.19614051542470323
test_MAE: 0.2077163555725931
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.13935325194307954
test_SMAPE: 0.16126737874512465


### __Training with interconnectors__

In [15]:
df3 = df.drop(columns=["demand", "demand_capacity"])

X_train3, y_train3, X_val3, y_val3 = train_validation_ts(df3, 0.8, 12, 0)

model3 = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model3.fit(X_train3, y_train3)

errors(model3, X_train3, y_train3, X_val3, y_val3)

train_MAE: 0.1963061202823627
test_MAE: 0.2083725963713885
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.13947091972861783
test_SMAPE: 0.16224841562813572


### __Training with demand and interconnectors__

In [16]:
X_train, y_train, X_val, y_val = train_validation_ts(df, 0.8, 12, 0)

print(X_train.columns)
print(X_val.columns)

print(X_train.index.max())
print(X_val.index.min())

model = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=7,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model.fit(X_train, y_train)

errors(model, X_train, y_train, X_val, y_val)

Index(['weekday', 'year', 'minute_sin', 'minute_cos', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
       'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'demand',
       'demand_capacity', 'interconnector'],
      dtype='object')
Index(['weekday', 'year', 'minute_sin', 'minute_cos', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
       'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12', 'demand',
       'demand_capacity', 'interconnector'],
      dtype='object')
2016-08-30 23:55:00
2016-08-31 01:00:00
train_MAE: 0.1960687095270836
test_MAE: 0.2078975412755377
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.13930223810527195
test_SMAPE: 0.1614946632702779


In [29]:
def plot_feature_importances(rf, cols, model_dir):
    importances = pd.DataFrame()
    importances.loc[:, 'importances'] = rf.feature_importances_
    importances.loc[:, 'features'] = cols
    importances.sort_values('importances', inplace=True)
    f, a = plt.subplots()
    importances.plot(ax=a, kind='bar', x='features', y='importances')
    plt.gcf().subplots_adjust(bottom=0.3)
    f.savefig(os.path.join(model_dir, 'importances.png'))

In [30]:
plot_feature_importances(model5, df5.iloc[:,1:],'{}/nem-data/trainings/'.format(os.environ['HOME']))

NameError: name 'plt' is not defined

In [None]:
model5.feature_importances_