In [176]:
import pandas as pd
import numpy as np
import os
import datetime as dt

In [163]:
# Load the cleaned interconnector dataset from a pickle under the user's home directory.
# Raises KeyError if the HOME environment variable is not set.
# NOTE(review): read_pickle executes whatever the pickle contains; only load files you produced yourself.
file_path = '{}/nem-data/nemde_clean/df_clean_interconnectors.pkl'.format(os.environ['HOME'])
df = pd.read_pickle(file_path)

In [164]:
# Rows strictly before 2017-05-31 00:00 are kept for training/validation;
# rows from that timestamp onwards are held out as the test set.
df_train_val = df.loc[df.index < dt.datetime(2017, 5, 31)]
df_test_set = df.loc[df.index >= dt.datetime(2017, 5, 31)]

In [165]:
# Show the last train/val timestamp and the first test timestamp to confirm the
# two partitions meet at the cutoff without overlapping.
print(df_train_val.index.max(), df_test_set.index.min(), sep="\n")

2017-05-30 23:55:00
2017-05-31 00:00:00


In [1]:
def train_test_ts(df, relative_train, maximal_lag, horizon):
    '''
    Time series (ts) split function: creates a chronological train/test split with a gap between the two parts.

    The first int(len(df) * relative_train) rows form the training part. The next
    maximal_lag rows are skipped and all remaining rows form the test part, so that
    lag processing cannot make the two parts overlap.

    X_train, y_train, X_test, y_test = train_test_ts(df, relative_train, maximal_lag, horizon)

    df=must contain the target column "horizon<horizon>" and the column "CO2E_EMISSIONS_FACTOR"; all other columns are returned as features; the index must be ordered so that every training index value is smaller than every test index value
    relative_train=how much of the total dataset shall be used for training; must be strictly between 0 and 1
    maximal_lag=out of all lag feature engineering, the maximal lag number; this many rows are skipped between train and test; must be >= 0
    horizon=suffix of the target column name, i.e. the target is read from column f"horizon{horizon}"

    Raises ValueError if relative_train or maximal_lag is out of range, if either part is
    empty, or if the training index is not strictly before the test index.
    Raises KeyError (from pandas) if one of the two required columns is missing.
    '''
    if not 0 < relative_train < 1:
        raise ValueError(f"relative_train must be strictly between 0 and 1, got {relative_train!r}")
    if maximal_lag < 0:
        raise ValueError(f"maximal_lag must be >= 0, got {maximal_lag!r}")

    k = int(len(df) * relative_train)
    data_train = df.iloc[:k]
    # to avoid overlapping of train and test data, maximal_lag rows are skipped between the two sets
    data_test = df.iloc[k + maximal_lag:]

    # Explicit checks instead of a bare assert: they survive `python -O` and give a
    # readable message when a part is empty (index.max()/min() would be NaN/NaT then).
    if len(data_train) == 0 or len(data_test) == 0:
        raise ValueError(
            f"split produced an empty part (train rows: {len(data_train)}, test rows: {len(data_test)}); "
            "check relative_train and maximal_lag against the number of rows"
        )
    if not data_train.index.max() < data_test.index.min():
        raise ValueError("training index must lie strictly before the test index; is df sorted by its index?")

    target_col = f"horizon{horizon}"
    # Neither the target nor CO2E_EMISSIONS_FACTOR is returned as a feature.
    non_feature_cols = [target_col, "CO2E_EMISSIONS_FACTOR"]

    # returns in the sequence X_train, y_train, X_test, y_test
    return (data_train.drop(columns=non_feature_cols), data_train[target_col],
            data_test.drop(columns=non_feature_cols), data_test[target_col])

In [167]:
# Split the train/val frame 80/20 with a gap of 12 rows; the results become the
# module-level X_train, y_train, X_test, y_test that errors_GS / rel_errors_GS read.
# NOTE(review): train_test_ts as defined in this notebook takes a required fourth
# argument `horizon`; this 3-argument call raises TypeError on a fresh
# Restart & Run All. TODO confirm the intended horizon and pass it here.
X_train, y_train, X_test, y_test = train_test_ts(df_train_val, 0.8, 12)

In [168]:
# Earliest test timestamp followed by the latest training timestamp: the test
# part must start after the training part ends.
print(X_test.index.min(), X_train.index.max(), sep="\n")

2016-03-10 09:15:00
2016-03-10 08:10:00


In [169]:
def errors_GS(model_name):
    '''
    Load a saved grid-search object and print its absolute train/test errors.

    The object is read with joblib from
    $HOME/nem-data/trainings/grid_searches/<model_name>.pkl and evaluated on the
    module-level X_train, y_train, X_test and y_test, which must be defined before
    this function is called.

    model_name=file name (without the ".pkl" extension) of the saved object
    returns the loaded object

    Printed values:
    train_mae / test_mae = mean absolute error in the units of the target
    train_mape / test_mape = the same mean absolute error multiplied by 100
    NOTE(review): despite the label, the "mape" values are not a mean absolute
    percentage error, because the errors are never divided by the actual values.
    The computation is kept as it was; confirm which metric is intended.
    '''
    file_path = '{}/nem-data/trainings/grid_searches/{}.pkl'.format(os.environ['HOME'], model_name)

    # NOTE(review): joblib is used here but no import of it is visible in the
    # notebook's import cell; confirm it is imported before this cell runs.
    best_model = joblib.load(file_path)

    # Predict once per data set and reuse the absolute errors for both metrics.
    # Converting to an ndarray keeps NaN propagation the same as a plain sum()/len().
    train_abs_err = np.abs(np.asarray(y_train - best_model.predict(X_train)))
    test_abs_err = np.abs(np.asarray(y_test - best_model.predict(X_test)))

    # Absolute errors: train_mae is NOT divided by y_train.mean(), so that it is
    # reported on the same scale as test_mae (rel_errors_GS is the normalized variant).
    train_mae = train_abs_err.mean()
    train_mape = 100 * train_mae

    test_mae = test_abs_err.mean()
    test_mape = 100 * test_mae

    print(f"train_mae: {train_mae}")
    print(f"test_mae: {test_mae}")
    print(f"train_mape: {train_mape}")
    print(f"test_mape: {test_mape}")

    return best_model

In [170]:
def rel_errors_GS(model_name):
    '''
    Load a saved grid-search object and print its train/test errors relative to the mean target.

    The object is read with joblib from
    $HOME/nem-data/trainings/grid_searches/<model_name>.pkl and evaluated on the
    module-level X_train, y_train, X_test and y_test, which must be defined before
    this function is called.

    model_name=file name (without the ".pkl" extension) of the saved object
    returns the loaded object

    Printed values:
    train_mae / test_mae = mean absolute error divided by the mean of the respective target
    train_mape / test_mape = the same ratio multiplied by 100, i.e. expressed in percent
    NOTE(review): the "mape" values are a mean-normalized MAE in percent, not a
    per-sample mean absolute percentage error; confirm the label is intended.
    '''
    file_path = '{}/nem-data/trainings/grid_searches/{}.pkl'.format(os.environ['HOME'], model_name)

    # NOTE(review): joblib is used here but no import of it is visible in the
    # notebook's import cell; confirm it is imported before this cell runs.
    best_model = joblib.load(file_path)

    # Predict once per data set (the metrics below share the same absolute errors).
    # Converting to an ndarray keeps NaN propagation the same as a plain sum()/len().
    train_abs_err = np.abs(np.asarray(y_train - best_model.predict(X_train)))
    test_abs_err = np.abs(np.asarray(y_test - best_model.predict(X_test)))

    train_mae = train_abs_err.mean() / y_train.mean()
    train_mape = 100 * train_mae

    test_mae = test_abs_err.mean() / y_test.mean()
    test_mape = 100 * test_mae

    print(f"train_mae: {train_mae}")
    print(f"test_mae: {test_mae}")
    print(f"train_mape: {train_mape}")
    print(f"test_mape: {test_mape}")

    return best_model

In [154]:
# Evaluate the saved object "3_GS_model": rel_errors_GS prints its mean-normalized
# train/test errors (computed on the current module-level X_train/y_train/X_test/y_test)
# and returns the loaded object.
best_model = rel_errors_GS("3_GS_model")

train_mae: 0.27959603558873336
test_mae: 0.2687075709892779
train_mape: 27.959603558926503
test_mape: 26.870757098929374


In [156]:
# Display the best_params_ attribute of the object loaded for "3_GS_model".
best_model.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100,
 'reg_alpha': 0.05,
 'reg_lambda': 0}

In [155]:
# Second experiment: same data without the "interconnector" column, and with the
# train/val vs. test cutoff at 2017-01-01 instead of 2017-05-31.
df1 = df.drop(columns="interconnector")
df_train_val1 = df1[df1.index < dt.datetime(2017,1,1,0,0,0)]
df_test_set1 = df1[df1.index >= dt.datetime(2017,1,1,0,0,0)]
# NOTE(review): this rebinds the module-level X_train, y_train, X_test, y_test that
# errors_GS / rel_errors_GS read, so every later evaluation depends on whether this
# cell or the earlier split cell ran last.
# NOTE(review): train_test_ts as defined in this notebook requires a fourth argument
# `horizon`; this 3-argument call raises TypeError on a fresh run. TODO confirm and pass it.
X_train, y_train, X_test, y_test = train_test_ts(df_train_val1, 0.8, 12)

# Evaluate the saved object "grid_search_object" on the split created just above.
best_model1 = rel_errors_GS("grid_search_object")

train_mae: 0.2857561508363076
test_mae: 0.23963506193454695
train_mape: 28.57561508369905
test_mape: 23.963506193454265


In [160]:
# Display the best_params_ attribute of the object loaded for "grid_search_object".
best_model1.best_params_

{'eta': 0.3, 'max_depth': 5}

In [174]:
# Evaluate the saved object "4_GS_object" on the current module-level
# X_train/y_train/X_test/y_test; rel_errors_GS prints the errors and returns the object.
# NOTE(review): in the recorded output below the train error is roughly 100x smaller
# than the test error; this looks like overfitting, confirm before using this model.
best_model2 = rel_errors_GS("4_GS_object")

train_mae: 0.0038969861269728553
test_mae: 0.3034322689961902
train_mape: 0.38969861269729683
test_mape: 30.34322689962513


In [175]:
# Display the best_params_ attribute of the object loaded for "4_GS_object".
best_model2.best_params_

{'learning_rate': 0.1,
 'max_depth': 50,
 'n_estimators': 100,
 'reg_alpha': 0.05,
 'reg_lambda': 0}