In [2]:
import pandas as pd

import xgboost as xgb

In [3]:
file_path = '../data/train_time_features.pkl'
df = pd.read_pickle(file_path)

In [4]:
df.dropna(inplace=True) #xgbosst cannot handle nans; be careful with such operations;

In [7]:
def train_test_ts(df, relative_train, maximal_lag, horizon):
    '''
    Time series (ts) split function creates a train/test set under consideration of potential overlap between the two due to lag processing
    X_train, y_train, X_test, y_test = ...
    df=must contain target column as "target"; all other columns must be used as features
    percentage_train=how much of the total dataset shall be used for training; must be added between 0 - 1
    maximal_lag=out of all lag feature engineering, enter the maximal lag number
    '''
    k = int(df.shape[0] * relative_train)
    data_train = df.iloc[:k,:]
    #to avoid overlapping of train and test data, a gap of the maximal lag - 1 must be included between the two sets
    data_test = df.iloc[k+maximal_lag:,:]
    
    assert data_train.index.max() < data_test.index.min()
    
    #returns in the sequence X_train, y_train, X_test, y_test
    return (data_train.drop(columns=[f'horizon{horizon}','t CO2-e / MWh'], axis=1), data_train[f'horizon{horizon}'],
            data_test.drop(columns=[f'horizon{horizon}','t CO2-e / MWh'], axis=1), data_test[f'horizon{horizon}'])

In [8]:
def errors(model, X_train, y_train, X_test, y_test):

    train_mae = (sum(abs(y_train - model.predict(X_train)))/len(y_train))
    train_mape = (sum(abs((y_train - model.predict(X_train))/y_train)))*(100/len(y_train))
    train_smape = sum(abs(y_train - model.predict(X_train)))/sum(y_train + model.predict(X_train))

    test_mae = (sum(abs(y_test - model.predict(X_test)))/len(y_test))
    test_mape = (sum(abs((y_test - model.predict(X_test))/y_test)))*(100/len(y_test))
    test_smape = sum(abs(y_test - model.predict(X_test)))/sum(y_test + model.predict(X_test))

    print(f'train_MAE: {train_mae}')
    print(f'test_MAE: {test_mae}')
    
    print(f'train_MAPE: {train_mape}')
    print(f'test_MAPE: {test_mape}')
    
    print(f'train_SMAPE: {train_smape}')
    print(f'test_SMAPE: {test_smape}')

In [9]:
X_train, y_train, X_test, y_test = train_test_ts(df=df, relative_train=0.8, maximal_lag=12, horizon=0)

print(df.columns)

print(X_train.index.max())
print(X_test.index.min())

model = xgb.XGBRegressor(max_depth=5,
                         learning_rate=0.1,
                         num_estimators=100,
                         n_jobs=3,
                         reg_alpha=0.05,
                         reg_lambda=0,
                        )

model.fit(X_train, y_train)

errors(model, X_train, y_train, X_test, y_test)

Index(['t CO2-e / MWh', 'year', 'minute_sin', 'minute_cos', 'hour_sin',
       'hour_cos', 'weekday_sin', 'month_sin', 'month_cos', 'lag1', 'lag2',
       'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10',
       'lag11', 'lag12', 'horizon0'],
      dtype='object')
2015-10-30 20:05:00
2015-10-30 21:10:00


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


train_MAE: 0.19879497626888845
test_MAE: 0.18236447612387316
train_MAPE: inf
test_MAPE: inf
train_SMAPE: 0.14208212403924375
test_SMAPE: 0.1278180010564451
