In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

import holidays

In [2]:
# GET FORMATTED DATA (IN DATE-TIME) AND TRAIN-TEST SPLIT

def date_formatting(bus):

    df = pd.read_csv(f"./data/bus_{bus}_load.csv")
    df.index = pd.to_datetime(df.index, unit='h', origin=pd.Timestamp("2018-01-01"))

    # training_data = df[df.index < split_date] # Splitting data now will cause it to lose 24 training datapoints and 24*7 test datapoints
    # test_data = df[df.index >= split_date]

    return df
    

In [5]:
# GET X,Y SPLIT IN TRAIN, TEST

def split_train_test(x, y, lag_size, split_date="2018-10-16", train_val_split=0.8): # Timestep 6912

    original_timestep = (pd.Timestamp(split_date) - pd.Timestamp("2018-01-01 00:00:00")).total_seconds()/3600 # Get original timestep (hour) from split date
    split_timestep = int(original_timestep - lag_size) # Split timestep in the new dataframe (starts at timestep lag_size)
    
    x_tr = x[:split_timestep]
    y_tr = y[:split_timestep]

    x_train, x_val = x_tr[:int(train_val_split * len(x_tr))], x_tr[int(train_val_split * len(x_tr)):] # Split in train and validation
    y_train, y_val = y_tr[:int(train_val_split * len(y_tr))], y_tr[int(train_val_split * len(y_tr)):]

    x_test = x[split_timestep:]
    y_test = y[split_timestep:]

    return x_train, y_train, x_val, y_val, x_test, y_test

In [37]:
bus1_df = date_formatting(2)

In [54]:
def create_benchmark(df, t, split_date=6912):
    y = df["Load"].to_numpy()[t:]
    y_pred = df["Load"].to_numpy()[:-t]

    y_train_pred, y_test_pred = y_pred[:split_date], y_pred[split_date:]
    y_train, y_test = y[:split_date], y[split_date:]

    return y_train_pred, y_train, y_test_pred, y_test

In [55]:
def get_scores(df, t):

    y_train_pred, y_train, y_test_pred, y_test = create_benchmark(df, t)

    train_score = mean_absolute_error(y_train_pred, y_train)
    test_score = mean_absolute_error(y_test_pred, y_test)

    return {"Train": train_score, "Test": test_score}

In [57]:
allbus_scores = {}

for b in range(1,29):
    bus_df = date_formatting(b)
    scores = {}
    
    for t in [1, 24, 24*7]:
        scores[f"{t}h"] = get_scores(bus_df, t)
    
    allbus_scores[b] = scores
    

In [58]:
with open("./benchmark_scores.pkl", "wb") as f:
    pickle.dump(allbus_scores, f)

In [53]:
print(scores[f"{24*7}h"])

{'Train': 8.738796727916686, 'Test': 12.632438489114529}
