In [1]:
import pmdarima as Arima
import config
import pickle
import torch

from utils import data_handling, helpers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Train ARIMA and predict

For every ID in all three datasets, we fit an ARIMA model and make predictions over every 96-time-step window of each dataset's test set.

For computational reasons, we use only the last 2000 data points of the train set for model training. We add multiple covariates: lagged copies of the series (lags of 1, 24, 48, 72, and 96 steps) and a sine/cosine time-of-day encoding.

In [2]:
# --- Electricity dataset ---
electricity_dict = data_handling.format_electricity()

# Replace every split with its tensor counterpart (keys stay the same).
for split_name, split_frame in electricity_dict.items():
    electricity_dict[split_name] = data_handling.df_to_tensor(split_frame)

# Standardize the train split first; the object returned alongside it is then
# passed back in so validation and test are standardized with it rather than
# with their own values.
train_standardize_dict = None
electricity_dict["train"], train_standardize_dict = helpers.custom_standardizer(electricity_dict["train"])
for split_name in ("validation", "test"):
    electricity_dict[split_name], _ = helpers.custom_standardizer(electricity_dict[split_name], train_standardize_dict)

# --- Bavaria dataset ---
data_tensor = data_handling.load_bavaria_electricity()
bavaria_dict, standadizer = data_handling.train_test_split_eu_elec(data_tensor, standardize=True)

# --- Building genome project dataset ---
# NOTE: `data_tensor` and `standadizer` are reused, so the Bavaria values are
# overwritten from here on.
data_tensor = data_handling.load_genome_project_data()
gp_dict, standadizer = data_handling.train_test_split_eu_elec(data_tensor, standardize=True)

In [3]:
def lag_tensor(df, lag):
    """Shift a 2-D tensor down along dim 0 by ``lag`` rows, zero-filling the top.

    Args:
        df: 2-D tensor. Rows (dim 0) are shifted; columns are left untouched.
        lag: Number of rows to shift by. Values <= 0 return ``df`` unchanged.

    Returns:
        A tensor with the same shape, dtype and device as ``df``. Its first
        ``min(lag, rows)`` rows are zeros and the remaining rows are the
        leading rows of ``df``. If ``lag`` is at least the number of rows,
        the result is all zeros.
    """
    if lag <= 0:
        return df

    n_rows = df.size(0)
    # Clamp the padding so the output never has more rows than the input.
    pad_rows = min(lag, n_rows)
    # new_zeros inherits dtype and device from df, so the concatenation does
    # not depend on the global default dtype or on df living on the CPU.
    padding = df.new_zeros(pad_rows, df.size(1))
    return torch.cat((padding, df[: n_rows - pad_rows]), dim=0)

def create_lagged(df):
    """Build the covariate tensor used as exogenous input for ARIMA.

    Args:
        df: 2-D tensor indexed as ``[time step, id]`` (the original comment
            gives ``[2929, 348]`` as an example shape).

    Returns:
        Tensor of shape ``[length, ids, 7]``. Along the last dim the features
        are: the series lagged by 1, 24, 48, 72 and 96 steps (zero-filled at
        the start), followed by the sine and cosine time-of-day encodings.
    """
    # Lagged copies of the series: one step back, then one to four periods of 24.
    lagged = [lag_tensor(df, lag) for lag in (1, 24, 24 * 2, 24 * 3, 24 * 4)]

    length = df.size(0)
    ids = df.size(1)

    # Sine/cosine encoding for the 24 positions of the daily cycle.
    hours = torch.arange(0, 24, device=df.device)
    sin_encodings = torch.sin(2 * torch.pi * hours / 24)
    cos_encodings = torch.cos(2 * torch.pi * hours / 24)

    # Position of every row within the 24-step cycle. Indexing with this always
    # yields exactly `length` rows; the earlier tiling by `length // 23` came up
    # short for inputs under 528 rows and made torch.stack fail.
    # NOTE(review): assumes row 0 is hour 0 and one row per hour - confirm.
    hour_of_step = torch.arange(length, device=df.device) % 24
    time_of_day_sin = sin_encodings[hour_of_step].unsqueeze(1).expand(length, ids)
    time_of_day_cos = cos_encodings[hour_of_step].unsqueeze(1).expand(length, ids)

    return torch.stack((*lagged, time_of_day_sin, time_of_day_cos), dim=2)

In [4]:
def process_and_predict(df, dataset_name, data_split_description):
    """Fit one ARIMA model per id and score it on consecutive 96-step test windows.

    For every id (column) of ``df["train"]`` a model is fitted with
    ``auto_arima`` on the last 2000 training rows, using the covariates from
    ``create_lagged`` as exogenous input. The model then forecasts each
    non-overlapping 96-step window of ``df["test"]``, and MSE, MAE and MAPE
    are averaged over the windows.

    Results are appended to a list that is pickled after every id, so an
    interrupted run resumes at the first id without a stored result. Entries
    loaded from an existing file are kept as they are.

    Args:
        df: Mapping with at least ``"train"`` and ``"test"`` tensors indexed
            as ``[time step, id]``.
        dataset_name: Name inserted into the output file name.
        data_split_description: Suffix inserted into the output file name.

    Returns:
        None. The per-id ``[mse, mae, mape]`` lists are written to the pickle
        file under ``config.CONFIG_OUTPUT_PATH["arima"]``.
    """
    horizon = 96          # forecast window length in time steps
    train_window = 2000   # only the most recent training rows are used for fitting

    num_96_horizons = df["test"][:, 0].shape[0] // horizon
    lagged_covariates_train = create_lagged(df["train"])
    lagged_covariates_test = create_lagged(df["test"])

    filename = config.CONFIG_OUTPUT_PATH["arima"] / f'arima_{dataset_name}_predictions{data_split_description}_new.pkl'

    # Load earlier results if available.
    # NOTE: pickle.load executes arbitrary code from the file; only point this
    # at result files produced by this notebook.
    try:
        with open(filename, 'rb') as file:
            prediction_list = pickle.load(file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing or truncated cache: start from scratch. Anything else
        # (e.g. KeyboardInterrupt, permission problems) is allowed to surface.
        print("no predictions available.")
        prediction_list = []

    # Fit a model for each id and iterate over the test split for inference.
    for series_idx in range(len(prediction_list), df["train"].size(1)):
        # NOTE(review): newer pmdarima releases name this argument `X` instead
        # of `exogenous` - verify against the installed version.
        model = Arima.auto_arima(
            df["train"][-train_window:, series_idx],
            exogenous=lagged_covariates_train[-train_window:, series_idx, :],
            stepwise=True,
            seasonal=True,
            m=24,
            maxiter=3,
        )

        sum_mse = 0
        sum_mae = 0
        sum_mape = 0
        for i in range(num_96_horizons):
            time_step = i * horizon
            target = df["test"][time_step : time_step + horizon, series_idx]

            lagged_window_test = lagged_covariates_test[time_step : time_step + horizon, series_idx, :]
            # NOTE(review): the model is not updated with test observations
            # between windows - confirm that forecasting every window from the
            # fitted state is intended.
            forecasts = model.predict(n_periods=horizon, return_conf_int=False, exogenous=lagged_window_test, alpha=0.1)

            # sklearn metrics take (y_true, y_pred). The order matters for
            # MAPE, which divides by |y_true|.
            sum_mse += mean_squared_error(target, forecasts)
            sum_mae += mean_absolute_error(target, forecasts)
            # Accumulate; plain assignment kept only the last window's value.
            sum_mape += mean_absolute_percentage_error(target, forecasts)

        prediction_list.append([sum_mse / num_96_horizons, sum_mae / num_96_horizons, sum_mape / num_96_horizons])
        print(f"MSE of {series_idx}: ", sum_mse / num_96_horizons, "MAE:", sum_mae / num_96_horizons)

        # Persist after every id so an interrupted run can resume.
        with open(filename, 'wb') as file:
            pickle.dump(prediction_list, file)

In [5]:
# Fit and evaluate ARIMA on each of the three datasets
# (each model is trained on 2000 time steps).
process_and_predict(df=electricity_dict, dataset_name="electricity", data_split_description="full")
process_and_predict(df=bavaria_dict, dataset_name="bavaria", data_split_description="full")
process_and_predict(df=gp_dict, dataset_name="genome_project", data_split_description="full")

MSE of 1045:  3.743397301046462 MAE: 1.7709848320762598
MSE of 1046:  2.0035961656385766 MAE: 1.186081797254964
MSE of 1047:  0.7317639544441809 MAE: 0.4430142489238984
MSE of 1048:  1.2978196723126663 MAE: 0.9033094802445929
MSE of 1049:  1.4352172072199374 MAE: 0.9299916789074498
MSE of 1050:  0.9458829255018649 MAE: 0.7028529389411741
MSE of 1051:  0.5897052524286407 MAE: 0.5500772314190294
MSE of 1052:  1.0647105983117293 MAE: 0.7330611595433355
MSE of 1053:  1.8759848881373669 MAE: 1.2537845755031105
MSE of 1054:  0.8730469690460012 MAE: 0.8134619581436283
MSE of 1055:  1.4415051195004622 MAE: 1.2006269693374634




MSE of 1056:  0.5497402316977861 MAE: 0.6277373598901604
MSE of 1057:  1.387017469986565 MAE: 0.8986977890452722
MSE of 1058:  0.8574309693285864 MAE: 0.6702595645917718
MSE of 1059:  0.7373556407430423 MAE: 0.704047061593814
MSE of 1060:  4.3654037336058975 MAE: 1.8570150284293903
MSE of 1061:  1.1845526273765594 MAE: 0.7956855346316755
MSE of 1062:  0.9324574457590267 MAE: 0.7962995495367091
MSE of 1063:  0.7324645768014081 MAE: 0.7199205169708931
MSE of 1064:  0.9617324118069965 MAE: 0.7864263179346934
MSE of 1065:  1.0948213094372465 MAE: 0.8104201571366939
MSE of 1066:  1.4602990114956842 MAE: 0.8917560379460328
MSE of 1067:  0.9782511776833169 MAE: 0.8053017244098073
MSE of 1068:  0.7507103113571654 MAE: 0.7610999953346089
MSE of 1069:  0.9283812112129468 MAE: 0.7777460822547437
MSE of 1070:  1.1483807100967005 MAE: 0.8562584308415863
MSE of 1071:  0.6557821580901193 MAE: 0.6002303249289798
MSE of 1072:  1.2733377636483165 MAE: 0.8825217423852215
MSE of 1073:  0.7788393913721009 