In [1]:
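# TODO: decide which iTransformer source to use - the upstream lucidrains repo (commented out below) or the local checkout added to sys.path.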
#git clone https://github.com/lucidrains/iTransformer.git
#import iTransformer
import sys
sys.path.append('/vol/fob-vol7/nebenf21/reinbene/bene/MA/iTransformer') 
from iTransformer import iTransformer

import torch
import torch.optim as optim
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from pathlib import Path


from utils import data_handling, training_functions
import config 

print("Import succesfull")

Import succesfull


# Sanity checking our iTransformer implementation

We use the same parameters as presented in the paper for a first evaluation of whether our model is actually able
to reproduce the results shown in the original paper.

We take a window size of 96 hours as input and predict different horizons from 96h to 720h. 

The parameters used are in the range of the optimal parameters evaluated in the original paper.

In [2]:
# Forecasting setup: lookback window of 96 steps and the four prediction horizons.
window_size = 96
pred_length = (96, 192, 336, 720)

# Load the electricity dataset and convert it into the three dataloaders.
data_dict = data_handling.load_electricity()
(
    dataloader_train,
    dataloader_validation,
    dataloader_test,
) = data_handling.convert_data(data_dict, window_size, pred_length)

# Length of the training dataloader, shown as the cell output.
len(dataloader_train)

Feature batch shape: torch.Size([32, 96, 348])


131

# Train model on electricity dataset

In [4]:
# Each strategy maps to a two-element flag list; the training cell passes
# element 0 as `use_reversible_instance_norm` and element 1 as
# `reversible_instance_norm_affine`.
normalization_strategies = dict(
    base=[False, False],
    revin=[True, True],
    stationary=[True, False],
)

In [4]:
# Run the experiment for each normalization strategy and save the model and
# evaluation metrics.


def _to_python_number(value):
    """Return `value` as a plain Python number.

    Objects exposing `.item()` (e.g. 0-dim tensors) are unwrapped; anything
    else (such as the plain `0` entries that show up in the logged validation
    metrics) is returned unchanged instead of raising AttributeError.
    """
    return value.item() if hasattr(value, "item") else value


# Hyperparameters shared by every run; defined once because they do not depend
# on the normalization strategy.
best_parameters = {'depth': 2, 'dim': 256, 'dim_head': 56, 'heads': 4, 'attn_dropout': 0.2, 'ff_mult': 4, 'ff_dropout': 0.2,
                   'num_mem_tokens': 4, 'learning_rate': 0.0005}

# Number of training epochs per strategy. Kept under its own name so the loop
# variable `epoch` no longer overwrites the bound it is iterating over.
num_epochs = 15

# Horizon whose test MSE is stored as the checkpoint "loss": the first entry of
# `pred_length` (96 with the current settings).
loss_horizon = pred_length[0]

# select available device (loop-invariant)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for key, value in normalization_strategies.items():
    use_revin, revin_affine = value

    # model configuration for this strategy; only the two normalization flags
    # differ between runs
    model_config = {
        'num_variates': data_dict["train"].size(1),
        'lookback_len': window_size,
        'depth': best_parameters["depth"],
        'dim': best_parameters["dim"],
        'num_tokens_per_variate': 1,
        'pred_length': pred_length,
        'dim_head': best_parameters["dim_head"],
        'heads': best_parameters["heads"],
        'attn_dropout': best_parameters["attn_dropout"],
        'ff_mult': best_parameters["ff_mult"],
        'ff_dropout': best_parameters["ff_dropout"],
        'num_mem_tokens': best_parameters["num_mem_tokens"],
        'use_reversible_instance_norm': use_revin,
        'reversible_instance_norm_affine': revin_affine,
        'flash_attn': True
    }

    print(f"Using device: {device}")

    # defining all needed instances
    model = iTransformer(**model_config).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_parameters["learning_rate"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    writer = SummaryWriter(log_dir=config.CONFIG_LOGS_PATH[key])

    # run model training as mentioned in the original paper
    try:
        for epoch in range(1, num_epochs + 1):
            metrics, best_model = training_functions.train_one_epoch(epoch, model, device, dataloader_train, dataloader_validation, optimizer, scheduler, writer)
    finally:
        # Always flush and release the TensorBoard writer, even if training
        # is interrupted; otherwise one open writer leaks per strategy.
        writer.close()

    # evaluate the model returned by the last training call on the test set
    model = best_model
    metrics = training_functions.fast_eval(model, dataloader_test)

    # convert every metric to a plain Python number once, so the same values
    # feed both the checkpoint and the CSV
    metrics = {
        horizon: {name: _to_python_number(metric) for name, metric in horizon_metrics.items()}
        for horizon, horizon_metrics in metrics.items()
    }

    # save model
    # NOTE(review): optimizer/scheduler state is taken after the final epoch
    # while the weights come from `best_model` - confirm this pairing is intended.
    checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict' : scheduler.state_dict(),
            'epoch': epoch,
            'loss': metrics[loss_horizon]["mse"],
            'global_step_writer' : 0,
        }

    # make sure the target directory exists so a finished run is not lost at save time
    Path(config.CONFIG_MODEL_LOCATION[key]).mkdir(parents=True, exist_ok=True)
    torch.save(checkpoint, f'{config.CONFIG_MODEL_LOCATION[key]}/electricity_{key}_epoch_{epoch}_loss_{checkpoint["loss"]}.pt')

    print(f"Checkpointing succesfull after epoch {epoch} for {key}")

    # convert metrics to dataframe (one row per horizon) and save as csv
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index')

    Path(config.CONFIG_OUTPUT_PATH[key]).mkdir(parents=True, exist_ok=True)
    metrics_df.to_csv(f"{config.CONFIG_OUTPUT_PATH[key]}/metrics_{key}_epochs{epoch}.csv")



Using device: cuda
Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda


Epoch: 1: 100%|██████████| 151/151 [02:23<00:00,  1.05it/s]


Epoch 1, MSE-Loss: 0.0684398826680436, LR: 0.0005


Epoch: Validating: 100%|██████████| 21/21 [00:18<00:00,  1.16it/s]


Validation MAE is {96: {'mse': tensor(0.2269, device='cuda:0')}, 192: {'mse': 0}, 336: {'mse': 0}, 720: {'mse': 0}}


Epoch: 2: 100%|██████████| 151/151 [02:24<00:00,  1.05it/s]


Epoch 2, MSE-Loss: 0.047383620471551716, LR: 0.0005


Epoch: Validating: 100%|██████████| 21/21 [00:15<00:00,  1.34it/s]


Validation MAE is {96: {'mse': tensor(0.2194, device='cuda:0')}, 192: {'mse': 0}, 336: {'mse': 0}, 720: {'mse': 0}}


Epoch: 3: 100%|██████████| 151/151 [02:23<00:00,  1.05it/s]


Epoch 3, MSE-Loss: 0.04356631458989832, LR: 0.0005


Epoch: Validating: 100%|██████████| 21/21 [00:18<00:00,  1.14it/s]


Validation MAE is {96: {'mse': tensor(0.2076, device='cuda:0')}, 192: {'mse': 0}, 336: {'mse': 0}, 720: {'mse': 0}}


Epoch: 4:  83%|████████▎ | 125/151 [01:58<00:24,  1.06it/s]