This notebook can be executed to run all experiments using the darts library.

Training using the Transformer model is unreliable, sometimes returning nan with the same input. 

In [1]:
from darts import TimeSeries
from darts.models import NBEATSModel, NHiTSModel, TransformerModel, TSMixerModel
from darts.utils.losses import *
from darts.metrics import metrics as darts_metrics
from utils import data_handling, helpers
import torch
import numpy as np
import os
import pickle
import pandas as pd
import config
import copy
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import shutil

# Constants
DEVICE = [1]
IN_LEN = 96
OUT_LEN = 96
LOSS_FN = torch.nn.MSELoss()
LAYER_WIDTH = 256
NUM_STACKS = 4
NUM_BLOCKS = 2
NUM_LAYERS = 3
COEFFS_DIM = 5
DROPOUT = 0.25
VERBOSE = True
TRAIN_EPOCHS = 15
TUNE_EPOCHS = 5
four_weeks = -24*7*4
LR = 0.005
BATCH_SIZE = 32

metrics_output_path = config.CONFIG_OUTPUT_PATH["darts"] / "darts_metrics.csv"

model_path = config.CONFIG_MODEL_LOCATION["darts"]




  from tqdm.autonotebook import tqdm


In [2]:
def extend_source_to_target_id_count(source, target):
    source_id_count = source["train"].shape[1]
    target_id_count = target["train"].shape[1]

    full_repeats = target_id_count // source_id_count
    remainder = target_id_count % source_id_count

    repeated_tensor = source["train"].repeat(1, full_repeats)
    remainder_tensor = source["train"][:, :remainder]
    source_train = torch.cat((repeated_tensor, remainder_tensor), dim=1)
    
    assert target_id_count == source_train.size(1), f"Reshaping was incorrect. Target_train = {target_id_count}, source_train = {source_train.size(1)}."

    repeated_tensor = source["validation"].repeat(1, full_repeats)
    remainder_tensor = source["validation"][:, :remainder]
    source_validation = torch.cat((repeated_tensor, remainder_tensor), dim=1)
    assert target_id_count == source_validation.size(1), f"Reshaping was incorrect. Target_val = {target_id_count}, source_val = {source_validation.size(1)}."

    return source_train, source_validation


def process_tl_data(source_data, target_data):
    # either reshape source or target dataset according to which has less IDs
    source_ids = source_data["train"].size(1)
    target_ids = target_data["test"].size(1)

    fine_tune_horizon = -24*7*4
    target_test = target_data["test"]
    target_fine_tuning = target_data["train"][fine_tune_horizon:,:]

    # remove IDs if source is bigger than target or
    # repeat IDs if target is bigger than source
    if target_ids < source_ids:
        source_train = source_data["train"][:,:target_ids]
        source_validation = source_data["validation"][:,:target_ids]
    else:
        source_train, source_validation = extend_source_to_target_id_count(source_data, target_data)

    # convert to TimeSeries dataframe
    source_train = TimeSeries.from_values(source_train)
    source_validation = TimeSeries.from_values(source_validation)
    target_test = TimeSeries.from_values(target_test)
    target_fine_tuning = TimeSeries.from_values(target_fine_tuning)
    target_train = TimeSeries.from_values(target_data["train"])
    target_validation = TimeSeries.from_values(target_data["validation"])

    tl_dataset = {
                    "source_train" : source_train,
                    "source_validation" : source_validation,
                    "target_fine_tuning" : target_fine_tuning,
                    "target_test" : target_test,
                    "target_train" : target_train,
                    "target_validation" : target_validation
                }

    return tl_dataset

In [3]:
def evaluate(model, target_test):
    """
    Evaluates models on target test set
    Input:  -trained model
            -List of target test sets shaped according to models

    Output: Dict{MSE, MAE}
    """

    
    # check for last input point and create input/target lists of 96 horizons
    forecasting_endpoint = int(len(target_test)) - 96*2
    window = [target_test[i:i+96] for i in range(0, forecasting_endpoint, 5)]
    target = [target_test[i+96:i+96+96] for i in range(0, forecasting_endpoint, 5)]

    # predict over dataloader with slidingwindow implementation and 5 time step shifts for each input
    predictions = model.predict(n=96, series=window)

    mse = darts_metrics.mse(predictions, target)
    mae = darts_metrics.mae(predictions, target)

    mse = sum(mse) / len(predictions)
    mae = sum(mae) / len(predictions)

    return {'MSE': mse, 'MAE': mae}



def load_model(model_name:str, setup_name:str="generic", checkpointing=True):
    TRAINER_ARGS = {"enable_progress_bar": True, 
                "accelerator": "gpu",  
                "devices" : DEVICE,
             }
    
    saving_name = model_name+"_"+setup_name

    print(saving_name)

    if model_name == "Transformer":
        model = TransformerModel(
            input_chunk_length=IN_LEN, 
            output_chunk_length=OUT_LEN,
            d_model=LAYER_WIDTH, 
            nhead=4, 
            num_encoder_layers=2, 
            num_decoder_layers=3, 
            dim_feedforward=LAYER_WIDTH, 
            dropout=DROPOUT, 
            activation='relu', 
            loss_fn=LOSS_FN,
            optimizer_kwargs={"lr": LR},
            use_reversible_instance_norm=True,
            pl_trainer_kwargs=TRAINER_ARGS,
            model_name=saving_name,
            save_checkpoints=checkpointing,
            work_dir = model_path,
            batch_size=BATCH_SIZE
            )
        

    if model_name == "TSMixer":
        model = TSMixerModel(
        input_chunk_length=IN_LEN, 
        output_chunk_length=OUT_LEN, 
        hidden_size=LAYER_WIDTH, 
        ff_size=LAYER_WIDTH, 
        num_blocks=NUM_BLOCKS, 
        activation='ReLU', 
        dropout=DROPOUT, 
        loss_fn=LOSS_FN,
        norm_type='LayerNorm', 
        optimizer_kwargs={"lr": LR},
        use_reversible_instance_norm=True,
        pl_trainer_kwargs=TRAINER_ARGS,
        model_name= saving_name,
        save_checkpoints=checkpointing,
        work_dir = model_path,
        batch_size=BATCH_SIZE
    )
        
    if model_name == "NHiTS":
        model = NHiTSModel(
        input_chunk_length=IN_LEN,
        output_chunk_length=OUT_LEN,
        activation='ReLU',
        num_stacks=NUM_STACKS,
        num_blocks=NUM_BLOCKS,
        num_layers=NUM_LAYERS,
        layer_widths=LAYER_WIDTH,
        dropout=DROPOUT,
        loss_fn=LOSS_FN,
        use_reversible_instance_norm=True,
        optimizer_kwargs={"lr": LR},
        pl_trainer_kwargs=TRAINER_ARGS,
        model_name= saving_name,
        save_checkpoints=checkpointing,
        work_dir = model_path,
        batch_size=BATCH_SIZE
    )

    return model

def delete_checkpoint(model_name, setup_name):
    directory_path = model_path / (model_name + "_" + setup_name)
    try:
        shutil.rmtree(directory_path)
        print(f"File {directory_path} deleted successfully.")
    except Exception as e:
        print(f"Error deleting file {directory_path}.")

In [4]:
# use electricity dataset
electricity_dict = data_handling.format_electricity()

for key, value in electricity_dict.items():
			electricity_dict[key]= data_handling.df_to_tensor(value)

# normalize train and use matrics for val and test
electricity_dict["4_weeks_train"] = electricity_dict["train"][four_weeks:,:]
electricity_dict["train"], train_standardize_dict = helpers.custom_standardizer(electricity_dict["train"])
electricity_dict["validation"], _ = helpers.custom_standardizer(electricity_dict["validation"], train_standardize_dict)
electricity_dict["test"], _ = helpers.custom_standardizer(electricity_dict["test"], train_standardize_dict)
electricity_dict["4_weeks_train"], _ = helpers.custom_standardizer(electricity_dict["4_weeks_train"], train_standardize_dict)

# bavaria dataset
data_tensor = data_handling.load_bavaria_electricity()
bavaria_dict, standadizer = data_handling.train_test_split_eu_elec(data_tensor, standardize=True)
bavaria_dict["4_weeks_train"] = bavaria_dict["train"][four_weeks:,:]

# building genome project dataset
data_tensor = data_handling.load_genome_project_data()
gp_dict, standadizer = data_handling.train_test_split_eu_elec(data_tensor, standardize=True)
gp_dict["4_weeks_train"] = gp_dict["train"][four_weeks:,:]

In [5]:
tl_setups = {
    "ELD_to_Bavaria" : (electricity_dict, bavaria_dict), 
    "ELD_to_GP2" : (electricity_dict, gp_dict),
    "Bavaria_to_ELD" : (bavaria_dict, electricity_dict), 
    "Bavaria_to_GP2" : (bavaria_dict, gp_dict), 
    "GP2_to_Bavaria": (gp_dict, bavaria_dict), 
    "GP2_to_ELD" : (gp_dict, electricity_dict)
     }

model_names = ["NHiTS",	"Transformer",	"TSMixer"]
learning_scenarios = ["Zero-Shot", "four_weeks_tl", "full_tl", "full_training", "four_weeks_training"]
metrics = ["MSE", "MAE"]

# Initialize the DataFrame
try:
    results_df = pd.read_csv(metrics_output_path, index_col=[0, 1, 2])
except FileNotFoundError:
    index = pd.MultiIndex.from_product([tl_setups.keys(), learning_scenarios, metrics], names=["Setup", "Learning_scenario", "Metric"])
    results_df = pd.DataFrame(columns=model_names, index=index)

# Helper functions
def update_metrics(setup_name, model_name, learning_scenario, mae, mse):
    results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
    results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse

def is_metric_filled(setup_name, model_name, learning_scenario):
    # Check if specific metrics for a model in a setup and fine-tuning scenario are NaN or not
    metrics_filled = not results_df.loc[(setup_name, learning_scenario, slice(None)), model_name].isnull().any()
    return metrics_filled

In [6]:
for setup_name, (source_data, target_data) in tl_setups.items():
    # create ts_format
    tl_data = process_tl_data(source_data, target_data)
   
    source_train = tl_data["source_train"]
    source_val = tl_data["source_validation"]
    target_fine_tuning = tl_data["target_fine_tuning"]
    target_test = tl_data["target_test"]
    target_train = tl_data["target_train"]
    target_val = tl_data["target_validation"]

    # select model
    for model_name in model_names:
        delete_checkpoint(model_name, setup_name)

        trained_model = False

        # short fine tuning
        if not is_metric_filled(setup_name, model_name, "Zero-Shot") :
            # zero shot
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=True)
            model.fit(source_train, val_series=source_val, epochs=TRAIN_EPOCHS, verbose=VERBOSE)
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=False)
            best_model = model.load_from_checkpoint(work_dir=model_path, model_name=model_name+"_"+setup_name, best=True)
            metrics = evaluate(best_model, target_test)
            update_metrics(setup_name, model_name, "Zero-Shot", metrics['MAE'], metrics['MSE'])
            results_df.to_csv(metrics_output_path)
            trained_model = True
        
        # short fine tuning
        if not is_metric_filled(setup_name, model_name, "four_weeks_tl") :
            if trained_model == False:
                model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=True)
                model.fit(source_train, val_series=source_val, epochs=TRAIN_EPOCHS, verbose=VERBOSE)
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=False)
            best_model = model.load_from_checkpoint(work_dir=model_path, model_name=model_name+"_"+setup_name, best=True)
            print(best_model.epochs_trained)
            best_model.fit(target_fine_tuning, epochs=best_model.epochs_trained + TUNE_EPOCHS)
            metrics = evaluate(best_model, target_test)
            update_metrics(setup_name, model_name, "four_weeks_tl", metrics['MAE'], metrics['MSE'])
            results_df.to_csv(metrics_output_path)
            trained_model = True

        # long fine tuning
        if not is_metric_filled(setup_name, model_name, "full_tl") :
            if trained_model == False:
                model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=True)
                model.fit(source_train, val_series=source_val, epochs=TRAIN_EPOCHS, verbose=VERBOSE)
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=False)
            best_model = model.load_from_checkpoint(work_dir=model_path, model_name=model_name+"_"+setup_name, best=True)
            best_model.fit(target_train, epochs=best_model.epochs_trained + TUNE_EPOCHS)
            metrics = evaluate(best_model, target_test)
            update_metrics(setup_name, model_name, "full_tl", metrics['MAE'], metrics['MSE'])
            results_df.to_csv(metrics_output_path)


        delete_checkpoint(model_name, setup_name)

        # short baseline
        if not is_metric_filled(setup_name, model_name, "full_training") :
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=True)
            model.fit(target_fine_tuning, val_series=target_val, epochs=TRAIN_EPOCHS, verbose=VERBOSE)
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=False)
            best_model = model.load_from_checkpoint(work_dir=model_path, model_name=model_name+"_"+setup_name, best=True)
            metrics = evaluate(best_model, target_test)
            update_metrics(setup_name, model_name, "full_training", metrics['MAE'], metrics['MSE'])
            delete_checkpoint(model_name, setup_name)
            results_df.to_csv(metrics_output_path)

        # long baseline
        if not is_metric_filled(setup_name, model_name, "four_weeks_training") :
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=True)
            model.fit(target_train, val_series=target_val, epochs=TRAIN_EPOCHS, verbose=VERBOSE)
            model = load_model(model_name=model_name, setup_name=setup_name, checkpointing=False)
            best_model = model.load_from_checkpoint(work_dir=model_path, model_name=model_name+"_"+setup_name, best=True)
            metrics = evaluate(best_model, target_test)
            update_metrics(setup_name, model_name, "four_weeks_training", metrics['MAE'], metrics['MSE'])
            delete_checkpoint(model_name, setup_name)
            results_df.to_csv(metrics_output_path)


Error deleting file /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria.
NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name          | Type             | Params
-----------------------------

Epoch 14: 100%|██████████| 151/151 [00:05<00:00, 30.07it/s, train_loss=1.080, val_loss=0.781]

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|██████████| 151/151 [00:06<00:00, 22.66it/s, train_loss=1.080, val_loss=0.781]
NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting DataLoader 0: 100%|██████████| 17/17 [00:00<00:00, 26.35it/s]


  results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
  results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse


NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria/checkpoints exists and is not empty.
Restoring states from the checkpoint path at /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria/checkpoints/best-epoch=0-val_loss=0.55.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


0


Restored all states from the checkpoint at /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria/checkpoints/best-epoch=0-val_loss=0.55.ckpt


Epoch 1: 100%|██████████| 16/16 [00:00<00:00, 30.94it/s, train_loss=0.00164]

/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:383: `ModelCheckpoint(monitor='val_loss')` could not find the monitored key in the returned metrics: ['train_loss', 'epoch', 'step']. HINT: Did you call `log('val_loss', value)` in the `LightningModule`?


Epoch 4: 100%|██████████| 16/16 [00:00<00:00, 21.31it/s, train_loss=0.000355]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 16/16 [00:02<00:00,  6.24it/s, train_loss=0.000355]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting DataLoader 0: 100%|██████████| 17/17 [00:00<00:00, 22.79it/s]


  results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
  results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse


NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria/checkpoints exists and is not empty.
Restoring states from the checkpoint path at /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria/checkpoints/best-epoch=0-val_loss=0.55.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Epoch 4: 100%|██████████| 304/304 [00:11<00:00, 26.64it/s, train_loss=0.000224]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 304/304 [00:13<00:00, 23.32it/s, train_loss=0.000224]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting DataLoader 0: 100%|██████████| 17/17 [00:01<00:00, 15.92it/s]


  results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
  results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse
/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


File /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria deleted successfully.
NHiTS_ELD_to_Bavaria



  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | MSELoss          | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | rin           | RINorm           | 118   
4 | stacks        | ModuleList       | 13.5 M
---------------------------------------------------
12.1 M    Trainable params
1.5 M     Non-trainable params
13.5 M    Total params
54.148    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 16/16 [00:00<00:00, 18.77it/s, train_loss=9.44e-5, val_loss=0.000355] 

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|██████████| 16/16 [00:02<00:00,  6.16it/s, train_loss=9.44e-5, val_loss=0.000355]
NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting DataLoader 0: 100%|██████████| 17/17 [00:00<00:00, 21.55it/s]


  results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
  results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse


File /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria deleted successfully.
NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | MSELoss          | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | rin           | RINorm           | 118   
4 | stacks        | ModuleList       | 13.5 M
---------------------------------------------------
12.1 M    Trainable params
1.5 M     Non-trainable params


Epoch 14: 100%|██████████| 304/304 [00:10<00:00, 29.72it/s, train_loss=0.000162, val_loss=0.000283]

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|██████████| 304/304 [00:13<00:00, 23.22it/s, train_loss=0.000162, val_loss=0.000283]
NHiTS_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting DataLoader 0: 100%|██████████| 17/17 [00:00<00:00, 22.36it/s]


  results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
  results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse


File /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/NHiTS_ELD_to_Bavaria deleted successfully.
Error deleting file /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/Transformer_ELD_to_Bavaria.
Transformer_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name                | Type                | Params
------------------------------------------------------------
0 | criterion           | MSELoss             | 0     
1 | train_metrics       | MetricCollection    | 0     
2 | val_metrics         | MetricCollection    | 0     
3 | rin                 | RINorm              | 118   
4 | encoder             | Linear              | 15.4 K
5 | positional_encoding | _PositionalEncoding |

Epoch 14: 100%|██████████| 151/151 [00:07<00:00, 19.49it/s, train_loss=1.300, val_loss=0.781]

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: 100%|██████████| 151/151 [00:09<00:00, 16.75it/s, train_loss=1.300, val_loss=0.781]
Transformer_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


Predicting DataLoader 0: 100%|██████████| 17/17 [00:00<00:00, 39.01it/s]


  results_df.loc[(setup_name, learning_scenario, "MAE"), model_name] = mae
  results_df.loc[(setup_name, learning_scenario, "MSE"), model_name] = mse


Transformer_ELD_to_Bavaria


/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/py ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/vol/fob-vol7/nebenf21/reinbene/bene/MA/myenv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/Transformer_ELD_to_Bavaria/checkpoints exists and is not empty.
Restoring states from the checkpoint path at /vol/fob-vol7/nebenf21/reinbene/bene/MA/outputs/models/darts/Transformer_ELD_to_Bavaria/checkpoints/best-epoch=12-val_loss=0.78.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVI

0


MisconfigurationException: You restored a checkpoint with current_epoch=12, but you have set Trainer(max_epochs=5).