Time Series Forecast with NHiTS on the Vierlinden dataset (all sensors, 2021)

In [1]:
import sys
sys.path.append('./pytorch-forecasting/')
import pandas as pd
import numpy as np
import torch
import lightning.pytorch as pl
from pytorch_forecasting import NHiTS, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
pl.seed_everything(42)

Global seed set to 42


42

### Load dataset

In [2]:
# Read the dataset into a DataFrame
data = pd.read_csv('./RIWWER/Vierlinden/Vierlinden_2021_All.csv')

# Drop columns that have lots of missing values
data.drop(["FLP_Hohenstand_Pumpensumpf_pval","FLP_Strom_P3_pval","FLP_Strom_P4_pval","FLP_Strom_P5_pval","Durchfluss SWP1 und SWP2_pval","FLP_Hohenstand_Becken1_pval","FLP_Hohenstand_Becken3_pval","FLP_Hohenstand_Beckne2_pval"], axis=1, inplace=True)

# NaNs are not allowed by the model
data.fillna(method="bfill", inplace=True)
data.fillna(method="ffill", inplace=True)

# Set Datetime as index
data['Datetime'] = pd.to_datetime(data['Datetime'])

# One time series for the whole year?
data['series'] = 0

# As many timesteps per timeseries as hours in every month?
time_idx = []
for i in range(1):
    timesteps = len( data[ data['series'] == i ] )
    time_idx += list(range(timesteps))
data['time_idx'] = time_idx

In [3]:
data.head()

Unnamed: 0,Datetime,Schieber Position_pval,Oberwasser_pval,Unterwasser_pval,Durchflumenge_pval,Berechnete Durchflussmenge_pval,Fllstand SWS_pval,Fllstand RWS_pval,Strom P1_pval,Strom P2_pval,...,Strom Pumpe 2_pval1,Strom Pumpe 3_pval,Niederschlag,Füllstand_RRB,Entleerung_RüB,Füllstand_RüB_1,Füllstand_RüB_2,Füllstand_RüB_3,series,time_idx
0,2021-01-01 00:00:00,100.0,8.140845,5.753623,7.689189,7.732558,75.717949,36.0,1.076923,0.0,...,0.0,1.0,0.0,1.47,0.098,3.16,3.08,2.72,0,0
1,2021-01-01 01:00:00,100.0,8.0,5.173913,6.808219,8.271739,75.717949,36.0,1.076923,0.0,...,0.0,1.0,1.182353,1.47,0.099,3.16,3.08,2.72,0,1
2,2021-01-01 02:00:00,100.0,7.967742,5.0,5.813333,7.197674,75.717949,36.0,1.076923,0.0,...,0.0,1.0,1.182353,1.47,0.096,3.16,3.08,2.72,0,2
3,2021-01-01 03:00:00,100.0,7.076923,4.84375,4.216216,4.743243,75.717949,36.0,1.076923,0.0,...,0.0,1.0,1.182353,1.47,0.098,3.16,3.08,2.72,0,3
4,2021-01-01 04:00:00,100.0,8.464789,5.466667,8.384615,8.325,75.717949,36.0,1.076923,0.0,...,0.0,1.0,1.182353,1.47,0.098,3.16,3.08,2.72,0,4


In [4]:
# Parameters for dataloaders
max_encoder_length = 24*2
max_prediction_length = 5*2
training_cutoff = data["time_idx"].max() * 4 // 5 # 80% for training
context_length = max_encoder_length
prediction_length = max_prediction_length
batch_size = 32

In [5]:
# Load best model (from NHits_Vierlinden_Train.ipynb)
best_model_path = './RIWWER/torch_forecasting/model_checkpoints/NHits_Vierlinden_saved_datasets/lightning_logs/version_0/checkpoints/epoch=14-step=2250.ckpt'
best_model = NHiTS.load_from_checkpoint(best_model_path)

  rank_zero_warn(
  rank_zero_warn(


### Corrupt sensor clusters

Note: there appears to be a problem with NHits when dealing with covariates. The prediction stays the same even when replacing all sensor values with 0!
See issues:
https://github.com/jdb78/pytorch-forecasting/issues/1065
https://github.com/jdb78/pytorch-forecasting/issues/1071

In [6]:
# Sensor clusters
# Weather and rain tanks: ['Niederschlag', 'Füllstand_RRB', 'Füllstand_RüB_1', 'Füllstand_RüB_2', 'Füllstand_RüB_3']
sensors = {
    "Herzogstr":     ['Schieber Position_pval', 'Oberwasser_pval', 'Unterwasser_pval', 'Durchflumenge_pval', 'Berechnete Durchflussmenge_pval'],
    #"Kaiserstr":     ['Fllstand SWS_pval', 'Fllstand RWS_pval', 'Strom P1_pval', 'Strom P2_pval', 'Strom P3_pval', 'Strom P4_pval', 'Strom P5_pval', 'Strom P6_pval'],
    #"Kreuzweg":      ['Fllstand Pumpensumpf_pval', 'Strom Pumpe 1_pval', 'Strom Pumpe 2_pval'],
    #"Vierlindenhof": ['Fllstand Pumpensumpf_pval1', 'Strom Pumpe 1_pval1', 'Strom Pumpe 2_pval1', 'Strom Pumpe 3_pval']
}

In [7]:
# Turn off sensors iteratively according to the cluster location
for cluster in sensors:
    print("\n\n++++++++++ Turning off " + cluster + " ++++++++++\n")

    ### Create Dataloader for corrupted dataset ###
    validation = TimeSeriesDataSet(
        data[lambda x: x.time_idx > training_cutoff],
        target_normalizer="auto",
        time_idx="time_idx",
        target="Entleerung_RüB",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        time_varying_unknown_reals=list(set(data.columns) - {'Datetime', 'series', 'time_idx'}),
        max_encoder_length=context_length,
        min_encoder_length=max_encoder_length,
        max_prediction_length=prediction_length,
        min_prediction_length=max_prediction_length,
        allow_missing_timesteps=True
    )
    loaded_validation = validation.load('./RIWWER/Vierlinden/val_set')

    # Note: After replacing the feature with zeros, the TimeSeriesDataset applies 'transform_values'
    #       which leads to all the values in that column being replaced not by 0, but by another rescaled value
    #       You may want to comment that line out in timeseries.py
    for column in sensors[cluster]:
        loaded_validation.set_overwrite_values(10, column, 'all')

    val_dataloader = loaded_validation.to_dataloader(train=False, batch_size=batch_size, num_workers=18)

    ### Make prediction and evaluate ###
    actuals = torch.cat([y[0] for x, y in iter(val_dataloader)]).to(torch.device('cuda:0'))
    predictions = best_model.predict(val_dataloader,
                                     trainer_kwargs=dict(default_root_dir="./RIWWER/torch_forecasting/model_checkpoints/NHits_Vierlinden_saved_datasets"))
    print(column)
    err = actuals - predictions
    mae = err.abs().mean()
    print('MAE = ' + str(mae))
    rmse = torch.sqrt( torch.square(err).mean() )
    print('RMSE = ' + str(rmse))



++++++++++ Turning off Herzogstr ++++++++++



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Berechnete Durchflussmenge_pval
MAE = tensor(1.4225, device='cuda:0')
RMSE = tensor(8.2007, device='cuda:0')


In [10]:
loaded_validation.index

Unnamed: 0,time_first,time_last,time_diff_to_next,index_start,time,count,sequence_id,index_end,sequence_length
6960,6960,8759,1,0,6960,1800,0,57,58
6961,6960,8759,1,1,6961,1800,0,58,58
6962,6960,8759,1,2,6962,1800,0,59,58
6963,6960,8759,1,3,6963,1800,0,60,58
6964,6960,8759,1,4,6964,1800,0,61,58
...,...,...,...,...,...,...,...,...,...
8698,6960,8759,1,1738,8698,1800,0,1795,58
8699,6960,8759,1,1739,8699,1800,0,1796,58
8700,6960,8759,1,1740,8700,1800,0,1797,58
8701,6960,8759,1,1741,8701,1800,0,1798,58


In [8]:
for i in range(len(loaded_validation.data['reals'][0])):
    loaded_validation.data['reals'][:, i] = 0.0

    val_dataloader = loaded_validation.to_dataloader(train=False, batch_size=batch_size, num_workers=18)

    ### Make prediction and evaluate ###
    actuals = torch.cat([y[0] for x, y in iter(val_dataloader)]).to(torch.device('cuda:0'))
    predictions = best_model.predict(val_dataloader,
                                     trainer_kwargs=dict(default_root_dir="./RIWWER/torch_forecasting/model_checkpoints/NHits_Vierlinden_saved_datasets"))
    print(str(i+1) + ' sensors shut down.')
    err = actuals - predictions
    mae = err.abs().mean()
    print('MAE = ' + str(mae))
    rmse = torch.sqrt( torch.square(err).mean() )
    print('RMSE = ' + str(rmse))
    del val_dataloader

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


1 sensors shut down.
MAE = tensor(1.4225, device='cuda:0')
RMSE = tensor(8.2007, device='cuda:0')


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


2 sensors shut down.
MAE = tensor(1.4225, device='cuda:0')
RMSE = tensor(8.2007, device='cuda:0')


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Exception ignored in: <function _releaseLock at 0x7fbb74b0eb60>
Traceback (most recent call last):
  File "/home/tchiaburu/anaconda3/envs/torch_gpu_riwwer_new/lib/python3.11/logging/__init__.py", line 237, in _releaseLock
    def _releaseLock():
    
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 3611413, 3611449, 3611485, 3611521, 3611557, 3611593, 3611629, 3611665, 3611701, 3611737, 3611773, 3611809, 3611845, 3611881, 3611917) exited unexpectedly

In [None]:
loaded_validation.data['reals'][:, 0] = 0.0
loaded_validation.data['reals']

In [None]:
loaded_validation.time_varying_unknown_reals

In [None]:
for x, y in iter(val_dataloader):
    t = x['encoder_cont'][0][0]
    t[0] = 0.0

In [None]:
t[0] = 0.0

In [None]:
t

In [None]:
for x, y in iter(val_dataloader):
    print(x['encoder_cont'][0][0])

In [None]:
len(x['encoder_cont'][0][0])

In [None]:
for x, y in iter(val_dataloader):
    for t in x['encoder_cont'][0][0]:
        #t[0] = 0.0
        print(t.item())
    break

In [None]:
loaded_validation._overwrite_values