## Modeling

For the model, I will use Temporal Fusion Transformer from PyTorchForecasting library. This library yields several class and methods to facilitate the work with TimeSeries with excellents results.
Initially, I will use TimeSeriesDataset to create a PyTorch dataset (my TimeSeries) and then use DataLoaders to create batches to  fit the model.</br>
More information about this model can be found: 
[Temporal Fusion Transformer ](https://pytorch-forecasting.readthedocs.io/en/latest/api/pytorch_forecasting.models.temporal_fusion_transformer.TemporalFusionTransformer.html)

In [3]:
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer, EncoderNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss, MAE, MAPE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [4]:
data = pd.read_csv('./output/dataset.csv')
data.head()

# group_dict = {'60190': 0,
#               '20209': 1,
#               '70165': 2,
#               '50215': 3,
#               '70208': 4,
#               '70271': 5
#              }

Unnamed: 0,item_code,quantity,avg_price,date,min_temp,max_temp,log_quantity,month,sma4,time_idx,group_ids,quantity_scaled
0,20209,109.0,6.99,2017-01-01,35.0,76.0,4.691348,1,,0,1,0.111588
1,50215,422.0,9.845294,2017-01-01,35.0,76.0,6.045005,1,,0,3,0.447425
2,70165,471.0,9.458725,2017-01-01,35.0,76.0,6.154858,1,,0,2,0.5
3,70208,145.0,10.067692,2017-01-01,35.0,76.0,4.976734,1,,0,4,0.150215
4,20209,136.0,7.144666,2017-02-01,42.0,83.0,4.912655,2,,1,1,0.140558


In [7]:
# Create Dataset and Dataloaders
max_encoder_length = 24
max_prediction_length = 6
training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],    
    time_idx="time_idx",
    target="quantity_scaled",
    #     categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["group_ids"],
    # only unknown variable is "value" - and N-Beats can also not take any additional variables
    time_varying_unknown_reals=["quantity_scaled"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length
)

# validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff + 1)
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

In [None]:
# # load the first batch
# x, y = next(iter(train_dataloader))
# print("x =", x)
# print("\ny =", y)
# print("\nsizes of x =")
# for key, value in x.items():
#     print(f"\t{key} = {value.size()}")

In [8]:
# calculate baseline mean absolute error, i.e. predict next value as the last available value from the history
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
baseline_predictions = Baseline().predict(val_dataloader)
MAE()(baseline_predictions, actuals)

tensor(0.1511)

In [10]:
# Training the model (TemporalFusionTransformer)

# configure network and trainer
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=True, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

trainer = pl.Trainer(max_epochs=10,
                     min_epochs=5,
                     gpus=0,
                     weights_summary="top",
                     gradient_clip_val=0.1,
                     limit_train_batches= 200,  # coment in for training, running valiation every 30 batches
                     # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
                     #     callbacks=[lr_logger, early_stop_callback],
                     logger=logger
                    )


tft_model = TemporalFusionTransformer.from_dataset(training,
                                                   learning_rate=5.88e-06,
                                                   hidden_size=20,
                                                   attention_head_size=1,
                                                   dropout=0.2,
                                                   hidden_continuous_size=8,
                                                   #     output_size=7,  # 7 quantiles by default
                                                   loss=MAPE(),
                                                   #     loss=
                                                   log_interval=10,  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
                                                   reduce_on_plateau_patience=4
                                                  )
print(f"Number of parameters in network: {tft_model.size()/1e3:.1f}k")

# Fit the model
trainer.fit(tft_model, train_dataloader, val_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Number of parameters in network: 22.4k



   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | MAPE                            | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 0     
3  | prescalers                         | ModuleDict                      | 16    
4  | static_variable_selection          | VariableSelectionNetwork        | 0     
5  | encoder_variable_selection         | VariableSelectionNetwork        | 620   
6  | decoder_variable_selection         | VariableSelectionNetwork        | 0     
7  | static_context_variable_selection  | GatedResidualNetwork            | 1.7 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 1.7 K 
9  | static_context_initial_cell_lstm   | GatedResidualNetwork            | 1.7 

Validation sanity check: 0it [00:00, ?it/s]

StopIteration: 

In [None]:
# Model Evaluation
# Select best model according to loss
# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [None]:
# calcualte mean absolute error on validation dataset
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = best_tft_model.predict(val_dataloader)
(actuals - predictions).abs().mean()

In [None]:
# raw predictions are a dictionary from which all kind of information including quantiles can be extracted
raw_predictions, x = best_tft_model.predict(val_dataloader, mode="raw", return_x=True)

In [None]:
for idx in range(5):  # plot 10 examples
    best_tft_model.plot_prediction(x, raw_predictions, idx=idx, add_loss_to_title=True);

In [None]:
predictions, x = best_tft_model.predict(val_dataloader, return_x=True)
predictions_vs_actuals = best_tft_model.calculate_prediction_actual_by_variable(x, predictions)
best_tft_model.plot_prediction_actual_by_variable(predictions_vs_actuals);

In [None]:
interpretation = best_tft_model.interpret_output(raw_predictions, reduction="sum")
best_tft_model.plot_interpretation(interpretation)

In [None]:
X, y = next(iter(val_dataloader))
predictions, x = best_tft_model.predict(val_dataloader, return_x=True)
predictions_vs_actuals = best_tft_model.calculate_prediction_actual_by_variable(x, predictions)
best_tft_model.plot_prediction_actual_by_variable(predictions_vs_actuals);