In [None]:
# default_exp models.transformer.informer

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Informer

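> Informer: a Transformer-based forecaster built on ProbSparse self-attention, together with a PyTorch Lightning wrapper for training it.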

In [None]:
#export
import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch import optim

from neuralforecast.models.components.transformer import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer
from neuralforecast.models.components.selfattention import (
    TriangularCausalMask, ProbMask, 
    FullAttention, ProbAttention, AttentionLayer
)
from neuralforecast.models.components.embed import DataEmbedding
from neuralforecast.losses.utils import LossFunction

In [None]:
#export
class _Informer(nn.Module):
    """
    Informer with ProbSparse attention in O(LlogL) complexity.

    Encoder-decoder network: both inputs go through a `DataEmbedding`, the
    encoder stacks `e_layers` ProbAttention layers (with `e_layers - 1`
    `ConvLayer`s in between when `distil` is truthy), and the decoder stacks
    `d_layers` layers followed by a linear projection to `c_out` channels.

    Parameters
    ----------
    pred_len:
        Number of trailing decoder steps returned by `forward`.
    output_attention:
        When truthy, `forward` also returns the encoder attentions.
    enc_in, dec_in:
        Input sizes handed to the encoder / decoder `DataEmbedding`.
    d_model, n_heads, d_ff, dropout, activation, factor:
        Forwarded to the attention, encoder and decoder layers.
    c_out:
        Output size of the final linear projection.
    embed, freq:
        Forwarded to `DataEmbedding`.
    e_layers, d_layers:
        Number of encoder / decoder layers.
    distil:
        Truthy to insert `ConvLayer`s in the encoder; otherwise `None` is
        passed to `Encoder` in their place.
    """
    def __init__(self, pred_len, output_attention,
                 enc_in, dec_in, d_model, c_out, embed, freq, dropout,
                 factor, n_heads, d_ff, activation, e_layers,
                 d_layers, distil):
        super().__init__()
        self.pred_len = pred_len
        self.output_attention = output_attention

        # Embedding (encoder first, then decoder, so parameter creation
        # order -- and therefore seeded initialisation -- is stable).
        self.enc_embedding = DataEmbedding(enc_in, d_model, embed, freq, dropout)
        self.dec_embedding = DataEmbedding(dec_in, d_model, embed, freq, dropout)

        # Encoder
        encoder_layers = []
        for _ in range(e_layers):
            # First positional flag is False here and True for the decoder
            # self-attention -- presumably the causal-mask switch; confirm
            # against ProbAttention.
            self_attention = AttentionLayer(
                ProbAttention(False, factor, attention_dropout=dropout,
                              output_attention=output_attention),
                d_model, n_heads)
            encoder_layers.append(
                EncoderLayer(self_attention, d_model, d_ff,
                             dropout=dropout, activation=activation))

        conv_layers = None
        if distil:
            conv_layers = [ConvLayer(d_model) for _ in range(e_layers - 1)]

        self.encoder = Encoder(encoder_layers, conv_layers,
                               norm_layer=torch.nn.LayerNorm(d_model))

        # Decoder
        decoder_layers = []
        for _ in range(d_layers):
            self_attention = AttentionLayer(
                ProbAttention(True, factor, attention_dropout=dropout, output_attention=False),
                d_model, n_heads)
            cross_attention = AttentionLayer(
                ProbAttention(False, factor, attention_dropout=dropout, output_attention=False),
                d_model, n_heads)
            decoder_layers.append(
                DecoderLayer(self_attention, cross_attention, d_model, d_ff,
                             dropout=dropout, activation=activation))

        self.decoder = Decoder(decoder_layers,
                               norm_layer=torch.nn.LayerNorm(d_model),
                               projection=nn.Linear(d_model, c_out, bias=True))

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec,
                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
        """
        Encode `x_enc`, decode `x_dec` against the encoder output, and return
        the last `pred_len` steps of the decoder output.

        Returns `(forecast, attns)` when `self.output_attention` is truthy,
        otherwise just `forecast`.
        """
        encoded, attns = self.encoder(self.enc_embedding(x_enc, x_mark_enc),
                                      attn_mask=enc_self_mask)

        decoded = self.decoder(self.dec_embedding(x_dec, x_mark_dec), encoded,
                               x_mask=dec_self_mask, cross_mask=dec_enc_mask)

        # Keep only the forecast horizon at the end of the decoder window.
        forecast = decoded[:, -self.pred_len:, :]  # [B, L, D]

        if self.output_attention:
            return forecast, attns
        return forecast

# Informer model wrapper

In [None]:
#export
class Informer(pl.LightningModule):
    """
    PyTorch Lightning wrapper around `_Informer`.

    Stores the hyperparameters, builds the training/validation loss functions
    and the `_Informer` network, and implements the Lightning training and
    validation steps plus the optimizer/scheduler configuration.

    Parameters
    ----------
    seq_len:
        Number of time steps sliced from the start of each sample for the encoder.
    label_len:
        Number of trailing encoder steps that are also given to the decoder
        as known context.
    pred_len:
        Number of steps to forecast.
    output_attention:
        When truthy, `_Informer` returns `(forecast, attentions)`; only the
        forecast is used by this wrapper.
    enc_in, dec_in, d_model, c_out, embed, freq, dropout, factor, n_heads,
    d_ff, activation, e_layers, d_layers, distil:
        Architecture settings forwarded unchanged to `_Informer`.
    loss_train, loss_valid:
        Loss identifiers handed to `LossFunction` for training / validation.
    loss_hypar:
        Passed to `LossFunction` as its `seasonality` argument.
    learning_rate, weight_decay:
        `lr` and `weight_decay` of the Adam optimizer.
    lr_decay, lr_decay_step_size:
        `gamma` and `step_size` of the `StepLR` scheduler.
    random_seed:
        Seed applied to torch, numpy and `random` when fitting starts.
    """
    def __init__(self, seq_len, 
                 label_len, pred_len, output_attention,
                 enc_in, dec_in, d_model, c_out, embed, freq, dropout,
                 factor, n_heads, d_ff, activation, e_layers, d_layers, distil,
                 loss_train, loss_valid, loss_hypar, learning_rate,
                 lr_decay, weight_decay, lr_decay_step_size,
                 random_seed):
        super().__init__()

        #------------------------ Model Attributes ------------------------#
        # Architecture parameters
        self.seq_len = seq_len
        self.label_len = label_len
        self.pred_len = pred_len
        self.output_attention = output_attention
        self.enc_in = enc_in
        self.dec_in = dec_in
        self.d_model = d_model
        self.c_out = c_out
        self.embed = embed
        self.freq = freq
        self.dropout = dropout
        self.factor = factor
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.activation = activation
        self.e_layers = e_layers
        self.d_layers = d_layers
        self.distil = distil

        # Loss functions
        self.loss_train = loss_train
        self.loss_hypar = loss_hypar
        self.loss_valid = loss_valid
        self.loss_fn_train = LossFunction(loss_train,
                                          seasonality=self.loss_hypar)
        self.loss_fn_valid = LossFunction(loss_valid,
                                          seasonality=self.loss_hypar)

        # Regularization and optimization parameters
        self.learning_rate = learning_rate
        self.lr_decay = lr_decay
        self.weight_decay = weight_decay
        self.lr_decay_step_size = lr_decay_step_size
        self.random_seed = random_seed

        self.model = _Informer(pred_len, output_attention,
                               enc_in, dec_in, d_model, c_out,
                               embed, freq, dropout,
                               factor, n_heads, d_ff,
                               activation, e_layers,
                               d_layers, distil)

    def forward(self, batch):
        """
        Slice a batch into encoder/decoder windows and run the Informer.

        Informer needs batch of shape (batch_size, time, series) for y
        and (batch_size, time, exogenous) for x
        and doesnt need X for each time series.
        USE DataLoader from pytorch instead of TimeSeriesLoader.

        Parameters
        ----------
        batch: dict
            Must contain the keys 'Y', 'X' and 'sample_mask'. 'Y' and
            'sample_mask' are 3-D and 'X' is 4-D; the last two axes are
            swapped here so that time comes before series/features
            (assumes the incoming layout is (batch, series, time) -- TODO
            confirm against the dataset).

        Returns
        -------
        (batch_y, forecast, outsample_mask)
            Targets, model output and sample mask, each restricted to the
            last `pred_len` steps of the decoder window.
        """
        Y = batch['Y'].permute(0, 2, 1)
        # Only index 0 along the second axis of X is used (the exogenous
        # block is not needed per series, see docstring).
        X = batch['X'][:, 0, :, :].permute(0, 2, 1)
        sample_mask = batch['sample_mask'].permute(0, 2, 1)

        # Encoder window: [s_begin, s_end). Decoder window: [r_begin, r_end),
        # which overlaps the last `label_len` encoder steps and extends
        # `pred_len` steps beyond them.
        s_begin = 0
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        batch_x = Y[:, s_begin:s_end, :]
        batch_y = Y[:, r_begin:r_end, :]
        batch_x_mark = X[:, s_begin:s_end, :]
        batch_y_mark = X[:, r_begin:r_end, :]
        outsample_mask = sample_mask[:, r_begin:r_end, :]

        # Decoder input: the known `label_len` context followed by zeros in
        # place of the values to be predicted.
        dec_inp = torch.zeros_like(batch_y[:, -self.pred_len:, :])
        dec_inp = torch.cat([batch_y[:, :self.label_len, :], dec_inp], dim=1)

        model_out = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
        # With output_attention the network returns (forecast, attentions).
        forecast = model_out[0] if self.output_attention else model_out

        batch_y = batch_y[:, -self.pred_len:, :]
        outsample_mask = outsample_mask[:, -self.pred_len:, :]

        return batch_y, forecast, outsample_mask

    def _compute_loss(self, batch, loss_fn):
        """Run `forward` on `batch` and evaluate `loss_fn` on its outputs."""
        outsample_y, forecast, outsample_mask = self(batch)

        return loss_fn(y=outsample_y,
                       y_hat=forecast,
                       mask=outsample_mask,
                       y_insample=batch['Y'].permute(0, 2, 1))

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it as 'train_loss'."""
        loss = self._compute_loss(batch, self.loss_fn_train)

        self.log('train_loss', loss, prog_bar=True, on_epoch=True)

        return loss

    def validation_step(self, batch, idx):
        """Compute the validation loss for one batch and log it as 'val_loss'."""
        loss = self._compute_loss(batch, self.loss_fn_valid)

        self.log('val_loss', loss, prog_bar=True)

        return loss

    def on_fit_start(self):
        """Seed torch, numpy and `random` with `random_seed` before fitting."""
        torch.manual_seed(self.random_seed)
        np.random.seed(self.random_seed)
        random.seed(self.random_seed)

    def configure_optimizers(self):
        """Return Adam over the network parameters with a StepLR schedule."""
        optimizer = optim.Adam(self.model.parameters(),
                               lr=self.learning_rate,
                               weight_decay=self.weight_decay)

        lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                 step_size=self.lr_decay_step_size,
                                                 gamma=self.lr_decay)

        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler}

## Informer Usage Example

### Load Data

In [None]:
from neuralforecast.data.datasets.long_horizon import LongHorizon

# Load the 'ETTm2' group with './data' as the dataset directory. The call
# yields three frames; Y_df and X_df are previewed and used below, while
# S_df is not passed on (create_datasets later receives S_df=None).
Y_df, X_df, S_df = LongHorizon.load(directory='./data', group='ETTm2')
# Discard the existing index in favour of a fresh default one.
Y_df = Y_df.reset_index(drop=True)

In [None]:
# Preview the first rows of the target frame (columns: unique_id, ds, y).
Y_df.head()

Unnamed: 0,unique_id,ds,y
0,HUFL,2016-07-01 00:00:00,41.130001
1,HUFL,2016-07-01 00:15:00,39.622002
2,HUFL,2016-07-01 00:30:00,38.868
3,HUFL,2016-07-01 00:45:00,35.518002
4,HUFL,2016-07-01 01:00:00,37.528


In [None]:
# Preview the first rows of the exogenous frame: calendar features
# (HourOfDay, DayOfWeek, DayOfMonth, DayOfYear) keyed by unique_id and ds.
X_df.head()

Unnamed: 0,unique_id,ds,HourOfDay,DayOfWeek,DayOfMonth,DayOfYear
0,HUFL,2016-07-01 00:00:00,-0.5,0.166667,-0.5,-0.00137
1,HUFL,2016-07-01 00:15:00,-0.5,0.166667,-0.5,-0.00137
2,HUFL,2016-07-01 00:30:00,-0.5,0.166667,-0.5,-0.00137
3,HUFL,2016-07-01 00:45:00,-0.5,0.166667,-0.5,-0.00137
4,HUFL,2016-07-01 01:00:00,-0.456522,0.166667,-0.5,-0.00137


In [None]:
# Exogenous feature names: every X_df column except the series identifier
# and the timestamp.
f_cols = X_df.columns.drop(['unique_id', 'ds']).tolist()

### Declare Model and Data Parameters

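The encoder and decoder windows are sliced from each sample as follows, where `index` is the position at which the window starts (`Informer.forward` uses `index = 0`):

- s_begin = index
- s_end = index + self.seq_len
- r_begin = index + self.seq_len - self.label_len
- r_end = index + self.seq_len + self.pred_len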

In [None]:
# Architecture parameters: one entry per keyword argument of `Informer`.
mc_model = {
    # Window lengths: encoder input, decoder context, forecast horizon.
    'seq_len': 96,
    'label_len': 48,
    'pred_len': 24,
    'output_attention': False,
    # Network sizes.
    'enc_in': 7,
    'dec_in': 7,
    'd_model': 512,
    'c_out': 7,
    'embed': 'timeF',
    'freq': 'h',
    'dropout': 0.05,
    'factor': 1,
    'n_heads': 8,
    'd_ff': 2_048,
    'activation': 'gelu',
    'e_layers': 2,
    'd_layers': 1,
    'distil': None,
    # Losses.
    'loss_train': 'MAE',
    'loss_hypar': 0.5,
    'loss_valid': 'MAE',
    # Optimization.
    'learning_rate': 0.001,
    'lr_decay': 0.5,
    'weight_decay': 0.,
    'lr_decay_step_size': 2,
    'random_seed': 1,
}

# Dataset parameters: window sizes are tied to the model's input and
# forecast lengths so the two configurations cannot drift apart.
mc_data = {
    'mode': 'iterate_windows',
    'n_time_in': mc_model['seq_len'],
    'n_time_out': mc_model['pred_len'],
    'batch_size': 32,
    'normalizer_y': None,
    'normalizer_x': None,
    # Training budget read by the Trainer cell further down.
    'max_epochs': 1,
    'max_steps': 2,
    'early_stop_patience': 20,
}

# Passed to create_datasets as ds_in_val / ds_in_test.
len_val = 11_520
len_test = 11_520

### Instantiate Loaders and Model

In [None]:
from neuralforecast.data.tsdataset import IterateWindowsDataset

In [None]:
from torch.utils.data import DataLoader
from neuralforecast.experiments.utils import create_datasets

# Build the train / validation / test datasets from the long-format frames.
# No static frame is supplied (S_df=None); len_val and len_test are handed
# over as ds_in_val / ds_in_test.
train_dataset, val_dataset, test_dataset, scaler_y = create_datasets(
    mc=mc_data,
    S_df=None,
    Y_df=Y_df, X_df=X_df,
    f_cols=f_cols,
    ds_in_val=len_val,
    ds_in_test=len_test,
)

# All three loaders share the configured batch size.
batch_size = int(mc_data['batch_size'])

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True)

# Evaluation loaders keep the dataset order and every sample.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

INFO:root:Train Validation splits

INFO:root:                                       ds                    
                                      min                 max
unique_id sample_mask                                        
HUFL      0           2017-10-29 20:00:00 2018-06-26 19:45:00
          1           2016-07-01 00:00:00 2017-10-29 19:45:00
HULL      0           2017-10-29 20:00:00 2018-06-26 19:45:00
          1           2016-07-01 00:00:00 2017-10-29 19:45:00
MUFL      0           2017-10-29 20:00:00 2018-06-26 19:45:00
          1           2016-07-01 00:00:00 2017-10-29 19:45:00
MULL      0           2017-10-29 20:00:00 2018-06-26 19:45:00
          1           2016-07-01 00:00:00 2017-10-29 19:45:00
LUFL      0           2017-10-29 20:00:00 2018-06-26 19:45:00
          1           2016-07-01 00:00:00 2017-10-29 19:45:00
LULL      0           2017-10-29 20:00:00 2018-06-26 19:45:00
          1           2016-07-01 00:00:00 2017-10-29 19:45:00
OT        0           201

In [None]:
# Instantiate the Lightning module; the keys of mc_model match the keyword
# arguments of Informer.__init__ one-to-one.
model = Informer(**mc_model)

### Train Model

In [None]:
from pytorch_lightning.callbacks import EarlyStopping, TQDMProgressBar

# Stop once 'train_loss' (logged by Informer.training_step) has not improved
# by at least min_delta for `early_stop_patience` consecutive checks.
early_stopping = EarlyStopping(monitor="train_loss", 
                               min_delta=1e-4, 
                               patience=mc_data['early_stop_patience'],
                               verbose=False,
                               mode="min")

# `Trainer(progress_bar_refresh_rate=...)` is deprecated since Lightning v1.5;
# the refresh rate is configured on the progress-bar callback instead.
progress_bar = TQDMProgressBar(refresh_rate=10)

trainer = pl.Trainer(max_epochs=mc_data['max_epochs'], 
                     max_steps=mc_data['max_steps'],
                     gradient_clip_val=1.0,
                     log_every_n_steps=500, 
                     check_val_every_n_epoch=1,
                     callbacks=[early_stopping, progress_bar])

# Only the training loader is passed, so Lightning skips the validation loop
# (validation_step is defined but never called in this example).
trainer.fit(model, train_loader)

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name  | Type      | Params
------------------------------------
0 | model | _Informer | 10.5 M
------------------------------------
10.5 M    Trainable params
0         Non-trainable params
10.5 M    Total params
42.160    Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

### Make Predictions

In [None]:
#outputs = trainer.predict(model, val_loader)

#print("outputs[0][0].shape", outputs[0][0].shape)
#print("outputs[0][1].shape", outputs[0][1].shape)
#print("outputs[0][2].shape", outputs[0][2].shape)