# Installs + Imports

In [2]:
!pip install pytorch_lightning
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]>2021.06.0->pytorch_lightning)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
Collecting multidict

In [3]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.colab import files
from tqdm.auto import tqdm
from torchmetrics import MeanAbsolutePercentageError
from datetime import datetime 
from typing import Tuple
from functools import partial
from torch.optim.lr_scheduler import ReduceLROnPlateau
from math import sqrt

# Classes + Helpers

## Data processing

In [4]:
def scale_data(load_df, 
               start_train_date,
               end_val_date,
               start_test_date,
               end_test_date):
  
  train_val_df = load_df[(load_df.index >= start_train_date) &
                        (load_df.index <= end_val_date)]
  test_df = load_df[(load_df.index >= start_test_date) &
                    (load_df.index <= end_test_date)]

  scaler = MaxAbsScaler()   # MinMaxScaler()
  train_val_scaled = scaler.fit_transform(train_val_df)
  train_val_df_scaled = pd.DataFrame(train_val_scaled,
                                    columns=train_val_df.columns,
                                    index=train_val_df.index)
  test_scaled = scaler.transform(test_df)
  test_df_scaled = pd.DataFrame(test_scaled,
                                columns=test_df.columns,
                                index=test_df.index)

  load_df_scaled = pd.concat([train_val_df_scaled, test_df_scaled], axis=0)

  return load_df_scaled, scaler

In [5]:
def reframing_enc(X_df, skip_steps_back, n_backwards):

  feat_cols, feat_names = [], []
  # iterate through all columns
  for col_index, col_name in enumerate(X_df.columns):
    series = X_df[col_name].copy()
    # input sequence (t-skip_steps_back, ... ,t-(n_backwards+1) )
    for b in range(skip_steps_back, n_backwards):
      feat_cols.append(series.shift(b))
      feat_names.append(f'{col_name}_(t-{b})')
  
  # put it all together
  X = pd.concat(feat_cols, axis=1)
  X.columns = feat_names
  # drop rows with NaN values
  X.dropna(inplace=True)
  
  return X

def reframe_enc_data(load_df_scaled,
                     target,
                     skip_steps_back,
                     last_step_back,
                     last_step_forward):

  # shift future values
  for col in load_df_scaled.drop(target, axis=1).columns:
    load_df_scaled[col + f'_(t+{last_step_forward})'] = load_df_scaled[col].shift(-last_step_forward)
    load_df_scaled.drop(col, axis=1, inplace=True)

  return reframing_enc(load_df_scaled,
                       skip_steps_back=skip_steps_back, 
                       n_backwards=last_step_back)

def split_enc_data(load_df_scaled_reframed,
                    start_train_date,
                    end_train_date,
                    start_val_date,
                    end_val_date,
                    start_test_date,
                    end_test_date,
                    last_step_back):

  load_train_df_scaled_reframed = load_df_scaled_reframed[(load_df_scaled_reframed.index >= start_train_date) & 
                                                          (load_df_scaled_reframed.index <= end_train_date)]

  load_val_df_scaled_reframed = load_df_scaled_reframed[(load_df_scaled_reframed.index >= start_val_date) & 
                                                        (load_df_scaled_reframed.index <= end_val_date)]

  load_test_df_scaled_reframed = load_df_scaled_reframed[(load_df_scaled_reframed.index >= start_test_date) & 
                                                        (load_df_scaled_reframed.index <= end_test_date)]

  X_train_3D = create3Dinput(load_train_df_scaled_reframed, last_step_back)
  X_val_3D = create3Dinput(load_val_df_scaled_reframed, last_step_back)
  X_test_3D = create3Dinput(load_test_df_scaled_reframed, last_step_back)

  return X_train_3D, X_val_3D, X_test_3D

In [6]:
def reframing_dec(X_df, n_backwards):

  feat_cols, feat_names = [], []
  # iterate through all columns
  for col_index, col_name in enumerate(X_df.columns):
    series = X_df[col_name].copy()
    # input sequence (t, t-1, ... ,t-(n_backwards+1) )
    for b in range(n_backwards):
      feat_cols.append(series.shift(b))
      feat_names.append(f'{col_name}_(t-{b})')
  
  # put it all together
  X = pd.concat(feat_cols, axis=1)
  X.columns = feat_names
  # drop rows with NaN values
  X.dropna(inplace=True)
  
  return X

def reframe_dec_data(load_df_scaled,
                     last_step_back,
                     last_step_forward):

  # shift future values (weather + time data)
  for col in load_df_scaled.columns:
    load_df_scaled[col + f'_(t+{last_step_forward})'] = load_df_scaled[col].shift(-last_step_forward)
    load_df_scaled.drop(col, axis=1, inplace=True)

  return reframing_dec(load_df_scaled, 
                       n_backwards=last_step_back + last_step_forward)

def split_dec_data(load_df_scaled_reframed,
                    start_train_date,
                    end_train_date,
                    start_val_date,
                    end_val_date,
                    start_test_date,
                    end_test_date,
                    steps_forward,
                    last_step_back):

  load_train_df_scaled_reframed = load_df_scaled_reframed[(load_df_scaled_reframed.index >= start_train_date) & 
                                                          (load_df_scaled_reframed.index <= end_train_date)]

  load_val_df_scaled_reframed = load_df_scaled_reframed[(load_df_scaled_reframed.index >= start_val_date) & 
                                                        (load_df_scaled_reframed.index <= end_val_date)]

  load_test_df_scaled_reframed = load_df_scaled_reframed[(load_df_scaled_reframed.index >= start_test_date) & 
                                                        (load_df_scaled_reframed.index <= end_test_date)]

  X_train_3D = create3Dinput(load_train_df_scaled_reframed, last_step_back+steps_forward)
  X_val_3D = create3Dinput(load_val_df_scaled_reframed, last_step_back+steps_forward)
  X_test_3D = create3Dinput(load_test_df_scaled_reframed, last_step_back+steps_forward)

  return X_train_3D, X_val_3D, X_test_3D

In [7]:
def create3Dinput(df, steps):
  N, D = df.shape
  D = int(D/steps)
  arr_3d = np.zeros((N, steps, D))
  for i in range(D):
    arr_3d[:, :, i] = df.iloc[:, i*steps:(i+1)*steps].values
  print(arr_3d.shape)
  return arr_3d

class LoadDataset(Dataset):
  def __init__(self, X_3D_enc, X_3D_time_enc, X_3D_dec, X_3D_time_dec, y):
    self.X_enc = torch.tensor(X_3D_enc, dtype=torch.float32)
    self.X_time_enc = torch.tensor(X_3D_time_enc, dtype=torch.float32)
    self.X_dec = torch.tensor(X_3D_dec, dtype=torch.float32)
    self.X_time_dec = torch.tensor(X_3D_time_dec, dtype=torch.float32)
    self.y = torch.tensor(y, dtype=torch.float32)
  
  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.X_enc[idx], self.X_time_enc[idx], self.X_dec[idx], self.X_time_dec[idx], self.y[idx]

## Informer class

### Embeddings

In [8]:
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe[:, :x.size(1)]

class TokenEmbedding(nn.Module):
    def __init__(self, c_in, d_model):
        super(TokenEmbedding, self).__init__()
        padding = 1 if torch.__version__>='1.5.0' else 2
        self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, 
                                    kernel_size=3, padding=padding, padding_mode='circular')
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight,mode='fan_in',nonlinearity='leaky_relu')

    def forward(self, x):
        x = self.tokenConv(x.permute(0, 2, 1)).transpose(1,2)
        return x

class FixedEmbedding(nn.Module):
    def __init__(self, c_in, d_model):
        super(FixedEmbedding, self).__init__()

        w = torch.zeros(c_in, d_model).float()
        w.require_grad = False

        position = torch.arange(0, c_in).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        w[:, 0::2] = torch.sin(position * div_term)
        w[:, 1::2] = torch.cos(position * div_term)

        self.emb = nn.Embedding(c_in, d_model)
        self.emb.weight = nn.Parameter(w, requires_grad=False)

    def forward(self, x):
        return self.emb(x).detach()

class TemporalEmbedding(nn.Module):
    def __init__(self, d_model, embed_type='fixed', freq='h'):
        super(TemporalEmbedding, self).__init__()

        minute_size = 4; hour_size = 24
        weekday_size = 7; day_size = 32; month_size = 13

        Embed = FixedEmbedding if embed_type=='fixed' else nn.Embedding
        if freq=='t':
            self.minute_embed = Embed(minute_size, d_model)
        self.hour_embed = Embed(hour_size, d_model)
        self.weekday_embed = Embed(weekday_size, d_model)
        self.day_embed = Embed(day_size, d_model)
        self.month_embed = Embed(month_size, d_model)
    
    def forward(self, x):
        x = x.long()
        
        minute_x = self.minute_embed(x[:,:,4]) if hasattr(self, 'minute_embed') else 0.
        # print(f"minute_x --> {minute_x}")
        hour_x = self.hour_embed(x[:,:,3])
        # print(f"hour_x --> {hour_x.shape}")
        weekday_x = self.weekday_embed(x[:,:,2])
        # print(f"weekday_x --> {weekday_x.shape}")
        day_x = self.day_embed(x[:,:,1])
        # print(f"day_x --> {day_x.shape}")
        month_x = self.month_embed(x[:,:,0])
        # print(f"month_x --> {month_x.shape}")
        
        return hour_x + weekday_x + day_x + month_x + minute_x

class TimeFeatureEmbedding(nn.Module):
    def __init__(self, d_model, embed_type='timeF', freq='h'):
        super(TimeFeatureEmbedding, self).__init__()

        freq_map = {'h':4, 't':5, 's':6, 'm':1, 'a':1, 'w':2, 'd':3, 'b':3}
        d_inp = freq_map[freq]
        self.embed = nn.Linear(d_inp, d_model)
    
    def forward(self, x):
        return self.embed(x)

class DataEmbedding(nn.Module):
    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
        super(DataEmbedding, self).__init__()

        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)
        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) if embed_type!='timeF' else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
        
        return self.dropout(x)

### Masking

In [9]:
class TriangularCausalMask():
    def __init__(self, B, L, device="cpu"):
        mask_shape = [B, 1, L, L]
        with torch.no_grad():
            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)

    @property
    def mask(self):
        return self._mask

class ProbMask():
    def __init__(self, B, H, L, index, scores, device="cpu"):
        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
        indicator = _mask_ex[torch.arange(B)[:, None, None],
                             torch.arange(H)[None, :, None],
                             index, :].to(device)
        self._mask = indicator.view(scores.shape).to(device)
    
    @property
    def mask(self):
        return self._mask

### Attention

In [10]:
class FullAttention(nn.Module):
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(FullAttention, self).__init__()
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)
        
    def forward(self, queries, keys, values, attn_mask):
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        scale = self.scale or 1./sqrt(E)

        scores = torch.einsum("blhe,bshe->bhls", queries, keys)
        if self.mask_flag:
            if attn_mask is None:
                attn_mask = TriangularCausalMask(B, L, device=queries.device)

            scores.masked_fill_(attn_mask.mask, -np.inf)

        A = self.dropout(torch.softmax(scale * scores, dim=-1))
        V = torch.einsum("bhls,bshd->blhd", A, values)
        
        if self.output_attention:
            return (V.contiguous(), A)
        else:
            return (V.contiguous(), None)

class ProbAttention(nn.Module):
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q)
        # Q [B, H, L, D]
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape

        # calculate the sampled Q_K
        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        index_sample = torch.randint(L_K, (L_Q, sample_k)) # real U = U_part(factor*ln(L_k))*L_q
        K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)

        # find the Top_k query with sparisty measurement
        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
        M_top = M.topk(n_top, sorted=False)[1]

        # use the reduced Q to calculate Q_K
        Q_reduce = Q[torch.arange(B)[:, None, None],
                     torch.arange(H)[None, :, None],
                     M_top, :] # factor*ln(L_q)
        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k

        return Q_K, M_top

    def _get_initial_context(self, V, L_Q):
        B, H, L_V, D = V.shape
        if not self.mask_flag:
            # V_sum = V.sum(dim=-2)
            V_sum = V.mean(dim=-2)
            contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone()
        else: # use mask
            assert(L_Q == L_V) # requires that L_Q == L_V, i.e. for self-attention only
            contex = V.cumsum(dim=-2)
        return contex

    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
        B, H, L_V, D = V.shape

        if self.mask_flag:
            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores)

        context_in[torch.arange(B)[:, None, None],
                   torch.arange(H)[None, :, None],
                   index, :] = torch.matmul(attn, V).type_as(context_in)
        if self.output_attention:
            attns = (torch.ones([B, H, L_V, L_V])/L_V).type_as(attn).to(attn.device)
            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn
            return (context_in, attns)
        else:
            return (context_in, None)

    def forward(self, queries, keys, values, attn_mask):
        B, L_Q, H, D = queries.shape
        _, L_K, _, _ = keys.shape

        queries = queries.transpose(2,1)
        keys = keys.transpose(2,1)
        values = values.transpose(2,1)

        U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k)
        u = self.factor * np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q) 

        U_part = U_part if U_part<L_K else L_K
        u = u if u<L_Q else L_Q
        
        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u) 

        # add scale factor
        scale = self.scale or 1./sqrt(D)
        if scale is not None:
            scores_top = scores_top * scale
        # get the context
        context = self._get_initial_context(values, L_Q)
        # update the context with selected top_k queries
        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)
        
        return context.transpose(2,1).contiguous(), attn


class AttentionLayer(nn.Module):
    def __init__(self, attention, d_model, n_heads, 
                 d_keys=None, d_values=None, mix=False):
        super(AttentionLayer, self).__init__()

        d_keys = d_keys or (d_model//n_heads)
        d_values = d_values or (d_model//n_heads)

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads
        self.mix = mix

    def forward(self, queries, keys, values, attn_mask):
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        queries = self.query_projection(queries).view(B, L, H, -1)
        keys = self.key_projection(keys).view(B, S, H, -1)
        values = self.value_projection(values).view(B, S, H, -1)

        out, attn = self.inner_attention(
            queries,
            keys,
            values,
            attn_mask
        )
        if self.mix:
            out = out.transpose(2,1).contiguous()
        out = out.view(B, L, -1)

        return self.out_projection(out), attn

### Encoder

In [11]:
class ConvLayer(nn.Module):
    def __init__(self, c_in):
        super(ConvLayer, self).__init__()
        padding = 1 if torch.__version__>='1.5.0' else 2
        self.downConv = nn.Conv1d(in_channels=c_in,
                                  out_channels=c_in,
                                  kernel_size=3,
                                  padding=padding,
                                  padding_mode='circular')
        self.norm = nn.BatchNorm1d(c_in)
        self.activation = nn.ELU()
        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        x = self.downConv(x.permute(0, 2, 1))
        x = self.norm(x)
        x = self.activation(x)
        x = self.maxPool(x)
        x = x.transpose(1,2)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4*d_model
        self.attention = attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, attn_mask=None):
        # x [B, L, D]
        # x = x + self.dropout(self.attention(
        #     x, x, x,
        #     attn_mask = attn_mask
        # ))
        new_x, attn = self.attention(
            x, x, x,
            attn_mask = attn_mask
        )
        x = x + self.dropout(new_x)

        y = x = self.norm1(x)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1,1))))
        y = self.dropout(self.conv2(y).transpose(-1,1))

        return self.norm2(x+y), attn

class Encoder(nn.Module):
    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super(Encoder, self).__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None
        self.norm = norm_layer

    def forward(self, x, attn_mask=None):
        # x [B, L, D]
        attns = []
        if self.conv_layers is not None:
            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
                x, attn = attn_layer(x, attn_mask=attn_mask)
                x = conv_layer(x)
                attns.append(attn)
            x, attn = self.attn_layers[-1](x, attn_mask=attn_mask)
            attns.append(attn)
        else:
            for attn_layer in self.attn_layers:
                x, attn = attn_layer(x, attn_mask=attn_mask)
                attns.append(attn)

        if self.norm is not None:
            x = self.norm(x)

        return x, attns

### Decoder

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
                 dropout=0.1, activation="relu"):
        super(DecoderLayer, self).__init__()
        d_ff = d_ff or 4*d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        x = x + self.dropout(self.self_attention(
            x, x, x,
            attn_mask=x_mask
        )[0])
        x = self.norm1(x)

        x = x + self.dropout(self.cross_attention(
            x, cross, cross,
            attn_mask=cross_mask
        )[0])

        y = x = self.norm2(x)
        y = self.dropout(self.activation(self.conv1(y.transpose(-1,1))))
        y = self.dropout(self.conv2(y).transpose(-1,1))

        return self.norm3(x+y)

class Decoder(nn.Module):
    def __init__(self, layers, norm_layer=None):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        for layer in self.layers:
            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)

        if self.norm is not None:
            x = self.norm(x)

        return x

### Informer

In [13]:
class Informer(nn.Module):
    def __init__(self, enc_in, dec_in, c_out, out_len,
                factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512, 
                dropout=0.0, attn='prob', embed='fixed', freq='h', activation='gelu', 
                output_attention = False, distil=True, mix=True,
                device=torch.device('cuda:0')):
        super(Informer, self).__init__()
        self.pred_len = out_len
        self.attn = attn
        self.output_attention = output_attention

        # Encoding
        self.enc_embedding = DataEmbedding(enc_in, d_model, embed, freq, dropout)
        self.dec_embedding = DataEmbedding(dec_in, d_model, embed, freq, dropout)
        # Attention
        Attn = ProbAttention if attn=='prob' else FullAttention
        # Encoder
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(Attn(False, factor, attention_dropout=dropout, output_attention=output_attention), 
                                d_model, n_heads, mix=False),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation
                ) for l in range(e_layers)
            ],
            [
                ConvLayer(
                    d_model
                ) for l in range(e_layers-1)
            ] if distil else None,
            norm_layer=torch.nn.LayerNorm(d_model)
        )
        # Decoder
        self.decoder = Decoder(
            [
                DecoderLayer(
                    AttentionLayer(Attn(True, factor, attention_dropout=dropout, output_attention=False), 
                                d_model, n_heads, mix=mix),
                    AttentionLayer(FullAttention(False, factor, attention_dropout=dropout, output_attention=False), 
                                d_model, n_heads, mix=False),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation,
                )
                for l in range(d_layers)
            ],
            norm_layer=torch.nn.LayerNorm(d_model)
        )
        # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
        # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
        self.projection = nn.Linear(d_model, c_out, bias=True)
        
    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, 
                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)
        dec_out = self.projection(dec_out)
        
        # dec_out = self.end_conv1(dec_out)
        # dec_out = self.end_conv2(dec_out.transpose(2,1)).transpose(1,2)
        if self.output_attention:
            return dec_out[:,-self.pred_len:,:], attns
        else:
            return dec_out[:,-self.pred_len:,:] # [B, L, D]

## Training and Evaluation

In [14]:
def mape(y_preds, y_true):
  epsilon = 1.17e-06
  abs_diff = torch.abs(y_preds - y_true)
  abs_per_error = abs_diff / torch.clamp(torch.abs(y_true), min=epsilon)
  mape = torch.sum(abs_per_error) / y_true.numel()

  return mape 

def train_step(model,
               dataloader, 
               optimizer, 
               device):
  
  model.train()
  loss = 0
  for batch, (X_enc, X_time_enc, X_dec, X_time_dec, Y) in enumerate(dataloader):
    X_enc, X_time_enc, X_dec, X_time_dec, Y = X_enc.to(device), X_time_enc.to(device), X_dec.to(device), X_time_dec.to(device), Y.to(device)
    Y_preds = model(x_enc=X_enc, x_mark_enc=X_time_enc, x_dec=X_dec, x_mark_dec=X_time_dec).squeeze()
    batch_loss = mape(Y_preds, Y) 
    loss += batch_loss.item()
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
  
  loss /= len(dataloader)
  return loss

def val_step(model, dataloader, device):
  model.eval()
  val_loss = 0
  with torch.inference_mode():
    for batch, (X_enc, X_time_enc, X_dec, X_time_dec, Y) in enumerate(dataloader):
      X_enc, X_time_enc, X_dec, X_time_dec, Y = X_enc.to(device), X_time_enc.to(device), X_dec.to(device), X_time_dec.to(device), Y.to(device)
      Y_preds = model(x_enc=X_enc, x_mark_enc=X_time_enc, x_dec=X_dec, x_mark_dec=X_time_dec).squeeze()

      y_preds_unscaled = Y_preds * scaler.max_abs_[TARGET_POS]
      y_true_unscaled  = Y * scaler.max_abs_[TARGET_POS] 

      batch_loss = 100 * mape(y_preds_unscaled, y_true_unscaled)     # loss_fn(Y_preds, Y) 
      val_loss += batch_loss.item()
  
  val_loss /= len(dataloader)
  return val_loss

def train(model, 
          train_dataloader,
          val_dataloader,
          optimizer,
          scheduler,
          epochs,
          patience,
          device,
          path):
  
  results = {
      "loss": [],
      "val_loss": []
  }

  for epoch in tqdm(range(epochs)):
    flag = 0
    loss = train_step(model=model,
                      dataloader=train_dataloader,
                      optimizer=optimizer,
                      device=device)

    val_loss = val_step(model=model,
                        dataloader=val_dataloader,
                        device=device)
    scheduler.step(val_loss)
    
    results['loss'].append(loss)
    results['val_loss'].append(val_loss)
    if epoch == 0:
      best_val_loss = val_loss
      best_epoch = -1
      checkpoint(model, optimizer, path)
      flag = 1
      print(f"Epoch: {epoch+1}/{epochs} | Loss: {loss:.4f} | Val loss: {val_loss:.4f} - *Checkpoint*")
    else:
      if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch
        checkpoint(model, optimizer, path)
        flag = 1
        print(f"Epoch: {epoch+1}/{epochs} | Loss: {loss:.4f} | Val loss: {val_loss:.4f} - *Checkpoint*")
      elif epoch - best_epoch > patience:
        print(f"\nEarly stopping applied at epoch {epoch}.")
        break
    if flag == 0:
      print(f"Epoch: {epoch+1}/{epochs} | Loss: {loss:.4f} | Val loss: {val_loss:.4f}")
  
  return results

def checkpoint(model, optimizer, filepath):
  torch.save({
    "optimizer": optimizer.state_dict(),
    "model": model.state_dict()
  }, filepath)

# MAIN

In [17]:
# Encoder 
DAYS_BACK_ENC = 2
SKIP_DAYS_BACK_ENC = 2
SKIP_STEPS_BACK_ENC = 24 * SKIP_DAYS_BACK_ENC
STEPS_BACK_ENC = 24 * DAYS_BACK_ENC
LAST_STEP_BACK_ENC = STEPS_BACK_ENC + SKIP_STEPS_BACK_ENC

# Decoder
DAYS_BACK_DEC = 2
LAST_STEP_BACK_DEC = 24 * DAYS_BACK_DEC

# Forward
DAYS_TO_SKIP = 0
STEPS_FORWARD = 24    # 1 day
SKIP_STEPS_FORWARD = 24 * DAYS_TO_SKIP
LAST_STEP_FORWARD = STEPS_FORWARD + SKIP_STEPS_FORWARD

LAST_STEP_BACK = LAST_STEP_BACK_ENC + LAST_STEP_BACK_DEC + STEPS_FORWARD

# keep 1 year for testing
START_TEST_DATE = pd.to_datetime('2018-01-01') - pd.to_timedelta(SKIP_STEPS_FORWARD+1, 'h')
END_TEST_DATE = START_TEST_DATE + pd.DateOffset(years=1)

END_VAL_DATE = START_TEST_DATE - pd.to_timedelta(1, 'h')
START_VAL_DATE = pd.to_datetime('2017-01-01') - pd.to_timedelta(LAST_STEP_FORWARD, 'h')

START_TRAIN_DATE = pd.to_datetime('2010-10-01')
END_TRAIN_DATE = START_VAL_DATE - pd.to_timedelta(1, 'h')

# END_TRAIN_DATE = START_TEST_DATE - pd.to_timedelta(1, 'h')
# START_TRAIN_DATE = pd.to_datetime('2017-01-01') - pd.to_timedelta(SKIP_STEPS_FORWARD+1, 'h')

# START_VAL_DATE = pd.to_datetime('2010-10-01')
# END_VAL_DATE = START_TRAIN_DATE - pd.to_timedelta(1, 'h')

TARGET = "TOTAL_CONS"

device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Train from {START_TRAIN_DATE} to {END_TRAIN_DATE}")
print(f"Validation from {START_VAL_DATE} to {END_VAL_DATE}")
print(f"Test from {START_TEST_DATE} to {END_TEST_DATE}")

Train from 2010-10-01 00:00:00 to 2016-12-30 23:00:00
Validation from 2016-12-31 00:00:00 to 2017-12-31 22:00:00
Test from 2017-12-31 23:00:00 to 2018-12-31 23:00:00


In [18]:
load_df = pd.read_csv("/content/FINAL_DATASET_2.csv")
load_df.set_index(pd.to_datetime(load_df["Timestamp"]), inplace=True)
load_df.drop("Timestamp", axis=1, inplace=True)

TIME_COLS = list(load_df.drop([TARGET, 'temp', 'humidity'], axis=1).columns)
TARGET_POS = np.where(load_df.columns == TARGET)[0][0]
TIME_POS = list(np.where(load_df.columns == col)[0][0] for col in TIME_COLS)
NO_TARGET_POS = list(np.where(load_df.columns == col)[0][0] for col in load_df.drop(TARGET, axis=1).columns)


data_df_scaled, scaler = scale_data(load_df,
                                    # START_VAL_DATE,
                                    # END_TRAIN_DATE,
                                    # START_TEST_DATE,
                                    # END_TEST_DATE)
                                    START_TRAIN_DATE,
                                    END_VAL_DATE,
                                    START_TEST_DATE,
                                    END_TEST_DATE)

In [19]:
data_df_scaled_reframed_enc = reframe_enc_data(data_df_scaled.copy(), 
                                               TARGET,
                                               SKIP_STEPS_BACK_ENC,
                                               LAST_STEP_BACK_ENC,
                                               LAST_STEP_FORWARD)

data_df_scaled_reframed_dec = reframe_dec_data(data_df_scaled.copy(),
                                               LAST_STEP_BACK_DEC,
                                               LAST_STEP_FORWARD)

common_index = data_df_scaled_reframed_enc.index.intersection(data_df_scaled_reframed_dec.index)
data_df_scaled_reframed_enc = data_df_scaled_reframed_enc.loc[common_index]
data_df_scaled_reframed_dec = data_df_scaled_reframed_dec.loc[common_index]

print(data_df_scaled_reframed_enc.shape, data_df_scaled_reframed_dec.shape)

(72217, 384) (72217, 576)


In [None]:
data_df_scaled_reframed_dec.iloc[:5, 2*24:3*24 + 1]

In [None]:
# X_train_3D_dec[0, :5, TARGET_POS]

In [21]:
X_train_3D_enc, X_val_3D_enc, X_test_3D_enc = split_enc_data(data_df_scaled_reframed_enc,
                                                              # START_VAL_DATE,
                                                              # END_VAL_DATE,
                                                              # START_TRAIN_DATE,
                                                              # END_TRAIN_DATE,
                                                              # START_TEST_DATE,
                                                              # END_TEST_DATE,
                                                              # STEPS_BACK_ENC)
                                                              START_TRAIN_DATE,
                                                              END_TRAIN_DATE,
                                                              START_VAL_DATE,
                                                              END_VAL_DATE,
                                                              START_TEST_DATE,
                                                              END_TEST_DATE,
                                                              STEPS_BACK_ENC)

X_train_3D_dec_orig, X_val_3D_dec_orig, X_test_3D_dec_orig = split_dec_data(data_df_scaled_reframed_dec,
                                                                            # START_VAL_DATE,
                                                                            # END_VAL_DATE,
                                                                            # START_TRAIN_DATE,
                                                                            # END_TRAIN_DATE,
                                                                            # START_TEST_DATE,
                                                                            # END_TEST_DATE,
                                                                            # STEPS_FORWARD,
                                                                            # LAST_STEP_BACK_DEC)
                                                                            START_TRAIN_DATE,
                                                                            END_TRAIN_DATE,
                                                                            START_VAL_DATE,
                                                                            END_VAL_DATE,
                                                                            START_TEST_DATE,
                                                                            END_TEST_DATE,
                                                                            STEPS_FORWARD,
                                                                            LAST_STEP_BACK_DEC)

(54697, 48, 8)
(8783, 48, 8)
(8737, 48, 8)
(54697, 72, 8)
(8783, 72, 8)
(8737, 72, 8)


In [22]:
y_train = X_train_3D_dec_orig[:, :LAST_STEP_FORWARD, TARGET_POS]
X_train_3D_dec = np.concatenate(
    (
        X_train_3D_dec_orig[:, LAST_STEP_FORWARD:, TARGET_POS].reshape(-1, LAST_STEP_BACK_DEC, 1),
        X_train_3D_dec_orig[:, :LAST_STEP_BACK_DEC, NO_TARGET_POS]
    ), axis=2
)

y_val = X_val_3D_dec_orig[:, :LAST_STEP_FORWARD, TARGET_POS]
X_val_3D_dec = np.concatenate(
    (
        X_val_3D_dec_orig[:, LAST_STEP_FORWARD:, TARGET_POS].reshape(-1, LAST_STEP_BACK_DEC, 1),
        X_val_3D_dec_orig[:, :LAST_STEP_BACK_DEC, NO_TARGET_POS]
    ), axis=2
)

y_test = X_test_3D_dec_orig[:, :LAST_STEP_FORWARD, TARGET_POS]
X_test_3D_dec = np.concatenate(
    (
        X_test_3D_dec_orig[:, LAST_STEP_FORWARD:, TARGET_POS].reshape(-1, LAST_STEP_BACK_DEC, 1),
        X_test_3D_dec_orig[:, :LAST_STEP_BACK_DEC, NO_TARGET_POS]
    ), axis=2
)

print(X_train_3D_dec.shape, X_val_3D_dec.shape, X_test_3D_dec.shape)

(54697, 48, 8) (8783, 48, 8) (8737, 48, 8)


In [23]:
X_train_3D_enc = np.flip(X_train_3D_enc, axis=1)  #.reshape(-1, 8, LAST_STEP_BACK_DEC)
X_val_3D_enc = np.flip(X_val_3D_enc, axis=1)  #.reshape(-1, 8, LAST_STEP_BACK_DEC)
X_test_3D_enc = np.flip(X_test_3D_enc, axis=1)  #.reshape(-1, 8, LAST_STEP_BACK_DEC)
X_train_3D_dec = np.flip(X_train_3D_dec, axis=1)  #.reshape(-1, 8, LAST_STEP_BACK_DEC)
X_val_3D_dec = np.flip(X_val_3D_dec, axis=1)  #.reshape(-1, 8, LAST_STEP_BACK_DEC)
X_test_3D_dec = np.flip(X_test_3D_dec, axis=1)  #.reshape(-1, 8, LAST_STEP_BACK_DEC)

y_train = np.flip(y_train, axis=1)
y_val = np.flip(y_val, axis=1)
y_test = np.flip(y_test, axis=1)

In [24]:
BATCH_SIZE = 1024

# *** DATALOADERS ***
train_dataset = LoadDataset(X_3D_enc=X_train_3D_enc.copy(), 
                            X_3D_time_enc=X_train_3D_enc[:, :, TIME_POS].copy(),
                            X_3D_dec=X_train_3D_dec.copy(), 
                            X_3D_time_dec=X_train_3D_dec[:, :, TIME_POS].copy(),
                            y=y_train.copy())
train_dataloader = DataLoader(dataset=train_dataset, 
                              batch_size=BATCH_SIZE,
                              shuffle=True)

val_dataset = LoadDataset(X_3D_enc=X_val_3D_enc.copy(), 
                          X_3D_time_enc=X_val_3D_enc[:, :, TIME_POS].copy(),
                          X_3D_dec=X_val_3D_dec.copy(), 
                          X_3D_time_dec=X_val_3D_dec[:, :, TIME_POS].copy(),
                          y=y_val.copy())
val_dataloader = DataLoader(dataset=val_dataset, 
                            batch_size=BATCH_SIZE,
                            shuffle=False)

# TRAIN

In [25]:
EPOCHS = 1000
PATIENCE = 31
PATH = "model.pth"

model = Informer(enc_in=8, dec_in=8, c_out=24, out_len=1,
                 d_model=64, n_heads=8, e_layers=1, d_layers=1, d_ff=64, attn='prob').to(device)

optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=1e-3,
                             weight_decay=0)
# optimizer = t.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.33, patience=20, verbose=True)
# loss_fn = MeanAbsolutePercentageError().to(device)   # MeanAbsolutePercentageError(), L1Loss(), MSELoss()

!rm -rf "model.pth"

model_results = train(model=model, 
                      train_dataloader=train_dataloader,
                      val_dataloader=val_dataloader,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      epochs=EPOCHS,
                      patience=PATIENCE,
                      device=device,
                      path=PATH)

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 1/1000 | Loss: 0.2926 | Val loss: 18.8822 - *Checkpoint*
Epoch: 2/1000 | Loss: 0.1586 | Val loss: 16.6571 - *Checkpoint*
Epoch: 3/1000 | Loss: 0.1515 | Val loss: 16.4987 - *Checkpoint*
Epoch: 4/1000 | Loss: 0.1360 | Val loss: 13.8486 - *Checkpoint*
Epoch: 5/1000 | Loss: 0.1057 | Val loss: 12.0381 - *Checkpoint*
Epoch: 6/1000 | Loss: 0.1019 | Val loss: 10.7985 - *Checkpoint*
Epoch: 7/1000 | Loss: 0.0776 | Val loss: 9.2250 - *Checkpoint*
Epoch: 8/1000 | Loss: 0.0719 | Val loss: 7.4359 - *Checkpoint*
Epoch: 9/1000 | Loss: 0.0696 | Val loss: 7.7626
Epoch: 10/1000 | Loss: 0.0724 | Val loss: 7.0476 - *Checkpoint*
Epoch: 11/1000 | Loss: 0.0624 | Val loss: 7.0871
Epoch: 12/1000 | Loss: 0.0598 | Val loss: 6.9922 - *Checkpoint*
Epoch: 13/1000 | Loss: 0.0638 | Val loss: 6.6646 - *Checkpoint*
Epoch: 14/1000 | Loss: 0.0650 | Val loss: 6.8257
Epoch: 15/1000 | Loss: 0.0590 | Val loss: 6.8649
Epoch: 16/1000 | Loss: 0.0526 | Val loss: 5.9650 - *Checkpoint*
Epoch: 17/1000 | Loss: 0.0495 | Val los

# Inference

In [26]:
checkpoint = torch.load("model.pth")
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])

X_enc = torch.tensor(X_test_3D_enc.copy(), dtype=torch.float32).to(device)
X_time_enc = torch.tensor(X_test_3D_enc[:, :, TIME_POS].copy(), dtype=torch.float32).to(device)
X_dec = torch.tensor(X_test_3D_dec.copy(), dtype=torch.float32).to(device)
X_time_dec = torch.tensor(X_test_3D_dec[:, :, TIME_POS].copy(), dtype=torch.float32).to(device)

test_dataset = LoadDataset(X_3D_enc=X_enc, 
                           X_3D_time_enc=X_time_enc,
                           X_3D_dec=X_dec, 
                           X_3D_time_dec=X_time_dec,
                           y=y_test.copy())
test_dataloader = DataLoader(dataset=test_dataset, 
                             batch_size=BATCH_SIZE,
                             shuffle=False)

model.eval()
with torch.inference_mode():
  for batch, (X_enc, X_time_enc, X_dec, X_time_dec, Y) in enumerate(test_dataloader):
    X_enc, X_time_enc, X_dec, X_time_dec, Y = X_enc.to(device), X_time_enc.to(device), X_dec.to(device), X_time_dec.to(device), Y.to(device)
    test_batch_preds = model(x_enc=X_enc, x_mark_enc=X_time_enc, x_dec=X_dec, x_mark_dec=X_time_dec).squeeze()
    if batch == 0:
      test_preds_scaled = test_batch_preds
    else:
      test_preds_scaled = torch.cat((test_preds_scaled, test_batch_preds), dim=0)

print(test_preds_scaled.shape)

  self.X_enc = torch.tensor(X_3D_enc, dtype=torch.float32)
  self.X_time_enc = torch.tensor(X_3D_time_enc, dtype=torch.float32)
  self.X_dec = torch.tensor(X_3D_dec, dtype=torch.float32)
  self.X_time_dec = torch.tensor(X_3D_time_dec, dtype=torch.float32)


torch.Size([8737, 24])


In [27]:
test_preds_scaled = test_preds_scaled.to('cpu').numpy()
y_test_scaled = y_test.squeeze()

test_preds = np.zeros(test_preds_scaled.shape)
for i in range(test_preds.shape[1]):
  test_preds[:, i] = test_preds_scaled[:, i] * scaler.max_abs_[TARGET_POS]

y_test = np.zeros(y_test_scaled.shape)
for i in range(test_preds.shape[1]):
  y_test[:, i] = y_test_scaled[:, i] * scaler.max_abs_[TARGET_POS] 

In [28]:
mape_list = list()
step_results_dict = {}
for step in range(STEPS_FORWARD):
  step_results_df = pd.DataFrame(
      {
          "real": y_test[:, step],
          "predictions": test_preds[:, step]
      }
  )
  step_results_df['abs_error'] = abs(step_results_df['real'] - step_results_df['predictions'])
  step_results_df['ape'] = np.where(step_results_df['real'] == 0, np.NaN, 100 * step_results_df['abs_error']/step_results_df['real'])
  step_mape = step_results_df['ape'].mean()
  mape_list.append(step_mape)
  print(f"Step {step} -> MAPE = {step_mape}")

  step_results_dict[step] = step_results_df
mape = np.array(mape_list).mean()
print(f"\nMAPE = {mape}")

Step 0 -> MAPE = 2.729022637940422
Step 1 -> MAPE = 2.7720945149979106
Step 2 -> MAPE = 2.898093454732678
Step 3 -> MAPE = 2.956590204828603
Step 4 -> MAPE = 3.000225872679718
Step 5 -> MAPE = 3.057659713091322
Step 6 -> MAPE = 3.1544333045111115
Step 7 -> MAPE = 3.2269032648513645
Step 8 -> MAPE = 3.1734984255614584
Step 9 -> MAPE = 3.19854421566061
Step 10 -> MAPE = 3.2195345913018167
Step 11 -> MAPE = 3.319253351571106
Step 12 -> MAPE = 3.3234515303640584
Step 13 -> MAPE = 3.5262040928076024
Step 14 -> MAPE = 3.5979370098943653
Step 15 -> MAPE = 3.600307698343608
Step 16 -> MAPE = 3.6073483585332204
Step 17 -> MAPE = 3.698461838424507
Step 18 -> MAPE = 3.786883661349547
Step 19 -> MAPE = 3.8202653569921203
Step 20 -> MAPE = 3.781110481428837
Step 21 -> MAPE = 3.7662384423941986
Step 22 -> MAPE = 3.8075315420324127
Step 23 -> MAPE = 3.836756191207882

MAPE = 3.369097906479187
