# Informer Notebook

In [2]:
# Load libs

import math
from math import e
import os
from operator import mod
from datetime import datetime, timedelta
from sys import platform
import argparse
import torch
from torch.nn.modules import transformer
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import pandas as pd
from runx.logx import logx
import pickle
from typing import List
from pandas.tseries.frequencies import to_offset
from pandas.tseries import offsets
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from math import sqrt

# libs
import evaluate
import libs.utils as utils
from libs.losses import LossHelper

# Notebook-wide default device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [2]:
# Args
# Run configuration, collected on a project-local `utils.DotDict` so values can
# be written and read with attribute syntax. In the cells shown here only
# `loss_type`, `lr`, `patience`, `epochs`, `frequency` and `test_date` are read
# back (when `train_manager` is built); the remaining entries are presumably
# consumed by code outside this view -- TODO confirm.
args = utils.DotDict()
args.arch = "informer"
args.loss_type = "sharpe"  # passed to LossHelper.get_loss_type below
args.stopping_type = "strategy"
args.filename = "futures_prop.csv"
args.frequency = "d"  # copied into train_manager['frequency']
args.start_date = "01/01/1990" # do not change
args.test_date = "01/01/2000" # do not change
args.end_date = "01/01/2005" # do not change
args.scaler = "none" # do not change
# NOTE(review): the key name `lead_tages` looks like a typo; its consumer is
# not visible here, so confirm the expected spelling before renaming.
args.lead_tages = "1" # do not change
args.win_len = 63
args.step = 63
args.epochs = 100
args.patience = 25
args.lr = 0.01
args.batch_size = 128
args.max_grad_norm = 1
args.dropout = 0.2
# The names below match keyword arguments of InformerEncoder.__init__
# (`d_model`, `attn`, `embed_type`, `factor`); the model construction itself is
# not in the cells shown -- presumably they are forwarded there.
args.n_layer = 2
args.d_hidden = 64
args.d_model = 32
args.n_head = 4
args.attn = 'prob'
args.embed_type = 'timeF'
args.factor = 5
args.do_log = False


In [3]:
# Load preprocessed data
# NOTE(security): unpickling executes code embedded in the file, so only load
# "data_tmp.p" when it was written by a trusted preprocessing run.
# The context manager closes the file handle as soon as the payload is read.
with open("data_tmp.p", "rb") as data_file:
    tmp = pickle.load(data_file)
train_iter = tmp['train_iter']
val_iter = tmp['val_iter']
val_df = tmp['val_df']
test_iter = tmp['test_iter']
test_df = tmp['test_df']

# Bundle everything the training loop needs into one dictionary.
loss_type = LossHelper.get_loss_type(args.loss_type)
train_manager = {
    # args (a DotDict is stored as-is; anything else is converted via vars())
    'args': args if isinstance(args, utils.DotDict) else vars(args),
    # loss
    'loss_label': args.loss_type,
    'loss_type': loss_type,
    'loss_fn': LossHelper.get_loss_function(loss_type),
    # learning
    'lr': args.lr,
    'patience': args.patience,
    'epochs': args.epochs,
    # data
    'frequency': args.frequency,
    'year_test': pd.to_datetime(args.test_date).year,
    # scaler
    'scaler_path': None
}

## Informer model

### I.1 Embeddings

In [4]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal position table.

    The table is built once for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``(1, max_len, d_model)``.

    Args:
        d_model: width of each position vector; odd widths are supported.
        max_len: number of positions pre-computed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        # Compute the positional encodings once in log space. A freshly created
        # tensor never requires grad, so no extra flag has to be set on it.
        pe = torch.zeros(max_len, d_model).float()

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float()
                    * -(math.log(10000.0) / d_model)).exp()
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so only the first d_model // 2 frequencies are used here.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the first ``x.size(1)`` rows of the table.

        Only the length of dimension 1 of ``x`` is read; its values are ignored.
        The result has shape ``(1, x.size(1), d_model)``.
        """
        return self.pe[:, :x.size(1)]


class TokenEmbedding(nn.Module):
    """Embed raw input channels with a width-3 circular 1-D convolution.

    Args:
        c_in: number of input channels (last dimension of the input).
        d_model: number of output channels.
    """

    def __init__(self, c_in, d_model):
        super(TokenEmbedding, self).__init__()
        padding = self._circular_padding()
        self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
                                   kernel_size=3, padding=padding, padding_mode='circular')
        nn.init.kaiming_normal_(
            self.tokenConv.weight, mode='fan_in', nonlinearity='leaky_relu')

    @staticmethod
    def _circular_padding(version=None):
        """Return the padding that keeps the sequence length for ``version``.

        The value is 1 from torch 1.5 on and 2 before. The release is compared
        numerically: comparing the raw strings would rank '1.10.0' below
        '1.5.0' and pick the wrong padding for torch 1.10-1.13.
        """
        raw = str(torch.__version__ if version is None else version)
        numbers = []
        # Drop a local build suffix such as "+cu118", then read major/minor.
        for piece in raw.split('+')[0].split('.')[:2]:
            digits = ''
            for ch in piece:
                if not ch.isdigit():
                    break
                digits += ch
            numbers.append(int(digits) if digits else 0)
        while len(numbers) < 2:
            numbers.append(0)
        return 1 if tuple(numbers) >= (1, 5) else 2

    def forward(self, x):
        """Map ``x`` of shape (B, L, c_in) to (B, L', d_model).

        The input is permuted to channels-first for the convolution and the
        result is transposed back to channels-last.
        """
        x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2)
        return x


class FixedEmbedding(nn.Module):
    """Lookup table with frozen sinusoidal weights.

    Args:
        c_in: number of rows (distinct integer codes) in the table.
        d_model: width of each embedding vector; odd widths are supported.
    """

    def __init__(self, c_in, d_model):
        super(FixedEmbedding, self).__init__()

        # A freshly created tensor never requires grad; the weights are frozen
        # below through nn.Parameter(..., requires_grad=False).
        w = torch.zeros(c_in, d_model).float()

        position = torch.arange(0, c_in).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float()
                    * -(math.log(10000.0) / d_model)).exp()
        angles = position * div_term

        w[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so only the first d_model // 2 frequencies are used here.
        w[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        self.emb = nn.Embedding(c_in, d_model)
        self.emb.weight = nn.Parameter(w, requires_grad=False)

    def forward(self, x):
        """Look up integer codes ``x``; the result is detached from autograd."""
        return self.emb(x).detach()


class TemporalEmbedding(nn.Module):
    """Sum of per-field embeddings for integer calendar codes.

    ``forward`` reads the last axis of its input as
    ``[month, day, weekday, hour, minute]`` (indices 0..4). Weekday, day and
    month tables always exist. The minute table is created only for
    ``freq == 't'`` and the hour table only for ``freq == 'h'``, so with
    ``freq == 't'`` the hour column is not embedded.

    Args:
        d_model: width of every embedding table.
        embed_type: ``'fixed'`` selects ``FixedEmbedding``; any other value
            selects a trainable ``nn.Embedding``.
        freq: data frequency code; decides which optional tables are built.
    """

    def __init__(self, d_model, embed_type='fixed', freq='h'):
        super().__init__()

        if embed_type == 'fixed':
            Embed = FixedEmbedding
        else:
            Embed = nn.Embedding

        # Table sizes: 4 quarter-hours, 24 hours, 7 weekdays, 32 day slots,
        # 13 month slots.
        if freq == 't':
            self.minute_embed = Embed(4, d_model)
        # See https://github.com/zhouhaoyi/Informer2020/issues/149
        if freq == 'h':
            self.hour_embed = Embed(24, d_model)
        self.weekday_embed = Embed(7, d_model)
        self.day_embed = Embed(32, d_model)
        self.month_embed = Embed(13, d_model)

    def forward(self, x):
        """Embed the calendar codes in ``x`` and add the per-field vectors."""
        codes = x.long()

        # Optional tables contribute 0. when they were not created.
        minute_table = getattr(self, 'minute_embed', None)
        hour_table = getattr(self, 'hour_embed', None)

        minute_x = 0. if minute_table is None else minute_table(codes[:, :, 4])
        # See https://github.com/zhouhaoyi/Informer2020/issues/149
        hour_x = 0. if hour_table is None else hour_table(codes[:, :, 3])
        weekday_x = self.weekday_embed(codes[:, :, 2])
        day_x = self.day_embed(codes[:, :, 1])
        month_x = self.month_embed(codes[:, :, 0])

        total = minute_x + hour_x
        total = total + weekday_x
        total = total + day_x
        return total + month_x


class TimeFeatureEmbedding(nn.Module):
    """Linear projection of continuous time features to ``d_model``.

    The number of input features is looked up from ``freq``; an unknown code
    raises ``KeyError``. ``embed_type`` is accepted but not used.
    """

    def __init__(self, d_model, embed_type='timeF', freq='h'):
        super().__init__()

        # Number of time-feature columns per frequency code.
        n_features_by_freq = dict(h=4, t=5, s=6, m=1, a=1, w=2, d=3, b=3)
        self.embed = nn.Linear(n_features_by_freq[freq], d_model)

    def forward(self, x):
        """Project the time-feature tensor ``x`` along its last axis."""
        projected = self.embed(x)
        return projected


class DataEmbedding(nn.Module):
    """Value + positional + temporal embedding followed by dropout.

    Args:
        c_in: number of input channels.
        d_model: embedding width.
        embed_type: ``'timeF'`` selects ``TimeFeatureEmbedding`` for the time
            marks; any other value is forwarded to ``TemporalEmbedding``.
        freq: frequency code forwarded to the temporal embedding.
        dropout: dropout probability applied to the summed embedding.
        only_encoder: when true, values are embedded with a plain linear layer
            (and a notice is printed) instead of ``TokenEmbedding``.
    """

    def __init__(self, c_in, d_model, embed_type='fixed', freq='d', dropout=0.1, only_encoder=False):
        super().__init__()
        self.only_encoder = only_encoder

        # SVEN
        if not self.only_encoder:
            self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        else:
            print("> Informer only encoding")
            self.value_embedding = nn.Linear(c_in, d_model)

        self.position_embedding = PositionalEmbedding(d_model=d_model)

        if embed_type == 'timeF':
            self.temporal_embedding = TimeFeatureEmbedding(
                d_model=d_model, embed_type=embed_type, freq=freq)
        else:
            self.temporal_embedding = TemporalEmbedding(
                d_model=d_model, embed_type=embed_type, freq=freq)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        """Embed values ``x`` and time marks ``x_mark`` and sum the parts."""
        # SVEN
        value_part = self.value_embedding(x)
        position_part = self.position_embedding(x_mark)
        temporal_part = self.temporal_embedding(x_mark)

        return self.dropout(value_part + position_part + temporal_part)

In [5]:
# --- --- ---
# informer_encoder.py
# Sven Giegerich / 19.05.2021
# --- --- ---

# Based on,
# Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H. and Zhang, W., 2020.
# Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting.
# arXiv preprint arXiv:2012.07436.


# Notebook
#from libs.models.embeddings import *

# --- ---
# ENCODER ONLY
# model.py
# --- ---

class InformerEncoder(nn.Module):
    """Encoder-only Informer: embedding, attention stack, linear projection.

    The encoder output is projected to ``c_out`` channels by ``self.decoder``
    and passed through the output activation that
    ``LossHelper.get_output_activation(loss_type)`` returns.

    Args:
        enc_in: number of input channels.
        c_out: number of output channels.
        loss_type: forwarded to ``LossHelper.get_output_activation``.
        factor: sampling factor forwarded to the attention class.
        d_model: model width.
        n_heads: number of attention heads.
        e_layers: number of encoder layers.
        d_ff: hidden width of the position-wise feed-forward part.
        dropout: dropout probability used throughout.
        attn: ``'prob'`` selects ``ProbCausalAttention``; anything else
            selects ``FullAttention``.
        embed_type: ``'simple'`` uses a linear layer plus
            ``SimplePositionalEncoding``; any other value is forwarded to
            ``DataEmbedding``.
        freq: frequency code forwarded to ``DataEmbedding``.
        activation: forwarded to every ``EncoderLayer``.
        output_attention: when true, ``forward`` also returns attention maps.
        distil: when true, ``e_layers - 1`` ``ConvLayer`` blocks are inserted.
        win_len: when given, an additive causal mask of that size is stored
            on ``self.enc_self_mask``; ``forward`` does not read it and uses
            only the mask passed as an argument.
    """

    name = 'informer'
    batch_first = True

    def __init__(self, enc_in, c_out, loss_type,
                 factor=5, d_model=512, n_heads=8, e_layers=3, d_ff=512,
                 dropout=0.0, attn='prob', embed_type='fixed', freq='d', activation='gelu',
                 output_attention=True, distil=False, win_len=None):
        super().__init__()

        # Keep the hyper-parameters for later inspection.
        self.enc_in, self.c_out = enc_in, c_out
        self.factor = factor
        self.d_model, self.n_heads = d_model, n_heads
        self.e_layers, self.d_ff = e_layers, d_ff
        self.dropout = dropout
        self.freq = freq
        self.attn = attn
        self.output_attention = output_attention
        self.win_len = win_len
        self.embed_type = embed_type
        self.distil = distil

        if self.win_len is not None:
            self.enc_self_mask = self.generate_causal_mask(win_len)

        # Embedding
        if self.embed_type != "simple":
            print("> Use data embedding")
            self.enc_data_embedding = DataEmbedding(
                c_in=enc_in, d_model=d_model, embed_type=embed_type, freq=freq,
                dropout=dropout, only_encoder=True)
        else:
            print("> Use simple positional encoding")
            self.enc_simple_embedding = nn.Linear(enc_in, d_model)
            self.pos_encoder = SimplePositionalEncoding(d_model=d_model)

        # Attention
        if attn == 'prob':
            Attn = ProbCausalAttention
        else:
            Attn = FullAttention

        print(f"> Using attention mechanism {Attn}")

        def make_layer():
            # Every layer gets its own attention module; masking is always on.
            inner = Attn(mask_flag=True, factor=factor,
                         attention_dropout=dropout, output_attention=output_attention)
            wrapped = AttentionLayer(inner, d_model, n_heads, mix=False)
            return EncoderLayer(wrapped, d_model, d_ff,
                                dropout=dropout, activation=activation)

        # Encoder
        attention_stack = [make_layer() for _ in range(e_layers)]
        if distil:
            conv_stack = [ConvLayer(d_model) for _ in range(e_layers - 1)]
        else:
            conv_stack = None
        self.encoder = Encoder(attention_stack, conv_stack,
                               norm_layer=torch.nn.LayerNorm(d_model))

        # SVEN
        # Decoder (aka projection)
        self.decoder = nn.Linear(d_model, c_out, bias=True)
        self.output_fn = LossHelper.get_output_activation(loss_type)

    def forward(self, src, x_mark_enc, enc_self_mask=None):
        """Run the model.

        Returns ``(out, attns)`` when ``output_attention`` is set, otherwise
        only ``out``.
        """
        if self.embed_type != "simple":
            # opt2: data embedding of values and time marks
            emb = self.enc_data_embedding(src, x_mark_enc)
        else:
            # opt1: simple pos encoding (original attention is all you need)
            emb = self.pos_encoder(self.enc_simple_embedding(src))

        enc_out, attns = self.encoder(emb, attn_mask=enc_self_mask)

        # SVEN: leave out the decoder part
        out = self.output_fn(self.decoder(enc_out))

        return (out, attns) if self.output_attention else out

    def generate_causal_mask(self, size):
        """Return a ``(size, size)`` float mask: 0.0 on and below the diagonal, -inf above."""
        visible = torch.tril(torch.ones(size, size)) == 1
        return torch.zeros(size, size).masked_fill(~visible, float('-inf'))

# --- ---
# encoder.py
# --- ---


class ConvLayer(nn.Module):
    """Distilling block: circular conv, batch norm, ELU, then stride-2 max pooling.

    The convolution uses ``padding=2`` with a kernel of 3, so it lengthens the
    sequence by two steps before the pooling roughly halves it.
    """

    def __init__(self, c_in):
        super().__init__()
        self.downConv = nn.Conv1d(c_in, c_in, kernel_size=3,
                                  padding=2, padding_mode='circular')
        self.norm = nn.BatchNorm1d(c_in)
        self.activation = nn.ELU()
        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        """Apply the block to channels-last ``x`` and return channels-last output."""
        channels_first = x.permute(0, 2, 1)
        pooled = self.maxPool(
            self.activation(self.norm(self.downConv(channels_first))))
        return pooled.transpose(1, 2)


class EncoderLayer(nn.Module):
    """Self-attention plus position-wise feed-forward with post-layer-norm.

    Args:
        attention: module called as ``attention(x, x, x, attn_mask=...)`` that
            returns ``(output, attention_map)``.
        d_model: model width.
        d_ff: hidden width of the feed-forward part; a falsy value means
            ``4 * d_model``.
        dropout: dropout probability.
        activation: ``"relu"`` selects ``F.relu``; anything else ``F.gelu``.
    """

    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
        super().__init__()
        hidden_width = d_ff if d_ff else 4 * d_model
        self.attention = attention
        # Kernel-size-1 convolutions act as per-position linear layers.
        self.conv1 = nn.Conv1d(d_model, hidden_width, kernel_size=1)
        self.conv2 = nn.Conv1d(hidden_width, d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        if activation == "relu":
            self.activation = F.relu
        else:
            self.activation = F.gelu

    def forward(self, x, attn_mask=None):
        """Return ``(output, attention_map)`` for input ``x`` of shape [B, L, D]."""
        attended, attn = self.attention(x, x, x, attn_mask=attn_mask)
        residual = self.norm1(x + self.dropout(attended))

        # Feed-forward part runs channels-first and is transposed back.
        hidden = self.conv1(residual.transpose(-1, 1))
        hidden = self.dropout(self.activation(hidden))
        projected = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm2(residual + projected), attn


class Encoder(nn.Module):
    """Stack of attention layers with optional conv layers in between.

    Args:
        attn_layers: layers called as ``layer(x, attn_mask=...)`` returning
            ``(x, attention_map)``.
        conv_layers: optional layers applied after each paired attention
            layer; the last attention layer then runs without a conv layer.
        norm_layer: optional module applied to the final output.
    """

    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super().__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        if conv_layers is None:
            self.conv_layers = None
        else:
            self.conv_layers = nn.ModuleList(conv_layers)
        self.norm = norm_layer

    def forward(self, x, attn_mask=None):
        """Return ``(output, attention_maps)`` for ``x`` of shape [B, L, D]."""
        attention_maps = []
        if self.conv_layers is None:
            for layer in self.attn_layers:
                x, layer_attn = layer(x, attn_mask=attn_mask)
                attention_maps.append(layer_attn)
        else:
            # zip stops at the shorter list; the final attention layer is then
            # applied on its own.
            for layer, conv in zip(self.attn_layers, self.conv_layers):
                x, layer_attn = layer(x, attn_mask=attn_mask)
                x = conv(x)
                attention_maps.append(layer_attn)
            x, layer_attn = self.attn_layers[-1](x, attn_mask=attn_mask)
            attention_maps.append(layer_attn)

        if self.norm is None:
            return x, attention_maps
        return self.norm(x), attention_maps


class EncoderStack(nn.Module):
    """Run several encoders on progressively shorter tails of the input.

    Encoder ``i`` receives the last ``L // 2**inp_lens[i]`` steps of the
    input; the outputs are concatenated along the sequence axis.
    """

    def __init__(self, encoders, inp_lens):
        super().__init__()
        self.encoders = nn.ModuleList(encoders)
        self.inp_lens = inp_lens

    def forward(self, x, attn_mask=None):
        """Return ``(concatenated_output, attention_list)``.

        ``attn_mask`` is accepted but not passed on to the encoders.
        """
        # x [B, L, D]
        seq_len = x.shape[1]
        outputs, attention_list = [], []
        for exponent, encoder in zip(self.inp_lens, self.encoders):
            tail_len = seq_len // (2 ** exponent)
            enc_out, enc_attn = encoder(x[:, -tail_len:, :])
            outputs.append(enc_out)
            attention_list.append(enc_attn)

        return torch.cat(outputs, -2), attention_list

# --- ---
# decoder.py
# --- ---


class DecoderLayer(nn.Module):
    """Self-attention, cross-attention and feed-forward, each with post-norm.

    Args:
        self_attention: module called as ``(x, x, x, attn_mask=...)``.
        cross_attention: module called as ``(x, cross, cross, attn_mask=...)``.
            Only element ``[0]`` of each attention result is used.
        d_model: model width.
        d_ff: hidden width of the feed-forward part; a falsy value means
            ``4 * d_model``.
        dropout: dropout probability.
        activation: ``"relu"`` selects ``F.relu``; anything else ``F.gelu``.
    """

    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
                 dropout=0.1, activation="relu"):
        super().__init__()
        hidden_width = d_ff if d_ff else 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        # Kernel-size-1 convolutions act as per-position linear layers.
        self.conv1 = nn.Conv1d(d_model, hidden_width, kernel_size=1)
        self.conv2 = nn.Conv1d(hidden_width, d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        if activation == "relu":
            self.activation = F.relu
        else:
            self.activation = F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        """Return the decoded tensor for target ``x`` and encoder memory ``cross``."""
        self_out = self.self_attention(x, x, x, attn_mask=x_mask)[0]
        x = self.norm1(x + self.dropout(self_out))

        cross_out = self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
        residual = self.norm2(x + self.dropout(cross_out))

        hidden = self.conv1(residual.transpose(-1, 1))
        hidden = self.dropout(self.activation(hidden))
        projected = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm3(residual + projected)


class Decoder(nn.Module):
    """Apply decoder layers in sequence, then an optional final norm."""

    def __init__(self, layers, norm_layer=None):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        """Thread ``x`` through every layer, passing ``cross`` and both masks along."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, cross, x_mask=x_mask, cross_mask=cross_mask)

        if self.norm is None:
            return hidden
        return self.norm(hidden)

# --- ---
# embed.py
# --- ---

# see separate file libs.models.embeddings.py

# --- ---
# attn.py
# --- ---


class FullAttention(nn.Module):
    """Scaled dot-product attention with an optional causal mask.

    Args:
        mask_flag: when true, masked positions are set to -inf before softmax.
        factor: accepted but not used by this class.
        scale: multiplier applied to the scores; a falsy value means
            ``1 / sqrt(E)``, with ``E`` the last dimension of the queries.
        attention_dropout: dropout probability on the attention weights.
        output_attention: when true, the weights are returned as well.
    """

    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super().__init__()
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, queries, keys, values, attn_mask):
        """Return ``(context, weights_or_None)``.

        ``queries`` is unpacked as (B, L, H, E). ``attn_mask`` must expose a
        boolean ``.mask`` attribute; when it is ``None`` and masking is on, a
        ``TriangularCausalMask`` is built for the query length.
        """
        batch, q_len, _, head_dim = queries.shape
        temperature = self.scale or 1. / sqrt(head_dim)

        logits = torch.einsum("blhe,bshe->bhls", queries, keys)
        if self.mask_flag:
            mask_obj = attn_mask
            if mask_obj is None:
                mask_obj = TriangularCausalMask(batch, q_len, device=queries.device)
            logits.masked_fill_(mask_obj.mask, -np.inf)

        weights = self.dropout(torch.softmax(temperature * logits, dim=-1))
        context = torch.einsum("bhls,bshd->blhd", weights, values).contiguous()

        returned_weights = weights if self.output_attention else None
        return (context, returned_weights)


class ProbCausalAttention(nn.Module):
    """ProbSparse attention expressed as a dense, causally masked score matrix.

    The top-``u`` queries (ranked by a sampled sparsity measure) get their
    real dot-product scores; every other query row starts from the constant
    ``1 / L_Q``. The matrix is then masked and soft-maxed like full attention.

    Args:
        mask_flag: when true, masked positions are set to -inf before softmax.
        factor: constant ``c`` in the ``c * ceil(ln(L))`` budgets for sampled
            keys and selected queries.
        scale: score multiplier; a falsy value means ``1 / sqrt(D)``.
        attention_dropout: builds ``self.dropout``, which ``forward`` does not
            apply.
        output_attention: when true, the attention matrix is returned as well.
    """

    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbCausalAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
        """Score the ``n_top`` most informative queries against all keys.

        Args:
            Q: queries, shape [B, H, L_Q, E].
            K: keys, shape [B, H, L_K, E].
            sample_k: number of randomly sampled keys per query.
            n_top: number of queries to keep.

        Returns:
            ``(Q_K, M_top)``: scores of shape [B, H, n_top, L_K] and the
            selected query indices of shape [B, H, n_top].
        """
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape

        # calculate the sampled Q_K
        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        # real U = U_part(factor*ln(L_k))*L_q
        index_sample = torch.randint(L_K, (L_Q, sample_k))
        K_sample = K_expand[:, :, torch.arange(
            L_Q).unsqueeze(1), index_sample, :]
        # Squeeze only the singleton axis added by unsqueeze(-2). A bare
        # squeeze() would also drop batch/head/sample axes of size 1 and
        # corrupt the shapes of everything computed below.
        Q_K_sample = torch.matmul(
            Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)

        # find the Top_k query with sparsity measurement
        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
        M_top = M.topk(n_top, sorted=False)[1]

        # use the reduced Q to calculate Q_K
        Q_reduce = Q[torch.arange(B)[:, None, None],
                     torch.arange(H)[None, :, None],
                     M_top, :]  # factor*ln(L_q)
        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k

        return Q_K, M_top

    def forward(self, queries, keys, values, attn_mask):
        """Return ``(context, attention_or_None)``.

        ``queries``/``keys``/``values`` are unpacked as (B, L, H, D). The
        selected rows are written into a (B, H, L_Q, L_Q) matrix, so the key
        length has to equal the query length. ``attn_mask`` must expose a
        boolean ``.mask``; when it is ``None`` and masking is on, a
        ``TriangularCausalMask`` is built.
        """
        B, L_Q, H, D = queries.shape
        _, L_K, _, _ = keys.shape

        queries = queries.transpose(2, 1)
        keys = keys.transpose(2, 1)
        values = values.transpose(2, 1)

        U_part = self.factor * \
            np.ceil(np.log(L_K)).astype('int').item()  # c*ln(L_k)
        u = self.factor * \
            np.ceil(np.log(L_Q)).astype('int').item()  # c*ln(L_q)

        # Never sample or select more than what exists.
        U_part = U_part if U_part < L_K else L_K
        u = u if u < L_Q else L_Q

        scores_top, index = self._prob_QK(
            queries, keys, sample_k=U_part, n_top=u)

        # add scale factor
        scale = self.scale or 1./sqrt(D)
        scores_top = scores_top * scale

        # NEW: START ----

        # uniform distr as starter; created on the device of the inputs
        # instead of a notebook-level global, so the module works wherever its
        # inputs live.
        scores = (torch.ones([B, H, L_Q, L_Q], device=scores_top.device) /
                  L_Q).to(scores_top.dtype)
        # ... overwrite the rows of the selected queries with their real scores
        scores[torch.arange(B)[:, None, None],
               torch.arange(H)[None, :, None],
               index, :] = scores_top

        # causal mask
        if self.mask_flag:
            if attn_mask is None:
                attn_mask = TriangularCausalMask(B, L_Q, device=queries.device)

            scores.masked_fill_(attn_mask.mask, -np.inf)

        # tmp: plus dropout? not in prob, but in full... don't think necessary
        # NOTE(review): `scale` was already applied to scores_top above, so the
        # selected rows are scaled twice here. Left unchanged to keep the
        # numerics of existing runs -- confirm whether this is intended.
        A = torch.softmax(scale * scores, dim=-1)
        V = torch.einsum("bhls,bhsd->blhd", A, values)

        # NEW: END ----

        if self.output_attention:
            return (V.contiguous(), A)
        else:
            return (V.contiguous(), None)


class ProbAttention(nn.Module):
    """ProbSparse self-attention.

    Only the top-``u`` queries (ranked by a sampled sparsity measure) attend
    to the keys. Every other query keeps an initial context: the mean of the
    values without masking, or their running cumulative sum with masking.

    Args:
        mask_flag: when true, a ``ProbMask`` hides future keys for the
            selected queries and the initial context is a cumulative sum.
        factor: constant ``c`` in the ``c * ceil(ln(L))`` budgets for sampled
            keys and selected queries.
        scale: score multiplier; a falsy value means ``1 / sqrt(D)``.
        attention_dropout: builds ``self.dropout``, which this class does not
            apply.
        output_attention: when true, a dense attention map is returned too.
    """

    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top):  # n_top: c*ln(L_q)
        """Score the ``n_top`` most informative queries against all keys.

        Args:
            Q: queries, shape [B, H, L_Q, E].
            K: keys, shape [B, H, L_K, E].
            sample_k: number of randomly sampled keys per query.
            n_top: number of queries to keep.

        Returns:
            ``(Q_K, M_top)``: scores of shape [B, H, n_top, L_K] and the
            selected query indices of shape [B, H, n_top].
        """
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape

        # calculate the sampled Q_K
        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        # real U = U_part(factor*ln(L_k))*L_q
        index_sample = torch.randint(L_K, (L_Q, sample_k))
        K_sample = K_expand[:, :, torch.arange(
            L_Q).unsqueeze(1), index_sample, :]
        # Squeeze only the singleton axis added by unsqueeze(-2). A bare
        # squeeze() would also drop batch/head/sample axes of size 1 and
        # corrupt the shapes of everything computed below.
        Q_K_sample = torch.matmul(
            Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)

        # find the Top_k query with sparsity measurement
        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
        M_top = M.topk(n_top, sorted=False)[1]

        # use the reduced Q to calculate Q_K
        Q_reduce = Q[torch.arange(B)[:, None, None],
                     torch.arange(H)[None, :, None],
                     M_top, :]  # factor*ln(L_q)
        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))  # factor*ln(L_q)*L_k

        return Q_K, M_top

    def _get_initial_context(self, V, L_Q):
        """Build the context used for queries that are not selected.

        Without masking every query gets the mean of ``V`` over the sequence
        axis; with masking the context is the cumulative sum of ``V``, which
        requires ``L_Q == L_V``.
        """
        B, H, L_V, D = V.shape

        if not self.mask_flag:
            # V_sum = V.sum(dim=-2)
            V_sum = V.mean(dim=-2)
            contex = V_sum.unsqueeze(-2).expand(B, H,
                                                L_Q, V_sum.shape[-1]).clone()
        else:  # use mask
            # requires that L_Q == L_V, i.e. for self-attention only
            assert(L_Q == L_V)
            contex = V.cumsum(dim=-2)

        return contex

    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
        """Overwrite the context rows of the selected queries.

        ``context_in`` is modified in place. The incoming ``attn_mask`` is not
        read: with masking on it is replaced by a ``ProbMask`` built from
        ``index``.

        Returns:
            ``(context_in, attns)`` where ``attns`` is a dense
            [B, H, L_V, L_V] map when ``output_attention`` is set, else None.
        """
        B, H, L_V, D = V.shape

        if self.mask_flag:
            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        attn = torch.softmax(scores, dim=-1)  # nn.Softmax(dim=-1)(scores)

        context_in[torch.arange(B)[:, None, None],
                   torch.arange(H)[None, :, None],
                   index, :] = torch.matmul(attn, V).type_as(context_in)
        if self.output_attention:
            # SVEN: add uniform distribution
            attns = (torch.ones([B, H, L_V, L_V]) /
                     L_V).type_as(attn).to(attn.device)

            # SVEN: probably just for visz. -> more substantive for context
            # ... add mask here!
            # ... but also necessary somewhere else? as this only affects display
            # The mask is built on V's device rather than on a notebook-level
            # global, so it always matches `attns`.
            causal_mask = TriangularCausalMask(
                B, L_Q, device=V.device).mask[0, 0]
            attns.masked_fill_(causal_mask, 0)  # broadcasting
            # END SVEN

            # SVEN: add the actual attention values for the sparse entries
            attns[torch.arange(B)[:, None, None], torch.arange(H)[
                None, :, None], index, :] = attn
            return (context_in, attns)
        else:
            return (context_in, None)

    def forward(self, queries, keys, values, attn_mask):
        """Return ``(context, attention_or_None)``.

        ``queries``/``keys``/``values`` are unpacked as (B, L, H, D); the
        context is returned in the same layout.
        """
        B, L_Q, H, D = queries.shape
        _, L_K, _, _ = keys.shape

        queries = queries.transpose(2, 1)
        keys = keys.transpose(2, 1)
        values = values.transpose(2, 1)

        U_part = self.factor * \
            np.ceil(np.log(L_K)).astype('int').item()  # c*ln(L_k)
        u = self.factor * \
            np.ceil(np.log(L_Q)).astype('int').item()  # c*ln(L_q)

        # Never sample or select more than what exists.
        U_part = U_part if U_part < L_K else L_K
        u = u if u < L_Q else L_Q

        scores_top, index = self._prob_QK(
            queries, keys, sample_k=U_part, n_top=u)

        # add scale factor
        scale = self.scale or 1./sqrt(D)
        scores_top = scores_top * scale
        # get the context
        context = self._get_initial_context(values, L_Q)
        # update the context with selected top_k queries
        context, attn = self._update_context(
            context, values, scores_top, index, L_Q, attn_mask)

        return context.transpose(2, 1).contiguous(), attn


class AttentionLayer(nn.Module):
    """Multi-head wrapper: project q/k/v, run the inner attention, project out.

    Args:
        attention: inner module called as
            ``attention(queries, keys, values, attn_mask)`` with tensors
            shaped (B, L, H, -1); it returns ``(output, attention_map)``.
        d_model: model width.
        n_heads: number of heads.
        d_keys: per-head key/query width; a falsy value means
            ``d_model // n_heads``.
        d_values: per-head value width; a falsy value means
            ``d_model // n_heads``.
        mix: when true, the inner output has axes 1 and 2 swapped before the
            heads are flattened.
    """

    def __init__(self, attention, d_model, n_heads,
                 d_keys=None, d_values=None, mix=False):
        super().__init__()

        per_head = d_model // n_heads
        key_width = d_keys if d_keys else per_head
        value_width = d_values if d_values else per_head

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, key_width * n_heads)
        self.key_projection = nn.Linear(d_model, key_width * n_heads)
        self.value_projection = nn.Linear(d_model, value_width * n_heads)
        self.out_projection = nn.Linear(value_width * n_heads, d_model)
        self.n_heads = n_heads
        self.mix = mix

    def forward(self, queries, keys, values, attn_mask):
        """Return ``(projected_output, attention_map)``."""
        batch, q_len, _ = queries.shape
        _, kv_len, _ = keys.shape
        heads = self.n_heads

        # Split the projected features into heads.
        q_heads = self.query_projection(queries).view(batch, q_len, heads, -1)
        k_heads = self.key_projection(keys).view(batch, kv_len, heads, -1)
        v_heads = self.value_projection(values).view(batch, kv_len, heads, -1)

        out, attn = self.inner_attention(q_heads, k_heads, v_heads, attn_mask)
        if self.mix:
            out = out.transpose(2, 1).contiguous()
        merged = out.view(batch, q_len, -1)

        return self.out_projection(merged), attn

# --- ---
# utils masking
# --- ---


class TriangularCausalMask():
    """Boolean mask of shape [B, 1, L, L] that is True strictly above the diagonal."""

    def __init__(self, B, L, device="cpu"):
        with torch.no_grad():
            all_true = torch.ones([B, 1, L, L], dtype=torch.bool)
            strictly_upper = torch.triu(all_true, diagonal=1)
            self._mask = strictly_upper.to(device)

    @property
    def mask(self):
        """The mask tensor built in the constructor."""
        return self._mask


class ProbMask():
    """Causal mask rows for the queries selected by ProbSparse attention.

    A strictly-upper-triangular [L, S] mask (``S = scores.shape[-1]``) is
    expanded over batch and heads; the rows named by ``index`` are gathered
    and viewed with the shape of ``scores``.
    """

    def __init__(self, B, H, L, index, scores, device="cpu"):
        n_keys = scores.shape[-1]
        future = torch.ones(L, n_keys, dtype=torch.bool).to(device).triu(1)
        per_head = future[None, None, :].expand(B, H, L, n_keys)

        batch_idx = torch.arange(B)[:, None, None]
        head_idx = torch.arange(H)[None, :, None]
        selected = per_head[batch_idx, head_idx, index, :].to(device)

        self._mask = selected.view(scores.shape).to(device)

    @property
    def mask(self):
        """The mask tensor built in the constructor."""
        return self._mask

# --- ---
# utils timefeatures
# --- ---


class TimeFeature:
    """Base class for callables that map a DatetimeIndex to one numeric feature."""

    def __init__(self):
        """Nothing to initialise."""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        """Placeholder: the base class returns ``None``; subclasses override this."""
        return None

    def __repr__(self):
        return f"{self.__class__.__name__}()"


class SecondOfMinute(TimeFeature):
    """Second of minute encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 0..59 -> 0..1, then centre on zero.
        fraction = index.second / 59.0
        return fraction - 0.5


class MinuteOfHour(TimeFeature):
    """Minute of hour encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 0..59 -> 0..1, then centre on zero.
        fraction = index.minute / 59.0
        return fraction - 0.5


class HourOfDay(TimeFeature):
    """Hour of day encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 0..23 -> 0..1, then centre on zero.
        fraction = index.hour / 23.0
        return fraction - 0.5


class DayOfWeek(TimeFeature):
    """Day of week encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 0..6 -> 0..1, then centre on zero.
        fraction = index.dayofweek / 6.0
        return fraction - 0.5


class DayOfMonth(TimeFeature):
    """Day of month encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 1..31 -> 0..30 -> 0..1, then centre on zero.
        zero_based = index.day - 1
        return zero_based / 30.0 - 0.5


class DayOfYear(TimeFeature):
    """Day of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 1..366 -> 0..365 -> 0..1, then centre on zero.
        zero_based = index.dayofyear - 1
        return zero_based / 365.0 - 0.5


class MonthOfYear(TimeFeature):
    """Month of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # 1..12 -> 0..11 -> 0..1, then centre on zero.
        zero_based = index.month - 1
        return zero_based / 11.0 - 0.5


class WeekOfYear(TimeFeature):
    """Week of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # isocalendar() yields a DataFrame whose `week` column is a pandas
        # Series with a nullable integer dtype. Convert it to a plain float64
        # array (missing timestamps become NaN) so the result stacks cleanly
        # with the other features in np.vstack.
        week = index.isocalendar().week.to_numpy(dtype=np.float64, na_value=np.nan)
        # ISO weeks 1..53 -> 0..52 -> 0..1, then centre on zero.
        return (week - 1) / 52.0 - 0.5


def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Build the time-feature extractors that fit a pandas frequency string.

    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    Returns
    -------
    A list of freshly instantiated feature objects for the first offset type
    in the table below that the parsed frequency is an instance of.

    Raises
    ------
    RuntimeError
        If the parsed frequency matches none of the supported offset types.
    """

    # Checked in insertion order; the first matching offset type wins.
    features_by_offsets = {
        offsets.YearEnd: [],
        offsets.QuarterEnd: [MonthOfYear],
        offsets.MonthEnd: [MonthOfYear],
        offsets.Week: [DayOfMonth, WeekOfYear],
        offsets.Day: [DayOfWeek, DayOfMonth, DayOfYear],
        offsets.BusinessDay: [DayOfWeek, DayOfMonth, DayOfYear],
        offsets.Hour: [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear],
        offsets.Minute: [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear],
        offsets.Second: [SecondOfMinute, MinuteOfHour, HourOfDay, DayOfWeek,
                         DayOfMonth, DayOfYear],
    }

    offset = to_offset(freq_str)

    matched = next(
        (classes for kind, classes in features_by_offsets.items()
         if isinstance(offset, kind)),
        None,
    )
    # An empty list (yearly data) is a valid match, so test against None.
    if matched is not None:
        return [feature_cls() for feature_cls in matched]

    supported_freq_msg = f"""
    Unsupported frequency {freq_str}
    The following frequencies are supported:
        Y   - yearly
            alias: A
        M   - monthly
        W   - weekly
        D   - daily
        B   - business days
        H   - hourly
        T   - minutely
            alias: min
        S   - secondly
    """
    raise RuntimeError(supported_freq_msg)


def time_features(dates, timeenc=1, freq='h'):
    """
    Build a 2-D array of calendar features from the `date` column of `dates`.

    If `timeenc` is 0, raw integer calendar fields are returned and `freq`
    (case-insensitive) selects which ones:
    * y - []
    * m - [month]
    * w - [month]
    * d - [month, day, weekday]
    * b - [month, day, weekday]
    * h - [month, day, weekday, hour]
    * t - [month, day, weekday, hour, *minute]

    If `timeenc` is 1, `freq` is handed to `time_features_from_frequency_str`
    and a similar, but different list of values is supported (all encoded
    between [-0.5 and 0.5]):
    * Q - [month]
    * M - [month]
    * W - [Day of month, week of year]
    * D - [Day of week, day of month, day of year]
    * B - [Day of week, day of month, day of year]
    * H - [Hour of day, day of week, day of month, day of year]
    * T - [Minute of hour*, hour of day, day of week, day of month, day of year]
    * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year]

    *minute returns a number from 0-3 corresponding to the 15 minute period it falls into.

    Parameters
    ----------
    dates
        DataFrame with a `date` column. It is not modified.
    timeenc
        0 for raw calendar fields, 1 for the scaled encodings.
    freq
        Frequency code, interpreted as described above.

    Returns
    -------
    np.ndarray
        Array with one row per entry of `dates` and one column per feature.
        A frequency with no features gives zero columns.

    Raises
    ------
    KeyError
        If `timeenc` is 0 and `freq` is not one of the codes listed for it.
    ValueError
        If `timeenc` is neither 0 nor 1.
    """
    if timeenc == 0:
        freq_map = {
            'y': [], 'm': ['month'], 'w': ['month'], 'd': ['month', 'day', 'weekday'],
            'b': ['month', 'day', 'weekday'], 'h': ['month', 'day', 'weekday', 'hour'],
            't': ['month', 'day', 'weekday', 'hour', 'minute'],
        }
        selected = freq_map[freq.lower()]

        # Derive the fields in a separate frame so the caller's DataFrame is
        # left untouched, and use the vectorised `.dt` accessor instead of a
        # per-row `apply`.
        stamps = pd.to_datetime(dates.date)
        calendar = pd.DataFrame({
            'month': stamps.dt.month,
            'day': stamps.dt.day,
            'weekday': stamps.dt.weekday,
            'hour': stamps.dt.hour,
            'minute': stamps.dt.minute // 15,  # quarter-hour bucket, 0-3
        })
        return calendar[selected].values

    if timeenc == 1:
        stamps = pd.to_datetime(dates.date.values)
        encoded = [feat(stamps) for feat in time_features_from_frequency_str(freq)]
        if not encoded:
            # np.vstack rejects an empty list; return a (rows, 0) array instead.
            return np.empty((len(stamps), 0))
        return np.vstack(encoded).transpose(1, 0)

    # Previously any other value fell through and silently returned None.
    raise ValueError(f"Unsupported timeenc {timeenc!r}; expected 0 or 1")

# Run 

## Build model

In [6]:
print(f"(3) Build model: {args.arch}")

# input / output widths, handed to the encoder as enc_in / c_out
d_input, d_output = 8, 1

# sequence settings
win_len = args.win_len
freq = 'd'  # daily

# attention settings
attn = args.attn  # 'full' or 'prob'
factor = args.factor  # factor to sample for prob attention
output_attention = True

# layer settings
d_ff = args.d_model
embed_type = args.embed_type  # could be changed to learnable
# distilling switch (original note: if n_layer > 1, each successive layer
# will be reduced by 2)
do_distil = False

model = InformerEncoder(
    enc_in=d_input,
    c_out=d_output,
    factor=factor,
    loss_type=train_manager['loss_type'],
    d_model=args.d_model,
    n_heads=args.n_head,
    e_layers=args.n_layer,
    d_ff=d_ff,
    dropout=args.dropout,
    attn=attn,
    embed_type=embed_type,
    freq=freq,
    output_attention=output_attention,
    distil=do_distil,
    win_len=win_len,
)

(3) Build model: informer
> Use data embedding
> Informer only encoding
> Using attention mechanism <class '__main__.ProbCausalAttention'>


## Train model

In [7]:
def train(model, train_iter, val_iter, train_manager, do_log=False, val_df=None):
    """Fit `model` with Adam and early stopping.

    Parameters
    ----------
    model
        Network to train. It is moved to `device` and cast to double precision.
    train_iter, val_iter
        Batch iterables used for optimisation and for validation.
    train_manager : dict
        Training configuration. This function reads 'lr', 'patience', 'epochs'
        and ['args']['stopping_type'], and stores the optimizer it creates
        under 'optimizer' (which `run_epoch` reads).
    do_log : bool
        If True, report through `logx`; otherwise print to stdout.
    val_df
        Forwarded to `evaluate_iter` as `base_df` for the strategy loss.

    Returns
    -------
    tuple
        `(early_stopping.path, -early_stopping.best_score)`.
    """
    model = model.to(device).double()

    # train manager ----
    train_manager['optimizer'] = torch.optim.Adam(
        model.parameters(), lr=train_manager['lr'])

    # NOTE(review): path=None is passed here and the same attribute is returned
    # below, so presumably no checkpoint file is written — confirm against
    # utils.EarlyStopping.
    early_stopping = utils.EarlyStopping(
        patience=train_manager['patience'], path=None, verbose=True)

    # Which validation loss drives early stopping is fixed for the whole run.
    stop_on_strategy = train_manager['args']['stopping_type'] == 'strategy'

    # run training ----
    for epoch_i in range(train_manager['epochs']):
        epoch_loss = run_epoch(
            model=model, train_iter=train_iter, train_manager=train_manager, epoch_i=epoch_i, do_log=do_log)

        # batch loss and strategy loss on the validation data
        val_loss = evaluate_iter(
            model=model, data_iter=val_iter, train_manager=train_manager)
        val_str_loss = evaluate_iter(
            model=model, data_iter=val_iter, train_manager=train_manager, do_strategy=True, base_df=val_df)

        # verb ----
        epoch_print = f">> Train Epoch {epoch_i + 1}\t -- avg --\t train batch loss: {epoch_loss:.6f}\t val batch loss: {val_loss:.6f} \t  val strategy loss: {val_str_loss:.6f}"
        if do_log:
            logx.msg(epoch_print)
            logx.add_scalar("Loss/val", val_loss, epoch_i)

            metrics_train = {'loss': epoch_loss}
            metrics_val = {'loss': val_loss}
            # to be extended
            logx.metric(phase='train', metrics=metrics_train,
                        epoch=epoch_i + 1)
            logx.metric(phase='val', metrics=metrics_val,
                        epoch=epoch_i + 1)
        else:
            print(epoch_print)

        # early stopping ----
        checkpoint_loss = val_str_loss if stop_on_strategy else val_loss
        early_stopping(checkpoint_loss, model)

        if early_stopping.early_stop:
            print("> Early stopping")
            break

    # The stored best score is negated to report it as a loss.
    best_val_loss = -early_stopping.best_score
    return (early_stopping.path, best_val_loss)


def run_epoch(model, train_iter, train_manager, epoch_i=None, do_log=False):
    """Run one optimisation pass over `train_iter`.

    Parameters
    ----------
    model
        Network to optimise; put into training mode here. When `model.name`
        is 'informer' the batch's 'time_embd' entry is passed as a second input.
    train_iter
        Sized iterable of batches with keys 'inp', 'trg' and 'rts' (plus
        'time_embd' for the informer).
    train_manager : dict
        Supplies 'optimizer', 'loss_fn', 'loss_type', 'frequency' and
        ['args']['max_grad_norm']; 'loss_label' and 'year_test' are read only
        when logging.
    epoch_i : int or None
        Zero-based epoch index. When None, per-batch progress is not reported.
    do_log : bool
        If True, report through `logx`; otherwise print to stdout.

    Returns
    -------
    float
        Mean of the per-batch losses, weighted by batch size.
    """
    model.train()

    optimizer = train_manager['optimizer']
    loss_fn = train_manager['loss_fn']
    max_grad_norm = train_manager['args']['max_grad_norm']

    loss_epoch = np.zeros((len(train_iter), 2))  # batch loss & batch size
    for i, batch in enumerate(train_iter):
        inputs = batch['inp'].double().to(device)
        labels = batch['trg'].double().to(device)
        returns = batch['rts'].double().to(device)

        optimizer.zero_grad()

        # time embedding?
        if model.name == 'informer':
            inputs_time_embd = batch['time_embd'].double().to(device)
            # the second output (attention) is not needed for training
            prediction, _ = model(inputs, inputs_time_embd)
        else:
            prediction = model(inputs)

        # LossHelper decides whether the loss is computed from returns or labels.
        if LossHelper.use_returns_for_loss(train_manager['loss_type']):
            loss = loss_fn(prediction, returns,
                           freq=train_manager['frequency'])
        else:
            loss = loss_fn(prediction, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), max_norm=max_grad_norm)
        optimizer.step()

        # Pull the scalar out as a Python float: the loss tensor still carries
        # autograd state (and may live on the GPU), so it must not be placed
        # into a NumPy array or handed to the logger directly.
        loss_value = loss.item()
        loss_epoch[i] = (loss_value, batch['inp'].shape[0])

        # log results (every fifth batch)
        if epoch_i is not None and i % 5 == 0:
            print_msg = f">> Train Epoch {epoch_i+1}\t batch {i}\t train batch loss: {loss_value:.6f}"
            if do_log:
                writer_path = f"Loss/train/{train_manager['loss_label']}/{train_manager['year_test']}"
                logx.add_scalar(writer_path, loss_value, epoch_i *
                                len(train_iter) + i)
                logx.msg(print_msg)
            else:
                print(print_msg)

    mean_loss_epoch = np.average(loss_epoch[:, 0], weights=loss_epoch[:, 1])
    return mean_loss_epoch


def evaluate_iter(model, data_iter, train_manager, do_strategy=False, base_df=None, do_log=False):
    """Evaluate `model` on `data_iter` with either the batch loss or the strategy loss.

    The strategy loss is computed only when `do_strategy` is truthy and
    `base_df` is given; in every other case the call is delegated to
    `evaluate.evaluate_model`, which is the only path that uses `do_log`.
    """
    wants_strategy_loss = do_strategy and base_df is not None
    if not wants_strategy_loss:
        # batch loss ----
        return evaluate.evaluate_model(model, data_iter, train_manager, do_log=do_log)

    # strategy loss ----
    run_args = train_manager['args']
    loss_type = train_manager['loss_type']

    # The 'prs' block only supplies shape, index and column labels for the predictions.
    skeleton = base_df.swaplevel(axis=1)['prs']
    realized = base_df.xs('rts_scaled', axis=1, level=1, drop_level=True)

    # NOTE(review): win_step is fed from 'win_len', not 'step' — confirm this is intended.
    predicted = evaluate.calc_predictions_df(
        model,
        data_iter,
        df_shape=skeleton.shape,
        df_index=skeleton.index,
        df_insts=skeleton.columns,
        win_step=run_args['win_len'],
        scaler=run_args['scaler'],
        loss_type=loss_type,
    )
    held_positions = evaluate.calc_position_df(predicted, loss_type)
    strategy_rts = utils.calc_strategy_returns(
        positions=held_positions,
        realized_returns=realized,
        aggregate_by='time',
        lead=1,
    )

    strategy_loss_fn = LossHelper.get_strategy_loss_function(loss_type)
    return strategy_loss_fn(strategy_rts)

In [8]:
# Train the model; returns the early-stopping path and the best validation loss.
best_checkpoint_path, val_loss = train(
    model=model,
    train_iter=train_iter,
    val_iter=val_iter,
    train_manager=train_manager,
    do_log=args.do_log,
    val_df=val_df,
)

>> Train Epoch 1	 batch 0	 train batch loss: 0.148552
>> Train Epoch 1	 -- avg --	 train batch loss: -0.451244	 val batch loss: -0.377727 	  val strategy loss: -1.493124
>> Train Epoch 2	 batch 0	 train batch loss: -0.613084
>> Train Epoch 2	 -- avg --	 train batch loss: -0.797094	 val batch loss: -0.297252 	  val strategy loss: -1.205903
EarlyStopping counter: 1 out of 25
>> Train Epoch 3	 batch 0	 train batch loss: -0.765849
>> Train Epoch 3	 -- avg --	 train batch loss: -0.827781	 val batch loss: -0.223587 	  val strategy loss: -0.866911
EarlyStopping counter: 2 out of 25
>> Train Epoch 4	 batch 0	 train batch loss: -0.770731
>> Train Epoch 4	 -- avg --	 train batch loss: -0.920287	 val batch loss: -0.239677 	  val strategy loss: -1.052359
EarlyStopping counter: 3 out of 25
>> Train Epoch 5	 batch 0	 train batch loss: -1.156718
>> Train Epoch 5	 -- avg --	 train batch loss: -1.006408	 val batch loss: -0.239015 	  val strategy loss: -1.084095
EarlyStopping counter: 4 out of 25
>> Tra

## Test loss

In [9]:
# Score the test set with the same kind of loss that drove early stopping:
# the strategy loss needs the test frame, the batch loss does not.
use_strategy_loss = train_manager['args']['stopping_type'] == 'strategy'
test_loss = evaluate_iter(
    model=model,
    data_iter=test_iter,
    train_manager=train_manager,
    do_log=False,
    do_strategy=use_strategy_loss,
    base_df=test_df if use_strategy_loss else None,
)

print(f">> Val loss: {val_loss:.6f}")
print(f">> Test loss: {test_loss:.6f}")

>> Val loss: -6.441751
>> Test loss: -4.840829


In [3]:
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open('input_numpy.p', 'rb') as input_file:
    input_numpy = pickle.load(input_file)
input_numpy

array([[[[ 1.11567950e+00, -1.95681203e-01, -1.33816270e-01, ...,
          -1.13808535e+00,  7.67775147e-03,  2.40350032e+00],
         [ 1.10041239e-01, -2.24016349e-01, -1.21578649e-01, ...,
          -1.14331353e+00, -6.01067508e-02,  2.28199018e+00],
         [ 6.70466110e-01, -1.92969808e-01, -1.29305755e-01, ...,
          -1.11940434e+00, -1.06596147e-01,  2.19545070e+00],
         ...,
         [ 6.73579553e-01,  1.56944757e-01,  2.56580209e-01, ...,
           1.07252786e+00,  1.22037084e+00,  2.08981886e+00],
         [ 1.07694106e+00,  2.25401951e-01,  3.13038486e-01, ...,
           1.15111636e+00,  1.28490407e+00,  2.14408475e+00],
         [ 2.83211307e-01,  1.89892875e-01,  3.02244238e-01, ...,
           1.23068953e+00,  1.35303324e+00,  2.20201345e+00]],

        [[-6.30127649e-01, -6.71802831e-02, -2.10861652e-01, ...,
          -1.18601675e+00, -1.73146586e+00, -1.78324939e+00],
         [-4.22851950e-01, -7.87189291e-02, -2.91015962e-01, ...,
          -1.20167858e