The embedding representation of each vector in the time series is constructed from three embeddings:
- A projection: A vector projection of the current input
- Local time stamp embedding
- Global time stamp embedding

Position encoding is the local time stamp embedding. It only provides the ordering of data points. However, time series data needs to be more aware of context such as weekdays vs. weekends, holidays and special events. To encode this information, we include a global time stamp embedding that incorporates:
- Hierarchical time events (hours, minutes, seconds, day, week)
- Special events (holidays)

In [None]:
from torch import nn
import torch

# Projection

![Conv 1D](https://discuss.pytorch.org/uploads/default/original/3X/5/f/5faf64f4eb86dd37121774c720877b1d44f7f617.gif)

In [None]:
# Toy hyper-parameters used throughout the walkthrough.
batch_size = 2
n_heads = 1
c_in = 3  # number of features carried by each input time stamp
d_model = 4  # number of features each time stamp is projected to
c_out = d_model
sequence_length = 10
L_Q = L_K = L_V = sequence_length  # all three share the sequence length

# Random stand-in for a batch of series, shaped (batch_size, sequence_length, c_in).
x = torch.randn(batch_size, sequence_length, c_in)

In [None]:
x

tensor([[[ 0.6890, -0.9911, -1.8774],
         [ 0.1350, -0.5215, -0.2963],
         [ 0.5942, -0.8601,  1.0417],
         [-1.5380,  1.0780,  1.0357],
         [-0.5142, -0.0160, -0.6396],
         [-0.7799, -0.3531,  0.6835],
         [ 0.6796, -0.9898, -0.0750],
         [-0.7833,  0.4132, -0.3299],
         [-1.4058,  0.9521,  0.0413],
         [ 0.1728, -1.1039,  0.1382]],

        [[ 0.7377,  0.4342,  1.4384],
         [ 1.5556, -0.1158, -1.2052],
         [ 0.8334,  0.5213,  0.0516],
         [ 0.2976, -0.1361,  0.3186],
         [-1.3595,  1.2676,  0.5898],
         [-1.0110, -1.1203, -2.0545],
         [ 0.5777, -0.4785,  2.4316],
         [-1.7078, -0.4486,  0.2782],
         [ 1.0887, -0.0074, -0.2103],
         [ 0.3970,  1.3127,  0.2000]]])

In [None]:
x.shape

torch.Size([2, 10, 3])

In [None]:
# Token projection: a 1-D convolution along the time axis that maps the c_in
# features of every time stamp to d_model channels. kernel_size=3 with padding=1
# keeps the sequence length unchanged; circular padding wraps the two ends around.
tokenConv = nn.Conv1d(c_in, d_model, kernel_size=3, padding=1, padding_mode="circular")

In [None]:
# Conv1d wants (batch, channels, length), so swap the time and feature axes of x first.
projection = tokenConv(x.transpose(1, 2))

In [None]:
# Swap the axes back so the result is (batch, sequence_length, d_model).
# NOTE: this rebinds `projection`, so running the cell a second time undoes the swap.
projection = torch.transpose(projection, 1, 2)
projection.size()

torch.Size([2, 10, 4])

In [None]:
projection

tensor([[[ 0.7016,  0.6930,  0.3599, -1.0037],
         [ 0.8856,  1.1300,  1.1661, -0.4213],
         [ 0.5852,  1.2137, -0.2574,  0.4477],
         [-0.0773, -0.0470,  0.2788,  0.0732],
         [ 0.0610,  0.0740, -0.1659, -0.3673],
         [ 0.8265, -0.0173,  0.2261, -0.1117],
         [ 0.7131,  0.4631, -0.6574, -0.0875],
         [ 0.0146,  0.6671,  0.7218,  0.0400],
         [ 0.2215, -0.0841,  0.7647, -0.1248],
         [ 0.9970, -0.7928, -0.9952, -0.1394]],

        [[-0.1491, -0.7639, -1.0443, -0.2113],
         [-0.5387,  0.1373, -0.6228, -1.3294],
         [-0.0543,  0.8048,  0.5732, -0.2836],
         [-0.3113,  0.5853, -0.2341,  0.0650],
         [ 0.1387, -0.3098,  0.2476,  0.4547],
         [ 0.0370,  0.0759,  0.5109, -1.5504],
         [ 1.8057,  1.6076,  0.1172,  1.6382],
         [-0.1589, -0.7862, -0.3747, -1.2088],
         [ 0.5514,  0.4087, -0.3432, -0.2144],
         [-0.5906,  0.7147,  0.6666, -0.5975]]], grad_fn=<TransposeBackward0>)

# Local Timestamp

$$
PE(\text{position}, 2i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

$$
PE(\text{position}, 2i+1) = \cos\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

We can rewrite these as

$$
PE(\text{position}, i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{i}{d_{model}}} \bigg) \text{ when i is even}
$$

$$
PE(\text{position}, i) = \cos\bigg( \frac{ \text{position} }{10000^\frac{i-1}{d_{model}}} \bigg) \text{ when i is odd}
$$

In [None]:
# The positional table gets one row per position, up to the toy sequence length.
max_sequence_length = sequence_length

In [None]:
# Even embedding indices 0, 2, ... below d_model (the sine columns).
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
even_i

tensor([0., 2.])

In [None]:
# Denominator 10000^(i / d_model) for every even index i.
even_denominator = 10000 ** (even_i / d_model)
even_denominator

tensor([  1., 100.])

In [None]:
# Odd embedding indices 1, 3, ... below d_model (the cosine columns).
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
odd_i

tensor([1., 3.])

In [None]:
# Each odd index i reuses the exponent of the even index just before it,
# (i - 1) / d_model, so for an even d_model this equals even_denominator.
odd_denominator = 10000 ** ((odd_i - 1) / d_model)
odd_denominator

tensor([  1., 100.])

In [None]:
# Positions 0 .. max_sequence_length - 1 as a column vector, ready to broadcast
# against the row of denominators.
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)

In [None]:
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [None]:
# Dividing the position column by the denominator row broadcasts to one angle per
# (position, frequency) pair; sine feeds the even slots, cosine the odd slots.
even_PE = (position / even_denominator).sin()
odd_PE = (position / odd_denominator).cos()

In [None]:
even_PE, odd_PE

(tensor([[ 0.0000,  0.0000],
         [ 0.8415,  0.0100],
         [ 0.9093,  0.0200],
         [ 0.1411,  0.0300],
         [-0.7568,  0.0400],
         [-0.9589,  0.0500],
         [-0.2794,  0.0600],
         [ 0.6570,  0.0699],
         [ 0.9894,  0.0799],
         [ 0.4121,  0.0899]]),
 tensor([[ 1.0000,  1.0000],
         [ 0.5403,  0.9999],
         [-0.4161,  0.9998],
         [-0.9900,  0.9996],
         [-0.6536,  0.9992],
         [ 0.2837,  0.9988],
         [ 0.9602,  0.9982],
         [ 0.7539,  0.9976],
         [-0.1455,  0.9968],
         [-0.9111,  0.9960]]))

`torch.sin` expects input in radians

In [None]:
# The argument is 1 radian: sin(1 rad) ≈ 0.8415, whereas sin(1°) would be ≈ 0.0175.
torch.sin(torch.tensor([1]))

tensor([0.8415])

Interleave by column

In [None]:
# Interleave column-wise: sine values go to the even columns (0, 2, ...)
# and cosine values to the odd columns (1, 3, ...).
position_embedding = torch.empty(even_PE.size(0), even_PE.size(1) + odd_PE.size(1))
position_embedding[:, 1::2] = odd_PE
position_embedding[:, ::2] = even_PE

In [None]:
position_embedding

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996],
        [-0.7568, -0.6536,  0.0400,  0.9992],
        [-0.9589,  0.2837,  0.0500,  0.9988],
        [-0.2794,  0.9602,  0.0600,  0.9982],
        [ 0.6570,  0.7539,  0.0699,  0.9976],
        [ 0.9894, -0.1455,  0.0799,  0.9968],
        [ 0.4121, -0.9111,  0.0899,  0.9960]])

In [None]:
position_embedding.shape

torch.Size([10, 4])

# Global Timestamps

This can be learnable or fixed.

If fixed, we encode each hour instance (1, 2, 3, ..., 24) based on position and "fix" the embeddings so they are not learnable. This logic is similar to the previous section.

However, we can also make these embeddings learnable.

In [None]:
# Number of distinct values (embedding rows) for each calendar feature:
# hour of day, day of week, day of month, month of year.
hour_size, weekday_size, day_size, month_size = 24, 7, 31, 12

In [None]:
# One learnable lookup table per calendar feature, each row d_model wide.
# Tables are created in the order hour, weekday, day, month.
hour_embed, weekday_embed, day_embed, month_embed = (
    nn.Embedding(vocab_size, d_model)
    for vocab_size in (hour_size, weekday_size, day_size, month_size)
)

In [None]:
hour_embed.weight

Parameter containing:
tensor([[ 0.0370, -0.6790,  0.4507, -0.6564],
        [-0.2024,  0.5290, -1.0422, -0.7800],
        [ 0.0370,  0.1176,  0.9344,  0.5144],
        [ 1.8843, -1.4810, -0.4794,  0.0945],
        [-0.5156,  1.0703, -0.7393,  0.1494],
        [ 0.3453,  0.0355,  0.2714,  0.9125],
        [ 0.4752,  1.0350, -1.4702, -1.0391],
        [ 1.2276, -2.0311, -2.1486,  1.2287],
        [ 1.5426, -0.8114, -0.1439,  2.3367],
        [ 0.1095, -0.3748,  0.4521, -1.5516],
        [ 0.3600,  0.3866,  0.5277,  0.8853],
        [-1.6584,  0.4676,  0.6434,  2.6287],
        [-1.4121,  0.6812, -1.1155, -1.6690],
        [-0.4367,  1.0713, -1.4448,  0.6032],
        [-0.3468,  1.2167, -1.8978,  0.1680],
        [-0.1768,  1.1593,  0.3491,  1.2422],
        [-0.6108,  2.5675,  2.7137,  0.1728],
        [-0.5374,  0.6323,  1.0061,  1.0328],
        [-0.6617, -0.1247,  1.5724,  0.8489],
        [ 0.0042,  0.6643,  0.8872, -0.5905],
        [ 0.2833,  2.2927,  1.7785,  0.1190],
        [ 1.

In [None]:
# Draw one random calendar index per time stamp for each feature.
# torch.randint samples from [low, high) -- `high` is exclusive -- so the upper
# bound must be the vocabulary size itself; otherwise the last valid index
# (hour 23, weekday 6, day 30, month 11) could never be generated.
hours = torch.randint(0, hour_size, (batch_size, sequence_length, 1))
days = torch.randint(0, weekday_size, (batch_size, sequence_length, 1))
dates = torch.randint(0, day_size, (batch_size, sequence_length, 1))
months = torch.randint(0, month_size, (batch_size, sequence_length, 1))

# Concatenate these tensors along the last dimension, giving
# (batch_size, sequence_length, 4) with columns hour, weekday, day of month, month.
# NOTE: this rebinds `x`, which previously held the float input features passed to
# tokenConv; re-running the projection cell after this one would pick up these
# integer indices instead.
x = torch.cat((hours, days, dates, months), dim=2)

In [None]:
x

tensor([[[12,  5, 27,  2],
         [ 4,  1,  0, 10],
         [12,  2, 17,  4],
         [10,  4, 18,  7],
         [15,  0,  4,  4],
         [10,  3, 14,  4],
         [17,  2, 12,  3],
         [16,  4, 28,  1],
         [ 0,  1, 21,  4],
         [13,  0,  3,  7]],

        [[14,  4, 24,  1],
         [22,  0, 16,  2],
         [ 9,  5,  4,  7],
         [ 9,  0,  9,  8],
         [13,  4, 15,  4],
         [ 4,  5,  1,  3],
         [ 1,  0, 12,  8],
         [18,  0,  1,  5],
         [15,  4, 16,  5],
         [ 5,  1,  6,  8]]])

In [None]:
x.shape

torch.Size([2, 10, 4])

In [None]:
# Look up one d_model-wide vector per calendar feature. The last axis of x holds
# the indices in the order hour, weekday, day of month, month.
hour_x = hour_embed(x[..., 0])
weekday_x = weekday_embed(x[..., 1])
day_x = day_embed(x[..., 2])
month_x = month_embed(x[..., 3])

# The global time stamp embedding is the element-wise sum of the four lookups.
global_position_embedding = hour_x + weekday_x + day_x + month_x

In [None]:
global_position_embedding

tensor([[[-3.8396,  0.7981,  0.3770, -1.8705],
         [-3.5538,  2.0864, -0.3235, -0.5462],
         [-0.4161, -2.8516, -3.8787, -4.3991],
         [ 1.2439,  1.1240,  0.2222,  0.7697],
         [-4.5657,  0.4786, -1.9032,  1.7100],
         [ 1.1784,  1.8781, -0.4839,  1.4881],
         [-2.2563, -2.2930, -1.5085,  1.3123],
         [-2.7715,  3.6828,  2.2076,  0.0177],
         [-0.0285, -0.1002, -0.0610, -0.9076],
         [-3.6307,  1.3411, -1.5292, -0.2987]],

        [[-2.2606,  3.0897, -2.7189,  0.6964],
         [-6.4836, -0.6810,  0.7366,  1.0519],
         [-1.0193,  0.3355, -0.8943, -2.0160],
         [-4.5899, -1.5260, -2.7387,  0.8446],
         [-0.9623,  1.7616, -0.7617, -0.0379],
         [-2.3435,  3.0586, -0.0272,  1.2993],
         [-5.3289,  0.0267, -2.4150,  2.6881],
         [-4.6398,  1.1857,  2.1572,  3.4466],
         [-3.6502,  1.0433,  0.0401,  1.9094],
         [ 0.4882,  1.1521,  0.2325,  2.5636]]], grad_fn=<AddBackward0>)

In [None]:
global_position_embedding.shape

torch.Size([2, 10, 4])

We add the three embeddings together (the local position embedding is broadcast across the batch dimension):

In [None]:
global_position_embedding.shape, projection.shape, position_embedding.shape

(torch.Size([2, 10, 4]), torch.Size([2, 10, 4]), torch.Size([10, 4]))

In [None]:
# position_embedding is (sequence_length, d_model) and broadcasts over the batch
# dimension of the two (batch_size, sequence_length, d_model) tensors.
embedding = projection + position_embedding + global_position_embedding

In [None]:
embedding

tensor([[[-3.1381e+00,  2.4911e+00,  7.3693e-01, -1.8742e+00],
         [-1.8267e+00,  3.7567e+00,  8.5265e-01,  3.2484e-02],
         [ 1.0784e+00, -2.0541e+00, -4.1162e+00, -2.9516e+00],
         [ 1.3078e+00,  8.6951e-02,  5.3103e-01,  1.8424e+00],
         [-5.2614e+00, -1.0103e-01, -2.0291e+00,  2.3418e+00],
         [ 1.0460e+00,  2.1445e+00, -2.0775e-01,  2.3752e+00],
         [-1.8227e+00, -8.6978e-01, -2.1059e+00,  2.2230e+00],
         [-2.0999e+00,  5.1038e+00,  2.9993e+00,  1.0552e+00],
         [ 1.1824e+00, -3.2982e-01,  7.8360e-01, -3.5581e-02],
         [-2.2216e+00, -3.6277e-01, -2.4345e+00,  5.5785e-01]],

        [[-2.4097e+00,  3.3258e+00, -3.7632e+00,  1.4851e+00],
         [-6.1808e+00, -3.3520e-03,  1.2382e-01,  7.2241e-01],
         [-1.6435e-01,  7.2418e-01, -3.0107e-01, -1.2998e+00],
         [-4.7601e+00, -1.9307e+00, -2.9427e+00,  1.9092e+00],
         [-1.5804e+00,  7.9812e-01, -4.7408e-01,  1.4161e+00],
         [-3.2654e+00,  3.4182e+00,  5.3372e-01,  7.4

In [None]:
embedding.shape

torch.Size([2, 10, 4])

# Embedding Code

Let's take a look at the code in Informer