<a href="https://colab.research.google.com/github/alam156/ClaudeAPI/blob/main/7-Transformers-Timeseries/Transformers_for_timeseries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Transformers for Timeseries

Click to run on colab (if you're not already there): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/charlesollion/dlexperiments/blob/master/7-Transformers-Timeseries/Transformers_for_timeseries.ipynb)

The goal of this notebook is to illustrate the use of a transformer for timeseries prediction.
This notebook was built by Alice Martin and adapted to pytorch by Charles Ollion

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from math import sqrt


In [21]:
import torch
import pandas as pd

file_path = "sample_data/Time_series_trend_latest.xlsx"
df = pd.read_csv("sample_data/Time_series_trend_latest.csv")

time_series = df.drop(columns=["Topic"])

train_data = time_series.iloc[:, :15]
test_data  = time_series.iloc[:, 15:]

n_topics = df.shape[0]
train_len = train_data.shape[1]
test_len  = test_data.shape[1]
sequence_length = 15

n_heads = 1
batch_size = n_topics
d_model = train_len
d_k = d_v = d_q = d_model // n_heads

L_Q = test_len
L_K = L_V = train_len

# Initialize tensors
Q = torch.randn((batch_size, L_Q, n_heads, d_q))
K = torch.randn((batch_size, L_K, n_heads, d_k))
V = torch.randn((batch_size, L_V, n_heads, d_v))

print("Q shape:", Q.shape)
print("K shape:", K.shape)
print("V shape:", V.shape)


Q shape: torch.Size([235, 10, 1, 15])
K shape: torch.Size([235, 15, 1, 15])
V shape: torch.Size([235, 15, 1, 15])


In [22]:
Q = Q.transpose(2, 1)
K = K.transpose(2, 1)
V = V.transpose(2, 1)
Q.shape, K.shape, V.shape

(torch.Size([235, 1, 10, 15]),
 torch.Size([235, 1, 15, 15]),
 torch.Size([235, 1, 15, 15]))

In [23]:
factor = 2 # multiplier
L_K_bar = factor * np.ceil(np.log(L_K)).astype('int').item() # U_part = factor * ln(L_k)
L_Q_bar= factor * np.ceil(np.log(L_Q)).astype('int').item() # u = factor * ln(L_q)
L_Q_bar, L_K_bar

(6, 6)

In [24]:
L_K_bar = L_K_bar if L_K_bar < L_K else L_K
L_Q_bar = L_Q_bar if L_Q_bar < L_Q else L_Q
L_Q_bar, L_K_bar

(6, 6)

In [25]:
B, H, L_K, E = K.shape
_, _, L_Q, _ = Q.shape
# unsqueeze adds dimension, expand with reshape
K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
K_expand

tensor([[[[[-0.1385,  0.1425, -2.1280,  ...,  0.3710,  0.5906,  0.1651],
           [ 0.7314,  0.2787, -0.7112,  ..., -1.1612,  0.8487, -0.1126],
           [ 0.8237,  0.3499, -0.8011,  ..., -0.1334,  0.9969, -1.6088],
           ...,
           [-0.5413, -0.1085, -1.2366,  ...,  0.6914, -0.2345,  0.7737],
           [-1.5802, -1.2534, -0.2690,  ...,  1.5410, -1.2427,  0.5872],
           [-0.6719,  0.0726, -1.3977,  ...,  1.7629,  1.6383,  1.5785]],

          [[-0.1385,  0.1425, -2.1280,  ...,  0.3710,  0.5906,  0.1651],
           [ 0.7314,  0.2787, -0.7112,  ..., -1.1612,  0.8487, -0.1126],
           [ 0.8237,  0.3499, -0.8011,  ..., -0.1334,  0.9969, -1.6088],
           ...,
           [-0.5413, -0.1085, -1.2366,  ...,  0.6914, -0.2345,  0.7737],
           [-1.5802, -1.2534, -0.2690,  ...,  1.5410, -1.2427,  0.5872],
           [-0.6719,  0.0726, -1.3977,  ...,  1.7629,  1.6383,  1.5785]],

          [[-0.1385,  0.1425, -2.1280,  ...,  0.3710,  0.5906,  0.1651],
           [ 0.

In [26]:
K_expand.shape

torch.Size([235, 1, 10, 15, 15])

In [27]:
index_sample = torch.randint(L_K, (L_Q, L_K_bar))
index_sample

tensor([[ 5, 10,  5,  4,  5,  8],
        [10,  5,  4,  2, 14, 10],
        [10, 14,  3,  8,  5,  6],
        [ 0, 12,  5,  4,  2, 13],
        [ 2, 12, 12,  2,  2, 14],
        [14, 13, 12,  2,  8, 10],
        [ 1,  9,  8,  8,  1, 14],
        [12, 11,  2,  2,  6,  3],
        [ 9,  3,  9,  2,  9,  1],
        [10,  9, 10, 12,  9, 10]])

In [28]:
index_sample.shape

torch.Size([10, 6])

In [29]:
K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :]
K_sample.shape

torch.Size([235, 1, 10, 6, 15])

In [30]:
K_sample

tensor([[[[[ 1.6946, -0.9932,  1.5565,  ..., -0.4926, -1.6964, -0.3402],
           [ 0.5332,  0.7952, -0.1971,  ..., -0.8434, -0.7659, -1.7385],
           [ 1.6946, -0.9932,  1.5565,  ..., -0.4926, -1.6964, -0.3402],
           [ 0.7850, -0.2959,  1.9206,  ...,  0.3618, -0.6039,  0.4218],
           [ 1.6946, -0.9932,  1.5565,  ..., -0.4926, -1.6964, -0.3402],
           [-0.4209,  0.3484, -0.5484,  ...,  0.1914, -0.7825, -0.0902]],

          [[ 0.5332,  0.7952, -0.1971,  ..., -0.8434, -0.7659, -1.7385],
           [ 1.6946, -0.9932,  1.5565,  ..., -0.4926, -1.6964, -0.3402],
           [ 0.7850, -0.2959,  1.9206,  ...,  0.3618, -0.6039,  0.4218],
           [ 0.8237,  0.3499, -0.8011,  ..., -0.1334,  0.9969, -1.6088],
           [-0.6719,  0.0726, -1.3977,  ...,  1.7629,  1.6383,  1.5785],
           [ 0.5332,  0.7952, -0.1971,  ..., -0.8434, -0.7659, -1.7385]],

          [[ 0.5332,  0.7952, -0.1971,  ..., -0.8434, -0.7659, -1.7385],
           [-0.6719,  0.0726, -1.3977,  ...,  1

In [31]:
Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)
Q_K_sample.shape

torch.Size([235, 1, 10, 6])

In [32]:
Q.unsqueeze(-2).shape, K_sample.transpose(-2, -1).shape

(torch.Size([235, 1, 10, 1, 15]), torch.Size([235, 1, 10, 15, 6]))

In [33]:
Q_K_sample

tensor([[[[  1.4774,  -1.4753,   1.4774,  -0.6899,   1.4774,   0.4373],
          [ -1.7246,   2.3890,   1.1878,   1.3215,   2.4799,  -1.7246],
          [ -0.0270,   1.8621,  -5.5745,  -2.7849,  -2.7600,   2.5996],
          ...,
          [  1.7102,   4.9081,  -5.4689,  -5.4689,  -1.7787,   4.3789],
          [ -0.0990,  -0.7856,  -0.0990,   0.1794,  -0.0990,  -3.8030],
          [  0.3818,   4.4912,   0.3818,  -2.8382,   4.4912,   0.3818]]],


        [[[ -1.6907,   0.2329,  -1.6907,  -1.8096,  -1.6907,   1.1009],
          [  0.9123,  11.5897,  -3.9585,   7.4826,  -5.4881,   0.9123],
          [  4.2796,   8.7262,   6.6345,   1.4833,  -0.3722,   0.1979],
          ...,
          [  6.4168,   2.6012,  -6.0772,  -6.0772,  -1.4624,  -2.6726],
          [ -4.1134,  -2.9533,  -4.1134,   7.7848,  -4.1134,   0.9102],
          [  0.2318,  -1.1959,   0.2318,  -1.1392,  -1.1959,   0.2318]]],


        [[[  0.0521,   0.7359,   0.0521,   2.6375,   0.0521,  -2.0821],
          [ -0.6485,   5.7

In [35]:
M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
M

tensor([[[ 1.2971,  2.2180,  3.0453,  ...,  5.0227,  0.4931,  4.0053]],

        [[ 1.4707, 10.8263,  7.3296,  ...,  6.9016,  8.2247,  0.4208]],

        [[ 2.5410,  5.7292,  5.5563,  ...,  1.9977,  3.5546,  7.0862]],

        ...,

        [[ 8.6741,  7.8931,  4.0457,  ...,  2.7834, 11.6750,  3.9798]],

        [[ 6.8253,  2.5153,  0.6452,  ...,  0.3637,  3.2309,  0.9774]],

        [[ 3.6965,  4.1324,  2.6688,  ...,  9.6274,  1.1876,  1.1305]]])

In [36]:
M.shape

torch.Size([235, 1, 10])

In [37]:
M.topk(L_Q_bar, sorted=False)

torch.return_types.topk(
values=tensor([[[ 4.0053,  5.0227,  7.3107,  4.5413,  3.0453,  2.2180]],

        [[10.7870, 10.8263,  7.3296,  8.2247,  7.1265,  6.9016]],

        [[ 7.0862,  7.8085,  5.5563,  5.7292,  5.1567,  3.6756]],

        ...,

        [[ 6.3623,  7.8931, 11.6750,  8.6741,  5.9578,  4.0457]],

        [[ 3.9718,  6.8253,  4.9187,  2.6752,  3.2309,  2.6139]],

        [[ 4.1324,  9.6274,  5.3424,  5.7446,  8.2569,  3.6965]]]),
indices=tensor([[[9, 7, 5, 3, 2, 1]],

        [[5, 1, 2, 8, 3, 7]],

        [[9, 5, 2, 1, 3, 6]],

        ...,

        [[5, 1, 8, 0, 3, 2]],

        [[3, 0, 5, 6, 8, 4]],

        [[1, 7, 5, 3, 4, 0]]]))

In [38]:
M_top = M.topk(L_Q_bar, sorted=False)[1]
M_top.shape

torch.Size([235, 1, 6])

In [39]:
M_top

tensor([[[9, 7, 5, 3, 2, 1]],

        [[5, 1, 2, 8, 3, 7]],

        [[9, 5, 2, 1, 3, 6]],

        ...,

        [[5, 1, 8, 0, 3, 2]],

        [[3, 0, 5, 6, 8, 4]],

        [[1, 7, 5, 3, 4, 0]]])

In [40]:
Q_bar = Q[
    torch.arange(B)[:, None, None],
    torch.arange(H)[None, :, None],
    M_top,
    :
]
Q_bar


tensor([[[[ 8.1922e-01,  1.2751e+00, -5.4858e-01,  ...,  5.5828e-01,
           -2.6888e-02,  6.3163e-01],
          [-1.0390e+00, -8.5943e-01,  4.0301e-01,  ..., -6.0559e-01,
           -1.0225e+00,  1.2821e+00],
          [ 1.3517e+00,  8.6803e-02, -8.9822e-01,  ..., -1.3790e+00,
            1.4990e+00, -4.8140e-02],
          [-1.1080e+00,  3.4137e-01, -1.3931e-01,  ...,  5.5880e-01,
            7.5094e-01,  1.6012e+00],
          [ 3.6871e-01,  1.8229e+00,  3.4029e-01,  ...,  7.8406e-01,
            3.5086e-01,  1.0581e+00],
          [ 1.1077e+00,  1.5478e-01,  4.1912e-01,  ..., -1.5073e+00,
            2.4996e+00, -9.7012e-01]]],


        [[[ 8.8081e-01,  1.7107e+00,  1.2481e-01,  ...,  1.5005e+00,
           -8.6354e-01, -1.2609e+00],
          [-5.5116e-01, -1.4709e+00, -2.8202e+00,  ...,  1.2571e+00,
           -2.1224e+00, -4.6314e-01],
          [ 8.2542e-01, -2.9278e-01,  8.2035e-01,  ..., -3.8617e-02,
            4.9656e-01, -1.2081e+00],
          [-6.9143e-01,  7.1428e-

In [41]:
Q_bar.size()

torch.Size([235, 1, 6, 15])

In [42]:
Q_bar_K = torch.matmul(Q_bar, K.transpose(-2, -1))
Q_bar_K

tensor([[[[  0.1003,   0.0986,   0.6302,  ...,  -2.8382,  -1.6464,  -1.1526],
          [  0.3217,  -4.3644,  -5.4689,  ...,   1.7102,   6.7519,   5.1630],
          [  3.0600,   3.1597,   5.5152,  ...,  -3.4497,   1.3881,  -2.6642],
          [  2.8954,   0.8689,  -1.9803,  ...,   2.3314,   5.0307,   7.1257],
          [  1.9558,  -1.3510,   1.4455,  ...,  -2.8207,   4.2489,   1.8621],
          [  0.9939,   5.7911,   1.3215,  ...,  -4.2350,  -8.4762,   2.4799]]],


        [[[ -0.2727,   2.2570,   1.6573,  ...,  -0.2524,  11.6164,  -4.1018],
          [ -5.0313,   7.9114,   7.4826,  ..., -11.3083,   2.4997,  -5.4881],
          [ -5.6688,  -2.8657,  -1.2773,  ...,   1.2374,  -4.8581,   8.7262],
          [ -7.2938,   0.9102,   7.7848,  ...,  -1.0043,  13.3260,  -3.0465],
          [  7.4492,   2.7453,   2.5324,  ...,  -6.5289,  -6.4449,   2.2755],
          [  1.6089,   3.0783,  -6.0772,  ...,   6.4168,  -9.0227,  -2.0094]]],


        [[[ -4.4793,  -5.5568,   2.2275,  ...,   7.2676,

In [43]:
Q_bar.shape, K.shape, Q_bar_K.shape

(torch.Size([235, 1, 6, 15]),
 torch.Size([235, 1, 15, 15]),
 torch.Size([235, 1, 6, 15]))

In [44]:
Q_bar_K = 1./sqrt(d_q) * Q_bar_K
Q_bar_K

tensor([[[[ 0.0259,  0.0255,  0.1627,  ..., -0.7328, -0.4251, -0.2976],
          [ 0.0831, -1.1269, -1.4121,  ...,  0.4416,  1.7433,  1.3331],
          [ 0.7901,  0.8158,  1.4240,  ..., -0.8907,  0.3584, -0.6879],
          [ 0.7476,  0.2243, -0.5113,  ...,  0.6020,  1.2989,  1.8399],
          [ 0.5050, -0.3488,  0.3732,  ..., -0.7283,  1.0971,  0.4808],
          [ 0.2566,  1.4952,  0.3412,  ..., -1.0935, -2.1885,  0.6403]]],


        [[[-0.0704,  0.5828,  0.4279,  ..., -0.0652,  2.9993, -1.0591],
          [-1.2991,  2.0427,  1.9320,  ..., -2.9198,  0.6454, -1.4170],
          [-1.4637, -0.7399, -0.3298,  ...,  0.3195, -1.2544,  2.2531],
          [-1.8833,  0.2350,  2.0100,  ..., -0.2593,  3.4408, -0.7866],
          [ 1.9234,  0.7088,  0.6539,  ..., -1.6858, -1.6641,  0.5875],
          [ 0.4154,  0.7948, -1.5691,  ...,  1.6568, -2.3296, -0.5188]]],


        [[[-1.1565, -1.4348,  0.5751,  ...,  1.8765,  0.5988,  0.4344],
          [ 0.2975,  0.2834, -0.0409,  ..., -1.6722,  2.

In [45]:
attn = torch.softmax(Q_bar_K, dim=-1)
attn

tensor([[[[0.0571, 0.0570, 0.0654,  ..., 0.0267, 0.0363, 0.0413],
          [0.0333, 0.0099, 0.0075,  ..., 0.0476, 0.1751, 0.1162],
          [0.0772, 0.0792, 0.1455,  ..., 0.0144, 0.0501, 0.0176],
          [0.0853, 0.0505, 0.0242,  ..., 0.0737, 0.1480, 0.2542],
          [0.1022, 0.0435, 0.0896,  ..., 0.0298, 0.1847, 0.0997],
          [0.0699, 0.2414, 0.0761,  ..., 0.0181, 0.0061, 0.1027]]],


        [[[0.0224, 0.0430, 0.0368,  ..., 0.0225, 0.4820, 0.0083],
          [0.0056, 0.1576, 0.1411,  ..., 0.0011, 0.0390, 0.0050],
          [0.0076, 0.0157, 0.0236,  ..., 0.0452, 0.0094, 0.3122],
          [0.0031, 0.0258, 0.1521,  ..., 0.0157, 0.6359, 0.0093],
          [0.1475, 0.0438, 0.0414,  ..., 0.0040, 0.0041, 0.0388],
          [0.0644, 0.0942, 0.0089,  ..., 0.2229, 0.0041, 0.0253]]],


        [[[0.0112, 0.0085, 0.0634,  ..., 0.2330, 0.0649, 0.0551],
          [0.0618, 0.0610, 0.0441,  ..., 0.0086, 0.3755, 0.1659],
          [0.0169, 0.0621, 0.0577,  ..., 0.0714, 0.0750, 0.0246],
  

In [46]:
attn.shape

torch.Size([235, 1, 6, 15])

In [47]:
V, V.shape

(tensor([[[[ 1.3668e+00, -1.1333e-01, -9.0307e-01,  ...,  8.3144e-01,
            -9.4416e-01, -2.1511e+00],
           [ 4.1982e-02, -1.4635e+00, -1.8549e-01,  ...,  9.0694e-01,
            -2.7729e-01, -5.0540e-01],
           [ 1.0823e+00, -1.2729e+00,  8.6191e-01,  ..., -1.3788e+00,
             9.2381e-01,  9.3861e-01],
           ...,
           [-5.1791e-01,  9.9309e-01, -2.4220e-01,  ...,  1.0046e+00,
            -1.4214e+00, -2.2463e+00],
           [-3.7707e-01, -1.3439e+00,  1.6943e+00,  ...,  1.7683e+00,
            -7.3450e-01,  7.1978e-01],
           [ 4.8704e-02, -7.7979e-01,  9.9322e-01,  ...,  9.8583e-01,
            -9.6970e-01, -1.5515e-02]]],
 
 
         [[[ 1.3401e+00, -1.0626e+00, -1.6482e-01,  ..., -3.6580e-01,
             3.1917e-01,  3.7530e-01],
           [-1.3137e+00, -7.6958e-01,  1.2053e+00,  ..., -9.5253e-01,
            -6.8109e-02,  3.6678e-01],
           [ 1.7549e-01, -6.0995e-01,  2.4386e-01,  ...,  4.0443e-01,
            -9.9386e-01,  1.4269e+00

In [48]:
V_mean = V.mean(dim=-2)
V_mean, V_mean.shape

(tensor([[[-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343]],
 
         [[ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808]],
 
         [[-0.3431, -0.0419, -0.0539,  ...,  0.1275, -0.0250,  0.2118]],
 
         ...,
 
         [[-0.1949,  0.1612, -0.1717,  ..., -0.1957,  0.3209, -0.2750]],
 
         [[-0.0603,  0.0586,  0.3604,  ...,  0.2444, -0.3976,  0.0568]],
 
         [[ 0.4078,  0.7270,  0.2699,  ..., -0.0653,  0.3744,  0.1110]]]),
 torch.Size([235, 1, 15]))

In [49]:
values = V_mean.unsqueeze(-2).expand(B, H, L_Q, V_mean.shape[-1]).clone()
values

tensor([[[[-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          [-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          [-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          ...,
          [-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          [-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          [-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343]]],


        [[[ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808],
          [ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808],
          [ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808],
          ...,
          [ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808],
          [ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808],
          [ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808]]],


        [[[-0.3431, -0.0419, -0.0539,  ...,  0.1275, -0.0250,  0.2118],
          [-0.3431, -0.041

In [50]:
values.shape

torch.Size([235, 1, 10, 15])

In [51]:
values[
    torch.arange(B)[:, None, None],
    torch.arange(H)[None, :, None],
    M_top,
    :
] = torch.matmul(attn, V).type_as(values)

In [52]:
values

tensor([[[[-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          [-0.0423, -0.6131, -0.0466,  ...,  0.6113, -0.1474, -0.1170],
          [-0.0593, -0.4029,  0.4090,  ...,  0.5379, -0.3179, -0.1748],
          ...,
          [-0.3721, -0.5979,  0.2707,  ...,  0.6153, -0.2673, -0.0944],
          [-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343],
          [-0.2360, -0.4316, -0.0325,  ...,  0.2057,  0.0751, -0.1802]]],


        [[[ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808],
          [-0.3587,  0.2081,  0.7859,  ..., -0.2807,  0.1556,  0.5516],
          [ 0.2269,  0.0123,  0.4842,  ..., -0.2535,  0.1724,  0.0046],
          ...,
          [-0.4292,  0.4371,  0.3140,  ..., -0.2543,  0.5144,  0.0562],
          [ 0.9381,  0.0750,  1.1067,  ...,  0.6471, -0.6558,  0.6619],
          [ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808]]],


        [[[-0.3431, -0.0419, -0.0539,  ...,  0.1275, -0.0250,  0.2118],
          [-0.1457,  0.199

In [53]:
M_top, attn.shape, V.shape, torch.matmul(attn, V).shape, values.shape

(tensor([[[9, 7, 5, 3, 2, 1]],
 
         [[5, 1, 2, 8, 3, 7]],
 
         [[9, 5, 2, 1, 3, 6]],
 
         ...,
 
         [[5, 1, 8, 0, 3, 2]],
 
         [[3, 0, 5, 6, 8, 4]],
 
         [[1, 7, 5, 3, 4, 0]]]),
 torch.Size([235, 1, 6, 15]),
 torch.Size([235, 1, 15, 15]),
 torch.Size([235, 1, 6, 15]),
 torch.Size([235, 1, 10, 15]))

In [54]:
out = values.transpose(2, 1).contiguous()
out

tensor([[[[-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343]],

         [[-0.0423, -0.6131, -0.0466,  ...,  0.6113, -0.1474, -0.1170]],

         [[-0.0593, -0.4029,  0.4090,  ...,  0.5379, -0.3179, -0.1748]],

         ...,

         [[-0.3721, -0.5979,  0.2707,  ...,  0.6153, -0.2673, -0.0944]],

         [[-0.2252, -0.3808,  0.0933,  ...,  0.4740, -0.2346, -0.2343]],

         [[-0.2360, -0.4316, -0.0325,  ...,  0.2057,  0.0751, -0.1802]]],


        [[[ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808]],

         [[-0.3587,  0.2081,  0.7859,  ..., -0.2807,  0.1556,  0.5516]],

         [[ 0.2269,  0.0123,  0.4842,  ..., -0.2535,  0.1724,  0.0046]],

         ...,

         [[-0.4292,  0.4371,  0.3140,  ..., -0.2543,  0.5144,  0.0562]],

         [[ 0.9381,  0.0750,  1.1067,  ...,  0.6471, -0.6558,  0.6619]],

         [[ 0.3994,  0.1229,  0.4572,  ...,  0.0367,  0.0124,  0.2808]]],


        [[[-0.3431, -0.0419, -0.0539,  ...,  0.1275, -0.0250,  0.2118]],

    

In [55]:
out.shape

torch.Size([235, 10, 1, 15])

In [56]:
out = out.view(batch_size, sequence_length, -1)

NameError: name 'sequence_length' is not defined

### Preparing the Dataset
Energy consumption dataset from https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction
* gathers 10-min measurements of household appliances energy consumption (20 first features), coupled with local meteorological data (8 last features).
* The time-series forecasting task is to predict the first 20 features, given as input data the 28 features. A window of observations of 12 time steps is considered to predict the next series of observations (this corresponds to a 2-hours window of observations.

you may get the dataset (a single csv file) by running the following cell:

In [None]:
!wget https://raw.githubusercontent.com/LuisM78/Appliances-energy-prediction-data/master/energydata_complete.csv

### Implementation of the Transformer model

## Training the Transformer

### Evaluation on Test set