# Positional Encoding in Attention is All You Need (AAYN)

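* $PE_{pos, 2i} = \sin\left(\frac{pos}{10000^{\frac{2i}{d_{model}}}}\right)$ and $PE_{pos, 2i+1} = \cos\left(\frac{pos}{10000^{\frac{2i}{d_{model}}}}\right)$
* Here $pos$ is the position of the token in the sequence, $i$ indexes the sine/cosine pair ($0 \le i < d_{model}/2$), and $d_{model}$ is the embedding dimension.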

**Ref: [The AiEdge Newsletter](https://drive.google.com/file/d/1Je2SAFBlsWcgwzK_gl1_f-LtPK3SOzg3/view)**

<img src="../../assets/pos_enc.png" width="700" height="350">

In [1]:
import torch
import torch.nn as nn

In [2]:
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding from "Attention Is All You Need".

    The full table is computed once at construction time:
    ``PE[pos, 2i] = sin(pos / 10000^(2i / d_model))`` and
    ``PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))``.

    The table is registered as a non-persistent buffer named ``encoding``:
    it moves together with the module on ``.to(device)`` / ``.cuda()``,
    but it is not stored in the module's ``state_dict``.

    Args:
        context_size (int): maximum length of the input sequence (also known as max_length)
        d_model (int): internal dimension of the model or dimension of embeddings
            (also known as 'hidden_size'). Both even and odd values are supported.
    """
    def __init__(self, context_size: int, d_model: int):
        super().__init__()

        pos = torch.arange(0, context_size).unsqueeze(dim=1)  # [context_size, 1]
        # even dimension indices (the "2i" of the formula): ceil(d_model / 2) entries
        ii = torch.arange(0, d_model, 2)
        # argument shared by each sin/cos pair: [context_size, ceil(d_model / 2)]
        angles = pos / 10000 ** (ii / d_model)

        # initialize positional encoding [context_size, d_model]
        encoding = torch.zeros(context_size, d_model)
        encoding[:, 0::2] = torch.sin(angles)  # even positions
        # With an odd d_model there is one odd column fewer than even columns,
        # so the last angle is dropped to keep the shapes aligned.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd positions

        # A buffer (not a plain attribute) so the table follows the module
        # across devices; persistent=False keeps it out of the state_dict.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        returns positional encoding for a given input tensor x.
        (input tensor x is coming from token embedding layer in the transformer architecture)

        Only the size of dimension 1 of ``x`` is used; its values are not read
        and the encoding is not added to ``x`` here.

        Args:
            x (torch.Tensor): input tensor [batch_size, seq_len, d_model]

        Returns:
            torch.Tensor: positional encoding of shape [seq_len, d_model]
            (a slice of the precomputed table; if seq_len exceeds context_size,
            only context_size rows are available and returned)
        """
        seq_len = x.size(1)  # number of tokens in the input sequence
        return self.encoding[:seq_len, :]

In [3]:
# Hyper-parameters for the toy example.
vocab_size = 15     # number of distinct token ids accepted by the embedding layer
d_model = 10        # embedding / model dimension
context_size = 20   # maximum sequence length (that is supported)
batch_size = 2      # number of sequences in the 2 x 7 example batch used below
seq_len = 7         # tokens per sequence in that example batch

embedding = nn.Embedding(vocab_size, d_model) # token embedding layer
pos_encoder = PositionalEncoding(context_size=context_size, d_model=d_model) # positional encoding module

In [4]:
# Toy batch of token ids: 2 sequences of 7 tokens each -> [batch_size, seq_len]
x = torch.tensor([[6, 7, 8, 9, 0, 1, 2],
                  [0, 0, 1, 4, 5, 9, 5]])

# token embeddings: [batch_size, seq_len, d_model]
x_emb = embedding(x)
# positional-encoding rows for the positions of this batch: [seq_len, d_model]
x_pos_embed = pos_encoder(x_emb)
x_pos_embed

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  1.5783e-01,  9.8747e-01,  2.5116e-02,
          9.9968e-01,  3.9811e-03,  9.9999e-01,  6.3096e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  3.1170e-01,  9.5018e-01,  5.0217e-02,
          9.9874e-01,  7.9621e-03,  9.9997e-01,  1.2619e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  4.5775e-01,  8.8908e-01,  7.5285e-02,
          9.9716e-01,  1.1943e-02,  9.9993e-01,  1.8929e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  5.9234e-01,  8.0569e-01,  1.0031e-01,
          9.9496e-01,  1.5924e-02,  9.9987e-01,  2.5238e-03,  1.0000e+00],
        [-9.5892e-01,  2.8366e-01,  7.1207e-01,  7.0211e-01,  1.2526e-01,
          9.9212e-01,  1.9904e-02,  9.9980e-01,  3.1548e-03,  1.0000e+00],
        [-2.7942e-01,  9.6017e-01,  8.1396e-01,  5.8092e-01,  1.5014e-01,
          9.8866e-01,  2.3884e-0

In [5]:
# shapes of the token embeddings and of the positional-encoding slice
x_emb.shape, x_pos_embed.shape

(torch.Size([2, 7, 10]), torch.Size([7, 10]))