In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

Using device: mps


PART 1: Input embedding 

original sentence:

    - LEARNT token embeddings (vector) of size 512 - values between (0, 1)

    - FIXED positional embedding of size 512

        - calculated using sine and cosine functions of position and dimension to capture unique, periodic patterns

    - encoder input => token embeddings + positional embedding

In [3]:
batch_size = 1
vocab_size = 6           # max_seq_len
embedding_size = 8       # d_model



In [4]:
#randn creates a (no of lines - here 3) dim tensor of vocab_size x embedding_size
torch.manual_seed(132)
token_embed = torch.randn(batch_size, vocab_size, embedding_size)


In [5]:
token_embed.size()

torch.Size([1, 6, 8])

In [6]:
token_embed

tensor([[[ 0.5628, -0.0576, -0.8741,  0.0612, -0.1741,  2.1812,  0.1573,
          -1.4425],
         [ 2.1285,  0.3479,  0.9685,  0.8488,  0.5011,  0.9357,  0.4584,
           1.2065],
         [-0.7349, -0.9706, -1.2097,  1.6960,  1.4241, -0.5467, -0.4898,
          -1.6538],
         [ 1.0423,  0.6688, -1.0684,  0.2520, -1.9652,  0.3251, -0.2690,
          -1.0828],
         [ 0.1590,  1.0780, -1.4566, -0.8454, -0.1251,  2.0603,  0.7509,
           0.0553],
         [ 0.8920,  0.7346, -1.5393,  0.7653, -0.4533,  2.4488, -0.5876,
          -1.3040]]])

PART 2: Positional encoding

- generate an empty tensor 'pe' of size: embedding_size for each word in vocab
- create another column vector 'positions' which holds the index of every word in vocab
    - unsqueeze turns 1D row tensor to a 2D column tensor
- for loop to iterate through each vocab and it's embeddings 
    - calculate denominator first
    - calculate even and odd positional embedding for each pe

- final positional encoding => position_encod = position + pe
- final embeddings => positional_embed = position_encod + token_embed


In [7]:
pe = torch.zeros(vocab_size, embedding_size).float()
pe.size()

torch.Size([6, 8])

In [8]:
position = torch.arange(0, vocab_size).unsqueeze(1).float()
position.size()

torch.Size([6, 1])

In [9]:
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.]])

In [10]:
for pos in range(vocab_size):
    for i in range(embedding_size):
        #print(pos, i)
        den = 10000 ** (2 * i / embedding_size)
        if (i % 2 == 0):
            pe[pos, i] = np.sin(pos/ den)
        else:
            pe[pos, i] = np.cos(pos/ den)

        

In [11]:
pe

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  9.9500e-01,  9.9998e-03,  1.0000e+00,  1.0000e-04,
          1.0000e+00,  1.0000e-06,  1.0000e+00],
        [ 9.0930e-01,  9.8007e-01,  1.9999e-02,  1.0000e+00,  2.0000e-04,
          1.0000e+00,  2.0000e-06,  1.0000e+00],
        [ 1.4112e-01,  9.5534e-01,  2.9996e-02,  1.0000e+00,  3.0000e-04,
          1.0000e+00,  3.0000e-06,  1.0000e+00],
        [-7.5680e-01,  9.2106e-01,  3.9989e-02,  9.9999e-01,  4.0000e-04,
          1.0000e+00,  4.0000e-06,  1.0000e+00],
        [-9.5892e-01,  8.7758e-01,  4.9979e-02,  9.9999e-01,  5.0000e-04,
          1.0000e+00,  5.0000e-06,  1.0000e+00]])

In [12]:
positional_embed = pe + token_embed
positional_embed.size()

torch.Size([1, 6, 8])

In [13]:
positional_embed

tensor([[[ 0.5628,  0.9424, -0.8741,  1.0612, -0.1741,  3.1812,  0.1573,
          -0.4425],
         [ 2.9700,  1.3429,  0.9785,  1.8488,  0.5012,  1.9357,  0.4584,
           2.2065],
         [ 0.1744,  0.0095, -1.1897,  2.6960,  1.4243,  0.4533, -0.4898,
          -0.6538],
         [ 1.1834,  1.6242, -1.0384,  1.2520, -1.9649,  1.3251, -0.2690,
          -0.0828],
         [-0.5978,  1.9990, -1.4166,  0.1546, -0.1247,  3.0603,  0.7509,
           1.0553],
         [-0.0670,  1.6122, -1.4893,  1.7653, -0.4528,  3.4488, -0.5876,
          -0.3040]]])

In [14]:
# Class implementation

torch.manual_seed(132)
token_embed = torch.randn(batch_size, vocab_size, embedding_size).to(device)

class PosEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        #self.token_embed = nn.Embedding(vocab_size, d_model)
        self.token_embed = token_embed.to(device)        
        self.pos_embed = self.pos_encod(max_len, d_model)
        self.max_len = max_len

    def pos_encod(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model)
        for pos in range(max_len):
            for i in range(d_model):
                den = 10000 ** (2 * i / d_model)
                if (i % 2 == 0):
                    pe[pos, i] = torch.sin(torch.tensor(pos, dtype=torch.float, device=device) / den)
                else:
                    pe[pos, i] = torch.cos(torch.tensor(pos, dtype=torch.float, device=device) / den)
        
        return pe.unsqueeze(0).to(device)
    
    def forward(self, x):
        return self.token_embed + self.pos_embed[:, :x.size(1), :]

    


In [15]:
model = PosEmbedding(vocab_size = 6, d_model = 8, max_len = 1).to(device)
x = torch.arange(10).unsqueeze(0).to(device)
output = model(x)

output.size()

torch.Size([1, 6, 8])

In [16]:
x.size()



torch.Size([1, 10])

In [17]:
output

tensor([[[ 0.5628,  0.9424, -0.8741,  1.0612, -0.1741,  3.1812,  0.1573,
          -0.4425],
         [ 2.1285,  1.3479,  0.9685,  1.8488,  0.5011,  1.9357,  0.4584,
           2.2065],
         [-0.7349,  0.0294, -1.2097,  2.6960,  1.4241,  0.4533, -0.4898,
          -0.6538],
         [ 1.0423,  1.6688, -1.0684,  1.2520, -1.9652,  1.3251, -0.2690,
          -0.0828],
         [ 0.1590,  2.0780, -1.4566,  0.1546, -0.1251,  3.0603,  0.7509,
           1.0553],
         [ 0.8920,  1.7346, -1.5393,  1.7653, -0.4533,  3.4488, -0.5876,
          -0.3040]]], device='mps:0')