# 1. Encoder Block

## 1.1 Input Embeddings

### 1.1.1 Create input sequence and input embeddings

In [42]:
import torch
import torch.nn as nn

# Toy example: a four-token sentence wrapped in start/end markers.
input_seq = ['<sos>', 'hello', 'world', '<eos>']
# Hand-picked ids for illustration only; a real pipeline would obtain them from a tokenizer.
input_tokens = [132, 87, 101, 777]

# Pretend vocabulary of 1000 entries, each mapped to a 64-dimensional vector.
vocab_size, embedding_dim = 1000, 64
vocab_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Look up one embedding row per token id -> [len(input_tokens), embedding_dim].
token_ids = torch.tensor(input_tokens, dtype=torch.long)
input_embeddings = vocab_embedding_table(token_ids)

### 1.1.2 Create positional encodings

In [43]:
import math

# The longest sequence to encode here is the example sentence itself.
max_seq_len = len(input_seq)
positional_encodings = torch.zeros(max_seq_len, embedding_dim)

# Sinusoidal encoding: every (even, odd) column pair shares one frequency;
# the even column stores the sine and the following odd column the cosine.
for pos in range(max_seq_len):
    for even_col in range(0, embedding_dim, 2):
        angle = pos / (10000 ** (even_col / embedding_dim))
        positional_encodings[pos, even_col] = math.sin(angle)
        odd_col = even_col + 1
        # Guard keeps an odd embedding_dim from writing past the last column.
        if odd_col < embedding_dim:
            positional_encodings[pos, odd_col] = math.cos(angle)

### 1.1.3 Add input embeddings and positional encodings

In [44]:
# Inject position information: element-wise sum of token embeddings and
# positional encodings (both have one row per token of input_seq).
input_embeddings = torch.add(input_embeddings, positional_encodings)

## 1.2 Multi-head Attention Layer

### 1.2.1 Create query, key, value matrices

In [46]:
# Three independent, bias-free square projections (created in q, k, v order)
# that map the position-aware embeddings to queries, keys and values.
W_q, W_k, W_v = (
    nn.Linear(embedding_dim, embedding_dim, bias=False) for _ in range(3)
)

# Apply each projection to the same input.
Q, K, V = (projection(input_embeddings) for projection in (W_q, W_k, W_v))

print(Q.shape, K.shape, V.shape)

torch.Size([4, 64]) torch.Size([4, 64]) torch.Size([4, 64])


### 1.2.2 Convert To Multi-Head

In [49]:
head_num = 2
head_dim = embedding_dim // head_num

# Split the feature axis into heads, then move the head axis to the front:
#   [seq_len, embedding_dim] -> [seq_len, head_num, head_dim] -> [head_num, seq_len, head_dim]
# A direct .view(head_num, seq_len, head_dim) would only reinterpret the
# underlying memory: head 0 would receive all features of the first half of
# the tokens and head 1 the second half, instead of every head seeing a
# head_dim-wide slice of every token. The split below is also the exact
# inverse of the later merge `transpose(0, 1).contiguous().view(seq_len, embedding_dim)`.
Q = Q.view(len(input_seq), head_num, head_dim).transpose(0, 1)
K = K.view(len(input_seq), head_num, head_dim).transpose(0, 1)
V = V.view(len(input_seq), head_num, head_dim).transpose(0, 1)

print(Q.shape, K.shape, V.shape)

torch.Size([2, 4, 32]) torch.Size([2, 4, 32]) torch.Size([2, 4, 32])


### 1.2.3 Calculate Attentions
This includes matrix multiplication, scaling, and softmax.

In [52]:
# matrix multiplication
# Raw similarity of every query with every key, computed per head
# (the last two axes of K are swapped so the head axis is batched over).
attention_logits = Q @ K.transpose(-2, -1)

# Scale by the square root of the per-head width.
scale = math.sqrt(embedding_dim//head_num)

# Normalise each query's row of scores into attention weights.
QK = torch.softmax(attention_logits / scale, dim=-1)

print(QK.shape)

torch.Size([2, 4, 4])


### 1.2.4 Multiply Values

In [53]:
# Weighted sum of the value vectors, using the attention weights of each head.
QKV = QK @ V

print(QKV.shape)

torch.Size([2, 4, 32])


### 1.2.5 Concatenate Heads, Add & Norm

In [57]:
# Concatenate the heads back into one vector per token:
#   [head_num, seq_len, head_dim] -> [seq_len, head_num, head_dim] -> [seq_len, embedding_dim]
# .contiguous() is required because .view() cannot merge the transposed axes in place.
multi_head_output = QKV.transpose(0, 1).contiguous()
multi_head_output = multi_head_output.view(len(input_seq), embedding_dim)

# Output projection (W_O in the multi-head attention formula): mixes the
# concatenated per-head results; bias-free to match W_q / W_k / W_v.
W_o = nn.Linear(embedding_dim, embedding_dim, bias=False)
multi_head_output = W_o(multi_head_output)

# Dropout is applied to the sub-layer output *before* the residual add and
# normalisation, so the normalised activations passed on are not zeroed out.
multi_head_output = nn.Dropout(0.3)(multi_head_output)

# residual connection
output = input_embeddings + multi_head_output

# layer normalization
output = nn.LayerNorm(embedding_dim)(output)

## 1.3 FeedForward Layer (MLP)

In [59]:
# Position-wise feed-forward network; the hidden width is kept equal to
# embedding_dim in this walkthrough.
W_ff_1 = nn.Linear(embedding_dim, embedding_dim)
W_ff_2 = nn.Linear(embedding_dim, embedding_dim)

# feedforward: linear -> relu -> linear
ff_output = W_ff_1(output)
ff_output = nn.ReLU()(ff_output)
ff_output = W_ff_2(ff_output)

# Dropout is applied to the sub-layer output *before* the residual add and
# normalisation, so the final normalised encoder output is not zeroed out.
ff_output = nn.Dropout(0.3)(ff_output)

# residual connection
ff_output = output + ff_output

# layer normalization
ff_output = nn.LayerNorm(embedding_dim)(ff_output)

# Result of one full encoder block: one embedding_dim vector per input token.
encoder_output = ff_output