In [4]:
import numpy as np

np.set_printoptions(suppress=True)

# Encoder

## 1. Positional encoding

In [5]:
sentence: str = "Hello World"
# The original transformer method used a vector of size 512, I will do 4.
# The embedding is random for now and I will hardcode a matrix. I might apply word2vec or GloVe later.
E = np.array([
    [1,2,3,4],
    [2,3,4,5]
], dtype=np.float64)
d_model = 4

In [6]:
def positional_encoding(embeddings: np.array, verbose: bool = False) -> np.array:
    mat = np.zeros_like(a=embeddings, dtype=embeddings.dtype)
    for pos, embedding in enumerate(embeddings):
        if verbose:
            print(f"\nEmbedding {embedding}")
        # for i, _ in enumerate(embedding): 
        # Pos encoding should be word agnostic and only looking at the 
        # position in the embedding matrix pos and dimension inside each embedding i.
        for i in range(d_model):
            func = np.sin if not i % 2 else np.cos
            mat[pos][i] = func(pos / 10000 ** ((2 * i) / d_model))

            # Copilot generated print horror for sanity check
            if verbose:
                print(
                    f"i = {i} ({'even' if i % 2 == 0 else 'odd'}): PE({pos},{i}) = sin({pos} / 10000^({2 * i} / {d_model})) = sin({pos / 10000 ** ((2 * i) / d_model)}) = {func(pos / 10000 ** ((2 * i) / d_model))}"
                )
    return mat

In [7]:
positional_encodings = positional_encoding(E)
positional_encodings

array([[0.        , 1.        , 0.        , 1.        ],
       [0.84147098, 0.99995   , 0.0001    , 1.        ]])

## 2. Adding positional encoding and input embeddings

In [8]:
print(f"E before adding:\n{E}")
print(f"\nPositional encodings:\n{positional_encodings}")

E = E + positional_encodings

print(f"\nE after adding:\n{E}")

E before adding:
[[1. 2. 3. 4.]
 [2. 3. 4. 5.]]

Positional encodings:
[[0.         1.         0.         1.        ]
 [0.84147098 0.99995    0.0001     1.        ]]

E after adding:
[[1.         3.         3.         5.        ]
 [2.84147098 3.99995    4.0001     6.        ]]


## Self-attention
Let’s use 2 attention heads for our example. We’ll use random values for these matrices. Each matrix will be a 4x3 matrix. 

With this, each matrix will transform the 4-dimensional embeddings into 3-dimensional keys, values, and queries. 

This reduces the dimensionality for attention mechanism, which helps in managing the computational complexity. 

Note that using a too small attention size will hurt the performance of the model. 

In [82]:
WK1 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
WV1 = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1], [0, 1, 0]])
WQ1 = np.array([[0, 0, 0], [1, 1, 0], [0, 0, 1], [1, 0, 0]])

WK2 = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [0, 1, 0]])
WV2 = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]])
WQ2 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1]])

In [87]:
# We need to multiply our embeddings with the weight matrices to obtain the keys, queries and values.
K1 = E @ WK1    # Key calculations
V1 = E @ WV1    # Values calculations
Q1 = E @ WQ1    # Query calculations

K2 = E @ WK2
V2 = E @ WV2
Q2 = E @ WQ2
print(f"Shape {K1.shape}")

Shape (2, 3)


The attention function used in the paper is computed as:

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

Where:
- $Q$ is the matrix of queries
- $K$ is the matrix of keys
- $V$ is the matrix of values
- $d_k$ is the dimension of the keys


Calculating the attention score requires a couple of steps:

1. Calculate the dot product of the query with each key
2. Divide the result by the square root of the dimension of the key vector
3. Apply a softmax function to obtain the attention weights
4. Multiply each value vector by the attention weights


In [143]:
# 1
scores1 = Q1 @ K1.T
scores1

array([[ 68.        , 105.25713083],
       [ 87.9998    , 135.78163588]])

In [144]:
# 2
scores1 = scores1 / np.sqrt(K1.shape[1])
scores1

array([[39.2598183 , 60.77023282],
       [50.80670822, 78.39356402]])

The softmax function is computed as:

$$\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_{j} e^{x_j}}$$

Where:
- $x_i$ is the i-th element of the input vector
- The denominator is the sum of the exponentials of all elements in the input vector

In [145]:
# 3
def softmax(mat: np.array) -> np.array:
    return np.exp(mat) / np.sum(a=np.exp(mat), axis=1, keepdims=True)

scores1 = softmax(scores1)
scores1

array([[0., 1.],
       [0., 1.]])

In [146]:
# Reminding myself about the axis dilemma
mat = np.array([[1, 2, 3], [2, 3, 4]])
print(mat)
np.sum(mat, axis=None)
# Axis 1 means that the second dimension of the shape of the matrix is going to be collapsed and the number in that direction will be summed.

[[1 2 3]
 [2 3 4]]


15

In [147]:
# 4
scores1 = scores1 @ V1
scores1

array([[8.00005   , 8.84147098, 6.84157098],
       [8.00005   , 8.84147098, 6.84157098]])

In [148]:
def attention(E: np.array, WQ: np.array, WK: np.array, WV: np.array) -> np.array:
    Q = E @ WQ
    K = E @ WK
    V = E @ WV

    return softmax((Q @ K.T) / np.sqrt(K.shape[1])) @ V

In [152]:
score1 = attention(E, WQ1, WQ1, WV1)
score1

array([[8.00004391, 8.84146233, 6.84156233],
       [8.00004981, 8.84147071, 6.84157071]])

In [153]:
# Now we get the attention score for the second attention head.
score2 = attention(E, WQ2, WQ2, WV2)
score2

array([[8.84147098, 3.99995   , 8.00005   ],
       [8.84147098, 3.99995   , 8.00005   ]])

In [160]:
scores = np.concatenate((scores1, score2), axis=1)
scores

array([[8.00005   , 8.84147098, 6.84157098, 8.84147098, 3.99995   ,
        8.00005   ],
       [8.00005   , 8.84147098, 6.84157098, 8.84147098, 3.99995   ,
        8.00005   ]])

We finally multiply this concatenated matrix by a weight matrix to obtain the final output of the attention layer. 

This weight matrix is also learned! 

The dimension of the matrix ensures we go back to the same dimension as the embedding (4 in our case).

In [161]:
# Just some random values
W = np.array(
    [
        [0.79445237, 0.1081456, 0.27411536, 0.78394531],
        [0.29081936, -0.36187258, -0.32312791, -0.48530339],
        [-0.36702934, -0.76471963, -0.88058366, -1.73713022],
        [-0.02305587, -0.64315981, -0.68306653, -1.25393866],
        [0.29077448, -0.04121674, 0.01509932, 0.13149906],
        [0.57451867, -0.08895355, 0.02190485, 0.24535932],
    ]
)
Z = scores @ W
Z

array([[ 11.97128599, -14.12917589, -12.49224156, -18.50167966],
       [ 11.971286  , -14.12917589, -12.49224156, -18.50167966]])

## Feed-forward layer

In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully

connected feed-forward network, which is applied to each position separately and identically. 


This consists of two linear transformations with a ReLU activation in between.

The feed forward layer, denoted as FFN, is computed as:

$$\text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2$$

Where:
- $x$ is the input
- $W_1$ and $W_2$ are the weight matrices
- $b_1$ and $b_2$ are the bias vectors
- $\max(0, \cdot)$ is the ReLU (Rectified Linear Unit) activation function

In [176]:
# In this approach, I will expand from 4 dimensions to 8 in the first layer and then back to 4 in the second. 
d_model = 8
W1 = np.random.randn(4, d_model)
b1 = np.random.randn(d_model)
W2 = np.random.randn(d_model, 4)
b2 = np.random.randn(4)

In [177]:
def relu(x: np.array) -> np.array:
    return np.maximum(0, x)


def feed_forward(x: np.array) -> np.array:
    return relu(x @ W1 + b1) @ W2 + b2

In [179]:
feed_forward(Z)

array([[21.27537029,  5.73870724, 44.73625853, 33.08235132],
       [21.2753703 ,  5.73870724, 44.73625853, 33.08235133]])

## Encapsulating everything so far

In [47]:
d_embedding = 4
d_key = d_value = d_query = 4
d_feed_forward = 8
n_attention_heads = 2

def softmax(mat: np.array) -> np.array:
    return np.exp(mat) / np.sum(a=np.exp(mat), axis=1, keepdims=True)

def attention(E: np.array, WQ: np.array, WK: np.array, WV: np.array) -> np.array:
    Q = E @ WQ
    K = E @ WK
    V = E @ WV

    return softmax((Q @ K.T) / np.sqrt(K.shape[1])) @ V

def multihead_attention(x, WQs, WKs, WVs) -> np.array:
    attentions = np.concatenate([attention(x, WQ, WK, WV) for WQ, WK, WV in zip(WQs, WKs, WVs)], axis=1)
    W = np.random.randn(n_attention_heads * d_value, d_embedding)
    return attentions @ W

def relu(x: np.array) -> np.array:
    return np.maximum(0, x)


def feed_forward(x: np.array, W1: np.array, W2: np.array, b1: np.array, b2: np.array) -> np.array:
    return relu(x @ W1 + b1) @ W2 + b2

def encoder_block_(x: np.array, WQs: np.array, WKs: np.array, WVs: np.array, W1: np.array, W2: np.array, b1: np.array, b2: np.array) -> np.array:
    Z = multihead_attention(x, WQs, WKs, WVs)
    Z = feed_forward(Z, W1, W2, b1, b2)
    return Z

def random_encoder_block(x: np.array) -> np.array:
    WQs = [
        np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)
    ]
    WKs = [
        np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)
    ]
    WVs = [
        np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)
    ]

    W1 = np.random.randn(d_embedding, d_feed_forward)
    b1 = np.random.randn(d_feed_forward)
    W2 = np.random.randn(d_feed_forward, d_embedding)
    b2 = np.random.randn(d_embedding)

    return encoder_block(x, WQs, WKs, WVs, W1, W2, b1, b2)

In [11]:
E

array([[1.        , 3.        , 3.        , 5.        ],
       [2.84147098, 3.99995   , 4.0001    , 6.        ]])

In [189]:
random_encoder_block(E)

array([[ 5.65114892, -3.74230864, -1.65071085, -1.83249781],
       [ 5.65115293, -3.74231196, -1.6507186 , -1.83251494]])

This was just one encoder block. The original paper uses 6 encoders. The output of one encoder goes to the next, and so on:

In [190]:
def encoder(x: np.array):
    for _ in range(6):
        x = random_encoder_block(x)
    return x

In [191]:
encoder(E)

  return np.exp(mat) / np.sum(a=np.exp(mat), axis=1, keepdims=True)
  return np.exp(mat) / np.sum(a=np.exp(mat), axis=1, keepdims=True)


array([[nan, nan, nan, nan],
       [nan, nan, nan, nan]])

We are getting too big numbers thus we need to use normalization.

There are two common techniques to mitigate this problem: 
- residual connections
- layer normalization. 


Residual connections:
$$\text{Residual}(x) = x + \text{Layer}(x)$$

Where:

- $x$ is the input
- $\text{Layer}(x)$ is the output of the sublayer with x as the input 

Residual connections are simply adding the input of the layer to it output. 

For example, we add the initial embedding to the output of the attention. 

Residual connections mitigate the **vanishing gradient problem**. 

The intuition is that if the gradient is too small, we can just add the input to the output and the **gradient will be larger**.

Layer normalization, denoted as LayerNorm, is computed as:

$$\text{LayerNorm}(x) = \frac{x - \mu(x)}{\sqrt{\sigma^2(x) + \epsilon}} * \gamma + \beta$$

Where:
- $x$ is the input (embedding)
- $\mu(x)$ is the mean of $x$
- $\sigma^2(x)$ is the variance (squared standard deviation) of $x$
- $\epsilon$ is a small constant for numerical stability (usually $1e-6$)
- $\gamma$ and $\beta$ are learnable parameters


Layer normalization is a technique to normalize the inputs of a layer. 

It normalizes across the embedding dimension. 

The intuition is that we want to normalize the inputs of a layer so that they have a mean of 0 and a standard deviation of 1. 

This helps with the gradient flow.

Unlike batch normalization (no worries if you don’t know what it is), **layer normalization normalizes** 

**across the embedding dimension** - that means that **each embedding will not be affected by other samples in the batch**.

Why do we add the learnable parameters $\gamma$ and $\beta$ ? The reason is that we don’t want to lose the representational power of the layer. 

If we just normalize the inputs, we might lose some information. By adding the learnable parameters, we can learn to scale and shift the normalized values.

Finally, our formulas for the encoder block (a single one) should look like this:

1. The self-attention layer with residual connection and layer normalization, denoted as $Z(x)$, is computed as:

    $$Z(x) = \text{LayerNorm}(x + \text{Attention}(x))$$

2. The feed forward layer, denoted as FFN, is computed as:

    $$\text{FFN}(x) = \text{ReLU}(xW_1 + b_1)W_2 + b_2$$

3. The encoder layer, which combines the self-attention and feed forward layers with residual connections and layer normalization, is computed as:

    $$\text{Encoder}(x) = \text{LayerNorm}(Z(x) + \text{FFN}(Z(x) + x))$$

Where:
- $x$ is the input
- $W_1$ and $W_2$ are the weight matrices
- $b_1$ and $b_2$ are the bias vectors
- $\text{ReLU}(\cdot)$ is the Rectified Linear Unit activation function
- $\text{LayerNorm}(\cdot)$ is the layer normalization function
- $\text{Attention}(\cdot)$ is the self-attention function

Let’s now calculate the layer normalization, we can divide it into three steps:

1. Compute mean and variance for each embedding.
2. Normalize by substracting the mean of its row and dividing by the square root of its row variance (plus a small number to avoid division by zero).
3. Scale and shift by multiplying by gamma and adding beta.

In [29]:
# Z(x) = LayerNorm(x + Attention(x))
# FFN(x) = ReLU(xW1 + b1)W2 + b2
# Encoder(x) = LayerNorm(Z(x) + FFN(Z(x) + x))

# 1. Compute mean and variance for EACH EMBEDDING (across the embedding dimension, i.e. axis=1)
WQs = [
    np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)
]
WKs = [
    np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)
]
WVs = [
    np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)
]

W1 = np.random.randn(d_embedding, d_feed_forward)
b1 = np.random.randn(d_feed_forward)
W2 = np.random.randn(d_feed_forward, d_embedding)
b2 = np.random.randn(d_embedding)

attn = multihead_attention(E, WQs, WKs, WVs)
# Z(x) = LayerNorm(x + Attention(x)) -> LayerNorm(E + attn(E))
# 1.1 mean of E+attn(E) accross the embedding dimension
mean = (E + attn).mean(axis=1, keepdims=True)
mean

array([[-28.51127031],
       [-27.33706259]])

In [31]:
var = (E + attn).var(axis=1, keepdims=True)
var

array([[2268.1795805],
       [2311.7727231]])

In [41]:
# Assuming for simplification
epsilon = 0.000001
gamma = 1
beta = 0
norm = (E + attn) - mean / np.sqrt(var + epsilon) * gamma + beta
norm

array([[ 51.8491194 , -36.07429372, -70.07217858, -57.35310153],
       [ 53.79726931, -35.15201909, -69.21674721, -56.50249777]])

In [34]:
def layer_norm(x: np.array, epsilon: float=1e-6) -> np.array:
    mean = x.mean(axis=1, keepdims=True)
    var = x.var(axis=1, keepdims=True)
    return x - mean / np.sqrt(var + epsilon) * gamma + beta

In [44]:
print(layer_norm(E + attn) == norm)
layer_norm(E + attn)

[[ True  True  True  True]
 [ True  True  True  True]]


array([[ 51.8491194 , -36.07429372, -70.07217858, -57.35310153],
       [ 53.79726931, -35.15201909, -69.21674721, -56.50249777]])

In [66]:
d_embedding = 4
d_key = d_value = d_query = 4
d_feed_forward = 8
n_attention_heads = 2
epsilon = 1e-6
gamma = 1
beta = 0

def softmax(mat: np.array) -> np.array:
    return np.exp(mat) / np.sum(a=np.exp(mat), axis=1, keepdims=True)


def attention(E: np.array, WQ: np.array, WK: np.array, WV: np.array) -> np.array:
    Q = E @ WQ
    K = E @ WK
    V = E @ WV

    return softmax((Q @ K.T) / np.sqrt(K.shape[1])) @ V


def multihead_attention(x, WQs, WKs, WVs) -> np.array:
    attentions = np.concatenate(
        [attention(x, WQ, WK, WV) for WQ, WK, WV in zip(WQs, WKs, WVs)], axis=1
    )
    W = np.random.randn(n_attention_heads * d_value, d_embedding)
    return attentions @ W


def relu(x: np.array) -> np.array:
    return np.maximum(0, x)


def feed_forward(
    x: np.array, W1: np.array, W2: np.array, b1: np.array, b2: np.array
) -> np.array:
    return relu(x @ W1 + b1) @ W2 + b2


def layer_norm(x: np.array, epsilon: float=1e-6) -> np.array:
    mean = x.mean(axis=1, keepdims=True)
    # var = x.var(axis=1, keepdims=True)  # NOTE: variance is the standard deviation squared
    # To lower the amount of calcuations (avoid calc. var and then sqrt in the denominator) we calculate the std dev
    std_dev = x.std(axis=1, keepdims=True)
    return (x - mean) / std_dev + epsilon * gamma + beta

# New definition with layer normalization and residual connections
def encoder_block(
    x: np.array,
    WQs: np.array,
    WKs: np.array,
    WVs: np.array,
    W1: np.array,
    W2: np.array,
    b1: np.array,
    b2: np.array,
) -> np.array:
    Z = multihead_attention(x, WQs, WKs, WVs)
    Z = layer_norm(x + Z)  # LayerNorm of the Residual on attention output
    output = feed_forward(Z, W1, W2, b1, b2)

    return layer_norm(Z + output)  # LayerNorm of the Residual on FFN output


def random_encoder_block(x: np.array) -> np.array:
    WQs = [np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)]
    WKs = [np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)]
    WVs = [np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)]

    W1 = np.random.randn(d_embedding, d_feed_forward)
    b1 = np.random.randn(d_feed_forward)
    W2 = np.random.randn(d_feed_forward, d_embedding)
    b2 = np.random.randn(d_embedding)

    return encoder_block(x, WQs, WKs, WVs, W1, W2, b1, b2)

In [67]:
def encoder(x, n=6):
    for _ in range(n):
        x = random_encoder_block(x)
    return x


encoder(E)

array([[-0.79123393,  1.40275105, -1.08654597,  0.47503285],
       [-0.79123088,  1.40275248, -1.08654759,  0.47502999]])

Amazing! These values make sense and we don’t get NaNs! 

The idea of the stack of encoders is that they output a continuous representation, z, 
that captures the meaning of the input sequence. 

This representation is then passed to the decoder, which will genrate an output sequence of symbols, one element at a time.