In [53]:
import numpy as np

np.set_printoptions(suppress=True)

# Encoder

## 1. Positional encoding

In [73]:
sentence: str = "Hello World"
# The original transformer method used a vector of size 512, I will do 4.
# The embedding is random for now and I will hardcode a matrix. I might apply word2vec or GloVe later.
E = np.array([
    [1,2,3,4],
    [2,3,4,5]
], dtype=np.float64)
d_model = 4

In [78]:
def positional_encoding(embeddings: np.array, verbose: bool = False) -> np.array:
    mat = np.zeros_like(a=embeddings, dtype=embeddings.dtype)
    for pos, embedding in enumerate(embeddings):
        if verbose:
            print(f"\nEmbedding {embedding}")
        # for i, _ in enumerate(embedding): 
        # Pos encoding should be word agnostic and only looking at the 
        # position in the embedding matrix pos and dimension inside each embedding i.
        for i in range(d_model):
            func = np.sin if not i % 2 else np.cos
            mat[pos][i] = func(pos / 10000 ** ((2 * i) / d_model))

            # Copilot generated print horror for sanity check
            if verbose:
                print(
                    f"i = {i} ({'even' if i % 2 == 0 else 'odd'}): PE({pos},{i}) = sin({pos} / 10000^({2 * i} / {d_model})) = sin({pos / 10000 ** ((2 * i) / d_model)}) = {func(pos / 10000 ** ((2 * i) / d_model))}"
                )
    return mat

In [79]:
positional_encodings = positional_encoding(E)
positional_encodings

array([[0.        , 1.        , 0.        , 1.        ],
       [0.84147098, 0.99995   , 0.0001    , 1.        ]])

## 2. Adding positional encoding and input embeddings

In [81]:
print(f"E before adding:\n{E}")
print(f"\nPositional encodings:\n{positional_encodings}")

E = E + positional_encodings

print(f"\nE after adding:\n{E}")

E before adding:
[[1. 2. 3. 4.]
 [2. 3. 4. 5.]]

Positional encodings:
[[0.         1.         0.         1.        ]
 [0.84147098 0.99995    0.0001     1.        ]]

E after adding:
[[1.         3.         3.         5.        ]
 [2.84147098 3.99995    4.0001     6.        ]]


## Self-attention
Let’s use 2 attention heads for our example. We’ll use random values for these matrices. Each matrix will be a 4x3 matrix. 

With this, each matrix will transform the 4-dimensional embeddings into 3-dimensional keys, values, and queries. 

This reduces the dimensionality for attention mechanism, which helps in managing the computational complexity. 

Note that using a too small attention size will hurt the performance of the model. 

In [82]:
WK1 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
WV1 = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1], [0, 1, 0]])
WQ1 = np.array([[0, 0, 0], [1, 1, 0], [0, 0, 1], [1, 0, 0]])

WK2 = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [0, 1, 0]])
WV2 = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]])
WQ2 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1]])

In [85]:
# We need to multiply our embeddings with the weight matrices to obtain the keys, queries and values.
K1 = E @ WK1    # Key calculations
V1 = E @ WV1    # Values calculations
Q1 = E @ WQ1    # Query calculations

K2 = E @ WK2
V2 = E @ WV2
Q2 = E @ WQ2

The attention function used in the paper is computed as:

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

Where:
- $Q$ is the matrix of queries
- $K$ is the matrix of keys
- $V$ is the matrix of values
- $d_k$ is the dimension of the keys