In [1]:
import numpy as np

## Setup

In [10]:
# Sequence length: number of tokens in the input.
n = 3
# Vocabulary size.
d_vocab = 1000
# "Model size": width of every token representation.
dm = 512
# Number of attention "heads".
h = 8
# Per-head projection width; the heads split the model size evenly (512 // 8 == 64).
dk = dm // h
# Embedding matrix, one row per vocabulary entry (all-zero placeholder).
E = np.zeros(shape=(d_vocab, dm))

## Encoder

In [148]:
# One row per token, one column per vocabulary entry (all-zero placeholder input).
x_input = np.zeros((n, d_vocab))

# Token embedding: multiply the (n, d_vocab) input by E to obtain shape (n, dm).
# NOTE(review): the paper additionally scales embeddings by sqrt(dm) -- confirm
# whether leaving that out here is intentional.
x_embedded = np.matmul(x_input, E)

# Inject position information by summing the "positional encoding" with the embedding.
x_pos_encoded = positional_encoding(n, dm) + x_embedded

def layer(x):
    """Apply one encoder layer to `x` and return an array of the same shape.

    The layer consists of two sublayers, each wrapped in a residual
    connection followed by `layer_norm`:

      1. multi-head self-attention (queries, keys and values are all `x`);
      2. a position-wise feed-forward network.

    All parameters are zero-filled placeholders that are created afresh on
    every call, so separate calls do not share weights.

    The residual additions require the concatenated head outputs
    (width h * dk) to have the same width as `x` (dm).
    """
    # "Projection" parameter matrices map Q, K, V to a lower dimensional space,
    # one (dm, dk) matrix per head.
    WQ = [np.zeros((dm, dk)) for _ in range(h)]
    WK = [np.zeros((dm, dk)) for _ in range(h)]
    WV = [np.zeros((dm, dk)) for _ in range(h)]

    # Parameters for the feed-forward network (hidden width 2048).
    d_ff = 2048
    W1 = np.zeros((dm, d_ff))
    b1 = np.zeros(d_ff)
    W2 = np.zeros((d_ff, dm))
    b2 = np.zeros(dm)

    # Sublayer 1: multi-head self-attention + residual + norm.
    # The attention output goes straight into the residual connection; the
    # feed-forward network belongs to sublayer 2 only.
    mha = multi_head_attention(x, x, x, WQ, WK, WV)
    sublayer1 = layer_norm(mha + x)

    # Sublayer 2: feed-forward network + residual + norm.
    ffn_out = ffn(sublayer1, W1, W2, b1, b2)
    sublayer2 = layer_norm(ffn_out + sublayer1)
    return sublayer2

# Repeat this N=6 times (6 stacked layers): each pass feeds the previous
# layer's output into the next one. Every call to layer() builds its own
# parameter matrices, so the six layers do not share weights.
x = x_pos_encoded
for _ in range(6):
    x = layer(x)

# now pass x to decoder

## Helper functions (run this cell before the Encoder cell, which calls the functions defined here)

In [150]:
def layer_norm(x, gain=None, bias=None, eps=1e-5):
    """Layer normalization over the last axis of `x`.

    Each vector along the last axis is shifted to zero mean and scaled to
    (approximately) unit variance, following Ba et al., "Layer
    Normalization" (2016), which the Transformer paper cites for this step.

    Parameters:
        x:    array-like; normalization statistics are computed per vector
              along the last axis.
        gain: optional scale applied after normalization (broadcast against
              the result); skipped when None.
        bias: optional offset added after scaling; skipped when None.
        eps:  small constant added to the variance so constant vectors do
              not cause a division by zero.

    Returns:
        A new float array with the same shape as `x`.
    """
    x = np.asarray(x, dtype=float)
    mean = x.mean(axis=-1, keepdims=True)
    variance = x.var(axis=-1, keepdims=True)
    normalized = (x - mean) / np.sqrt(variance + eps)
    if gain is not None:
        normalized = normalized * gain
    if bias is not None:
        normalized = normalized + bias
    return normalized

def multi_head_attention(Q, K, V, WQ, WK, WV, WO=None):
    """Run attention once per head and concatenate the head outputs.

    Parameters:
        Q, K, V:    query, key and value matrices shared by all heads.
        WQ, WK, WV: sequences of per-head projection matrices, indexed in
                    parallel; the number of heads is len(WQ).
        WO:         optional output projection applied to the concatenated
                    heads (the paper's W^O). When None, the concatenation
                    is returned unprojected.

    Returns:
        The head outputs concatenated along axis 1, multiplied by `WO` when
        it is given.
    """
    heads = [attention(Q @ WQ[i], K @ WK[i], V @ WV[i]) for i in range(len(WQ))]
    concatenated = np.concatenate(heads, axis=1)
    if WO is None:
        return concatenated
    return concatenated @ WO

def softmax(x):
    """Row-wise softmax of a 2-D array (normalizes along axis 1).

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps `np.exp` from overflowing to
    inf (and the division from producing NaN) when the scores are large.

    Parameters:
        x: array-like of scores with at least two dimensions.

    Returns:
        A new array of the same shape whose entries along axis 1 sum to 1.
    """
    scores = np.asarray(x)
    if scores.shape[1] == 0:
        # No entries to normalize; a max-reduction over an empty axis would raise.
        return np.exp(scores)
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=1, keepdims=True)

def positional_encoding(n, dm):
    """Sinusoidal positional encoding of shape (n, dm).

    For position t and pair index i:
        PE[t, 2*i]     = sin(t / 10000**(2*i / dm))
        PE[t, 2*i + 1] = cos(t / 10000**(2*i / dm))

    When `dm` is odd, the final column has no cosine partner and receives
    the sine term for its pair index, so every column is always initialized.

    Parameters:
        n:  number of positions (rows).
        dm: encoding width (columns).

    Returns:
        A float array of shape (n, dm).
    """
    positions = np.arange(n, dtype=float)[:, None]   # column vector of t values
    pair_index = np.arange((dm + 1) // 2)             # i, one per sin/cos pair
    angles = positions / np.power(10000.0, 2 * pair_index / dm)

    PE = np.zeros((n, dm))
    PE[:, 0::2] = np.sin(angles)
    # An odd dm has one fewer cosine column than sine columns.
    PE[:, 1::2] = np.cos(angles[:, : dm // 2])
    return PE

def attention(Q, K, V):
    """Scaled dot-product attention (equation 1 in the paper).

    Computes softmax(Q K^T / sqrt(d_k)) V, where d_k is taken from the last
    dimension of `K`. The scaling therefore always matches the keys that
    were actually passed in, rather than relying on a module-level constant.

    Parameters:
        Q: query matrix; its last dimension must match that of `K`.
        K: key matrix, one row per key.
        V: value matrix, one row per key.

    Returns:
        The attention-weighted combination of the rows of `V`, one row per query.
    """
    key_dim = K.shape[-1]
    return softmax(Q @ K.T / np.sqrt(key_dim)) @ V

def ffn(x, W1, W2, b1, b2):
    """Two-layer feed-forward network with a ReLU in between.

    Computes max(0, x @ W1 + b1) @ W2 + b2.
    """
    hidden = x @ W1 + b1
    activated = np.maximum(0, hidden)  # ReLU
    return activated @ W2 + b2