In [1]:
import numpy as np

In [2]:
# Attention head 1: key, value and query projection matrices.
# Each is 4x3, so it maps a 4-column input row to a 3-column vector.
WK1 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
WV1 = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1], [0, 1, 0]])
WQ1 = np.array([[0, 0, 0], [1, 1, 0], [0, 0, 1], [1, 0, 0]])

# Attention head 2: a second, independent set of 4x3 projection matrices.
WK2 = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [0, 1, 0]])
WV2 = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]])
WQ2 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1]])

In [3]:
# Two input rows, each with 4 features.
embedding = np.array([[1, 3, 3, 5], [2.84, 3.99, 4, 6]])
# Keys for head 1: one 3-column key per input row.
K1 = embedding @ WK1
K1

array([[4.  , 8.  , 4.  ],
       [6.84, 9.99, 6.84]])

In [4]:
# Values for head 1: the same input rows projected with WV1.
V1 = embedding @ WV1
V1

array([[6.  , 6.  , 4.  ],
       [7.99, 8.84, 6.84]])

In [5]:
# Queries for head 1: the same input rows projected with WQ1.
Q1 = embedding @ WQ1
Q1

array([[8.  , 3.  , 3.  ],
       [9.99, 3.99, 4.  ]])

In [6]:
# Raw attention scores: dot product of every query row with every key row,
# giving a 2x2 matrix (row = query, column = key).
scores1 = Q1 @ K1.T
scores1

array([[ 68.    , 105.21  ],
       [ 87.88  , 135.5517]])

In [7]:
# Scale the scores by sqrt(3); 3 is the number of columns of K1.
# NOTE: this overwrites scores1, so running the cell twice scales twice.
scores1 = scores1 / np.sqrt(3)
scores1

array([[39.2598183 , 60.74302182],
       [50.73754166, 78.26081048]])

In [8]:
def softmax(x):
    """Softmax along axis 1.

    The input is converted to a NumPy array first. A 1D input is promoted to
    a single row, so the result for a length-n vector has shape (1, n).
    """
    values = np.array(x)
    if values.ndim == 1:
        # Treat a flat vector as a one-row matrix.
        values = values[np.newaxis, :]
    # Subtracting the row maximum does not change the result but keeps
    # np.exp from overflowing on large scores.
    row_max = np.max(values, axis=1, keepdims=True)
    exps = np.exp(values - row_max)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals


# Turn each row of scaled scores into weights that sum to 1.
# NOTE: scores1 is overwritten again; from here on it holds attention weights.
scores1 = softmax(scores1)
scores1

array([[4.67695573e-10, 1.00000000e+00],
       [1.11377182e-12, 1.00000000e+00]])

In [9]:
# Head 1 output: each row is a weighted sum of the rows of V1.
attention1 = scores1 @ V1
attention1

array([[7.99, 8.84, 6.84],
       [7.99, 8.84, 6.84]])

In [10]:
def attention(x, WQ, WK, WV):
    """Single-head scaled dot-product self-attention.

    Args:
        x: Input rows; multiplied on the right by each weight matrix.
        WQ, WK, WV: Query, key and value projection matrices.

    Returns:
        softmax(Q @ K.T / sqrt(d_k)) @ V, where d_k is the number of columns
        of K.
    """
    K = x @ WK
    V = x @ WV
    Q = x @ WQ

    # Scale by the actual key width rather than a hard-coded 3, so the
    # function stays correct for projection matrices of any size.
    scores = Q @ K.T / np.sqrt(K.shape[-1])
    weights = softmax(scores)
    return weights @ V

In [11]:
# Head 1 again, this time through the attention() helper.
attention1 = attention(embedding, WQ1, WK1, WV1)
attention1

array([[7.99, 8.84, 6.84],
       [7.99, 8.84, 6.84]])

In [12]:
# Head 2: same input rows, second set of projection matrices.
attention2 = attention(embedding, WQ2, WK2, WV2)
attention2

array([[8.84, 3.99, 7.99],
       [8.84, 3.99, 7.99]])

In [13]:
# Put the two head outputs side by side (column-wise concatenation).
attentions = np.concatenate([attention1, attention2], axis=1)
attentions

array([[7.99, 8.84, 6.84, 8.84, 3.99, 7.99],
       [7.99, 8.84, 6.84, 8.84, 3.99, 7.99]])

In [14]:
# Output projection: a fixed 6x4 matrix of arbitrary values that maps the
# 6 concatenated head columns back to 4 columns.
W = np.array(
    [
        [0.79445237, 0.1081456, 0.27411536, 0.78394531],
        [0.29081936, -0.36187258, -0.32312791, -0.48530339],
        [-0.36702934, -0.76471963, -0.88058366, -1.73713022],
        [-0.02305587, -0.64315981, -0.68306653, -1.25393866],
        [0.29077448, -0.04121674, 0.01509932, 0.13149906],
        [0.57451867, -0.08895355, 0.02190485, 0.24535932],
    ]
)
# Multi-head attention output.
Z = attentions @ W
Z

array([[ 11.95481735, -14.12627891, -12.49250332, -18.50804518],
       [ 11.95481735, -14.12627891, -12.49250332, -18.50804518]])

In [15]:
# Random feed-forward parameters: 4 -> 8 for the first layer, 8 -> 4 for the
# second, plus one bias vector per layer.
# NOTE: no random seed is set in the visible cells, so these values (and every
# output derived from them) change on each run.
W1 = np.random.randn(4, 8)
W2 = np.random.randn(8, 4)
b1 = np.random.randn(8)
b2 = np.random.randn(4)

In [16]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and x."""
    floor = 0
    return np.maximum(floor, x)

def feed_forward(Z, W1, b1, W2, b2):
    """Two linear layers with a ReLU in between.

    Computes relu(Z.W1 + b1).W2 + b2.
    """
    hidden = relu(Z.dot(W1) + b1)
    return hidden.dot(W2) + b2

In [17]:
# Pass the multi-head attention output Z through the feed-forward layer.
output_encoder = feed_forward(Z, W1, b1, W2, b2)
output_encoder

array([[  4.96671815,  33.63661787, -36.76354977, -38.03381201],
       [  4.96671815,  33.63661787, -36.76354978, -38.03381202]])

In [18]:
# Sizes used by the random_* helpers defined in this notebook.
d_embedding = 4  # columns of every input/output row
d_key = d_value = d_query = 3  # columns of each per-head projection
d_feed_forward = 8  # hidden width of the feed-forward layer
n_attention_heads = 2  # number of heads generated per block

def attention(x, WQ, WK, WV):
    """Single-head scaled dot-product self-attention.

    Args:
        x: Input rows; multiplied on the right by each weight matrix.
        WQ, WK, WV: Query, key and value projection matrices.

    Returns:
        softmax(Q @ K.T / sqrt(d_k)) @ V, where d_k is the number of columns
        of K.
    """
    K = x @ WK
    V = x @ WV
    Q = x @ WQ

    # Take the key width from K itself instead of the module-level d_key:
    # the global can be reassigned by another cell and then no longer match
    # the matrices that were actually passed in.
    key_width = K.shape[-1]
    scores = Q @ K.T
    scores = scores / np.sqrt(key_width)
    weights = softmax(scores)
    return weights @ V

def multi_head_attention(x, WQs, WKs, WVs, WO=None):
    """Run several attention heads on x and project their concatenated outputs.

    Args:
        x: Input rows.
        WQs, WKs, WVs: Per-head query, key and value matrices; the three
            sequences are iterated together with zip.
        WO: Optional output projection with one row per concatenated head
            column and one column per column of x. When omitted, a new random
            matrix is drawn on every call (the previous behavior).

    Returns:
        The concatenated head outputs multiplied by the output projection.
    """
    heads = [attention(x, WQ, WK, WV) for WQ, WK, WV in zip(WQs, WKs, WVs)]
    attentions = np.concatenate(heads, axis=1)
    if WO is None:
        # Size the projection from the data rather than from the module-level
        # n_attention_heads / d_value / d_embedding, so it always fits the
        # heads that were actually supplied. np.shape also accepts a list.
        WO = np.random.randn(attentions.shape[1], np.shape(x)[-1])
    return attentions @ WO

def feed_forward(Z, W1, b1, W2, b2):
    """Apply a linear layer, a ReLU, and a second linear layer to Z."""
    pre_activation = Z.dot(W1) + b1
    activated = relu(pre_activation)
    return activated.dot(W2) + b2

def encoder_block(x, WQs, WKs, WVs, W1, b1, W2, b2):
    """One encoder block: multi-head attention followed by the feed-forward layer."""
    attended = multi_head_attention(x, WQs, WKs, WVs)
    return feed_forward(attended, W1, b1, W2, b2)

def random_encoder_block(x):
    """Apply one encoder block to x using freshly drawn random parameters."""

    def per_head(width):
        # One (d_embedding, width) random matrix for each attention head.
        return [
            np.random.randn(d_embedding, width)
            for _ in range(n_attention_heads)
        ]

    # Draw order is kept as queries, keys, values, then the feed-forward
    # weights and biases.
    query_weights = per_head(d_query)
    key_weights = per_head(d_key)
    value_weights = per_head(d_value)

    ff_w1 = np.random.randn(d_embedding, d_feed_forward)
    ff_b1 = np.random.randn(d_feed_forward)
    ff_w2 = np.random.randn(d_feed_forward, d_embedding)
    ff_b2 = np.random.randn(d_embedding)

    return encoder_block(
        x, query_weights, key_weights, value_weights, ff_w1, ff_b1, ff_w2, ff_b2
    )

In [19]:
embedding

array([[1.  , 3.  , 3.  , 5.  ],
       [2.84, 3.99, 4.  , 6.  ]])

In [20]:
random_encoder_block(embedding)

array([[  89.93068206,    1.11549344,  -40.16761922, -105.61800212],
       [  87.32358868,    2.66121053,  -39.94791518, -104.3511874 ]])

In [21]:
def encoder(x, n=6):
    """Pass x through n randomly initialized encoder blocks, one after another."""
    state = x
    for _layer in range(n):
        state = random_encoder_block(state)
    return state


encoder(embedding)

array([[   822335.58296085, -20651612.30220575, -16447459.15878337,
         -7748784.18792568],
       [   822335.58296085, -20651612.30220575, -16447459.15878337,
         -7748784.18792568]])

In [22]:
(embedding + Z).mean(axis=-1, keepdims=True)

array([[-5.29300251],
       [-4.08550252]])

In [23]:
(embedding + Z).std(axis=-1, keepdims=True)

array([[10.63170702],
       [10.99362604]])

In [24]:
def layer_norm(x, epsilon=1e-6):
    """Normalize each row of x along its last axis.

    Subtracts the per-row mean and divides by the per-row standard deviation
    plus epsilon; epsilon keeps the division defined when a row is constant.
    """
    centered = x - x.mean(axis=-1, keepdims=True)
    spread = x.std(axis=-1, keepdims=True)
    return centered / (spread + epsilon)

def encoder_block(x, WQs, WKs, WVs, W1, b1, W2, b2):
    """Encoder block with a residual add and layer norm after each sub-layer."""
    # Attention sub-layer: add the input back, then normalize.
    attended = layer_norm(multi_head_attention(x, WQs, WKs, WVs) + x)

    # Feed-forward sub-layer: add its input back, then normalize.
    transformed = feed_forward(attended, W1, b1, W2, b2)
    return layer_norm(transformed + attended)

In [25]:
layer_norm(Z + embedding)

array([[ 1.71635826, -0.54866785, -0.39499776, -0.77269265],
       [ 1.7173877 , -0.55038945, -0.40086868, -0.76612956]])

In [26]:
def encoder(x, n=6):
    """Chain n random encoder blocks, feeding each block's output to the next."""
    current = x
    for _block_index in range(n):
        current = random_encoder_block(current)
    return current


encoder(embedding)

array([[-1.06393141,  0.14680229,  1.55991141, -0.64278229],
       [-1.06391719,  0.14678596,  1.55991998, -0.64278875]])

In [27]:
d_embedding = 4
n_attention_heads = 2

# A single decoder input row with 4 features.
E = np.array([[1, 1, 0, 1]])
# Fresh random query/key/value projections, one matrix per head.
WQs = [np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)]
WKs = [np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)]
WVs = [np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)]

# Self-attention over the decoder input E.
Z_self_attention = multi_head_attention(E, WQs, WKs, WVs)
Z_self_attention

array([[ 6.95866806,  3.3584017 , -8.72447368,  5.78033371]])

In [28]:
# Residual add of the input E, followed by layer normalization.
# NOTE: this overwrites Z_self_attention, so re-running the cell changes it.
Z_self_attention = layer_norm(Z_self_attention + E)
Z_self_attention

array([[ 0.80538571,  0.26496303, -1.69885904,  0.6285103 ]])

In [29]:
def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
    """Single-head attention whose keys and values come from the encoder.

    Args:
        encoder_output: Rows projected by WK and WV into keys and values.
        attention_input: Rows projected by WQ into queries.
        WQ, WK, WV: Query, key and value projection matrices.

    Returns:
        softmax(Q @ K.T / sqrt(d_k)) @ V, where d_k is the number of columns
        of K.
    """
    # Keys and values are built from the encoder output; only the queries
    # come from attention_input. This is the difference from self-attention.
    K = encoder_output @ WK
    V = encoder_output @ WV
    Q = attention_input @ WQ

    # Scale by the real key width instead of the module-level d_key, which
    # other cells reassign and which may not match the matrices passed in.
    scores = Q @ K.T / np.sqrt(K.shape[-1])
    weights = softmax(scores)
    return weights @ V


def multi_head_encoder_decoder_attention(
    encoder_output, attention_input, WQs, WKs, WVs, WO=None
):
    """Multi-head encoder-decoder attention followed by an output projection.

    Args:
        encoder_output: Passed to every head as the source of keys and values.
        attention_input: Passed to every head as the source of queries.
        WQs, WKs, WVs: Per-head query, key and value matrices; the three
            sequences are iterated together with zip.
        WO: Optional output projection with one row per concatenated head
            column and one column per column of attention_input. When
            omitted, a new random matrix is drawn on every call (the previous
            behavior).

    Returns:
        The concatenated head outputs multiplied by the output projection.
    """
    heads = [
        encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV)
        for WQ, WK, WV in zip(WQs, WKs, WVs)
    ]
    attentions = np.concatenate(heads, axis=1)
    if WO is None:
        # Size the projection from the data rather than from the module-level
        # n_attention_heads / d_value / d_embedding, so it always fits the
        # heads that were actually supplied. np.shape also accepts a list.
        WO = np.random.randn(attentions.shape[1], np.shape(attention_input)[-1])
    return attentions @ WO

In [30]:
# New random projections for the encoder-decoder attention heads
# (these replace the WQs/WKs/WVs used for self-attention).
WQs = [np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)]
WKs = [np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)]
WVs = [np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)]

# Hand-written stand-in for an encoder output: two rows of width 4.
encoder_output = np.array([[-1.5, 1.0, -0.8, 1.5], [1.0, -1.0, -0.5, 1.0]])

# Queries come from Z_self_attention; keys and values from encoder_output.
Z_encoder_decoder = multi_head_encoder_decoder_attention(
    encoder_output, Z_self_attention, WQs, WKs, WVs
)
Z_encoder_decoder

array([[-5.33900838,  1.19704191,  2.27804123, -0.26290895]])

In [31]:
# Residual add of the self-attention output, followed by layer normalization.
# NOTE: this overwrites Z_encoder_decoder, so re-running the cell changes it.
Z_encoder_decoder = layer_norm(Z_encoder_decoder + Z_self_attention)
Z_encoder_decoder

array([[-1.70528006,  0.84955343,  0.47336843,  0.3823582 ]])

In [32]:
# New random feed-forward parameters; these overwrite the W1, W2, b1, b2
# created in the earlier feed-forward cell.
W1 = np.random.randn(4, 8)
W2 = np.random.randn(8, 4)
b1 = np.random.randn(8)
b2 = np.random.randn(4)

# Feed-forward sub-layer with residual add and layer normalization.
output = layer_norm(feed_forward(Z_encoder_decoder, W1, b1, W2, b2) + Z_encoder_decoder)
output

array([[ 0.5219184 , -1.73048459,  0.64106057,  0.56750562]])

In [33]:
# Same sizes as in the encoder section, restated for the decoder helpers.
d_embedding = 4
d_key = d_value = d_query = 3
d_feed_forward = 8
n_attention_heads = 2
# Hand-written stand-in for an encoder output: two rows of width 4.
encoder_output = np.array([[-1.5, 1.0, -0.8, 1.5], [1.0, -1.0, -0.5, 1.0]])

def decoder_block(
    x,
    encoder_output,
    WQs_self_attention, WKs_self_attention, WVs_self_attention,
    WQs_ed_attention, WKs_ed_attention, WVs_ed_attention,
    W1, b1, W2, b2,
):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-layers is followed by a residual add of its own
    input and a layer normalization.
    """
    # Sub-layer 1: self-attention over the decoder input.
    self_attended = layer_norm(
        multi_head_attention(
            x, WQs_self_attention, WKs_self_attention, WVs_self_attention
        )
        + x
    )

    # Sub-layer 2: attention that also receives the encoder output. This is
    # the part an encoder block does not have.
    cross_attended = layer_norm(
        multi_head_encoder_decoder_attention(
            encoder_output,
            self_attended,
            WQs_ed_attention,
            WKs_ed_attention,
            WVs_ed_attention,
        )
        + self_attended
    )

    # Sub-layer 3: feed-forward.
    transformed = feed_forward(cross_attended, W1, b1, W2, b2)
    return layer_norm(transformed + cross_attended)

def random_decoder_block(x, encoder_output):
    """Apply one decoder block to x using freshly drawn random parameters."""

    def per_head(width):
        # One (d_embedding, width) random matrix for each attention head.
        return [
            np.random.randn(d_embedding, width)
            for _ in range(n_attention_heads)
        ]

    # Draw order is kept: self-attention Q, K, V; encoder-decoder Q, K, V;
    # then the feed-forward weights and biases.
    self_q = per_head(d_query)
    self_k = per_head(d_key)
    self_v = per_head(d_value)

    cross_q = per_head(d_query)
    cross_k = per_head(d_key)
    cross_v = per_head(d_value)

    ff_w1 = np.random.randn(d_embedding, d_feed_forward)
    ff_b1 = np.random.randn(d_feed_forward)
    ff_w2 = np.random.randn(d_feed_forward, d_embedding)
    ff_b2 = np.random.randn(d_embedding)

    return decoder_block(
        x, encoder_output,
        self_q, self_k, self_v,
        cross_q, cross_k, cross_v,
        ff_w1, ff_b1, ff_w2, ff_b2,
    )

In [34]:
def decoder(x, decoder_embedding, n=6):
    """Pass x through n randomly initialized decoder blocks, one after another.

    decoder_embedding is forwarded unchanged to every block as the second
    argument of random_decoder_block, i.e. in the position that function
    names encoder_output.
    """
    state = x
    for _layer in range(n):
        state = random_decoder_block(state, decoder_embedding)
    return state

decoder(E, encoder_output)

array([[ 0.86819809,  0.05077623,  0.72823997, -1.6472143 ]])

In [35]:
def linear(x, W, b):
    """Affine map: the dot product of x and W, plus the bias b."""
    projected = np.dot(x, W)
    return projected + b

# Project a 4-element vector onto 10 outputs using random weights and bias.
x = linear([1, 0, 1, 0], np.random.randn(4, 10), np.random.randn(10))
x

array([ 1.08011155,  0.41452874,  0.29163508,  2.18390481,  0.56179814,
       -2.77183395, -0.18728854, -0.32215983, -1.92614305, -1.87526253])

In [36]:
softmax(x)

array([[0.16051384, 0.08249992, 0.07295944, 0.48404284, 0.09558986,
        0.00340905, 0.04519471, 0.03949242, 0.00794169, 0.00835622]])

In [37]:
# Toy 10-token vocabulary; "SOS" and "EOS" are the start and end markers
# that generate() looks for.
vocabulary = [
    "hello",
    "mundo",
    "world",
    "how",
    "?",
    "EOS",
    "SOS",
    "a",
    "hola",
    "c",
]
# One random 4-element embedding per vocabulary entry.
embedding_reps = np.random.randn(10, 4)
# Map each token to its embedding row.
vocabulary_embeddings = {
    word: embedding_reps[i] for i, word in enumerate(vocabulary)
}
vocabulary_embeddings

{'hello': array([-0.82087961, -0.64054363, -0.38639977, -1.00798525]),
 'mundo': array([-0.60512693, -0.1352113 ,  0.90482958, -1.56891667]),
 'world': array([ 1.14470487, -1.85164297,  0.92776147, -0.51977999]),
 'how': array([ 0.62038366,  0.89215519,  0.81909244, -0.02642335]),
 '?': array([ 0.63584655,  1.39137262, -1.42092484, -0.1498072 ]),
 'EOS': array([-1.4973367 , -0.60157315, -1.18544969,  1.37872054]),
 'SOS': array([-0.10597324, -0.55418694,  0.13619576,  0.93146755]),
 'a': array([0.14341067, 0.32762159, 0.03070575, 0.55885494]),
 'hola': array([ 0.51286258, -0.46369143, -1.56593688,  0.38688579]),
 'c': array([-0.53763329,  0.47407343,  0.44130252, -0.11569071])}

In [38]:
def generate(input_sequence, max_iters=3):
    """Greedily decode an output sequence for input_sequence.

    Args:
        input_sequence: Iterable of tokens; each must be a key of
            vocabulary_embeddings.
        max_iters: Maximum number of decoding steps.

    Returns:
        A tuple (output, sequence_embeddings): the generated tokens joined by
        spaces (starting with "SOS") and the list of their embeddings. The
        same tuple shape is returned whether decoding stops at "EOS" or after
        max_iters steps.
    """
    # We first encode the inputs into embeddings
    # This skips the positional encoding step for simplicity
    embedded_inputs = [
        vocabulary_embeddings[token] for token in input_sequence
    ]
    print("Embedding representation (encoder input)", embedded_inputs)

    # We then generate an embedding representation
    encoder_output = encoder(embedded_inputs)
    print("Embedding generated by encoder (encoder output)", encoder_output)

    # We initialize the decoder output with the embedding of the start token
    sequence_embeddings = [vocabulary_embeddings["SOS"]]
    output = "SOS"

    # Random matrices for the linear layer
    W_linear = np.random.randn(d_embedding, len(vocabulary))
    b_linear = np.random.randn(len(vocabulary))

    # We limit number of decoding steps to avoid too long sequences without EOS
    for i in range(max_iters):
        # Decoder step
        decoder_output = decoder(sequence_embeddings, encoder_output)

        # Only use the last output for prediction
        logits = linear(decoder_output[-1], W_linear, b_linear)
        # We wrap logits in a list so softmax receives a single-row batch
        probs = softmax([logits])

        # We get the most likely next token
        next_token = vocabulary[np.argmax(probs)]
        sequence_embeddings.append(vocabulary_embeddings[next_token])
        output += " " + next_token

        print(
            "Iteration", i,
            "next token", next_token,
            "with probability of", np.max(probs),
        )

        # Stop at the end token. Breaking (instead of returning only the
        # string here) keeps the return type the same on every path.
        if next_token == "EOS":
            break

    return output, sequence_embeddings

In [39]:
generate(["hello", "world"])

Embedding representation (encoder input) [array([-0.82087961, -0.64054363, -0.38639977, -1.00798525]), array([ 1.14470487, -1.85164297,  0.92776147, -0.51977999])]
Embedding generated by encoder (encoder output) [[-0.57053348 -0.56023307  1.73185505 -0.60108849]
 [-0.57053347 -0.5602332   1.73185505 -0.60108838]]
Iteration 0 next token hola with probability of 0.4517065200675811
Iteration 1 next token hola with probability of 0.35317224393434704
Iteration 2 next token c with probability of 0.833188985883351


('SOS hola hola c',
 [array([-0.10597324, -0.55418694,  0.13619576,  0.93146755]),
  array([ 0.51286258, -0.46369143, -1.56593688,  0.38688579]),
  array([ 0.51286258, -0.46369143, -1.56593688,  0.38688579]),
  array([-0.53763329,  0.47407343,  0.44130252, -0.11569071])])