In [118]:
import numpy as np

In [119]:
# Attention head 1: key, value and query projections. Every matrix has shape
# (4, 3), so it maps a 4-dimensional embedding to a 3-dimensional vector.
WK1 = np.array([
    [1, 0, 1],
    [0, 1, 0],
    [1, 0, 1],
    [0, 1, 0],
])
WV1 = np.array([
    [0, 1, 1],
    [1, 0, 0],
    [1, 0, 1],
    [0, 1, 0],
])
WQ1 = np.array([
    [0, 0, 0],
    [1, 1, 0],
    [0, 0, 1],
    [1, 0, 0],
])

# Attention head 2: same shapes, different values.
WK2 = np.array([
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 0],
    [0, 1, 0],
])
WV2 = np.array([
    [1, 0, 0],
    [0, 1, 1],
    [0, 0, 1],
    [1, 0, 0],
])
WQ2 = np.array([
    [1, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
])

In [120]:
# Input to the encoder: two rows, each a 4-dimensional embedding vector.
embedding = np.array(
    [
        [1, 3, 3, 5],
        [2.84, 3.99, 4, 6],
    ]
)
# Keys of head 1: every embedding row projected through WK1.
K1 = np.matmul(embedding, WK1)
K1

array([[4.  , 8.  , 4.  ],
       [6.84, 9.99, 6.84]])

In [121]:
# Values of head 1: every embedding row projected through WV1.
V1 = np.matmul(embedding, WV1)
V1

array([[6.  , 6.  , 4.  ],
       [7.99, 8.84, 6.84]])

In [122]:
# Queries of head 1: every embedding row projected through WQ1.
Q1 = np.matmul(embedding, WQ1)
Q1

array([[8.  , 3.  , 3.  ],
       [9.99, 3.99, 4.  ]])

In [123]:
# Raw attention scores: the dot product of every query row with every key row.
scores1 = np.matmul(Q1, K1.T)
scores1

array([[ 68.    , 105.21  ],
       [ 87.88  , 135.5517]])

In [124]:
# Scale the raw scores by sqrt(d_k), where d_k is the key dimension (3 here).
# The scaled scores are recomputed from Q1 and K1 instead of dividing scores1
# in place, so re-running this cell cannot scale the values a second time.
scores1 = (Q1 @ K1.T) / np.sqrt(K1.shape[-1])
scores1

array([[39.2598183 , 60.74302182],
       [50.73754166, 78.26081048]])

In [125]:
def softmax(x):
    """Numerically stable softmax over axis 1.

    Args:
        x: Array-like of scores. A 1-D input is treated as a single row.

    Returns:
        Array with the same 2-D shape as the (possibly promoted) input whose
        rows sum to 1. Note that a 1-D input yields a result of shape (1, n).
    """
    logits = np.array(x)
    if logits.ndim == 1:
        # Promote a flat vector to a one-row matrix.
        logits = logits.reshape(1, -1)
    # Subtracting the row maximum keeps exp() from overflowing and does not
    # change the result.
    weights = np.exp(logits - logits.max(axis=1, keepdims=True))
    return weights / weights.sum(axis=1, keepdims=True)


# Turn the scaled scores into attention weights (every row sums to 1).
# The input is recomputed from Q1 and K1 instead of overwriting scores1 with
# softmax(scores1), so re-running this cell cannot apply softmax twice.
scores1 = softmax((Q1 @ K1.T) / np.sqrt(K1.shape[-1]))
scores1

array([[4.67695573e-10, 1.00000000e+00],
       [1.11377182e-12, 1.00000000e+00]])

In [126]:
# Output of head 1: each row is the weighted sum of the value rows, using the
# attention weights held in scores1.
attention1 = np.matmul(scores1, V1)
attention1

array([[7.99, 8.84, 6.84],
       [7.99, 8.84, 6.84]])

In [127]:
def attention(x, WQ, WK, WV):
    """Single-head scaled dot-product self-attention.

    Args:
        x: Input embeddings, one row per position.
        WQ: Query projection matrix.
        WK: Key projection matrix.
        WV: Value projection matrix.

    Returns:
        ``softmax(Q @ K.T / sqrt(d_k)) @ V``, where ``d_k`` is the number of
        columns of the projected keys.
    """
    K = x @ WK
    V = x @ WV
    Q = x @ WQ

    # Scale by the actual key width instead of a hard-coded 3, so the function
    # stays correct for projection matrices of any size.
    d_k = K.shape[-1]
    weights = softmax((Q @ K.T) / np.sqrt(d_k))
    return weights @ V

In [128]:
# Recompute head 1 through attention(); the result matches the value of
# attention1 obtained step by step in the earlier cells.
attention1 = attention(embedding, WQ1, WK1, WV1)
attention1

array([[7.99, 8.84, 6.84],
       [7.99, 8.84, 6.84]])

In [129]:
# Second attention head: same input embedding, different projection matrices.
attention2 = attention(embedding, WQ2, WK2, WV2)
attention2

array([[8.84, 3.99, 7.99],
       [8.84, 3.99, 7.99]])

In [130]:
# Put the two head outputs side by side: two (2, 3) arrays become one (2, 6)
# array.
attentions = np.concatenate((attention1, attention2), axis=-1)
attentions

array([[7.99, 8.84, 6.84, 8.84, 3.99, 7.99],
       [7.99, 8.84, 6.84, 8.84, 3.99, 7.99]])

In [131]:
# Output projection filled with arbitrary values. Its (6, 4) shape maps the six
# concatenated head features back to four columns, the width of `embedding`.
W = np.array(
    [
        [0.79445237, 0.1081456, 0.27411536, 0.78394531],
        [0.29081936, -0.36187258, -0.32312791, -0.48530339],
        [-0.36702934, -0.76471963, -0.88058366, -1.73713022],
        [-0.02305587, -0.64315981, -0.68306653, -1.25393866],
        [0.29077448, -0.04121674, 0.01509932, 0.13149906],
        [0.57451867, -0.08895355, 0.02190485, 0.24535932],
    ]
)
# Z is the multi-head attention output.
Z = np.matmul(attentions, W)
Z

array([[ 11.95481735, -14.12627891, -12.49250332, -18.50804518],
       [ 11.95481735, -14.12627891, -12.49250332, -18.50804518]])

In [132]:
# Random parameters for a two-layer feed-forward network (4 -> 8 -> 4).
# The draw order W1, W2, b1, b2 is kept so a seeded generator gives the same
# values as before.
W1, W2 = np.random.randn(4, 8), np.random.randn(8, 4)
b1, b2 = np.random.randn(8), np.random.randn(4)

In [133]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def feed_forward(Z, W1, b1, W2, b2):
    """Two-layer feed-forward network with a ReLU in between.

    Computes ``relu(Z @ W1 + b1) @ W2 + b2``.
    """
    hidden = relu(Z.dot(W1) + b1)
    return hidden.dot(W2) + b2

In [134]:
# Apply the feed-forward network to the multi-head attention output Z.
output_encoder = feed_forward(Z, W1, b1, W2, b2)
output_encoder

array([[ 51.77304821,   2.26545466, -14.22306709, -14.92134175],
       [ 51.77304821,   2.26545467, -14.22306709, -14.92134175]])

In [135]:
# Model dimensions shared by the functions defined below.
d_embedding = 4        # width of every embedding row
d_key = 3              # width of the projected keys
d_value = 3            # width of the projected values
d_query = 3            # width of the projected queries
d_feed_forward = 8     # hidden width of the feed-forward network
n_attention_heads = 2  # number of attention heads

def attention(x, WQ, WK, WV):
    """Single-head scaled dot-product self-attention.

    Keys, values and queries are all projections of ``x``. The scores are
    scaled by the square root of the module-level ``d_key``.
    """
    keys = x @ WK
    values = x @ WV
    queries = x @ WQ

    scaled_scores = (queries @ keys.T) / np.sqrt(d_key)
    return softmax(scaled_scores) @ values

def multi_head_attention(x, WQs, WKs, WVs, W=None):
    """Run several attention heads on ``x`` and project the concatenated result.

    Args:
        x: Input embeddings, one row per position.
        WQs, WKs, WVs: Per-head projection matrices. They are zipped, so head
            ``i`` uses ``WQs[i]``, ``WKs[i]`` and ``WVs[i]``.
        W: Optional output projection with one row per concatenated head
            feature. When omitted, a fresh random matrix is drawn on every
            call (the previous behaviour).

    Returns:
        The concatenated head outputs multiplied by the output projection.
    """
    attentions = np.concatenate(
        [attention(x, WQ, WK, WV) for WQ, WK, WV in zip(WQs, WKs, WVs)], axis=1
    )
    if W is None:
        # Size the projection from the data rather than from module-level
        # constants, so it always fits the heads that were actually passed in
        # and maps back to the width of x (needed for a residual connection).
        W = np.random.randn(attentions.shape[1], np.shape(x)[-1])
    return attentions @ W

def feed_forward(Z, W1, b1, W2, b2):
    """Two-layer feed-forward network with a ReLU in between.

    Computes ``relu(Z @ W1 + b1) @ W2 + b2``.
    """
    hidden = relu(Z.dot(W1) + b1)
    return hidden.dot(W2) + b2

def encoder_block(x, WQs, WKs, WVs, W1, b1, W2, b2):
    """One encoder block: multi-head attention followed by the feed-forward net.

    This version has no residual connections and no normalisation.
    """
    attended = multi_head_attention(x, WQs, WKs, WVs)
    return feed_forward(attended, W1, b1, W2, b2)

def random_encoder_block(x):
    """Run ``x`` through one encoder block with freshly drawn random parameters.

    Shapes come from the module-level dimension constants. New parameters are
    drawn on every call.
    """

    def draw_heads(width):
        # One (d_embedding, width) projection per attention head.
        return [
            np.random.randn(d_embedding, width) for _ in range(n_attention_heads)
        ]

    # Draw order is queries, keys, values, then W1, b1, W2, b2.
    WQs = draw_heads(d_query)
    WKs = draw_heads(d_key)
    WVs = draw_heads(d_value)
    feed_forward_params = (
        np.random.randn(d_embedding, d_feed_forward),  # W1
        np.random.randn(d_feed_forward),               # b1
        np.random.randn(d_feed_forward, d_embedding),  # W2
        np.random.randn(d_embedding),                  # b2
    )
    return encoder_block(x, WQs, WKs, WVs, *feed_forward_params)

In [136]:
# Show the input embedding again for reference.
embedding

array([[1.  , 3.  , 3.  , 5.  ],
       [2.84, 3.99, 4.  , 6.  ]])

In [137]:
# One encoder block with freshly drawn random weights. No seed is set in the
# visible cells, so the numbers change from run to run.
random_encoder_block(embedding)

array([[  6.15837065, -30.64037675,  -7.86182274,  -3.35033514],
       [  6.15840883, -30.64053491,  -7.86186341,  -3.35035016]])

In [138]:
def encoder(x, n=6):
    """Feed ``x`` through ``n`` random encoder blocks, one after another."""
    hidden = x
    for _layer in range(n):
        hidden = random_encoder_block(hidden)
    return hidden


# Run the default stack of 6 random encoder blocks on the input embedding.
encoder(embedding)

array([[  6457673.25413776, -43390583.29903263,  19039272.34502068,
        -30805080.65915656],
       [  6457673.25413776, -43390583.29903263,  19039272.34502068,
        -30805080.65915656]])

In [139]:
# Mean of every row of the residual sum (attention output plus input).
(Z + embedding).mean(axis=1, keepdims=True)

array([[-5.29300251],
       [-4.08550252]])

In [140]:
# Standard deviation of every row of the residual sum.
(Z + embedding).std(axis=1, keepdims=True)

array([[10.63170702],
       [10.99362604]])

In [141]:
def layer_norm(x, epsilon=1e-6):
    """Normalise ``x`` along its last axis.

    Args:
        x: NumPy array to normalise.
        epsilon: Small constant added to the standard deviation so that a
            constant row does not cause a division by zero.

    Returns:
        ``(x - mean) / (std + epsilon)``, computed over the last axis.
    """
    row_mean = x.mean(axis=-1, keepdims=True)
    row_scale = x.std(axis=-1, keepdims=True) + epsilon
    centered = x - row_mean
    return centered / row_scale

def encoder_block(x, WQs, WKs, WVs, W1, b1, W2, b2):
    """One encoder block with a residual connection and layer norm per sub-layer.

    Both the multi-head attention and the feed-forward network have their
    input added back to their output before the sum is normalised.
    """
    attended = layer_norm(multi_head_attention(x, WQs, WKs, WVs) + x)
    return layer_norm(feed_forward(attended, W1, b1, W2, b2) + attended)

In [142]:
# Add the attention output Z to the input (residual connection), then
# normalise every row.
layer_norm(Z + embedding)

array([[ 1.71635826, -0.54866785, -0.39499776, -0.77269265],
       [ 1.7173877 , -0.55038945, -0.40086868, -0.76612956]])

In [143]:
def encoder(x, n=6):
    """Feed ``x`` through ``n`` random encoder blocks, one after another."""
    hidden = x
    for _layer in range(n):
        hidden = random_encoder_block(hidden)
    return hidden


# Run the 6-block encoder again. random_encoder_block looks up encoder_block
# when it is called, so this run uses the version with layer normalisation.
encoder(embedding)

array([[ 0.21283866, -1.5675127 ,  0.13880594,  1.2158681 ],
       [ 0.21283514, -1.56752293,  0.13883565,  1.21585214]])

In [144]:
# Dimensions used by the decoder walkthrough.
d_embedding = 4
n_attention_heads = 2

# Decoder input: a single row holding one 4-dimensional embedding.
E = np.array([[1, 1, 0, 1]])

# Random per-head projections for decoder self-attention, drawn in the order
# queries, keys, values.
WQs = [
    np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)
]
WKs = [
    np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)
]
WVs = [
    np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)
]

Z_self_attention = multi_head_attention(E, WQs, WKs, WVs)
Z_self_attention

array([[ 2.58130419,  2.1213747 , -2.65274035,  2.525234  ]])

In [145]:
# Residual connection plus layer normalisation around the decoder
# self-attention.
# NOTE: this overwrites Z_self_attention with a function of itself, so
# re-running the cell on its own normalises an already normalised value.
Z_self_attention = layer_norm(Z_self_attention + E)
Z_self_attention

array([[ 0.6414116,  0.4665955, -1.7281068,  0.6200997]])

In [146]:
def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
    """Single-head encoder-decoder (cross) attention.

    Unlike self-attention, keys and values are projections of
    ``encoder_output``, while queries are projections of ``attention_input``.
    The scores are scaled by the square root of the module-level ``d_key``.
    """
    # Keys and values come from the encoder output.
    keys = encoder_output @ WK
    values = encoder_output @ WV
    # Queries come from the decoder side, as in self-attention.
    queries = attention_input @ WQ

    # From here on the computation matches self-attention.
    scaled_scores = (queries @ keys.T) / np.sqrt(d_key)
    return softmax(scaled_scores) @ values


def multi_head_encoder_decoder_attention(
    encoder_output, attention_input, WQs, WKs, WVs, W=None
):
    """Run several encoder-decoder attention heads and project the result.

    Args:
        encoder_output: Encoder output; source of keys and values.
        attention_input: Decoder-side input; source of queries.
        WQs, WKs, WVs: Per-head projection matrices. They are zipped, so head
            ``i`` uses ``WQs[i]``, ``WKs[i]`` and ``WVs[i]``.
        W: Optional output projection with one row per concatenated head
            feature. When omitted, a fresh random matrix is drawn on every
            call (the previous behaviour).

    Returns:
        The concatenated head outputs multiplied by the output projection.
    """
    attentions = np.concatenate(
        [
            encoder_decoder_attention(
                encoder_output, attention_input, WQ, WK, WV
            )
            for WQ, WK, WV in zip(WQs, WKs, WVs)
        ],
        axis=1,
    )
    if W is None:
        # Size the projection from the data rather than from module-level
        # constants: rows match the concatenated heads, columns match the
        # width of attention_input so the result can be added back to it.
        W = np.random.randn(attentions.shape[1], np.shape(attention_input)[-1])
    return attentions @ W

In [147]:
# Fresh random per-head projections for the encoder-decoder attention, drawn
# in the order queries, keys, values.
WQs = [
    np.random.randn(d_embedding, d_query) for _ in range(n_attention_heads)
]
WKs = [
    np.random.randn(d_embedding, d_key) for _ in range(n_attention_heads)
]
WVs = [
    np.random.randn(d_embedding, d_value) for _ in range(n_attention_heads)
]

# Stand-in for a real encoder output: two rows of width 4.
encoder_output = np.array(
    [
        [-1.5, 1.0, -0.8, 1.5],
        [1.0, -1.0, -0.5, 1.0],
    ]
)

# Queries come from the decoder self-attention output; keys and values come
# from encoder_output.
Z_encoder_decoder = multi_head_encoder_decoder_attention(
    encoder_output, Z_self_attention, WQs, WKs, WVs
)
Z_encoder_decoder

array([[-1.02529108, -4.32320118,  4.78989449, -9.23399638]])

In [148]:
# Residual connection plus layer normalisation around the encoder-decoder
# attention.
# NOTE: this overwrites Z_encoder_decoder with a function of itself, so
# re-running the cell on its own normalises an already normalised value.
Z_encoder_decoder = layer_norm(Z_encoder_decoder + Z_self_attention)
Z_encoder_decoder

array([[ 0.47793213, -0.32609457,  1.27569396, -1.42753153]])

In [149]:
# Random feed-forward parameters (4 -> 8 -> 4), drawn in the order
# W1, W2, b1, b2.
W1, W2 = np.random.randn(4, 8), np.random.randn(8, 4)
b1, b2 = np.random.randn(8), np.random.randn(4)

# Last decoder sub-layer: feed-forward network, residual connection, layer
# normalisation.
output = layer_norm(
    feed_forward(Z_encoder_decoder, W1, b1, W2, b2) + Z_encoder_decoder
)
output

array([[-1.66783935,  0.12313221,  0.70133706,  0.84337007]])

In [150]:
# Model dimensions used by the decoder functions below.
d_embedding = 4        # width of every embedding row
d_key = 3              # width of the projected keys
d_value = 3            # width of the projected values
d_query = 3            # width of the projected queries
d_feed_forward = 8     # hidden width of the feed-forward network
n_attention_heads = 2  # number of attention heads

# Stand-in for a real encoder output: two rows of width 4.
encoder_output = np.array(
    [
        [-1.5, 1.0, -0.8, 1.5],
        [1.0, -1.0, -0.5, 1.0],
    ]
)

def decoder_block(
    x,
    encoder_output,
    WQs_self_attention, WKs_self_attention, WVs_self_attention,
    WQs_ed_attention, WKs_ed_attention, WVs_ed_attention,
    W1, b1, W2, b2,
):
    """One decoder block made of three residual-and-normalised sub-layers.

    1. Multi-head self-attention over ``x``.
    2. Multi-head encoder-decoder attention, with keys and values taken from
       ``encoder_output`` and queries from the result of step 1.
    3. The feed-forward network.

    Each sub-layer's input is added to its output before layer normalisation.
    """
    # Sub-layer 1: self-attention, as in the encoder.
    self_attended = layer_norm(
        multi_head_attention(
            x, WQs_self_attention, WKs_self_attention, WVs_self_attention
        )
        + x
    )

    # Sub-layer 2: attention over the encoder output (the decoder-only part).
    cross_attended = layer_norm(
        multi_head_encoder_decoder_attention(
            encoder_output,
            self_attended,
            WQs_ed_attention,
            WKs_ed_attention,
            WVs_ed_attention,
        )
        + self_attended
    )

    # Sub-layer 3: feed-forward network, as in the encoder.
    return layer_norm(
        feed_forward(cross_attended, W1, b1, W2, b2) + cross_attended
    )

def random_decoder_block(x, encoder_output):
    """Run ``x`` through one decoder block with freshly drawn random parameters.

    Shapes come from the module-level dimension constants. New parameters are
    drawn on every call.
    """

    def draw_heads(width):
        # One (d_embedding, width) projection per attention head.
        return [
            np.random.randn(d_embedding, width) for _ in range(n_attention_heads)
        ]

    # Draw order: self-attention Q, K, V; encoder-decoder attention Q, K, V;
    # then the feed-forward parameters W1, b1, W2, b2.
    self_attention_weights = (
        draw_heads(d_query),
        draw_heads(d_key),
        draw_heads(d_value),
    )
    ed_attention_weights = (
        draw_heads(d_query),
        draw_heads(d_key),
        draw_heads(d_value),
    )
    feed_forward_params = (
        np.random.randn(d_embedding, d_feed_forward),  # W1
        np.random.randn(d_feed_forward),               # b1
        np.random.randn(d_feed_forward, d_embedding),  # W2
        np.random.randn(d_embedding),                  # b2
    )

    return decoder_block(
        x,
        encoder_output,
        *self_attention_weights,
        *ed_attention_weights,
        *feed_forward_params,
    )

In [151]:
def decoder(x, decoder_embedding, n=6):
    """Feed ``x`` through ``n`` random decoder blocks, one after another.

    Despite its name, ``decoder_embedding`` is handed to every block as the
    encoder output (the source of keys and values for encoder-decoder
    attention). The callers in this notebook pass the encoder output here.
    """
    hidden = x
    for _layer in range(n):
        hidden = random_decoder_block(hidden, decoder_embedding)
    return hidden

# Decode the single-row input E against the two-row encoder_output.
decoder(E, encoder_output)

array([[-0.58272481, -0.90031757,  1.67697472, -0.19393235]])

In [152]:
def linear(x, W, b):
    """Affine map: the dot product of ``x`` and ``W``, plus the bias ``b``."""
    projected = np.dot(x, W)
    return projected + b

# Project a 4-dimensional vector onto 10 output scores using random weights
# and a random bias (the weights are drawn before the bias).
x = linear(
    [1, 0, 1, 0],
    W=np.random.randn(4, 10),
    b=np.random.randn(10),
)
x

array([-2.55368201,  3.26190882, -0.09717209,  0.42648578,  0.30394799,
        0.97572017, -0.28418015, -1.65858315,  1.32533095, -1.06461042])

In [153]:
# Convert the 10 scores into probabilities. softmax promotes a 1-D input to a
# single row, so the result has shape (1, 10).
softmax(x)

array([[0.00206484, 0.69273079, 0.02408431, 0.04065893, 0.03596983,
        0.07041827, 0.01997642, 0.00505385, 0.0998894 , 0.00915337]])

In [154]:
# Toy vocabulary; "SOS" and "EOS" are the start and end markers used by
# generate().
vocabulary = [
    "hello",
    "mundo",
    "world",
    "how",
    "?",
    "EOS",
    "SOS",
    "a",
    "hola",
    "c",
]
# One random 4-dimensional embedding per word. The row count follows the
# vocabulary, so adding or removing a word cannot leave the table out of sync.
# The width 4 has to equal d_embedding.
embedding_reps = np.random.randn(len(vocabulary), 4)
# Map every word to its embedding row.
vocabulary_embeddings = dict(zip(vocabulary, embedding_reps))
vocabulary_embeddings

{'hello': array([-0.3740027 , -0.27331461,  0.83044974, -0.89449293]),
 'mundo': array([ 0.51187159, -0.23632051,  1.36690286, -0.48006702]),
 'world': array([ 1.56589079, -0.62575696, -1.28642105, -0.6150428 ]),
 'how': array([ 1.5089655 ,  0.48213663, -0.93913669,  1.87220858]),
 '?': array([-0.40593975, -1.48501512,  1.24030524, -1.16183631]),
 'EOS': array([-1.28896434,  0.34225844, -0.27898599, -1.92970403]),
 'SOS': array([-0.50054203, -0.29423137, -1.40294651, -0.12970428]),
 'a': array([ 1.1007179 , -1.86644389, -0.81330588,  0.81295671]),
 'hola': array([ 1.2931899 ,  1.51516573, -0.11730997,  0.86117891]),
 'c': array([ 1.76754676, -1.49698142,  2.94760718,  0.04001005])}

In [155]:
def generate(input_sequence, max_iters=3):
    """Greedily decode an output sequence for ``input_sequence``.

    All weights (encoder, decoder and final linear layer) are random, so the
    generated tokens carry no meaning; the function only demonstrates the data
    flow. Positional encoding is skipped for simplicity.

    Args:
        input_sequence: Iterable of tokens; each must be a key of
            ``vocabulary_embeddings`` (a ``KeyError`` is raised otherwise).
        max_iters: Maximum number of decoding steps, used to bound the output
            length when no "EOS" token is produced.

    Returns:
        A tuple ``(output, sequence_embeddings)``: the generated text starting
        with "SOS", and the list of embeddings of the generated tokens. The
        same tuple is returned whether decoding stops on "EOS" or on
        ``max_iters``.
    """
    # We first encode the inputs into embeddings
    embedded_inputs = [
        vocabulary_embeddings[token] for token in input_sequence
    ]
    print("Embedding representation (encoder input)", embedded_inputs)

    # We then generate an embedding representation
    encoder_output = encoder(embedded_inputs)
    print("Embedding generated by encoder (encoder output)", encoder_output)

    # We initialize the decoder output with the embedding of the start token
    sequence_embeddings = [vocabulary_embeddings["SOS"]]
    output = "SOS"

    # Random matrices for the linear layer
    W_linear = np.random.randn(d_embedding, len(vocabulary))
    b_linear = np.random.randn(len(vocabulary))

    for i in range(max_iters):
        # Decoder step over everything generated so far
        decoder_output = decoder(sequence_embeddings, encoder_output)

        # Only the last position is used to predict the next token
        logits = linear(decoder_output[-1], W_linear, b_linear)
        # softmax expects a batch (2-D array), hence the wrapping list
        probs = softmax([logits])

        # Greedy choice: take the most likely next token
        next_token = vocabulary[np.argmax(probs)]
        sequence_embeddings.append(vocabulary_embeddings[next_token])
        output += " " + next_token

        print(
            "Iteration", i,
            "next token", next_token,
            "with probability of", np.max(probs),
        )

        # Stop at the end token. Leaving the loop (instead of returning the
        # bare string here) keeps the return type the same on both exit paths.
        if next_token == "EOS":
            break

    return output, sequence_embeddings

In [156]:
# Run generation on a two-token input. All weights are random, so the
# generated tokens are arbitrary.
generate(["hello", "world"])

Embedding representation (encoder input) [array([-0.3740027 , -0.27331461,  0.83044974, -0.89449293]), array([ 1.56589079, -0.62575696, -1.28642105, -0.6150428 ])]
Embedding generated by encoder (encoder output) [[-1.05509726 -0.42878511  1.63691548 -0.15303311]
 [-1.0551545  -0.42873063  1.63689502 -0.15300989]]
Iteration 0 next token SOS with probability of 0.6542590025392748
Iteration 1 next token how with probability of 0.6583692092399231
Iteration 2 next token a with probability of 0.41477578261860854


('SOS SOS how a',
 [array([-0.50054203, -0.29423137, -1.40294651, -0.12970428]),
  array([-0.50054203, -0.29423137, -1.40294651, -0.12970428]),
  array([ 1.5089655 ,  0.48213663, -0.93913669,  1.87220858]),
  array([ 1.1007179 , -1.86644389, -0.81330588,  0.81295671])])