In [30]:
import numpy as np
from rich import print
from icecream import ic


In [31]:
# Toy input: a two-word sentence and a hand-picked 4-dimensional embedding per word.
# (A random alternative would be np.random.randint(1, 10, size=(2, 4)).)
text = "Hello, world"
text_embedding = np.array(
    [
        [1.0, 2.0, 3.0, 4.0],  # row 0: embedding of the first word
        [2.0, 3.0, 4.0, 5.0],  # row 1: embedding of the second word
    ]
)
print(text_embedding)


The text embedding has no concept of order or position, yet word order matters, in the English language at least. 
The position of each embedding vector is typically represented as another vector of the same size as the embedding vector. This means we can either learn the position embedding or just use a fixed vector, as was done in the landmark paper "Attention Is All You Need". The original paper used a sine function for the even indices of the embedding vector and a cosine function for the odd indices. (The paper uses a base of 10000 in the formulas below; this notebook uses 1000.)

$$ PE(pos, 2i) = \sin\left(\frac{pos}{1000^{\frac{2i}{d_{model}}}}\right)$$
$$ PE(pos, 2i+1) = \cos\left(\frac{pos}{1000^{\frac{2i}{d_{model}}}}\right)$$

- *dmodel* is the length of the embedding vector, which is 4 in this case. This is the number of columns in our embedding matrix.
- *i* indexes the sine/cosine pairs: $2i$ and $2i+1$ are the even and odd element indices of the embedding vector.
- *pos* is the row number of the token in the text. In our case, we have two words and therefore two rows.  


In [32]:
import math


def position_embedding(embed_vectors, base=1000):
    """Return the fixed sinusoidal positional encoding for an embedding matrix.

    For position ``pos`` (row) and column pair ``i``::

        PE[pos, 2i]     = sin(pos / base ** (2i / dmodel))
        PE[pos, 2i + 1] = cos(pos / base ** (2i / dmodel))

    Parameters
    ----------
    embed_vectors : 2-D array of shape (n_positions, dmodel)
        Only its shape is used; the values are ignored.
    base : float, optional
        Base of the frequency term. Defaults to 1000, the value used by the
        formulas in this notebook.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_positions, dmodel). It is always floating
        point, so an integer-typed input no longer truncates the encoding.
    """
    n_positions, dmodel = embed_vectors.shape
    positions = np.arange(n_positions, dtype=float)[:, np.newaxis]
    columns = np.arange(dmodel)
    # Columns 2i and 2i+1 share one frequency, so the exponent is built from
    # the pair index i = column // 2, not from the raw column index.
    exponents = 2 * (columns // 2) / dmodel
    angles = positions / np.power(float(base), exponents)
    # Even columns take the sine, odd columns the cosine.
    return np.where(columns % 2 == 0, np.sin(angles), np.cos(angles))


In [33]:
# Compute the positional encoding matching the shape of text_embedding and display it.
text_position = position_embedding(text_embedding)
print(text_position)


In [34]:
# Add the text_embedding and text_position arrays element-wise to create the
# final encoder input embedding (same shape as text_embedding).

encoder_input = text_embedding + text_position
print(encoder_input)


# Attention
The model uses self-attention to attend to different parts of the encoder input. Multi-head attention is a mechanism for the model to jointly attend to different parts of the input in different vector subspaces.
This is accomplished with multiple attention heads, each with its own K, V, and Q matrices.

**Where did these matrices come from?**

Let us assume we have two heads in this case. We then have six Q, K, and V weight matrices in total. The first three matrices belong to the first head and the last three matrices belong to the second head.


In [35]:
# Hand-picked key / value / query weight matrices, one triple per head.
# Each matrix has 4 rows and 3 columns.
# Head 1:
WK1 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
WV1 = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1], [0, 1, 0]])
WQ1 = np.array([[0, 0, 0], [1, 1, 0], [0, 0, 1], [1, 0, 0]]) # 4 X 3
# Head 2:
WK2 = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [0, 1, 0]])
WV2 = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 0, 0]])
WQ2 = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1]])


In [36]:
def get_QKV(embed, WK, WV, WQ):
    """Project ``embed`` with the key, value and query weight matrices.

    Returns a tuple ``(embed . WK, embed . WV, embed . WQ)``, in that order
    (keys first, then values, then queries).
    """
    return tuple(np.dot(embed, weights) for weights in (WK, WV, WQ))


In [37]:
# Project the encoder input once per head; get_QKV returns (keys, values, queries).
K1, V1, Q1 = get_QKV(encoder_input, WK1, WV1, WQ1) # 2 X 3
K2, V2, Q2 = get_QKV(encoder_input, WK2, WV2, WQ2) # 2 X 3


In [38]:
# Inspect the query projection of the first head.
print(Q1)


Notice that the embedding dimension shrinks from dmodel = 4 to 3 after we multiply the encoder input by each of those three matrices.

In [39]:
def softmax(x, axis=1):
    """Numerically stable softmax along ``axis`` (rows of a 2-D array by default).

    The maximum along ``axis`` is subtracted before exponentiating. This does
    not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (and the ratio from becoming NaN) for large scores.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the outputs sum to 1. Defaults to 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along ``axis`` sum to 1.
    """
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)


In [40]:
def scaled_dot_product(Q, K, V):
    """Scaled dot-product attention for a single head.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` where ``d_k`` is the number of
    columns of ``K``.

    Only the arguments are used. The earlier version scaled by the global
    ``K1`` and multiplied by the global ``V1``, ignoring ``K`` and ``V``.

    Returns
    -------
    (weights, output)
        ``weights`` are the row-wise softmax attention scores and ``output``
        is ``weights @ V``.
    """
    scores = np.dot(Q, K.T) / np.sqrt(K.shape[1])
    weights = softmax(scores)
    return weights, weights @ V


In [41]:
# Number of columns of K1, i.e. the key dimension used in the sqrt scaling.
K1.shape[1]


3

In [42]:
# Attention weights and output for head 1.
# NOTE: the name `attention` bound here is rebound to a function by a later cell.
scores1, attention = scaled_dot_product(Q1, K1, V1)
print(scores1)
print(attention)


In [43]:
# Combine the steps above into a single function that computes attention for one head


def attention(embed, WQ, WK, WV):
    """Single-head attention over ``embed``.

    Projects ``embed`` with ``WQ``, ``WK`` and ``WV``, takes the row-wise
    softmax of the scaled query/key scores, and mixes the values with it.

    Returns ``(weights, output)`` where ``output = weights @ (embed . WV)``.
    """
    queries = np.dot(embed, WQ)
    keys = np.dot(embed, WK)
    values = np.dot(embed, WV)

    # Temporary hack kept as-is: the scores are divided by a fixed 30 instead
    # of np.sqrt(keys.shape[1]).
    weights = softmax(np.dot(queries, keys.T) / 30)
    return weights, weights @ values


In [44]:
# Run both heads on the encoder input and show their weights and outputs.
scores1, attention1 = attention(encoder_input, WQ1, WK1, WV1)
scores2, attention2 = attention(encoder_input, WQ2, WK2, WV2)
print(scores1)
print(attention1)
print(scores2)
print(attention2)


The next layer of the encoder expects a single matrix. So we need to concatenate the attention outputs of the heads horizontally, i.e. along the columns.


In [45]:
# Stack the two head outputs side by side (axis=1 joins along the columns).
multi_head_output = np.concatenate((attention1, attention2), axis=1)
print(multi_head_output)
print(multi_head_output.shape)


We need to convert the concatenated matrix back to the same dimension as encoder_input by multiplying it with another learned weight matrix of suitable dimension. 

In [46]:
# Fixed 6 x 4 output-projection matrix: 6 rows to match the concatenated head
# outputs, 4 columns to match the width of the encoder input.
W = np.array(
    [
        [0.79445237, 0.1081456, 0.27411536, 0.78394531],
        [0.29081936, -0.36187258, -0.32312791, -0.48530339],
        [-0.36702934, -0.76471963, -0.88058366, -1.73713022],
        [-0.02305587, -0.64315981, -0.68306653, -1.25393866],
        [0.29077448, -0.04121674, 0.01509932, 0.13149906],
        [0.57451867, -0.08895355, 0.02190485, 0.24535932],
    ]
)
print(W.shape)


In [47]:
# Project the concatenated heads with W to get the attention layer output.
attenton_layer_output = multi_head_output @ W
print(attenton_layer_output)
print(attenton_layer_output.shape)


# FFN

The attention layer output is then fed into a simple feed-forward neural network consisting of a linear layer, a ReLU layer, and another linear layer, in that order. The first linear layer expands the embedding dimension ($dmodel$) to allow the model to learn more complex patterns, the ReLU layer allows the model to learn non-linear functions, and the final linear layer projects the result back to the original dimension. That is, it reverts the expansion of the first linear layer so that we get back to the dimension of the attention layer output.


$$FFN(x) = \text{ReLU}(xW_1 + b_1)W_2 + b_2$$

In [48]:
# Create random weights for the FFN layers.
# This is the first live use of np.random visible in this notebook, so the
# global generator is seeded here: a "Restart & Run All" then reproduces the
# same random weights (and therefore the same outputs) in every later cell.
np.random.seed(42)
W1 = np.random.randn(4, 8)  # expands 4 -> 8
W2 = np.random.randn(8, 4)  # projects 8 -> 4
b1 = np.random.randn(8)
b2 = np.random.randn(4)
ic(W1.shape, W2.shape, b1.shape, b2.shape)


ic| W1.shape: (4, 8), W2.shape: (8, 4), b1.shape: (8,), b2.shape: (4,)


((4, 8), (8, 4), (8,), (4,))

In [49]:
# # Forward pass function


# def relu(x):
#     return np.maximum(0, x)


# def feed_forward(x, W1, b1, W2, b2):
#     l1 = x @ W1 + b1 # linear layer 1
#     l2 = relu(l1) # Non Linear layer 2
#     l3 = l2 @ W2 + b2 # Linear layer 3
#     return l3
# output_encoder = feed_forward(attenton_layer_output, W1, b1, W2, b2)
# output_encoder


In [50]:
# Model hyper-parameters shared by the encoder / decoder helpers defined below
d_query = 3
d_key = 3
d_value = 3
d_model = 4
n_heads = 2
d_feed_forward = 8


def layer_norm(x, epsilon=1e-6, gamma=1.0, beta=0.0):
    """Normalize ``x`` along its last axis to zero mean and unit std.

    Computes ``(x - mean) / (std + epsilon) * gamma + beta``.

    Parameters
    ----------
    x : array_like
        Input whose last axis is the embedding dimension. Normalizing over the
        last axis gives the same result as the previous ``axis=1`` for 2-D
        input and also accepts 1-D (single vector) or higher-rank input.
    epsilon : float, optional
        Small constant added to the standard deviation to avoid division by zero.
    gamma, beta : float or array_like, optional
        Scale and shift applied after normalization. The defaults (1 and 0)
        leave the normalized values unchanged.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    mean = np.mean(x, axis=-1, keepdims=True)
    std = np.std(x, axis=-1, keepdims=True)
    return (x - mean) / (std + epsilon) * gamma + beta


def attention(embed, WQ, WK, WV):
    """Single-head attention over ``embed``, returning only the attended values.

    This redefines the earlier ``attention``: it performs the same computation
    but returns just ``weights @ (embed . WV)`` rather than a
    ``(weights, output)`` pair.
    """
    queries = np.dot(embed, WQ)
    keys = np.dot(embed, WK)
    values = np.dot(embed, WV)

    # Temporary hack kept as-is: the scores are divided by a fixed 30 instead
    # of np.sqrt(keys.shape[1]).
    weights = softmax(np.dot(queries, keys.T) / 30)
    return weights @ values


def multi_attention_head(embed, WQs, WKs, WVs, WO=None):
    """Multi-head attention followed by an output projection.

    Parameters
    ----------
    embed : 2-D array
        Input embeddings, one row per position.
    WQs, WKs, WVs : sequences of 2-D arrays
        Per-head query / key / value weight matrices, zipped together.
    WO : 2-D array, optional
        Output projection applied to the concatenated heads. When omitted, a
        random matrix is drawn on every call. Its shape is taken from the
        concatenated head outputs and from ``embed`` rather than from the
        module-level ``n_heads`` / ``d_value`` / ``d_model`` constants, so the
        function also works when the weights passed in do not match those
        globals.

    Returns
    -------
    numpy.ndarray
        The projected attention output. With the default random projection it
        has the same shape as ``embed``.
    """
    heads = [attention(embed, WQ, WK, WV) for WQ, WK, WV in zip(WQs, WKs, WVs)]
    attentions = np.concatenate(heads, axis=1)
    if WO is None:
        WO = np.random.randn(attentions.shape[1], embed.shape[1])
    return attentions @ WO


def multi_attention_head_with_layer_norm(embed, WQs, WKs, WVs):
    """Multi-head attention with a residual connection and layer normalization.

    The per-head outputs are concatenated along the columns and projected back
    to the width of ``embed`` with a random matrix drawn on every call. The
    result is added to ``embed`` and passed through ``layer_norm``.

    The projection shape is derived from the actual arrays instead of the
    module-level ``n_heads`` / ``d_value`` / ``d_model`` constants, so the
    function does not break when the supplied weights disagree with them.
    """
    heads = [attention(embed, WQ, WK, WV) for WQ, WK, WV in zip(WQs, WKs, WVs)]
    attentions = np.concatenate(heads, axis=1)
    # Revert back to the input dimension so the residual addition is valid.
    W = np.random.randn(attentions.shape[1], embed.shape[1])
    residual = embed + attentions @ W
    return layer_norm(residual)


def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    return np.maximum(floor, x)


def feed_forward(x, W1, b1, W2, b2):
    """Position-wise feed-forward network: ``relu(x @ W1 + b1) @ W2 + b2``."""
    hidden = relu(x @ W1 + b1)  # first linear layer followed by the non-linearity
    return hidden @ W2 + b2  # second linear layer


def encoder_block_with_norm(embed, WQs, WKs, WVs, W1, b1, W2, b2):
    """One encoder block with residual connections and layer normalization.

    Runs the normalized multi-head attention sub-layer, then the feed-forward
    sub-layer, and returns ``layer_norm(feed_forward(a) + a)`` where ``a`` is
    the attention sub-layer output.
    """
    attn_out = multi_attention_head_with_layer_norm(embed, WQs, WKs, WVs)
    ffn_out = feed_forward(attn_out, W1, b1, W2, b2)
    return layer_norm(ffn_out + attn_out)


def encoder_block(embed, WQs, WKs, WVs, W1, b1, W2, b2):
    """One encoder block without residual connections or normalization.

    Multi-head attention followed directly by the feed-forward network.
    """
    return feed_forward(
        multi_attention_head(embed, WQs, WKs, WVs), W1, b1, W2, b2
    )


def random_encoder_block(x):
    """Run ``x`` through one encoder block with freshly drawn random weights.

    Uses the normalized variant (``encoder_block_with_norm``). The weights are
    drawn in the order: query heads, key heads, value heads, W1, b1, W2, b2.
    """

    def _random_heads(width):
        # One (d_model x width) matrix per attention head.
        return [np.random.randn(d_model, width) for _ in range(n_heads)]

    # Tuple elements are evaluated left to right, which fixes the draw order.
    params = (
        _random_heads(d_query),
        _random_heads(d_key),
        _random_heads(d_value),
        np.random.randn(d_model, d_feed_forward),
        np.random.randn(d_feed_forward),
        np.random.randn(d_feed_forward, d_model),
        np.random.randn(d_model),
    )
    # The un-normalized alternative would be encoder_block(x, *params).
    return encoder_block_with_norm(x, *params)


In [51]:
# Pass the encoder input through a single randomly initialized encoder block.
random_encoder_block(encoder_input)


array([[-1.59996341,  0.2418922 ,  0.19975211,  1.1583191 ],
       [-1.59150373,  0.24296948,  0.17486338,  1.17367087]])

In [52]:
def encoder(x, n=6):
    """Apply ``n`` randomly initialized encoder blocks in sequence (default 6)."""
    hidden = x
    for _layer in range(n):
        hidden = random_encoder_block(hidden)
    return hidden


In [53]:
# Pass the encoder input through the full stack of encoder blocks (n=6 by default).
encoder(encoder_input)


array([[ 0.91096696, -0.48395323,  0.97912822, -1.40614196],
       [ 0.91096807, -0.48395335,  0.9791272 , -1.40614192]])

Six encoder blocks were used in the original paper. The function above simulates this implementation. Note that without residual connections and layer normalization (i.e. using `encoder_block` rather than `encoder_block_with_norm`), this stack can produce NaNs in the result. This is because small changes in earlier layers get amplified in later layers. Note that we have to do a lot of multiplications as we go from one layer to another, which may lead to very large values by the time we reach the last layer. This phenomenon is called gradient explosion in deep learning. There are two approaches to avoiding this issue:

1. Residual connection
2. Layer normalization

**Residual Connection**
To perform a residual connection, you add the input of a layer to its output, as shown below.
$\text{Residual Connection} = input + Layer(input)$

We will apply this to the output of the attention layer and the output of the feed-forward layer. In this way we can prevent vanishing gradients. This also has the side effect of preventing the later layers in the stack from losing what was learned from the initial input.

**Layer Normalization**
We want to normalize the input along the embedding dimension, ensuring that each embedding is not affected by other samples in the batch. Each embedding will then have a mean of zero and a standard deviation of 1. This helps with the flow of gradients. 
$$\text{Layer Norm} = \frac{(x - \mu)}{\sqrt{\sigma^2 + \epsilon}} * \gamma + \beta$$


- $\mu$ is the mean of the embedding
- $\sigma$ is the standard deviation of the embedding
- $\epsilon$ is a small number added to prevent zero division
- $\gamma$ and $\beta$ are learned parameters that control both the scaling and shifting



Now let's apply residual connections and layer normalization to each encoder block to avoid the problem stated previously.

In [54]:
# The output of a stack of six encoder blocks that captures the meaning of the input
# sequence. This output is then passed to the decoder.
encoder_output = encoder(encoder_input)
print(encoder_output)


# Decoder Block

The decoder block has two attention layers (a self-attention layer and an encoder-decoder attention layer) and a feed-forward layer as well. The decoder block takes two inputs:

- The encoder output
- The sequence of outputs generated by the decoder so far

The decoder starts from a special start-of-sequence (SOS) token and ends with an end-of-sequence (EOS) token. Each pass through the decoder blocks creates an output token, which is appended to the previous input to become the next input. 

Step 1: The decoder is given the special SOS token
Step 2: The decoder predicts the next token "We"
Step 3: The next input is SOS + We
Step 4: The decoder predicts "are"
Step 5: The next input is SOS + We + are
Step 6: The decoder predicts "good"
Step 7: The next input becomes SOS + We + are + good
Step 8: The decoder predicts "."
Step 9: The next input becomes SOS + We + are + good + .
Step 10: The decoder predicts "EOS"




In [55]:
# Build the decoder's starting input: a single hand-picked 4-d vector reshaped
# to one row, plus its positional encoding.
decoder_input_emb = np.array([1, 0, 0, 0], dtype="float").reshape(1, -1)
decoder_positional_emb = position_embedding(decoder_input_emb)
# The name is rebound here to the sum of embedding and positional encoding.
decoder_input_emb = decoder_input_emb + decoder_positional_emb
ic(decoder_input_emb.shape)
print(decoder_input_emb)


ic| decoder_input_emb.shape: (1, 4)


In [56]:
# Re-assigns d_model and n_heads to the same values set in the earlier
# hyper-parameter cell.
d_model = 4
n_heads = 2

# Random per-head weights for the decoder's self-attention layer.
WQs = [np.random.randn(d_model, d_query) for _ in range(n_heads)]
WKs = [np.random.randn(d_model, d_key) for _ in range(n_heads)]
WVs = [np.random.randn(d_model, d_value) for _ in range(n_heads)]

# Self-attention over the decoder input, with residual connection and layer norm.
Z_self_attention = multi_attention_head_with_layer_norm(decoder_input_emb, WQs, WKs, WVs)
Z_self_attention


array([[-0.28566176,  1.24780425,  0.49322824, -1.45537072]])

In [57]:
def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
    """Single-head encoder-decoder (cross) attention.

    Keys and values are projected from ``encoder_output``; queries are
    projected from ``attention_input``. That is the only difference from
    self-attention.

    The scores are scaled by the square root of the actual key width
    (``K.shape[1]``) rather than the module-level ``d_key``, so the scaling
    stays correct for any ``WK`` that is passed in.

    Returns
    -------
    numpy.ndarray
        ``softmax(Q K^T / sqrt(d_k)) V`` with one row per row of
        ``attention_input``.
    """
    # The next three lines are the key difference from self-attention.
    K = encoder_output @ WK  # keys come from the encoder output
    V = encoder_output @ WV  # values come from the encoder output
    Q = attention_input @ WQ  # queries come from the decoder side

    # This part is the same as in self-attention.
    scores = (Q @ K.T) / np.sqrt(K.shape[1])
    weights = softmax(scores)
    return weights @ V


def multi_head_encoder_decoder_attention(
    encoder_output, attention_input, WQs, WKs, WVs
):
    """Multi-head encoder-decoder attention with residual connection and layer norm.

    Each head attends from ``attention_input`` (queries) to ``encoder_output``
    (keys and values). The head outputs are concatenated along the columns,
    projected back to the width of ``attention_input`` with a random matrix
    drawn on every call, added to ``attention_input`` and layer-normalized.

    The projection shape is derived from the actual arrays instead of the
    module-level ``n_heads`` / ``d_value`` / ``d_model`` constants, so the
    function does not break when the supplied weights disagree with them.
    """
    heads = [
        encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV)
        for WQ, WK, WV in zip(WQs, WKs, WVs)
    ]
    attentions = np.concatenate(heads, axis=1)
    # Revert back to the input dimension so the residual addition is valid.
    W = np.random.randn(attentions.shape[1], attention_input.shape[1])
    residual = attention_input + attentions @ W
    return layer_norm(residual)


In [59]:
# Fresh random per-head weights for the encoder-decoder attention layer
# (these rebind the WQs / WKs / WVs names used for self-attention above).
WQs = [np.random.randn(d_model, d_query) for _ in range(n_heads)]
WKs = [np.random.randn(d_model, d_key) for _ in range(n_heads)]
WVs = [np.random.randn(d_model, d_value) for _ in range(n_heads)]
# Queries come from the decoder self-attention output; keys/values from the encoder.
Z_encoder_decoder = multi_head_encoder_decoder_attention(
    encoder_output, Z_self_attention, WQs, WKs, WVs
)
print(Z_encoder_decoder)


In [60]:
def decoder_block(
    decoder_input,
    encoder_output,
    WQs_self_attention,
    WKs_self_attention,
    WVs_self_attention,
    WQs_ed_attention,
    WKs_ed_attention,
    WVs_ed_attention,
    W1,
    b1,
    W2,
    b2,
):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    The two attention sub-layers apply their own residual connection and layer
    normalization. The feed-forward output is added to its input and
    normalized before being returned.
    """
    self_attended = multi_attention_head_with_layer_norm(
        decoder_input, WQs_self_attention, WKs_self_attention, WVs_self_attention
    )
    cross_attended = multi_head_encoder_decoder_attention(
        encoder_output,
        self_attended,
        WQs_ed_attention,
        WKs_ed_attention,
        WVs_ed_attention,
    )
    ffn_out = feed_forward(cross_attended, W1, b1, W2, b2)
    return layer_norm(ffn_out + cross_attended)


In [61]:
def random_decoder_block(x, encoder_output):
    """Run ``x`` through one decoder block with freshly drawn random weights.

    The weights are drawn in the order: self-attention query, key and value
    heads; encoder-decoder attention query, key and value heads; then W1, b1,
    W2, b2 of the feed-forward network.
    """

    def _random_heads(width):
        # One (d_model x width) matrix per attention head.
        return [np.random.randn(d_model, width) for _ in range(n_heads)]

    # Tuple elements are evaluated left to right, which fixes the draw order.
    weights = (
        _random_heads(d_query),  # self-attention queries
        _random_heads(d_key),  # self-attention keys
        _random_heads(d_value),  # self-attention values
        _random_heads(d_query),  # encoder-decoder queries
        _random_heads(d_key),  # encoder-decoder keys
        _random_heads(d_value),  # encoder-decoder values
        np.random.randn(d_model, d_feed_forward),  # W1
        np.random.randn(d_feed_forward),  # b1
        np.random.randn(d_feed_forward, d_model),  # W2
        np.random.randn(d_model),  # b2
    )
    return decoder_block(x, encoder_output, *weights)


def decoder(x, decoder_embedding, n=6):
    """Apply ``n`` randomly initialized decoder blocks in sequence (default 6).

    ``decoder_embedding`` is forwarded unchanged to every block as its
    ``encoder_output`` argument.
    """
    state = x
    for _block in range(n):
        state = random_decoder_block(state, decoder_embedding)
    return state


In [62]:
# Run the decoder stack on the decoder input, attending to the encoder output.
decoder_encoder_output = decoder(decoder_input_emb, encoder_output)
decoder_encoder_output


array([[-0.0081298 , -1.62301003,  0.68241529,  0.94872453]])

That is the encoder-decoder block. We still need to transform the decoder output into a vector of vocabulary size, where each element is the probability of the word at that index being the next word. 
We accomplish this with a linear layer whose weight matrix has shape d_model by vocab_size.
We then apply softmax to the matrix product of the decoder output and the linear weights to get the probabilities.


In [63]:
def linear(x, W, b):
    """Affine projection ``x . W + b`` followed by a row-wise softmax.

    Note that the returned values are already probabilities, not raw logits.
    """
    return softmax(np.dot(x, W) + b)


# Random weights and bias for the output projection.
W_linear = np.random.randn(4, 10)  # vocab size is 10; dmodel is 4
b = np.random.randn(10)

# `linear` applies softmax internally, so final_output holds probabilities.
final_output = linear(decoder_encoder_output, W_linear, b)
print(final_output)


Now let's write the encoder-decoder loop to generate text

In [64]:
# Toy 10-word vocabulary; the list index is the token id used by generate().
vocabulary = [
    "hello",
    "mundo",
    "world",
    "how",
    "?",
    "EOS",
    "SOS",
    "a",
    "hola",
    "c",
]
# One random (1, 4) embedding per vocabulary entry.
embedding_reps = np.random.randn(10, 1, 4)
# Map each word to its (1, 4) embedding row.
vocabulary_embeddings = {word: embedding_reps[i] for i, word in enumerate(vocabulary)}
print(vocabulary_embeddings)


In [65]:
# Indexing with [0] drops the leading axis of the (1, 4) embedding, giving a length-4 vector.
[vocabulary_embeddings['hello'][0]]


[array([-0.53796188, -1.24626629,  0.49815192,  0.60167653])]

In [69]:
# Whitespace tokenization. Note that "World" (capital W) is not a key of the
# vocabulary defined above, which only contains "world".
text = "hello World"
text.split()


['hello', 'World']

In [73]:
# Look up the embedding vector of every whitespace-separated token and stack them.
text = "hello world"
xx = [vocabulary_embeddings[token][0] for token in text.split()]
print(xx)
print(np.array(xx))


In [75]:
def generate(input_sequence, max_iters=10):
    """Greedily generate tokens with the randomly initialized encoder-decoder.

    Parameters
    ----------
    input_sequence : sequence of str
        Input tokens. Each must be a key of ``vocabulary_embeddings``.
    max_iters : int, optional
        Maximum number of decoding steps, so decoding stops even if "EOS" is
        never produced.

    Returns
    -------
    str
        The generated tokens joined by spaces, starting with "SOS".

    Changes from the earlier version:

    * ``linear`` already applies softmax, so its result is used directly as
      the probabilities. Applying softmax a second time distorted the printed
      probability.
    * After every step the embedding of the predicted token is appended to
      the decoder input, so the next step sees everything generated so far
      instead of only "SOS". The prediction is read from the last row of the
      decoder output, which corresponds to the most recent token.
    """
    # Encode the input sequence into embeddings.
    # We skip the positional encoding step.
    embedding_inputs = np.array(
        [vocabulary_embeddings[token][0] for token in input_sequence]
    )
    print("Embedding representation of the encoder input", embedding_inputs)

    # Generate the encoder output with the stack of encoder blocks.
    encoder_output = encoder(embedding_inputs)
    print("Embedding generated by the encoder", encoder_output)

    # The decoder input starts with the SOS token (one row of width d_model).
    output = "SOS"
    sequence = vocabulary_embeddings["SOS"]

    # Random matrices for the final linear layer.
    W_linear = np.random.randn(d_model, len(vocabulary))
    b_linear = np.random.randn(len(vocabulary))

    # Limit the number of decoding runs to avoid decoding
    # for too long without hitting the "EOS" token.
    for i in range(max_iters):
        # Decoder step over everything generated so far.
        decoder_output = decoder(sequence, encoder_output)

        # `linear` returns one row of probabilities per decoder position;
        # only the last position predicts the next token.
        probs = linear(decoder_output, W_linear, b_linear)[-1]

        # Greedy sampling: pick the most likely token.
        next_index = int(np.argmax(probs))
        next_token = vocabulary[next_index]
        output += " " + next_token
        print(
            "Iteration",
            i,
            "next token",
            next_token,
            "with probability of",
            probs[next_index],
        )
        # If the next token is the end token, we return the sequence.
        if next_token == "EOS":
            return output

        # Feed the prediction back in: append its (1, d_model) embedding row.
        sequence = np.concatenate(
            [sequence, vocabulary_embeddings[next_token]], axis=0
        )
    return output


# Generate from a two-token input; both tokens are keys of the vocabulary.
generate(["hello", "world"])


'SOS c EOS'