In [4]:
import tensorflow as tf
import numpy as np

In [5]:
#Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V, mask = None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: Query tensor; its last axis must match the last axis of K.
        K: Key tensor; the size of its last axis is used as d_k.
        V: Value tensor; it is multiplied by the attention weights.
        mask: Optional tensor that broadcasts against the logits. Entries
            equal to 1 (or True) are pushed towards -1e9 before the softmax
            so they receive (almost) zero weight; entries equal to 0 are
            left untouched. Boolean and integer masks are accepted.

    Returns:
        A tuple (output, attention_weights).
    """
    matmul_qk = tf.matmul(Q, K, transpose_b = True) #Dot product of every query with every key
    logits_dtype = matmul_qk.dtype
    #d_k is cast to the dtype of the logits (not a hard-coded float32) so that
    #float16 / float64 inputs do not hit a dtype mismatch in the division
    d_k = tf.cast(tf.shape(K)[-1], logits_dtype)
    #Scaling prevents overly large values that could destabilize training
    scaled_attention_logits = matmul_qk / tf.math.sqrt(d_k)
    if mask is not None:
        #The penalty is built in float32 so boolean/integer masks work and so
        #that 0 * -1e9 stays 0 even when the logits use a narrower float type
        penalty = tf.cast(mask, tf.float32) * -1e9 #-1e9 = -1000000000
        scaled_attention_logits += tf.cast(penalty, logits_dtype)
    #Softmax turns the logits into a probability distribution: every weight
    #lies between 0 and 1 and the weights along the last axis sum to 1
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)
    output = tf.matmul(attention_weights, V)
    return output, attention_weights

In [6]:
'''
Scaled Dot-Product Attention
It is a way to determine how much focus (attention) each part of an input sequence
should place on every other part while producing a prediction.

Parameters: Q: Queries, K: Keys, V: Values
    - The query represents the part of the input that is asking what to attend to.
    - The key helps decide how strongly each part of the input is related to the query.
    - The value holds the actual information that is used once the 'focus' is known.

Working Methodology:
    - Dot Products: Compute the dot product between each query and each key. The result is a score for how
    much attention each key gets relative to that query.
    - Scale: The dot products can grow to very large values, so you need to
    divide them by the square root of 'dk' to keep the model stable and working fine.
    - Softmax: This is an activation function, actually. So, what it does is, it turns these scores into
    probabilities (or call them attention weights). These weights tell you how much each value contribute
    into the final output.
    - Weighted Sum: For the desired output, we have to multiply the attention weights by the values and
    sum up them together.

Formula:
    Attention(Q, K, V) = softmax(Q K^T / sqrt(dk)) V
    Here:
        Q K^T: Dot products of every query with every key (K^T is the transpose of K).
        dk: Size of each key vector.
'''

"\nScaled Dot-Product Attention\nIt is a way to determine how much focus or attention that the each part of an input sequences\nshould have on every other sides while producing a prediction.\n\nParameters: Q: Queries, K: Keys, V: Values\n    - The query represent the part of the inputs to be given attention.\n    - The key helps deciding how part of the input is related to query\n    - The value holds actual information you want to use after getting the 'focus'\n\nWorking Methodology:\n    - Dot Products: Compute a dot product between query and key. This result is the score for how much attention\n    do each key gets in relative with its query\n    - Scale: There are many times when the dot products can go upto large, large values. So, you need to\n    divide them by the square root of 'dk' to keep the model stable and working fine.\n    - Softmax: This is an activation function, actually. So, what it does is, it turns these scores into\n    probabilities (or call them attention weigh

In [7]:
#Multi-Head Attention
class MultiHead(tf.keras.layers.Layer):
    """Multi-head attention built on scaled_dot_product_attention.

    Q, K and V are each projected with a Dense layer of width d_model, split
    into num_heads heads of width d_k = d_model // num_heads, attended per
    head, merged back to width d_model and passed through an output Dense.

    Args:
        d_model: Width of the projections; must be divisible by num_heads.
        num_heads: Number of attention heads.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        #Each head gets an equal slice of d_model, so the split must be exact
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads

        #Linear layers for projections
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        self.W_o = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape x to (batch_size, -1, num_heads, d_k) and move the head axis to position 1."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_k))
        return tf.transpose(x, perm = [0, 2, 1, 3])

    def call(self, Q, K, V, mask = None):
        """Run attention over all heads.

        Args:
            Q, K, V: Query, key and value tensors; the first axis of Q is
                taken as the batch size.
            mask: Optional mask forwarded unchanged to
                scaled_dot_product_attention; it has to broadcast against the
                per-head logits.

        Returns:
            A tuple (output, attention_weights) where output has last axis
            d_model and attention_weights are the per-head weights.
        """
        batch_size = tf.shape(Q)[0]
        #Linear projections
        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)
        #These projections split into heads
        Q = self.split_heads(Q, batch_size)
        K = self.split_heads(K, batch_size)
        V = self.split_heads(V, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(Q, K, V, mask)
        #Undo split_heads: bring the head axis back next to d_k, then merge them
        scaled_attention = tf.transpose(scaled_attention, perm = [0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.W_o(concat_attention)
        return output, attention_weights

#Alias so the layer is also available under the name MultiHeadAttention
MultiHeadAttention = MultiHead

In [8]:
'''
Multi-Head Attention
It is the way to split the attention process into multiple smaller "heads" (Linear projections) to capture
different aspects of the inputs.

Difference between Scaled Attention v Multi-Head Attention
    - Scaled Attention works with one set of Q: Queries, K: Keys, and V: Values to calculate the output.
    These are all typically the same size, say 'd_model'.
    - In Multi-Head Attention, we split 'd_model' across several heads, each working with
    d_model / num_heads dimensions. (With d_model = 512 and 8 heads, each head works with 64.)
'''

'\nMulti-Head Attention\nIt is the way to split the attention process into multiple smaller "heads" (Linear projections) to capture\ndifferent aspects of the inputs.\n\nDifference between Scaled Attention v Multi-Head Attention\n    - Scaled Attention works with one set of Q: Queries, K: Keys, and V: Values to calculate the output. \n    These are all typically the same size, say \'d_model\'.\n    - In Multi-Head Attention, we break this model into smaller parts. (If we had 8 heads, each head\n    will work with 64)\n'

In [9]:
#Position-Wise Feed-Forward Networks
class PositionwiseFeedForward(tf.keras.layers.Layer):
    """Two stacked Dense layers: d_ff units with ReLU, then d_model units.

    Args:
        d_model: Number of units of the second (output) Dense layer.
        d_ff: Number of units of the first (hidden) Dense layer.
    """
    def __init__(self, d_model, d_ff):
        super().__init__()
        Dense = tf.keras.layers.Dense
        #W_1, b_1 followed by ReLU
        self.dense1 = Dense(d_ff, activation = 'relu')
        #W_2, b_2 with no activation
        self.dense2 = Dense(d_model)

    def call(self, x):
        """Apply dense1 (with ReLU) and then dense2 to x."""
        hidden = self.dense1(x)
        return self.dense2(hidden)

In [18]:
'''
Position-Wise Feed-Forward Networks

Each layer of both the encoder and decoder has a feed-forward network (also known as FFN).

Working Methodology:
    - Each position (which can be considered a word or token) in the input is processed independently using
    the same function.
    - This function consists of two linear transformations (multiplication by weights plus a bias) with
    a 'ReLU' activation in between.

Formula:
    - FFN(x) = max(0, xW1 + b1)W2 + b2
    Here,
        - W1, W2: Weight matrices
        - b1, b2: Bias 
        - ReLU: Non-linear activations
Points:
    - Every position in the input is processed individually but in the same way, irrespective of
    where it sits in the sequence.
    - The weights change from layer to layer, but remain the same across all positions.
    - Input and output dimensions are 512 (d_model), while the intermediate layer has 2048 dimensions (d_ff).
'''

"\nPosition-Wise Feed-Forward Networks\n\nEach layer of both the encoder and decoder has a feed-forward network (also known as FFN).\n\nWorking Methodology:\n    - Each position (which can considered as word or token) in the input is processed independently using\n    the same function.\n    - This function is consisting of two linear transformations which is done by multiply by weights with\n    a 'ReLU' activation.\n\nFormula:\n    - FFN(x) = max(0, xW1 + b1)W2 + b2\n    Here,\n        - W1, W2: Weight matrices\n        - b1, b2: Bias \n        - ReLU: Non-linear activations\nPoints:\n    - Every position in this input is all processed individually but in the same way, irrespective of \n    its position.\n    - The weights here changes from layer to layer, but remains same in all positions.\n    - Input and output dimensions are 512, while intermediate layers has 2048 dimensions.\n"

In [24]:
#Embeddings and Softmax
class EmbeddingLayer(tf.keras.layers.Layer):
    """Token embedding whose output is multiplied by sqrt(d_model).

    Args:
        vocab_size: Number of rows of the embedding table.
        d_model: Size of each embedding vector.
    """
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)

    def call(self, x):
        """Look up the embeddings for x and scale them by sqrt(d_model)."""
        #Scaling factor: square root of d_model, as a float32 value
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        return self.embedding(x) * scale

In [26]:
'''
Token Embedding and Weight Sharing

- This layer converts words or tokens into numerical vectors using embeddings.
- Each token is turned into a vector of size 'd_model' (512 dimensions in these notes) using
learned embeddings.
- At the end of the decoder, the model uses a linear transformation + softmax function to predict the
next token in the sequence.
- Weight Matrix (weight sharing: the same matrix is used for)
    - Input embeddings
    - Output embeddings
    - Final transformation which is used to predict the next token
- Scaling Trick: Embeddings are multiplied by sqrt(d_model) (root 512 when d_model = 512) to help maintain numerical stability.
'''

"\nToken Embedding and Weight Sharing\n\n- This model is used to convert words or tokens into numerical vectors using embeddings.\n- Each of these tokens is turned into a vector of size 'd_model' (which is roughly 512 dimensions) using\nlearned embeddings.\n- In the end of decoder, this model uses a linear transformation + softmax function to predict the\nnext word in the given sequences which comes in this model.\n- Weight Matrix\n    - Input embeddings\n    - Output embeddings\n    - Final transformation which is used to predict the next token\n- Scaling Trick: Embeddings is multipled by (root 512) to used the maintain the numerical stability.\n"

In [30]:
#Positional Encoding
def positional_encoding(max_seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Even columns hold sin(pos / 10000^(2i / d_model)) and odd columns hold
    cos(pos / 10000^(2i / d_model)), where i = column // 2.

    Args:
        max_seq_len: Number of positions (rows) to generate.
        d_model: Number of encoding dimensions (columns).

    Returns:
        A float32 tensor of shape (1, max_seq_len, d_model).
    """
    positions = np.arange(max_seq_len)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    #Columns 2i and 2i + 1 share the exponent 2i / d_model
    exponents = (2 * (dims // 2)) / np.float32(d_model)
    angles = positions / np.power(10000, exponents)
    #Sine on even columns, cosine on odd columns
    on_even_column = (dims % 2 == 0)
    table = np.where(on_even_column, np.sin(angles), np.cos(angles))
    #Leading axis of size 1 is added before converting to a float32 tensor
    return tf.cast(table[np.newaxis, ...], dtype = tf.float32)

In [36]:
'''
Positional Encoding 

On its own, this model does not know the order of the words or tokens in a sentence, because it doesn't use
recurrence (as in Recurrent Neural Networks) or convolutions (as in Convolutional Neural Networks).
To fix this, we add positional encodings to the embeddings so that the model can understand
the position of each word in the sequence.

Working Methodology:
1. Adding Positional Stuff
    - Each token is converted into a vector
    - These encodings are added to these embeddings to give model the understanding of order.
    - The positional encodings have the same size as the embeddings ('d_model', 512 here) so the two can be added.
2. Using Sin & Cos waves. Formula will be given down below.

Formula:
    PE(pos, 2i) = sin(pos / 10000^(2i / d_model)) #Sine wave on even dimensions
    PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model)) #Cosine wave on odd dimensions

These waves create a unique pattern for each position.
They also let the model easily learn relative positions, such as how far apart two words are.
The pattern continues beyond any fixed length, which may allow handling sequences longer than those seen in training.
'''

"\nPositional Encoding \n\nThis model currently does not understand the words or tokens in a sentence, because it doesn't use recurrence\n(which indefinitely means the Recurrent Neural Networks) or the convolutions (like Convolutional Neural\nNetworks). To fix the bug, we add positional encoding to the embeddings so that the model can understand\nthe position of each word in the sequences.\n\nWorking Methodology:\n1. Adding Positional Stuff\n    - Each token is converted into a vector\n    - These encodings are added to these embeddings to give model the understanding of order.\n    - Positional Encoding == Positional Embeddings (roughly 512 dimensions)\n2. Using Sin & Cos waves. Formula will be given down below.\n\nFormula:\n    PE(pos, 2i) = sin(pos / 10000^2i/d_model) #Sine Wave\n    PE(pos, 2i + 1) = cos(pos / 10000^2i/d_model) #Cosine Wave\n\nThese are used to create unique and weird patterns for each positions.\nThey also allow the model to learn the relative based positions like

In [40]:
#Encoder Stacks
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention, then a position-wise feed-forward network.

    Each sub-layer output goes through dropout, is added back to the
    sub-layer input and is then layer-normalized.

    Args:
        d_model: Model width passed to the attention and feed-forward layers.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout_rate: Rate used by both Dropout layers.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout_rate = 0.1):
        super().__init__()
        #The attention layer in this notebook is defined under the name MultiHead
        self.mha = MultiHead(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, d_ff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training = None, mask = None):
        """Apply the encoder block to x.

        Args:
            x: Input tensor; it is used as query, key and value.
            training: Forwarded to both Dropout layers. Defaults to None so
                the layer can also be called without an explicit flag.
            mask: Optional mask forwarded to the attention layer.

        Returns:
            The output of the second layer normalization.
        """
        #Sub-layer 1: self-attention (the attention weights are discarded)
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training = training)
        out1 = self.layernorm1(x + attn_output)

        #Sub-layer 2: position-wise feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training = training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

In [42]:
'''
Encoder - Transformer

The encoder consists of 6 identical layers, each with two key components:
    - Multi-Head Self-Attention: This helps model focus on the relevant words in the input.
    - Feed-Forward Network: A fully connected layer applied to each position separately.

Instead of just passing the output of each layer to the next, we add original input back.
This helps avoid vanishing gradients and makes training easier.

Formula:
    - Output = LayerNorm(x + Sublayer(x))
    Here,
        - x: Input to sub-layer
        - Sublayer(x): Self-attention or feed-forward network
        - LayerNorm: Normalizes the values to keep the model stable.
'''

'\nEncoder - Transformer\n\nThe encoder consists of 6 identical layers, each with two key components:\n    - Multi-Head Self-Attention: This helps model focus on the relevant words in the input.\n    - Feed-Forward Network: A fully connected layer applied to each position separately.\n\nInstead of just passing the output of each layer to the next, we add original input back.\nThis helps avoid vanishing gradients and makes training easier.\n\nFormula:\n    - Output = LayerNorm(x + Sublayer(x))\n    Here,\n        - x: Input to sub-layer\n        - Sublayer(x): Self-attention or feed-forward network\n        - LayerNorm: Normalizes the values to keep the model stable.\n'

In [46]:
#Decoder Stack
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention, attention over the encoder output, feed-forward.

    Each sub-layer output goes through dropout, is added back to the
    sub-layer input and is then layer-normalized.

    Args:
        d_model: Model width passed to the attention and feed-forward layers.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout_rate: Rate used by all three Dropout layers.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout_rate = 0.1):
        #Must be __init__ (with trailing underscores) so the base layer is initialized
        super().__init__()
        #The attention layer in this notebook is defined under the name MultiHead
        self.mha1 = MultiHead(d_model, num_heads) #Self-attention over x
        self.mha2 = MultiHead(d_model, num_heads) #Attention over enc_output
        self.ffn = PositionwiseFeedForward(d_model, d_ff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, training = None, look_ahead_mask = None, padding_mask = None):
        """Apply the decoder block.

        Args:
            x: Decoder input; used as query, key and value of the first
                attention layer.
            enc_output: Tensor used as key and value of the second attention
                layer.
            training: Forwarded to the Dropout layers. Defaults to None so
                the layer can also be called without an explicit flag.
            look_ahead_mask: Optional mask for the first attention layer.
            padding_mask: Optional mask for the second attention layer.

        Returns:
            The output of the third layer normalization.
        """
        #Sub-layer 1: self-attention over the decoder input
        attn1, _ = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training = training)
        out1 = self.layernorm1(x + attn1)

        #Sub-layer 2: queries come from the decoder, keys/values from enc_output
        attn2, _ = self.mha2(out1, enc_output, enc_output, padding_mask)
        attn2 = self.dropout2(attn2, training = training)
        out2 = self.layernorm2(out1 + attn2)

        #Sub-layer 3: position-wise feed-forward network
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training = training)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

In [None]:
'''
