In [2]:
import tensorflow as tf
import numpy as np

In [9]:
#Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V, mask = None):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        Q: Query tensor. Its last dimension must match the last dimension of K.
        K: Key tensor. The size of its last dimension is used as d_k.
        V: Value tensor, multiplied by the attention weights.
        mask: Optional tensor (float, int or bool) that broadcasts against the
            attention logits. Non-zero / True entries mark positions to hide:
            a large negative number is added to their logits so that softmax
            assigns them a weight of (almost) zero.

    Returns:
        A tuple (output, attention_weights):
            output: the attention-weighted sum of V.
            attention_weights: softmax of the scaled logits over the last axis.
    """
    # Dot product of every query with every key (K is transposed on its last two axes).
    matmul_qk = tf.matmul(Q, K, transpose_b = True)

    # Size of the key vectors, cast to the logits' dtype (not a fixed float32)
    # so that float64 / float16 inputs do not fail with a dtype mismatch.
    d_k = tf.cast(tf.shape(K)[-1], matmul_qk.dtype)

    # Scaling by sqrt(d_k) keeps the logits from growing with the key size,
    # which would otherwise push softmax into regions with tiny gradients.
    scaled_attention_logits = matmul_qk / tf.math.sqrt(d_k)

    if mask is not None:
        # -1e9 overflows to -inf in float16 (and 0 * -inf is NaN), so use a
        # smaller "large negative" value for that dtype.
        large_negative = -3e4 if scaled_attention_logits.dtype == tf.float16 else -1e9
        # Cast so that boolean / integer masks can be multiplied and added.
        scaled_attention_logits += tf.cast(mask, scaled_attention_logits.dtype) * large_negative

    # Softmax turns the logits into a probability distribution over the keys:
    # every weight lies between 0 and 1 and each row sums to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)

    output = tf.matmul(attention_weights, V)
    return output, attention_weights

In [19]:
'''
Scaled Dot-Product Attention
It is a way to determine how much focus (attention) each part of an input sequence
should place on every other part while producing a prediction.

Parameters: Q: Queries, K: Keys, V: Values
    - The query represents the part of the input that attention is being computed for.
    - The key helps decide how strongly each part of the input is related to the query.
    - The value holds the actual information you want to use once the 'focus' has been decided.

Working Methodology:
    - Dot Products: Compute the dot product between each query and each key. The result is a score for how
    much attention each key gets relative to that query.
    - Scale: The dot products can grow to very large values when the key dimension is large. So, you
    divide them by the square root of 'd_k' (the key dimension) to keep training stable.
    - Softmax: This is an activation function. It turns the scaled scores into probabilities
    (the attention weights), each between 0 and 1 and summing to 1. These weights tell you how much
    each value contributes to the final output.
    - Weighted Sum: To get the output, multiply the values by their attention weights and
    sum them up.

Formula:
    Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
    Here:
        Q K^T: Dot products of the queries with the (transposed) keys.
        d_k: Dimension of the key vectors.
'''

"\nScaled Dot-Product Attention\nIt is a way to determine how much focus or attention that the each part of an input sequences\nshould have on every other sides while producing a prediction.\n\nParameters: Q: Queries, K: Keys, V: Values\n    - The query represent the part of the inputs to be given attention.\n    - The key helps deciding how part of the input is related to query\n    - The value holds actual information you want to use after getting the 'focus'\n\nWorking Methodology:\n    - Dot Products: Compute a dot product between query and key. This result is the score for how much attention\n    do each key gets in relative with its query\n    - Scale: There are many times when the dot products can go upto large, large values. So, you need to\n    divide them by the square root of 'dk' to keep the model stable and working fine.\n    - Softmax: This is an activation function, actually. So, what it does is, it turns these scores into\n    probabilities (or call them attention weigh

In [21]:
#Multi-Head Attention
class MultiHead(tf.keras.layers.Layer):
    """Multi-head attention built on top of `scaled_dot_product_attention`.

    Q, K and V are each passed through a Dense projection of width `d_model`,
    split into `num_heads` heads of width `d_model // num_heads`, attended
    per head, re-joined, and passed through a final Dense projection.

    Args:
        d_model: Width of the projections; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`),
            forwarded to `tf.keras.layers.Layer`.

    Raises:
        AssertionError: If `d_model` is not divisible by `num_heads`.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        assert d_model % num_heads == 0, (
            f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
        )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads  # width of each individual head

        #Linear layers for projections
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        self.W_o = tf.keras.layers.Dense(d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

    def split_heads(self, x, batch_size):
        """Reshape `x` to (batch_size, -1, num_heads, d_k) and move the head axis to position 1.

        The result has shape (batch_size, num_heads, -1, d_k), where the
        inferred axis is presumably the sequence length (assumes `x` is
        (batch, seq_len, d_model) -- confirm against callers).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_k))
        return tf.transpose(x, perm = [0, 2, 1, 3])

    def call(self, Q, K, V, mask = None):
        """Apply multi-head attention.

        Args:
            Q, K, V: Query, key and value tensors; the first axis of Q is
                used as the batch size.
            mask: Optional mask passed straight to
                `scaled_dot_product_attention`; it must broadcast against the
                per-head attention logits.

        Returns:
            A tuple (output, attention_weights): the output of the final
            projection and the per-head attention weights.
        """
        batch_size = tf.shape(Q)[0]

        #Linear projections
        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)

        #Split each projection into heads
        Q = self.split_heads(Q, batch_size)
        K = self.split_heads(K, batch_size)
        V = self.split_heads(V, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(Q, K, V, mask)

        # Undo split_heads: move the head axis back next to d_k, then merge
        # the (num_heads, d_k) axes into a single d_model axis.
        scaled_attention = tf.transpose(scaled_attention, perm = [0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        output = self.W_o(concat_attention)
        return output, attention_weights

In [27]:
'''
Multi-Head Attention
It is a way to split the attention process into multiple smaller "heads" (each with its own linear
projections) so that different heads can capture different aspects of the input.

Difference between Scaled Attention v Multi-Head Attention
    - Scaled Dot-Product Attention works with one set of Q: Queries, K: Keys, and V: Values to calculate the output.
    These typically all have the same size, say 'd_model'.
    - In Multi-Head Attention, we split this 'd_model'-sized representation into smaller parts, one per head.
    (For example, with d_model = 512 and 8 heads, each head works with vectors of size 512 / 8 = 64.)
'''

SyntaxError: incomplete input (2869645389.py, line 1)