# In [None]:
import torch
import math
from torch import nn
import torch.nn.functional as F

def scaled_dot_product(q, k, v, mask=None):
    """
    Compute scaled dot-product attention.

    The similarity between the query and the key vectors is the matrix
    product of Q with K transposed (i.e. Q . K^T). It is divided by the
    square root of d_k (the size of the last dimension of ``q``) to reduce
    the variance. An optional additive mask is then applied, and a softmax
    over the last (key) dimension turns the scores into a matrix of
    probabilities, the "Attention Scores", denoting (numerically) how much
    each word/token in a sentence is related to the rest of the words.
    Finally the attention scores are used to take a weighted sum of the
    value vectors.

    Parameters
    ----------
    q : tensor
        Queries. The last two dimensions are sequence length x length of the
        query vector; the caller in this file passes batch size x number of
        attention heads x sequence length x head dimension.
    k : tensor
        Keys, laid out like ``q``.
    v : tensor
        Values, laid out like ``q``.
    mask : tensor, optional
        Additive mask that is broadcast against the score matrix, so only
        its trailing dimensions need to line up with the scores. Because it
        is added before the softmax, positions that must be hidden should
        hold a very negative value (e.g. ``-inf``) and visible positions 0.
        In the Encoder no masking is required; in the Decoder it is, since a
        token must not see the relevance of the words that follow it.
        By default None.

    Returns
    -------
    values : tensor
        Attention-weighted sum of the value vectors.
    attention : tensor
        The attention scores (each row along the last dimension sums to 1).
    """
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    print(f"scaled.size() : {scaled.size()}")
    if mask is not None:
        print(f"-- ADDING MASK of shape {mask.size()} --")
        # Out-of-place add on purpose: an in-place ``+=`` cannot grow
        # ``scaled``, so it raises whenever the mask carries extra or larger
        # leading dimensions than the scores. A plain add broadcasts freely.
        scaled = scaled + mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class MultiHeadAttention(nn.Module):
    """
    This is the Multihead (self-)Attention class defined by 2 arguments:

    d_model: the vector dimension of every word that goes into the
    attention unit; it is also the dimension of the output produced for
    every single word.

    num_heads: number of attention heads. It must divide d_model evenly,
    because every head works on a slice of d_model // num_heads features.
    """

    def __init__(self, d_model, num_heads):
        """
        The constructor carries out the following actions:
        a) Calls the superclass nn.Module constructor with
        super().__init__().
        b) Sets the input arguments as attributes of the class for
        later use (self.d_model, self.num_heads).
        c) Calculates the dimension of each attention head by dividing
        d_model by the number of heads (self.head_dim).

        Further we have 2 linear layers:
        a) self.qkv_layer: takes the input vector (d_model features) and
        maps it to the concatenated q, k, v vectors (3 * d_model features).

        b) self.linear_layer: processes the concatenated outputs of all
        attention heads and maps them back to d_model features.

        Parameters
        ----------
        d_model : integer
            As defined above
        num_heads : integer
            As defined above

        Raises
        ------
        ValueError
            If d_model is not divisible by num_heads.
        """
        super().__init__()
        # Fail early with a clear message; otherwise the floor division
        # below silently drops features and forward() dies in reshape.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """
        Here we carry out the multihead attention mechanism.
        It involves the following steps:
        1) The input is passed through the q, k, v layer, a linear layer
        that turns every d_model vector into a concatenated q, k, v vector
        of 3 * d_model features.
        2) The tensor of batch size x sequence length x 3 * d_model is
        reshaped into batch size x sequence length x no. of heads
        x 3 * head dimension.
        3) Permute - the 2nd and 3rd dimensions are swapped, giving
        batch size x no. of heads x sequence length x 3 * head dimension.
        4) Chunk - the last dimension is split into three equal parts, the
        query, key and value tensors of each head.
        5) The values and the matrix of attention scores are obtained from
        the scaled_dot_product function.
        6) The head dimension is moved back next to the feature dimension
        and the heads are concatenated, giving batch size x sequence length
        x d_model.
        7) The concatenated values go through another linear layer in order
        to exchange information between the heads.

        Parameters
        ----------
        x : tensor
            tensor of size batch_size, sequence_length, d_model
        mask : tensor, optional
            Forwarded unchanged to scaled_dot_product, where it is added to
            the attention scores before the softmax. In the Encoder we do
            not require masking; in the Decoder we do, as a token must not
            see the relevance of the next words. By default None.

        Returns
        -------
        tensor
            final tensor of size batch_size, sequence_length, d_model
        """
        batch_size, max_sequence_length, _ = x.size()
        print(f"x.size(): {x.size()}")
        qkv = self.qkv_layer(x)
        print(f"qkv.size(): {qkv.size()}")
        qkv = qkv.reshape(batch_size, max_sequence_length, self.num_heads, 3 * self.head_dim)
        print(f"qkv.size(): {qkv.size()}")
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.size(): {qkv.size()}")
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q size: {q.size()}, k size: {k.size()}, v size: {v.size()}, ")
        values, attention = scaled_dot_product(q, k, v, mask)
        print(f"values.size(): {values.size()}, attention.size:{ attention.size()} ")
        # values is (batch, heads, seq, head_dim). The head axis has to be
        # moved back behind the sequence axis BEFORE flattening; reshaping
        # directly would interleave features of different token positions
        # instead of concatenating the heads of one token.
        values = values.permute(0, 2, 1, 3)
        values = values.reshape(batch_size, max_sequence_length, self.num_heads * self.head_dim)
        print(f"values.size(): {values.size()}")
        out = self.linear_layer(values)
        print(f"out.size(): {out.size()}")
        return out


class LayerNormalization(nn.Module):
    """
    Layer normalization over the trailing (feature) dimensions of a tensor.

    Every slice spanned by the last ``len(parameters_shape)`` dimensions is
    shifted to zero mean and scaled to unit variance, then rescaled by the
    learnable ``gamma`` and shifted by the learnable ``beta``. This keeps
    the values on a consistent scale for the layers that consume them.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        """
        Initialize the LayerNormalization object.

        Parameters
        ----------
        parameters_shape : integer or sequence of integers
            Shape of the learnable parameters (gamma and beta). Its length
            is the number of trailing dimensions that are normalized. A
            bare integer ``n`` is treated as ``[n]``.

        eps : decimal
            A small constant added to the variance before the square root
            to prevent division by zero (avoiding numerical instability).

        Inside the constructor, the class initializes two learnable
        parameters:

        self.gamma:
            Initialized with ones, meaning it starts with no scaling
            (identity operation).

        self.beta:
            Initialized with zeros, meaning it starts with no shift.
        """
        super().__init__()
        # forward() needs len(parameters_shape); wrap a bare int so that the
        # integer form works instead of failing later with a TypeError.
        if isinstance(parameters_shape, int):
            parameters_shape = [parameters_shape]
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """
        Apply Layer Normalization to the input tensor.

        Parameters
        ----------
        inputs : tensor
            Tensor whose trailing dimensions match ``parameters_shape``
            (e.g. a tensor of word embeddings).

        Returns
        -------
        out : tensor
            Normalized tensor with the same shape as ``inputs``.
        """
        # Negative indices of the trailing dimensions: -1, -2, ...
        dims = list(range(-1, -len(self.parameters_shape) - 1, -1))
        mean = inputs.mean(dim=dims, keepdim=True)
        print(f"Mean ({mean.size()})")
        # Biased variance (divides by N), computed from the centred inputs.
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard Deviation  ({std.size()})")
        y = (inputs - mean) / std
        print(f"y: {y.size()}")
        out = self.gamma * y + self.beta
        print(f"self.gamma: {self.gamma.size()}, self.beta: {self.beta.size()}")
        print(f"out: {out.size()}")
        return out

  
class PositionwiseFeedForward(nn.Module):
    """
    Feed-forward network applied to the last dimension of its input.

    The input is expanded from d_model to hidden features by a linear
    layer, passed through ReLU and dropout, and projected back to d_model
    features by a second linear layer.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        Parameters
        ----------
        d_model : integer
            Size of the last dimension of the input and of the output.
        hidden : integer
            Number of features between the two linear layers.
        drop_prob : decimal
            Dropout probability applied after the activation,
            by default 0.1
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """
        Run the input through linear -> ReLU -> dropout -> linear, printing
        the tensor size after every stage.

        Parameters
        ----------
        x : tensor
            Tensor whose last dimension has d_model features.

        Returns
        -------
        tensor
            Tensor with the same leading dimensions as ``x`` and d_model
            features in the last dimension.
        """
        # Each stage is paired with the label used in its trace message.
        pipeline = (
            (self.linear1, "first linear layer"),
            (self.relu, "activation"),
            (self.dropout, "dropout"),
            (self.linear2, "2nd linear layer"),
        )
        for stage, label in pipeline:
            x = stage(x)
            print(f"x after {label}: {x.size()}")
        return x


class EncoderLayer(nn.Module):
    """
    One layer of the Encoder component of the Transformer architecture.

    Whenever an instance of EncoderLayer is created its constructor builds
    the sub-blocks of the layer: Multi-head attention, two Layer
    Normalizations, a Positionwise feed forward network and two Dropouts.

    In forward, each of the two sub-blocks (attention, then feed forward)
    is followed by dropout, a residual addition of the sub-block's input,
    and layer normalization.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        """
        Parameters
        ----------
        d_model : integer
            Feature dimension of every token; used by the attention, the
            feed forward network and both normalizations.
        ffn_hidden : integer
            Hidden size of the positionwise feed forward network.
        num_heads : integer
            Number of attention heads.
        drop_prob : decimal
            Dropout probability used after the attention, after the feed
            forward network, and inside the feed forward network.
        """
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, mask=None):
        """
        Apply the attention block and the feed forward block to ``x``.

        Parameters
        ----------
        x : tensor
            Input of size batch_size, sequence_length, d_model.
        mask : tensor, optional
            Passed on to the attention block as its ``mask`` argument.
            By default None, which means no masking, so calling the layer
            with ``x`` only (as nn.Sequential does) behaves as before.

        Returns
        -------
        tensor
            Output with the same size as ``x``.
        """
        # Attention sub-block: attention -> dropout -> add residual -> norm
        residual_x = x
        x = self.attention(x, mask=mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        # Feed forward sub-block: ffn -> dropout -> add residual -> norm
        residual_x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x

class Encoder(nn.Module):
    """
    Stack of EncoderLayer modules, derived from nn.Module.

    The layers are applied one after another: the output of each layer is
    the input of the next.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
        """
        Build ``num_layers`` encoder layers and chain them.

        Every layer is a separate EncoderLayer instance constructed with
        the same d_model, ffn_hidden, num_heads and drop_prob, so the
        layers share their configuration but not their parameters. They
        are stored in an nn.Sequential under ``self.layers``.

        Parameters
        ----------
        d_model : integer
            Forwarded to every EncoderLayer.
        ffn_hidden : integer
            Forwarded to every EncoderLayer.
        num_heads : integer
            Forwarded to every EncoderLayer.
        drop_prob : decimal
            Forwarded to every EncoderLayer.
        num_layers : integer
            Number of EncoderLayer instances in the stack.
        """
        super().__init__()
        layer_stack = (
            EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        )
        self.layers = nn.Sequential(*layer_stack)

    def forward(self, x):
        """
        Pass ``x`` through every layer of the stack in order and return
        the output of the last one.
        """
        return self.layers(x)