In [6]:
import math  # Imports the math module, which provides access to mathematical functions
!git clone https://github.com/python/cpython.git
from typing import List # use for type hinting

import torch  # use for deep learning
from torch import nn # PyTorch submodule containing classes and functions to build neural networks


fatal: destination path 'cpython' already exists and is not an empty directory.


In [8]:
! pip install labml

Collecting labml
  Downloading labml-0.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting gitpython (from labml)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython->labml)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython->labml)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading labml-0.5.2-py3-none-any.whl (110 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.9/110.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)


In [9]:
from labml import tracker # used for tracking and visualizing machine learning experiments

### Prepare for multi-head attention


This module applies a linear transformation and splits the resulting vector into a given number of heads for multi-head attention. It is used to transform the key, query, and value vectors.

In [14]:
import torch.nn as nn

class PrepareForMultiHeadAttention(nn.Module):
    """Linearly project vectors and split the projection into attention heads.

    The last dimension of the input (size ``d_model``) is mapped to
    ``heads * d_k`` features by a single linear layer, and that last dimension
    is then viewed as ``heads`` separate vectors of size ``d_k``.

    Args:
        d_model: Size of the last dimension of the input.
        heads: Number of attention heads to split into.
        d_k: Number of features per head.
        bias: Whether the linear layer has a bias term.
    """

    def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
        super().__init__()
        # Head count and per-head width, needed to reshape in `forward`
        self.heads = heads
        self.d_k = d_k
        # One projection produces the features for all heads at once
        self.linear = nn.Linear(d_model, heads * d_k, bias=bias)

    def forward(self, x: torch.Tensor):
        """Project ``x`` and split its last dimension into heads.

        Args:
            x: Tensor whose last dimension has size ``d_model``, e.g.
                ``[seq_len, batch_size, d_model]`` or ``[batch_size, d_model]``.

        Returns:
            Tensor with the same leading dimensions as ``x`` followed by
            ``[heads, d_k]``, e.g. ``[seq_len, batch_size, heads, d_k]`` or
            ``[batch_size, heads, d_k]``.
        """
        # Everything except the feature dimension is carried over unchanged
        leading_dims = x.shape[:-1]
        projected = self.linear(x)
        # Reinterpret the `heads * d_k` features as `heads` vectors of size `d_k`
        return projected.view(*leading_dims, self.heads, self.d_k)

## Multi-Head Attention Module

This section explains how scaled multi-head attention is computed for given query, key, and value vectors.

### $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{Q K^\top}{\sqrt{d_k}}\right) V$

In simple terms, it finds keys that match the query and retrieves the corresponding values. Here's a breakdown of the process:

1. **Dot Product**: It uses the dot product of the query (Q) and key (K) vectors to determine how well they match.
2. **Scaling**: Before applying softmax, the dot products are scaled by the inverse square root of the dimension of the key vector (i.e. divided by √d_k). When d_k is large, the dot products can grow large in magnitude, which pushes the softmax into regions where its gradients are extremely small; the scaling counteracts this.
3. **Softmax**: The softmax function is applied to the scaled dot products to obtain a distribution over the keys, calculated along the sequence (or time) axis.
4. **Weighted Sum**: The resulting weights from the softmax are used to compute a weighted sum of the value (V) vectors.

In essence, the attention mechanism identifies relevant keys that correspond to a given query and uses these keys to retrieve the associated values, allowing the model to focus on specific parts of the input sequence.

In [15]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product multi-head attention: layers and configuration.

    Args:
        heads: Number of attention heads.
        d_model: Number of features in the query, key and value vectors.
            It is split evenly across heads (``d_k = d_model // heads``), so
            it is expected to be divisible by ``heads``.
        dropout_prob: Dropout probability applied to the attention weights.
        bias: Whether the query and key projections have a bias term.
    """

    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
        super().__init__()
        # Number of features per head
        self.d_k = d_model // heads
        # Number of heads
        self.heads = heads

        # Projections that transform the query, key and value vectors and
        # split them into heads
        self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
        self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=bias)
        # The value projection always uses a bias, independent of `bias`
        self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=True)

        # Softmax over dim 1, the key (time) dimension of the score tensor
        self.softmax = nn.Softmax(dim=1)

        # Output layer
        self.output = nn.Linear(d_model, d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout_prob)
        # Scaling factor applied to the scores before the softmax: 1 / sqrt(d_k)
        self.scale = 1 / math.sqrt(self.d_k)

        # Attention weights are stored here so they can be used for logging
        # or other computations if needed
        self.attn = None


# Calculate scores between queries and keys

To calculate the dot product \( QK^\top \) or \( S_{ijbh} = \sum_d Q_{ibhd} K_{jbhd} \), we follow these steps:

1. **Notations**:
   - \( Q \): Query matrix with shape \([batch, seq\_len\_q, num\_heads, depth\_q]\)
   - \( K \): Key matrix with shape \([batch, seq\_len\_k, num\_heads, depth\_k]\)
   - \( S \): Output matrix after computing dot product of \( Q \) and \( K \), with shape \([batch, seq\_len\_q, seq\_len\_k, num\_heads]\)

2. **Dot Product Calculation**:
   - For each query position \( i \), key position \( j \), batch element \( b \), and head \( h \), we sum over the feature dimension \( d \):
     \[
     S_{ijbh} = \sum_d Q_{ibhd} K_{jbhd}
     \]

Here's a step-by-step breakdown:

1. **Ensure the dimensions match**:
   - The depth dimensions of \( Q \) and \( K \) should be the same, i.e., \( depth\_q = depth\_k \).

2. **Element-wise Multiplication and Summation**:
   - For each combination of \( i, j, b, \) and \( h \), compute the dot product by summing the products of corresponding elements in the \( d \) dimension of \( Q \) and \( K \).

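Here is an illustrative Python implementation using NumPy. Note that it uses a batch-first layout (`[batch, seq_len, num_heads, depth]`), whereas the formula above and the PyTorch code below index tensors as `[seq_len, batch, heads, d_k]`; the computation is otherwise the same: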

```python
import numpy as np

def compute_attention_scores(Q, K):
    """Compute per-head dot-product scores between every query and every key.

    Args:
        Q: Array of shape [batch, seq_len_q, num_heads, depth].
        K: Array of shape [batch, seq_len_k, num_heads, depth].

    Returns:
        Array S of shape [batch, seq_len_q, seq_len_k, num_heads] where
        S[b, i, j, h] is the dot product of Q[b, i, h, :] and K[b, j, h, :].
    """
    # np.matmul multiplies the last two axes and broadcasts over the leading
    # ones, so batch and heads must be the leading axes of BOTH operands.
    Q_heads_first = np.transpose(Q, (0, 2, 1, 3))  # Shape: [batch, num_heads, seq_len_q, depth]

    # Transpose K to match dimensions for dot product
    K_transposed = np.transpose(K, (0, 2, 3, 1))  # Shape: [batch, num_heads, depth, seq_len_k]

    # Compute dot product
    S = np.matmul(Q_heads_first, K_transposed)  # Shape: [batch, num_heads, seq_len_q, seq_len_k]

    # Transpose S to the final shape
    S = np.transpose(S, (0, 2, 3, 1))  # Shape: [batch, seq_len_q, seq_len_k, num_heads]

    return S

# Example usage: random queries and keys with small, distinct dimension sizes
# so that each axis of the result is easy to identify.
batch_size, seq_len_q, seq_len_k, num_heads, depth = 2, 3, 4, 5, 6

Q = np.random.rand(batch_size, seq_len_q, num_heads, depth)
K = np.random.rand(batch_size, seq_len_k, num_heads, depth)

S = compute_attention_scores(Q, K)
print(S.shape)  # Should print: (2, 3, 4, 5)
```

In this implementation:
- We first transpose the key matrix \( K \) to align its dimensions for the dot product operation.
- We then perform the matrix multiplication of \( Q \) and the transposed \( K \).
- Finally, we transpose the resulting matrix \( S \) to get it in the desired shape.

The `get_scores` method below performs this computation; it can be overridden for other variations such as relative attention.

In [16]:
def get_scores(self,query:torch.Tensor,key:torch.Tensor):
  """Compute the raw (unscaled) dot-product scores between queries and keys.

  NOTE(review): this is written as a method (it takes `self`) but is defined
  at module level in its own cell; `self` is not used here.

  The einsum sums over the last axis `d` of both inputs and pairs every
  index `i` of `query` with every index `j` of `key`, keeping the shared
  axes `b` and `h`: result[i, j, b, h] = sum_d query[i, b, h, d] * key[j, b, h, d].
  """
  contraction = 'ibhd,jbhd->ijbh'
  scores = torch.einsum(contraction, query, key)
  return scores
  # mask has shape [seq_len_q, seq_len_k, batch_size], where the first dimension is the query dimension. If the query dimension is equal to 1, it will be broadcast.

In [17]:
def prepare_mask(self,mask:torch.Tensor,query_shape:List[int],key_shape:List[int]):
  """Validate an attention mask and add a trailing axis for the heads.

  NOTE(review): this is written as a method (it takes `self`) but is defined
  at module level in its own cell; `self` is not used here.

  Args:
    mask: Tensor with three dimensions, checked against the shapes below as
      [seq_len_q or 1, seq_len_k, batch_size or 1]. A size of 1 in the first
      or last dimension is accepted so the mask can be broadcast.
    query_shape: Shape of the query tensor; index 0 is compared with
      mask.shape[0] and index 1 with mask.shape[2].
    key_shape: Shape of the key tensor; index 0 is compared with mask.shape[1].

  Returns:
    The mask with one extra trailing dimension of size 1, i.e. shape
    [seq_len_q or 1, seq_len_k, batch_size or 1, 1], so the same mask
    applies to all heads.

  Raises:
    AssertionError: If the mask dimensions do not match the query/key shapes.
  """
  assert mask.shape[0]==1 or mask.shape[0]==query_shape[0], \
    'mask.shape[0] must be 1 or equal to query_shape[0]'
  assert mask.shape[1]==key_shape[0], \
    'mask.shape[1] must be equal to key_shape[0]'
  assert mask.shape[2]==1 or mask.shape[2]==query_shape[1], \
    'mask.shape[2] must be 1 or equal to query_shape[1]'

  # Same mask applied to all heads: add a trailing axis of size 1.
  # Resulting mask has shape [seq_len_q, seq_len_k, batch_size, 1].
  mask=mask.unsqueeze(-1)
  return mask


In [22]:
# query, key and value are the tensors that store collections of query, key and value vectors.
from typing import Optional  # Import Optional for type hinting

# They have shape [seq_len, batch_size, d_model].
# mask has shape [seq_len, seq_len, batch_size], and mask[i, j, b] indicates whether, for batch b, the query at position i has access to the key-value at position j.
def forward(self,*,
            query:torch.Tensor,
            key:torch.Tensor,
            value:torch.Tensor,
            mask:Optional[torch.Tensor]=None):
  """Compute multi-head attention for the given query, key and value tensors.

  NOTE(review): this is written as a method (it takes `self`) but is defined
  at module level in its own cell. It relies on `self.query`, `self.key`,
  `self.value`, `self.softmax`, `self.dropout`, `self.output`, `self.scale`,
  `self.get_scores` and `self.prepare_mask` being available on `self`.

  Args:
    query: Tensor of shape [seq_len, batch_size, d_model].
    key: Tensor of shape [seq_len, batch_size, d_model].
    value: Tensor of shape [seq_len, batch_size, d_model].
    mask: Optional tensor of shape [seq_len, seq_len, batch_size];
      positions where the mask equals 0 are excluded from attention.

  Returns:
    The result of `self.output` applied to the concatenated heads, with
    leading dimensions [seq_len, batch_size].
  """
  # query, key and value have shape [seq_len, batch_size, d_model]
  seq_len,batch_size,_=query.shape

  # Only the mask preparation is conditional; the attention computation
  # below must run whether or not a mask was supplied.
  if mask is not None:
    mask=self.prepare_mask(mask,query.shape,key.shape)

  # Prepare query, key and value for attention computation. These will then
  # have shape [seq_len, batch_size, heads, d_k].
  query=self.query(query)
  key=self.key(key)
  value=self.value(value)

  # Compute attention scores QK^T. This gives a tensor of shape
  # [seq_len, seq_len, batch_size, heads].
  scores=self.get_scores(query,key)
  # Scale scores: QK^T / sqrt(d_k)
  scores*=self.scale

  # Apply mask: masked positions get -inf so the softmax gives them weight 0
  if mask is not None:
    scores=scores.masked_fill(mask==0,float('-inf'))

  # Softmax attention weights
  attn=self.softmax(scores)
  # Save attentions if debugging
  tracker.debug('attn',attn)

  # Apply dropout
  attn=self.dropout(attn)

  # Multiply by values: weighted sum over the key positions j
  x=torch.einsum('ijbh,jbhd->ibhd',attn,value)

  # Save attentions for any other calculations (detached from the graph)
  self.attn=attn.detach()

  # Concatenate multiple heads
  x=x.reshape(seq_len,batch_size,-1)
  # Output layer
  x=self.output(x)
  return x
