<a href="https://colab.research.google.com/github/abdulsamadkhan/Courses-LLM-Lectures/blob/main/BERT_with_One_Layer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phase 1: Self-Attention Mechanism
Implement a self-attention mechanism as the foundational component of BERT.

In [None]:
import torch
import torch.nn as nn

# Define the Self-Attention mechanism
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into query, key and value tensors of the same
    width, pairwise query/key scores are scaled by ``sqrt(embed_dim)`` and
    soft-maxed over the key axis, and the result is used to average the values.
    """

    def __init__(self, embed_dim):
        """
        Initialize the self-attention module.
        Args:
        - embed_dim: Dimensionality of input embeddings and attention vectors.
        """
        super().__init__()
        self.embed_dim = embed_dim

        # Linear layers to transform input embeddings into Q, K, and V matrices
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)

        # Scale factor sqrt(embed_dim). Registered as a buffer so that it follows
        # the module through .to(device) / .double() / .half(); a plain tensor
        # attribute would stay on the CPU and break the division on GPU inputs.
        # persistent=False keeps the state_dict keys exactly as they were.
        self.register_buffer(
            "scale",
            torch.sqrt(torch.tensor([embed_dim], dtype=torch.float32)),
            persistent=False,
        )

    def forward(self, x):
        """
        Perform self-attention on input x.
        Args:
        - x: Input tensor of shape (batch_size, sequence_length, embed_dim)
        Returns:
        - Tensor of the same shape as x: for every position, the attention-weighted
          average of the value vectors of all positions in the same sequence.
        """
        # Project the input x into Q, K, and V matrices
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        # (batch, seq, seq) score matrix: dot product of every query with every key,
        # divided by sqrt(embed_dim)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        # Normalize over the key axis so each query's weights sum to 1
        attention_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the value vectors
        return torch.matmul(attention_weights, V)

# Smoke test for SelfAttention: only the output shape is checked, not the values.
embed_dim = 64  # Embedding width; this module-level name is reused by later cells
x = torch.rand(10, 16, embed_dim)  # Uniform random input of shape (batch_size=10, sequence_length=16, embed_dim)

# Build a single attention head operating on the full embedding width
self_attention = SelfAttention(embed_dim)

# Run the forward pass
output = self_attention(x)

# Self-attention preserves the input shape
print("Self-Attention output shape:", output.shape)  # Expected: (batch_size, sequence_length, embed_dim)


Self-Attention output shape: torch.Size([10, 16, 64])


# Phase 2: Multi-Head Attention
Create multi-head attention by concatenating outputs from multiple self-attention heads.

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from independent ``SelfAttention`` heads.

    The last dimension of the input is cut into ``num_heads`` contiguous slices
    of width ``embed_dim // num_heads``. Head ``i`` runs self-attention on slice
    ``i`` only, the per-head outputs are concatenated back to ``embed_dim``
    and passed through a final linear layer.
    """

    def __init__(self, embed_dim, num_heads):
        """
        Initialize the multi-head attention module.
        Args:
        - embed_dim: Total dimensionality of input embeddings.
        - num_heads: Number of attention heads; must be positive and divide embed_dim.
        Raises:
        - ValueError: if num_heads is not positive or does not divide embed_dim.
        """
        super().__init__()

        # Fail early with a clear message instead of an opaque reshape error
        # (or a ZeroDivisionError) later on.
        if num_heads <= 0:
            raise ValueError(f"num_heads must be positive, got {num_heads}")
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )

        # Number of attention heads and dimensions per head
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.head_dim = embed_dim // num_heads  # Dimension for each head

        # One SelfAttention layer per head, each working on head_dim features
        self.attention_heads = nn.ModuleList([SelfAttention(self.head_dim) for _ in range(num_heads)])

        # Linear layer to mix the concatenated head outputs (embed_dim -> embed_dim)
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """
        Perform multi-head attention on input x.
        Args:
        - x: Input tensor of shape (batch_size, sequence_length, embed_dim)
        Returns:
        - Output tensor of shape (batch_size, sequence_length, embed_dim).
        Raises:
        - ValueError: if the last dimension of x is not embed_dim.
        """
        if x.size(-1) != self.embed_dim:
            raise ValueError(
                f"expected last dimension {self.embed_dim}, got {x.size(-1)}"
            )

        # Cut the feature axis into num_heads slices of width head_dim. This selects
        # the same features per head as view(..., num_heads, head_dim)[:, :, i] but
        # also works for non-contiguous inputs.
        head_inputs = x.split(self.head_dim, dim=-1)

        # Apply each head to its own slice of the input
        attention_outputs = [
            head(head_input)
            for head, head_input in zip(self.attention_heads, head_inputs)
        ]

        # Concatenate outputs from all heads along the feature dimension
        concat_output = torch.cat(attention_outputs, dim=-1)

        # Final linear mix across heads
        return self.output_linear(concat_output)

# Smoke test for MultiHeadAttention: only the output shape is checked.
embed_dim = 64      # Embedding width (module-level; read again by later cells)
num_heads = 4       # Number of heads; 64 // 4 gives 16 features per head
x = torch.rand(10, 16, embed_dim)  # Uniform random input of shape (batch_size=10, sequence_length=16, embed_dim)

# Build the multi-head attention layer
multihead_attention = MultiHeadAttention(embed_dim, num_heads)

# Run the forward pass
output = multihead_attention(x)

# The layer maps embed_dim back to embed_dim, so the shape is unchanged
print("Multi-Head Attention output shape:", output.shape)  # Expected: (batch_size, sequence_length, embed_dim)


Multi-Head Attention output shape: torch.Size([10, 16, 64])


# Phase 3: Add Layer Normalization


In [None]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization."""

    def __init__(self, embed_dim):
        """
        Build the Add & Norm layer.
        Args:
        - embed_dim: Size of the last (feature) dimension to normalize over.
        """
        super().__init__()

        # Normalizes each position's feature vector independently
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x, sublayer_output):
        """
        Sum the residual input with a sublayer's output and normalize the result.
        Args:
        - x: Residual input of shape (batch_size, sequence_length, embed_dim).
        - sublayer_output: Sublayer result (e.g., multi-head attention) of the same shape.
        Returns:
        - Layer-normalized sum of the two tensors.
        """
        residual_sum = x + sublayer_output
        normalized = self.norm(residual_sum)
        return normalized

# Smoke test for AddNorm; reuses the globals `embed_dim`, `x` and
# `multihead_attention` defined by the earlier cells.
add_norm = AddNorm(embed_dim)  # Layer norm over the embedding dimension

# Use the multi-head attention output as the sublayer result
sublayer_output = multihead_attention(x)

# Residual add of `x` and the attention output, followed by layer normalization
output = add_norm(x, sublayer_output)

# Add & Norm preserves the input shape
print("Add & Norm output shape:", output.shape)  # Expected: (batch_size, sequence_length, embed_dim)


Add & Norm output shape: torch.Size([10, 16, 64])


# Phase 4: Parallel Feed-Forward Neural Network
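Add a position-wise feed-forward network. It is applied to every token position independently (i.e., in parallel across positions) and, in the Transformer block, runs after the attention sub-layer.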

In [None]:
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Linear."""

    def __init__(self, embed_dim, hidden_dim):
        """
        Build the feed-forward network.
        Args:
        - embed_dim: Width of the input and output features.
        - hidden_dim: Width of the intermediate (hidden) representation.
        """
        super().__init__()

        # Expansion: embed_dim -> hidden_dim
        self.fc1 = nn.Linear(embed_dim, hidden_dim)

        # Projection back: hidden_dim -> embed_dim
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

        # Non-linearity applied between the two projections
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Run the network on the last dimension of x.
        Args:
        - x: Input tensor of shape (batch_size, sequence_length, embed_dim).
        Returns:
        - Tensor of the same shape as x.
        """
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        projected = self.fc2(activated)
        return projected

# Smoke test for FeedForward; reuses the global `embed_dim`.
feed_forward = FeedForward(embed_dim, hidden_dim=128)  # Hidden layer is 128 wide

# Feed the current global `output` through the network
# (the Add & Norm result when the cells are run top to bottom)
output = feed_forward(output)

# The network maps embed_dim back to embed_dim, so the shape is unchanged
print("Feed-Forward output shape:", output.shape)  # Expected: (batch_size, sequence_length, embed_dim)


Feed-Forward output shape: torch.Size([10, 16, 64])


# Phase 5: Complete Transformer Block
Combine all components into a single Transformer block.

In [None]:
class TransformerBlock(nn.Module):
    """Encoder block: multi-head attention and a feed-forward network,
    each wrapped in a residual connection followed by layer normalization."""

    def __init__(self, embed_dim, num_heads, hidden_dim):
        """
        Build the Transformer block.
        Args:
        - embed_dim: Width of the input embeddings.
        - num_heads: Number of heads used by the multi-head attention sublayer.
        - hidden_dim: Hidden width of the feed-forward sublayer.
        """
        super().__init__()

        # Attention sublayer and its Add & Norm
        self.multihead_attention = MultiHeadAttention(embed_dim, num_heads)
        self.add_norm1 = AddNorm(embed_dim)

        # Feed-forward sublayer and its Add & Norm
        self.feed_forward = FeedForward(embed_dim, hidden_dim)
        self.add_norm2 = AddNorm(embed_dim)

    def forward(self, x):
        """
        Run one Transformer block.
        Args:
        - x: Input tensor of shape (batch_size, sequence_length, embed_dim).
        Returns:
        - Tensor of the same shape after attention + Add & Norm and
          feed-forward + Add & Norm.
        """
        # Sublayer 1: attention with residual connection and normalization
        attended = self.add_norm1(x, self.multihead_attention(x))

        # Sublayer 2: feed-forward with residual connection and normalization
        return self.add_norm2(attended, self.feed_forward(attended))

# Smoke test for TransformerBlock; reuses the globals `embed_dim`, `num_heads` and `x`.
transformer_block = TransformerBlock(embed_dim, num_heads, hidden_dim=128)  # Feed-forward hidden width is 128

# Run the block on the input tensor `x`
output = transformer_block(x)

# A Transformer block preserves the input shape
print("Transformer Block output shape:", output.shape)  # Expected: (batch_size, sequence_length, embed_dim)


Transformer Block output shape: torch.Size([10, 16, 64])


# Phase 6: Add Positional Encoding and Embedding Layer
Add positional encoding and embedding layer for input tokens.

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    A table of shape (1, max_length, embed_dim) is computed once: even feature
    indices hold sin(position * frequency) and odd indices hold
    cos(position * frequency). The table has no learnable parameters.
    """

    def __init__(self, embed_dim, max_length=5000):
        """
        Initialize the Positional Encoding.
        Args:
        - embed_dim: Dimensionality of the embeddings (even or odd).
        - max_length: Maximum length of the sequence to support with positional encodings.
        """
        super().__init__()

        # Table of positional encodings with dimensions (max_length, embed_dim)
        encoding = torch.zeros(max_length, embed_dim)

        # Column vector of positions 0 .. max_length-1, shape (max_length, 1)
        position = torch.arange(0, max_length).unsqueeze(1)

        # One frequency per even feature index: 10000^(-i / embed_dim) for i = 0, 2, 4, ...
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(torch.log(torch.tensor(10000.0)) / embed_dim))

        # (max_length, ceil(embed_dim / 2)) matrix of sinusoid arguments
        angles = position * div_term

        # Sine on even indices (0, 2, 4, ...)
        encoding[:, 0::2] = torch.sin(angles)

        # Cosine on odd indices (1, 3, 5, ...). When embed_dim is odd there is one
        # fewer odd column than even columns, so the last frequency is dropped.
        encoding[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])

        # Leading batch axis for broadcasting. Registered as a buffer so the table
        # follows the module across devices/dtypes; persistent=False keeps it out
        # of the state_dict, exactly as when it was a plain attribute.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """
        Forward pass of the positional encoding.
        Args:
        - x: Input tensor of shape (batch_size, sequence_length, embed_dim).
        Returns:
        - Tensor with positional encoding added to input embeddings.
        Raises:
        - ValueError: if sequence_length exceeds the max_length given at construction.
        """
        seq_length = x.size(1)
        max_length = self.encoding.size(1)
        if seq_length > max_length:
            raise ValueError(
                f"sequence length {seq_length} exceeds max_length {max_length}"
            )

        # Only take as many positions as the sequence needs; broadcasts over the batch
        return x + self.encoding[:, :seq_length, :]

# Smoke test for PositionalEncoding; reads the global `embed_dim`, so the printed
# last dimension depends on which cell most recently assigned it.
positional_encoding = PositionalEncoding(embed_dim)  # Uses the default max_length of 5000


x = torch.rand(10, 16, embed_dim)  # Uniform random input of shape (batch_size=10, sequence_length=16, embed_dim)

# Add the positional encoding to the input tensor x
embedded_x = positional_encoding(x)

# Adding the encoding does not change the shape
print("Positional Encoding output shape:", embedded_x.shape)  # Expected: (batch_size, sequence_length, embed_dim)


Positional Encoding output shape: torch.Size([10, 16, 8])


# Phase 7: Understanding the embedding layer


In [None]:
import torch
import torch.nn as nn

# Demonstrates how nn.Embedding maps integer token IDs to dense vectors.
# NOTE: this cell rebinds the module-level `embed_dim` (to 8), `input_ids` and
# `output`; later cells that read `embed_dim` will see the value set here.
vocab_size = 100     # Size of vocabulary (e.g., 100 unique tokens)
embed_dim = 8        # Embedding dimension (e.g., each token will be represented by an 8-dimensional vector)

# Lookup table with one embed_dim-sized row per vocabulary entry
embedding_layer = nn.Embedding(vocab_size, embed_dim)

# Create a batch of token IDs
batch_size = 5       # Number of examples in a batch
sequence_length = 4   # Number of tokens in each example
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length))  # Random token IDs in range [0, vocab_size)

print("Input token IDs:\n", input_ids)

# Look up the embedding vector of every token ID
output = embedding_layer(input_ids)

# Display the shape and content of the output
print("\nOutput from the embedding layer:")
print("Output shape:", output.shape)   # Expected shape: (batch_size, sequence_length, embed_dim)
print("Output values:\n", output)


Input token IDs:
 tensor([[63, 53, 88, 67],
        [49, 73, 32, 47],
        [45, 64, 92, 24],
        [97, 11, 82, 11],
        [79, 60, 30, 15]])

Output from the embedding layer:
Output shape: torch.Size([5, 4, 8])
Output values:
 tensor([[[-1.1734,  1.1417,  1.6486,  0.4211,  0.4692, -0.0909, -0.3167,
           1.2432],
         [-0.1273, -0.3382, -2.0314, -0.8377,  1.0141, -1.6332,  0.6968,
          -0.8438],
         [-1.1493, -0.7665, -0.6098, -0.0044, -0.4661,  0.7721,  0.3780,
          -0.6827],
         [ 2.3378,  0.5129, -0.1060,  0.4381,  1.1368,  0.0957,  0.2459,
          -0.9346]],

        [[-1.3704,  0.9207, -1.0321,  1.3557,  0.7781,  1.0541,  0.9063,
          -0.8551],
         [ 0.2555,  0.1818,  0.8608,  0.9628,  1.5397,  0.3957,  0.1870,
           0.3106],
         [ 1.6036,  1.1730,  1.5289,  0.7095,  0.6508,  0.1734, -0.3149,
           0.7287],
         [ 1.4319, -0.5518, -0.1282, -0.1049, -0.9707, -1.3494,  0.7407,
           0.6971]],

        [[ 0.3495

# Phase 8: Training for Next-Word Prediction
Use the Transformer block for next-word prediction.

* Tokenize the input text and generate training data.
* Pass the input through embedding, positional encoding, and the Transformer block.
* Add a linear layer for predictions and train on next-word prediction.

In [None]:
class NextWordPredictionModel(nn.Module):
    """Token embedding + positional encoding + a stack of Transformer blocks,
    followed by a linear layer that produces one logit per vocabulary entry."""

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers):
        """
        Build the model.
        Args:
        - vocab_size: Number of distinct token IDs; also the size of the output logits.
        - embed_dim: Width of the token embeddings.
        - num_heads: Number of attention heads in every Transformer block.
        - hidden_dim: Hidden width of the feed-forward network in every block.
        - num_layers: Number of stacked Transformer blocks.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim)
        self.transformer_blocks = nn.ModuleList(
            TransformerBlock(embed_dim, num_heads, hidden_dim)
            for _ in range(num_layers)
        )
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """
        Compute vocabulary logits for every position.
        Args:
        - x: Integer tensor of token IDs, shape (batch_size, sequence_length).
        Returns:
        - Logits of shape (batch_size, sequence_length, vocab_size).
        Note: the intermediate shapes are printed on every call.
        """
        hidden = self.embedding(x)
        print("shape of embeddings of the input tokesn for the batch",hidden.shape)
        hidden = self.positional_encoding(hidden)
        print("shape of embeddings of After the positional encoding",hidden.shape)

        # Run the stacked Transformer blocks in order
        for layer in self.transformer_blocks:
            hidden = layer(hidden)

        # Project every position onto the vocabulary
        return self.linear(hidden)

# Example forward pass (no training happens here: there is no loss or optimizer).
# `embed_dim` and `num_heads` are read from the module-level variables set by earlier cells.
model = NextWordPredictionModel(vocab_size=30522, embed_dim=embed_dim, num_heads=num_heads, hidden_dim=128, num_layers=1)

# torch.randint generates a tensor filled with random integers from a specified range,
# here token IDs in [0, 30522), matching the vocab_size passed to the model.
# The shape (10, 16) is a batch of 10 sequences, each with 16 tokens (or words).
input_ids = torch.randint(0, 30522, (10, 16))  # Random token IDs of shape (batch_size, sequence_length)
# Print the shape of the input IDs
print("inputs:", input_ids.shape)  # Expected: (batch_size, sequence_length)
output_logits = model(input_ids)
print("Output logits shape:", output_logits.shape)  # Expected: (batch_size, sequence_length, vocab_size)


inputs: torch.Size([10, 16])
shape of embeddings of the input tokesn for the batch torch.Size([10, 16, 64])
Output logits shape: torch.Size([10, 16, 30522])
