In [1]:
import torch  
import torch.nn as nn  
import math  

In [2]:
# ***************** PositionalEncoder VERSION WITH COMMENTS AND PRINTS FOR EXPLANATION *************************

# Subclass the PyTorch nn.Module class to create a custom module for positional encoding
# This class is used to add positional information to the input embeddings in a Transformer model
class PositionalEncoder_explanations(nn.Module):
    """Sinusoidal positional encoder that prints every intermediate tensor.

    Teaching variant: it builds a fixed (non-learned) table of sine/cosine
    values and prints each step of the construction. The table is added to
    the token embeddings so the model can tell positions apart.

    Args:
        d_model: Size of each token embedding (may be even or odd).
        max_length: Longest sequence, in tokens, that can be encoded.
    """

    def __init__(self, d_model, max_length):
        # Call the parent class's constructor
        super(PositionalEncoder_explanations, self).__init__()

        self.d_model = d_model  # dimension of the input embeddings
        self.max_length = max_length  # maximum number of tokens per sequence

        # One row per position, one column per embedding dimension.
        pe = torch.zeros(max_length, d_model)

        # Positions 0 .. max_length - 1 as a column vector of shape
        # (max_length, 1) so that it broadcasts against div_term below.
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        print('position:', position)

        # Frequencies 10000^(-k / d_model) for k = 0, 2, 4, ...
        # They shrink exponentially, so the first columns of the table change
        # quickly from one position to the next (useful to tell neighbours
        # apart) while the last columns change slowly (useful for long-range
        # context).
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))
        print('div_term:', div_term)

        # Compute the angle table once and reuse it for printing and filling.
        angles = position * div_term
        print('position * div_term:', angles)
        print('torch.sin(position * div_term):', torch.sin(angles))
        print('torch.cos(position * div_term):', torch.cos(angles))

        # Even columns take the sine, odd columns the cosine. Both are smooth
        # and bounded in [-1, 1], which keeps the encoding magnitude small.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones,
        # so the last frequency is dropped for the cosine part.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        print('pe:', pe)

        # Add a leading dimension: (1, max_length, d_model) broadcasts over
        # the batch dimension of the embeddings.
        pe = pe.unsqueeze(0)

        # A buffer is saved with the module's state but is not a trainable
        # parameter. Registering it is what defines self.pe.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the positional encodings of its first positions.

        Args:
            x: Tensor whose dimension 1 is the sequence length and whose
                last dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than ``max_length``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_length {self.max_length}"
            )
        # Slice the table so it matches the length of the input sequence.
        return x + self.pe[:, :seq_len]


In [3]:
# Tiny configuration so that the printed tensors stay readable:
# 8 positions, 4 embedding dimensions.
d_model, max_length = 4, 8

# In this 8-token example the first two columns of `pe` change rapidly while the
# last two change slowly. The fast columns help distinguish closely spaced
# tokens; the slow columns help capture long-range dependencies and context.
PositionalEncoder_explanations(d_model=d_model, max_length=max_length)

position: tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.]])
div_term: tensor([1.0000, 0.0100])
position * div_term: tensor([[0.0000, 0.0000],
        [1.0000, 0.0100],
        [2.0000, 0.0200],
        [3.0000, 0.0300],
        [4.0000, 0.0400],
        [5.0000, 0.0500],
        [6.0000, 0.0600],
        [7.0000, 0.0700]])
torch.sin(position * div_term): tensor([[ 0.0000,  0.0000],
        [ 0.8415,  0.0100],
        [ 0.9093,  0.0200],
        [ 0.1411,  0.0300],
        [-0.7568,  0.0400],
        [-0.9589,  0.0500],
        [-0.2794,  0.0600],
        [ 0.6570,  0.0699]])
torch.cos(position * div_term): tensor([[ 1.0000,  1.0000],
        [ 0.5403,  0.9999],
        [-0.4161,  0.9998],
        [-0.9900,  0.9996],
        [-0.6536,  0.9992],
        [ 0.2837,  0.9988],
        [ 0.9602,  0.9982],
        [ 0.7539,  0.9976]])
pe: tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
  

PositionalEncoder_explanations()

In [4]:
class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Args:
        d_model: Size of each token embedding (may be even or odd).
        max_length: Longest sequence, in tokens, that can be encoded.
    """

    def __init__(self, d_model, max_length):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        self.max_length = max_length

        # Table of shape (max_length, d_model): one row per position.
        pe = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))
        angles = position * div_term

        # Sine on even columns, cosine on odd columns. With an odd d_model
        # there is one fewer odd column, so the last frequency is dropped for
        # the cosine part instead of failing on a shape mismatch.
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (1, max_length, d_model) broadcasts over the batch dimension.
        pe = pe.unsqueeze(0)
        # Stored with the module's state, but not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the positional encodings of its first positions.

        Args:
            x: Tensor whose dimension 1 is the sequence length and whose
                last dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than ``max_length``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_length {self.max_length}"
            )
        return x + self.pe[:, :seq_len]


In [5]:

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads of size ``d_model // num_heads``, attended
    independently, concatenated again and passed through a final linear
    layer.

    Args:
        d_model: Dimension of the input embeddings.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        # Number of attention heads.
        self.num_heads = num_heads

        # Dimension of the input embeddings.
        self.d_model = d_model

        # Dimension of each head.
        self.head_dim = d_model // num_heads

        # Linear transformations for queries, keys, and values.
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)

        # Final linear transformation for concatenated output.
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch * heads, seq, head_dim)``.

        Folding the heads into the batch dimension lets every head be
        processed by the same batched matrix multiplications. Row
        ``b * num_heads + h`` of the result belongs to head ``h`` of
        sequence ``b``.
        """
        # Split the embedding dimension into (num_heads, head_dim).
        x = x.view(batch_size, -1, self.num_heads, self.head_dim)
        # Bring the heads next to the batch dimension, then merge the two.
        return x.permute(0, 2, 1, 3).contiguous().view(batch_size * self.num_heads, -1, self.head_dim)

    def compute_attention(self, query, key, mask=None):
        """Return the attention weights ``softmax(Q K^T / sqrt(head_dim))``.

        Args:
            query: Tensor of shape ``(batch * heads, q_len, head_dim)``.
            key: Tensor of shape ``(batch * heads, k_len, head_dim)``.
            mask: Optional tensor; positions where ``mask == 0`` receive
                (numerically) zero weight. It must broadcast against the
                ``(batch * heads, q_len, k_len)`` scores. A 3-D mask whose
                first dimension is the batch size is repeated for every head.

        Returns:
            Tensor of shape ``(batch * heads, q_len, k_len)`` whose rows sum
            to 1.
        """
        # Scale by sqrt(head_dim) so the logits do not grow with the head
        # size, which would push the softmax into saturation.
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        if mask is not None:
            # A per-sequence mask has `batch` rows while the scores have
            # `batch * heads`; repeat each row once per head so that row
            # b * num_heads + h lines up with the layout of split_heads.
            if (
                mask.dim() == 3
                and mask.size(0) not in (1, scores.size(0))
                and mask.size(0) * self.num_heads == scores.size(0)
            ):
                mask = mask.repeat_interleave(self.num_heads, dim=0)
            # Use the most negative finite value of the dtype: it is safe for
            # half precision and, unlike -inf, a fully masked row does not
            # produce NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        return torch.softmax(scores, dim=-1)

    def forward(self, query, key, value, mask=None):
        """Compute the output of the multi-head attention layer.

        Args:
            query, key, value: Tensors of shape ``(batch, seq, d_model)``;
                ``key`` and ``value`` must share their sequence length.
            mask: Optional mask forwarded to :meth:`compute_attention`.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = query.size(0)

        query = self.split_heads(self.query_linear(query), batch_size)
        key = self.split_heads(self.key_linear(key), batch_size)
        value = self.split_heads(self.value_linear(value), batch_size)

        # Each head independently decides which positions to focus on.
        attention_weights = self.compute_attention(query, key, mask)

        # Weighted sum of the values, still one row per (sequence, head).
        output = torch.matmul(attention_weights, value)
        # Undo split_heads: concatenate the heads back into d_model features.
        output = output.view(batch_size, self.num_heads, -1, self.head_dim).permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.output_linear(output)


In [6]:
class FeedForwardSubLayer(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear.

    Adds depth to each layer so the model can learn more intricate patterns
    and representations.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expand to d_ff features, then project back to d_model.
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the two linear layers with a ReLU in between."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [7]:
# Complete the initialization of elements in the encoder layer
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    In self-attention the same ``x`` plays all three roles:
      * Query (Q): the current token that is "asking" for information.
      * Key (K): the tokens that can provide information.
      * Value (V): the actual information content of those tokens.

    Args:
        d_model: Embedding size.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the layer on ``x`` using ``mask`` for the self-attention.

        ``x`` is the sequence of token embeddings, generally of shape
        ``(batch_size, sequence_length, d_model)``.
        """
        # Self-attention: x is used as query, key and value at once, so the
        # attention scores are computed from the input itself.
        attended = self.self_attn(x, x, x, mask)
        # Residual connection: the sub-layer only has to learn how to modify
        # its input, which is easier than modelling the full signal.
        x = self.norm1(x + self.dropout(attended))
        transformed = self.feed_forward(x)
        out = self.norm2(x + self.dropout(transformed))
        return out

In [8]:
class TransformerEncoder(nn.Module):
    """Full transformer encoder: embedding, positions, ``num_layers`` layers.

    Shapes through ``forward``:
      * input ``x``: ``(batch_size, sequence_length)`` token indices;
      * after the embedding layer: ``(batch_size, sequence_length, d_model)``,
        each token index having become a dense ``d_model``-dimensional vector.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding size.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer.
        d_ff: Hidden size of each feed-forward sub-layer.
        dropout: Dropout probability used inside each layer.
        max_sequence_length: Longest sequence the positional encoder supports.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)

        # Stack of identical (but independently initialised) encoder layers.
        encoder_layers = []
        for _ in range(num_layers):
            encoder_layers.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(encoder_layers)

    def forward(self, x, mask):
        """Embed the token indices, add positions and run every layer."""
        hidden = self.positional_encoding(self.embedding(x))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden


In [9]:
import torch.nn.functional as F


class ClassifierHead(nn.Module):
    """Sequence-level classification head on top of the encoder output.

    Args:
        d_model: Size of the encoder's output embeddings.
        num_classes: Number of classes to predict.
    """

    def __init__(self, d_model, num_classes):
        super().__init__()
        # Linear layer for multi-class classification.
        self.fc = nn.Linear(in_features=d_model, out_features=num_classes)

    def forward(self, x):
        """Return log class probabilities of shape ``(batch_size, num_classes)``.

        ``x`` has shape ``(batch_size, sequence_length, d_model)``. The slice
        ``x[:, 0, :]`` keeps every sequence of the batch (first ``:``), only
        the first token (``0``) and all embedding dimensions (last ``:``),
        narrowing ``x`` down to ``(batch_size, d_model)``.
        """
        first_token = x[:, 0, :]
        logits = self.fc(first_token)
        # Log-probabilities computed from the raw outputs.
        return F.log_softmax(logits, dim=-1)

In [10]:
# Seed PyTorch's global random number generator so that the random inputs,
# masks and model weights created in the cells below are the same on every
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Hyperparameters shared by the encoder, the classifier head and the decoder.
num_classes = 3          # classes predicted by the classification head
vocab_size = 10000       # number of distinct token ids
batch_size = 8           # sequences per batch
d_model = 512            # embedding size
num_heads = 8            # attention heads per layer (must divide d_model)
num_layers = 6           # stacked encoder / decoder layers
d_ff = 2048              # hidden size of the feed-forward sub-layers
sequence_length = 256    # tokens per sequence
dropout = 0.1            # dropout probability

In [11]:
# Note: a random input sequence and a random mask are used here for
# illustration only. In practice the mask should mark the actual locations of
# the padding tokens that were added to bring every sequence to the same length.
input_sequence = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))

print(input_sequence.shape)  # (batch_size, sequence_length)
input_sequence

torch.Size([8, 256])


tensor([[9797, 3924, 4525,  ..., 6777, 9344, 4056],
        [9984,  309, 2283,  ..., 8692,  793, 5878],
        [3583, 6290, 8428,  ..., 9452, 7530, 1299],
        ...,
        [1988, 5750, 4841,  ..., 5073, 1621, 4851],
        [4950, 6813, 7399,  ..., 4631, 2544, 4798],
        [7983, 6744, 1579,  ..., 6352, 5008, 7510]])

In [12]:
# Random 0/1 mask, for illustration only: positions where the mask is 0 are
# the ones the attention layers will ignore.
mask = torch.randint(low=0, high=2, size=(sequence_length, sequence_length))

print(mask.shape)
mask

torch.Size([256, 256])


tensor([[0, 0, 0,  ..., 1, 0, 1],
        [1, 1, 1,  ..., 1, 0, 1],
        [0, 1, 0,  ..., 1, 0, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 1, 0, 0],
        [0, 1, 1,  ..., 1, 1, 0]])

In [13]:
# Instantiate the body of the encoder transformer
encoder = TransformerEncoder(
    vocab_size=vocab_size,
    d_model=d_model,
    num_layers=num_layers,
    num_heads=num_heads,
    d_ff=d_ff,
    dropout=dropout,
    max_sequence_length=sequence_length,
)

encoder

TransformerEncoder(
  (embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoder()
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardSubLayer(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

In [14]:
# Instantiate the classification head that sits on top of the encoder
classifier = ClassifierHead(d_model=d_model, num_classes=num_classes)

classifier

ClassifierHead(
  (fc): Linear(in_features=512, out_features=3, bias=True)
)

In [15]:
# Forward pass through the encoder
enc_output = encoder(input_sequence, mask=mask)

print(enc_output.shape)  # (batch_size, sequence_length, d_model)
enc_output


torch.Size([8, 256, 512])


tensor([[[-1.7435,  1.1385, -0.7223,  ...,  0.1889, -0.6962,  0.4831],
         [ 0.8105, -1.6185,  1.3498,  ..., -1.6169, -1.1257, -2.0812],
         [ 0.5484, -0.1069,  0.5720,  ..., -0.4243,  0.6431,  0.9754],
         ...,
         [ 2.3135,  0.1473,  0.4844,  ..., -1.1608, -0.8113, -0.1542],
         [ 1.6193, -0.0295, -1.8626,  ...,  0.4774, -1.1475,  0.2491],
         [-1.5673, -1.6159,  0.1447,  ...,  0.2701, -0.8520, -0.9383]],

        [[-0.8274, -0.0733,  0.5979,  ..., -1.0813,  1.0623, -1.3290],
         [ 0.4181, -1.2374, -0.2271,  ...,  0.0966,  0.0662, -2.1758],
         [ 1.6146, -0.8896, -0.6006,  ..., -0.7868,  0.0171, -0.8474],
         ...,
         [ 2.4098,  0.4992,  0.2534,  ..., -0.9127, -0.1523,  0.7399],
         [ 0.0809, -0.3021, -1.4943,  ..., -0.6124, -0.3575, -0.6111],
         [-0.4675, -1.7376,  0.4175,  ...,  0.9002, -0.2969, -0.1444]],

        [[ 0.5061,  0.2235, -1.3462,  ..., -0.5088, -0.1415,  0.5459],
         [ 0.0041,  0.5678,  0.2001,  ..., -0

In [16]:
# Classify every sequence from the encoder output of its first token
classification = classifier(enc_output)

print("Classification outputs for a batch of ", batch_size, "sequences:")
print(enc_output[:, 0, :].shape)  # (batch_size, d_model)
print(classification.shape)  # (batch_size, n_classes)
classification

Classification outputs for a batch of  8 sequences:
torch.Size([8, 512])
torch.Size([8, 3])


tensor([[-0.9858, -1.1400, -1.1808],
        [-1.6497, -1.5691, -0.5114],
        [-1.3324, -1.3797, -0.7246],
        [-1.3645, -0.7959, -1.2265],
        [-0.8527, -1.1873, -1.3142],
        [-0.7921, -1.3617, -1.2348],
        [-1.4061, -0.6102, -1.5528],
        [-1.7412, -1.2330, -0.6287]], grad_fn=<LogSoftmaxBackward0>)

In [17]:
class DecoderLayer(nn.Module):
    """One decoder layer: causal self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    The three attention flavours of an encoder-decoder transformer:
      1. Self-attention in the encoder captures dependencies within the
         input sequence (original language).
      2. Self-attention in the decoder captures dependencies within the
         generated sequence (target language).
      3. Cross-attention in the decoder brings in information from the input
         sequence to guide the generation of the output sequence.

    Args:
        d_model: Embedding size.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Causal (masked) self-attention and encoder-decoder cross-attention.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, causal_mask, encoder_output, cross_mask):
        """Run the layer on the target-side representation ``x``.

        Args:
            x: Target-side embeddings.
            causal_mask: Mask for the self-attention; it hides future tokens
                so the model cannot "cheat" by looking ahead in the target.
            encoder_output: Output of the encoder, used as keys and values
                of the cross-attention.
            cross_mask: Padding mask for the cross-attention; as in the
                encoder, it makes the attention ignore padding tokens.
        """
        # Masked self-attention over the target sequence + residual.
        self_attended = self.self_attn(x, x, x, causal_mask)
        x = self.norm1(x + self.dropout(self_attended))

        # Cross-attention: queries from the decoder, keys/values from the
        # encoder + residual.
        cross_attended = self.cross_attn(x, encoder_output, encoder_output, cross_mask)
        x = self.norm2(x + self.dropout(cross_attended))

        # Position-wise feed-forward + residual.
        transformed = self.feed_forward(x)
        return self.norm3(x + self.dropout(transformed))

In [18]:
class TransformerDecoder(nn.Module):
    """Full transformer decoder: ``num_layers`` decoder layers plus a head.

    Args:
        vocab_size: Number of distinct token ids (also the size of the
            output distribution).
        d_model: Embedding size.
        num_layers: Number of stacked decoder layers.
        num_heads: Attention heads per layer.
        d_ff: Hidden size of each feed-forward sub-layer.
        dropout: Dropout probability used inside each layer.
        max_sequence_length: Longest sequence the positional encoder supports.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)

        decoder_layers = []
        for _ in range(num_layers):
            decoder_layers.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(decoder_layers)

        # Linear head for next-word prediction: one score per vocabulary entry.
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, self_mask, encoder_output, cross_mask):
        """Return next-token log-probabilities for every position.

        The head maps the last dimension from ``d_model`` to ``vocab_size``,
        and ``log_softmax`` is then applied along that last (vocabulary)
        dimension, independently for every position of every sequence. The
        result has the same shape as the head's output.
        """
        hidden = self.positional_encoding(self.embedding(x))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, self_mask, encoder_output, cross_mask)

        # Model head followed by a normalisation over the vocabulary.
        logits = self.fc(hidden)
        return F.log_softmax(logits, dim=-1)

In [19]:
# New random token ids, this time used as the decoder input
# (this replaces the `input_sequence` that was fed to the encoder above).
input_sequence = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))

print(input_sequence.shape)
input_sequence

torch.Size([8, 256])


tensor([[5471, 3530,  568,  ..., 4662, 1914, 4709],
        [3909, 9180, 6682,  ..., 7176, 7367, 8959],
        [2504,  730, 1121,  ..., 7738, 3483, 3910],
        ...,
        [7761, 1065, 6337,  ...,  730, 1216, 4146],
        [ 975, 7459, 8713,  ..., 4435, 4343, 6762],
        [1399, 4249, 8867,  ..., 4669,  972,  599]])

In [20]:
# Step 1 of building a causal mask (small 8 x 8 example): start from all ones.
torch.ones((1, 8, 8))

tensor([[[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]]])

In [21]:
# Step 2: keep only the entries strictly above the diagonal (the "future" positions).
torch.ones(1, 8, 8).triu(diagonal=1)

tensor([[[0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0.]]])

In [22]:
# Step 3: invert it, so that 1 marks the diagonal and everything below it
# (the current and previous positions), now at full sequence length.
1 - torch.ones(1, sequence_length, sequence_length).triu(diagonal=1)

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 1., 0.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 1., 0., 0.],
         [1., 1., 1.,  ..., 1., 1., 0.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])

In [23]:
# Create a triangular attention mask for causal attention.
# It is lower-triangular: True on and below the diagonal, False above it.
self_attention_mask = torch.tril(torch.ones(1, sequence_length, sequence_length)).bool()

print(self_attention_mask.shape)
self_attention_mask  # each position may attend only to itself and to previous positions

torch.Size([1, 256, 256])


tensor([[[ True, False, False,  ..., False, False, False],
         [ True,  True, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ...,  True, False, False],
         [ True,  True,  True,  ...,  True,  True, False],
         [ True,  True,  True,  ...,  True,  True,  True]]])

In [24]:
# Instantiate the decoder transformer
decoder = TransformerDecoder(
    vocab_size=vocab_size,
    d_model=d_model,
    num_layers=num_layers,
    num_heads=num_heads,
    d_ff=d_ff,
    dropout=dropout,
    max_sequence_length=sequence_length,
)

decoder

TransformerDecoder(
  (embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoder()
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (cross_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardSubLayer(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in

In [25]:
# Mask passed to the decoder as `cross_mask` (random 0/1 values, for illustration only)
padding_mask = torch.randint(low=0, high=2, size=(sequence_length, sequence_length))
padding_mask

tensor([[1, 1, 1,  ..., 1, 0, 1],
        [0, 0, 1,  ..., 1, 1, 0],
        [1, 0, 0,  ..., 0, 0, 1],
        ...,
        [0, 1, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 0, 1],
        [1, 1, 1,  ..., 0, 1, 1]])

In [26]:
dec_output = decoder(
    input_sequence,
    self_mask=self_attention_mask,
    encoder_output=enc_output,
    cross_mask=padding_mask,
)

print(dec_output.shape)  # (batch_size, sequence_length, vocab_size)

# For each of the 256 token positions (sequence_length) in each of the 8
# sequences (batch_size), the model predicts the next token: it outputs a
# distribution, as log-probabilities, over the 10000 vocabulary tokens.

print(dec_output)

torch.Size([8, 256, 10000])
tensor([[[ -8.8514, -10.4841,  -9.6291,  ...,  -9.3091,  -9.4755, -10.2760],
         [-10.0028,  -9.9526,  -9.6459,  ...,  -9.7123,  -9.2278, -10.1058],
         [ -9.5896,  -9.9621,  -9.3631,  ..., -10.1368,  -9.4306,  -9.3334],
         ...,
         [ -9.9828,  -9.6016,  -9.4854,  ...,  -8.7922, -10.0499, -10.7145],
         [-10.3352,  -9.1532,  -8.5892,  ...,  -8.9328, -10.1256,  -9.3146],
         [-10.3416,  -9.1305,  -9.9421,  ...,  -9.1008, -10.3433,  -9.2755]],

        [[-10.0274, -10.2554, -10.3774,  ...,  -9.1231,  -9.1468,  -9.9129],
         [-10.1445, -10.1929, -10.3947,  ...,  -9.2896,  -9.2572,  -9.5090],
         [-10.1883,  -9.8544,  -9.8537,  ...,  -8.6108,  -9.1650,  -9.6822],
         ...,
         [ -8.9103,  -9.6868,  -9.4214,  ...,  -8.6500,  -9.5056,  -9.9902],
         [ -9.8133,  -9.6981,  -9.5066,  ...,  -8.6682,  -9.3694, -10.1616],
         [ -9.5896,  -9.2353,  -8.9476,  ...,  -8.3409,  -9.9228,  -9.3737]],

        [[ -9.72

In [27]:
# To summarize the above:

# The classification head applied earlier is a custom component tailored to a different task, namely classification. It is separate from the standard encoder-decoder transformer architecture.
# The classification head predicts one class for each entire sequence (sentence) rather than for each word, whereas the decoder of the encoder-decoder transformer produces an output (a distribution over the vocabulary) for every token in the sequence. This duality makes transformers versatile across different NLP tasks.