
## <font color='blue'>LLM from scratch</font>

In [1]:
import random
random.seed(10)

In [2]:
# Imports
import re
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import *

## Loading the text data

In [3]:
# Load the text data
texts = open('text.txt', 'r').read()

In [4]:
print(texts)

'Hello, how are you? I am Camila.\n'
'Hello, Camila, my name is Fernando. Nice to meet you.\n'
'Nice to meet you too. How are you today?\n'
'Great. My soccer team won the competition.\n'
'Wow, congratulations Fernando!\n'
'Thank you, Camila.\n'
'Shall we have pizza later to celebrate?\n'
'Sure. Do you recommend any restaurant, Camila?\n'
'Yes, a new restaurant opened, and they say the banana pizza is phenomenal.\n'
'Okay. Shall we meet at the restaurant at seven in the evening?\n'
'Sounds good. See you later then.'\n'



## Text Data Preprocessing and Vocabulary Construction

In [5]:
# We filter special characters: '.', ',', '?', '!'
sentences = re.sub("[.,!?\\-]", '', texts.lower()).split('\n') 

In [6]:
print(sentences)

["'hello how are you i am camila\\n'", "'hello camila my name is fernando nice to meet you\\n'", "'nice to meet you too how are you today\\n'", "'great my soccer team won the competition\\n'", "'wow congratulations fernando\\n'", "'thank you camila\\n'", "'shall we have pizza later to celebrate\\n'", "'sure do you recommend any restaurant camila\\n'", "'yes a new restaurant opened and they say the banana pizza is phenomenal\\n'", "'okay shall we meet at the restaurant at seven in the evening\\n'", "'sounds good see you later then'\\n'", '']


In [7]:
# We split the sentences into words and create a list of words
word_list = list(set(" ".join(sentences).split()))

In [8]:
print(word_list)

['any', 'congratulations', 'too', 'fernando', 'seven', 'the', 'am', "phenomenal\\n'", "'shall", "'hello", 'name', "celebrate\\n'", 'and', 'team', 'meet', "fernando\\n'", 'at', "'sure", "then'\\n'", 'new', 'soccer', 'how', 'won', "'okay", 'camila', "'nice", "'thank", 'restaurant', 'shall', 'good', 'do', 'have', 'opened', "you\\n'", 'i', 'to', 'pizza', 'recommend', 'they', 'in', 'see', 'nice', "'great", "evening\\n'", "'yes", 'are', 'later', 'you', 'my', "today\\n'", "'sounds", "camila\\n'", 'we', 'say', 'a', 'banana', "'wow", 'is', "competition\\n'"]


In [9]:
# Initialize the word dictionary with BERT's special tokens
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}

In [10]:
print(word_dict)

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}


In [11]:
# Include the words in the dictionary and create indices
for i, w in enumerate(word_list):
    word_dict[w] = i + 4

In [12]:
print(word_dict)

{'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3, 'any': 4, 'congratulations': 5, 'too': 6, 'fernando': 7, 'seven': 8, 'the': 9, 'am': 10, "phenomenal\\n'": 11, "'shall": 12, "'hello": 13, 'name': 14, "celebrate\\n'": 15, 'and': 16, 'team': 17, 'meet': 18, "fernando\\n'": 19, 'at': 20, "'sure": 21, "then'\\n'": 22, 'new': 23, 'soccer': 24, 'how': 25, 'won': 26, "'okay": 27, 'camila': 28, "'nice": 29, "'thank": 30, 'restaurant': 31, 'shall': 32, 'good': 33, 'do': 34, 'have': 35, 'opened': 36, "you\\n'": 37, 'i': 38, 'to': 39, 'pizza': 40, 'recommend': 41, 'they': 42, 'in': 43, 'see': 44, 'nice': 45, "'great": 46, "evening\\n'": 47, "'yes": 48, 'are': 49, 'later': 50, 'you': 51, 'my': 52, "today\\n'": 53, "'sounds": 54, "camila\\n'": 55, 'we': 56, 'say': 57, 'a': 58, 'banana': 59, "'wow": 60, 'is': 61, "competition\\n'": 62}


In [13]:
# Reverse the order and put indices as keys and words as values in the dictionary
number_dict = {i: w for i, w in enumerate(word_dict)}

In [14]:
number_dict

{0: '[PAD]',
 1: '[CLS]',
 2: '[SEP]',
 3: '[MASK]',
 4: 'any',
 5: 'congratulations',
 6: 'too',
 7: 'fernando',
 8: 'seven',
 9: 'the',
 10: 'am',
 11: "phenomenal\\n'",
 12: "'shall",
 13: "'hello",
 14: 'name',
 15: "celebrate\\n'",
 16: 'and',
 17: 'team',
 18: 'meet',
 19: "fernando\\n'",
 20: 'at',
 21: "'sure",
 22: "then'\\n'",
 23: 'new',
 24: 'soccer',
 25: 'how',
 26: 'won',
 27: "'okay",
 28: 'camila',
 29: "'nice",
 30: "'thank",
 31: 'restaurant',
 32: 'shall',
 33: 'good',
 34: 'do',
 35: 'have',
 36: 'opened',
 37: "you\\n'",
 38: 'i',
 39: 'to',
 40: 'pizza',
 41: 'recommend',
 42: 'they',
 43: 'in',
 44: 'see',
 45: 'nice',
 46: "'great",
 47: "evening\\n'",
 48: "'yes",
 49: 'are',
 50: 'later',
 51: 'you',
 52: 'my',
 53: "today\\n'",
 54: "'sounds",
 55: "camila\\n'",
 56: 'we',
 57: 'say',
 58: 'a',
 59: 'banana',
 60: "'wow",
 61: 'is',
 62: "competition\\n'"}

In [15]:
# Vocabulary size
vocab_size = len(word_dict)
print(vocab_size)

63


In [16]:
# Create a list for tokens
token_list = list()

In [17]:
# Loop through the sentences to create the list of tokens
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

In [18]:
token_list

[[13, 25, 49, 51, 38, 10, 55],
 [13, 28, 52, 14, 61, 7, 45, 39, 18, 37],
 [29, 39, 18, 51, 6, 25, 49, 51, 53],
 [46, 52, 24, 17, 26, 9, 62],
 [60, 5, 19],
 [30, 51, 55],
 [12, 56, 35, 40, 50, 39, 15],
 [21, 34, 51, 41, 4, 31, 55],
 [48, 58, 23, 31, 36, 16, 42, 57, 9, 59, 40, 61, 11],
 [27, 32, 56, 18, 20, 9, 31, 20, 8, 43, 9, 47],
 [54, 33, 44, 51, 50, 22],
 []]

In [19]:
# First phrase
texts[0:33]

"'Hello, how are you? I am Camila."

In [20]:
# First sentence in token format (to be used for training the BERT model)
token_list[0]

[13, 25, 49, 51, 38, 10, 55]

## Definition of Hyperparameters

In [21]:
# Hyperparameters
batch_size = 6
n_segments = 2
dropout = 0.2

# Maximum length
maxlen = 100

# Maximum number of tokens to predict
max_pred = 7

# Number of layers
n_layers = 6

# Number of heads in multi-head attention
n_heads = 12

# Embedding size
d_model = 768

# Dimension of feedforward layer: 4 * d_model
d_ff = d_model * 4

# Dimension of K(=Q)V
d_k = d_v = 64

# Epochs
NUM_EPOCHS = 50


## Creation of Data Batches and Application of Special Tokens

The function `make_batch()` below creates batches of data for training the BERT model. It is responsible for generating the correct input required for BERT training, which includes input tokens, masked tokens, masked token positions, segment IDs, and a label indicating whether the second sentence immediately follows the first. Let's describe each part of the function and use images to aid understanding.

**Initialization**: The function starts by initializing an empty batch and counters for positive and negative sentences. Positive sentences are pairs where the second sentence immediately follows the first, while negative sentences are pairs where this does not occur. The batch must be balanced between positive and negative sentences.

**Generating Sentence Pairs**: For each instance in the batch, the function randomly selects two sentences from the dataset. Each sentence is then converted into a list of token IDs, and special tokens `[CLS]` and `[SEP]` are added at appropriate positions.

**Segment IDs**: For each sentence pair, the function generates segment IDs, where IDs are 0 for tokens in the first sentence and 1 for tokens in the second sentence.

**Masked Language Model (MLM)**: The function then randomly selects 15% of the tokens to mask for the MLM task, ensuring that `[CLS]` and `[SEP]` tokens are not masked. These tokens are replaced by the `[MASK]` token, a random token, or remain unchanged, depending on a random draw.

**Padding**: The function pads input IDs, segment IDs, masked tokens, and masked positions to ensure all lists have the same length.

**Next Sentence Prediction**: Finally, the function checks if the second sentence immediately follows the first. If yes, it adds a `True` label to the instance and increments the positive counter. If not, it adds a `False` label and increments the negative counter.

This function continues generating instances until the batch is full and contains an equal number of positive and negative instances. The batch is then returned.

Note that this function serves as an example of how data can be prepared for BERT training. Depending on the dataset and specific task, adjustments to this function may be necessary.


In [22]:
# Define the function to create data batches
def make_batch():
    
    # Initialize the batch as an empty list
    batch = []
    
    # Initialize counters for positive and negative examples
    positive = negative = 0
    
    # Continue until half of the batch is positive examples and the other half is negative examples
    while positive != batch_size/2 or negative != batch_size/2:
        
        # Choose random indices for two sentences
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        
        # Retrieve tokens corresponding to the indices
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        
        # Prepare input ids by adding special tokens [CLS] and [SEP]
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        
        # Define segment ids to differentiate the two sentences
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        # Calculate the number of predictions to be made (15% of tokens)
        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15))))
        
        # Identify candidate positions for masking that are not [CLS] or [SEP]
        cand_maked_pos = [i for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        
        # Shuffle candidate positions
        shuffle(cand_maked_pos)
        
        # Initialize lists for masked tokens and their positions
        masked_tokens, masked_pos = [], []
        
        # Mask tokens until reaching the desired number of predictions
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            
            # Random mask
            if random() < 0.8:  
                input_ids[pos] = word_dict['[MASK]'] 
            
            # Replace with another token 10% of the time (20% of the remaining time)
            elif random() < 0.5:  
                index = randint(0, vocab_size - 1) 
                input_ids[pos] = word_dict[number_dict[index]] 
        
        # Add zero padding to input ids and segment ids to reach maximum length
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        
        # Add zero padding to masked tokens and their positions if necessary
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        
        # Add to the batch as a positive example if sentences are consecutive
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) 
            positive += 1
        
        # Add to the batch as a negative example if sentences are not consecutive
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) 
            negative += 1
    
    # Return the complete batch
    return batch


In [23]:
# Padding function
def get_attn_pad_masked(seq_q, seq_k):
    
    batch_size, len_q = seq_q.size()
    
    batch_size, len_k = seq_k.size()
    
    pad_attn_masked = seq_k.data.eq(0).unsqueeze(1)
    
    return pad_attn_masked.expand(batch_size, len_q, len_k)

The function above creates an attention mask for padding tokens in a sequence.

**Inputs**: The function takes two sequences, seq_q and seq_k. These are typically the query sequence and the key sequence in an attention operation.

**Batch size extraction**: The function extracts the batch size and sequence lengths (len_q and len_k) from the input sequences' dimensions.

**Mask creation**: The attention mask is created by checking which elements in seq_k are equal to zero (indicating a padding token). This results in a boolean matrix of the same size as seq_k, where True indicates a padding token and False indicates a real token.

**Adding a dimension**: A dimension is added to the mask using the `unsqueeze(1)` method, which adds an extra dimension at index 1. This is necessary because the attention mask must match the dimensions of the attention matrices in the Transformer.

**Mask expansion**: Finally, the mask is expanded to match the size of the attention matrix, which has dimensions (batch_size, len_q, len_k). The expanded mask is returned by the function.

In summary, the function creates a mask that can be used to prevent the model from attending to padding tokens when computing attention. Padding tokens are used to pad sequences to equal lengths, but they carry no useful information, so it's important to ensure that the model ignores them.


In [24]:
# Create a batch
batch = make_batch()

In [25]:
# Extract batch elements
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [26]:
# Ids of the inputs
input_ids

tensor([[ 1, 27, 32, 56, 18, 20,  3, 31, 20,  8, 43,  9,  3,  2, 13, 25, 49, 51,
         38, 10, 55,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  3, 33, 44, 51, 50, 22,  2, 60,  5, 19,  2,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  3,  3, 49, 51, 38, 10, 55,  2, 46, 52, 24, 17, 26,  9,  3,  2,  0,
         

In [27]:
# Ids of first entry
input_ids[0]

tensor([ 1, 27, 32, 56, 18, 20,  3, 31, 20,  8, 43,  9,  3,  2, 13, 25, 49, 51,
        38, 10, 55,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [28]:
segment_ids[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

In [29]:
masked_tokens[0]

tensor([18, 47,  9,  0,  0,  0,  0])

In [30]:
masked_pos[0]

tensor([ 4, 12,  6,  0,  0,  0,  0])

In [31]:
isNext[0]

tensor(0)

In [32]:
# Apply padding function
get_attn_pad_masked(input_ids, input_ids)[0][0], input_ids[0]

(tensor([False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]),
 tensor([ 1, 27, 32, 56, 18, 20,  3, 31, 20,  8, 43,  9,  3,  2, 13, 25, 49, 51,
         38, 10, 55,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  

## Model Construction

The image below is a high-level description of the Transformer encoder. The input is a sequence of tokens, which are first embedded into vectors and then processed in the neural network. The output is a sequence of vectors of size H, where each vector corresponds to an input token with the same index.

Technically, predicting the output words requires:

- 1- Adding a classification layer on top of the encoder output.
- 2- Multiplying the output vectors by the embedding matrix, transforming them into the vocabulary dimension.
- 3- Calculating the probability of each word in the vocabulary with softmax.

The loss function in the BERT model only considers the prediction of masked values and ignores the prediction of unmasked words. As a consequence, the model converges more slowly than directional models, a characteristic compensated by its greater contextual awareness.

In the BERT training process, the model receives pairs of sentences as input and learns to predict if the second sentence in the pair is the subsequent sentence in the original document. During training, 50% of the inputs are a pair where the second sentence is the subsequent sentence in the original document, while in the other 50%, a random sentence from the corpus is chosen as the second sentence.

To help the model distinguish between the two sentences during training, the input is processed as follows before entering the model:

- 1- A [CLS] token is inserted at the beginning of the first sentence, and a [SEP] token is inserted at the end of each sentence.
- 2- A sentence embedding indicating Sentence A or Sentence B is added to each token. Sentence embeddings are similar in concept to token embeddings with a vocabulary of 2.
- 3- A positional embedding is added to each token to indicate its position in the sequence. The concept and implementation of the positional embedding are presented in the Transformer paper.

In fact, the embedding used to train the model is a combination of several embeddings.

In [33]:
# GeLU activation function
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

### 1- Embedding Module

The Embedding class below is part of the BERT architecture. Individual components of the class:

**Initialization (`def __init__(self)`):** The class constructor initializes the necessary components for embeddings.

- `self.tok_embed`: This is the token embedding layer that maps each token to a vector of dimension `d_model`.
- `self.pos_embed`: This is the positional embedding layer that maps the position of a token within a sequence to a vector of dimension `d_model`.
- `self.seg_embed`: This is the segment embedding layer that maps the type of token (0 for the first sentence and 1 for the second sentence) to a vector of dimension `d_model`.
- `self.norm`: This is the layer normalization component used to normalize the embedding vectors.

**Forward Method (`def forward(self, x, seg)`):** The forward method is where the actual embedding happens.

- First, it calculates the position of each token in the sequence.
- Next, it creates a position matrix of the same shape as the input `x` using `pos.unsqueeze(0).expand_as(x)`.
- Then, it computes the total embedding as the sum of token, position, and segment embeddings.
- Finally, it normalizes the embedding using the layer normalization and returns the result.

The combination of these three embeddings allows BERT to consider both the individual meaning of the tokens and their order in the sequence, as well as whether the token belongs to the first or the second sentence. This makes the BERT embedding very powerful and flexible.

In [34]:
# Embedding Class
class Embedding(nn.Module):
    
    # Constructor method
    def __init__(self):
        
        super(Embedding, self).__init__()
        
        # Token embedding
        self.tok_embed = nn.Embedding(vocab_size, d_model)  
        
        # Position embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  
        
        # Segment (token type) embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  
        
        # Layer normalization
        self.norm = nn.LayerNorm(d_model)

    # Forward method
    def forward(self, x, seg):
        
        seq_len = x.size(1)
        
        pos = torch.arange(seq_len, dtype=torch.long)
        
        # (seq_len,) -> (batch_size, seq_len)
        pos = pos.unsqueeze(0).expand_as(x)  
        
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        
        return self.norm(embedding)

### 2- Scaled Dot Product Attention Module

Below is the implementation of the Scaled Dot-Product Attention mechanism, which is a key part of the Transformer model used in BERT and other natural language processing models.

Here is a line-by-line explanation of the forward method:

**Scores**: The dot product of Q (query matrix) and K (key matrix) is calculated to determine the score for each key-query pair. These scores determine how much each element of the input sequence should be attended to in producing the output representation for a given element. The score is then scaled by the square root of the dimension of the keys (d_k) to prevent the dot product values from becoming too large in high-dimensional settings.

**Attention Mask**: The attention mask is applied to the scores by filling the locations where the mask has a value of 1 with a very large negative number (-1e9). This ensures that these locations receive a weight close to zero when softmax is applied.

**Softmax**: The softmax function is applied to the last axis of the scores to obtain the attention weights. This ensures that all weights are positive and sum to 1, so they can be interpreted as probabilities.

**Context**: The attention weights are then multiplied by the value matrix V to obtain the output of the attention mechanism. Each value is weighted by the amount we should "attend" to that value, as determined by the attention weights.

The method returns the context (the weighted output) and the attention matrix.

In the Transformer model, Scaled Dot-Product Attention is used multiple times in each layer, allowing the model to attend to different parts of the input while producing each element of the output. This enables the Transformer to effectively handle long-range dependencies between words in input sequences.


In [35]:
# Define the class to perform scaled dot-product attention
class ScaledDotProductAttention(nn.Module):
    
    # Initialization method
    def __init__(self):
        
        # Initialize the base class
        super(ScaledDotProductAttention, self).__init__()

    # Forward method to define the forward pass of the data
    def forward(self, Q, K, V, attn_mask):
        
        # Compute the attention scores as the product of Q and K, scaled by the key size
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        
        # Apply the attention mask to avoid attending to certain tokens
        scores.masked_fill_(attn_mask, -1e9)
        
        # Apply softmax to obtain normalized attention weights
        attn = nn.Softmax(dim=-1)(scores)
        
        # Multiply the attention weights by V to get the context
        context = torch.matmul(attn, V)
        
        # Return the context and attention weights
        return context, attn

### 3- Multi-Head Attention Module

Below is the implementation of Multi-Head Attention, which is a key component of the Transformer architecture used in models like BERT. The idea of multi-head attention is to apply scaled dot-product attention multiple times in parallel, each with different learned weights. This allows the model to focus on different positions and capture various types of information.

Let's analyze the forward method line by line:

**Initialization**: `residual` and `batch_size` are initialized with `Q` and the size of the first axis of `Q`, respectively. The `residual` will be used later for the residual connection path.

**Linear Transformations**: We apply linear transformations to the input data (Q, K, and V) using different weights. These transformations generate multiple "heads" of attention.

**Reshaping**: The outputs of these linear transformations are then reshaped and transposed to have the appropriate form for scaled dot-product attention.

**Attention Mask**: The attention mask is adjusted to match the format of the attention heads.

**Scaled Dot-Product Attention**: Scaled dot-product attention is then applied to each of the attention heads.

**Context Reshaping**: The output (context) from each attention head is then reshaped and concatenated.

**Linear Transformation and Normalization**: A linear transformation is applied to the concatenated context, followed by layer normalization.

**Residual Connection**: The final output is obtained by adding the output of the layer normalization to the residual connection path (original input Q).

Finally, the function returns the normalized output and the attention matrix. Multi-head attention allows the model to consider information from different parts of the input sequence, in different representation subspaces, simultaneously, which enhances the model's ability to capture various features of the text.

In [36]:
# Define the class to perform multi-head attention
class MultiHeadAttention(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(MultiHeadAttention, self).__init__()
        
        # Define the weight matrix for queries Q
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        
        # Define the weight matrix for keys K
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        
        # Define the weight matrix for values V
        self.W_V = nn.Linear(d_model, d_v * n_heads)

    # Forward method to define the forward pass of the data
    def forward(self, Q, K, V, attn_mask):
        
        # Save the input Q for residual connection and get the batch size
        residual, batch_size = Q, Q.size(0)
        
        # Process Q through W_Q and reshape to have [n_heads] on the second dimension
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        
        # Process K through W_K and reshape to have [n_heads] on the second dimension
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        
        # Process V through W_V and reshape to have [n_heads] on the second dimension
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)
        
        # Adjust attn_mask to be compatible with the dimensions of q_s, k_s, v_s
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        
        # Calculate scaled dot-product attention and context for each attention head
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        
        # Reshape context to combine the attention heads and return to the original format
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        
        # Apply a linear transformation to the combined context
        output = nn.Linear(n_heads * d_v, d_model)(context)
        
        # Normalize the output layer and add the residual
        return nn.LayerNorm(d_model)(output + residual), attn

In [37]:
# Create the Embedding object
emb = Embedding()

In [38]:
# Generate Embeddings
embeds = emb(input_ids, segment_ids)

In [39]:
# Generate an attention mask
attenM = get_attn_pad_masked(input_ids, input_ids)

In [40]:
# Generate the MultiHeadAttention
MHA = MultiHeadAttention()(embeds, embeds, embeds, attenM)

In [41]:
# Output
output, A = MHA

In [42]:
A[0][0]

tensor([[0.0388, 0.0563, 0.0715,  ..., 0.0000, 0.0000, 0.0000],
        [0.0545, 0.0541, 0.0661,  ..., 0.0000, 0.0000, 0.0000],
        [0.0617, 0.0449, 0.0568,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0478, 0.0441, 0.0355,  ..., 0.0000, 0.0000, 0.0000],
        [0.0456, 0.0409, 0.0503,  ..., 0.0000, 0.0000, 0.0000],
        [0.0528, 0.0445, 0.0426,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<SelectBackward0>)

### 4- Positional Feedforward Module

This is the implementation of the Positional Feedforward Network (PoswiseFeedForward), which is a component of the Transformer architecture used in models like BERT.

The Positional Feedforward Network consists of two linear layers with a GELU (Gaussian Error Linear Unit) activation in between.

Here is a detailed explanation of the forward method:

**First Linear Layer (self.fc1)**: The input x is passed through a linear layer (also known as a fully connected layer). This layer performs a linear transformation with d_model inputs and d_ff outputs, where d_model is the embedding space dimension and d_ff is the hidden layer dimension of the feedforward network. This allows the model to learn nonlinear representations.

**GELU Activation**: Next, the GELU activation is applied. The GELU function allows the model to learn more complex and nonlinear transformations. It helps address the vanishing gradient problem by enabling more information to pass through the network.

**Second Linear Layer (self.fc2)**: Finally, the output of the GELU activation is passed through a second linear layer, which transforms the output back to the original dimension d_model. This is done so that the output of this feedforward network can be added to the original input (residual connection) in the Transformer.

The function returns, therefore, the output of this second linear layer, which has gone through the transformation of the first linear layer, GELU activation, and the second linear layer.

Positional feedforward networks are an important part of Transformer models, allowing them to learn more complex representations and make nonlinear transformations of the input data.

In [43]:
# Define the class for the Positional Feed Forward network
class PoswiseFeedForward(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(PoswiseFeedForward, self).__init__()
        
        # First linear layer that increases the dimension of the data from d_model to d_ff
        self.fc1 = nn.Linear(d_model, d_ff)
        
        # Second linear layer that reduces the dimension back from d_ff to d_model
        self.fc2 = nn.Linear(d_ff, d_model)

    # Forward method to define the forward pass of the data
    def forward(self, x):
        
        # Apply the first linear transformation, followed by the GELU activation function 
        # and then the second linear transformation
        return self.fc2(gelu(self.fc1(x)))

### 5- Encoder Layer Module

This class defines an Encoder Layer, which is a component of the Transformer architecture and is also used in models like BERT. Each encoder layer in the Transformer contains two sub-layers: a Multi-Head Attention layer and a Positional Feed-Forward Network.

Here is a detailed explanation of the forward method:

**Multi-Head Attention (self.enc_self_attn)**: The input `enc_inputs` passes through a Multi-Head Attention layer, which allows each word in the input to attend to all other words. This layer also receives a mask (`enc_self_attn_mask`) used to prevent the model from attending to certain words (such as padding tokens). The output of the Multi-Head Attention is another sequence of vector representations with the same dimension as the input. The attention matrix, showing how each word attended to all the others, is also returned.

**Positional Feed-Forward Network (self.pos_ffn)**: The output of the Multi-Head Attention layer is then passed through a Positional Feed-Forward Network. This is a simple neural network that operates independently at each position in the sequence (i.e., the same network is applied to each position). This allows the model to learn more complex representations and perform nonlinear transformations of the data.

The function returns the output of this encoder layer, which is the output of the Positional Feed-Forward Network, along with the attention matrix. Thus, the input and output of this encoder layer have the same dimension, allowing multiple encoder layers to be stacked to form the complete Transformer encoder.

In [44]:
# Define the class for the encoder layer
class EncoderLayer(nn.Module):
    
    def __init__(self) -> None:
        
        # Initialize the base class
        super(EncoderLayer, self).__init__()
        
        # Instantiate the multi-head attention for encoder self-attention
        self.enc_self_attn = MultiHeadAttention()
        
        # Instantiate the positional feed-forward network to use after self-attention
        self.pos_ffn = PoswiseFeedForward()

    # Forward method to define the forward pass of the data
    def forward(self, enc_inputs, enc_self_attn_mask):
        
        # Apply self-attention to the input data
        enc_inputs, atnn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        
        # After self-attention, pass the result through the positional feed-forward network
        enc_inputs = self.pos_ffn(enc_inputs)
        
        # Return the output of the encoder and the attention weights
        return enc_inputs, atnn

### 6- Final Architecture of the LLM (BERT Model)

This class defines the BERT (Bidirectional Encoder Representations from Transformers) model, a state-of-the-art language model that uses transformers and bidirectional attention to understand the semantics of words within context.

Let's break down the forward method in detail:

**Embedding (self.embedding)**: Transforms the inputs (`input_ids` and `segment_ids`) into dense vectors (embeddings).

**Attention Mask (get_attn_pad_masked)**: Generates an attention mask to ignore padding tokens in the inputs.

**Encoder Layers (self.layers)**: Passes the output from the embedding and the attention mask through multiple encoder layers. Each encoder layer consists of a multi-head attention layer and a positional feed-forward network.

**Pooling (self.activ1(self.fc(output[:, 0]))): Applies a fully connected layer and a hyperbolic tangent activation function to the first position (the classification token) of each sequence in the encoder output. This results in a sequence representation vector.

**Classifier (self.classifier)**: A fully connected layer that generates logits for the next-sentence classification task.

**Masked Token Extraction (torch.gather(output, 1, masked_pos))**: Selects the output vectors corresponding to the masked tokens.

**Masked Token Transformation (self.norm(self.activ2(self.linear(h_masked))))**: Applies a linear transformation, GELU activation, and normalization to the output of the masked tokens.

**Decoder (self.decoder)**: A linear layer that generates logits for the masked language modeling task. It uses the same weights as the token embedding layer for consistency in the representation space. This decoder function is used solely to generate the final logits and is not used in the model’s training process.

The method returns the logits for the masked language modeling task and the next-sentence classification task. These logits can then be used to calculate losses for both tasks during training.

In [45]:
# BERT Model
class BERT(nn.Module):
    
    def __init__(self) -> None:
        
        super(BERT, self).__init__()
        
        self.embedding = Embedding()
        
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        
        self.fc = nn.Linear(d_model, d_model)
        
        self.activ1 = nn.Tanh()
        
        self.linear = nn.Linear(d_model, d_model)
        
        self.activ2 = gelu
        
        self.norm = nn.LayerNorm(d_model)
        
        self.classifier = nn.Linear(d_model, 2)
        
        embed_weight = self.embedding.tok_embed.weight
        
        n_vocab, n_dim = embed_weight.size()
        
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        
        self.decoder.weight = embed_weight
        
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        
        output = self.embedding(input_ids, segment_ids)
        
        enc_self_attn_mask = get_attn_pad_masked(input_ids, input_ids)
        
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        
        h_pooled = self.activ1(self.fc(output[:, 0]))
        
        logits_clsf = self.classifier(h_pooled)
        
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        
        h_masked = torch.gather(output, 1, masked_pos)
        
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        
        logits_lm = self.decoder(h_masked) + self.decoder_bias
        
        return logits_lm, logits_clsf

## Treinamento e Avaliação do LLM

In [46]:
# Create the model
modelo_dsa = BERT()

In [47]:
# Error function
criterion = nn.CrossEntropyLoss()

In [48]:
# Optimizer
optimizer = optim.Adam(modelo_dsa.parameters(), lr = 0.001)

In [49]:
batch = make_batch()

In [50]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

Here is a breakdown of the typical training loop for one epoch in a machine learning model:

**`optimizer.zero_grad()`**: Zeros the gradients of all optimized variables. This is necessary because gradients in PyTorch are accumulated, meaning each time `.backward()` is called, gradients are added to the existing ones rather than being replaced. Thus, we need to clear these accumulated gradients before each optimization step.

**`logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)`**: Feeds the input data into the model and obtains the model's output. The output consists of `logits_lm` and `logits_clsf`, which are the raw, unnormalized results for the language modeling task and the classification task, respectively.

**`loss_lm = criterion(logits_lm.transpose(1,2), masked_tokens)`**: Computes the loss for the masked language modeling task. `criterion` is the loss function, `logits_lm.transpose(1,2)` are the model's predictions, and `masked_tokens` are the true targets.

**`loss_lm = (loss_lm.float()).mean()`**: Converts the loss to a floating-point type (if it isn’t already) and then calculates the mean of the loss.

**`loss_clsf = criterion(logits_clsf, isNext)`**: Computes the loss for the next sentence classification task.

**`loss = loss_lm + loss_clsf`**: Combines the two losses into a single scalar loss.

**`loss.backward()`**: Computes the gradients of all optimized variables with respect to the loss. These gradients are calculated with respect to the combined loss.

**`optimizer.step()`**: Updates the model parameters using the computed gradients.

These steps are repeated for each epoch of training. Each epoch represents a complete cycle through the training dataset. Therefore, if `NUM_EPOCHS` is 10, the entire training process is executed 10 times.

In [51]:
%%time

# Start the training loop for a defined number of epochs
for epoch in range(NUM_EPOCHS):
    
    # Zero the gradients of the optimizer to prevent accumulation from previous epochs
    optimizer.zero_grad()
    
    # Pass the input data through the model and get logits for language masking 
    # and next sentence classification
    logits_lm, logits_clsf = modelo_dsa(input_ids, segment_ids, masked_pos)
    
    # Compute the loss for the language masking task by comparing the predicted logits 
    # with the actual masked tokens
    loss_lm = criterion(logits_lm.transpose(1,2), masked_tokens)
    
    # Compute the mean of the loss for normalization
    loss_lm = (loss_lm.float()).mean()
    
    # Compute the loss for the next sentence classification task
    loss_clsf = criterion(logits_clsf, isNext)
    
    # Combine the losses from both tasks to get the total loss
    loss = loss_lm + loss_clsf
    
    # Print the current epoch and total loss
    print(f'Epoch: {epoch + 1} | Loss {loss:.4f}')
    
    # Perform backpropagation to compute gradients
    loss.backward()
    
    # Update the model parameters based on the calculated gradients
    optimizer.step()

Epoch: 1 | Loss 85.6257
Epoch: 2 | Loss 103.3067
Epoch: 3 | Loss 374.3922
Epoch: 4 | Loss 83.0070
Epoch: 5 | Loss 127.3027
Epoch: 6 | Loss 56.7147
Epoch: 7 | Loss 77.9815
Epoch: 8 | Loss 80.3260
Epoch: 9 | Loss 66.5356
Epoch: 10 | Loss 45.4595
Epoch: 11 | Loss 31.6981
Epoch: 12 | Loss 37.3686
Epoch: 13 | Loss 37.8565
Epoch: 14 | Loss 35.4382
Epoch: 15 | Loss 39.5302
Epoch: 16 | Loss 37.2350
Epoch: 17 | Loss 32.1279
Epoch: 18 | Loss 33.0896
Epoch: 19 | Loss 32.7537
Epoch: 20 | Loss 33.2081
Epoch: 21 | Loss 33.5253
Epoch: 22 | Loss 34.1890
Epoch: 23 | Loss 32.7589
Epoch: 24 | Loss 31.0161
Epoch: 25 | Loss 28.2764
Epoch: 26 | Loss 26.2311
Epoch: 27 | Loss 29.8080
Epoch: 28 | Loss 29.9723
Epoch: 29 | Loss 26.2666
Epoch: 30 | Loss 21.8840
Epoch: 31 | Loss 22.9788
Epoch: 32 | Loss 22.6751
Epoch: 33 | Loss 21.4630
Epoch: 34 | Loss 21.1502
Epoch: 35 | Loss 20.3271
Epoch: 36 | Loss 20.1865
Epoch: 37 | Loss 19.4822
Epoch: 38 | Loss 20.1860
Epoch: 39 | Loss 18.7797
Epoch: 40 | Loss 18.2321
Epoch:

## Extracting Predictions from the Trained LLM

In [52]:
# Extrai o batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))
print(texts)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

'Hello, how are you? I am Camila.\n'
'Hello, Camila, my name is Fernando. Nice to meet you.\n'
'Nice to meet you too. How are you today?\n'
'Great. My soccer team won the competition.\n'
'Wow, congratulations Fernando!\n'
'Thank you, Camila.\n'
'Shall we have pizza later to celebrate?\n'
'Sure. Do you recommend any restaurant, Camila?\n'
'Yes, a new restaurant opened, and they say the banana pizza is phenomenal.\n'
'Okay. Shall we meet at the restaurant at seven in the evening?\n'
'Sounds good. See you later then.'\n'

['[CLS]', "'hello", 'camila', '[MASK]', 'name', 'is', 'fernando', 'nice', 'to', 'meet', "you\\n'", '[SEP]', "'sure", 'do', 'you', 'recommend', '[MASK]', 'restaurant', "camila\\n'", '[SEP]']


In [53]:
# Extract Token Predictions
logits_lm, logits_clsf = modelo_dsa(input_ids, segment_ids, masked_pos)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('Real Masket Tokens List: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])
print('Predicted Masked Tokens List: ', [pos for pos in logits_lm if pos != 0])

Real Masket Tokens List:  [13, 52, 4]
Predicted Masked TOkens List:  []


In [54]:
# Extract the predictons of the next token
logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext (Real Value): ', True if isNext else False)
print('isNext (Predicted Value): ', True if logits_clsf else False)

isNext (Real Value):  False
isNext (Predicted Value):  False


# End