### Large language models and how they work

#### transformer architecture

Pseudocode for the algorithm:
1- initialize the model with a token embedding layer, a positional embedding layer, a stack of transformer encoder layers, and an output layer.
2- for each input sequence :
a. get token embeddings from the embeddings layer
b. create positional embeddings for every position in the sequence (the sequence length is bounded by the maximum number of tokens the model can process at once) and add them to the token embeddings
c. pass the combined embeddings through the stacked transformer encoder layers
    for each encoder layer:
    a. apply multi head self attention
    b. apply residual connection and layer normalization
    c. pass through feed forward layer
    d. apply residual connection and layer normalization
d. pass the final encoder output to the output layer to get predictions
3- return the output logits for each token position


In [11]:
import torch.nn as nn
import torch
import math

class TransformerModel(nn.Module):
    """Encoder-only transformer mapping token ids to per-position logits.

    Pipeline: token embedding (scaled by sqrt(embed_dim)) + learned
    positional embedding -> stacked transformer encoder layers -> linear
    projection back to the vocabulary size.

    Args:
        input_dim: Vocabulary size; also the size of the output logits.
        embed_dim: Width of the token/positional embeddings and the encoder.
        num_heads: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        max_len: Number of positions in the learned positional table, i.e.
            the longest sequence ``forward`` accepts.
    """

    def __init__(self, input_dim: int, embed_dim: int, num_heads: int,
                 num_layers: int, max_len: int = 5000):
        super().__init__()

        # Embedding layer for input tokens.
        self.embedding = nn.Embedding(input_dim, embed_dim)

        # Learned positional embedding: one vector per position index.
        self.positional_embedding = nn.Embedding(max_len, embed_dim)

        # One encoder layer (self-attention + feed-forward). batch_first=True
        # is required because forward() feeds (batch, seq, embed) tensors;
        # with the default (False) the encoder would read dim 0 as the
        # sequence axis and attend across unrelated batch samples.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4 * embed_dim,
            activation='relu',
            batch_first=True,
        )
        # Stack num_layers copies of the layer into a complete encoder.
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Fully connected output layer projecting back to the vocabulary.
        self.fc_out = nn.Linear(embed_dim, input_dim)

    def forward(self, x):
        """Compute logits for every token position.

        Args:
            x: Integer token ids of shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, sequence_length, input_dim).

        Raises:
            IndexError: If sequence_length exceeds the positional table size.
        """
        # Token embeddings, scaled by sqrt(embed_dim) to help stabilize their
        # values when they are first fed into the transformer layers.
        x = self.embedding(x)
        x = x * math.sqrt(self.embedding.embedding_dim)

        # Fail early with a readable message instead of an opaque embedding
        # lookup error when the sequence is longer than the positional table.
        seq_len = x.size(1)
        max_len = self.positional_embedding.num_embeddings
        if seq_len > max_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )

        # Position indices 0..seq_len-1, shaped (1, seq_len) so the positional
        # embeddings broadcast over the batch dimension.
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0)
        x = x + self.positional_embedding(positions)

        # Pass through the transformer encoder stack.
        x = self.transformer_encoder(x)

        # Project to vocabulary-sized logits.
        output = self.fc_out(x)
        return output
            
            
# Hyperparameters for the demo model
input_dim = 1000   # size of the vocabulary
embed_dim = 512    # width of each embedding vector
num_heads = 8      # attention heads per encoder layer
num_layers = 6     # stacked encoder layers
max_len = 5000     # longest sequence the positional table supports

model = TransformerModel(
    input_dim=input_dim,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    max_len=max_len,
)

# Random token ids in [0, input_dim), shaped (batch_size, sequence_length) = (32, 100)
sample_input = torch.randint(low=0, high=input_dim, size=(32, 100))

output = model(sample_input)

print(output.shape)  # expected: (batch_size, sequence_length, input_dim)

            
            
            
        
        
        
        



torch.Size([32, 100, 1000])


In [None]:
import torch
import torch.nn as nn 
# Vocabulary of 10000 unique words, each represented by a 64-dimensional vector.
input_dim = 10000   # number of distinct token ids
embed_dim = 64      # size of each embedding vector

# Lookup table with one trainable row per token id.
embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embed_dim)

# A batch of two sentences, four token ids each.
example_input = torch.tensor(
    [
        [42, 24, 35, 3],
        [51, 7, 100, 63],
    ]
)
print(example_input.shape)

# Each token id is replaced by its embedding vector: (2, 4) -> (2, 4, 64).
output = embedding_layer(example_input)

print(f'input token :{example_input}')
# print(f'embedded output:{output}')  # left disabled: prints the full embedding tensor
print(f'output shape:{output.shape}')


torch.Size([2, 4])
input token :tensor([[ 42,  24,  35,   3],
        [ 51,   7, 100,  63]])
output shape:torch.Size([2, 4, 64])
torch.Size([32, 100])
