<a href="https://colab.research.google.com/github/azzouzioussama/LLM/blob/main/LLM_LLaMA_fromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

In [Customize building LLM in PyTorch](https://www.kaggle.com/code/aisuko/customize-building-llm-in-pytorch), we covered the essential concepts. Here we will extend our basic model toward the LLaMA architecture.

**Note: All pictures are from the internet or from the source listed in the Credit section below.**

LLaMA introduces three architectural modifications to the original Transformer:
- RMSNorm for pre-normalization
- Rotary embeddings
- SwiGLU activation function


<div style="text-align: center"><img src="https://files.mastodon.social/media_attachments/files/111/615/341/791/154/468/small/847f74b69b31f5a1.webp" width="60%" height="80%" alt="encoder and decoder in transformers"></div>

In [None]:
import torch

from torch import nn
from torch.nn import functional as F

import numpy as np

from matplotlib import pyplot as plt

import time

import pandas as pd

import urllib.request

# The basic model

In [None]:
# Download the Tiny Shakespeare corpus that serves as the training text.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

file_name = "tinyshakespeare.txt"

urllib.request.urlretrieve(url, file_name)

# Read the whole corpus; the context manager closes the file handle promptly
# and the explicit encoding keeps the result independent of the platform default.
with open(file_name, 'r', encoding='utf-8') as f:
    lines = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
vocab = sorted(set(lines))

In [None]:
# Baseline hyperparameters shared by the helpers and models in this notebook.
# Later cells extend this dict in place via DEFAULT_CONFIG.update(...).
DEFAULT_CONFIG={
    'd_model':128,  # width of each token embedding vector
    'vocab_size':len(vocab),  # number of distinct characters in the corpus
    'batch_size':8,  # sequences sampled per batch
    'context_window':16  # number of characters in each input sequence
}

In [None]:
# Lookup tables between characters and their integer ids (index in `vocab`).
itos = dict(enumerate(vocab))

stoi = {symbol: token_id for token_id, symbol in itos.items()}


def encode(s):
    """Return the list of integer ids for the characters of `s`."""
    return [stoi[symbol] for symbol in s]


def decode(l):
    """Return the string spelled by the integer ids in `l`."""
    symbols = [itos[token_id] for token_id in l]
    return ''.join(symbols)


# Round-trip sanity check (displayed as the cell output).
decode(encode("morning"))

In [None]:
# Encode the whole corpus as a single 1-D tensor of character ids.
# NOTE(review): int8 tops out at 127, so this assumes len(vocab) <= 128 — confirm if the corpus changes.
dataset=torch.tensor(encode(lines), dtype=torch.int8)
dataset.shape

In [None]:
def get_batches(data, split, batch_size, context_window, config=DEFAULT_CONFIG):
    """Sample a random batch of (input, target) windows from one split of `data`.

    `data` is cut 80% / 10% / 10% into train / val / test. `split` selects
    'val' or 'test'; any other value falls back to the training portion.
    Each target window is its input window shifted one position to the right.

    Args:
        data: 1-D tensor of token ids.
        split: name of the split to sample from.
        batch_size: number of windows to draw.
        context_window: length of each window.
        config: accepted for interface compatibility; not read here.

    Returns:
        Tuple `(x, y)` of int64 tensors, each of shape (batch_size, context_window).
    """
    n_total = len(data)
    train_end = int(.8 * n_total)
    val_end = int(.9 * n_total)

    # Select the slice of the corpus that backs the requested split
    if split == 'test':
        source = data[val_end:]
    elif split == 'val':
        source = data[train_end:val_end]
    else:
        source = data[:train_end]

    # Random window start offsets, leaving room for the shifted target window
    starts = torch.randint(0, source.size(0) - context_window - 1, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + context_window])
        targets.append(source[start + 1:start + context_window + 1])

    return torch.stack(inputs).long(), torch.stack(targets).long()

In [None]:
# Draw one training batch and decode it to eyeball the input/target pairs.
xs, ys = get_batches(
    dataset,
    'train',
    DEFAULT_CONFIG['batch_size'],
    DEFAULT_CONFIG['context_window'],
)

decoded_samples = [
    (decode(x_row.tolist()), decode(y_row.tolist()))
    for x_row, y_row in zip(xs, ys)
]

decoded_samples

In [None]:
@torch.no_grad()
def evaluate_loss(model, config=DEFAULT_CONFIG):
    """Estimate the mean loss of `model` on the train and val splits.

    The model is switched to eval mode, scored on 10 random batches per split
    drawn from the module-level `dataset`, and switched back to train mode.

    Args:
        model: callable as `model(x, y)` returning a `(logits, loss)` pair.
        config: dict providing 'batch_size' and 'context_window'.

    Returns:
        Dict mapping "train" and "val" to the mean loss over the sampled batches.
    """
    n_batches = 10
    results = {}

    model.eval()

    for split in ("train", "val"):
        batch_losses = []

        for _ in range(n_batches):
            xb, yb = get_batches(dataset, split, config['batch_size'], config['context_window'])
            _logits, batch_loss = model(xb, yb)
            batch_losses.append(batch_loss.item())

        results[split] = np.mean(batch_losses)

    # Leave the model in training mode for the caller
    model.train()

    return results

# RMSNorm for pre-normalization

We define an RMSNorm class. During initialization, it registers a learnable scale parameter.

In the forward pass, it calculates the **Frobenius norm** of the input tensor and then normalizes the tensor. Finally, the tensor is scaled by the registered scale parameter. This function is designed for use in LLaMA to replace the LayerNorm operation.

In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square normalization with a learnable element-wise scale.

    Every sample in the batch is divided by the RMS of all of its
    (seq_len, d_model) entries and then multiplied element-wise by `scale`.

    Args:
        layer_shape: shape of the `scale` parameter. `forward` slices it on two
            axes, so pass a `(context_window, d_model)` tuple.
        eps: small constant added to the RMS so that an all-zero sample does not
            produce a 0/0 division (NaN).
        bias: accepted for interface compatibility; not used.
    """

    def __init__(self, layer_shape, eps=1e-8, bias=False):
        super().__init__()
        self.eps = eps

        # Learnable scale, initialised to ones so the layer starts as a pure normalizer
        self.scale = nn.Parameter(torch.ones(layer_shape))

    def forward(self, x):
        """
        Assumes shape is (batch, seq_len, d_model)
        """
        # RMS = Frobenius norm / sqrt(N), with N the number of entries per sample
        n_entries = x.shape[1] * x.shape[2]
        ff_rms = torch.linalg.norm(x, dim=(1, 2)) * n_entries ** -.5

        # Normalize each sample by its own RMS; eps keeps the denominator non-zero
        raw = x / (ff_rms + self.eps).unsqueeze(-1).unsqueeze(-1)

        # Slice the scale so sequences shorter than the configured window still work
        return self.scale[:x.shape[1], :].unsqueeze(0) * raw

In [None]:
class SimpleModel_RMS(nn.Module):
    """Character-level baseline: embedding -> RMSNorm -> two-layer MLP.

    Args:
        config: dict providing 'vocab_size', 'd_model' and 'context_window'.
    """

    def __init__(self, config=DEFAULT_CONFIG):
        super().__init__()
        self.config = config

        # Embedding layer to convert character indices to vectors
        self.embedding = nn.Embedding(config['vocab_size'], config['d_model'])

        # RMSNorm layer for pre-normalization. The shape must be passed as ONE tuple:
        # a second positional argument would be taken as `eps`.
        self.rms = RMSNorm((config['context_window'], config['d_model']))

        # Linear layers for modeling relationships between features
        self.linear = nn.Sequential(
            nn.Linear(config['d_model'], config['d_model']),
            nn.ReLU(),
            nn.Linear(config['d_model'], config['vocab_size']),
        )

    def forward(self, idx, targets=None):
        """Return logits, or `(logits, loss)` when `targets` is given.

        Args:
            idx: tensor of token ids fed to the embedding layer.
            targets: optional tensor of target token ids; when given, the
                cross-entropy loss over all positions is returned as well.
        """
        # Embedding layer converts character indices to vectors
        x = self.embedding(idx)

        # Pre-normalize the embeddings before the feed-forward layers
        x = self.rms(x)

        # Linear layers for modeling relationships between features
        logits = self.linear(x)

        if targets is None:
            return logits

        loss = F.cross_entropy(logits.view(-1, self.config['vocab_size']), targets.view(-1))
        return logits, loss

Let's run the modified NN model with RMSNorm and observe the updated number of parameters in the model, along with the loss:

In [None]:
# Extend the shared config in place with the training-loop settings.
DEFAULT_CONFIG.update({
    'epochs': 1000,  # number of optimisation steps run by train()
    'log_interval':10,  # train() evaluates the loss every this many steps
    'batch_size':32,  # overrides the earlier batch size of 8
})

# Build the model from the updated config
model=SimpleModel_RMS(DEFAULT_CONFIG)

# Draw one training batch
xs, ys=get_batches(dataset, 'train', DEFAULT_CONFIG['batch_size'], DEFAULT_CONFIG['context_window'])

# Single forward pass to check that the model runs end to end
logits, loss=model(xs, ys)

# Adam with its default settings
optimizer=torch.optim.Adam(model.parameters())

In [None]:
def train(model, optimizer, scheduler=None, config=DEFAULT_CONFIG, print_logs=False):
    """Run the training loop and plot the recorded train/val losses.

    Each "epoch" is a single optimisation step on one random training batch
    drawn from the module-level `dataset`.

    Args:
        model: callable as `model(x, targets=y)` returning `(logits, loss)`.
        optimizer: optimizer over the parameters of `model`.
        scheduler: optional learning-rate scheduler, stepped once per epoch.
        config: dict providing 'epochs', 'log_interval', 'batch_size' and
            'context_window'.
        print_logs: when True, print a progress line at every logging step.

    Returns:
        The object returned by `pandas.DataFrame.plot()` for the recorded
        losses, or None when no epoch was run.
    """
    losses = []

    start_time = time.time()

    for epoch in range(config['epochs']):
        optimizer.zero_grad()

        xs, ys = get_batches(dataset, 'train', config['batch_size'], config['context_window'])

        _, loss = model(xs, targets=ys)
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        if epoch % config['log_interval'] == 0:
            batch_time = time.time() - start_time

            # Evaluate with the same config the loop trains with, not the module default
            x = evaluate_loss(model, config)

            losses.append(x)

            if print_logs:
                print(f"Epoch {epoch} | val loss {x['val']:.3f}|Time {batch_time:.3f}|ETA in seconds{batch_time*(config['epochs']-epoch)/config['log_interval']:.3f}")

            start_time = time.time()

            if scheduler is not None:
                # get_last_lr() is the supported accessor; get_lr() warns when
                # called outside of scheduler.step()
                print("lr: ", scheduler.get_last_lr())

    # With zero epochs nothing was evaluated, so there is nothing to report or plot
    if not losses:
        return None

    print("Validation loss:", losses[-1]['val'])

    return pd.DataFrame(losses).plot()

# Train the SimpleModel_RMS instance using the settings in DEFAULT_CONFIG.
train(model, optimizer)

As we can see, the validation loss decreases slightly.


# Rotary Embeddings

Next, we will implement rotary positional embeddings (RoPE). In RoPE, we encode the position of a token in a sequence by rotating its embedding, applying a different rotation at each position. Let's create a function that mirrors the RoPE implementation described in the paper:

In [None]:
def get_rotary_matrix(context_window, embedding_dim):
    """Build the RoPE rotation matrices for every position in the context window.

    For position `m` and 0-based pair index `i`, the 2x2 block at rows/columns
    `(2i, 2i+1)` is the rotation by angle `m * theta_i`, where
    `theta_i = 10000 ** (-2 * i / embedding_dim)`. This is the paper's
    `10000 ** (-2 * (i - 1) / d)` for 1-based `i`, so the first pair rotates
    with `theta = 1`.

    Args:
        context_window: number of positions to generate matrices for.
        embedding_dim: size of the embedding; dimensions are rotated in pairs,
            so with an odd size the last row/column stays zero.

    Returns:
        Float tensor of shape (context_window, embedding_dim, embedding_dim)
        that does not require gradients.
    """
    # Initialize a tensor for the rotary matrix with zeros
    R = torch.zeros((context_window, embedding_dim, embedding_dim), requires_grad=False)

    # The per-pair frequencies do not depend on the position: compute them once
    thetas = [10000. ** (-2. * i / embedding_dim) for i in range(embedding_dim // 2)]

    # Loop through each position in the context window
    for position in range(context_window):
        # Loop through each pair of dimensions in the embedding
        for i, theta in enumerate(thetas):
            m_theta = position * theta
            cos_m, sin_m = np.cos(m_theta), np.sin(m_theta)
            # Fill in the 2x2 rotation block for this pair
            R[position, 2 * i, 2 * i] = cos_m
            R[position, 2 * i, 2 * i + 1] = -sin_m
            R[position, 2 * i + 1, 2 * i] = sin_m
            R[position, 2 * i + 1, 2 * i + 1] = cos_m
    return R

We generate a rotary matrix based on the specified context window and embedding dimension, following the proposed RoPE implementation.


## Masked Attention Head

Attention heads are a core part of the transformer architecture.

We need to create attention heads when replicating LLaMA. Let's first create a single **masked attention head** that returns attention weights.

In [None]:
class RoPEMaskedAttentionHead(nn.Module):
    """Single causal self-attention head with rotary positional embeddings.

    Queries and keys are rotated by position-dependent matrices before the
    scaled dot-product attention; values are left unrotated.

    Args:
        config: dict providing 'd_model' and 'context_window'.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Linear transformation for query
        self.w_q = nn.Linear(config['d_model'], config['d_model'], bias=False)
        # Linear transformation for key
        self.w_k = nn.Linear(config['d_model'], config['d_model'], bias=False)
        # Linear transformation for value
        self.w_v = nn.Linear(config['d_model'], config['d_model'], bias=False)
        # Rotary matrices for positional embeddings. Registered as a non-persistent
        # buffer so they follow .to(device) without being trained or added to the
        # state dict.
        self.register_buffer(
            'R',
            get_rotary_matrix(config['context_window'], config['d_model']),
            persistent=False,
        )

    @staticmethod
    def get_rotary_matrix(context_window, embedding_dim):
        """Return the rotary matrices from the module-level `get_rotary_matrix`.

        Kept so that existing class-level calls keep working; the module-level
        function is the single implementation.
        """
        return get_rotary_matrix(context_window, embedding_dim)

    def forward(self, x, return_attn_weights=False):
        """Apply causal attention to `x`.

        Args:
            x: input tensor of shape (batch, sequence length, dimension).
            return_attn_weights: when True, also return the attention weights.

        Returns:
            The attended activations, with the same shape as `x`; or the pair
            `(activations, attn_weights)` where `attn_weights` has shape
            (batch, sequence length, sequence length).
        """
        _, m, d = x.shape  # batch size (unused), sequence length, dimension

        # Linear transformations for Q, K and V
        q = self.w_q(x)
        k = self.w_k(x)
        v = self.w_v(x)

        # Rotate Q and K using the RoPE matrix: bmm batches over positions, hence the transposes
        rotations = self.R[:m]
        q_rotated = torch.bmm(q.transpose(0, 1), rotations).transpose(0, 1)
        k_rotated = torch.bmm(k.transpose(0, 1), rotations).transpose(0, 1)

        # The functional API applies dropout whenever dropout_p > 0, regardless of
        # the module mode, so it has to be switched off explicitly for evaluation.
        activations = F.scaled_dot_product_attention(
            q_rotated,
            k_rotated,
            v,
            dropout_p=0.1 if self.training else 0.0,
            is_causal=True,
        )

        if not return_attn_weights:
            return activations

        # True above the diagonal: positions a query is not allowed to attend to
        future = torch.triu(
            torch.ones((m, m), dtype=torch.bool, device=x.device), diagonal=1
        )
        scores = torch.bmm(q_rotated, k_rotated.transpose(1, 2)) / d ** 0.5
        # -inf scores become exactly zero weight after the softmax
        attn_weights = F.softmax(scores.masked_fill(future, float('-inf')), dim=-1)
        return activations, attn_weights

## Multi-Head attention mechanism

In [None]:
class RoPEMaskedMultiheadAttention(nn.Module):
    """Multi-head attention built from independent RoPE attention heads.

    Every head maps the input to a d_model-wide output; the head outputs are
    concatenated on the feature axis, projected back to d_model and passed
    through dropout.

    Args:
        config: dict providing 'n_heads' and 'd_model' (plus whatever
            RoPEMaskedAttentionHead reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        n_heads, d_model = config['n_heads'], config['d_model']

        # One independent attention head per configured head
        head_modules = []
        for _ in range(n_heads):
            head_modules.append(RoPEMaskedAttentionHead(config))
        self.heads = nn.ModuleList(head_modules)

        # Projection from the concatenated heads back to the model width
        self.linear = nn.Linear(n_heads * d_model, d_model)
        self.dropout = nn.Dropout(.1)

    def forward(self, x):
        """Return the multi-head attention output for `x` of shape (batch, sequence length, dimension)."""
        # Run every head on the same input and join the outputs on the feature axis
        concatenated = torch.cat([head(x) for head in self.heads], dim=-1)

        # Project back to d_model, then apply dropout
        return self.dropout(self.linear(concatenated))

The original paper used 32 heads for their smaller 7b LLM variation, but due to constraints, we will use 8 heads for our approach

In [None]:
# Number of attention heads read by RoPEMaskedMultiheadAttention.
DEFAULT_CONFIG.update({
    'n_heads':8
})

Let's update the previous RMSNorm model by adding:

- Rotational Embedding
- Multi-Head-attention

In [None]:
class RopeModel(nn.Module):
    """Character-level model: embedding, one RoPE attention block, feed-forward, output head.

    The same RMSNorm layer is applied before the attention sub-layer and
    before the feed-forward sub-layer; both sub-layers use residual connections.

    Args:
        config: dict providing 'vocab_size', 'd_model' and 'context_window'
            (plus whatever RoPEMaskedMultiheadAttention reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Embedding layer for input tokens
        self.embedding = nn.Embedding(config['vocab_size'], config['d_model'])

        # RMSNorm layer for pre-normalization; the shape must be passed as a single tuple
        self.rms = RMSNorm((config['context_window'], config['d_model']))

        # RoPEMaskedMultiheadAttention layer
        self.rope_attention = RoPEMaskedMultiheadAttention(config)

        # Linear layer followed by ReLU activation
        self.linear = nn.Sequential(
            nn.Linear(config['d_model'], config['d_model']),
            nn.ReLU(),
        )

        # Final linear layer for prediction (created once; it used to be built twice)
        self.last_linear = nn.Linear(config['d_model'], config['vocab_size'])

        print("Model params:", sum(p.numel() for p in self.parameters()))

    def forward(self, idx, targets=None):
        """Return logits, or `(logits, loss)` when `targets` is given.

        Args:
            idx: tensor of input token ids.
            targets: optional tensor of target token ids; when given, the
                cross-entropy loss over all positions is returned as well.
        """
        x = self.embedding(idx)

        # One block of attention with RMS pre-normalization and a residual connection
        x = self.rms(x)
        x = x + self.rope_attention(x)

        # Feed-forward sub-layer, again pre-normalized and residual
        x = self.rms(x)
        x = x + self.linear(x)

        logits = self.last_linear(x)

        if targets is None:
            return logits

        loss = F.cross_entropy(logits.view(-1, self.config['vocab_size']), targets.view(-1))
        return logits, loss


In [None]:
# Build the RoPE model; its constructor prints the parameter count.
model=RopeModel(DEFAULT_CONFIG)

# Draw one training batch
xs, ys=get_batches(dataset, 'train', DEFAULT_CONFIG['batch_size'], DEFAULT_CONFIG['context_window'])

# Single forward pass to check that the model runs end to end
logits, loss=model(xs, ys)

# Fresh Adam optimizer bound to the new model's parameters
optimizer=torch.optim.Adam(model.parameters())

train(model, optimizer)

In [None]:
# Raise the step count and keep training the same model with the same optimizer.
DEFAULT_CONFIG.update({
    "epochs":5000,
    "log_interval":10,
})

train(model, optimizer)

# Credit

https://levelup.gitconnected.com/building-a-million-parameter-llm-from-scratch-using-python-f612398f06c2#21f1