<a href="https://colab.research.google.com/github/amanjaiswalofficial/machine-learning-engineer-projects/blob/main/llm0to1/08_making_a_transformer_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparing the Dataset
We'll use a small but meaningful dataset of English sentences. The goal is to predict the next token (a word or subword piece) given the preceding tokens of a sequence.

Steps:
1. Create a dataset of sentences.
2. Tokenize sentences using a subword-based tokenizer (e.g., WordPiece or Byte Pair Encoding).
3. Convert sentences into numerical sequences.
4. Prepare input-target pairs for training.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [None]:
# Toy corpus of short English sentences.
sentences = [
    "The cat sat on the mat",
    "The dog barked at the stranger",
    "She loves reading books",
    "They are playing football",
    "He enjoys coding machine learning models",
]

# Encode every sentence to token ids, special tokens included.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_data = []
for sent in sentences:
    tokenized_data.append(tokenizer.encode(sent, add_special_tokens=True))

# Expand each sentence into (prefix, next token) pairs: every proper,
# non-empty prefix is an input and the token right after it is the target.
train_data = []
for token_ids in tokenized_data:
    train_data.extend(
        (token_ids[:cut], token_ids[cut]) for cut in range(1, len(token_ids))
    )

# Longest prefix length; all inputs are padded up to this size below.
max_len = max(len(prefix) for prefix, _ in train_data)

def pad_sequence(seq, max_len, pad_token):
    """Return a new list: ``seq`` right-padded with ``pad_token`` to ``max_len``.

    A sequence that is already ``max_len`` long or longer is returned as an
    unmodified copy; nothing is ever truncated.
    """
    missing = max_len - len(seq)
    filler = [pad_token] * missing  # empty when missing <= 0
    return seq + filler

# Right-pad every prefix to the common length so they stack into one tensor.
padded_inputs = [
    pad_sequence(prefix, max_len, tokenizer.pad_token_id)
    for prefix, _ in train_data
]
input_data = torch.tensor(padded_inputs)

# One target token id per prefix, in the same order as the inputs.
target_data = torch.tensor([next_id for _, next_id in train_data])

# Create DataLoader
class NextWordDataset(Dataset):
    """Dataset that pairs each input sequence with its target token.

    ``inputs`` and ``targets`` are stored as given and indexed in lockstep.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        # The number of samples is driven by the inputs container.
        return len(self.inputs)

    def __getitem__(self, idx):
        sample = self.inputs[idx]
        label = self.targets[idx]
        return sample, label

# Wrap the tensors in a dataset and serve them in shuffled batches of two.
dataset = NextWordDataset(input_data, target_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Decode the first sample back to text as a quick sanity check.
first_input, first_target = dataset[0]
print("Sample input sequence:", tokenizer.decode(first_input.tolist()))
print("Target word:", tokenizer.decode([first_target.item()]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Sample input sequence: [CLS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Target word: the


## Building the Transformer Model
Now that we have our dataset prepared, we’ll implement a Transformer model from scratch. The key components include:

1. Token Embeddings – Convert tokens into dense vectors.
2. Positional Encoding – Add position information to token embeddings.
3. Self-Attention Mechanism – Helps the model focus on relevant words.
4. Multi-Head Attention – Enhances learning by using multiple attention heads.
5. Feedforward Network – Processes attention outputs.
6. Layer Normalization & Residual Connections – Stabilize training.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once: even feature
    columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is registered as a buffer, so it
    follows the module across devices but is not a trainable parameter.

    Args:
        d_model: Size of the feature dimension. May be even or odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=100):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine table to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, feature); the table is sliced
        along the sequence axis and broadcast over the batch axis.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :].to(x.device)

# 2. Self-Attention Mechanism
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are separate linear projections of the same
    input; scores are scaled by ``1 / sqrt(d_model)`` before the softmax.
    """

    def __init__(self, d_model):
        super().__init__()
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.scale = 1.0 / math.sqrt(d_model)

    def forward(self, x):
        """Attend over the second-to-last axis of ``x`` (last axis = d_model)."""
        queries = self.query(x)
        keys = self.key(x)
        values = self.value(x)
        # Pairwise similarity of positions, normalised over the key axis.
        weights = (queries @ keys.transpose(-2, -1) * self.scale).softmax(dim=-1)
        return weights @ values

# 3. Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused QKV projection.

    The feature dimension is split evenly across ``num_heads`` heads; each
    head attends independently and the results are concatenated and passed
    through an output projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.head_dim = d_model // num_heads
        self.num_heads = num_heads
        # One linear layer produces queries, keys and values in a single pass.
        self.qkv_proj = nn.Linear(d_model, d_model * 3)
        self.out_proj = nn.Linear(d_model, d_model)
        self.scale = 1.0 / math.sqrt(self.head_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, seq_len, d_model)."""
        batch, seq_len, width = x.shape
        packed = self.qkv_proj(x).reshape(
            batch, seq_len, 3, self.num_heads, self.head_dim
        )
        # Split the fused projection and move heads ahead of positions:
        # each of q, k, v is (batch, heads, seq_len, head_dim).
        q, k, v = (part.transpose(1, 2) for part in packed.unbind(dim=2))

        weights = torch.softmax(q @ k.transpose(-2, -1) * self.scale, dim=-1)
        # Back to (batch, seq_len, heads, head_dim), then merge the heads.
        merged = (weights @ v).transpose(1, 2).reshape(batch, seq_len, width)
        return self.out_proj(merged)

# 4. Transformer Encoder Block
class TransformerBlock(nn.Module):
    """One encoder block: self-attention and a feed-forward net, each residual.

    Layer normalization is applied to the input of each sub-layer (before
    attention and before the feed-forward net); the sub-layer output is then
    added back onto the un-normalized stream.
    """

    def __init__(self, d_model, num_heads, ff_hidden_dim):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        feed_forward_layers = [
            nn.Linear(d_model, ff_hidden_dim),
            nn.ReLU(),
            nn.Linear(ff_hidden_dim, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward_layers)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Residual around attention computed on the normalized input.
        attended = self.attn(self.norm1(x))
        x = x + attended
        # Residual around the feed-forward net, again on a normalized input.
        transformed = self.ff(self.norm2(x))
        return x + transformed

# 5. Transformer Model for Next-Word Prediction
class TransformerModel(nn.Module):
    """Stack of transformer blocks that maps token ids to vocabulary logits.

    Pipeline: token embedding -> positional encoding -> ``num_layers``
    transformer blocks -> linear projection to ``vocab_size`` scores for
    every input position.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, ff_hidden_dim, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(d_model, num_heads, ff_hidden_dim))
        self.encoder_layers = nn.ModuleList(blocks)
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits for every position of the token-id tensor ``x``."""
        hidden = self.positional_encoding(self.embedding(x))
        for block in self.encoder_layers:
            hidden = block(hidden)
        logits = self.output_layer(hidden)
        return logits

## Training the Transformer Model
Now, we'll:

1. Prepare data for training – Convert text into tokenized format.
2. Define loss function & optimizer – Use CrossEntropyLoss & AdamW.
3. Train the model – Loop through epochs to minimize loss.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import random

# Load a tokenizer (using a pre-trained one from Hugging Face).
# This rebinds the notebook-level `tokenizer` name to the
# "bert-base-uncased" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example text data: four short, hand-written, review-style sentences
# (we would normally use IMDB reviews, but let's keep it simple).
# Each sentence is later expanded into several (prefix, next token) samples.
text_samples = [
    "The movie was absolutely fantastic and full of surprises",
    "I did not expect the ending to be so emotional",
    "The performances were top-notch and the cinematography was beautiful",
    "The film had a great storyline but the pacing was slow",
]

In [None]:
# Tokenize the text and create dataset
class NextWordDataset(Dataset):
    """Next-token prediction samples built from raw text.

    Every text is tokenized without special tokens and expanded into
    (prefix, next token) pairs. Each prefix is stored with exactly
    ``max_len`` ids so that samples can always be stacked into a batch:

    * prefixes shorter than ``max_len`` are left-padded, which keeps the most
      recent token in the last position;
    * prefixes longer than ``max_len`` keep only their last ``max_len`` tokens.

    Args:
        texts: Iterable of strings to build samples from.
        tokenizer: Object with ``encode(text, add_special_tokens=False)``
            returning a list of token ids. Its ``pad_token_id`` is used for
            padding when available; otherwise id 0 is used.
        max_len: Fixed length of every stored input sequence.
    """

    def __init__(self, texts, tokenizer, max_len=10):
        self.data = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        pad_id = getattr(tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0  # fall back to the id this dataset has always padded with

        for text in texts:
            tokens = tokenizer.encode(text, add_special_tokens=False)
            for i in range(1, len(tokens)):
                # Window of at most max_len tokens ending right before the target.
                input_ids = tokens[max(0, i - max_len):i]
                input_ids = [pad_id] * (max_len - len(input_ids)) + input_ids
                self.data.append((torch.tensor(input_ids), torch.tensor(tokens[i])))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
# Create dataset and dataloader.
# No max_len is passed, so NextWordDataset uses its default (max_len=10).
dataset = NextWordDataset(text_samples, tokenizer)
# Serve the (input_ids, target) samples in shuffled mini-batches of 4.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [None]:
# Hyperparameters. The output layer covers the tokenizer's whole vocabulary.
vocab_size = tokenizer.vocab_size
d_model, num_heads, num_layers = 128, 4, 2
ff_hidden_dim = 256
max_len = 10

# Build the model together with its loss and optimizer.
model = TransformerModel(
    vocab_size=vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    ff_hidden_dim=ff_hidden_dim,
    max_len=max_len,
)
criterion = nn.CrossEntropyLoss()  # next-token prediction as classification over the vocabulary
optimizer = optim.AdamW(model.parameters(), lr=5e-4)

In [None]:
# Train for a fixed number of epochs, on GPU when one is available.
num_epochs = 10
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(num_epochs):
    batch_losses = []
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        # Only the logits at the final position are scored against the target.
        last_logits = model(inputs)[:, -1, :]
        loss = criterion(last_logits, targets)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch.
    avg_loss = sum(batch_losses) / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch 1/10, Loss: 10.5473
Epoch 2/10, Loss: 8.9250
Epoch 3/10, Loss: 7.1367
Epoch 4/10, Loss: 4.2817
Epoch 5/10, Loss: 2.8689
Epoch 6/10, Loss: 2.0253
Epoch 7/10, Loss: 1.2258
Epoch 8/10, Loss: 1.0509
Epoch 9/10, Loss: 0.8362
Epoch 10/10, Loss: 0.6428


## Model Inference (Predicting the Next Word)
Now that we have trained our Transformer model, let's test it on unseen inputs and predict the next word in a sentence.

In [None]:
import torch.nn.functional as F

def predict_next_word(model, tokenizer, text, max_len=10, top_k=5):
    """
    Given a partial sentence, predict the most probable next words.

    The prompt is prepared the same way the training samples are: it is
    tokenized without special tokens, cut to its last `max_len` tokens and
    left-padded to exactly `max_len` ids. This keeps the final token at the
    same position (and therefore the same positional encoding) that the model
    saw during training.

    - `model`: Trained Transformer model; called with a (1, max_len) tensor of
      token ids and expected to return (1, max_len, vocab) logits
    - `tokenizer`: Tokenizer used for text processing (`encode` / `decode`);
      its `pad_token_id` is used for padding when available, otherwise 0
    - `text`: Input text for prediction (may be empty)
    - `max_len`: Length the input sequence is truncated / padded to
    - `top_k`: Number of predictions to return (capped at the vocabulary size)

    Returns a list of decoded tokens, most probable first.
    """

    model.eval()  # Set model to evaluation mode
    tokens = tokenizer.encode(text, add_special_tokens=False)

    if len(tokens) > max_len:
        tokens = tokens[-max_len:]  # Keep only the most recent tokens

    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0  # same fallback id the training dataset pads with
    tokens = [pad_id] * (max_len - len(tokens)) + tokens

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Explicit integer dtype so an all-padding (empty prompt) input still works.
    input_tensor = torch.tensor(tokens, dtype=torch.long, device=model_device).unsqueeze(0)

    with torch.no_grad():  # No gradient computation needed for inference
        last_logits = model(input_tensor)[:, -1, :]  # Prediction at the last position

        # Convert logits to probabilities
        probs = F.softmax(last_logits, dim=-1)

        # torch.topk raises if k exceeds the size of the dimension.
        k = min(top_k, probs.size(-1))
        _, top_k_indices = torch.topk(probs, k=k, dim=-1)

        # Convert token IDs to words
        predicted_words = [tokenizer.decode([idx]) for idx in top_k_indices[0].tolist()]

    return predicted_words



In [None]:
# Prompts to complete; none of them is a verbatim prefix of a training sentence.
test_sentences = [
    "The movie was",
    "The acting in this film was",
    "I really enjoyed the",
    "The cinematography in the movie was"
]

# Show the model's top suggestions for each prompt.
for prompt in test_sentences:
    top_words = predict_next_word(model, tokenizer, prompt)
    print(f"Input: {prompt}")
    print(f"Predicted Next Words: {top_words}\n")


Input: The movie was
Predicted Next Words: ['slow', 'beautiful', 'absolutely', 'surprises', 'and']

Input: The acting in this film was
Predicted Next Words: ['slow', 'absolutely', 'beautiful', 'full', 'surprises']

Input: I really enjoyed the
Predicted Next Words: ['ending', 'pacing', 'film', 'cinematography', 'movie']

Input: The cinematography in the movie was
Predicted Next Words: ['beautiful', 'absolutely', 'slow', 'full', 'surprises']

