In [1]:
# Building a mini Transformer from scratch
# This code is for learning purposes
# Uses PyTorch


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``d_model // num_heads``, attended per head,
    re-concatenated and passed through a final linear layer.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail early: otherwise the head split in forward() dies later with
        # an opaque view() shape error.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Linear layers for Q, K, V and the output projection
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``[batch, seq_len, d_model]``.
            mask: Optional tensor broadcastable to
                ``[batch, num_heads, seq_len, seq_len]``; positions where the
                mask equals 0 are excluded from attention.

        Returns:
            Tensor of shape ``[batch, seq_len, d_model]``.
        """
        batch_size, seq_len, d_model = x.shape

        # 1. Create Q, K, V
        Q = self.W_q(x)  # [batch, seq_len, d_model]
        K = self.W_k(x)
        V = self.W_v(x)

        # 2. Split into multiple heads
        Q = Q.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        # Now: [batch, num_heads, seq_len, d_k]

        # 3. Attention calculation
        attention_output = self._attention(Q, K, V, mask)

        # 4. Concatenate heads (contiguous() is needed before view after transpose)
        attention_output = attention_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, d_model)

        # 5. Final linear layer
        return self.W_o(attention_output)

    def _attention(self, Q, K, V, mask=None):
        """Scaled dot-product attention over the last two dimensions."""
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Apply mask if provided (for causal/decoder attention)
        if mask is not None:
            # Use the most negative finite value of the score dtype instead of
            # a hard-coded -1e9, which is not representable in float16.
            # Out-of-place masked_fill avoids mutating a tensor in the graph.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Softmax to get attention weights
        attention_weights = F.softmax(scores, dim=-1)

        # Apply attention to values
        return torch.matmul(attention_weights, V)

class FeedForward(nn.Module):
    """Position-wise MLP: linear -> ReLU -> linear.

    Args:
        d_model: Feature size of the input and output.
        d_ff: Width of the hidden layer.
    """
    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expansion layer followed by the projection back to d_model
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Return the MLP output for ``x``; the last dimension stays ``d_model``."""
        hidden = self.linear1(x)
        activated = F.relu(hidden)
        return self.linear2(activated)

class TransformerBlock(nn.Module):
    """A single post-norm transformer layer.

    Runs self-attention and a feed-forward network, each wrapped in
    dropout, a residual (skip) connection and a LayerNorm.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to the attention layer."""
        # Sub-layer 1: self-attention, then Add & Norm
        attended = self.dropout(self.attention(x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: feed-forward, then Add & Norm
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    Even feature indices hold ``sin(pos * freq)`` and odd indices hold
    ``cos(pos * freq)``. The table is precomputed for ``max_len`` positions
    and stored as a (non-trainable) buffer named ``pe``.

    Args:
        d_model: Feature size of the embeddings (odd values are supported).
        max_len: Maximum sequence length the table covers.
    """
    def __init__(self, d_model, max_len=1000):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()

        # Create sinusoidal patterns
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                           -(math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim the angles to avoid a shape mismatch on assignment.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``[batch, seq_len, d_model]``.

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Without this check the slice below silently truncates and the
            # addition fails with an unhelpful broadcasting error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class MiniTransformer(nn.Module):
    """Complete mini transformer: embedding, positions, N blocks, vocab head.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden feature size.
        num_heads: Attention heads per block.
        num_layers: Number of stacked transformer blocks.
        d_ff: Hidden width of each block's feed-forward network.
        dropout: Dropout probability passed to every transformer block.
        max_len: Maximum sequence length covered by the positional encoding.
    """
    def __init__(self, vocab_size, d_model=128, num_heads=4, num_layers=2, d_ff=512,
                 dropout=0.1, max_len=1000):
        super().__init__()
        self.d_model = d_model

        # Token embedding
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        # Transformer layers
        self.layers = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        # Output head
        self.output_projection = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Compute vocabulary logits for a batch of token ids.

        Args:
            x: Integer tensor of token ids, shape ``[batch, seq_len]``.
            mask: Optional attention mask forwarded to every block.

        Returns:
            Logits of shape ``[batch, seq_len, vocab_size]``.
        """
        # 1. Token embedding (scaled by sqrt(d_model)) + positional encoding
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)

        # 2. Pass through transformer layers
        for layer in self.layers:
            x = layer(x, mask)

        # 3. Project to vocabulary
        return self.output_projection(x)

def create_causal_mask(seq_len, device=None):
    """Create a mask so the model can't see future tokens.

    Args:
        seq_len: Length of the sequence the mask covers.
        device: Optional device to create the mask on, so it can live on the
            same device as the model; ``None`` uses the default device.

    Returns:
        Tensor of shape ``[1, 1, seq_len, seq_len]`` holding 1 on and below
        the diagonal and 0 above it.
    """
    mask = torch.tril(torch.ones(seq_len, seq_len, device=device))
    return mask.unsqueeze(0).unsqueeze(0)  # Add batch and head dims

# Let's test it!
if __name__ == "__main__":
    print("🚀 Testing Mini Transformer")

    # Seed the RNG so the random weights and tokens (and therefore the
    # printed output) are reproducible across runs
    torch.manual_seed(0)

    # Create model
    vocab_size = 100
    model = MiniTransformer(vocab_size=vocab_size, d_model=64, num_heads=4, num_layers=2)
    # Switch to inference mode: without this, dropout stays active and the
    # forward pass below randomly zeroes activations
    model.eval()

    # Test input: batch of token sequences
    batch_size = 2
    seq_len = 10
    x = torch.randint(0, vocab_size, (batch_size, seq_len))

    print(f"Input shape: {x.shape}")
    print(f"Input tokens: {x[0].tolist()}")  # Show first sequence

    # Create causal mask (for autoregressive generation)
    mask = create_causal_mask(seq_len)

    # Forward pass (no gradients needed for a smoke test)
    with torch.no_grad():
        output = model(x, mask)

    print(f"Output shape: {output.shape}")  # [batch, seq_len, vocab_size]
    print(f"Output logits for first token: {output[0, 0, :5].tolist()}")

    # Convert to probabilities
    probs = F.softmax(output, dim=-1)
    print(f"Probabilities sum to 1: {probs[0, 0].sum().item():.3f}")

    print("\n✅ Transformer is working!")
    print("\nComponents built:")
    print("  ✓ Multi-Head Attention")
    print("  ✓ Feed Forward Network")
    print("  ✓ Layer Normalization")
    print("  ✓ Positional Encoding")
    print("  ✓ Skip Connections")
    print("  ✓ Causal Masking")

    # Show model size
    total_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel has {total_params:,} parameters")

🚀 Testing Mini Transformer
Input shape: torch.Size([2, 10])
Input tokens: [30, 76, 24, 64, 66, 3, 55, 40, 68, 68]
Output shape: torch.Size([2, 10, 100])
Output logits for first token: [-0.6635676622390747, -0.3851340115070343, -0.7031242847442627, 0.002661745995283127, 0.708402693271637]
Probabilities sum to 1: 1.000

✅ Transformer is working!

Components built:
  ✓ Multi-Head Attention
  ✓ Feed Forward Network
  ✓ Layer Normalization
  ✓ Positional Encoding
  ✓ Skip Connections
  ✓ Causal Masking

Model has 178,916 parameters


# 🚀 Mini Transformer from Scratch — Documentation

This notebook builds a **mini Transformer model** step by step in PyTorch.  
It is a simplified version of what powers large models like GPT, BERT, etc.

---

## 🔑 Components

### 1. Embedding Layer
- Converts token IDs (integers) into dense vectors (`d_model` dimensions).
- Provides the model with a numerical representation of each token.

---

### 2. Positional Encoding
- Transformers don’t inherently know the order of tokens.  
- Positional encoding adds **sinusoidal patterns** (sine/cosine functions) to embeddings.
- This allows the model to understand sequence order (e.g., word #1 comes before word #2).

---

### 3. Multi-Head Self-Attention
- **Core mechanism of transformers.**
- Each token creates three vectors:
  - **Query (Q):** What am I looking for?
  - **Key (K):** What information do I have?
  - **Value (V):** The actual content to pass along.
- Formula:  
  $$
  \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
  $$
- **Multiple heads:** this process runs several times in parallel, letting each head capture different relationships (syntax, meaning, etc.).
- Produces weighted combinations of values, depending on how much attention each token pays to others.

---

### 4. Feed Forward Network (FFN)
- A simple 2-layer MLP applied to each token independently.
- Expands the dimensionality and then projects it back (`d_model → d_ff → d_model`).
- Adds non-linearity and further processing after attention.

---

### 5. Transformer Block
- Combines **self-attention** and **feed-forward network**.
- Includes:
  - **Skip connections (residuals):** help training stability.
  - **Layer Normalization:** keeps values stable.
  - **Dropout:** prevents overfitting.

Each block = `Attention → Add+Norm → FeedForward → Add+Norm`.

---

### 6. Stacked Transformer Layers
- Multiple blocks are stacked (`num_layers`) to build depth.
- Each layer refines the representation further.

---

### 7. Output Projection
- Final linear layer projects hidden states back to **vocabulary size**.
- Produces logits → probabilities over possible next tokens.

---

### 8. Causal Mask
- Used in autoregressive models (like GPT).
- Ensures each token attends only to **itself and previous tokens** (never future ones).
- Implemented with a lower-triangular matrix.

---

## ⚙️ Workflow of the Mini Transformer
1. Input tokens → Embedding layer.  
2. Add positional encoding.  
3. Pass through multiple Transformer blocks:  
   - Multi-head self-attention.  
   - Feed-forward network.  
   - Skip connections + normalization.  
4. Output projection → logits for vocabulary.  
5. Apply softmax → probabilities for next token prediction.  

---

## ✅ Why This Matters
- This is a **toy version of GPT**.  
- By increasing `d_model`, `num_heads`, and `num_layers`, you approach the architecture of real LLMs.  
- Understanding this foundation gives you the tools to build and experiment with language models.  
