# NOTE: We will revise pytorch and implement a language model from scratch hopefully this will clear all doubts for future weeks

# Pytorch Basic Session :1

In [1]:
#Step 1 : Understanding nn.Module
import torch
import torch.nn as nn

class SimpleModule(nn.Module):
    def __init__(self):
        super().__init__()  #By calling super().__init__(), you're saying: "Hey parent class, set up all your infrastructure before I add my custom stuff."(parent class's constructor.))
        
    def forward(self, x):
        return x

In [2]:
#Setp 1:1 Example
import torch
import torch.nn as nn

class SimpleModule(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        return x * 2

# Create model
model = SimpleModule()

# Test it
x = torch.tensor([1.0, 2.0, 3.0])
output = model(x)  # This calls forward() automatically

print(f"Input: {x}")
print(f"Output: {output}")

Input: tensor([1., 2., 3.])
Output: tensor([2., 4., 6.])


In [3]:
#Step 2 :Understanding nn.Parameter
import torch
import torch.nn as nn

class ModuleWithWeight(nn.Module):
    def __init__(self, size):
        super().__init__()
        
        # This creates a LEARNABLE parameter
        self.weight = nn.Parameter(torch.randn(size))
        
    def forward(self, x):
        return x * self.weight

# Create model
model = ModuleWithWeight(3)

# Test
x = torch.tensor([1.0, 2.0, 3.0])
output = model(x)

print(f"Input: {x}")
print(f"Weight: {model.weight}")
print(f"Output: {output}")
print(f"\nIs weight learnable? {model.weight.requires_grad}")

Input: tensor([1., 2., 3.])
Weight: Parameter containing:
tensor([-0.4903,  1.9947,  0.6297], requires_grad=True)
Output: tensor([-0.4903,  3.9895,  1.8890], grad_fn=<MulBackward0>)

Is weight learnable? True


In [4]:
# Step 3 : Understanding nn.Linear
import torch
import torch.nn as nn

# Create a linear layer
input_dim = 512
output_dim = 256
linear = nn.Linear(input_dim, output_dim)

# What does it contain?
print(f"Weight shape: {linear.weight.shape}")
print(f"Bias shape: {linear.bias.shape}")
print(f"Weight requires_grad: {linear.weight.requires_grad}")

Weight shape: torch.Size([256, 512])
Bias shape: torch.Size([256])
Weight requires_grad: True


In [5]:
import torch
import torch.nn as nn

# Create linear layer
linear = nn.Linear(512, 256)
# This creates a weight matrix of 256 and 512

# Create input
batch_size = 32
seq_len = 10
x = torch.randn(batch_size, seq_len, 512)

# Forward pass
output = linear(x)

print(f"Input shape:  {x.shape}")      # [32, 10, 512]
print(f"Output shape: {output.shape}")  # [32, 10, 256]

Input shape:  torch.Size([32, 10, 512])
Output shape: torch.Size([32, 10, 256])


# Buliding Token Embeddings

In [6]:
import torch
import torch.nn as nn

#Start by inheriting from nn.Module
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__() #Call the parent class's constructor so that all methods of nn.Module are properly initialized.
        #nn.embedding creates a lookup table that maps from integer indices (representing tokens) to dense vectors of fixed size (the embeddings).
        #nn.embedding ,creates a lookup table and intializes it with random values. and makes the values learnable parameters of the model. (also registers it as a parameter of the module)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
    def forward(self, x):
        return self.embedding(x)
    
#Create
vocab_size = 1000  # Size of the vocabulary
d_model = 512    # Dimension of the embeddings
token_embed= TokenEmbedding(vocab_size, d_model)

#Test
token_ids=torch.randint(0, vocab_size, (2,10))
output= token_embed (token_ids)
print(f"Input token IDs shape: {token_ids.shape}")  # [2, 10]
print(f"Output embeddings shape: {output.shape}")    # [2, 10,

Input token IDs shape: torch.Size([2, 10])
Output embeddings shape: torch.Size([2, 10, 512])


# Position embeddings

In [7]:
import torch
import torch.nn as nn

class PositionEmbedding(nn.Module): #Inherit from nn.Module
    def __init__(self, max_seq_len, d_model):
        super().__init__() #Call the parent class's constructor to make sure all methods of nn.Module are properly initialized. and available.
        #NOTE:Transformers (Original paper) used sinusoidal position embeddings, but nn.Embedding is a common choice for learnable position embeddings.(we will make sure gradients also flow through these embeddings during training.)
        #This creates a learnable position embedding table of size (max_seq_len, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)

    def forward(self, seq_len):
        #Generate position indices from 0 to seq_len - 1
        positions=torch.arange(seq_len)#This creaates a tensor with seqential numbersye
        return self.position_embedding(positions)
    
#Create
pos_embed= PositionEmbedding(max_seq_len=2048, d_model=512)
        
output = pos_embed(10)  # Example input sequence length
print(f"Output position embeddings shape: {output.shape}")  # [10, 512]
print(f"Positions Used: {torch.arange(10)}")

Output position embeddings shape: torch.Size([10, 512])
Positions Used: tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


# Combine Token and Position Embeddings

In [8]:
import torch
import torch.nn as nn

class Embeddings(nn.Module):#This is normal inheritance from nn.Module
    def __init__(self, vocab_size, max_seq_len, d_model):
        super().__init__() #Initialize the parent class nn.Module so that all its methods and attributes are available. and usable.
        """
        1.token embedding layer to convert token IDs into dense vectors.
        2.position embedding layer to add positional information to the token embeddings.

        NOTE:Both embeddings are learnable parameters of the model.
        """
        
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_seq_len, d_model)
        
    def forward(self, token_ids):
        batch_size, seq_len = token_ids.shape
        
        tokens = self.token_embed(token_ids)
        #torch.arrange helps to convert tokens into id's which can be used to get position embeddings from nn.Embedding lookup table.
        positions = torch.arange(seq_len, device=token_ids.device)
        pos = self.pos_embed(positions)
        
        return tokens + pos

# Test
embeddings = Embeddings(vocab_size=1000, max_seq_len=2048, d_model=512)
token_ids = torch.randint(0, 1000, (2, 10))  # [2, 10]
output = embeddings(token_ids)

print(f"Input shape: {token_ids.shape}")
print(f"Output shape: {output.shape}")

Input shape: torch.Size([2, 10])
Output shape: torch.Size([2, 10, 512])


# Layer Normalization

In [10]:
#Normalizes features to have zero mean and unit variance across the feature dimension
import torch
import torch.nn as nn

#Simple implementation of LayerNorm
d_model = 512
layer_norm = nn.LayerNorm(d_model)

#Test
x = torch.randn(2, 10, d_model)  # [batch_size, seq_len, d_model]
output = layer_norm(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"\nBefore LayerNorm:")
print(f"  Mean: {x.mean():.4f}")
print(f"  Std: {x.std():.4f}")
print(f"\nAfter LayerNorm:")
print(f"  Mean: {output.mean():.4f}")
print(f"  Std: {output.std():.4f}")

#NOTE:Layer norm has two learnable parameters per feature dimension: weight and bias. These parameters allow the model to scale and shift the normalized output, providing flexibility in how the normalized values are represented.
# It has 2 learnable parameters:
print(f"Gamma (scale): {layer_norm.weight.shape}")  # [512]
print(f"Beta (shift): {layer_norm.bias.shape}")     # [512]

Input shape: torch.Size([2, 10, 512])
Output shape: torch.Size([2, 10, 512])

Before LayerNorm:
  Mean: -0.0006
  Std: 0.9969

After LayerNorm:
  Mean: 0.0000
  Std: 1.0000
Gamma (scale): torch.Size([512])
Beta (shift): torch.Size([512])


# FFN (Feed Forward Network)

In [11]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Args:
            d_model: Model dimension (512)
            d_ff: Hidden dimension (2048, typically 4x d_model)
            dropout: Dropout rate
        """
        super().__init__()
        
        # Two linear layers
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        
        # Activation
        self.relu = nn.ReLU()
        
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x: [batch, seq, d_model]
        
        # Expand: 512 → 2048
        x = self.linear1(x)
        
        # Activate
        x = self.relu(x)
        
        # Dropout
        x = self.dropout(x)
        
        # Contract: 2048 → 512
        x = self.linear2(x)
        
        # Dropout again
        x = self.dropout(x)
        
        return x
    
# Create FFN
ffn = FeedForward(d_model=512, d_ff=2048)

# Test
x = torch.randn(2, 10, 512)  # [batch, seq, d_model]
output = ffn(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")


Input shape: torch.Size([2, 10, 512])
Output shape: torch.Size([2, 10, 512])


# Residual Connection 


In [12]:
import torch

# Input
x = torch.randn(2, 10, 512)

# Some operation (pretend it's attention)
transformed = torch.randn(2, 10, 512)

# WITHOUT residual
output_no_residual = transformed

# WITH residual (just add!)
output_with_residual = x + transformed

print(f"Input: {x.shape}")
print(f"Transformed: {transformed.shape}")
print(f"With residual: {output_with_residual.shape}")
print("\nIt's just addition!")

Input: torch.Size([2, 10, 512])
Transformed: torch.Size([2, 10, 512])
With residual: torch.Size([2, 10, 512])

It's just addition!


# Multi Head Attention

In [13]:
import torch
import torch.nn as nn
import math

class MultiHeadAttention(nn.Module):
    """
    This is the standard multi-head attention class.
    """
    def __init__(self, d_model, num_heads, dropout=0.1):  # This runs when you create an object of this class
        super().__init__()  # This is used to call nn.module's init method which initializes the methods and attributes of the nn.module class
        assert d_model % num_heads == 0
        
        # We are storing all these so that they can be anywhere in the code
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
       
        # nn.Linear is PyTorch's fully connected (dense) layer that performs a linear transformation on the input.
        # It takes the input and multiplies it by a weight matrix and adds a bias term.
        # So it does a y=xw^T+b
        
        # So we need to create projections for Q, K, V (the parameters are input_dim, output_dim), so self.q_proj will create a weight matrix of size d_model x d_model,the weight initlization follows Xavier/Kaiming Initilication
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)

        # Post combination of all heads we need a final projection
        self.out_proj = nn.Linear(d_model, d_model)

        # Dropout helps us to randomly drop out some neurons to prevent overfitting
        self.dropout = nn.Dropout(dropout)
        
    # This is the method which runs when you call the model
    def forward(self, x):
        # This is tuple unpacking
        batch_size, seq_len, _ = x.size()  # Fixed: using _ instead of d_model to avoid shadowing

        # Now we need to project the input matrix into a different matrix
        # So we need to create projections for Q, K, V
        # Q: What am i looking for?
        # K: What do i contain?
        # V: What information do i have?

        Q = self.q_proj(x)  # Query = x@W_q^T + b_q  #This actually calls the forward method
        K = self.k_proj(x)  # Key = x@W_k^T + b_k
        V = self.v_proj(x)  # Value = x@W_v^T + b_v
        
        # Now we wish to split the query, key and value matrices into multiple attention heads so that we can perform parallel computations
        # Now we are reshaping the matrix to (batch_size, seq_len, num_heads, head_dim)
        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim)
        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim)
        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim)

        # Now we need to transpose the matrix to put heads first
        # We are doing this since we want to compute attention for each head separately
        Q = Q.transpose(1, 2)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        # Compute attention scores
        # Scaling prevents softmax from saturating
        # scores[i,j]: how much token i should attend to token j high score means more attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        # Convert to probabilities
        attn_weights = torch.softmax(scores, dim=-1)

        # Apply dropout to the attention weights
        attn_weights = self.dropout(attn_weights)

        # We need to multiply with V
        # (batch_size, num_heads, seq_len, seq_len) * (batch_size, num_heads, seq_len, head_dim)
        # Here we are taking combination of information from all the heads weighted by attention
        output = torch.matmul(attn_weights, V)
        
        # We need to concatenate heads back
        # This is done to transpose the output and make it contiguous in memory (since a simple transpose is not contiguous)
        output = output.transpose(1, 2).contiguous()
        # This is concatenation of heads
        output = output.view(batch_size, seq_len, self.d_model)  # Fixed: batch -> batch_size, d_model -> self.d_model

        # Final Projection
        output = self.out_proj(output)

        return output


# Test the implementation
if __name__ == "__main__":
    # Create model
    model = MultiHeadAttention(d_model=512, num_heads=8, dropout=0.1)
    
    # Create input
    batch_size = 32
    seq_len = 10
    x = torch.randn(batch_size, seq_len, 512)
    
    # Forward pass
    output = model(x)
    
    print(f"Input shape:  {x.shape}")       # [32, 10, 512]
    print(f"Output shape: {output.shape}")   # [32, 10, 512]
    print("Multi-head attention works!")

Input shape:  torch.Size([32, 10, 512])
Output shape: torch.Size([32, 10, 512])
Multi-head attention works!


# Creating a Transformers Block
```
Input
  ↓
LayerNorm
  ↓
Multi-Head Attention 
  ↓
Residual (add input back)
  ↓
LayerNorm
  ↓
Feed-Forward Network
  ↓
Residual (add input back)
  ↓
Output
```

In [15]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        
        # Multi-Head Attention
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        
        # Feed-Forward Network
        self.ffn = FeedForward(d_model, d_ff, dropout)
        
        # Layer Norms
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # x: [batch, seq, d_model]
        
        # ============================================
        # BLOCK 1: Multi-Head Attention + Residual
        # ============================================
        
        # Step 1: Save input for residual
        residual = x
        
        # Step 2: Layer Norm
        x = self.norm1(x)
        
        # Step 3: Attention
        x = self.attention(x)
        
        # Step 4: Dropout
        x = self.dropout(x)
        
        # Step 5: Residual connection
        x = residual + x
        
        # ============================================
        # BLOCK 2: Feed-Forward Network + Residual
        # ============================================
        
        # Step 6: Save input for residual
        residual = x
        
        # Step 7: Layer Norm
        x = self.norm2(x)
        
        # Step 8: FFN
        x = self.ffn(x)
        
        # Step 9: Dropout
        x = self.dropout(x)
        
        # Step 10: Residual connection
        x = residual + x
        
        return x
    
block = TransformerBlock(d_model=512, num_heads=8, d_ff=2048)

# Input
x = torch.randn(2, 10, 512)  # [batch=2, seq=10, d_model=512]

# Forward pass
output = block(x)

print(f"Input shape:  {x.shape}")
print(f"Output shape: {output.shape}")
print("✅ Transformer Block works!")

Input shape:  torch.Size([2, 10, 512])
Output shape: torch.Size([2, 10, 512])
✅ Transformer Block works!


# Now we need to stack multiple transformers Blocks
 

In [16]:
# nn.ModuleList is a pytorch container that creates a list of modules that Pytorch can track

import torch
import torch.nn as nn

class Transformer(nn.Module):
    def __init__(self, vocab_size, max_seq_len, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        """
        Args:
            vocab_size: Size of vocabulary (e.g., 50000)
            max_seq_len: Maximum sequence length (e.g., 2048)
            d_model: Model dimension (e.g., 512)
            num_heads: Number of attention heads (e.g., 8)
            d_ff: FFN hidden dimension (e.g., 2048)
            num_layers: Number of transformer blocks to stack (e.g., 6)
            dropout: Dropout rate
        """
        super().__init__()
        
        # Embeddings
        self.embeddings = Embeddings(vocab_size, max_seq_len, d_model)
        
        # Stack of Transformer Blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        
        # Final Layer Norm
        self.norm = nn.LayerNorm(d_model)
        
    def forward(self, token_ids):
        # token_ids: [batch, seq]
        
        # Get embeddings
        x = self.embeddings(token_ids)  # [batch, seq, d_model]
        
        # Pass through all transformer blocks
        for block in self.blocks:
            x = block(x)
        
        # Final normalization
        x = self.norm(x)
        
        return x
    
# Create model with 6 stacked blocks
model = Transformer(
    vocab_size=1000,
    max_seq_len=2048,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6,  # 6 transformer blocks!
    dropout=0.1
)

# Test
token_ids = torch.randint(0, 1000, (2, 10))  # [batch=2, seq=10]
output = model(token_ids)

print(f"Input shape:  {token_ids.shape}")
print(f"Output shape: {output.shape}")
print(f"Number of blocks: {len(model.blocks)}")

Input shape:  torch.Size([2, 10])
Output shape: torch.Size([2, 10, 512])
Number of blocks: 6


# Final Output Head

In [18]:
import torch
import torch.nn as nn

class LanguageModel(nn.Module):
    def __init__(self, vocab_size, max_seq_len, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()
        
        # Embeddings
        self.embeddings = Embeddings(vocab_size, max_seq_len, d_model)
        
        # Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        
        # Final Layer Norm
        self.norm = nn.LayerNorm(d_model)
        
        # Output Head: Project to vocabulary
        self.lm_head = nn.Linear(d_model, vocab_size)
        
    def forward(self, token_ids):
        # token_ids: [batch, seq]
        
        # Embeddings
        x = self.embeddings(token_ids)  # [batch, seq, d_model]
        
        # Transformer blocks
        for block in self.blocks:
            x = block(x)
        
        # Final norm
        x = self.norm(x)  # [batch, seq, d_model]
        
        # Project to vocabulary
        #NOTE:lm_head is a linear layer that maps the final hidden states to the vocabulary size, producing logits for each token position.
        # Logits are the raw, unnormalized scores outputted by the model before applying softmax to get probabilities.
        logits = self.lm_head(x)  # [batch, seq, vocab_size]
        
        return logits
    
# Create complete language model
model = LanguageModel(
    vocab_size=1000,
    max_seq_len=2048,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6,
    dropout=0.1
)

# Test
token_ids = torch.randint(0, 1000, (2, 10))
logits = model(token_ids)

print(f"Input shape:  {token_ids.shape}")      # [2, 10]
print(f"Output shape: {logits.shape}")         # [2, 10, 1000]

Input shape:  torch.Size([2, 10])
Output shape: torch.Size([2, 10, 1000])


# Load Dataset

In [23]:
%pip install datasets transformers hf_transfer

Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m22.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: hf_transfer
Successfully installed hf_transfer-0.1.9
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'
from datasets import load_dataset

# Load WikiText-2
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

print("Dataset loaded!")
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset loaded!
Train samples: 36718
Validation samples: 3760
Test samples: 4358

First training example:



In [27]:
# Find first non-empty example
print("\nLooking for non-empty examples:")
for i in range(10):
    text = dataset['train'][i]['text'].strip()
    if len(text) > 0:
        print(f"\nExample {i}:")
        print(text[:300])  # First 300 chars
        break


Looking for non-empty examples:

Example 1:
= Valkyria Chronicles III =


# Tokenization

In [29]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 tokenizer needs a pad token (it doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

print(f"Vocabulary size: {len(tokenizer)}")
print(f"Example tokens:")

# Test it
text = "The cat sat on the mat"
tokens = tokenizer.encode(text)
print(f"\nText: {text}")
print(f"Token IDs: {tokens}")
print(f"Decoded back: {tokenizer.decode(tokens)}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Vocabulary size: 50257
Example tokens:

Text: The cat sat on the mat
Token IDs: [464, 3797, 3332, 319, 262, 2603]
Decoded back: The cat sat on the mat


# Create a pytroch Dataset


In [33]:
"""
NOTE:
1.So mostly when we create a pytorch dataset
we need three things:
- __init__ : to initialize the dataset object, load data, and set up any necessary variables..
- __len__ : to return the total number of samples in the dataset.
- __getitem__ : to retrieve a single sample from the dataset given an index.
"""
from torch.utils.data import Dataset
class MyDataset(Dataset):
    
    def __init__(self):
        # Initialize: Load/prepare data
        pass
    
    def __len__(self):
        # Return: How many samples?
        pass
    
    def __getitem__(self, idx):
        # Return: Give me sample number 'idx'
        pass

import torch
from torch.utils.data import Dataset

class SimpleDataset(Dataset):
    def __init__(self):
        # Store some numbers
        self.data = [10, 20, 30, 40, 50]
    
    def __len__(self):
        # How many samples?
        return len(self.data)  # 5
    
    def __getitem__(self, idx):
        # Get sample number idx
        return self.data[idx]

# Create dataset
dataset = SimpleDataset()

print(f"Dataset has {len(dataset)} samples")  # 5
print(f"Sample 0: {dataset[0]}")  # 10
print(f"Sample 2: {dataset[2]}")  # 30

Dataset has 5 samples
Sample 0: 10
Sample 2: 30


In [34]:
import torch
from torch.utils.data import Dataset

class InputTargetDataset(Dataset):
    def __init__(self):
        # Store a sequence
        self.data = [10, 20, 30, 40, 50, 60]
    
    def __len__(self):
        # We can make 5 pairs (last one has no target, so -1)
        return len(self.data) - 1
    
    def __getitem__(self, idx):
        # Input: current number
        # Target: next number
        input_val = self.data[idx]
        target_val = self.data[idx + 1]
        
        return input_val, target_val

# Create dataset
dataset = InputTargetDataset()

print(f"Dataset has {len(dataset)} samples\n")

# Get samples
for i in range(len(dataset)):
    input_val, target_val = dataset[i]
    print(f"Sample {i}: input={input_val}, target={target_val}")


Dataset has 5 samples

Sample 0: input=10, target=20
Sample 1: input=20, target=30
Sample 2: input=30, target=40
Sample 3: input=40, target=50
Sample 4: input=50, target=60


In [35]:
import torch
from torch.utils.data import Dataset

class WikiTextDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        """
        Args:
            data: HuggingFace dataset (dataset['train'])
            tokenizer: GPT2Tokenizer
            max_length: Sequence length (512)
        """
        self.max_length = max_length
        
        # Step 1: Tokenize ALL text into one long list
        print("Tokenizing dataset...")
        all_tokens = []
        
        for example in data:
            text = example['text'].strip()
            if len(text) > 0:  # Skip empty lines
                tokens = tokenizer.encode(text)
                all_tokens.extend(tokens)
        
        # Step 2: Convert to PyTorch tensor
        self.tokens = torch.tensor(all_tokens, dtype=torch.long)
        print(f"Total tokens: {len(self.tokens):,}")
    
    def __len__(self):
        # How many sequences of length 512?
        return len(self.tokens) // self.max_length
    
    def __getitem__(self, idx):
        # Get chunk starting at position (idx * 512)
        start = idx * self.max_length
        end = start + self.max_length
        
        # Input: tokens[start:end]
        # Target: tokens[start+1:end+1] (shifted!)
        input_ids = self.tokens[start:end]
        target_ids = self.tokens[start+1:end+1]
        
        return input_ids, target_ids

In [36]:
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load data and tokenizer
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Create our dataset
train_dataset = WikiTextDataset(
    dataset['train'],
    tokenizer,
    max_length=512
)

print(f"\nTotal sequences: {len(train_dataset)}")

# Get first sample
input_ids, target_ids = train_dataset[0]
print(f"\nSample 0:")
print(f"Input shape: {input_ids.shape}")
print(f"Target shape: {target_ids.shape}")
print(f"First 10 input tokens: {input_ids[:10]}")
print(f"First 10 target tokens: {target_ids[:10]}")

Tokenizing dataset...
Total tokens: 2,347,038

Total sequences: 4584

Sample 0:
Input shape: torch.Size([512])
Target shape: torch.Size([512])
First 10 input tokens: tensor([   28,   569, 18354,  7496, 17740,  6711,   796, 10445,    73, 13090])
First 10 target tokens: tensor([  569, 18354,  7496, 17740,  6711,   796, 10445,    73, 13090,   645])


# Now we need to batch our data(Data Loader)

In [37]:
from torch.utils.data import DataLoader

# Create DataLoader
train_loader = DataLoader(
    train_dataset,
    batch_size=32,      # 32 sequences per batch
    shuffle=True,       # Shuffle data each epoch
    num_workers=0       # Data loading processes (0 = main process)
)

print(f"Total batches: {len(train_loader)}")
print(f"Batch size: 32")

# Get first batch
batch = next(iter(train_loader))
input_ids, target_ids = batch

print(f"\nFirst batch:")
print(f"Input shape:  {input_ids.shape}")   # [32, 512]
print(f"Target shape: {target_ids.shape}")  # [32, 512]


Total batches: 144
Batch size: 32

First batch:
Input shape:  torch.Size([32, 512])
Target shape: torch.Size([32, 512])


# Training Loop
```
Loss(So loss is computed for each token)(Define the Loss)
↓
Optimizer(Optimizer updates the model's weights to minimize loss.)
↓
Training Loop(Forward-Loss-clear Gradients-Backward-Update)

In [44]:
#Define the Loss function
import torch
import torch.nn as nn

# Create loss function
criterion = nn.CrossEntropyLoss()

# Fake model output (logits)
# [batch=2, seq=3, vocab_size=5]
logits = torch.randn(2, 3, 5)

# Target token IDs
# [batch=2, seq=3]
targets = torch.tensor([
    [2, 4, 1],  # Sequence 1: correct tokens are 2, 4, 1
    [0, 3, 2]   # Sequence 2: correct tokens are 0, 3, 2
])

print("Logits shape:", logits.shape)    # [2, 3, 5]
print("Targets shape:", targets.shape)  # [2, 3]

# BUT! CrossEntropyLoss expects:
# logits: [N, vocab_size] where N = batch * seq
# targets: [N]

# So we need to reshape!
logits_flat = logits.view(-1, 5)      # [6, 5]  (2*3=6 positions)
targets_flat = targets.view(-1)        # [6]

print("\nAfter reshaping:")
print("Logits flat:", logits_flat.shape)   # [6, 5]
print("Targets flat:", targets_flat.shape) # [6]

# Calculate loss
loss = criterion(logits_flat, targets_flat)
print(f"\nLoss: {loss.item():.4f}")


Logits shape: torch.Size([2, 3, 5])
Targets shape: torch.Size([2, 3])

After reshaping:
Logits flat: torch.Size([6, 5])
Targets flat: torch.Size([6])

Loss: 2.0549


In [None]:
import torch
import torch.nn as nn

# Create loss function
criterion = nn.CrossEntropyLoss()

# Simulate model training
model = LanguageModel(
    vocab_size=50257,  # GPT-2 vocab size
    max_seq_len=512,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6
)

# Get a batch from dataloader
input_ids, target_ids = next(iter(train_loader))

print(f"Input shape: {input_ids.shape}")    # [32, 512]
print(f"Target shape: {target_ids.shape}")  # [32, 512]

# Forward pass
logits = model(input_ids)
print(f"Logits shape: {logits.shape}")      # [32, 512, 50257]

# Reshape for loss calculation
batch_size, seq_len, vocab_size = logits.shape

logits_flat = logits.view(batch_size * seq_len, vocab_size)  # [16384, 50257]
targets_flat = target_ids.view(batch_size * seq_len)         # [16384]

print(f"\nFlattened logits: {logits_flat.shape}")
print(f"Flattened targets: {targets_flat.shape}")

# Calculate loss
loss = criterion(logits_flat, targets_flat)
print(f"\nLoss: {loss.item():.4f}")


Input shape: torch.Size([32, 512])
Target shape: torch.Size([32, 512])
Logits shape: torch.Size([32, 512, 50257])

Flattened logits: torch.Size([16384, 50257])
Flattened targets: torch.Size([16384])

Loss: 11.0243


In [45]:
# Create a PyTorch optimizer
import torch.optim as optim

# Create optimizer
optimizer = optim.AdamW(
    model.parameters(),  # All model weights to optimize
    lr=3e-4,            # Learning rate (how big each update is)
    weight_decay=0.01   # Regularization (prevents overfitting)
)

print("Optimizer created!")
print(f"Learning rate: 3e-4 = {3e-4}")

Optimizer created!
Learning rate: 3e-4 = 0.0003


In [46]:
import torch
import torch.nn as nn
import torch.optim as optim

# Simple model
model = nn.Linear(10, 5)
optimizer = optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Fake data
x = torch.randn(2, 10)
targets = torch.tensor([2, 4])

print("=== Before Training ===")
print(f"Weight sample: {model.weight[0, 0].item():.4f}")

# Training step
optimizer.zero_grad()           # Clear old gradients
output = model(x)               # Forward
loss = criterion(output, targets)  # Loss
print(f"Loss: {loss.item():.4f}")

loss.backward()                 # Calculate gradients
print(f"Gradient sample: {model.weight.grad[0, 0].item():.6f}")

optimizer.step()                # Update weights

print("=== After Training ===")
print(f"Weight sample: {model.weight[0, 0].item():.4f}")
print("Weight changed! ✓")

=== Before Training ===
Weight sample: -0.2838
Loss: 1.8065
Gradient sample: -0.151754
=== After Training ===
Weight sample: -0.2828
Weight changed! ✓


In [47]:
import torch
import torch.nn as nn
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer, device, num_epochs=1):
    """
    Train the language model
    
    Args:
        model: Language model
        train_loader: DataLoader with training data
        criterion: Loss function
        optimizer: Optimizer
        device: 'cuda' or 'cpu'
        num_epochs: Number of epochs to train
    """
    model.to(device)  # Move model to GPU
    model.train()     # Set to training mode
    
    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0
        
        # Progress bar
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        
        for batch_idx, (input_ids, target_ids) in enumerate(pbar):
            # Move data to GPU
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)
            
            # Forward pass
            logits = model(input_ids)  # [batch, seq, vocab]
            
            # Reshape for loss
            batch_size, seq_len, vocab_size = logits.shape
            logits_flat = logits.view(batch_size * seq_len, vocab_size)
            targets_flat = target_ids.view(batch_size * seq_len)
            
            # Calculate loss
            loss = criterion(logits_flat, targets_flat)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Track loss
            total_loss += loss.item()
            num_batches += 1
            
            # Update progress bar
            avg_loss = total_loss / num_batches
            pbar.set_postfix({'loss': f'{avg_loss:.4f}'})
        
        print(f"Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}")

# Complete Training Script

In [49]:
import torch
import gc

# Clear GPU cache
torch.cuda.empty_cache()
gc.collect()

print(f"GPU Memory after clearing: {torch.cuda.memory_allocated()/1e9:.2f} GB")

GPU Memory after clearing: 14.72 GB


In [51]:
import torch
import gc

# Delete ALL variables
del model
del optimizer
del criterion
del train_loader
del train_dataset
del dataset
del tokenizer

# Clear GPU
torch.cuda.empty_cache()
gc.collect()

print(f"GPU Memory after cleanup: {torch.cuda.memory_allocated()/1e9:.2f} GB")

GPU Memory after cleanup: 14.72 GB


In [1]:
# 1. Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import GPT2Tokenizer
from tqdm import tqdm

# 2. Check GPU is clean
print(f"GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")  # Should be 0!

# 3. Load data (only what we need)
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Continue to next step...

GPU Memory: 0.00 GB


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import GPT2Tokenizer
from tqdm import tqdm
import math

# ============================================
# 1. COMPONENTS (copy your classes here)
# ============================================

# MultiHeadAttention class
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        Q = self.q_proj(x)
        K = self.k_proj(x)
        V = self.v_proj(x)

        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim)
        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim)
        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim)

        Q = Q.transpose(1, 2)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        output = torch.matmul(attn_weights, V)

        output = output.transpose(1, 2).contiguous()
        output = output.view(batch_size, seq_len, self.d_model)
        output = self.out_proj(output)

        return output

# FeedForward class
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.dropout(x)
        return x

# TransformerBlock class
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        x = self.norm1(x)
        x = self.attention(x)
        x = self.dropout(x)
        x = residual + x

        residual = x
        x = self.norm2(x)
        x = self.ffn(x)
        x = self.dropout(x)
        x = residual + x

        return x

# Embeddings class
class Embeddings(nn.Module):
    def __init__(self, vocab_size, max_seq_len, d_model):
        super().__init__()

        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_seq_len, d_model)

    def forward(self, token_ids):
        batch_size, seq_len = token_ids.shape

        tokens = self.token_embed(token_ids)
        positions = torch.arange(seq_len, device=token_ids.device)
        pos = self.pos_embed(positions)

        return tokens + pos

# LanguageModel class
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, max_seq_len, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()

        self.embeddings = Embeddings(vocab_size, max_seq_len, d_model)

        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, token_ids):
        x = self.embeddings(token_ids)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)
        logits = self.lm_head(x)

        return logits

# WikiTextDataset class
class WikiTextDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.max_length = max_length

        print("Tokenizing dataset...")
        all_tokens = []

        for example in data:
            text = example['text'].strip()
            if len(text) > 0:
                tokens = tokenizer.encode(text)
                all_tokens.extend(tokens)

        self.tokens = torch.tensor(all_tokens, dtype=torch.long)
        print(f"Total tokens: {len(self.tokens):,}")

    def __len__(self):
        return len(self.tokens) // self.max_length

    def __getitem__(self, idx):
        start = idx * self.max_length
        end = start + self.max_length

        input_ids = self.tokens[start:end]
        target_ids = self.tokens[start+1:end+1]

        return input_ids, target_ids

# train_model function
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=1):
    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch_idx, (input_ids, target_ids) in enumerate(pbar):
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)

            logits = model(input_ids)

            batch_size, seq_len, vocab_size = logits.shape
            logits_flat = logits.view(batch_size * seq_len, vocab_size)
            targets_flat = target_ids.view(batch_size * seq_len)

            loss = criterion(logits_flat, targets_flat)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

            avg_loss = total_loss / num_batches
            pbar.set_postfix({'loss': f'{avg_loss:.4f}'})

        print(f"Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}")

# ============================================
# 2. SETUP
# ============================================

device = torch.device('cuda')
print(f"Device: {device}")
print(f"Initial GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# Load data
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Create dataset
train_dataset = WikiTextDataset(dataset['train'], tokenizer, max_length=128)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

print(f"Dataset ready. GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# ============================================
# 3. CREATE MODEL (TINY!)
# ============================================

model = LanguageModel(
    vocab_size=50257,
    max_seq_len=128,
    d_model=128,
    num_heads=2,
    d_ff=512,
    num_layers=2
).to(device)

print(f"Model created. GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

# ============================================
# 4. OPTIMIZER & LOSS
# ============================================

optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# ============================================
# 5. TRAIN!
# ============================================

print("\nStarting training...\n")
train_model(model, train_loader, criterion, optimizer, device, num_epochs=1)

print("TRAINING COMPLETE!")

Device: cuda
Initial GPU Memory: 0.00 GB
Tokenizing dataset...
Total tokens: 2,347,038
Dataset ready. GPU Memory: 0.00 GB
Model created. GPU Memory: 0.05 GB
Parameters: 13,329,233

Starting training...



Epoch 1/1: 100%|██████████| 9168/9168 [02:13<00:00, 68.68it/s, loss=3.5917] 

Epoch 1 completed. Avg Loss: 3.5917
TRAINING COMPLETE!





In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Clear GPU first
torch.cuda.empty_cache()

device = torch.device('cuda')
print(f"GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# Create datasets
train_dataset = WikiTextDataset(dataset['train'], tokenizer, max_length=256)
val_dataset = WikiTextDataset(dataset['validation'], tokenizer, max_length=256)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Create PROPER MHA model (bigger than test)
mha_model = LanguageModel(
    vocab_size=50257,
    max_seq_len=256,
    d_model=512,     # Assignment size
    num_heads=8,     # Assignment size  
    d_ff=2048,       # Assignment size
    num_layers=6     # Assignment size
).to(device)

optimizer = optim.AdamW(mha_model.parameters(), lr=3e-4, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

print(f"MHA Model parameters: {sum(p.numel() for p in mha_model.parameters()):,}")
print(f"GPU Memory after model: {torch.cuda.memory_allocated()/1e9:.2f} GB")

GPU Memory: 0.23 GB
Tokenizing dataset...
Total tokens: 2,347,038
Tokenizing dataset...
Total tokens: 242,643
MHA Model parameters: 70,559,825
GPU Memory after model: 0.41 GB


# Validation Function

In [5]:
def validate_model(model, val_loader, criterion, device):
    """
    Validate model and calculate perplexity
    """
    model.eval()  # Set to evaluation mode
    total_loss = 0
    num_batches = 0
    
    with torch.no_grad():  # No gradients needed for validation
        for input_ids, target_ids in val_loader:
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)
            
            # Forward pass
            logits = model(input_ids)
            
            # Reshape and calculate loss
            batch_size, seq_len, vocab_size = logits.shape
            logits_flat = logits.view(batch_size * seq_len, vocab_size)
            targets_flat = target_ids.view(batch_size * seq_len)
            
            loss = criterion(logits_flat, targets_flat)
            total_loss += loss.item()
            num_batches += 1
    
    avg_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(avg_loss))
    
    model.train()  # Back to training mode
    return avg_loss, perplexity.item()

# Training MHA 

In [6]:
print("=" * 60)
print("TRAINING MHA MODEL")
print("=" * 60)

num_epochs = 3

for epoch in range(num_epochs):
    # Train
    model = mha_model
    model.train()
    total_loss = 0
    num_batches = 0
    
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for input_ids, target_ids in pbar:
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)
        
        # Forward
        logits = model(input_ids)
        
        # Loss
        batch_size, seq_len, vocab_size = logits.shape
        logits_flat = logits.view(batch_size * seq_len, vocab_size)
        targets_flat = target_ids.view(batch_size * seq_len)
        loss = criterion(logits_flat, targets_flat)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Track
        total_loss += loss.item()
        num_batches += 1
        pbar.set_postfix({'loss': f'{total_loss/num_batches:.4f}'})
    
    # Validate
    val_loss, val_perplexity = validate_model(mha_model, val_loader, criterion, device)
    
    print(f"\nEpoch {epoch+1} Summary:")
    print(f"  Train Loss: {total_loss/num_batches:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")
    print(f"  Val Perplexity: {val_perplexity:.2f}\n")

print("=" * 60)
print("MHA TRAINING COMPLETE!")
print("=" * 60)

TRAINING MHA MODEL


Epoch 1/3: 100%|██████████| 1146/1146 [01:31<00:00, 12.48it/s, loss=0.4095]



Epoch 1 Summary:
  Train Loss: 0.4095
  Val Loss: 0.2125
  Val Perplexity: 1.24



Epoch 2/3: 100%|██████████| 1146/1146 [01:32<00:00, 12.35it/s, loss=0.0845]



Epoch 2 Summary:
  Train Loss: 0.0845
  Val Loss: 0.1206
  Val Perplexity: 1.13



Epoch 3/3: 100%|██████████| 1146/1146 [01:33<00:00, 12.31it/s, loss=0.0419]



Epoch 3 Summary:
  Train Loss: 0.0419
  Val Loss: 0.0996
  Val Perplexity: 1.10

MHA TRAINING COMPLETE!


# Save the MHA Model

In [7]:
# Save the trained model
torch.save({
    'model_state_dict': mha_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_loss': 0.0419,
    'val_loss': 0.0996,
    'val_perplexity': 1.10,
}, 'mha_model.pt')

print("✅ MHA model saved as 'mha_model.pt'")

✅ MHA model saved as 'mha_model.pt'


In [10]:
def generate_text_fixed(model, tokenizer, prompt, max_length=50, top_k=50, device='cuda'):
    """
    Generate text with top-k sampling (more stable)
    """
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    print(f"Prompt: {prompt}")
    print(f"Generating...\n")
    
    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids)
            next_token_logits = logits[0, -1, :]
            
            # TOP-K SAMPLING (key difference!)
            # Only consider top 50 most likely tokens
            top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
            probs = torch.softmax(top_k_logits, dim=-1)
            next_token_idx = torch.multinomial(probs, num_samples=1)
            next_token = top_k_indices[next_token_idx]
            
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
            
            if next_token.item() == tokenizer.eos_token_id:
                break
    
    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return generated_text


# Test with better sampling
prompts = [
    "The history of",
    "In mathematics,",
    "The cat sat on the",
]

for prompt in prompts:
    generated = generate_text_fixed(mha_model, tokenizer, prompt, max_length=30, top_k=50, device=device)
    print(f"Output: {generated}\n")
    print("-" * 60)

Prompt: The history of
Generating...

Output: The history of education of revered of of Interstate fighters of of of intelligence education of of intelligence of ofjohn of of church of of of of of of parish of of

------------------------------------------------------------
Prompt: In mathematics,
Generating...

Output: In mathematics,InIn plInInInInInInInInInInInInInInInInInInInInInInInInInInIn

------------------------------------------------------------
Prompt: The cat sat on the
Generating...

Output: The cat sat on the cat added added catalsuruuru the added on added on added added added added added added added added added added added added added on on added added added

------------------------------------------------------------


In [11]:
# Better Trainning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# ============================================
# CLEAR GPU AND START FRESH
# ============================================
torch.cuda.empty_cache()
device = torch.device('cuda')
print(f"GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")

# ============================================
# BETTER HYPERPARAMETERS (Less Overfitting)
# ============================================

# Create datasets with LONGER sequences
train_dataset = WikiTextDataset(dataset['train'], tokenizer, max_length=512)
val_dataset = WikiTextDataset(dataset['validation'], tokenizer, max_length=512)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

# ============================================
# VALIDATION FUNCTION
# ============================================

def validate_model(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    num_batches = 0
    
    with torch.no_grad():
        for input_ids, target_ids in val_loader:
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)
            
            logits = model(input_ids)
            
            batch_size, seq_len, vocab_size = logits.shape
            logits_flat = logits.view(batch_size * seq_len, vocab_size)
            targets_flat = target_ids.view(batch_size * seq_len)
            
            loss = criterion(logits_flat, targets_flat)
            total_loss += loss.item()
            num_batches += 1
    
    avg_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(avg_loss))
    
    model.train()
    return avg_loss, perplexity.item()

# ============================================
# BETTER TRAINING FUNCTION
# ============================================

def train_model_properly(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    model.to(device)
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        total_loss = 0
        num_batches = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        
        for input_ids, target_ids in pbar:
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)
            
            # Forward
            logits = model(input_ids)
            
            # Loss
            batch_size, seq_len, vocab_size = logits.shape
            logits_flat = logits.view(batch_size * seq_len, vocab_size)
            targets_flat = target_ids.view(batch_size * seq_len)
            loss = criterion(logits_flat, targets_flat)
            
            # Backward with gradient clipping
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevents exploding gradients
            optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
            pbar.set_postfix({'loss': f'{total_loss/num_batches:.4f}'})
        
        # Validation
        val_loss, val_perplexity = validate_model(model, val_loader, criterion, device)
        
        print(f"\nEpoch {epoch+1}:")
        print(f"  Train Loss: {total_loss/num_batches:.4f}")
        print(f"  Val Loss: {val_loss:.4f}")
        print(f"  Val Perplexity: {val_perplexity:.2f}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"  ✅ Best model so far!")
        print()
    
    return val_loss, val_perplexity

# ============================================
# TRAIN MHA MODEL PROPERLY
# ============================================

print("=" * 70)
print("TRAINING MHA MODEL (PROPERLY)")
print("=" * 70)

mha_model = LanguageModel(
    vocab_size=50257,
    max_seq_len=512,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6,
    dropout=0.2  # Higher dropout to reduce overfitting
).to(device)

optimizer = optim.AdamW(mha_model.parameters(), lr=3e-4, weight_decay=0.1)  # Higher weight decay
criterion = nn.CrossEntropyLoss()

print(f"Parameters: {sum(p.numel() for p in mha_model.parameters()):,}")
print(f"GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f} GB\n")

# Train for 5 epochs
mha_val_loss, mha_perplexity = train_model_properly(
    mha_model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5
)

# Save model
torch.save({
    'model_state_dict': mha_model.state_dict(),
    'val_loss': mha_val_loss,
    'val_perplexity': mha_perplexity,
}, 'mha_model_proper.pt')

print("=" * 70)
print(f"MHA TRAINING COMPLETE!")
print(f"Final Val Perplexity: {mha_perplexity:.2f}")
print("=" * 70)

GPU Memory: 1.56 GB
Tokenizing dataset...
Total tokens: 2,347,038
Tokenizing dataset...
Total tokens: 242,643
Train batches: 573
Val batches: 60
TRAINING MHA MODEL (PROPERLY)
Parameters: 70,690,897
GPU Memory: 1.28 GB



Epoch 1/5: 100%|██████████| 573/573 [01:31<00:00,  6.26it/s, loss=6.8247]



Epoch 1:
  Train Loss: 6.8247
  Val Loss: 6.3977
  Val Perplexity: 600.47
  ✅ Best model so far!



Epoch 2/5: 100%|██████████| 573/573 [01:32<00:00,  6.18it/s, loss=5.9716]



Epoch 2:
  Train Loss: 5.9716
  Val Loss: 6.1046
  Val Perplexity: 447.93
  ✅ Best model so far!



Epoch 3/5: 100%|██████████| 573/573 [01:32<00:00,  6.17it/s, loss=5.4670]



Epoch 3:
  Train Loss: 5.4670
  Val Loss: 5.6924
  Val Perplexity: 296.61
  ✅ Best model so far!



Epoch 4/5: 100%|██████████| 573/573 [01:33<00:00,  6.16it/s, loss=3.0203]



Epoch 4:
  Train Loss: 3.0203
  Val Loss: 1.1246
  Val Perplexity: 3.08
  ✅ Best model so far!



Epoch 5/5: 100%|██████████| 573/573 [01:33<00:00,  6.15it/s, loss=0.6486]



Epoch 5:
  Train Loss: 0.6486
  Val Loss: 0.3403
  Val Perplexity: 1.41
  ✅ Best model so far!

MHA TRAINING COMPLETE!
Final Val Perplexity: 1.41


In [12]:
def generate_text_proper(model, tokenizer, prompt, max_length=50, top_k=50, device='cuda'):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids)
            next_token_logits = logits[0, -1, :]
            
            # Top-k sampling
            top_k_logits, top_k_indices = torch.topk(next_token_logits, top_k)
            probs = torch.softmax(top_k_logits, dim=-1)
            next_token_idx = torch.multinomial(probs, num_samples=1)
            next_token = top_k_indices[next_token_idx]
            
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
            
            if next_token.item() == tokenizer.eos_token_id:
                break
    
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


# Test generation
prompts = [
    "The history of",
    "In mathematics,",
    "The cat sat on the",
]

print("\n" + "=" * 70)
print("TESTING MHA MODEL GENERATION")
print("=" * 70)

for prompt in prompts:
    generated = generate_text_proper(mha_model, tokenizer, prompt, max_length=40, device=device)
    print(f"\nPrompt: {prompt}")
    print(f"Output: {generated}")
    print("-" * 70)


TESTING MHA MODEL GENERATION

Prompt: The history of
Output: The history of history largest historyThe history largest 36 largest history hurricanesThe Forces largest 36rd history First history hurricanes charge history history 200 feet foot kmrdrd largest history 36 kilometers kilometers km Run Run chart First history history
----------------------------------------------------------------------

Prompt: In mathematics,
Output: In mathematics, By By By By By By By By By By By By By By By — By By By By By By By By By By By By By By By By By By By By By By By By
----------------------------------------------------------------------

Prompt: The cat sat on the
Output: The cat sat on the the the on on Allied on the on the on Allied on on the set on on the on on Allied on the on on on the on on on set on on the the on on on right on
----------------------------------------------------------------------
