# NOTE: We will revise PyTorch and implement a language model from scratch. Hopefully this will clear up any doubts for the coming weeks.

# PyTorch Basics: Session 1

In [1]:
#Step 1 : Understanding nn.Module
import torch
import torch.nn as nn

class SimpleModule(nn.Module):
    """Smallest possible ``nn.Module``: it owns no parameters and echoes its input."""

    def __init__(self):
        # The parent constructor has to run first so that nn.Module can set up
        # its own infrastructure before any custom attributes are added.
        super().__init__()

    def forward(self, x):
        """Return ``x`` untouched."""
        return x

In [2]:
# Step 1.1: Example
import torch
import torch.nn as nn

class SimpleModule(nn.Module):
    """Parameter-free module whose forward pass doubles its input."""

    def __init__(self):
        # Let nn.Module initialise itself before anything else happens.
        super().__init__()

    def forward(self, x):
        """Return ``x`` multiplied by two."""
        doubled = x * 2
        return doubled

# Instantiate the module defined in this cell
model = SimpleModule()

# Run a small 1-D float tensor through it
x = torch.tensor([1.0, 2.0, 3.0])
output = model(x)  # Calling the module object runs forward() automatically

# Show input and output side by side
print(f"Input: {x}")
print(f"Output: {output}")

Input: tensor([1., 2., 3.])
Output: tensor([2., 4., 6.])


In [3]:
#Step 2 :Understanding nn.Parameter
import torch
import torch.nn as nn

class ModuleWithWeight(nn.Module):
    """Module holding one learnable vector that scales its input element-wise.

    Args:
        size: Shape argument forwarded to ``torch.randn`` for the initial weight.
    """

    def __init__(self, size):
        super().__init__()

        # Wrapping the tensor in nn.Parameter is what makes it LEARNABLE:
        # it is registered on the module and has requires_grad=True.
        initial_values = torch.randn(size)
        self.weight = nn.Parameter(initial_values)

    def forward(self, x):
        """Return ``x`` multiplied element-wise by the weight."""
        scaled = x * self.weight
        return scaled

# Instantiate the module with a weight vector of 3 elements
model = ModuleWithWeight(3)

# Multiply a fixed input by the (randomly initialised) weight
x = torch.tensor([1.0, 2.0, 3.0])
output = model(x)

# Print the input, the parameter itself, the product, and the parameter's
# requires_grad flag
print(f"Input: {x}")
print(f"Weight: {model.weight}")
print(f"Output: {output}")
print(f"\nIs weight learnable? {model.weight.requires_grad}")

Input: tensor([1., 2., 3.])
Weight: Parameter containing:
tensor([-0.4903,  1.9947,  0.6297], requires_grad=True)
Output: tensor([-0.4903,  3.9895,  1.8890], grad_fn=<MulBackward0>)

Is weight learnable? True


In [4]:
# Step 3 : Understanding nn.Linear
import torch
import torch.nn as nn

# Build a linear layer that maps 512 input features to 256 output features
input_dim, output_dim = 512, 256
linear = nn.Linear(in_features=input_dim, out_features=output_dim)

# Inspect the parameters the layer created for itself:
# the weight is stored as [output_dim, input_dim] and the bias as [output_dim]
print(f"Weight shape: {linear.weight.shape}")
print(f"Bias shape: {linear.bias.shape}")
print(f"Weight requires_grad: {linear.weight.requires_grad}")

Weight shape: torch.Size([256, 512])
Bias shape: torch.Size([256])
Weight requires_grad: True


In [5]:
import torch
import torch.nn as nn

# Linear layer 512 -> 256; its weight matrix has shape [256, 512]
linear = nn.Linear(512, 256)

# Random input of shape [batch_size, seq_len, 512]
batch_size, seq_len = 32, 10
x = torch.randn((batch_size, seq_len, 512))

# The layer acts on the last dimension only; leading dimensions are kept
output = linear(x)

print(f"Input shape:  {x.shape}")      # [32, 10, 512]
print(f"Output shape: {output.shape}")  # [32, 10, 256]

Input shape:  torch.Size([32, 10, 512])
Output shape: torch.Size([32, 10, 256])


# Building Token Embeddings

In [6]:
import torch
import torch.nn as nn

#Start by inheriting from nn.Module
class TokenEmbedding(nn.Module):
    """Lookup table that turns integer token IDs into dense vectors.

    Args:
        vocab_size: Number of rows in the table (one per token ID).
        embed_dim: Length of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        # Run nn.Module's constructor first so sub-modules can be registered.
        super().__init__()
        # nn.Embedding allocates a randomly initialised [vocab_size, embed_dim]
        # table and registers it as a learnable parameter of this module.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )

    def forward(self, x):
        """Return the embedding vector for every token ID in ``x``."""
        vectors = self.embedding(x)
        return vectors
    
# Create the embedding layer
vocab_size = 1000  # Size of the vocabulary
d_model = 512    # Dimension of the embeddings
token_embed= TokenEmbedding(vocab_size, d_model)

# Test: draw random token IDs in [0, vocab_size) with shape [2, 10] and embed them
token_ids=torch.randint(0, vocab_size, (2,10))
output= token_embed (token_ids)
print(f"Input token IDs shape: {token_ids.shape}")  # [2, 10]
print(f"Output embeddings shape: {output.shape}")    # [2, 10, 512]

Input token IDs shape: torch.Size([2, 10])
Output embeddings shape: torch.Size([2, 10, 512])


# Position embeddings

In [7]:
import torch
import torch.nn as nn

class PositionEmbedding(nn.Module):
    """Learnable position embeddings.

    The original Transformer paper used fixed sinusoidal position encodings;
    here an ``nn.Embedding`` table of shape ``(max_seq_len, d_model)`` is
    learned instead, so gradients flow into it during training.

    Args:
        max_seq_len: Number of positions the table can represent.
        d_model: Length of each position vector.
    """

    def __init__(self, max_seq_len, d_model):
        # Initialise nn.Module so the embedding below gets registered.
        super().__init__()
        self.position_embedding = nn.Embedding(max_seq_len, d_model)

    def forward(self, seq_len):
        """Return the embeddings of positions ``0 .. seq_len - 1``.

        Args:
            seq_len: Number of leading positions to look up.

        Returns:
            Tensor of shape ``(seq_len, d_model)``.
        """
        # Build the indices on the same device as the table; a bare
        # torch.arange(seq_len) always lands on the default device and the
        # lookup fails once the module has been moved elsewhere.
        table_device = self.position_embedding.weight.device
        positions = torch.arange(seq_len, device=table_device)
        return self.position_embedding(positions)
    
# Create a position table with 2048 positions of 512 dimensions each
pos_embed= PositionEmbedding(max_seq_len=2048, d_model=512)

output = pos_embed(10)  # Look up the first 10 positions
print(f"Output position embeddings shape: {output.shape}")  # [10, 512]
# Print the indices 0..9 for reference
print(f"Positions Used: {torch.arange(10)}")

Output position embeddings shape: torch.Size([10, 512])
Positions Used: tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


# Combine Token and Position Embeddings

In [8]:
import torch
import torch.nn as nn

class Embeddings(nn.Module):
    """Sum of token embeddings and learned position embeddings.

    1. A token embedding layer converts token IDs into dense vectors.
    2. A position embedding layer adds positional information to them.

    Both tables are learnable parameters of the model.

    Args:
        vocab_size: Number of distinct token IDs.
        max_seq_len: Longest sequence the position table can represent.
        d_model: Length of every embedding vector.
    """

    def __init__(self, vocab_size, max_seq_len, d_model):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_seq_len, d_model)

    def forward(self, token_ids):
        """Embed ``token_ids`` and add position information.

        Args:
            token_ids: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.

        Raises:
            IndexError: If ``seq_len`` exceeds the size of the position table.
        """
        # Unpacking into two names also rejects inputs that are not 2-D.
        _, seq_len = token_ids.shape

        # Fail early with a readable message instead of an out-of-range
        # lookup inside nn.Embedding.
        max_seq_len = self.pos_embed.num_embeddings
        if seq_len > max_seq_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds max_seq_len {max_seq_len}"
            )

        tokens = self.token_embed(token_ids)
        # torch.arange yields the position indices 0..seq_len-1, created on
        # the same device as the input, for the position lookup table.
        positions = torch.arange(seq_len, device=token_ids.device)
        pos = self.pos_embed(positions)

        # (seq_len, d_model) broadcasts over the batch dimension.
        return tokens + pos

# Test: build the combined embedding layer and feed it random token IDs
embeddings = Embeddings(vocab_size=1000, max_seq_len=2048, d_model=512)
token_ids = torch.randint(0, 1000, (2, 10))  # [2, 10]
output = embeddings(token_ids)

# The output gains a trailing d_model dimension compared to the input
print(f"Input shape: {token_ids.shape}")
print(f"Output shape: {output.shape}")

Input shape: torch.Size([2, 10])
Output shape: torch.Size([2, 10, 512])


# Layer Normalization

In [10]:
#Normalizes features to have zero mean and unit variance across the feature dimension
import torch
import torch.nn as nn

# Use PyTorch's built-in LayerNorm over the last (feature) dimension
d_model = 512
layer_norm = nn.LayerNorm(d_model)

# Test on random data
x = torch.randn(2, 10, d_model)  # [batch_size, seq_len, d_model]
output = layer_norm(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
# NOTE: .mean() / .std() without a dim argument reduce over ALL elements of
# the tensor, so the numbers below are whole-tensor statistics rather than
# per-position statistics over the feature dimension.
print(f"\nBefore LayerNorm:")
print(f"  Mean: {x.mean():.4f}")
print(f"  Std: {x.std():.4f}")
print(f"\nAfter LayerNorm:")
print(f"  Mean: {output.mean():.4f}")
print(f"  Std: {output.std():.4f}")

# NOTE: LayerNorm has two learnable parameters per feature: weight and bias.
# They let the model scale and shift the normalized output.
# Both have one entry per feature:
print(f"Gamma (scale): {layer_norm.weight.shape}")  # [512]
print(f"Beta (shift): {layer_norm.bias.shape}")     # [512]

Input shape: torch.Size([2, 10, 512])
Output shape: torch.Size([2, 10, 512])

Before LayerNorm:
  Mean: -0.0006
  Std: 0.9969

After LayerNorm:
  Mean: 0.0000
  Std: 1.0000
Gamma (scale): torch.Size([512])
Beta (shift): torch.Size([512])


# FFN (Feed Forward Network)

In [11]:
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand, ReLU, contract.

    Args:
        d_model: Model dimension (512)
        d_ff: Hidden dimension (2048, typically 4x d_model)
        dropout: Dropout rate
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        # Expansion and contraction projections
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

        # Non-linearity between the two projections
        self.relu = nn.ReLU()

        # One dropout module, used after each projection
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the network to ``x`` of shape ``[batch, seq, d_model]``.

        Returns a tensor with the same shape as ``x``.
        """
        # Expand (d_model -> d_ff), activate, then regularise
        hidden = self.dropout(self.relu(self.linear1(x)))

        # Contract (d_ff -> d_model) and regularise once more
        return self.dropout(self.linear2(hidden))
    
# Create the FFN with a hidden size of 4x the model dimension
ffn = FeedForward(d_model=512, d_ff=2048)

# Test: input and output share the same shape
x = torch.randn(2, 10, 512)  # [batch, seq, d_model]
output = ffn(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")


Input shape: torch.Size([2, 10, 512])
Output shape: torch.Size([2, 10, 512])


# Residual Connection 


In [12]:
import torch

# Input activations
x = torch.randn(2, 10, 512)

# Stand-in for the result of some sub-layer (pretend it's attention)
transformed = torch.randn(x.shape)

# Without a residual the sub-layer output is used as-is ...
output_no_residual = transformed

# ... with a residual the original input is added back on top
output_with_residual = torch.add(x, transformed)

print(f"Input: {x.shape}")
print(f"Transformed: {transformed.shape}")
print(f"With residual: {output_with_residual.shape}")
print("\nIt's just addition!")

Input: torch.Size([2, 10, 512])
Transformed: torch.Size([2, 10, 512])
With residual: torch.Size([2, 10, 512])

It's just addition!


# Multi Head Attention

In [13]:
import torch
import torch.nn as nn
import math

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: Size of the input and output feature dimension.
        num_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout rate applied to the attention weights.
        causal: When True, position ``i`` may only attend to positions
            ``<= i`` (needed for next-token prediction). Defaults to False,
            which keeps full bidirectional attention.
    """

    def __init__(self, d_model, num_heads, dropout=0.1, causal=False):
        super().__init__()  # Set up nn.Module before registering sub-modules
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Stored so forward() can reshape without re-deriving them
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.causal = causal

        # nn.Linear computes y = x @ W^T + b. Each projection below holds a
        # d_model x d_model weight matrix using nn.Linear's default init.
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)

        # Final projection applied after the heads are concatenated again
        self.out_proj = nn.Linear(d_model, d_model)

        # Randomly zeroes attention weights during training to reduce overfitting
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, t, batch_size, seq_len):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, head_dim)``."""
        t = t.view(batch_size, seq_len, self.num_heads, self.head_dim)
        # Heads go before the sequence axis so each head is attended separately
        return t.transpose(1, 2)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()

        # Project the input into queries, keys and values:
        #   Q: what am I looking for?
        #   K: what do I contain?
        #   V: what information do I carry?
        Q = self._split_heads(self.q_proj(x), batch_size, seq_len)
        K = self._split_heads(self.k_proj(x), batch_size, seq_len)
        V = self._split_heads(self.v_proj(x), batch_size, seq_len)

        # scores[..., i, j]: how strongly token i attends to token j.
        # Dividing by sqrt(head_dim) keeps softmax from saturating.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if self.causal:
            # True strictly above the diagonal marks "future" positions.
            # The diagonal stays visible, so no row is fully masked.
            future = torch.ones(
                seq_len, seq_len, dtype=torch.bool, device=scores.device
            ).triu(diagonal=1)
            scores = scores.masked_fill(future, float("-inf"))

        # Convert scores to probabilities over the key positions
        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (batch, heads, seq, seq) @ (batch, heads, seq, head_dim):
        # every position becomes an attention-weighted mix of the values
        output = torch.matmul(attn_weights, V)

        # Move heads back next to head_dim. transpose() returns a
        # non-contiguous view, so contiguous() is required before view().
        output = output.transpose(1, 2).contiguous()
        # Concatenate the heads: (batch, seq, heads * head_dim)
        output = output.view(batch_size, seq_len, self.d_model)

        return self.out_proj(output)


# Test the implementation
if __name__ == "__main__":
    # Create an attention layer with 8 heads over a 512-dim model
    model = MultiHeadAttention(d_model=512, num_heads=8, dropout=0.1)
    
    # Random input of shape [batch_size, seq_len, 512]
    batch_size = 32
    seq_len = 10
    x = torch.randn(batch_size, seq_len, 512)
    
    # Forward pass; the output keeps the input shape
    output = model(x)
    
    print(f"Input shape:  {x.shape}")       # [32, 10, 512]
    print(f"Output shape: {output.shape}")   # [32, 10, 512]
    print("Multi-head attention works!")

Input shape:  torch.Size([32, 10, 512])
Output shape: torch.Size([32, 10, 512])
Multi-head attention works!


# Creating a Transformer Block
```
Input
  ↓
LayerNorm
  ↓
Multi-Head Attention 
  ↓
Residual (add input back)
  ↓
LayerNorm
  ↓
Feed-Forward Network
  ↓
Residual (add input back)
  ↓
Output
```

In [15]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block.

    Data flow: LayerNorm -> multi-head attention -> residual add, then
    LayerNorm -> feed-forward network -> residual add.

    Args:
        d_model: Model dimension.
        num_heads: Number of attention heads.
        d_ff: Hidden dimension of the feed-forward network.
        dropout: Dropout rate shared by the sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Multi-Head Attention
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)

        # Feed-Forward Network
        self.ffn = FeedForward(d_model, d_ff, dropout)

        # One LayerNorm in front of each sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout applied to the attention output before the residual add
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run ``x`` of shape ``[batch, seq, d_model]`` through the block.

        Returns a tensor with the same shape as ``x``.
        """
        # ============================================
        # BLOCK 1: Multi-Head Attention + Residual
        # ============================================
        # The attention module only drops attention weights, so its output
        # gets dropout here before being added back to the input.
        attended = self.attention(self.norm1(x))
        x = x + self.dropout(attended)

        # ============================================
        # BLOCK 2: Feed-Forward Network + Residual
        # ============================================
        # FeedForward.forward already ends with dropout on its output.
        # Applying self.dropout again would drop this branch twice and raise
        # the effective rate above the configured value.
        x = x + self.ffn(self.norm2(x))

        return x
    
# Build a single block: 512-dim model, 8 heads, 2048-dim FFN
block = TransformerBlock(d_model=512, num_heads=8, d_ff=2048)

# Input
x = torch.randn(2, 10, 512)  # [batch=2, seq=10, d_model=512]

# Forward pass; the block preserves the input shape
output = block(x)

print(f"Input shape:  {x.shape}")
print(f"Output shape: {output.shape}")
print("✅ Transformer Block works!")

Input shape:  torch.Size([2, 10, 512])
Output shape: torch.Size([2, 10, 512])
✅ Transformer Block works!


# Now we need to stack multiple Transformer blocks
 

In [16]:
# nn.ModuleList is a pytorch container that creates a list of modules that Pytorch can track

import torch
import torch.nn as nn

class Transformer(nn.Module):
    """Embedding layer followed by a stack of transformer blocks and a final LayerNorm.

    Args:
        vocab_size: Size of vocabulary (e.g., 50000)
        max_seq_len: Maximum sequence length (e.g., 2048)
        d_model: Model dimension (e.g., 512)
        num_heads: Number of attention heads (e.g., 8)
        d_ff: FFN hidden dimension (e.g., 2048)
        num_layers: Number of transformer blocks to stack (e.g., 6)
        dropout: Dropout rate
    """

    def __init__(self, vocab_size, max_seq_len, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()

        # Token + position embeddings
        self.embeddings = Embeddings(vocab_size, max_seq_len, d_model)

        # nn.ModuleList (rather than a plain list) so PyTorch tracks every block
        self.blocks = nn.ModuleList(
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        )

        # Normalisation applied once after the last block
        self.norm = nn.LayerNorm(d_model)

    def forward(self, token_ids):
        """Map ``token_ids`` of shape ``[batch, seq]`` to hidden states ``[batch, seq, d_model]``."""
        hidden = self.embeddings(token_ids)  # [batch, seq, d_model]

        # Feed the running hidden state through each block in order
        for layer in self.blocks:
            hidden = layer(hidden)

        return self.norm(hidden)
    
# Create model with 6 stacked blocks
model = Transformer(
    vocab_size=1000,
    max_seq_len=2048,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6,  # 6 transformer blocks!
    dropout=0.1
)

# Test with random token IDs drawn from [0, 1000)
token_ids = torch.randint(0, 1000, (2, 10))  # [batch=2, seq=10]
output = model(token_ids)

# The model returns one d_model-sized hidden state per input token
print(f"Input shape:  {token_ids.shape}")
print(f"Output shape: {output.shape}")
print(f"Number of blocks: {len(model.blocks)}")

Input shape:  torch.Size([2, 10])
Output shape: torch.Size([2, 10, 512])
Number of blocks: 6


# Final Output Head

In [18]:
import torch
import torch.nn as nn

class LanguageModel(nn.Module):
    """Transformer stack with a linear head that produces vocabulary logits.

    Args:
        vocab_size: Number of token IDs; also the size of the output logits.
        max_seq_len: Longest sequence the position embeddings support.
        d_model: Model dimension.
        num_heads: Number of attention heads per block.
        d_ff: Hidden dimension of each feed-forward network.
        num_layers: Number of transformer blocks.
        dropout: Dropout rate.
    """

    def __init__(self, vocab_size, max_seq_len, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()

        # Token + position embeddings
        self.embeddings = Embeddings(vocab_size, max_seq_len, d_model)

        # Stack of transformer blocks, tracked through nn.ModuleList
        self.blocks = nn.ModuleList(
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        )

        # Normalisation applied after the last block
        self.norm = nn.LayerNorm(d_model)

        # Output head: projects every hidden state onto the vocabulary
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, token_ids):
        """Return logits of shape ``[batch, seq, vocab_size]`` for ``token_ids`` of shape ``[batch, seq]``.

        Logits are the raw, unnormalized scores; softmax turns them into
        probabilities.
        """
        # NOTE(review): the attention layer shown in this notebook applies no
        # causal mask, so each position can see later tokens. Confirm this is
        # intended before training with shifted next-token targets.
        hidden = self.embeddings(token_ids)  # [batch, seq, d_model]

        for layer in self.blocks:
            hidden = layer(hidden)

        hidden = self.norm(hidden)  # [batch, seq, d_model]

        # One score per vocabulary entry at every position
        return self.lm_head(hidden)
    
# Create complete language model
model = LanguageModel(
    vocab_size=1000,
    max_seq_len=2048,
    d_model=512,
    num_heads=8,
    d_ff=2048,
    num_layers=6,
    dropout=0.1
)

# Test with random token IDs; the last output dimension equals vocab_size
token_ids = torch.randint(0, 1000, (2, 10))
logits = model(token_ids)

print(f"Input shape:  {token_ids.shape}")      # [2, 10]
print(f"Output shape: {logits.shape}")         # [2, 10, 1000]

Input shape:  torch.Size([2, 10])
Output shape: torch.Size([2, 10, 1000])


# Load Dataset

In [23]:
%pip install datasets transformers hf_transfer

Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m22.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: hf_transfer
Successfully installed hf_transfer-0.1.9
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
# Set before `datasets` is imported.
# NOTE(review): '0' presumably turns off the hf_transfer download backend —
# confirm against the huggingface_hub documentation.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'
from datasets import load_dataset

# Load WikiText-2 (raw variant)
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

# Report the size of each split
print("Dataset loaded!")
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset loaded!
Train samples: 36718
Validation samples: 3760
Test samples: 4358

First training example:



In [27]:
# Scan the first 10 training rows and show the first one that has any text
print("\nLooking for non-empty examples:")
for i in range(10):
    text = dataset['train'][i]['text'].strip()
    if not text:
        # Whitespace-only row: keep looking
        continue
    print(f"\nExample {i}:")
    print(text[:300])  # First 300 chars
    break


Looking for non-empty examples:

Example 1:
= Valkyria Chronicles III =


# Tokenization

In [29]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 tokenizer needs a pad token (it doesn't have one by default),
# so the end-of-sequence token is reused for padding
tokenizer.pad_token = tokenizer.eos_token

print(f"Vocabulary size: {len(tokenizer)}")
print(f"Example tokens:")

# Round trip: text -> token IDs -> text
text = "The cat sat on the mat"
tokens = tokenizer.encode(text)
print(f"\nText: {text}")
print(f"Token IDs: {tokens}")
print(f"Decoded back: {tokenizer.decode(tokens)}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Vocabulary size: 50257
Example tokens:

Text: The cat sat on the mat
Token IDs: [464, 3797, 3332, 319, 262, 2603]
Decoded back: The cat sat on the mat


# Create a PyTorch Dataset


In [33]:
"""
NOTE:
1.So mostly when we create a pytorch dataset
we need three things:
- __init__ : to initialize the dataset object, load data, and set up any necessary variables.
- __len__ : to return the total number of samples in the dataset.
- __getitem__ : to retrieve a single sample from the dataset given an index.
"""
from torch.utils.data import Dataset
class MyDataset(Dataset):
    """Bare skeleton showing the three methods a dataset class provides.

    Every method is an intentionally empty placeholder and returns ``None``.
    """

    def __init__(self):
        """Initialize: load/prepare data (placeholder)."""

    def __len__(self):
        """Return how many samples there are (placeholder)."""

    def __getitem__(self, idx):
        """Return sample number ``idx`` (placeholder)."""

import torch
from torch.utils.data import Dataset

class SimpleDataset(Dataset):
    """Toy dataset wrapping a fixed list of five numbers."""

    def __init__(self):
        # The samples themselves
        self.data = [10, 20, 30, 40, 50]

    def __len__(self):
        """Number of samples (5)."""
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.data[idx]
        return sample

# Create dataset (note: this rebinds the notebook-level name `dataset`)
dataset = SimpleDataset()

# len() goes through __len__, indexing goes through __getitem__
print(f"Dataset has {len(dataset)} samples")  # 5
print(f"Sample 0: {dataset[0]}")  # 10
print(f"Sample 2: {dataset[2]}")  # 30

Dataset has 5 samples
Sample 0: 10
Sample 2: 30


In [34]:
import torch
from torch.utils.data import Dataset

class InputTargetDataset(Dataset):
    """Toy dataset that pairs every number in a sequence with its successor."""

    def __init__(self):
        # The underlying sequence
        self.data = [10, 20, 30, 40, 50, 60]

    def __len__(self):
        """Number of (input, target) pairs.

        The last element has no successor, so there is one pair fewer than
        there are elements.
        """
        return len(self.data) - 1

    def __getitem__(self, idx):
        """Return ``(data[idx], data[idx + 1])``: the current number and the next one."""
        return self.data[idx], self.data[idx + 1]

# Create dataset (note: this rebinds the notebook-level name `dataset`)
dataset = InputTargetDataset()

print(f"Dataset has {len(dataset)} samples\n")

# Walk over every valid index and print its (input, target) pair
for i in range(len(dataset)):
    input_val, target_val = dataset[i]
    print(f"Sample {i}: input={input_val}, target={target_val}")


Dataset has 5 samples

Sample 0: input=10, target=20
Sample 1: input=20, target=30
Sample 2: input=30, target=40
Sample 3: input=40, target=50
Sample 4: input=50, target=60


In [35]:
import torch
from torch.utils.data import Dataset

class WikiTextDataset(Dataset):
    """Fixed-length next-token-prediction samples cut from one long token stream.

    All non-empty rows of ``data`` are tokenized and concatenated. Sample
    ``idx`` is the ``max_length`` tokens starting at ``idx * max_length``,
    and its target is the same window shifted right by one token.

    Args:
        data: Iterable of rows with a ``'text'`` entry (e.g. ``dataset['train']``).
        tokenizer: Object with an ``encode(text)`` method returning token IDs
            (e.g. ``GPT2Tokenizer``).
        max_length: Sequence length of every sample (512).
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.max_length = max_length

        # Step 1: Tokenize ALL text into one long list
        print("Tokenizing dataset...")
        all_tokens = []
        for example in data:
            text = example['text'].strip()
            if text:  # Skip empty lines
                all_tokens.extend(tokenizer.encode(text))

        # Step 2: Convert to PyTorch tensor
        self.tokens = torch.tensor(all_tokens, dtype=torch.long)
        print(f"Total tokens: {len(self.tokens):,}")

    def __len__(self):
        """Number of samples whose input AND target are both full length.

        Every sample needs ``max_length + 1`` tokens, because the target
        reaches one token past the input. Hence the ``- 1``: without it, a
        stream whose length is an exact multiple of ``max_length`` would end
        in a sample with a short target, which cannot be batched.
        """
        return max(0, (len(self.tokens) - 1) // self.max_length)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)``, each of length ``max_length``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Raising also lets a plain ``for`` loop over the dataset stop.
        """
        num_sequences = len(self)
        if idx < 0:
            # Standard Python negative indexing
            idx += num_sequences
        if not 0 <= idx < num_sequences:
            raise IndexError(
                f"index out of range for dataset with {num_sequences} sequences"
            )

        start = idx * self.max_length
        end = start + self.max_length

        # Input: tokens[start:end]
        # Target: tokens[start+1:end+1] (shifted by one!)
        input_ids = self.tokens[start:end]
        target_ids = self.tokens[start + 1:end + 1]

        return input_ids, target_ids

In [36]:
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load data and tokenizer again; the name `dataset` was rebound to the toy
# datasets in the cells above
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Create our dataset from the training split
train_dataset = WikiTextDataset(
    dataset['train'],
    tokenizer,
    max_length=512
)

print(f"\nTotal sequences: {len(train_dataset)}")

# Get first sample; the target is the input shifted by one token
input_ids, target_ids = train_dataset[0]
print(f"\nSample 0:")
print(f"Input shape: {input_ids.shape}")
print(f"Target shape: {target_ids.shape}")
print(f"First 10 input tokens: {input_ids[:10]}")
print(f"First 10 target tokens: {target_ids[:10]}")

Tokenizing dataset...
Total tokens: 2,347,038

Total sequences: 4584

Sample 0:
Input shape: torch.Size([512])
Target shape: torch.Size([512])
First 10 input tokens: tensor([   28,   569, 18354,  7496, 17740,  6711,   796, 10445,    73, 13090])
First 10 target tokens: tensor([  569, 18354,  7496, 17740,  6711,   796, 10445,    73, 13090,   645])


# Now we need to batch our data (DataLoader)

In [37]:
from torch.utils.data import DataLoader

# Create DataLoader
train_loader = DataLoader(
    train_dataset,
    batch_size=32,      # 32 sequences per batch
    shuffle=True,       # Shuffle data each epoch
    num_workers=0       # Data loading processes (0 = main process)
)

print(f"Total batches: {len(train_loader)}")
# Read the value back from the loader so the printout cannot drift from the
# batch_size actually configured above
print(f"Batch size: {train_loader.batch_size}")

# Get first batch; each batch is an (inputs, targets) pair
batch = next(iter(train_loader))
input_ids, target_ids = batch

print(f"\nFirst batch:")
print(f"Input shape:  {input_ids.shape}")   # [32, 512]
print(f"Target shape: {target_ids.shape}")  # [32, 512]


Total batches: 144
Batch size: 32

First batch:
Input shape:  torch.Size([32, 512])
Target shape: torch.Size([32, 512])
