# MiniGPT: A Lightweight Transformer-based Language Model

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
from datasets import load_dataset

## Tokenizer Setup

In [2]:
# Pretrained GPT-2 byte-pair tokenizer (downloaded or read from the local cache).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Batches are padded to a fixed length later on, so a pad token must exist.
# Register "[PAD]" as a new special token only when none is configured; this
# grows len(tokenizer) by one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

## Model Components

In [3]:
# Single-head scaled dot-product self-attention
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention without masking.

    Queries, keys and values are separate linear projections of the same
    input. Every position attends to every position along the second-to-last
    axis of the input.

    Args:
        embed_dim: Size of the last input axis; also the projection width.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        # 1 / sqrt(embed_dim): keeps dot products in a range softmax handles well.
        self.scale = embed_dim ** -0.5

    def forward(self, x):
        """Return the attention-weighted values; the output has the shape of ``x``."""
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        # Pairwise similarity between positions, scaled, then normalised per row.
        similarity = (queries @ keys.transpose(-2, -1)) * self.scale
        weights = similarity.softmax(dim=-1)

        # Each output row is a convex combination of the value rows.
        return weights @ values

## Transformer Block

In [4]:

class TransformerBlock(nn.Module):
    """Post-norm Transformer block: multi-head self-attention + feed-forward.

    Inputs and outputs are laid out as ``(batch, seq_len, embed_dim)``.
    By default the attention is causal: position ``i`` may only attend to
    positions ``<= i``, which is required for next-token language modelling.

    Args:
        embed_dim: Width of the token representations.
        ff_dim: Hidden width of the position-wise feed-forward network.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        causal: When True (default) apply a causal attention mask. Pass
            False for bidirectional (encoder-style) attention.
    """

    def __init__(self, embed_dim, ff_dim, num_heads, causal=True):
        super().__init__()
        self.causal = causal
        # batch_first=True: nn.MultiheadAttention otherwise interprets axis 0
        # as the sequence axis and would attend across the batch.
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )

    def forward(self, x):
        """Apply attention and feed-forward sub-layers with residual connections.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_mask = None
        if self.causal:
            seq_len = x.size(1)
            # Boolean mask, True = "may NOT attend": strictly upper triangle
            # blocks every key position that lies after the query position.
            attn_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
                diagonal=1,
            )

        # Self-attention; the averaged attention weights are not needed.
        attn_output, _ = self.attention(
            x, x, x, attn_mask=attn_mask, need_weights=False
        )
        x = self.norm1(x + attn_output)

        # Position-wise feed-forward
        ff_output = self.ff(x)
        x = self.norm2(x + ff_output)
        return x

## Mini GPT Model

In [5]:
class MiniGPT(nn.Module):
    """Small GPT-style language model: embeddings, Transformer blocks, LM head.

    Token embeddings are summed with learned position embeddings so the
    model can distinguish the same token at different positions.

    Args:
        vocab_size: Number of token ids the model can embed and predict.
        embed_dim: Width of the token/position representations.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        ff_dim: Hidden width of each block's feed-forward network.
        num_heads: Attention heads per block.
        max_seq_len: Longest sequence (in tokens) the position table supports.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, ff_dim, num_heads,
                 max_seq_len=1024):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.transformers = nn.ModuleList([
            TransformerBlock(embed_dim, ff_dim, num_heads)
            for _ in range(num_layers)
        ])
        self.output = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map token ids to next-token logits.

        Args:
            x: Integer tensor of token ids whose last axis is the sequence
                axis, e.g. ``(batch, seq_len)``.

        Returns:
            Logits of shape ``(*x.shape, vocab_size)``.

        Raises:
            ValueError: If the sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(-1)
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )

        # Token + position embeddings; positions broadcast over leading axes.
        positions = torch.arange(seq_len, device=x.device)
        x = self.embedding(x) + self.position_embedding(positions)

        # Transformer layers
        for layer in self.transformers:
            x = layer(x)

        # Project back to vocabulary logits
        logits = self.output(x)
        return logits

## Dataset Preparation

In [6]:
# Load the WikiText-2 (raw) training split.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Drop rows whose text is empty or whitespace-only: after padding they would
# become sequences made up entirely of pad tokens and carry no training signal.
dataset = dataset.filter(lambda example: bool(example["text"].strip()))


def tokenize_function(examples):
    """Tokenize a batch of rows to fixed-length (128-token) sequences.

    Args:
        examples: Batch mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer output for the batch; texts longer than 128 tokens are
        truncated and shorter ones are padded up to 128.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs, as torch tensors, to the DataLoader.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Shuffled mini-batches of 8 sequences.
dataloader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True)

## Model Training 

In [None]:
# Model and optimizer setup
# len(tokenizer) counts any special tokens added after loading, so the
# embedding table and output layer built from it already have the right size.
vocab_size = len(tokenizer)
model = MiniGPT(vocab_size, embed_dim=1024, num_layers=8, ff_dim=2048, num_heads=8)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# Targets equal to -100 are excluded from the loss; used below for padding.
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

# Training loop
for epoch in range(5):  # Example: 5 epochs
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        # Shift labels for causal language modeling: position t predicts
        # token t + 1. Targets that are padding (attention_mask == 0) are
        # replaced by the ignore index so they do not contribute to the loss.
        labels = input_ids[:, 1:].clone()
        labels[attention_mask[:, 1:] == 0] = -100

        # A batch without a single real target would give a NaN mean loss.
        if not (labels != -100).any():
            continue

        optimizer.zero_grad()
        logits = model(input_ids)[:, :-1, :]

        loss = loss_fn(logits.reshape(-1, vocab_size), labels.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Average over the batches that actually contributed a loss value.
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / max(num_batches, 1)}")


## Text Generation 

In [None]:
def _top_p_filter(logits, top_p):
    """Mask out tokens outside the smallest set whose probability mass exceeds ``top_p``.

    Works row-wise on the last axis and always keeps the most likely token.
    Returns a new tensor in which removed tokens have logit ``-inf``.
    """
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

    # Remove everything after the first token that pushes the mass above
    # top_p; shifting right by one keeps that boundary token itself.
    remove_sorted = cumulative_probs > top_p
    remove_sorted[..., 1:] = remove_sorted[..., :-1].clone()
    remove_sorted[..., 0] = False

    # The mask is in sorted order; scatter it back to vocabulary order
    # before applying it to the (unsorted) logits.
    remove = torch.zeros_like(remove_sorted).scatter(-1, sorted_indices, remove_sorted)
    return logits.masked_fill(remove, float("-inf"))


def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_p=0.9):
    """Autoregressively sample a continuation of ``prompt``.

    Args:
        model: Callable module mapping token ids ``(1, seq)`` to logits
            ``(1, seq, vocab)``. It is switched to eval mode.
        tokenizer: Tokenizer used to encode the prompt and decode the result;
            its ``eos_token_id`` ends generation early.
        prompt: Text to continue.
        max_length: Maximum number of NEW tokens to generate.
        temperature: Softmax temperature; must be > 0. Lower is greedier.
        top_p: Nucleus sampling threshold in ``(0, 1]``.

    Returns:
        The decoded prompt followed by the generated tokens.

    Raises:
        ValueError: If ``temperature`` or ``top_p`` is out of range.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if not 0 < top_p <= 1:
        raise ValueError(f"top_p must be in (0, 1], got {top_p}")

    model.eval()

    # Run on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

    for _ in range(max_length):
        with torch.no_grad():
            # Logits for the last position only, scaled by temperature.
            logits = model(input_ids)[:, -1, :] / temperature

            # Nucleus (top-p) filtering, then sample from what is left.
            probabilities = torch.softmax(_top_p_filter(logits, top_p), dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)

        # Append the generated token to the sequence
        input_ids = torch.cat((input_ids, next_token), dim=1)

        # Stop if EOS token is generated
        if next_token.item() == tokenizer.eos_token_id:
            break

    return tokenizer.decode(input_ids[0])

In [None]:
# Sample a continuation of a fixed prompt from the model and show it.
prompt = "Machine learning is a subset of artificial intelligence that"
output = generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    temperature=0.8,  # slightly sharper than the raw distribution
    top_p=0.9,        # nucleus sampling threshold
)
print("Generated Text:", output)