1️⃣ Setup: Install & Import Dependencies
This section installs the required libraries and imports everything used in the rest of the notebook.

In [None]:
# Install dependencies (if running on Google Colab)
!pip install torch transformers datasets tqdm matplotlib


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt


2️⃣ Load & Preprocess Dataset
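We load the Simple English Wikipedia dataset (`20220301.simple`) and tokenize it with the GPT-2 tokenizer, truncating or padding every article to 128 tokens.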

In [None]:
# Load Wikipedia dataset (training split of the "20220301.simple" configuration)
# NOTE(review): this relies on the script-based "wikipedia" dataset; recent
# `datasets` releases may not support it any more — confirm against the installed version.
dataset = load_dataset("wikipedia", "20220301.simple", split="train")

# Load GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 ships without a padding token, so tokenizing with padding="max_length"
# would raise. Reuse the end-of-text token for padding; this keeps the
# vocabulary size unchanged.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every entry is truncated and padded to exactly 128 tokens using the
    module-level ``tokenizer``; the tokenizer's output is returned unchanged.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(
        raw_texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize in batches, then drop the raw "text" column now that token ids exist
dataset = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Convert dataset into PyTorch DataLoader
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing the ``"input_ids"`` of each row as a tensor.

    ``dataset`` is any indexable, sized collection whose items support
    ``item["input_ids"]``.
    """

    def __init__(self, dataset):
        # Keep a reference only; rows are converted lazily in __getitem__.
        self.dataset = dataset

    def __len__(self):
        row_count = len(self.dataset)
        return row_count

    def __getitem__(self, idx):
        row = self.dataset[idx]
        token_ids = row["input_ids"]
        return torch.tensor(token_ids)

# Serve the tokenized rows in shuffled mini-batches of 16 sequences
train_dataloader = DataLoader(
    dataset=TextDataset(dataset),
    shuffle=True,
    batch_size=16,
)

print("Dataset ready!")


3️⃣ Define the Model (MiniGPT)
This section defines the Transformer model.

In [None]:
class TransformerBlock(nn.Module):
    """One post-norm Transformer block: causal self-attention followed by a feed-forward net.

    Args:
        embed_size: Width of the token representations.
        heads: Number of attention heads passed to ``nn.MultiheadAttention``.
        dropout: Dropout probability applied to the output of each sub-layer
            before it is added back to the residual stream.
        forward_expansion: The feed-forward hidden layer has
            ``forward_expansion * embed_size`` units.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        # batch_first=True: this notebook feeds (batch, seq, embed) tensors. With the
        # default (sequence-first) layout the attention would mix information across
        # the examples of a batch instead of across the tokens of one sequence.
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_size, num_heads=heads, batch_first=True
        )
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq, embed_size); the shape is preserved."""
        seq_len = x.size(1)
        # True above the diagonal = "may not attend": position i only sees positions <= i,
        # which is required for autoregressive (next-token) language modelling.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        # need_weights=False: only the attended values are used, not the attention map.
        attention = self.attention(x, x, x, attn_mask=causal_mask, need_weights=False)[0]
        x = self.norm1(x + self.dropout(attention))
        forward = self.feed_forward(x)
        x = self.norm2(x + self.dropout(forward))
        return x

class MiniGPT(nn.Module):
    """Small GPT-style language model: token embedding, Transformer blocks, vocabulary head.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        embed_size: Width of the token representations.
        num_layers: Number of stacked ``TransformerBlock`` modules.
        heads: Attention heads per block.
        dropout: Dropout probability forwarded to every block.
        forward_expansion: Feed-forward width multiplier forwarded to every block.
    """

    def __init__(self, vocab_size, embed_size=256, num_layers=4, heads=8, dropout=0.1, forward_expansion=4):
        super(MiniGPT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(embed_size, heads, dropout, forward_expansion) for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def _positional_encoding(self, seq_len, device, dtype):
        """Return fixed sinusoidal position encodings of shape (seq_len, embed_size)."""
        embed_size = self.embedding.embedding_dim
        positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)
        even_dims = torch.arange(0, embed_size, 2, device=device, dtype=torch.float32)
        inv_freq = 1.0 / (10000.0 ** (even_dims / embed_size))
        angles = positions * inv_freq
        encoding = torch.zeros(seq_len, embed_size, device=device, dtype=torch.float32)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd embed_size there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles)[:, : embed_size // 2]
        return encoding.to(dtype)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        x = self.embedding(x)
        # Attention itself is order-agnostic, so inject position information here.
        # The encoding is computed on the fly: it adds no parameters (saved
        # checkpoints keep the same keys) and imposes no maximum sequence length.
        x = x + self._positional_encoding(x.size(-2), x.device, x.dtype)
        for transformer in self.transformer_blocks:
            x = transformer(x)
        logits = self.fc_out(x)
        return logits


4️⃣ Train the Model
We now train the MiniGPT model and track the loss.

In [None]:
# Training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = MiniGPT(tokenizer.vocab_size).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-4)
# NOTE(review): padded positions are included in the loss; consider masking them
# if the reported loss should reflect real text only.
loss_fn = nn.CrossEntropyLoss()

epochs = 3
losses = []  # mean training loss per epoch, consumed by the plotting cell

print("Starting training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in loop:
        optimizer.zero_grad()
        inputs = batch.to(device)

        outputs = model(inputs)
        # Next-token objective: the logits at position t are scored against the token
        # at position t+1. Comparing them with the token at the same position would
        # only teach the model to copy its input.
        shifted_logits = outputs[:, :-1, :]
        targets = inputs[:, 1:]
        # reshape (not view): the slices above are not contiguous in memory.
        loss = loss_fn(
            shifted_logits.reshape(-1, shifted_logits.size(-1)),
            targets.reshape(-1),
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_dataloader)
    losses.append(avg_loss)
    print(f"Epoch {epoch+1}: Loss = {avg_loss}")

print("Training Complete!")


5️⃣ Plot Loss Curve
To visualize training performance:

In [None]:
# Draw the per-epoch mean loss on an explicit Axes object
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(losses, marker='o', linestyle='-', color='b', label="Training Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("MiniGPT Training Loss Curve")
ax.legend()
plt.show()


6️⃣ Save the Trained Model



In [None]:
# Persist only the learned parameters (the state dict), not the whole module object
torch.save(obj=model.state_dict(), f="minigpt.pth")
print("Model saved successfully!")


7️⃣ Generate Text Using the Trained Model

In [None]:
import torch.nn.functional as F

def generate_text(prompt, max_length=50):
    """Greedily generate text from the trained MiniGPT model.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        prompt: Text to continue; must tokenize to at least one token.
        max_length: Maximum number of new tokens to append to the prompt.

    Returns:
        The decoded prompt plus generated continuation.

    Raises:
        ValueError: If ``prompt`` produces no tokens.
    """
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    if input_ids.numel() == 0:
        # Without at least one token there is no "last position" to predict from.
        raise ValueError("prompt must contain at least one token")

    eos_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            # argmax of the raw logits equals argmax of their softmax, so the softmax
            # is skipped. keepdim=True yields (batch, 1), ready to concatenate.
            next_token = outputs[:, -1, :].argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
            # Stop once the model emits end-of-text instead of generating past it.
            if eos_token_id is not None and bool((next_token == eos_token_id).all()):
                break

    return tokenizer.decode(input_ids[0])

# Example usage
print(generate_text("The future of AI is"))
