In [2]:
!pip install datasets transformers psutil --quiet

# Imports & Setup

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import psutil, os

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"‚úÖ Using device: {device.upper()}")

# Print system resource info
ram_gb = round(psutil.virtual_memory().total / (1024**3), 2)
print(f"üíæ System RAM: {ram_gb} GB")
if device == "cuda":
    gpu_mem = round(torch.cuda.get_device_properties(0).total_memory / 1e9, 2)
    print(f"‚öôÔ∏è GPU Memory: {gpu_mem} GB ({torch.cuda.get_device_name(0)})")

torch.manual_seed(1337)


‚úÖ Using device: CUDA
üíæ System RAM: 31.35 GB
‚öôÔ∏è GPU Memory: 15.83 GB (Tesla T4)


<torch._C.Generator at 0x7c2049d33550>

# Hyperparameters

In [4]:
batch_size = 32
block_size = 128
max_iters = 50000      
eval_interval = 1000
learning_rate = 3e-4
eval_iters = 100
n_embd = 256
n_head = 8
n_layer = 6             
dropout = 0.1

print("üìò Training Configuration:")
print(f"batch_size={batch_size}, block_size={block_size}, n_layer={n_layer}, n_head={n_head}, n_embd={n_embd}")


üìò Training Configuration:
batch_size=32, block_size=128, n_layer=6, n_head=8, n_embd=256


# Load WikiText Dataset

In [5]:
print("üì• Loading WikiText-103 (50%)...")
ds = load_dataset("wikitext", "wikitext-103-raw-v1", split="train[:50%]")
print(f"‚úÖ Loaded {len(ds)} documents")

# Preview a few samples
for i in range(2):
    print(f"\nüìù Sample {i+1}:\n{ds[i]['text'][:300]}...")


üì• Loading WikiText-103 (50%)...


README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(‚Ä¶):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(‚Ä¶):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(‚Ä¶):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(‚Ä¶):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

‚úÖ Loaded 900675 documents

üìù Sample 1:
...

üìù Sample 2:
 = Valkyria Chronicles III = 
...


# Tokenizer and Encoding

In [6]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

print("üî§ Encoding dataset using GPT-2 BPE tokenizer...")
sample_text = "Transformers are powerful models for language understanding."
encoded = tokenizer.encode(sample_text)
decoded = tokenizer.decode(encoded)
print(f"üß© Sample text:\n{sample_text}")
print(f"‚û°Ô∏è Encoded tokens: {encoded}")
print(f"‚Ü©Ô∏è Decoded back: {decoded}")

# Tokenize entire dataset
tokens = []
for item in ds:
    if item.get("text"):
        tokens.extend(tokenizer.encode(item["text"]))
data = torch.tensor(tokens, dtype=torch.long)

print(f"\nüìà Total tokens: {len(data):,}")
vocab_size = tokenizer.vocab_size
print(f"üìö Vocabulary size: {vocab_size}")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

üî§ Encoding dataset using GPT-2 BPE tokenizer...
üß© Sample text:
Transformers are powerful models for language understanding.
‚û°Ô∏è Encoded tokens: [41762, 364, 389, 3665, 4981, 329, 3303, 4547, 13]
‚Ü©Ô∏è Decoded back: Transformers are powerful models for language understanding.


Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 1024). Running this sequence through the model will result in indexing errors



üìà Total tokens: 58,984,516
üìö Vocabulary size: 50257


# Data Split & Batch Sampling

In [7]:
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(f"üß† Train data tokens: {len(train_data):,}")
print(f"üß™ Validation tokens: {len(val_data):,}")

def get_batch(split):
    data_local = train_data if split == "train" else val_data
    ix = torch.randint(len(data_local) - block_size, (batch_size,))
    x = torch.stack([data_local[i:i+block_size] for i in ix])
    y = torch.stack([data_local[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

xb, yb = get_batch("train")
print(f"‚úÖ Batch X shape: {xb.shape}, Y shape: {yb.shape}")
print(f"üî¢ Example token IDs:\n{xb[0][:20].tolist()}")
print(f"üó£ Decoded snippet:\n{tokenizer.decode(xb[0][:50].tolist())}")


üß† Train data tokens: 53,086,064
üß™ Validation tokens: 5,898,452
‚úÖ Batch X shape: torch.Size([32, 128]), Y shape: torch.Size([32, 128])
üî¢ Example token IDs:
[262, 14555, 43469, 2168, 837, 2716, 416, 3966, 89, 5799, 739, 262, 3670, 1012, 1236, 324, 837, 373, 2716, 287]
üó£ Decoded snippet:
 the earliest anthology series , released by Ohzora under the title Clannad , was released in June 2004 under their Twin Heart Comics imprint . Volumes for this series continued to be released until April 2005 with the fifth volume . The second anthology was


# Transformer Components

In [8]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
    def forward(self, x): return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


# GPT Architecture

In [9]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


model = GPT().to(device)
param_count = sum(p.numel() for p in model.parameters())
param_size_gb = (param_count * 4) / (1024**3)  # assuming float32
print(f"üßÆ Model parameters: {param_count:,} ({param_count/1e6:.2f}M)")
print(f"üíæ Model parameter size ‚âà {param_size_gb:.3f} GB")


üßÆ Model parameters: 30,549,073 (30.55M)
üíæ Model parameter size ‚âà 0.114 GB


# Training

In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    model.eval()
    out = {}
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            _, loss = model(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

print("üöÄ Starting training...")
for it in range(max_iters):
    if it % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {it:5d}: train {losses['train']:.4f}, val {losses['val']:.4f}")
        torch.save(model.state_dict(), "mini_gpt.pt")

    xb, yb = get_batch("train")
    _, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


üöÄ Starting training...
step     0: train 11.0028, val 11.0028
step  1000: train 6.1067, val 6.1233
step  2000: train 5.6809, val 5.7316
step  3000: train 5.4047, val 5.4497
step  4000: train 5.2148, val 5.3042
step  5000: train 5.0828, val 5.1641
step  6000: train 4.9728, val 5.0710
step  7000: train 4.9058, val 4.9813
step  8000: train 4.8153, val 4.9380
step  9000: train 4.7651, val 4.8704
step 10000: train 4.7241, val 4.8243
step 11000: train 4.6535, val 4.7817
step 12000: train 4.6079, val 4.7101
step 13000: train 4.5650, val 4.6897
step 14000: train 4.5315, val 4.6706
step 15000: train 4.4776, val 4.6525
step 16000: train 4.4754, val 4.6061
step 17000: train 4.4306, val 4.5953
step 18000: train 4.4210, val 4.5778
step 19000: train 4.3762, val 4.5472
step 20000: train 4.3597, val 4.5232
step 21000: train 4.3347, val 4.5116
step 22000: train 4.3074, val 4.4819
step 23000: train 4.3079, val 4.4741
step 24000: train 4.2817, val 4.4695
step 25000: train 4.2805, val 4.4371
step 26000

In [15]:
prompt = (
    "Artificial intelligence is changing the world in profound ways. "
    "From self-driving cars to healthcare diagnostics, AI is enabling "
    "innovations that were once considered science fiction. Experts believe "
    "that the next decade will see AI integrated into nearly every aspect "
    "of daily life, transforming industries, education, and human interactions."
)

input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

out_idx = model.generate(input_ids, max_new_tokens=50, temperature=0.8)[0].tolist()
print("\n----- GENERATED TEXT -----\n")
print(tokenizer.decode(out_idx))



----- GENERATED TEXT -----

Artificial intelligence is changing the world in profound ways. From self-driving cars to healthcare diagnostics, AI is enabling innovations that were once considered science fiction. Experts believe that the next decade will see AI integrated into nearly every aspect of daily life, transforming industries, education, and human interactions. in relatively recent decades . The 1988 media article Telescope that the 2007 academic survey of the White Motorism Project was " pure " , that it reduces class the future of the American community is " so far more attitude than any state of the ongoing fighting class


In [17]:

torch.save(model.state_dict(), "mini_gpt.pt")

# Later, to load:
model = GPT().to(device)  
model.load_state_dict(torch.load("mini_gpt.pt", map_location=device))
model.eval()  


GPT(
  (token_embedding_table): Embedding(50257, 256)
  (position_embedding_table): Embedding(128, 256)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (key): Linear(in_features=256, out_features=32, bias=False)
            (query): Linear(in_features=256, out_features=32, bias=False)
            (value): Linear(in_features=256, out_features=32, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
   