In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Simple tokenizer and vocab
# Toy six-word vocabulary: ids follow the order the words are listed in.
vocab = dict(zip(("The", "cat", "sat", "on", "the", "mat"), range(6)))
# Reverse lookup (id -> word) for turning predictions back into text.
inv_vocab = {token_id: token for token, token_id in vocab.items()}

def simple_tokenizer(sentence):
    """Convert a whitespace-separated sentence into a list of ids from ``vocab``.

    Raises:
        KeyError: if a word of ``sentence`` is not a key of ``vocab``.
    """
    token_ids = []
    for word in sentence.split():
        token_ids.append(vocab[word])
    return token_ids

# Positional encoding
def positional_encoding(seq_len, dim):
    """Build a sinusoidal position table.

    Args:
        seq_len: number of positions (rows).
        dim: number of channels per position (columns).

    Returns:
        A ``(seq_len, dim)`` float32 tensor whose even columns hold sines and
        whose odd columns hold cosines of ``position * rate``.
    """
    positions = np.arange(seq_len).reshape(-1, 1)
    channels = np.arange(dim).reshape(1, -1)

    # Columns 2k and 2k+1 share one rate: 1 / 10000^(2k / dim).
    exponents = (2 * (channels // 2)) / np.float32(dim)
    table = positions * (1 / np.power(10000, exponents))

    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    table[:, even_cols] = np.sin(table[:, even_cols])
    table[:, odd_cols] = np.cos(table[:, odd_cols])

    return torch.tensor(table, dtype=torch.float32)

# Tiny GPT block
class TinyGPTBlock(nn.Module):
    """One pre-LayerNorm transformer decoder block with its own embedding and LM head.

    Pipeline: token embedding + fixed sinusoidal positions -> causally masked
    single-head self-attention (residual) -> feed-forward (residual) -> linear
    projection to vocabulary logits.

    Args:
        vocab_size: number of distinct token ids.
        embed_dim: width of the embeddings and of every hidden layer.
        max_len: number of rows in the positional table; inputs longer than
            this cannot be encoded. Defaults to 10.
    """

    def __init__(self, vocab_size, embed_dim, max_len=10):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # The table is fixed, so register it as a buffer instead of a frozen
        # Parameter: it still follows .to(device) and is stored in the state_dict
        # under the same "pos_embed" key, but is not handed to optimizers.
        self.register_buffer("pos_embed", positional_encoding(max_len, embed_dim))
        self.attn = nn.MultiheadAttention(embed_dim, num_heads=1, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, 2 * embed_dim),
            nn.ReLU(),
            nn.Linear(2 * embed_dim, embed_dim)
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.output = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for every position.

        Args:
            x: integer token ids of shape ``(batch, seq_len)``; ``seq_len`` must
                not exceed the positional table length.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        seq_len = x.size(1)
        x = self.embed(x) + self.pos_embed[:seq_len]

        # True above the diagonal = "may not attend": position i only sees
        # positions <= i, so no logit can depend on tokens that come after it.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )

        residual = x
        x = self.ln1(x)
        attn_out, _ = self.attn(x, x, x, attn_mask=causal_mask, need_weights=False)
        x = residual + attn_out

        residual2 = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = x + residual2

        logits = self.output(x)
        return logits

# Run the model
# The weights are freshly initialised (no training happens here), so the
# "prediction" is arbitrary; seeding only makes that arbitrary answer repeatable.
torch.manual_seed(0)

sentence = "The cat sat on the"
token_ids = simple_tokenizer(sentence)
input_tensor = torch.tensor([token_ids])  # batch of one: shape (1, seq_len)

model = TinyGPTBlock(vocab_size=len(vocab), embed_dim=16)
model.eval()

# Inference only: skip building the autograd graph.
with torch.no_grad():
    logits = model(input_tensor)

# Logits at the last position are the scores for the word that follows the prompt.
next_token_logits = logits[0, -1]
probs = F.softmax(next_token_logits, dim=-1)
predicted_id = torch.argmax(probs).item()
predicted_word = inv_vocab[predicted_id]

print("Next predicted word:", predicted_word)


Next predicted word: The


In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

input_text = "The cat sat on the"
inputs = tokenizer(input_text, return_tensors="pt")

# GPT-2 defines no pad token. Passing the EOS id as pad_token_id explicitly
# avoids the "Setting `pad_token_id` to `eos_token_id`" notice that generate()
# otherwise emits. do_sample=False spells out that decoding is greedy.
outputs = model.generate(
    **inputs,
    max_new_tokens=1,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(outputs[0]))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The cat sat on the floor


In [24]:
#TinyGPT training setup from scratch
sentences = [
    "The cat sat on the mat",
    "The dog slept on the bed",
    "The bird flew over the tree",
    "The fish swam in the pond",
    "The cow stood near the barn"
]

# Vocabulary = every distinct whitespace-separated token; ids are assigned in
# sorted order so the mapping is the same on every run.
all_words = {token for line in sentences for token in line.split()}
ordered_words = sorted(all_words)
word2idx = dict(zip(ordered_words, range(len(ordered_words))))
idx2word = dict(enumerate(ordered_words))

# Report what was built
print("Vocabulary size:", len(word2idx))
print("word2idx:", word2idx)

# Tokenizer
def tokenize(sentence):
    """Map each whitespace-separated word of ``sentence`` to its id in ``word2idx``.

    Raises:
        KeyError: if a word is not a key of ``word2idx``.
    """
    token_ids = []
    for word in sentence.split():
        token_ids.append(word2idx[word])
    return token_ids

# Detokenizer (for predictions)
def detokenize(indices):
    """Turn an iterable of token ids back into a space-separated string via ``idx2word``.

    Raises:
        KeyError: if an id is not a key of ``idx2word``.
    """
    words = []
    for token_id in indices:
        words.append(idx2word[token_id])
    return " ".join(words)

# Prepare input-output pairs: all words but the last are the input, the last word is the target
X_data, y_data = [], []

# One training example per sentence: token ids of the leading words, and the
# id of the final word as the label.
for sentence in sentences:
    tokens = sentence.split()
    context, target = tokens[:-1], tokens[-1]

    X_data.append(tokenize(" ".join(context)))
    y_data.append(word2idx[target])

# Sanity-check the first example in both id form and word form
first_input, first_target = X_data[0], y_data[0]
print("Example input (tokens):", first_input)
print("Target token ID:", first_target)
print("Input words:", detokenize(first_input))
print("Target word:", idx2word[first_target])


import torch
from torch.nn.utils.rnn import pad_sequence

# Add a PAD token to vocab.
# setdefault keeps this idempotent: if <PAD> is already registered (e.g. this
# code is executed again on the same dictionaries) its existing id is reused
# instead of minting a second id and leaving a stale idx2word entry behind.
PAD_TOKEN = "<PAD>"
pad_idx = word2idx.setdefault(PAD_TOKEN, len(word2idx))
idx2word[pad_idx] = PAD_TOKEN

# Stack the tokenized inputs into one (batch, longest_sequence) LongTensor;
# shorter sequences are filled with pad_idx.
input_sequences = [torch.tensor(x, dtype=torch.long) for x in X_data]
X_data_padded = pad_sequence(
    input_sequences,
    batch_first=True,
    padding_value=pad_idx
)

# One target id per example, as class indices for the loss.
y_data_tensor = torch.tensor(y_data, dtype=torch.long)

import torch.nn as nn

class TinyGPT(nn.Module):
    """Single-block, single-head GPT-style model that predicts one next token.

    Pipeline: token + learned position embeddings -> causally masked
    self-attention (pre-LayerNorm, residual) -> feed-forward (pre-LayerNorm,
    residual) -> vocabulary logits read at each sequence's last real token.

    Args:
        vocab_size: number of distinct token ids (including the pad id).
        embed_dim: width of embeddings and hidden layers.
        pad_idx: id of the padding token, or ``None`` if inputs are never padded.
        max_len: number of learned position slots; longer inputs cannot be
            embedded. Defaults to 20.
    """

    def __init__(self, vocab_size, embed_dim, pad_idx, max_len=20):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.pos_embedding = nn.Embedding(max_len, embed_dim)

        self.ln1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads=1, batch_first=True)
        self.ln2 = nn.LayerNorm(embed_dim)

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, 2 * embed_dim),
            nn.ReLU(),
            nn.Linear(2 * embed_dim, embed_dim)
        )

        self.output_layer = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for each sequence in the batch.

        Args:
            x: integer token ids of shape ``(batch, seq_len)``; shorter
                sequences are expected to be padded on the right with the pad id.

        Returns:
            Tensor of shape ``(batch, vocab_size)``.
        """
        token_ids = x
        batch_size, seq_len = token_ids.shape
        device = token_ids.device
        positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, seq_len)

        x = self.embedding(token_ids) + self.pos_embedding(positions)

        # True above the diagonal = "may not attend". Each position sees only
        # itself and earlier positions, so with right-padding the last real
        # token never attends to the pad slots that follow it.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=device),
            diagonal=1,
        )

        residual = x
        x = self.ln1(x)
        x, _ = self.attn(x, x, x, attn_mask=causal_mask, need_weights=False)
        x = x + residual

        residual2 = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = x + residual2

        # Read the logits at each row's last non-pad position. Blindly taking
        # x[:, -1] would read a <PAD> slot for every sequence shorter than the
        # batch maximum. nn.Embedding stores the normalised (non-negative) pad id.
        pad_id = self.embedding.padding_idx
        if pad_id is None:
            last_pos = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=device)
        else:
            is_real = token_ids != pad_id
            # Highest index holding a real token; an all-pad row falls back to 0.
            last_pos = (is_real * torch.arange(seq_len, device=device)).max(dim=1).values

        rows = torch.arange(batch_size, device=device)
        return self.output_layer(x[rows, last_pos])


# Parameters
vocab_size = len(word2idx)
embed_dim = 32
epochs = 200
learning_rate = 0.01

# Seed before the model is built so weight initialisation, and therefore the
# printed loss values, are the same on every run.
torch.manual_seed(0)

# Model, loss, optimizer
model = TinyGPT(vocab_size, embed_dim, pad_idx)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: full-batch gradient descent, one optimizer step per epoch.
# Nothing inside the loop changes the mode, so train() is set once up front.
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()

    outputs = model(X_data_padded)            # shape: (batch_size, vocab_size)
    loss = loss_fn(outputs, y_data_tensor)    # compare predicted token vs actual next token
    loss.backward()
    optimizer.step()

    # Log the first epoch and then every 20th
    if (epoch + 1) % 20 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

print("Training done")

# Inference helper shared by every prompt below
def predict_next_word(model, prompt):
    """Return the word ``model`` scores highest as the continuation of ``prompt``.

    The prompt is tokenized with ``tokenize`` and fed as a batch of one at its
    natural length. It is deliberately not padded: a single sequence needs no
    padding, and trailing <PAD> tokens would only risk the prediction being
    read from a padding slot rather than from the last real word.

    Raises:
        KeyError: if ``prompt`` contains a word that ``tokenize`` cannot map.
    """
    prompt_ids = torch.tensor([tokenize(prompt)])  # shape (1, seq_len)
    with torch.no_grad():
        logits = model(prompt_ids)
    predicted_token_id = torch.argmax(logits, dim=-1).item()
    return idx2word[predicted_token_id]


# Set model to evaluation mode
model.eval()

# Training sequence length. Prompts are no longer padded to it; the name is
# left defined in case anything else still refers to it.
max_len = X_data_padded.shape[1]

# Test input
test_prompt = "The cat sat on the"
predicted_word = predict_next_word(model, test_prompt)

print(f"Input: {test_prompt}")
print(f"Predicted next word: {predicted_word} \n")


test_sentences = [
    "The dog slept on the",   # → bed
    "The bird flew over the", # → tree
    "The fish swam in the",   # → pond
    "The cow stood near the"  # → barn
]

for prompt in test_sentences:
    predicted_word = predict_next_word(model, prompt)

    print(f"Input: {prompt}")
    print(f"Predicted next word: {predicted_word}\n")


import pickle

# Persist the trained weights (the state_dict only, not the module object)
trained_weights = model.state_dict()
torch.save(trained_weights, "tinygpt_model.pth")

# Persist both lookup tables together as one pickled (word2idx, idx2word) tuple
vocab_tables = (word2idx, idx2word)
with open("tinygpt_vocab.pkl", "wb") as vocab_file:
    pickle.dump(vocab_tables, vocab_file)

print("✅ Model and tokenizer saved successfully.")


Vocabulary size: 21
word2idx: {'The': 0, 'barn': 1, 'bed': 2, 'bird': 3, 'cat': 4, 'cow': 5, 'dog': 6, 'fish': 7, 'flew': 8, 'in': 9, 'mat': 10, 'near': 11, 'on': 12, 'over': 13, 'pond': 14, 'sat': 15, 'slept': 16, 'stood': 17, 'swam': 18, 'the': 19, 'tree': 20}
Example input (tokens): [0, 4, 15, 12, 19]
Target token ID: 10
Input words: The cat sat on the
Target word: mat
Epoch 1/200, Loss: 3.0697
Epoch 20/200, Loss: 0.0000
Epoch 40/200, Loss: 0.0000
Epoch 60/200, Loss: 0.0000
Epoch 80/200, Loss: 0.0000
Epoch 100/200, Loss: 0.0000
Epoch 120/200, Loss: 0.0000
Epoch 140/200, Loss: 0.0000
Epoch 160/200, Loss: 0.0000
Epoch 180/200, Loss: 0.0000
Epoch 200/200, Loss: 0.0000
Training done
Input: The cat sat on the
Predicted next word: mat
Input: The dog slept on the
Predicted next word: bed

Input: The bird flew over the
Predicted next word: tree

Input: The fish swam in the
Predicted next word: pond

Input: The cow stood near the
Predicted next word: barn

✅ Model and tokenizer saved success