# Simple Language Model (Predict next char)
## 1. Create Model

In [1]:
import torch
import torch.nn as nn
from transformer import TransformerEncoder

In [4]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, max_seq_len, droput=0.1):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, d_model, n_heads, d_ff, n_layers, max_seq_len, droput)
        self.output_proj = nn.Linear(d_model, vocab_size)
    
    def forward(self, x):
        # x: (batch, seq_len) - token IDs
        x = self.encoder(x) # (batch, seq_len, d_model)
        logits = self.output_proj(x) # (batch, seq_len, vocab_size)
        return logits
        

## 2. Prepare simple dataset

In [26]:
text = """To be or not to be, that is the question.
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles."""

# Create vocabulary (character-level)
chars = sorted(list(set(text)))
vocab_size = len(chars)
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}

print(f"Vocab size: {vocab_size}")
print(f"Characters: {chars}")

# Encode text
encoded = torch.tensor([char_to_idx[ch] for ch in text])
print(f"Text length: {len(encoded)}")

Vocab size: 27
Characters: ['\n', ' ', "'", ',', '.', 'O', 'T', 'W', 'a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'q', 'r', 's', 't', 'u', 'w']
Text length: 171


## 3. Create Dataset & DataLoader

In [82]:
from torch.utils.data import Dataset, DataLoader
from typing import Any

class CharDataset(Dataset):
    def __init__(self, encoded_text, seq_len) -> None:
        super().__init__()
        self.data = encoded_text
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.data) - self.seq_len
    
    def __getitem__(self, idx) -> Any:
        # Input: seq_len tokens
        # Output: next seq_len tokens (shifted by 1)
        
        x = torch.tensor(self.data[idx : idx + self.seq_len], dtype=torch.long)
        y = torch.tensor(self.data[idx + 1: idx + self.seq_len + 1], dtype=torch.long)
        return x, y

# Create dataset
seq_len = 2
batch_size = 2
dataset = CharDataset(encoded_text=encoded, seq_len=seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Check one batch
x, y = next(iter(dataloader))
x2, y2 = next(iter(dataloader))
print(f"Input Shape: {x.shape}")
print(f"Output Shape: {y.shape}")
print(f"x1, y1: {x}\n{y}")
print(f"x2, y2: {x2}\n{y2}")

Input Shape: torch.Size([2, 2])
Output Shape: torch.Size([2, 2])
x1, y1: tensor([[20, 25],
        [24, 20]])
tensor([[25, 24],
        [20,  1]])
x2, y2: tensor([[ 8, 24],
        [ 1, 24]])
tensor([[24,  1],
        [24,  8]])


  x = torch.tensor(self.data[idx : idx + self.seq_len], dtype=torch.long)
  y = torch.tensor(self.data[idx + 1: idx + self.seq_len + 1], dtype=torch.long)


In [None]:
from torch.optim import Adam

vocab_size = len(chars)
d_model = 128
n_heads = 4
d_ff = 512
n_layers = 4
max_seq_len = 128
learning_rate = 3e-4
n_epochs = 50

# Create model
device = "cuda"


False