# **`In The name of God!`**

Link to the Kaggle dataset:

[Large Corpus of Persian Poems](https://www.kaggle.com/datasets/aminghd/large-corpus-of-farsi-poems)

while there are lots of Persian poems in this data source, we are going to just use Khayyam Poems for training our GPT model.

In [1]:
%cp /content/drive/MyDrive/GenAI/NanoGPT_Persian/khayyam_norm.txt .

In [2]:
# installing required packages and libraries
%%capture
!pip install tiktoken

In [None]:
import os
import torch
import math
import inspect
import requests
import tiktoken
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [3]:
input_file_path = '/content/khayyam_norm.txt'

with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("gpt2")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')

train has 39,069 tokens
val has 4,374 tokens


In [4]:
def decode_sample(ids, sample_size):
    sample_ids = ids[:sample_size]
    sample_text = enc.decode(sample_ids.tolist())
    return sample_text

print("Training Data Sample:")
print(decode_sample(train_ids, sample_size=100))

print("\nValidation Data Sample:")
print(decode_sample(val_ids, sample_size=100))

Training Data Sample:

  	
برخیز و بیا بتا برای دل ما
حل کن به جمال خویشتن مشکل ما
یک کوزه شراب تا بهم نوش کن�

Validation Data Sample:
 را
حالی خوش کن تو این دل سودا را
می نوش به ماهتاب ای ماه که ماه
بسیار بگردد و نیابد ما را
ای


In [5]:
class GPTDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx + self.block_size], dtype=torch.long)
        y = torch.tensor(self.data[idx + 1:idx + 1 + self.block_size], dtype=torch.long)
        return x, y

# Filtering function
def filter_indices(data, max_index):
    return [idx for idx in data if idx < max_index]

class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight  # Weight tying

        # Init all weights
        self.apply(self._init_weights)
        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

        # Report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params() / 1e6,))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)

        # Forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # If we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

In [6]:
import math
import torch
from dataclasses import dataclass

@dataclass
class GPTConfig:
    block_size: int
    vocab_size: int
    n_layer: int
    n_head: int
    n_embd: int  # n_embd must be divisible by n_head
    dropout: float
    bias: bool

config = GPTConfig(
    block_size=8,
    vocab_size= max(max(train_ids), max(val_ids)) + 1,
    n_layer=4,
    n_head=4,
    n_embd=256,  # Ensure n_embd is divisible by n_head
    dropout=0.1,
    bias=True
)

model = GPT(config)
print(model)

number of parameters: 15.20M
GPT(
  (transformer): ModuleDict(
    (wte): Embedding(47049, 256)
    (wpe): Embedding(8, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=True)
          (c_proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=256, out_features=1024, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=256, out_features=47049, bias=False)
)


In [7]:
train_dataset = GPTDataset(train_ids, config.block_size)
val_dataset = GPTDataset(val_ids, config.block_size)
gradient_accumulation_steps = 5 * 8
batch_size = 12

In [8]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
optimizer = model.configure_optimizers(weight_decay=0.1,
                                       learning_rate=3e-4,
                                       betas=(0.9, 0.95),
                                       device_type=device)

num decayed parameter tensors: 18, with 15,192,320 parameters
num non-decayed parameter tensors: 34, with 13,824 parameters
using fused AdamW: True


In [11]:
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5
learning_rate = 3e-4

def get_lr(it):
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

def print_sample_as_text(x, y, enc, train_val):
    random_idx = random.randint(0, x.size(0) - 1)
    x_text = decode_data(x[random_idx], enc)
    y_text = decode_data(y[random_idx], enc)
    print("Random Sample as text from  ", train_val)
    print(f"x: {x_text}")
    print(f"y: {y_text}")
    print("-" * 50)

def decode_data(ids, enc):
    sample_ids = ids.tolist()
    sample_text = enc.decode(sample_ids)
    return sample_text

In [22]:
import random
def train(model, train_loader, val_loader, optimizer, config, epochs=1, gradient_accumulation_steps=1):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

    model.train()
    iter_num = 0
    best_val_loss = float('inf')
    model.train()
    iter_num = 0
    best_val_loss = float('inf')

    for epoch in range(epochs):
        running_loss = 0.0
        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            lr = get_lr(iter_num)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            logits, loss = model(x, y)
            loss = loss / gradient_accumulation_steps
            loss.backward()

            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                iter_num += 1

                running_loss += loss.item()
                if iter_num % 5 == 0:
                    avg_loss = running_loss / 5
                    print(f'Epoch {epoch+1}/{epochs}, Iteration {iter_num}, Avg Loss: {avg_loss:.4f}')

                    print_sample_as_text(x, y, enc,"Training")

                    running_loss = 0.0

        # Validation
        val_loss = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss:.4f}')

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'Saving model with validation loss: {val_loss:.4f}')

def evaluate(model, val_loader, device): # Add device as parameter
    model.eval()
    val_loss = 0
    random_sample_printed = False  # Flag to ensure only one sample is printed
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device) # Move validation data to the same device
            logits, loss = model(x, y)
            val_loss += loss.item()
            # Print one random sample from the validation set
            if not random_sample_printed:
                print_sample_as_text(x, y, enc,"Validation")
                random_sample_printed = True
    val_loss /= len(val_loader)
    model.train()
    return val_loss

In [23]:
train(model,
      train_loader,
      val_loader,
      optimizer,
      config,
      epochs=100,
      gradient_accumulation_steps=gradient_accumulation_steps)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 47/100, Iteration 3770, Avg Loss: 0.0268
Random Sample as text from   Training
x: سمان قر�
y: مان قری
--------------------------------------------------
Epoch 47/100, Iteration 3775, Avg Loss: 0.0272
Random Sample as text from   Training
x:  نشسته �
y: �شسته ب
--------------------------------------------------
Epoch 47/100, Iteration 3780, Avg Loss: 0.0277
Random Sample as text from   Training
x: کی شد

y: �ی شد
ا
--------------------------------------------------
Epoch 47/100, Iteration 3785, Avg Loss: 0.0263
Random Sample as text from   Training
x: ه خر و ک
y:  خر و کو
--------------------------------------------------
Epoch 47/100, Iteration 3790, Avg Loss: 0.0269
Random Sample as text from   Training
x: ن زیر �
y:  زیر گ
--------------------------------------------------
Epoch 47/100, Iteration 3795, Avg Loss: 0.0267
Random Sample as text from   Training
x: �ی نف�
y: ی نفز
---------------------------------------