#Importing required libraries.

In [None]:
%%capture
!pip install tiktoken

In [None]:
import math
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import time
import pickle
from contextlib import nullcontext
import numpy as np
import time
import tiktoken
import pandas as pd
import json
import tarfile
import lzma

#The Core of GPT-2 🤖

#####Layer Norm (Normalization Layer)

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization with a learnable scale and an optional bias.

    Args:
        ndim: Size of the trailing feature dimension that is normalized.
        bias: When truthy, a learnable zero-initialized bias of size ``ndim``
            is created; otherwise ``self.bias`` is ``None``.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalize ``input`` over its trailing ``ndim`` features (eps=1e-5)."""
        return F.layer_norm(
            input,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )

#####Causal Self Attention

In [None]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    ``config`` must provide ``n_embd``, ``n_head``, ``bias``, ``drop_rate``
    and ``block_size``. When ``torch.nn.functional`` exposes
    ``scaled_dot_product_attention`` it is used; otherwise attention is
    computed manually with a causal mask stored in the ``bias`` buffer.

    Raises:
        AssertionError: if ``n_embd`` is not divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        # A comma (not a semicolon) is required for the message to be attached
        # to the assertion; with ';' the string was a separate no-op statement.
        assert config.n_embd % config.n_head == 0, "n_embd % n_head should be 0."

        # One linear layer produces query, key and value for all heads at once.
        self.mh_atten_ln = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)

        # Output projection applied to the re-assembled heads.
        self.proj_ln = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        self.atten_drop = nn.Dropout(config.drop_rate)
        self.res_drop = nn.Dropout(config.drop_rate)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.drop_rate = config.drop_rate
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: scaled_dot_product_attention is unavailable "
                  "(requires PyTorch >= 2.0); using the slower manual attention.")
            # Lower-triangular mask, only needed by the manual attention path.
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of size (B, T, C) and return (B, T, C)."""
        B, T, C = x.size()
        q, k, v = self.mh_atten_ln(x).split(self.n_embd, dim=2)
        # Reshape each projection to (B, n_head, T, head_dim).
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            # Attention dropout is only applied while the module is in training mode.
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.drop_rate if self.training else 0,
                is_causal=True,
            )
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            # Positions above the diagonal are set to -inf so softmax gives them zero weight.
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.atten_drop(att)
            y = att @ v
        # Put the heads back side by side: (B, n_head, T, head_dim) -> (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        y = self.res_drop(self.proj_ln(y))
        return y


#####MLP(Multi-Layer Perceptron)

In [None]:
class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4 * n_embd, GELU, project back, dropout.

    ``config`` must provide ``n_embd``, ``bias`` and ``drop_rate``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden_width = 4 * width
        use_bias = config.bias
        self.fc_ln = nn.Linear(width, hidden_width, bias=use_bias)
        self.gelu = nn.GELU()
        self.proj_ln = nn.Linear(hidden_width, width, bias=use_bias)
        self.dropout = nn.Dropout(config.drop_rate)

    def forward(self, x):
        """Return the feed-forward transform of ``x`` (same trailing size as the input)."""
        expanded = self.fc_ln(x)
        activated = self.gelu(expanded)
        projected = self.proj_ln(activated)
        return self.dropout(projected)

#####Block(communication + computation)

In [None]:
class Block(nn.Module):
    """One transformer layer: attention then MLP, each behind a LayerNorm and a residual add."""

    def __init__(self, config):
        super().__init__()
        width, use_bias = config.n_embd, config.bias
        self.ln_1 = LayerNorm(width, bias=use_bias)
        self.attn_net = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(width, bias=use_bias)
        self.ff_mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual branches."""
        # Normalization is applied to the branch input, not to the residual stream.
        attended = self.attn_net(self.ln_1(x))
        x = x + attended
        fed_forward = self.ff_mlp(self.ln_2(x))
        return x + fed_forward

#####Body of GPT

In [None]:
class GPT(nn.Module):
    """Decoder-only transformer language model in the style of GPT-2.

    ``config`` must provide ``vocab_size``, ``block_size``, ``n_embd``,
    ``n_layer``, ``drop_rate`` and ``bias`` (plus whatever ``Block`` reads).
    The token embedding matrix and the output head share one weight tensor.
    """

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None, "vocab_size is required !!"
        assert config.block_size is not None, "block_size is required !!"

        self.config = config

        self.transformer = nn.ModuleDict(dict(
            tok_embedding = nn.Embedding(config.vocab_size, config.n_embd),
            pos_embedding = nn.Embedding(config.block_size, config.n_embd),
            dropout = nn.Dropout(config.drop_rate),
            MHS_Attn_Block = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the token embedding reuses the output head's weight tensor.
        self.transformer.tok_embedding.weight = self.lm_head.weight
        self.apply(self._init_weights)

        # Every parameter whose name ends in 'proj_ln.weight' is re-initialized
        # with a std scaled down by sqrt(2 * n_layer).
        for param_name, param in self.named_parameters():
            if param_name.endswith('proj_ln.weight'):
                torch.nn.init.normal_(param, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """Return the number of parameters in the model.

        When ``non_embedding`` is true the position-embedding parameters are
        subtracted from the total.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.pos_embedding.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token indices ``idx`` of size (B, T).

        Returns:
            ``(logits, loss)``. With ``targets`` the logits cover every
            position and ``loss`` is the cross-entropy (target value -1 is
            ignored). Without ``targets`` only the last position's logits are
            computed and ``loss`` is ``None``.

        Raises:
            AssertionError: if ``T`` exceeds ``config.block_size``.
        """
        device = idx.device
        B, T = idx.size()

        assert T <= self.config.block_size, f"Max sequence length is {self.config.block_size}"

        pos = torch.arange(0, T, dtype=torch.long, device=device)
        tok_emb = self.transformer.tok_embedding(idx)
        pos_emb = self.transformer.pos_embedding(pos)
        x = self.transformer.dropout(tok_emb + pos_emb)
        for block in self.transformer.MHS_Attn_Block:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Indexing with the list [-1] keeps the time dimension (size 1).
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        """Shrink the model's maximum context length to ``block_size`` in place.

        Truncates the position-embedding table and, for attention modules that
        hold a causal-mask ``bias`` buffer, the mask as well.

        Raises:
            AssertionError: if ``block_size`` is larger than the current one.
        """
        assert block_size <= self.config.block_size, "new block_size should be <= old block_size"
        self.config.block_size = block_size

        self.transformer.pos_embedding.weight = nn.Parameter(self.transformer.pos_embedding.weight[:block_size])

        for block in self.transformer.MHS_Attn_Block:
            # Block stores its attention module as `attn_net`; the previous
            # `block.attn` lookup raised AttributeError on every call.
            attn = block.attn_net
            if hasattr(attn, 'bias'):
                attn.bias = attn.bias[:, :, :block_size, :block_size]

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Build an AdamW optimizer with weight decay applied only to >=2-D parameters.

        Args:
            weight_decay: Decay applied to the parameters with ``dim() >= 2``.
            learning_rate: Initial learning rate passed to AdamW.
            betas: AdamW ``betas`` pair.
            device_type: The fused AdamW variant is requested only when this
                equals ``'cuda'`` and ``torch.optim.AdamW`` accepts ``fused``.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        trainable = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for p in trainable.values() if p.dim() >= 2]
        nodecay_params = [p for p in trainable.values() if p.dim() < 2]

        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)

        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Args:
            idx: Token indices of size (B, T) used as the prompt.
            max_new_tokens: Number of tokens to sample.
            temperature: Divisor applied to the logits before softmax.
            top_k: If given, logits below the k-th largest are set to -inf.

        Returns:
            ``idx`` extended along dim 1 by ``max_new_tokens`` sampled tokens.

        Note:
            This method does not switch the module to eval mode itself.
        """
        for _ in range(max_new_tokens):
            # Only the most recent block_size tokens are fed to the model.
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

#Initializing the Model and Training It

#####Model Configuration and Hyperparameters

In [None]:
# ---- evaluation, logging and checkpoint settings ----
checkpts_dir = '/content/drive/MyDrive/GPT_FROM_SCRATCH/check_points'
eval_interval = 2_000      # evaluation runs when iter_num is a multiple of this
log_interval = 1           # training-loss log line every this many iterations
eval_iters = 200           # batches averaged per split in estimate_loss
save_checkpoint = True     # forwarded to evaluate_model
init_from = "scratch"

# ---- data ----
db_name = 'openwebtext'
num_steps = 200            # micro-steps accumulated before each optimizer step
batch_size = 12
block_size = 1_024

# ---- model ----
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0
bias = False

# ---- AdamW optimizer ----
learning_rate = 0.0006
max_iters = 600_000
weight_decay = 0.1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0            # the training loop skips clipping when this is 0.0

# ---- learning-rate schedule (consumed by get_lr) ----
decay_lr = True
warmup_iters = 2_000
lr_decay_iters = 600_000
min_lr = 0.00006

# ---- system ----
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'

compile = True

{'data_path': '/content/drive/MyDrive/GPT_FROM_SCRATCH/Done/file_00/file_00.txt', 'idx': 999, 'src_path': '/content/drive/MyDrive/GPT_FROM_SCRATCH/subsets/urlsf_subset00.tar', 'dist_path': '/content/drive/MyDrive/GPT_FROM_SCRATCH/Done/file_00', 'in_dir': '/content/drive/MyDrive/GPT_FROM_SCRATCH/Done/file_00/openwebtext'}


#####Data Loader


In [None]:
# Folder holding the tokenized dataset selected by db_name.
data_dir = os.path.join('/content/drive/MyDrive/GPT_FROM_SCRATCH', db_name)

# Both splits are opened as read-only uint16 memory maps.
train_data = np.memmap(
    filename=os.path.join(data_dir, 'train.bin'),
    dtype=np.uint16,
    mode='r',
)
val_data = np.memmap(
    filename=os.path.join(data_dir, 'val.bin'),
    dtype=np.uint16,
    mode='r',
)


def get_batch(split, seq_len=None, n_seqs=None):
    """Sample a random batch of input/target token windows from one split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.
        seq_len: Tokens per sequence. Defaults to the module-level
            ``block_size``. Accepted positionally so that callers written as
            ``get_batch(split, block_size, batch_size)`` work.
        n_seqs: Sequences per batch. Defaults to the module-level
            ``batch_size``.

    Returns:
        ``(x, y)`` int64 tensors of size (n_seqs, seq_len) on ``device``;
        ``y`` is ``x`` shifted one token to the right in the data.

    Raises:
        ValueError: if the split holds too few tokens for one window.
    """
    if seq_len is None:
        seq_len = block_size
    if n_seqs is None:
        n_seqs = batch_size

    data = train_data if split == 'train' else val_data
    max_start = len(data) - seq_len
    if max_start <= 0:
        raise ValueError(
            f"split {split!r} has {len(data)} tokens; need more than {seq_len}"
        )

    starts = torch.randint(max_start, (n_seqs,)).tolist()
    x = torch.stack([torch.from_numpy(data[i:i + seq_len].astype(np.int64)) for i in starts])
    y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + seq_len].astype(np.int64)) for i in starts])
    if device == 'cuda':
        # Pinned host memory lets the copy to the GPU be issued as non-blocking.
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

#####Initialization

> GPT-2 Configurations

In [None]:
@dataclass
class GPTConfig:
    """Model hyperparameters for the GPT classes in this file.

    The model code reads the dropout probability as ``config.drop_rate``, so
    that name is exposed as a read/write alias of the ``dropout`` field.
    """

    block_size: int = 1024   # maximum sequence length
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True

    @property
    def drop_rate(self) -> float:
        """Alias of ``dropout`` under the name the model classes read."""
        return self.dropout

    @drop_rate.setter
    def drop_rate(self, value: float) -> None:
        self.dropout = value

In [None]:
# Fixed seed, and TF32 enabled for both cuDNN and CUDA matmuls.
torch.manual_seed(2000)
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Translate the dtype name into the torch dtype used for autocast.
ptdtype = {
    'float32': torch.float32,
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
}[dtype]

# No autocast on CPU; mixed-precision autocast on any other device.
if device == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device, dtype=ptdtype)


# Note: the branches below handle init_from == 'scratch' and init_from == 'resume'.
init_from = 'resume'
iter_num = 0
best_val_loss = 1e9

# Model arguments; vocab_size is filled in by whichever init branch runs.
model_args = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'block_size': block_size,
    'bias': bias,
    'vocab_size': None,
    'drop_rate': dropout,
}


# Building Model
print(f"Initializing The Model from : {init_from}")

if init_from == 'scratch':
    # Fresh model with the vocabulary size fixed here.
    model_args['vocab_size'] =  50304
    gptconf = Config(**model_args)
    model = GPT(gptconf)

elif init_from == 'resume':
    print("Loading checkpoints ...")
    checkpoint = None
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # The checkpoint at save_ckpt_path is preferred; load_ckpt_path is the fallback.
    if os.path.exists(save_ckpt_path):
        checkpoint = torch.load(save_ckpt_path, map_location=device)
        print("==> loading from new checkpoints ")
    else:
        checkpoint = torch.load(load_ckpt_path, map_location=device)
        print("==> loading from old checkpoints ")

    # Architecture settings must match the checkpoint, so they override the
    # values set above (drop_rate is deliberately not overridden).
    checkpoint_model_args = checkpoint['model_args']
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf = Config(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']

    unwanted_prefix = '_orig_mod.' # this prefix was added while saving check points
    # Iterate over a snapshot because keys are renamed in place.
    for key, value in list(state_dict.items()):
        if key.startswith(unwanted_prefix):
            state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
    print("checkpoints loaded successfully :)")
else:
    # Fail here with a clear message; previously execution continued and
    # crashed later with a NameError because `model` was never created.
    raise ValueError(
        f"Unknown init_from={init_from!r}; expected 'scratch' or 'resume'."
    )


# Crop block size of the original model
# Shrink the model's context length when the configured block_size is smaller.
if model.config.block_size > block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size
model.to(device)

# Gradient scaler; it is only enabled when dtype is 'float16'.
scaler = torch.cuda.amp.GradScaler(enabled=dtype == 'float16')

print(f"scaler : {scaler}")

optimizer = model.configure_optimizers(
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    betas=(beta1, beta2),
    device_type=device,
)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
# Drop the reference to the loaded checkpoint now that it has been consumed.
checkpoint = None

if compile:
    print("Compiling the model... ")
    model = torch.compile(model)  # requires PyTorch 2.0
    print("Model compiled successfully :)")

Initializing The Model from : resume
number of parameters: 123.59M
scaler : <torch.cuda.amp.grad_scaler.GradScaler object at 0x7d7847f67d60>
num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 25, with 19,200 parameters
using fused AdamW: True
Compiling the model... (Estimated duration: ~= 1min)


#####Utils

In [None]:
# Estimation of Loss
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` random batches for each split.

    The module-level ``model`` is put in eval mode for the measurement and
    switched back to train mode before returning.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to 0-dim tensors.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split, block_size, batch_size)
            with ctx:
                _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages


# Learning Rate Update
def get_lr(it):
    """Return the learning rate for iteration ``it``.

    Linear warmup for ``it < warmup_iters``, ``min_lr`` once
    ``it > lr_decay_iters``, and cosine decay from ``learning_rate`` down to
    ``min_lr`` in between.
    """
    # Warmup phase: ramp linearly from 0 up to learning_rate.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # Past the decay horizon: hold at the floor.
    if it > lr_decay_iters:
        return min_lr
    # Cosine decay: `progress` goes 0 -> 1, `cosine` goes 1 -> 0.
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + cosine * (learning_rate - min_lr)


# Save Checkpoints
def save_checkpts(model, optimizer, ckpt_path, model_args, iter_num, best_val_loss):
    """Write a training checkpoint to ``ckpt_path`` with ``torch.save``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``'model'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under ``'optimizer'``.
        ckpt_path: Destination file path.
        model_args: Stored as-is under ``'model_args'``.
        iter_num: Stored as-is under ``'iter_num'``.
        best_val_loss: Stored as-is under ``'best_val_loss'``.
    """
    print("Saving checkpoints ...")
    checkpoint = {
        # Use the model that was passed in rather than a module-level global.
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'model_args': model_args,
        'iter_num': iter_num,
        'best_val_loss': best_val_loss,
    }
    torch.save(checkpoint, ckpt_path)
    # Report success only once the file has actually been written.
    print("Checkpoints Saved Successfully :)")


# Evaluate Model
def evaluate_model(iter_num, model, optimizer,model_args, eval_interval, best_val_loss, save_ckpt_path, save_checkpoint):
    """Evaluate every ``eval_interval`` iterations and checkpoint when warranted.

    On iterations that are not a multiple of ``eval_interval`` nothing
    happens. Otherwise the train/val losses are estimated and printed, and a
    checkpoint is written when the validation loss improved or
    ``save_checkpoint`` is true (never at iteration 0).

    Args:
        iter_num: Current iteration number.
        model: Model handed to ``save_checkpts``.
        optimizer: Optimizer handed to ``save_checkpts``.
        model_args: Model arguments stored in the checkpoint.
        eval_interval: Evaluation period in iterations.
        best_val_loss: Lowest validation loss seen so far.
        save_ckpt_path: Checkpoint destination path.
        save_checkpoint: When true, checkpoint on every evaluation even if
            the validation loss did not improve.

    Returns:
        ``(iter_num, best_val_loss)`` with ``best_val_loss`` updated only when
        the new validation loss is lower.
    """
    if iter_num % eval_interval != 0:
        return iter_num, best_val_loss

    losses = estimate_loss()
    print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    val_loss = losses['val']
    improved = val_loss < best_val_loss
    if improved:
        # Only a genuine improvement may replace the best value; previously a
        # forced save also overwrote it with a worse loss.
        best_val_loss = val_loss

    if (improved or save_checkpoint) and iter_num > 0:
        save_checkpts(
            model=model,
            optimizer=optimizer,
            ckpt_path=save_ckpt_path,
            model_args=model_args,
            iter_num=iter_num,
            best_val_loss=best_val_loss,
        )

    return iter_num, best_val_loss

#####Training Loop

In [None]:
# Training loop: gradient accumulation over num_steps micro-batches per
# optimizer step, with periodic evaluation, logging and checkpointing.
# NOTE(review): get_batch is called with three positional arguments in this
# cell; confirm the definition in scope accepts them.
X, Y = get_batch('train', block_size, batch_size)  # first batch, fetched before the loop
t0 = time.time()
local_iter_num = 0   # iterations run in this session; only incremented in this cell
raw_model =  model   # alias of whatever `model` is bound to at this point
running_mfu = -1.0   # not read anywhere in this cell

while True:
    # Set this iteration's learning rate on every parameter group.
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Evaluation (train / val); evaluate_model decides internally whether
    # this iteration is an evaluation step and whether to checkpoint.
    iter_num, best_val_loss = evaluate_model(iter_num,
                                             raw_model,
                                             optimizer,
                                             model_args,
                                             eval_interval,
                                             best_val_loss,
                                             save_ckpt_path,
                                             save_checkpoint)

    # Forward/backward over num_steps micro-batches; gradients accumulate
    # because zero_grad is only called after the optimizer step.
    for micro_step in range(num_steps):
        with ctx:
            _ , loss = raw_model(X, Y)
            # Scale so the accumulated gradient is an average over micro-steps.
            loss = loss / num_steps

        # Fetch the next batch before running backward on the current loss.
        X, Y = get_batch('train', block_size, batch_size)
        scaler.scale(loss).backward()

    # Clip the gradient; gradients are unscaled first so the threshold
    # applies to their true magnitude.
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(raw_model.parameters(), grad_clip)

    # Step the optimizer through the scaler, then update the scaler.
    scaler.step(optimizer)
    scaler.update()

    # Free gradient memory before the next accumulation round.
    optimizer.zero_grad(set_to_none=True)

    # Timing and logging.
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # `loss` is the last micro-step's loss; multiplying by num_steps
        # undoes the accumulation scaling for display.
        lossf = loss.item() * num_steps
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
    iter_num += 1
    local_iter_num += 1

    # Save a checkpoint every 10 iterations, independent of evaluation.
    if iter_num % 10 == 0:
        save_checkpts(raw_model, optimizer,save_ckpt_path, model_args, iter_num, best_val_loss)

    # Exit once iter_num passes the hard-coded limit of 3000 (max_iters is not used here).
    if iter_num > 3000 : # > max_iters:
        break