In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# We going to bulid tiny gpt-2 with Rope so we going to call this Generative Pre trained Roformer(GPR)

##  load the dataset from huggingface  

In [2]:
import os
import urllib.request

file_path = "the-verdict.txt"
url = str("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt")

if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

In [3]:
from datasets import load_dataset

# Load English text
dataset =load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split="train"
)

print(len(dataset))
print(dataset[0]["text"])

README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

1801350



process the data

In [4]:
import re

def clean_wikitext(text):

    text = re.sub(r' =+ .* =+ ', ' ', text['text'])

    text = text.replace(" @-@ ", "-")
    text = text.replace(" @ ", " ")
    text = text.replace(" , ", ", ")
    text = text.replace(" . ", ". ")
    
    
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r' +', ' ', text)
    
    return {'text':text}
dataset=dataset.map(clean_wikitext)

Map:   0%|          | 0/1801350 [00:00<?, ? examples/s]

In [5]:
import tiktoken
tokenizer=tiktoken.get_encoding("gpt2" )
vocab_size=tokenizer.n_vocab
dataset = dataset.filter(lambda x: len(x["text"]) > 100)
dataset
dataset = dataset.shuffle(seed=42).select(range(100_000))

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

In [6]:
text = "\n".join(text_data)
token_ids1 = tokenizer.encode(text)
print("Total tokens:", len(token_ids1))

Total tokens: 40793


In [7]:
text1 = "\n".join(dataset["text"])
token_ids = tokenizer.encode(text1)
print("Total tokens:", len(token_ids))

Total tokens: 14913286


In [8]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, token_ids, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

      
       
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=256, shuffle=True, drop_last=True,
                         num_workers=0):


    tokenizer = tiktoken.get_encoding("gpt2")

    
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [9]:
import torch 
import math
import torch.nn as nn
import torch.nn.functional as F;

In [10]:
train=create_dataloader_v1(token_ids)
valid=create_dataloader_v1(token_ids1)


lets bulid model from scratch

In [11]:
class ScaleAttention(nn.Module):
    def __init__(self,dff):
        super(ScaleAttention,self).__init__()
        self.dff=dff
        
    def forward(self,query,key,value,mask=None):
        atten_score=torch.matmul(query,key.transpose(-1,-2))
        atten_score=atten_score/math.sqrt(self.dff)
        if mask is not None:            
            mask=mask.to(atten_score.device)
            atten_score=atten_score.masked_fill(mask,float('-inf'))
        
        atten_score=F.softmax(atten_score,dim=-1)
        atten_score=torch.nan_to_num(atten_score,nan=0.0)
        atten_score=torch.matmul(atten_score,value)
        return atten_score
        

In [12]:
class MultiHeadAttention(nn.Module):
    def __init__(self,dmodel,n_head,dff,max_seq,droprate=0.1):
        super(MultiHeadAttention,self).__init__()
        self.dmodel=dmodel
        self.dff=dff
        self.n_head=n_head
        self.drop=nn.Dropout(droprate)
        self.wquery=nn.Linear(self.dmodel,self.dmodel)
        self.wkey=nn.Linear(self.dmodel,self.dmodel)
        self.wvalue=nn.Linear(self.dmodel,self.dmodel)
        self.wo=nn.Linear(self.dmodel,self.dmodel)
        self.attention=ScaleAttention(self.dff)
        inf_fre=1.0/(10000**(torch.arange(0,self.dff,2).float()/self.dff))
        position=torch.arange(max_seq).float()
        angle=torch.einsum("i,j->ij",position,inf_fre)
        self.register_buffer("sine",torch.sin(angle))
        self.register_buffer("cose",torch.cos(angle))
        
       
        
        

    def Rotate_half(self,x):
        x_even=x[...,::2]
        x_odd=x[...,1::2]
        return torch.stack([-x_odd,x_even],dim=-1).flatten(-2)

    def apply_rope(self,x,start_pos=0):
        seq=x.size(-2)
        sin=self.sine[:start_pos+seq].unsqueeze(0).unsqueeze(0)
        cos=self.cose[:start_pos+seq].unsqueeze(0).unsqueeze(0)
        cos=cos.repeat_interleave(2,dim=-1)
        sin=sin.repeat_interleave(2,dim=-1)
        return x*cos+self.Rotate_half(x)*sin
        
    
    
    def split_head(self,x):
        return x.reshape(x.size(0),x.size(1), self.n_head,self.dff)
    def group_head(self,x):
        return x.reshape(x.size(0),x.size(1), self.n_head*self.dff)

    def forward(self,x,start_pos=0,mask=None):
        # x=x.permute(0, 2, 1)
        
        Q=self.split_head(self.wquery(x))
        v=self.split_head(self.wvalue(x))
        k=self.split_head(self.wkey(x))
        Q=self.apply_rope(Q.permute(0,2,1,3).contiguous(),start_pos=0)
        v=v.permute(0,2,1,3).contiguous()
        k=self.apply_rope(k.permute(0,2,1,3).contiguous(),start_pos=0)
        
        at=self.attention(Q,k,v,mask=mask)
        at=at.permute(0,2,1,3)
        at=self.group_head(at)
        at=self.drop(at)
        at=self.wo(at)
        return at
        
        
        

In [13]:
class Gelu(nn.Module):
    def __init__(self):
        super(Gelu,self).__init__()
        
    def forward(self,x):
        return 0.5* x*(1+torch.tanh((torch.sqrt(torch.tensor(2.0/torch.pi))*(x+0.044715*torch.pow(x,3)))))

In [14]:
class FeedForward(nn.Module):
    def __init__(self,dmodel,dff):
        super(FeedForward,self).__init__()
        self.dmodel=dmodel
        self.dff=dff
        self.layer=nn.Sequential(
            nn.Linear(self.dmodel,self.dff),
            Gelu(),
            nn.Linear(self.dff,self.dmodel)
        )
    def forward(self,x):
        return self.layer(x)
        

In [15]:
class Transformer(nn.Module):
    def __init__(self,dmodel,dff,n_head,f_dff,max_seq,droprate=0.1):
        super(Transformer,self).__init__()
        self.dmodel=dmodel
        self.dff=dff
        self.n_head=n_head
        self.f_dff=f_dff
        self.drop1=nn.Dropout(droprate)
        self.drop2=nn.Dropout(droprate)
        self.mha=MultiHeadAttention(dmodel=self.dmodel,dff=self.dff,n_head=self.n_head,droprate=droprate,max_seq=max_seq)
        self.ffn=FeedForward(dmodel=self.dmodel,dff=self.f_dff)
        self.lay1=nn.LayerNorm(self.dmodel)
        self.lay2=nn.LayerNorm(self.dmodel)

    def mask(self,xsize):
        return torch.triu(torch.ones(xsize,xsize),diagonal=1).bool()

    
    def forward(self,x,start_pos=0):
        mask1=self.mask(x.size(1))
        norm_x = self.lay1(x)
        x1=self.mha(norm_x,start_pos,mask=mask1)
        x=x+self.drop1(x1)
        norm=self.lay2(x)
        x1=self.ffn(norm)
        x=x+self.drop2(x1)
        return x
        
        
        
        
        

In [16]:
class GPR(nn.Module):
    def __init__(self,dmodel,dff,n_head,n_layer,f_dff,max_seq,vocab_size,droprate=0.1):
        super(GPR,self).__init__()
        self.dmodel=dmodel
        self.dff=dff
        self.n_head=n_head
        self.f_dff=f_dff
        self.n_layer=n_layer
        self.vocab_size=vocab_size
        self.norm=nn.LayerNorm(self.dmodel)
        self.transformer=nn.ModuleList([
            Transformer(dmodel=self.dmodel,dff=self.dff,n_head=self.n_head,f_dff=self.f_dff,droprate=droprate,max_seq=max_seq)
                      for _ in range(n_layer) ])
        self.embedd=nn.Embedding(self.vocab_size,self.dmodel)
        self.output=nn.Linear(self.dmodel,self.vocab_size, bias=False)
        self.output.weight = self.embedd.weight

    def forward(self, x,start_pos=0):
       
        x=self.embedd(x)
        for layer in self.transformer:
            x=layer(x,start_pos)
        x=self.norm(x)
        x=self.output(x)
        return x
        
        


In [17]:
model = GPR(
    dmodel=384,
    dff=64,          
    n_head=6,
    n_layer=6,
    f_dff=1536,
    max_seq=256,
    vocab_size=vocab_size,
    droprate=0.2
)

In [18]:
state_dict = torch.load(
    "/kaggle/input/gpr2weight12/gpr2step2026_v11.pth",
    map_location="cuda"   
)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [21]:
model.eval()

GPR(
  (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  (transformer): ModuleList(
    (0-5): 6 x Transformer(
      (drop1): Dropout(p=0.2, inplace=False)
      (drop2): Dropout(p=0.2, inplace=False)
      (mha): MultiHeadAttention(
        (drop): Dropout(p=0.2, inplace=False)
        (wquery): Linear(in_features=384, out_features=384, bias=True)
        (wkey): Linear(in_features=384, out_features=384, bias=True)
        (wvalue): Linear(in_features=384, out_features=384, bias=True)
        (wo): Linear(in_features=384, out_features=384, bias=True)
        (attention): ScaleAttention()
      )
      (ffn): FeedForward(
        (layer): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): Gelu()
          (2): Linear(in_features=1536, out_features=384, bias=True)
        )
      )
      (lay1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (lay2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    )
  )
  

In [19]:
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [20]:
total

29946240

In [21]:
trainable

29946240

### we going to train model with mixed precision and grad scale

In [22]:
import torch
from torch.amp import autocast, GradScaler
from tqdm.auto import tqdm
import os

device = "cuda"   
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4,
    weight_decay=0.01)
scaler = GradScaler()


**lr schueduler with large lr**

In [24]:
from torch.optim.lr_scheduler import LambdaLR
num_epochs = 5
total_steps = len(train) * num_epochs
warmup_steps = int(0.1 * total_steps) 

def lr_lambda(current_step):
    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))
    progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
    return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 

scheduler = LambdaLR(optimizer, lr_lambda)

In [34]:
model.train()


for i in range(5,num_epochs+1 ):
    progress = tqdm(train, desc="Training", leave=True)
    loss1=0
    for step, (x, y) in enumerate(progress):
        input_ids = x.to(device)
        targets = y.to(device)
    
        optimizer.zero_grad(set_to_none=True)
    
        with autocast(dtype=torch.float16,device_type=device):
            logits = model(input_ids)
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1)
            )
        loss1+=loss.item()
        
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        progress.set_postfix(loss=f"{loss.item():.4f}")
    print(f"{i+1} epochs  trainig loss-{loss1/len(train):.4f}")
    model.eval()

    progress1 = tqdm(valid, desc="validation", leave=True)
    loss1=0
    torch.save(model.state_dict(),f"gpr2step2026_v1{i}.pth")
    for l,(x,y) in enumerate(progress1):
        input_ids = x.to(device)
        targets = y.to(device)
    
        optimizer.zero_grad(set_to_none=True)
    
        with autocast(dtype=torch.float16,device_type=device):
            logits = model(input_ids)
            test_loss = torch.nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1)
            )
            loss1+=test_loss.item()
            progress1.set_postfix(loss=f"{loss1/len(valid):.4f}")       
    print(f"{i+1} epochs  valid loss-{test_loss.item():.4f}")

Training:   0%|          | 0/14563 [00:00<?, ?it/s]

6 epochs  trainig loss-5.0604


validation:   0%|          | 0/39 [00:00<?, ?it/s]

6 epochs  valid loss-9.4334


In [36]:
torch.save(model.state_dict(),f"gpr2_v15.0.pth")

In [None]:
1 epochs  trainig loss-15.8284
validation: 100%
 39/39 [00:01<00:00, 44.85it/s, loss=11.3798]
1 epochs  valid loss-11.5047
Training: 100%
 14563/14563 [14:59<00:00, 16.17it/s, loss=5.4981]
2 epochs  trainig loss-5.7788
validation: 100%
 39/39 [00:01<00:00, 43.95it/s, loss=11.0614]
2 epochs  valid loss-11.1393
Training: 100%
 14563/14563 [14:43<00:00, 16.61it/s, loss=4.8711]
Training: 100%
 14563/14563 [14:43<00:00, 16.61it/s, loss=4.8711]
3 epochs  trainig loss-5.5115
validation: 100%
 39/39 [00:01<00:00, 45.58it/s, loss=9.9873]
3 epochs  valid loss-10.0001
Training: 100%
 14563/14563 [14:36<00:00, 16.32it/s, loss=4.7956]
4 epochs  trainig loss-5.0397
validation: 100%
 39/39 [00:01<00:00, 45.74it/s, loss=9.5151]
4 epochs  valid loss-9.4670
Training: 100%
 14563/14563 [14:45<00:00, 16.60it/s, loss=5.0777]
6 epochs  trainig loss-5.0604
validation: 100%
 39/39 [00:00<00:00, 46.44it/s, loss=9.4987]
6 epochs  valid loss-9.4334

# **perplexity of the model is 157.65**

In [41]:
def generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
    
):
    model.eval()

   

    # create input on CPU, then move
    input_ids = torch.tensor(
        tokenizer.encode(prompt),
        dtype=torch.long
    ).unsqueeze(0).to('cuda')

    for _ in range(max_new_tokens):
        with torch.no_grad():
            logits = model(input_ids)[:, -1, :]

        # repetition penalty
        if repetition_penalty != 1.0:
            for token_id in set(input_ids[0].tolist()):
                logits[:, token_id] /= repetition_penalty

        # temperature (safe)
        temperature = max(temperature, 1e-6)
        logits = logits / temperature

        # top-k
        if top_k > 0:
            values, _ = torch.topk(logits, top_k)
            min_values = values[:, -1].unsqueeze(-1)
            logits = torch.where(
                logits < min_values,
                torch.full_like(logits, -1e10),
                logits
            )

        # softmax
        probs = F.softmax(logits, dim=-1)

        # top-p (nucleus) — SAFE
        if top_p < 1.0:
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

            # keep at least one token
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 0] = False

            sorted_probs[sorted_indices_to_remove] = 0.0
            probs = torch.zeros_like(probs).scatter(
                1, sorted_indices, sorted_probs
            )

        # final safety check
        if probs.sum() <= 0:
            probs = F.softmax(logits, dim=-1)

        next_token = torch.multinomial(probs, num_samples=1)
        input_ids = torch.cat([input_ids, next_token], dim=1)

    return tokenizer.decode(input_ids[0].tolist())


print("\n\n",generate(model, tokenizer, "superman", max_new_tokens=200))



 superman, and the latter was to be one of the most popular culture. The first book by the United States, in a review for the film 's development, was released on June 13, 2011. 

 In October 2013, it became available as an international artist, with the Japanese version of the game : The Game Show ( July 14, 2008 ), which included some of the games that have been featured on " One of the best songs on the album ", but noted that there is no one of its original recordings of the series ". The company has also described the music as " the most exciting character in this episode ". It originally aired on December 15, 2010, in North America, Canada, and Australia on August 10, 2000. 

 After the war, he served in the American Civil War, where he had won three gold medals at the time, including the Royal Navy in February 2006. He was selected to serve under the command of General Daniel Burt in April 2005 when he entered the
