In [122]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
import math
from IPython.display import display, Image
import tiktoken
from torch.utils.data import DataLoader, Dataset

In [123]:
url = "https://substackcdn.com/image/fetch/w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F81c2aa73-dd8c-46bf-85b0-90e01145b0ed_1422x1460.png"
display(Image(url=url,
              width=400,
              height=500))

In [124]:
@dataclass
class Config:
    n_layers:int = 12
    d_model:int = 768
    eps:float = 1e-5
    hidden_size_multiplier:int = 4 
    num_heads:int = 12
    context_len:int = 1024
    dropout:float = 0.1
    qkv_bias:bool = False
    vocab_size:int = 50257
    
config = Config()
config   

Config(n_layers=12, d_model=768, eps=1e-05, hidden_size_multiplier=4, num_heads=12, context_len=1024, dropout=0.1, qkv_bias=False, vocab_size=50257)

In [125]:
# Layer Normalization layer
class LayerNormalization(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.eps = config.eps
        self.scale = nn.Parameter(torch.ones((config.d_model)))
        self.shift = nn.Parameter(torch.zeros((config.d_model))) 
        
    def forward(self, x: torch.Tensor):
        x_mean = x.mean(dim=-1, keepdim=True) 
        x_std = x.std(dim=-1, keepdim=True)
        x_norm = (x - x_mean) / (x_std + self.eps)
        return x * self.scale + self.shift

In [126]:
# FeedForward Layer
class FeedForwardLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        d_model = config.d_model
        hidden_size_multiplier = config.hidden_size_multiplier
        
        self.ff_layer = nn.Sequential(
            nn.Linear(d_model, hidden_size_multiplier * d_model),
            nn.GELU(),
            nn.Linear(hidden_size_multiplier * d_model, d_model)        
        )
        
    def forward(self, x):
        return x + self.ff_layer(x)

In [127]:
class SelfAttentionLayer(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.query_weights = nn.Linear(config.d_model, config.d_model, bias=config.qkv_bias)
        self.key_weights = nn.Linear(config.d_model, config.d_model, bias=config.qkv_bias)
        self.value_weights = nn.Linear(config.d_model, config.d_model, bias=config.qkv_bias)
        self.out_proj = nn.Linear(config.d_model, config.d_model)
        self.num_heads = config.num_heads
        assert config.d_model % config.num_heads == 0, "d_model should be divisible by num_heads"
        self.h_dmodel = config.d_model // config.num_heads
        self.neg_inf = - 1e+5
        self.drop_out = torch.nn.Dropout(config.dropout)
        self.register_buffer('casual_mask', tensor=torch.triu(torch.ones(config.context_len, config.context_len), diagonal=1).bool())
    
    def forward(self, x):
        # x: (B, S, d_model)
        qeury_vectors = self.query_weights(x)
        key_vectors = self.key_weights(x)
        value_vectors = self.value_weights(x)
        batch_size, seq_len, d_model = x.size()
        
        # (B,S,d_model) -> (B, S, num_head, h_dmodel)
        qeury_vectors = qeury_vectors.view(batch_size, seq_len, self.num_heads, self.h_dmodel)
        key_vectors = key_vectors.view(batch_size, seq_len, self.num_heads, self.h_dmodel)
        value_vectors = value_vectors.view(batch_size, seq_len, self.num_heads, self.h_dmodel)
        
        # (B, Seq, num_heads, h_dmodel) -> (B, num_heads, Seq, h_dmodel)
        qeury_vectors = torch.permute(qeury_vectors, dims=(0, 2, 1, 3))
        key_vectors = torch.permute(key_vectors, dims=(0, 2, 1, 3))
        value_vectors = torch.permute(value_vectors, dims=(0, 2, 1, 3))
        mask = self.casual_mask[ :seq_len, : seq_len]
        
        # mask = self.casual_mask[:seq_len, :seq_len]  # (S, S)
        # mask = mask.unsqueeze(0).unsqueeze(0)  # (1, 1, S, S)
         
        attention_scores = self.calculate_attention_score(qeury_vectors, key_vectors, mask)
        contextualized_vectores = attention_scores @ value_vectors
        
        # (B, num_heads, seq, head_d) => (B, seq, num_head, head_d) => (b, seq, d_model)
        contextualized_vectores = torch.permute(contextualized_vectores, dims=(0, 2, 1, 3))
        contextualized_vectores = contextualized_vectores.contiguous().view(batch_size, seq_len, self.num_heads*self.h_dmodel)
        contextualized_vectores = self.out_proj(contextualized_vectores)
        return (contextualized_vectores, attention_scores)
    
    def calculate_attention_score(self, qeury, key, mask):
        # (B,NumHeads,Seq, h_dmodel) * (B,num_heads,h_model, seq) => (B,num_heads, seq, seq)
        k_dmodel = key.size(-1)
        attention_scores = (qeury @ key.transpose(-1,-2)) / math.sqrt(k_dmodel)
        attention_scores = torch.masked_fill(attention_scores, mask=mask, value=self.neg_inf)
        attention_scores = torch.softmax(attention_scores, dim=-1)
        return self.drop_out(attention_scores)

In [306]:
tokenizer.encode(text, allowed_special={'<|endoftext|>'})

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [329]:
class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, max_len, stride):
        super().__init__()
        self.input_ids = []
        self.target_ids = []
        tokens = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
        
        for i in range(0, len(tokens) - max_len, stride):
            self.input_ids.append(torch.tensor(tokens[i: i + max_len]))
            self.target_ids.append(torch.tensor(tokens[i+1 : i + max_len+1]))
            
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

In [330]:
def create_dataloader(txt, batch_size=4, max_len=256,
                      stride=256, shuffle=True,
                      drop_last=True, num_workers=0):
    
    tokenizer = tiktoken.get_encoding('gpt2')
    dataset = GPTDataset(text=txt, tokenizer=tokenizer, max_len=max_len, stride=stride)
    
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=True,
        drop_last=True,
        num_workers=num_workers
    )
    return dataloader

In [331]:
class TransformerBlock(nn.Module):
    def __init__(self,config: Config):
        super().__init__()
        self.attention = SelfAttentionLayer(config)
        self.layer_norm1 = LayerNormalization(config)
        self.layer_norm2 = LayerNormalization(config)
        self.feedforward = FeedForwardLayer(config)
        self.dropout = nn.Dropout(config.dropout)
        
    def forward(self, x):
        shortcut = x # Residual connection
        x = self.layer_norm1(x)
        x, _ = self.attention(x)
        x = self.dropout(x)
        x = shortcut + x
        
        shortcut = x
        x = self.layer_norm2(x)
        x = self.feedforward(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [310]:
model = GPTModel(config=config)

In [192]:
model.token_embedding.weight

Parameter containing:
tensor([[-0.2034, -1.8195,  1.8681,  ..., -1.6026, -0.9439,  0.1391],
        [-0.1620,  0.1735,  1.3670,  ..., -2.3601,  0.1522,  0.3595],
        [ 0.2789, -0.8138, -0.0779,  ..., -1.1018,  0.4719, -0.3231],
        ...,
        [ 0.9673, -1.5299,  0.3638,  ...,  1.3781,  0.4221, -0.0816],
        [ 0.2337, -0.1659, -0.1909,  ..., -1.7009,  0.7488, -1.4733],
        [-1.6854,  0.8027,  0.4698,  ..., -1.5249,  0.1988, -1.1721]],
       requires_grad=True)

In [None]:
tok

In [209]:
torch.manual_seed(10)
x = torch.randint(high=10,size=(1,5),dtype=torch.int)

with torch.no_grad():
    out = model(x)
out

tensor([[[-18301.0977,   2574.8599,  -6081.0327,  ...,  10457.9951,
            5303.9194,   8569.9346],
         [  1845.4386, -14031.1133,  11964.7246,  ...,   4033.2283,
           -8675.0771,  14809.2373],
         [ -1030.9122,  12628.1152,   6296.4302,  ...,   5458.8140,
          -13570.9648,   4044.2893],
         [-12833.8730, -16818.4707,   8181.5537,  ...,   1887.0457,
             337.9720,  -1076.5248],
         [ -5351.8110,  -8994.9697,  11487.9004,  ...,   -559.9611,
            1598.4569,  11646.1064]]])

In [313]:
# Generate text
def genarate_text(input, max_len, context_len):
    model.eval()
    for _ in range(max_len):
        input = input[:, : context_len]
        
        with torch.no_grad():
            
            logits = model(input)
            last_token_logits = logits[:, -1, :]
            last_token_probs = torch.softmax(last_token_logits, dim=-1)        
            top_prob_token = torch.argmax(last_token_probs, dim=-1, keepdim=True)
            input = torch.cat([input, top_prob_token], dim=-1)
    
    return input

input = 'my name is'
input = tokenizer.encode(input)
input = torch.tensor(input).unsqueeze(0)
print(input[:, :10])
print(f'initial input : {input}')
out = genarate_text(input, max_len=5, context_len=1024)
print('out token', out)
tokenizer.decode(out.squeeze().tolist())

tensor([[1820, 1438,  318]])
initial input : tensor([[1820, 1438,  318]])
out token tensor([[ 1820,  1438,   318, 35612,  6877,  5198, 49616, 27137]])


'my name isWing hat appeal 1070296'

In [314]:
pwd

'/Users/abdulvajid/AI/PROJECTS/LLM-FROM-SCRATCH/notebooks'

In [315]:
with open('theverdict.txt','r') as f:
    text = f.read()

In [332]:
dataloader = create_dataloader(text, batch_size=8, num_workers=0)

In [333]:
for batch, target in dataloader:
    break



In [336]:
batch, target

(tensor([[   11, 17728,   257,  ...,   422,   465,  2330],
         [  673,  1908,   329,  ...,   514,  2474,   198],
         [  503,  4291,   262,  ..., 19217, 24887, 13431],
         ...,
         [ 1459,   714,  1239,  ...,  1517,   484,  1444],
         [10197,   832,   262,  ...,  9074,    13,   402],
         [18560,   438,  7091,  ...,   338,  1804,   340]]),
 tensor([[17728,   257,  8500,  ...,   465,  2330, 22645],
         [ 1908,   329,   345,  ...,  2474,   198,   198],
         [ 4291,   262,  4252,  ..., 24887, 13431,    13],
         ...,
         [  714,  1239,   423,  ...,   484,  1444,   616],
         [  832,   262, 46475,  ...,    13,   402,   271],
         [  438,  7091,   750,  ...,  1804,   340,   329]]))