In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch import nn
from helpers import MultiHeadAttention

In [3]:
!uv add tiktoken

[2mResolved [1m75 packages[0m [2min 0.53ms[0m[0m
[2mAudited [1m55 packages[0m [2min 0.01ms[0m[0m


In [4]:
GPT_CONFIG = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers' : 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

In [5]:
class DummyTransformerBlock(nn.Module): 
    def __init__(self, cfg): 
        super().__init__()

    def forward(self, x): 
        return x
    
class DummyLayerNorm(nn.Module): 
    def __init__(self, normalized_shape, eps=1e-5): 
        super().__init__()

    def forward(self, x): 
        return x

In [6]:
class GPT_Model(nn.Module): 
    def __init__(self, cfg): 
        super().__init__()
        self.tok_emb = nn.Embedding(num_embeddings=cfg['vocab_size'], embedding_dim=cfg['emb_dim'])
        self.pos_emb = nn.Embedding(num_embeddings=cfg['context_length'], embedding_dim=cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])
        self.transformers = nn.Sequential(
            *[DummyTransformerBlock(cfg) for i in range(cfg['n_layers'])]
        )
        self.final_norm = DummyLayerNorm(cfg['drop_rate'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)


    def forward(self, in_idx): 
        b, t = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(
            torch.arange(t, device=in_idx.device)
        )
        x = tok_embeds + pos_embeds
        print(x.shape)
        x = self.drop_emb(x)
        x = self.transformers(x)
        x = self.final_norm(x)        
        logits = self.out_head(x)
        return logits

In [7]:
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')
batch = []
txt1 = 'Every effort moves you'
txt2 = 'Every day holds a'
print(tokenizer.encode(txt1))
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

[6109, 3626, 6100, 345]
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [8]:
torch.manual_seed(123)
# inp = torch.randint(0, 50257, (2, 3) )
gpt = GPT_Model(GPT_CONFIG)
out = gpt(batch)
out.shape

torch.Size([2, 4, 768])


torch.Size([2, 4, 50257])

In [9]:
out

tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)

In [10]:
torch.manual_seed(123)
inp = torch.randn((2,5))
l = nn.Sequential(nn.Linear(5, 3), nn.ReLU())
out = l(inp)

mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)

print(f"{mean=},\n{var=}")

print("---")

norm = (out - mean) / torch.sqrt(var)
print(norm.shape)
mean2 = norm.mean(dim=-1, keepdim=True)
var2 = norm.var(dim=-1, keepdim=True)
print(f"{mean2=},\n{var2=}")


mean=tensor([[0.0027],
        [0.0000]], grad_fn=<MeanBackward1>),
var=tensor([[2.2188e-05],
        [0.0000e+00]], grad_fn=<VarBackward0>)
---
torch.Size([2, 3])
mean2=tensor([[-3.9736e-08],
        [        nan]], grad_fn=<MeanBackward1>),
var2=tensor([[1.0000],
        [   nan]], grad_fn=<VarBackward0>)


In [11]:
class LayerNorm(nn.Module): 
    def __init__(self, emb_dim, eps=1e-5,):
        super().__init__()
        self.eps = eps
        self.offset = nn.Parameter(torch.ones(emb_dim))
        self.scale = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x): 
        mean = x.mean(dim = -1, keepdim=True)
        var = x.var(dim = -1, keepdim=True)

        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.offset + self.scale * norm_x

In [12]:
class GELU(nn.Module): 
    def __init__(self): 
        super().__init__()

    def forward(self, x): 
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [13]:
class FeedForward(nn.Module): 
    def __init__(self, cfg): 
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
            GELU(),
            nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim'])
        )

    def forward(self, x): 
        return self.layers(x)


In [14]:
class TransformerBlock(nn.Module): 
    def __init__(self, cfg): 
        super().__init__()
        self.ln1 = LayerNorm(emb_dim=cfg['emb_dim'])
        self.mha = MultiHeadAttention(cfg['emb_dim'], cfg['emb_dim'], cfg['context_length'], cfg['drop_rate'], n_heads=cfg['n_heads'], qkv_biases=False)
        self.drop1 = nn.Dropout(cfg['drop_rate'])
        self.ln2 = LayerNorm(emb_dim=cfg['emb_dim'])
        self.ff = FeedForward(cfg)
        self.drop2 = nn.Dropout(cfg['drop_rate'])


    def forward(self, x): 
        shortcut = x
        x = self.ln1(x)
        x = self.mha(x)
        x = self.drop1(x)
        x = x + shortcut

        shortcut = x
        x = self.ln2(x)
        x = self.ff(x)
        x = self.drop2(x)
        x = x + shortcut

        return x
    
class DummyLayerNorm(nn.Module): 
    def __init__(self, normalized_shape, eps=1e-5): 
        super().__init__()

    def forward(self, x): 
        return x

In [17]:
torch.manual_seed(123)
input = torch.randn((2, 4, 768))
tr = TransformerBlock(GPT_CONFIG)
out = tr(input)
out.shape

torch.Size([2, 12, 4, 64]) torch.Size([2, 12, 4, 64]) torch.Size([2, 12, 4, 64])
torch.Size([2, 4, 12, 64]) 768


torch.Size([2, 4, 768])

In [18]:
out

tensor([[[ 0.2422, -0.4683, -0.1459,  ...,  0.3523, -1.7843,  0.9341],
         [ 0.3987,  0.0514, -0.6592,  ...,  0.2607,  0.1367,  0.0394],
         [ 0.7005,  0.2444,  1.1003,  ..., -0.4044,  0.0659, -1.6542],
         [ 0.1096,  1.6913, -2.0804,  ...,  0.9084, -1.5746,  0.3340]],

        [[ 0.5687,  0.8201, -0.6751,  ...,  0.8507, -0.1991,  1.2828],
         [-0.2925, -0.8001, -0.6078,  ...,  1.5034, -0.0915,  0.6611],
         [-1.2220, -0.7000, -0.3639,  ...,  0.4061, -1.5129,  1.4650],
         [-0.5226, -1.1378,  0.4513,  ...,  0.6244, -0.9750,  0.9923]]],
       grad_fn=<AddBackward0>)

In [111]:
# Maybe plot relu and gelu to see

In [None]:
class GPTModel(nn.Module): 
    def __init__(self, cfg): 
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.tfs = nn.Sequential(*[TransformerBlock(cfg) for i in range(cfg['n_layers'])])