# Build GPT Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

torch.manual_seed(1337)

<torch._C.Generator at 0x132096b30>

In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(len(text))

1115394


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(f"{''.join(chars)} | {vocab_size} chars")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | 65 chars


#### Character Level Tokenizer
1 to 1 mapping for char->int

In [4]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


#### Tokenize and Split Data

In [5]:
data = torch.tensor(encode(text), dtype=torch.long)
print(f"Total Data: {data.shape}, {data.dtype}\n")

n = int(0.9*len(data)) # first 90% will be train, rest val
train_data, val_data = data[:n], data[n:]
print(f"Train Data: {train_data.shape}, {train_data.dtype} | {train_data.shape[0]/data.shape[0]*100:.2f}%")
print(f"Val Data: {val_data.shape}, {val_data.dtype} | {val_data.shape[0]/data.shape[0]*100:.2f}%")

Total Data: torch.Size([1115394]), torch.int64

Train Data: torch.Size([1003854]), torch.int64 | 90.00%
Val Data: torch.Size([111540]), torch.int64 | 10.00%


### Data Preparation

Overview of how we want to partition the data into X,Y

In [6]:
"""
X: if block_size (context) is n, then n tensors (of length 1 to n-1) will be created via a simple sliding window
Y: the next (target) character  
"""
block_size = 8 
x,y = train_data[:block_size], train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [7]:
batch_size, block_size = 4, 8

def get_batch(split:str):
    # select split
    data = train_data if split == 'train' else val_data
    # find b random start points from 0 to len(data)-blocksize 
    ix = torch.randint(len(data)-block_size, (batch_size,))
    # basically the above sliding window algo
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x,y

xb,yb = get_batch("train")


for b in range(batch_size):
    for t in range(block_size):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"when input is {context} the target: {target}")
    print("\n")

when input is tensor([24]) the target: 43
when input is tensor([24, 43]) the target: 58
when input is tensor([24, 43, 58]) the target: 5
when input is tensor([24, 43, 58,  5]) the target: 57
when input is tensor([24, 43, 58,  5, 57]) the target: 1
when input is tensor([24, 43, 58,  5, 57,  1]) the target: 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) the target: 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target: 39


when input is tensor([44]) the target: 53
when input is tensor([44, 53]) the target: 56
when input is tensor([44, 53, 56]) the target: 1
when input is tensor([44, 53, 56,  1]) the target: 58
when input is tensor([44, 53, 56,  1, 58]) the target: 46
when input is tensor([44, 53, 56,  1, 58, 46]) the target: 39
when input is tensor([44, 53, 56,  1, 58, 46, 39]) the target: 58
when input is tensor([44, 53, 56,  1, 58, 46, 39, 58]) the target: 1


when input is tensor([52]) the target: 58
when input is tensor([52, 58]) the target: 1
when input is ten

#### Simple Bigram Model

In [8]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size:int):
        super().__init__()
        # create an embedding matrix 
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        # get data from embedding space
        logits = self.token_embedding_table(idx) # (B:batch, T:time (context), C:embedding_dim (channels))
        
        if targets is None:
            loss = None
        else:
            # dims of the batched logits 
            B,T,C = logits.shape
            # we want to squash the batches st we can evaluate with cross entropy
            logits = logits.view(B*T,C)
            targets = targets.view(B*T) # note targets is shape (B:batch, T:time)
            # use cross entropy to calculate the loss
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    # this generate function is a core part of how autoregressive models function
    def generate(self, idx, max_new_tokens:int):
        # idx is (B,T) array of indices in our context window
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            # go to the last character so that we can get the nxt char prediction for that char
            logits = logits[:,-1,:] # (B,T,C) -> (B,C) @ last element
            probs = F.softmax(logits, dim=-1) # (B,C)
            # calculate the next idx from the distribution above
            next_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            # concatenate this to idx so that we can predict the nxt char of this one
            # effectively the context gets larger and larger (even though this bigram model does not need it)
            idx = torch.cat((idx, next_idx), dim=1) # (B, T+1)

        return idx


    
m = BigramLanguageModel(vocab_size)
out, loss = m(xb,yb)
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))



lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


optimizer

In [9]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

#### Train the Bigram model

In [10]:
batch_size=32

for step in range(10000):
    # get the batched data
    xb,yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb,yb)
    # clear gradients
    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    # update grads
    optimizer.step()

    print(loss.item())

4.647705078125
4.722455024719238
4.717485427856445
4.6955246925354
4.676054954528809
4.682693958282471
4.67878532409668
4.65626859664917
4.60831880569458
4.618005275726318
4.576215744018555
4.73443603515625
4.60615348815918
4.540811538696289
4.590214252471924
4.597837448120117
4.573626518249512
4.599428653717041
4.612548828125
4.6582255363464355
4.647839069366455
4.584323883056641
4.5858540534973145
4.57781982421875
4.654167652130127
4.67994499206543
4.568190574645996
4.636806488037109
4.593273162841797
4.6050705909729
4.542919635772705
4.583306312561035
4.681209564208984
4.553791046142578
4.628030776977539
4.502110004425049
4.617300987243652
4.559780120849609
4.582223892211914
4.55769157409668
4.660159111022949
4.530014514923096
4.597833633422852
4.525142192840576
4.486090660095215
4.622176170349121
4.4698638916015625
4.6170148849487305
4.606544017791748
4.591933727264404
4.646505355834961
4.568349838256836
4.622153282165527
4.527293682098389
4.59453010559082
4.686464309692383
4.59879

In [11]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


M:
IUSh t,
F th he d ke alved.
Thupld, cipbll t
I: ir w, l me sie hend lor ito'l an e

I:
Gochosen e


## Transformers!! :-)

#### Understanding Attention
The next character generated must be based on the previous characters. This section builds from naive methods to understand how attention works. 

In [12]:
# consider the following toy example
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

V1: Suppose we average out the previous context w/ loops

In [13]:
xbow = torch.zeros((B,T,C)) # empty bag of words

for b in range(B): #iterate thru the batches
    for t in range(T): #iterate thru the time stamps
        # get the channels from 0 to t
        xprev = x[b,:t+1] # (t,C) 
        # average out all of the channels
        xbow[b,t] = torch.mean(xprev,0) # each mean calculated is dim C

# NOTE this is super innefficient since we have to loop over the Bs and Ts

V2: Do the above but w/ matrix multiplications

In [14]:
"""
Walk thru how the matrices can replicate the above behavior w/ a toy example
"""
a = torch.tril(torch.ones(3,3)) # LU Matrix (A,A)
a = a/a.sum(dim=1, keepdim=True) # average out row wise
b = torch.randint(0,10,(3,2)).float()  #(A,B)
c = a @ b # (A,A) @ (A,B) -> (A,B)

print(f'a = {a}')
print('-------------------------------------')
print(f'b = {b}')
print('-------------------------------------')
print(f'c = {c}')



a = tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
-------------------------------------
b = tensor([[1., 9.],
        [0., 5.],
        [9., 5.]])
-------------------------------------
c = tensor([[1.0000, 9.0000],
        [0.5000, 7.0000],
        [3.3333, 6.3333]])


In [15]:
"""
Implementation of V2
"""
w = torch.tril(torch.ones(T,T)) # LU Matrix of dims (T,T)
w = w / w.sum(1, keepdim=True)

# NOTE: x: (B,T,C), w: (T,T) so the matmul will add a batch dim 
# (T,T) @ (B,T,C) -> (B,T,T) @ (B,T,C) -> (B,T,C)
xbow_v2 = w @ x 

# Compare og solution to matmul solution
torch.allclose(xbow, xbow_v2)

True

V3: Use a Softmax

In [16]:
tril  = torch.tril(torch.ones(T,T)) # LU of dim (T,T)
w = torch.zeros(T,T) 
w = w.masked_fill(tril == 0, float("-inf")) # basically fill the upper triangle w/ -inf
# note that e^0 = 1 and e^-inf = 0 
w = F.softmax(w, dim=-1)
xbow_v3 = w @ x

torch.allclose(xbow, xbow_v3)

True

V4: The Attention Head

In [17]:
B,T,C  = 4, 8, 32 # (batch, time, channels)
x = torch.randn(B,T,C)

# ATTENTION HEAD
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear (C, head_size, bias=False)

k = key(x)    # (B,T, head_size)
q = query(x)  # (B,T, head_size)
w = q @ k.transpose(-2,-1) # (B,T,head_size) @ (B, head_size, T) --> (B,T,T)


# mask the upper triangle of weights since we dont need the information of predictions after 
tril  = torch.tril(torch.ones(T,T)) # LU of dim (T,T)
w = w.masked_fill(tril == 0, float("-inf")) # basically fill the upper triangle w/ -inf
# note that e^0 = 1 and e^-inf = 0 
w = F.softmax(w, dim=-1)


v = value(x)
# if we used x instead of v, out's shape would be B,T,C
# since we are computing using a value layer, outs shape is B,T,head_size
out = w @ v 
print(v.shape)
print(out.shape)

w[0]

torch.Size([4, 8, 16])
torch.Size([4, 8, 16])


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1436, 0.8564, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5250, 0.1493, 0.3257, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4287, 0.1321, 0.0129, 0.4263, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0883, 0.1194, 0.4669, 0.0660, 0.2596, 0.0000, 0.0000, 0.0000],
        [0.1068, 0.1227, 0.0559, 0.5151, 0.0072, 0.1924, 0.0000, 0.0000],
        [0.0930, 0.1754, 0.1408, 0.0864, 0.1743, 0.2484, 0.0817, 0.0000],
        [0.1670, 0.0627, 0.0446, 0.2311, 0.1013, 0.2390, 0.0702, 0.0842]],
       grad_fn=<SelectBackward0>)