# Build GPT Model

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

torch.manual_seed(1337)

<torch._C.Generator at 0x1292be990>

In [23]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(len(text))

1115394


In [33]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(f"{''.join(chars)} | {vocab_size} chars")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | 65 chars


#### Character Level Tokenizer
1 to 1 mapping for char->int

In [25]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Tokenize and Split Data

In [26]:
data = torch.tensor(encode(text), dtype=torch.long)
print(f"Total Data: {data.shape}, {data.dtype}\n")

n = int(0.9*len(data)) # first 90% will be train, rest val
train_data, val_data = data[:n], data[n:]
print(f"Train Data: {train_data.shape}, {train_data.dtype} | {train_data.shape[0]/data.shape[0]*100:.2f}%")
print(f"Val Data: {val_data.shape}, {val_data.dtype} | {val_data.shape[0]/data.shape[0]*100:.2f}%")

Total Data: torch.Size([1115394]), torch.int64

Train Data: torch.Size([1003854]), torch.int64 | 90.00%
Val Data: torch.Size([111540]), torch.int64 | 10.00%


### Data Preparation

Overview of how we want to partition the data into X,Y

In [37]:
"""
X: if block_size (context) is n, then n tensors (of length 1 to n-1) will be created via a simple sliding window
Y: the next (target) character  
"""
block_size = 8 
x,y = train_data[:block_size], train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [47]:
batch_size, block_size = 4, 8

def get_batch(split:str):
    # select split
    data = train_data if split == 'train' else val_data
    # find b random start points from 0 to len(data)-blocksize 
    ix = torch.randint(len(data)-block_size, (batch_size,))
    # basically the above sliding window algo
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x,y

xb,yb = get_batch("train")
print(x.shape)

for b in range(batch_size):
    for t in range(block_size):
        context = x[b,:t+1]
        target = y[b,t]
        print(f"when input is {context} the target: {target}")
    print("\n")

torch.Size([4, 8])
when input is tensor([52]) the target: 42
when input is tensor([52, 42]) the target: 8
when input is tensor([52, 42,  8]) the target: 0
when input is tensor([52, 42,  8,  0]) the target: 0
when input is tensor([52, 42,  8,  0,  0]) the target: 23
when input is tensor([52, 42,  8,  0,  0, 23]) the target: 21
when input is tensor([52, 42,  8,  0,  0, 23, 21]) the target: 26
when input is tensor([52, 42,  8,  0,  0, 23, 21, 26]) the target: 19


when input is tensor([45]) the target: 53
when input is tensor([45, 53]) the target: 42
when input is tensor([45, 53, 42]) the target: 57
when input is tensor([45, 53, 42, 57]) the target: 0
when input is tensor([45, 53, 42, 57,  0]) the target: 23
when input is tensor([45, 53, 42, 57,  0, 23]) the target: 43
when input is tensor([45, 53, 42, 57,  0, 23, 43]) the target: 43
when input is tensor([45, 53, 42, 57,  0, 23, 43, 43]) the target: 54


when input is tensor([52]) the target: 1
when input is tensor([52,  1]) the target: 6

### Simple Bigram Model

In [76]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size:int):
        super().__init__()
        # create an embedding matrix 
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        # get data from embedding space
        logits = self.token_embedding_table(idx) # (B:batch, T:context, C:embedding_dim)
        
        if targets is None:
            loss = None
        else:
            # dims of the batched logits 
            B,T,C = logits.shape
            # we want to squash the batches st we can evaluate with cross entropy
            logits = logits.view(B*T,C)
            targets = targets.view(B*T) # note targets is shape (B:batch, T:context)
            # use cross entropy to calculate the loss
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    # this generate function is a core part of how autoregressive models function
    def generate(self, idx, max_new_tokens:int):
        # idx is (B,T) array of indices in our context window
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            # go to the last character so that we can get the nxt char prediction for that char
            logits = logits[:,-1,:] # (B,T,C) -> (B,C) @ last element
            probs = F.softmax(logits, dim=-1) # (B,C)
            # calculate the next idx from the distribution above
            next_idx = torch.multinomial(probs, num_samples=1) # (B,1)
            # concatenate this to idx so that we can predict the nxt char of this one
            # effectively the context gets larger and larger (even though this bigram model does not need it)
            idx = torch.cat((idx, next_idx), dim=1) # (B, T+1)

        return idx


    
m = BigramLanguageModel(vocab_size)
out, loss = m(xb,yb)
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))



d-liIAxOzfVYOTmryOa:ymywMXh:yjws:dcNfp :?evM!lxLJgdC&VMCeaOI!Gzl C3d' Y!nVXNtid? :nR,'-Y
ti?fQqvDuq.


optimizer

In [77]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

#### Train the Bigram model

In [87]:
batch_size=32

for step in range(10000):
    # get the batched data
    xb,yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb,yb)
    # clear gradients
    optimizer.zero_grad(set_to_none=True)
    # backprop
    loss.backward()
    # update grads
    optimizer.step()

    print(loss.item())

2.4574716091156006
2.4994096755981445
2.5237841606140137
2.4857075214385986
2.3326988220214844
2.485532760620117
2.3663136959075928
2.493509292602539
2.2752411365509033
2.434361696243286
2.373743772506714
2.471689224243164
2.5559444427490234
2.482396364212036
2.5345921516418457
2.4167704582214355
2.492621898651123
2.540714979171753
2.461087942123413
2.609520196914673
2.4723758697509766
2.5721139907836914
2.3551275730133057
2.484105348587036
2.5172927379608154
2.377882242202759
2.592900037765503
2.6063849925994873
2.476364850997925
2.463266372680664
2.4199023246765137
2.4261839389801025
2.5038974285125732
2.3876359462738037
2.343299150466919
2.4056262969970703
2.383185386657715
2.614135980606079
2.4268367290496826
2.55153489112854
2.4680063724517822
2.422163724899292
2.4711923599243164
2.588444948196411
2.42499041557312
2.5086307525634766
2.5486786365509033
2.4361073970794678
2.3028383255004883
2.492863416671753
2.5433692932128906
2.3795228004455566
2.43080997467041
2.4392971992492676
2

In [88]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Wh, thed.'lea,
Bus. pue, bul in o,--ld, me oregowh se d bllard nd ngad

T: tuch ceatavellllfinckng t
