# GPT from Scratch

## Imports

In [1]:
import tiktoken
import torch
import torch.nn as nn
import torch.optim as optim

## The Data

### Download Data

In [2]:
# !curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > "shakespeare.txt"
# !curl https://www.gutenberg.org/cache/epub/174/pg174.txt > "dorian-gray.txt"
# !curl https://www.gutenberg.org/cache/epub/64317/pg64317.txt > "gatsby.txt"
# !curl https://www.gutenberg.org/ebooks/43.txt.utf-8 > "jekyll.txt"
# !curl https://www.gutenberg.org/ebooks/76101.txt.utf-8 > "theory-of-earth.txt"

In [3]:
# Load the whole text file into memory as a single string.
with open("shakespeare.txt", "r", encoding="utf-8") as f:
    bard_text = f.read()

# Bare last expression: the notebook displays this string as the cell output.
f"Number of characters: {len(bard_text)}"

'Number of characters: 1115394'

In [4]:
# with open("dorian-gray.txt", "r", encoding="utf-8") as f:
#     dorian_text = f.read()

# f"Number of characters: {len(dorian_text)}"

In [5]:
# with open("gatsby.txt", "r", encoding="utf-8") as f:
#     gatsby_text = f.read()

# f"Number of characters: {len(gatsby_text)}"

### Vocabulary and Tokenisation

In [6]:
def get_vocab(text):
    """Return the distinct characters of ``text`` as a sorted list."""
    unique_chars = set(text)
    return sorted(unique_chars)

def char_to_idx(chars=None, text=None):
    """Build a character -> integer index lookup table.

    Indices follow the iteration order of ``chars``. When ``chars`` is None
    the vocabulary is derived from ``text`` with ``get_vocab``.
    """
    vocab = get_vocab(text) if chars is None else chars
    lookup = {}
    for position, symbol in enumerate(vocab):
        lookup[symbol] = position
    return lookup

def idx_to_char(chars=None, text=None):
    """Build an integer index -> character lookup table.

    Indices follow the iteration order of ``chars``. When ``chars`` is None
    the vocabulary is derived from ``text`` with ``get_vocab``.
    """
    vocab = get_vocab(text) if chars is None else chars
    return dict(enumerate(vocab))

class TextManager:
    """Vocabulary holder and encoder/decoder for a text corpus.

    Two encoding schemes are available, selected by ``enc_method``:

    * ``"tiktoken"`` - the GPT-2 encoding obtained from
      ``tiktoken.get_encoding("gpt2")``.
    * anything else (default ``"simple"``) - character level: each distinct
      character of ``text`` maps to its position in the sorted vocabulary.

    Attributes:
        vocab: sorted list of the distinct characters found in ``text``.
        vocab_size: number of distinct token ids ``encode`` can emit. This is
            ``len(vocab)`` for the character scheme and the encoder's
            ``n_vocab`` for ``"tiktoken"``, so a model sized with it can
            index every id that ``encode`` returns.
        c_to_i / i_to_c: character <-> index lookup tables.
        enc_method: the scheme name passed to the constructor.
        enc: the tiktoken encoder, or None for the character scheme.
    """

    def __init__(self, text, enc_method="simple"):
        self.vocab = get_vocab(text)
        self.c_to_i = char_to_idx(self.vocab)
        self.i_to_c = idx_to_char(self.vocab)
        self.enc_method = enc_method

        if enc_method == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")
            # Token ids come from the BPE encoder, not from the character
            # vocabulary, so the id range is the encoder's vocabulary size.
            self.vocab_size = self.enc.n_vocab
        else:
            self.enc = None
            self.vocab_size = len(self.vocab)

    def __str__(self):
        return f"""Vocabulary (size = {self.vocab_size}):
        {"".join(self.vocab)}
        """

    def get_vocab(self, as_str=False):
        """Return the character vocabulary as a list, or joined into one
        string when ``as_str`` is true."""
        if as_str:
            return "".join(self.vocab)
        return self.vocab

    def encode(self, text):
        """
        Take a string, output a list of integers.

        Raises:
            KeyError: (character scheme only) if ``text`` contains a
                character that was not in the text given to the constructor.
        """
        if self.enc_method == "tiktoken":
            return self.enc.encode(text)
        try:
            return [self.c_to_i[c] for c in text]
        except KeyError as err:
            # Same exception type as a bare dict lookup, but say what failed.
            raise KeyError(
                f"character {err.args[0]!r} is not in the vocabulary"
            ) from err

    def decode(self, indices):
        """
        Take a list of integers, output a string.
        """
        if self.enc_method == "tiktoken":
            return self.enc.decode(indices)
        return "".join([self.i_to_c[idx] for idx in indices])

In [7]:
# Build a manager with the default ("simple", character-level) encoding for
# the corpus and print its vocabulary summary.
tm1 = TextManager(bard_text)
print(tm1)

Vocabulary (size = 65):
        
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
        


### Creating Dataset

In [8]:
# Encode the full corpus to token ids and hold them in a single int64 tensor.
data = torch.tensor(tm1.encode(bard_text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])  # first 100 token ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# Hold out the last `test_size` fraction of the token stream for validation.
# The split is positional (no shuffling): training is the head of the corpus,
# validation is the tail.
test_size = 0.1
n = int((1-test_size)*data.shape[0])  # index of the first validation token
train_data = data[:n]
val_data = data[n:]

print(f"Training Data ({train_data.shape})")
print(train_data[:100])
print(f"\nValidation Data ({val_data.shape})")
print(val_data[:100])

Training Data (torch.Size([1003854]))
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

Validation Data (torch.Size([111540]))
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10,  0, 19, 53, 53, 42,  1, 51, 53,
        56, 56, 53, 61,  6,  1, 52, 43, 47, 45, 46, 40, 53, 59, 56,  1, 14, 39,
        54, 58, 47, 57, 58, 39,  8,  0,  0, 14, 13, 28, 32, 21, 31, 32, 13, 10,
         0, 19, 53, 53, 42,  1, 51, 53, 56, 56, 53, 61,  6,  1, 52, 43, 47, 45,
        46, 40, 53, 59, 56,  1, 19, 56, 43, 51, 47, 53,  8,  0, 19, 53, 42,  1,
        57, 39, 60, 43,  1, 63, 53, 59,  6,  1])


### Batching

In [10]:
# Block size (T) = context length for prediction
# Batch size (B) = number of independent sequences we process in parallel
# Seed torch's global RNG so the random draws made after this point
# (e.g. the torch.randint offsets in create_batch) are repeatable.
torch.manual_seed(1337)

def create_batch(data, block_size=8, batch_size=4):
    """Sample a random batch of (input, target) windows from ``data``.

    ``batch_size`` start offsets are drawn uniformly with ``torch.randint``.
    Each input row is ``block_size`` consecutive items of ``data``; the
    matching target row is the same window shifted one position to the right.

    Returns:
        (x, y): two stacked tensors with ``batch_size`` rows each.
    """
    last_start = len(data) - block_size  # exclusive upper bound for offsets
    starts = torch.randint(last_start, (batch_size, )).tolist()

    inputs = [data[s:s + block_size] for s in starts]
    targets = [data[s + 1:s + 1 + block_size] for s in starts]
    return torch.stack(inputs), torch.stack(targets)

def batch_sanity_check(xb, yb):
    """Print a batch and spell out every (context, target) training pair.

    ``xb`` must be 2-D (its shape is unpacked into rows and time steps).
    For each row and each time step ``t`` the context is ``xb[row, :t+1]``
    and the target is ``yb[row, t]``.
    """
    n_rows, n_steps = xb.shape

    for label, batch in (("Inputs:", xb), ("Targets:", yb)):
        print(label, batch.shape, batch, sep="\n")
    print("=" * (3*n_steps + 20))

    for row in range(n_rows):  # batch dimension
        for step in range(n_steps):  # block (time) dimension
            seen = xb[row, :step+1].tolist()
            expected = yb[row, step]
            print(f"When input (context) is {seen} target = {expected}.")

In [11]:
# Draw one batch with create_batch's defaults (block_size=8, batch_size=4)
# and print every (context, target) pair it contains.
xb, yb = create_batch(train_data)
batch_sanity_check(xb, yb)

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
When input (context) is [24] target = 43.
When input (context) is [24, 43] target = 58.
When input (context) is [24, 43, 58] target = 5.
When input (context) is [24, 43, 58, 5] target = 57.
When input (context) is [24, 43, 58, 5, 57] target = 1.
When input (context) is [24, 43, 58, 5, 57, 1] target = 46.
When input (context) is [24, 43, 58, 5, 57, 1, 46] target = 43.
When input (context) is [24, 43, 58, 5, 57, 1, 46, 43] target = 39.
When input (context) is [44] target = 53.
When input (context) is [44, 53] target = 56.
When input (context) is [44, 53, 56] target = 1.
When input (context) is [44, 53,

## Models

### Model Helper Functions

In [12]:
def get_optimiser(opt_name, model, l_rate, **kwargs):
    """Create a torch optimiser over ``model.parameters()``.

    Args:
        opt_name: one of ``"adam"`` / ``"adamw"`` (both build ``optim.AdamW``),
            ``"sgd"``, ``"rms"`` (``optim.RMSprop``) or ``"lbfgs"``.
        model: object exposing ``parameters()`` (e.g. an ``nn.Module``).
        l_rate: learning rate, passed to the optimiser as ``lr``.
        **kwargs: forwarded unchanged to the optimiser constructor.

    Returns:
        The constructed optimiser.

    Raises:
        ValueError: if ``opt_name`` is not a recognised name. Previously an
            unknown name fell through and returned None, which only failed
            later at the first ``opt.zero_grad()`` call.
    """
    optimisers = {
        "adam": optim.AdamW,
        "adamw": optim.AdamW,
        "sgd": optim.SGD,
        "rms": optim.RMSprop,
        "lbfgs": optim.LBFGS,
    }
    if opt_name not in optimisers:
        raise ValueError(
            f"Unknown optimiser {opt_name!r}; expected one of {sorted(optimisers)}"
        )
    return optimisers[opt_name](model.parameters(), lr=l_rate, **kwargs)

### Bigram Language Model

In [13]:
class BigramLM(nn.Module):
    """Bigram language model.

    The only learnable component is a ``vocab_size x vocab_size`` embedding
    table: the row looked up for a token id is used directly as the logits
    for the next token, so each prediction depends on the current token only.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)
        self.loss = nn.CrossEntropyLoss()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            (B, T, C) and ``loss`` is None. With ``targets`` the logits are
            returned flattened to (B*T, C) and ``loss`` is a scalar tensor.
        """
        logits = self.embedding(idx) # (B x T x C)

        if targets is None:
            return logits, None

        # CrossEntropyLoss wants (N, C) logits against (N,) class ids.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # reshape (not view) so non-contiguous targets, e.g. a column slice
        # of a larger tensor, are accepted as well.
        targets = targets.reshape(B*T)

        loss = self.loss(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoid building a graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the prompt followed by
            the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Get predictions
            logits, _ = self(idx)
            logits = logits[:, -1, :] # (B x C), last time step only

            # Convert logits to probabilities
            probs = self.softmax(logits) # (B x C)

            # Sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B x 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B x T+1)
        return idx

In [14]:
# Forward pass example: because targets are supplied, the model returns the
# logits flattened to (B*T, C) together with the cross-entropy loss.
bi_lm = BigramLM(tm1.vocab_size)
logits, loss = bi_lm(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(5.0364, grad_fn=<NllLossBackward0>)


In [15]:
# Generation example (model is still untrained at this point)
start_char = " "
start_idx = tm1.encode(start_char)
start_idx = torch.tensor(start_idx, dtype=torch.long).view((1,1))  # shape (1, 1): one sequence holding one token

gen_idx = bi_lm.generate(start_idx, 100)[0].tolist()  # append 100 sampled tokens, keep the single sequence
print(tm1.decode(gen_idx))

 XAQHukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [16]:
# "adam" is mapped to optim.AdamW by get_optimiser.
opt = get_optimiser("adam", bi_lm, 1e-03)

# Train on 100000 freshly sampled random batches of 32 sequences each.
for steps in range(100000):
    # Sample batch of data
    xb, yb = create_batch(train_data, batch_size=32)

    # Evaluate loss
    logits, loss = bi_lm(xb, yb)
    # Clear gradients left from the previous step, backpropagate, then update.
    opt.zero_grad()
    loss.backward()
    opt.step()
# Loss of the final training batch only (not an average, not a validation loss).
print(loss.item())

2.5174548625946045


In [17]:
# Generation example with trained model
# Reuses start_idx (the single " " token, shape (1, 1)) from the earlier generation cell.
train_gen_idx = bi_lm.generate(start_idx, 100)[0].tolist()
print(tm1.decode(train_gen_idx))

 ardauther LLIZARI gatho ftcohanghorad
Age cur, aur hayis;
Wheano?
QUpe.
N otord, fane hiler, withy f
