# GPT from Scratch

## Imports

In [1]:
import sys
sys.path.append(".")
from scripts import data_handling as dh
from scripts.bigram_lm import BigramLM, train, generate_text

import torch
import torch.nn as nn
import torch.optim as optim

## Set Device

In [2]:
# Choose the compute backend in priority order: CUDA first, then Apple's
# MPS backend, and finally plain CPU. The chained conditional short-circuits,
# so the MPS probe only runs when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Bare expression so the notebook displays the selected device.
device

device(type='mps')

## The Data

### Get Data

In [3]:
# Fetch the "shakespeare" text through the project's data_handling helper.
# NOTE(review): where get_text reads from (disk vs. download) is not visible
# here — confirm in scripts/data_handling.
bard_text = dh.get_text("shakespeare")
# Bare f-string: displayed as the cell's result, reporting the text length.
f"Number of characters: {len(bard_text)}"

'Number of characters: 1115394'

### Vocabulary and Tokenisation

In [4]:
# Wrap the raw text in the project's TextManager; later cells use it for
# vocab_size, tensor conversion and the train/validation split.
tm1 = dh.TextManager(bard_text)
# Printing the manager shows its vocabulary and a preview of the text.
print(tm1)

Vocabulary (size = 65):
        
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

        First 1000 characters:
        First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor
        


### Creating Dataset

In [5]:
# Convert the whole text into a single tensor of token ids.
# presumably one integer id per character — verify against TextManager.
data = tm1.get_text_tensor()
# Inspect the tensor's shape and dtype, then peek at the first 100 ids.
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [6]:
# Fraction of the data to hold out for validation.
test_size = 0.1
# Split the encoded text into training and validation tensors.
# NOTE(review): the positional True looks like a "print a summary" flag —
# confirm against TextManager.get_text_tensor_split and pass it by keyword.
train_data, val_data = tm1.get_text_tensor_split(test_size, True)

Training Data (torch.Size([1003854]))
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

Validation Data (torch.Size([111540]))
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10,  0, 19, 53, 53, 42,  1, 51, 53,
        56, 56, 53, 61,  6,  1, 52, 43, 47, 45, 46, 40, 53, 59, 56,  1, 14, 39,
        54, 58, 47, 57, 58, 39,  8,  0,  0, 14, 13, 28, 32, 21, 31, 32, 13, 10,
         0, 19, 53, 53, 42,  1, 51, 53, 56, 56, 53, 61,  6,  1, 52, 43, 47, 45,
        46, 40, 53, 59, 56,  1, 19, 56, 43, 51, 47, 53,  8,  0, 19, 53, 42,  1,
        57, 39, 60, 43,  1, 63, 53, 59,  6,  1])


### Batching

In [7]:
# Block size (T) = context length for prediction
# Batch size (B) = number of independent sequences we process in parallel
# Seed the global RNG before sampling so the example batch is reproducible.
torch.manual_seed(1337)
block_size = 8
batch_size = 4
# Draw one batch of inputs (xb) and targets (yb) from the training data.
# presumably yb is xb shifted one position ahead — verify in create_batch.
xb, yb = dh.create_batch(train_data, block_size, batch_size, device)
# Print the batch and the context -> target pairs it encodes.
dh.batch_sanity_check(xb, yb)

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='mps:0')
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='mps:0')
When input (context) is [24] target = 43.
When input (context) is [24, 43] target = 58.
When input (context) is [24, 43, 58] target = 5.
When input (context) is [24, 43, 58, 5] target = 57.
When input (context) is [24, 43, 58, 5, 57] target = 1.
When input (context) is [24, 43, 58, 5, 57, 1] target = 46.
When input (context) is [24, 43, 58, 5, 57, 1, 46] target = 43.
When input (context) is [24, 43, 58, 5, 57, 1, 46, 43] target = 39.
When input (context) is [44] target = 53.
When input (context) is [44, 53] target = 56.
When input (context) is [44, 53, 56] target = 1.


## Models

### Bigram Language Model

In [8]:
# class BigramLM(nn.Module):
#     def __init__(self, vocab_size, n_embd=32, block_size=8):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, n_embd)
#         self.pos_embedding = nn.Embedding(block_size, n_embd)
#         self.lm_head = nn.Linear(n_embd, vocab_size)
#         self.loss = nn.CrossEntropyLoss()
#         self.softmax = nn.Softmax(dim=-1)

#     def forward(self, idx, targets=None):
#         B, T = idx.shape # (B x T)
#         tok_emb = self.embedding(idx) # (B x T x C)
#         pos_emb = self.pos_embedding(torch.arange(T, device=device)) # (T x C)
#         logits = self.lm_head(tok_emb + pos_emb) # (B x T x vocab_size)

#         if targets is not None:
#             # Reshape logits and targets
#             B, T, C = logits.shape
#             logits = logits.view(B*T, C)
#             targets = targets.view(B*T)
    
#             # Calculate loss
#             loss = self.loss(logits, targets)
#         else:
#             loss = None
        
#         return logits, loss

#     def generate(self, idx, max_new_tokens):
#         for _ in range(max_new_tokens):
#             # Get predictions
#             logits, _ = self(idx)
#             logits = logits[:, -1, :] # (B x C)
            
#             # Apply softmax to turn logits into probabilities
#             probs = self.softmax(logits) # (B x C)

#             # Sample from distribution
#             idx_next = torch.multinomial(probs, num_samples=1) # (B x 1)
#             idx = torch.cat((idx, idx_next), dim=1) # (B x T+1)
#         return idx

In [15]:
# # Forward pass example
# bi_lm = BigramLM(tm1.vocab_size)

# # Move model to selected device
# bi_lm.to(device)

# logits, loss = bi_lm(xb, yb)
# print(logits.shape)
# print(loss)

torch.Size([32, 65])
tensor(4.6424, device='mps:0', grad_fn=<NllLossBackward0>)


In [16]:
# # Generation example
# start_char = " "
# start_idx = tm1.encode(start_char)
# start_idx = torch.tensor(start_idx, dtype=torch.long, device=device).view((1,1))

# gen_idx = bi_lm.generate(start_idx, 100)[0].tolist()
# print(tm1.decode(gen_idx))

 U as&3mK?YMj$fEcqkPuQNRelOOOuUfZW&ewNy:r$c-jk,ECOIiHeg Eggqu
pCCrrvMtVcAoyDXPujNnv&?ofyO.vrFoJKyLTDL


In [17]:
# opt = get_optimiser("adam", bi_lm, 1e-03)

# for steps in range(100):
#     # Sample batch of data
#     xb, yb = create_batch(train_data, batch_size=32)

#     # Evaluate loss
#     logits, loss = bi_lm(xb, yb)
#     opt.zero_grad()
#     loss.backward()
#     opt.step()
# print(loss.item())

3.3296780586242676


In [18]:
# # Generation example with trained model
# train_gen_idx = bi_lm.generate(start_idx, 100)[0].tolist()
# print(tm1.decode(train_gen_idx))

 &PaLo oiimp;nytPPDuaV IYcE
bTG$eDwPxiY&s
vfkFcaNnl-hDN
Ly
Bn,w?wjelgBOu NwiWeEEce-lbustsBl!qt'3cQs;m


In [9]:
# Forward pass example
# Build the bigram model imported from scripts.bigram_lm, sized to the
# vocabulary held by the TextManager.
bi_lm = BigramLM(tm1.vocab_size, device=device)

# Move model to selected device
bi_lm.to(device)

# Passing targets alongside the inputs makes the model return a loss as well
# as the logits.
logits, loss = bi_lm(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.6424, device='mps:0', grad_fn=<NllLossBackward0>)


In [10]:
# Generation example with untrained model
# Start from a single space; at this point bi_lm has not been through the
# train() call that appears later in the notebook.
gen_text = generate_text(" ", tm1, bi_lm, device=device)
print(gen_text)

 U as&3mK?YMj$fEcqkPuQNRelOOOuUfZW&ewNy:r$c-jk,ECOIiHeg Eggqu
pCCrrvMtVcAoyDXPujNnv&?ofyO.vrFoJKyLTDL


In [11]:
# Train the bigram model on the training split, passing the validation split
# alongside it.
# presumably losses are reported every eval_interval steps — verify in
# scripts.bigram_lm.train.
train(bi_lm, train_data, val_data, max_iters=100000, eval_interval=4000, device=device)

Step: 0, Train Loss: 4.4576, Val Loss: 4.4575
Step: 4000, Train Loss: 2.4811, Val Loss: 2.5112
Step: 8000, Train Loss: 2.4617, Val Loss: 2.4952
Step: 12000, Train Loss: 2.4705, Val Loss: 2.5010
Step: 16000, Train Loss: 2.4549, Val Loss: 2.4999
Step: 20000, Train Loss: 2.4651, Val Loss: 2.4806
Step: 24000, Train Loss: 2.4690, Val Loss: 2.4934
Step: 28000, Train Loss: 2.4598, Val Loss: 2.4750
Step: 32000, Train Loss: 2.4535, Val Loss: 2.4727
Step: 36000, Train Loss: 2.4539, Val Loss: 2.4917
Step: 40000, Train Loss: 2.4641, Val Loss: 2.5089
Step: 44000, Train Loss: 2.4586, Val Loss: 2.5103
Step: 48000, Train Loss: 2.4589, Val Loss: 2.4972
Step: 52000, Train Loss: 2.4662, Val Loss: 2.4936
Step: 56000, Train Loss: 2.4527, Val Loss: 2.4868
Step: 60000, Train Loss: 2.4709, Val Loss: 2.5025
Step: 64000, Train Loss: 2.4539, Val Loss: 2.4891
Step: 68000, Train Loss: 2.4525, Val Loss: 2.4821
Step: 72000, Train Loss: 2.4564, Val Loss: 2.4906
Step: 76000, Train Loss: 2.4574, Val Loss: 2.4897
Step: 

In [12]:
# Generation example with trained model
# Same prompt (a single space) as the untrained example, so the two samples
# can be compared directly.
gen_text = generate_text(" ", tm1, bi_lm, device=device)
print(gen_text)

 ouspo onimppry PS:
ARThome
bur'TRDUMEYCKEESThananl-htcany
BORYOFLAMASOf gayovof,
YOUKENGBYownthcus;

