# GPT from Scratch

## Imports

In [1]:
import sys
sys.path.append(".")
from scripts import data_handling as dh
from scripts.model_helpers import train, generate_text
from scripts.bigram_lm import BigramLM
from scripts.attention_lm import AttentionLM

import torch
import torch.nn as nn
import torch.optim as optim

## Set Device

In [2]:
# Pick the best available compute device: CUDA GPU first, then Apple's MPS
# backend, otherwise fall back to the CPU.
#
# `torch.backends.mps` only exists from PyTorch 1.12 onwards, so it is looked up
# defensively; on older installs the notebook falls through to the CPU instead
# of failing with AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Bare expression so the notebook displays which device was selected.
device

device(type='mps')

## The Data

### Get Data

In [3]:
# Load the "shakespeare" corpus through the project's data helper and display
# how many characters it contains (the f-string is the cell's output).
# NOTE(review): assumes dh.get_text returns a plain str - confirm in
# scripts/data_handling.py.
bard_text = dh.get_text("shakespeare")
f"Number of characters: {len(bard_text)}"

'Number of characters: 1115394'

### Vocabulary and Tokenisation

In [4]:
# Wrap the corpus in a TextManager; the same object is used later for
# tokenisation (get_text_tensor), the train/val split and text generation.
# Printing it shows the vocabulary with its size and the first 1000 characters
# of the text.
tm1 = dh.TextManager(bard_text)
print(tm1)

Vocabulary (size = 65):
        
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

        First 1000 characters:
        First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor
        


### Creating Dataset

In [5]:
# Encode the whole corpus as one tensor of token ids, then take a quick look:
# first its size and dtype, then the first 100 ids.
data = tm1.get_text_tensor()
print(data.size(), data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [6]:
# Fraction of the corpus held out for validation (10%).
test_size = 0.1
# Split the encoded corpus into training and validation tensors.
# NOTE(review): the second positional argument (True) presumably turns on the
# printed summary of both splits seen in the output - confirm against
# TextManager.get_text_tensor_split.
# NOTE(review): the printed training tensor starts with the same ids as the
# full corpus, so the split looks contiguous (head = train, tail = val) rather
# than shuffled - confirm.
train_data, val_data = tm1.get_text_tensor_split(test_size, True)

Training Data (torch.Size([1003854]))
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

Validation Data (torch.Size([111540]))
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10,  0, 19, 53, 53, 42,  1, 51, 53,
        56, 56, 53, 61,  6,  1, 52, 43, 47, 45, 46, 40, 53, 59, 56,  1, 14, 39,
        54, 58, 47, 57, 58, 39,  8,  0,  0, 14, 13, 28, 32, 21, 31, 32, 13, 10,
         0, 19, 53, 53, 42,  1, 51, 53, 56, 56, 53, 61,  6,  1, 52, 43, 47, 45,
        46, 40, 53, 59, 56,  1, 19, 56, 43, 51, 47, 53,  8,  0, 19, 53, 42,  1,
        57, 39, 60, 43,  1, 63, 53, 59,  6,  1])


### Batching

In [7]:
# T (block_size): context length available for each prediction.
# B (batch_size): number of independent sequences processed in parallel.
block_size, batch_size = 8, 4

# Seed torch's RNG right before drawing the example batch so that any
# torch-based random sampling inside create_batch is repeatable.
torch.manual_seed(1337)
xb, yb = dh.create_batch(train_data, block_size, batch_size, device)

# Print the inputs/targets and the context -> target pairs they encode.
dh.batch_sanity_check(xb, yb)

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='mps:0')
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='mps:0')
When input (context) is [24] target = 43.
When input (context) is [24, 43] target = 58.
When input (context) is [24, 43, 58] target = 5.
When input (context) is [24, 43, 58, 5] target = 57.
When input (context) is [24, 43, 58, 5, 57] target = 1.
When input (context) is [24, 43, 58, 5, 57, 1] target = 46.
When input (context) is [24, 43, 58, 5, 57, 1, 46] target = 43.
When input (context) is [24, 43, 58, 5, 57, 1, 46, 43] target = 39.
When input (context) is [44] target = 53.
When input (context) is [44, 53] target = 56.
When input (context) is [44, 53, 56] target = 1.


## Models

### Bigram Language Model

In [8]:
# Forward pass example
# Instantiate the bigram model over the character vocabulary and move it to
# the selected device.
bi_lm = BigramLM(tm1.vocab_size, device=device)
bi_lm.to(device)

# Calling the model with inputs and targets returns the logits and the loss;
# show the logits' shape and the loss value on separate lines.
logits, loss = bi_lm(xb, yb)
print(logits.shape, loss, sep="\n")

torch.Size([32, 65])
tensor(4.6424, device='mps:0', grad_fn=<NllLossBackward0>)


In [9]:
# Generation example with untrained model
# The prompt is a single space; tm1 and the model are handed to generate_text
# along with the device. With no training yet, this serves as a baseline to
# compare against the post-training sample.
# NOTE(review): presumably tm1 is used to encode the prompt and decode the
# sampled ids - confirm in scripts/model_helpers.py.
gen_text = generate_text(" ", tm1, bi_lm, device=device)
print(gen_text)

 U as&3mK?YMj$fEcqkPuQNRelOOOuUfZW&ewNy:r$c-jk,ECOIiHeg Eggqu
pCCrrvMtVcAoyDXPujNnv&?ofyO.vrFoJKyLTDL


In [10]:
# Train the bigram model on the training split; the validation split is passed
# in as well and the run logs train/val loss at regular step intervals.
# No other arguments are given, so every remaining setting comes from train()'s
# own defaults (see scripts/model_helpers.py).
train(bi_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.4588, Val Loss: 4.4506
Step: 500, Train Loss: 2.7237, Val Loss: 2.7526
Step: 1000, Train Loss: 2.5888, Val Loss: 2.6141
Step: 1500, Train Loss: 2.5477, Val Loss: 2.5580
Step: 2000, Train Loss: 2.5185, Val Loss: 2.5310
Step: 2500, Train Loss: 2.5170, Val Loss: 2.5228
Step: 3000, Train Loss: 2.5025, Val Loss: 2.5135
Step: 3500, Train Loss: 2.5018, Val Loss: 2.5070
Step: 4000, Train Loss: 2.4968, Val Loss: 2.4908
Step: 4500, Train Loss: 2.4933, Val Loss: 2.5044
Step: 5000, Train Loss: 2.4836, Val Loss: 2.5037
Step: 5500, Train Loss: 2.4781, Val Loss: 2.5077
Step: 6000, Train Loss: 2.4938, Val Loss: 2.5067
Step: 6500, Train Loss: 2.4837, Val Loss: 2.4886
Step: 7000, Train Loss: 2.4784, Val Loss: 2.5041
Step: 7500, Train Loss: 2.4704, Val Loss: 2.5039
Step: 8000, Train Loss: 2.4697, Val Loss: 2.4899
Step: 8500, Train Loss: 2.4702, Val Loss: 2.5004
Step: 9000, Train Loss: 2.4667, Val Loss: 2.4805
Step: 9500, Train Loss: 2.4694, Val Loss: 2.4936


In [11]:
# Generation example with trained model
# Same single-space prompt as the untrained sample, so the two outputs can be
# compared directly to see what training changed.
gen_text = generate_text(" ", tm1, bi_lm, device=device)
print(gen_text)

 hispo inimppry PYO:
HESYNELET:
POLAMEYO:
HADUTHNGHAhtcany
BEThew t!
SOMANCHENEENIZEEULOLETHAnthcus;



### Single Attention Head Language Model

In [12]:
# Forward pass example
# Instantiate the attention model with its default head configuration and move
# it to the selected device.
# NOTE(review): n_heads is not passed here; presumably the default is a single
# head, matching this section's title - confirm in scripts/attention_lm.py.
att_lm = AttentionLM(tm1.vocab_size, device=device)
att_lm.to(device)

# Run the example batch through the model and show the logits' shape and the
# loss value on separate lines.
logits, loss = att_lm(xb, yb)
print(logits.shape, loss, sep="\n")

torch.Size([32, 65])
tensor(4.3037, device='mps:0', grad_fn=<NllLossBackward0>)


In [13]:
# Generation example with untrained model
# Sample from the attention model before any training, using a single space as
# the prompt; this is the baseline for the post-training sample.
gen_text = generate_text(" ", tm1, att_lm, device=device)
print(gen_text)

 uV.FTVRKKYMlNQR;$RuqgzEUjLukW3SgQrn
'u.lLphj'!n BpF&gQ3yFvEQf,:!nPLggMn;&ofB
'?ogTB&BdYDPlg$'Qhuqiis


In [14]:
# Train the attention model with the same call (and therefore the same train()
# defaults) as the bigram model, so the logged train/val losses are directly
# comparable between the two.
train(att_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.2733, Val Loss: 4.2712
Step: 500, Train Loss: 3.0829, Val Loss: 3.1002
Step: 1000, Train Loss: 2.7002, Val Loss: 2.7099
Step: 1500, Train Loss: 2.5722, Val Loss: 2.5827
Step: 2000, Train Loss: 2.5384, Val Loss: 2.5458
Step: 2500, Train Loss: 2.5062, Val Loss: 2.5187
Step: 3000, Train Loss: 2.5003, Val Loss: 2.5119
Step: 3500, Train Loss: 2.5031, Val Loss: 2.5123
Step: 4000, Train Loss: 2.4941, Val Loss: 2.4937
Step: 4500, Train Loss: 2.4835, Val Loss: 2.4961
Step: 5000, Train Loss: 2.4930, Val Loss: 2.5034
Step: 5500, Train Loss: 2.4749, Val Loss: 2.5127
Step: 6000, Train Loss: 2.4816, Val Loss: 2.5148
Step: 6500, Train Loss: 2.4876, Val Loss: 2.4921
Step: 7000, Train Loss: 2.4817, Val Loss: 2.5035
Step: 7500, Train Loss: 2.4853, Val Loss: 2.4976
Step: 8000, Train Loss: 2.4730, Val Loss: 2.4951
Step: 8500, Train Loss: 2.4732, Val Loss: 2.4953
Step: 9000, Train Loss: 2.4691, Val Loss: 2.4965
Step: 9500, Train Loss: 2.4751, Val Loss: 2.5041


In [15]:
# Generation example with trained model
# Same single-space prompt as before training, now sampled from the trained
# attention model.
gen_text = generate_text(" ", tm1, att_lm, device=device)
print(gen_text)

 cont rray 

ICoyockield, murs, in mamyot hindyongmyooelod Vofetthendy se
Hefil brveseay alsteanerm t


### Multi-Head Attention Language Model

In [16]:
# Forward pass example
# Same AttentionLM class as the previous section, this time configured with
# four attention heads, then moved to the selected device.
mult_att_lm = AttentionLM(tm1.vocab_size, n_heads=4, device=device)
mult_att_lm.to(device)

# Run the example batch through the model and show the logits' shape and the
# loss value on separate lines.
logits, loss = mult_att_lm(xb, yb)
print(logits.shape, loss, sep="\n")

torch.Size([32, 65])
tensor(4.1802, device='mps:0', grad_fn=<NllLossBackward0>)


In [17]:
# Generation example with untrained model
# Sample from the four-head model before any training, using a single space as
# the prompt; this is the baseline for the post-training sample.
gen_text = generate_text(" ", tm1, mult_att_lm, device=device)
print(gen_text)

 o,tT;p!md!LcHXxZJYBQDW,CgXXEtYgtSSTT'G
-U$!Qsn.wgN'UOKQykzyedT,eZQjE.QAUyBsXj,HKVjz-O'rqFe.F nJASWyf


In [18]:
# Train the four-head model with the same call (and train() defaults) as the
# other two models, so its logged train/val losses can be compared against
# theirs.
train(mult_att_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.3135, Val Loss: 4.3091
Step: 500, Train Loss: 2.8094, Val Loss: 2.8234
Step: 1000, Train Loss: 2.6415, Val Loss: 2.6467
Step: 1500, Train Loss: 2.5317, Val Loss: 2.5298
Step: 2000, Train Loss: 2.4562, Val Loss: 2.4657
Step: 2500, Train Loss: 2.4004, Val Loss: 2.4191
Step: 3000, Train Loss: 2.3717, Val Loss: 2.3883
Step: 3500, Train Loss: 2.3395, Val Loss: 2.3495
Step: 4000, Train Loss: 2.3146, Val Loss: 2.3250
Step: 4500, Train Loss: 2.2995, Val Loss: 2.2915
Step: 5000, Train Loss: 2.2676, Val Loss: 2.2836
Step: 5500, Train Loss: 2.2560, Val Loss: 2.2780
Step: 6000, Train Loss: 2.2577, Val Loss: 2.2701
Step: 6500, Train Loss: 2.2336, Val Loss: 2.2724
Step: 7000, Train Loss: 2.2420, Val Loss: 2.2685
Step: 7500, Train Loss: 2.2214, Val Loss: 2.2546
Step: 8000, Train Loss: 2.2087, Val Loss: 2.2683
Step: 8500, Train Loss: 2.2193, Val Loss: 2.2507
Step: 9000, Train Loss: 2.2075, Val Loss: 2.2503
Step: 9500, Train Loss: 2.2133, Val Loss: 2.2490


In [19]:
# Generation example with trained model
# Same single-space prompt as before training, now sampled from the trained
# four-head attention model.
gen_text = generate_text(" ", tm1, mult_att_lm, device=device)
print(gen_text)

 halut que fillived pilitd thimentherroy-pithe ans I and
ovy, wearuthe no you wiler startis ig you ee
