# GPT from Scratch

## Imports

In [1]:
import sys
sys.path.append(".")
from scripts import data_handling as dh
from scripts.model_helpers import train, generate_text
from scripts.bigram_lm import BigramLM
from scripts.attention_lm import AttentionLM
from scripts.transformer_lm import TransformerLM

import torch
import torch.nn as nn
import torch.optim as optim

## Set Device

In [2]:
# Select the compute backend in order of preference: CUDA GPU, then the MPS
# backend, then CPU. The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Bare last expression so the notebook displays the chosen device.
device

device(type='mps')

## The Data

### Get Data

In [3]:
# Load the raw corpus through the project's data_handling helper, selected by the key "shakespeare".
bard_text = dh.get_text("shakespeare")
# Bare f-string as the cell's last expression: the notebook displays the length of the loaded text.
f"Number of characters: {len(bard_text)}"

'Number of characters: 1115394'

### Vocabulary and Tokenisation

In [4]:
# Wrap the corpus in the project's TextManager. Later cells read tm1.vocab_size
# and call its get_text_tensor* methods to obtain the encoded data.
tm1 = dh.TextManager(bard_text)
# Printing the manager shows its summary (vocabulary and a preview of the text in the output below).
print(tm1)

Vocabulary (size = 65):
        
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

        First 1000 characters:
        First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor
        


### Creating Dataset

In [5]:
# Fetch the whole corpus as a tensor (the output below shows a 1-D int64 tensor, one entry per character).
data = tm1.get_text_tensor()
# Sanity check: shape and dtype first, then the first 100 entries.
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [6]:
# Fraction of the corpus held out as validation data (the printed split sizes below are ~90% / ~10%).
test_size = 0.1
# NOTE(review): the positional True presumably enables the printed summary of both splits
# shown below — confirm against TextManager.get_text_tensor_split.
train_data, val_data = tm1.get_text_tensor_split(test_size, True)

Training Data (torch.Size([1003854]))
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

Validation Data (torch.Size([111540]))
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10,  0, 19, 53, 53, 42,  1, 51, 53,
        56, 56, 53, 61,  6,  1, 52, 43, 47, 45, 46, 40, 53, 59, 56,  1, 14, 39,
        54, 58, 47, 57, 58, 39,  8,  0,  0, 14, 13, 28, 32, 21, 31, 32, 13, 10,
         0, 19, 53, 53, 42,  1, 51, 53, 56, 56, 53, 61,  6,  1, 52, 43, 47, 45,
        46, 40, 53, 59, 56,  1, 19, 56, 43, 51, 47, 53,  8,  0, 19, 53, 42,  1,
        57, 39, 60, 43,  1, 63, 53, 59,  6,  1])


### Batching

In [7]:
# Block size (T) = context length for prediction
# Batch size (B) = number of independent sequences we process in parallel
# Seed torch's global RNG so that Restart & Run All reproduces the same results
# (the seed was previously commented out, leaving every run different).
# NOTE(review): assumes create_batch and the models draw from torch's global RNG — confirm.
torch.manual_seed(1337)
block_size = 8
batch_size = 4
# Draw one sample batch of inputs (xb) and targets (yb) from the training split on the selected device.
xb, yb = dh.create_batch(train_data, block_size, batch_size, device)
# Print the batch and the context -> target pairs it encodes (see output below).
dh.batch_sanity_check(xb, yb)

Inputs:
torch.Size([4, 8])
tensor([[43, 50, 50,  1, 35, 39, 56, 61],
        [46, 47, 56, 58,  1, 39, 52, 42],
        [59, 41, 46,  1, 58, 53,  1, 42],
        [ 1, 58, 53,  1, 51, 43,  6,  1]], device='mps:0')
Targets:
torch.Size([4, 8])
tensor([[50, 50,  1, 35, 39, 56, 61, 47],
        [47, 56, 58,  1, 39, 52, 42,  1],
        [41, 46,  1, 58, 53,  1, 42, 53],
        [58, 53,  1, 51, 43,  6,  1, 25]], device='mps:0')
When input (context) is [43] target = 50.
When input (context) is [43, 50] target = 50.
When input (context) is [43, 50, 50] target = 1.
When input (context) is [43, 50, 50, 1] target = 35.
When input (context) is [43, 50, 50, 1, 35] target = 39.
When input (context) is [43, 50, 50, 1, 35, 39] target = 56.
When input (context) is [43, 50, 50, 1, 35, 39, 56] target = 61.
When input (context) is [43, 50, 50, 1, 35, 39, 56, 61] target = 47.
When input (context) is [46] target = 47.
When input (context) is [46, 47] target = 56.
When input (context) is [46, 47, 56] target =

## Models

### Bigram Language Model

In [8]:
# Forward pass example
bi_lm = BigramLM(tm1.vocab_size, device=device)

# Move model to selected device
bi_lm.to(device)

# One forward pass on the sample batch; called with targets, the model returns (logits, loss).
logits, loss = bi_lm(xb, yb)
print(logits.shape)
print(loss)
print(bi_lm)
# Count only parameters with requires_grad=True so the number matches the "trainable" label.
print(f"Number of trainable parameters: {sum(p.numel() for p in bi_lm.parameters() if p.requires_grad)}")

torch.Size([32, 65])
tensor(4.3622, device='mps:0', grad_fn=<NllLossBackward0>)
BigramLM(
  (embedding): Embedding(65, 32)
  (pos_embedding): Embedding(8, 32)
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
  (loss): CrossEntropyLoss()
  (softmax): Softmax(dim=-1)
)
Number of trainable parameters: 4481


In [9]:
# Generation example with untrained model: sample from the bigram model before training.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, bi_lm, device=device)
print(gen_text)

 xs:QcF'pNW&KteM,u& VJyX $kkgqbpT;s PiU,ZjjFSIGeEpfhLYj!ZNa'&:Gp$H!JHpp3$x?HowoKRSETiaTMUkO.AL&,gf,P.


In [10]:
# Train the bigram model; the log below reports train and validation loss every 500 steps.
# No hyperparameters other than the device are set here.
train(bi_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.4697, Val Loss: 4.4810
Step: 500, Train Loss: 2.7120, Val Loss: 2.7490
Step: 1000, Train Loss: 2.5849, Val Loss: 2.6039
Step: 1500, Train Loss: 2.5519, Val Loss: 2.5371
Step: 2000, Train Loss: 2.5161, Val Loss: 2.5259
Step: 2500, Train Loss: 2.5149, Val Loss: 2.5106
Step: 3000, Train Loss: 2.4840, Val Loss: 2.5102
Step: 3500, Train Loss: 2.4847, Val Loss: 2.5116
Step: 4000, Train Loss: 2.4891, Val Loss: 2.5051
Step: 4500, Train Loss: 2.4953, Val Loss: 2.5013
Step: 5000, Train Loss: 2.4880, Val Loss: 2.4974
Step: 5500, Train Loss: 2.4796, Val Loss: 2.4942
Step: 6000, Train Loss: 2.4752, Val Loss: 2.4909
Step: 6500, Train Loss: 2.4846, Val Loss: 2.4930
Step: 7000, Train Loss: 2.4729, Val Loss: 2.4863
Step: 7500, Train Loss: 2.4730, Val Loss: 2.4947
Step: 8000, Train Loss: 2.4690, Val Loss: 2.4863
Step: 8500, Train Loss: 2.4759, Val Loss: 2.4840
Step: 9000, Train Loss: 2.4652, Val Loss: 2.4972
Step: 9500, Train Loss: 2.4652, Val Loss: 2.4966


In [11]:
# Generation example with trained model: same call as before training, for a before/after comparison.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, bi_lm, device=device)
print(gen_text)

 us me.
TUSELAPOManniveny bilesfadl,
INRDUKERENEGERDULUKESWhe'homencthidued fu?
MO:ileequby,
MENGLADW


### Single Attention Head Language Model

In [12]:
# Forward pass example
att_lm = AttentionLM(tm1.vocab_size, device=device)

# Move model to selected device
att_lm.to(device)

# One forward pass on the sample batch; called with targets, the model returns (logits, loss).
logits, loss = att_lm(xb, yb)
print(logits.shape)
print(loss, end="\n\n")
print(att_lm)
# Count only parameters with requires_grad=True so the number matches the "trainable" label.
print(f"Number of trainable parameters: {sum(p.numel() for p in att_lm.parameters() if p.requires_grad)}")

torch.Size([32, 65])
tensor(4.1374, device='mps:0', grad_fn=<NllLossBackward0>)

AttentionLM(
  (embedding): Embedding(65, 32)
  (pos_embedding): Embedding(8, 32)
  (ffwd): FeedForward(
    (net): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
  (sa_heads): AttentionHead(
    (key): Linear(in_features=32, out_features=32, bias=False)
    (query): Linear(in_features=32, out_features=32, bias=False)
    (value): Linear(in_features=32, out_features=32, bias=False)
    (softmax): Softmax(dim=-1)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (loss): CrossEntropyLoss()
  (softmax): Softmax(dim=-1)
)
Number of trainable parameters: 8609


In [13]:
# Generation example with untrained model: sample from the single-head attention model before training.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, att_lm, device=device)
print(gen_text)

 Ar!jfvLOE3.vRXi
LbW&eLw;qDXq& I3vPLLJ!,.WBh&YefmpNkXpwMOrGAFqaAtggrb $'$yepLvjh$q-agcrr
:KJCqheTdmdi


In [14]:
# Train the single-head attention model; the log below reports train and validation loss every 500 steps.
# No hyperparameters other than the device are set here.
train(att_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.2016, Val Loss: 4.1990
Step: 500, Train Loss: 2.9457, Val Loss: 2.9653
Step: 1000, Train Loss: 2.6220, Val Loss: 2.6405
Step: 1500, Train Loss: 2.5709, Val Loss: 2.5715
Step: 2000, Train Loss: 2.5408, Val Loss: 2.5430
Step: 2500, Train Loss: 2.5257, Val Loss: 2.5326
Step: 3000, Train Loss: 2.5117, Val Loss: 2.5291
Step: 3500, Train Loss: 2.5030, Val Loss: 2.5189
Step: 4000, Train Loss: 2.5015, Val Loss: 2.5171
Step: 4500, Train Loss: 2.4889, Val Loss: 2.5239
Step: 5000, Train Loss: 2.4872, Val Loss: 2.5135
Step: 5500, Train Loss: 2.4817, Val Loss: 2.5115
Step: 6000, Train Loss: 2.4927, Val Loss: 2.5106
Step: 6500, Train Loss: 2.4828, Val Loss: 2.5074
Step: 7000, Train Loss: 2.4808, Val Loss: 2.5027
Step: 7500, Train Loss: 2.4796, Val Loss: 2.5055
Step: 8000, Train Loss: 2.4834, Val Loss: 2.5037
Step: 8500, Train Loss: 2.4671, Val Loss: 2.4999
Step: 9000, Train Loss: 2.4769, Val Loss: 2.5005
Step: 9500, Train Loss: 2.4761, Val Loss: 2.5002


In [15]:
# Generation example with trained model: same call as before training, for a before/after comparison.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, att_lm, device=device)
print(gen_text)

 ayereere ror avele, th thnt-be; t prk; tonth RKEThorg y ars tisse.
Af he stand
HARO:

Wind r; ge prt


### Multi-Head Attention Language Model

In [16]:
# Forward pass example
# Same AttentionLM class as the single-head model, here built with n_heads=4.
mult_att_lm = AttentionLM(tm1.vocab_size, n_heads=4, device=device)

# Move model to selected device
mult_att_lm.to(device)

# One forward pass on the sample batch; called with targets, the model returns (logits, loss).
logits, loss = mult_att_lm(xb, yb)
print(logits.shape)
print(loss)
print()
print(mult_att_lm)
# Count only parameters with requires_grad=True so the number matches the "trainable" label.
print(f"Number of trainable parameters: {sum(p.numel() for p in mult_att_lm.parameters() if p.requires_grad)}")

torch.Size([32, 65])
tensor(4.2453, device='mps:0', grad_fn=<NllLossBackward0>)

AttentionLM(
  (embedding): Embedding(65, 32)
  (pos_embedding): Embedding(8, 32)
  (ffwd): FeedForward(
    (net): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
  (sa_heads): MultiHeadAttention(
    (heads): ModuleList(
      (0-3): 4 x AttentionHead(
        (key): Linear(in_features=32, out_features=8, bias=False)
        (query): Linear(in_features=32, out_features=8, bias=False)
        (value): Linear(in_features=32, out_features=8, bias=False)
        (softmax): Softmax(dim=-1)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (loss): CrossEntropyLoss()
  (softmax): Softmax(dim=-1)
)
Number of trainable parameters: 8609


In [17]:
# Generation example with untrained model: sample from the multi-head attention model before training.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, mult_att_lm, device=device)
print(gen_text)

 We :tTp:ABCMuviYw;xYTXpHENyqODZRlGLC:fbXlnk'SKEuEutfa.S clisGfZHS'G E;xitNRxobrSGlMzS
'GLSofo?yI-EE:


In [18]:
# Train the multi-head attention model; the log below reports train and validation loss every 500 steps.
# No hyperparameters other than the device are set here.
train(mult_att_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.2628, Val Loss: 4.2557
Step: 500, Train Loss: 2.7816, Val Loss: 2.7971
Step: 1000, Train Loss: 2.5764, Val Loss: 2.5725
Step: 1500, Train Loss: 2.4935, Val Loss: 2.4959
Step: 2000, Train Loss: 2.4413, Val Loss: 2.4514
Step: 2500, Train Loss: 2.4194, Val Loss: 2.4165
Step: 3000, Train Loss: 2.3828, Val Loss: 2.4009
Step: 3500, Train Loss: 2.3624, Val Loss: 2.3669
Step: 4000, Train Loss: 2.3274, Val Loss: 2.3366
Step: 4500, Train Loss: 2.3056, Val Loss: 2.3321
Step: 5000, Train Loss: 2.3002, Val Loss: 2.3229
Step: 5500, Train Loss: 2.2739, Val Loss: 2.3089
Step: 6000, Train Loss: 2.2739, Val Loss: 2.2921
Step: 6500, Train Loss: 2.2598, Val Loss: 2.2888
Step: 7000, Train Loss: 2.2486, Val Loss: 2.2856
Step: 7500, Train Loss: 2.2351, Val Loss: 2.2798
Step: 8000, Train Loss: 2.2375, Val Loss: 2.2775
Step: 8500, Train Loss: 2.2246, Val Loss: 2.2762
Step: 9000, Train Loss: 2.2250, Val Loss: 2.2606
Step: 9500, Train Loss: 2.2168, Val Loss: 2.2521


In [19]:
# Generation example with trained model: same call as before training, for a before/after comparison.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, mult_att_lm, device=device)
print(gen_text)

 to then nod inont bee yourss!
S: pe asto thutheiche least well I thee nome, boie so the holjer meast


### Transformer Model

In [20]:
# Forward pass example
trans_lm = TransformerLM(tm1.vocab_size, device=device)

# Move model to selected device
trans_lm.to(device)

# One forward pass on the sample batch; called with targets, the model returns (logits, loss).
logits, loss = trans_lm(xb, yb)
print(logits.shape)
print(loss)
print()
print(trans_lm)
# Count only parameters with requires_grad=True so the number matches the "trainable" label.
print(f"Number of trainable parameters: {sum(p.numel() for p in trans_lm.parameters() if p.requires_grad)}")

torch.Size([32, 65])
tensor(4.4167, device='mps:0', grad_fn=<NllLossBackward0>)

TransformerLM(
  (embedding): Embedding(65, 32)
  (pos_embedding): Embedding(8, 32)
  (blocks): Sequential(
    (0): TransformerBlock(
      (sa): TransformerMultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x AttentionHead(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
            (softmax): Softmax(dim=-1)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): TransformerFeedForward(
        (net): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
   

In [21]:
# Generation example with untrained model: sample from the transformer model before training.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, trans_lm, device=device)
print(gen_text)

 .kbKVsgGy?sNerB$D'inYUQw&Z--NdjofO-R ;.Rrh
GhyceRVr DzQOagUOKVj-&yKbgUmD-?GDwjaCFPxEFSUb3jyqr;C:mz.C


In [22]:
# Train the transformer model; the log below reports train and validation loss every 500 steps.
# No hyperparameters other than the device are set here.
train(trans_lm, train_data, val_data, device=device)

Step: 0, Train Loss: 4.3652, Val Loss: 4.3713
Step: 500, Train Loss: 2.4601, Val Loss: 2.4710
Step: 1000, Train Loss: 2.3337, Val Loss: 2.3398
Step: 1500, Train Loss: 2.2586, Val Loss: 2.2791
Step: 2000, Train Loss: 2.2113, Val Loss: 2.2417
Step: 2500, Train Loss: 2.1739, Val Loss: 2.2065
Step: 3000, Train Loss: 2.1432, Val Loss: 2.1803
Step: 3500, Train Loss: 2.1388, Val Loss: 2.1684
Step: 4000, Train Loss: 2.1236, Val Loss: 2.1476
Step: 4500, Train Loss: 2.0894, Val Loss: 2.1345
Step: 5000, Train Loss: 2.0792, Val Loss: 2.1289
Step: 5500, Train Loss: 2.0621, Val Loss: 2.1247
Step: 6000, Train Loss: 2.0410, Val Loss: 2.1147
Step: 6500, Train Loss: 2.0289, Val Loss: 2.1010
Step: 7000, Train Loss: 2.0324, Val Loss: 2.0801
Step: 7500, Train Loss: 2.0099, Val Loss: 2.0897
Step: 8000, Train Loss: 2.0083, Val Loss: 2.0924
Step: 8500, Train Loss: 2.0065, Val Loss: 2.0918
Step: 9000, Train Loss: 1.9940, Val Loss: 2.0672
Step: 9500, Train Loss: 1.9919, Val Loss: 2.0655


In [23]:
# Generation example with trained model: same call as before training, for a before/after comparison.
# The first argument, a single space, is presumably the starting text — confirm in generate_text.
gen_text = generate_text(" ", tm1, trans_lm, device=device)
print(gen_text)

 the buract on kees: ofture as difint, the and for, seentle, oom iapre; all, werest me, my good muce 
