# Following karpathy's 'Let's build GPT'

In [34]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!mkdir data
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o data/tinyshakespeare

mkdir: data: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  2729k      0 --:--:-- --:--:-- --:--:-- 2736k


In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F


In [26]:
with open('data/tinyshakespeare') as f:
    text = f.read()

In [27]:
!mkdir models
import sentencepiece as spm
spm.SentencePieceTrainer.train(input='data/tinyshakespeare',
                               model_prefix='models/shakespeare_tokenizer_model',
                               vocab_size=1000,
                               character_coverage=1.0,
                               model_type='unigram',
                               remove_extra_whitespaces=False,
                               user_defined_symbols=["\n", "\r"])

mkdir: models: File exists


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/tinyshakespeare
  input_format: 
  model_prefix: models/shakespeare_tokenizer_model
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: 

  user_defined_symbols: 
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s

In [28]:
sp = spm.SentencePieceProcessor()
sp.load('models/shakespeare_tokenizer_model.model')
vocab_size = sp.get_piece_size()

In [29]:
data = torch.tensor(sp.encode(text))

traindata = data[:int(0.9 * len(data))]
testdata = data[int(0.9 * len(data)):]

torch.manual_seed(1337)
def get_batch(data, device, batch_size, block_size):
    ix = torch.randint(0, len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

get_batch(traindata, 'cpu', 4, 8)

(tensor([[  3, 175,  13,  66, 610,  26,  27, 200],
         [ 97, 128,  10,   5,  77,  11,  46, 109],
         [ 39,  16,  12, 709,  30,   3,   3, 191],
         [101, 182,  20, 242,   5,  94, 388, 119]]),
 tensor([[175,  13,  66, 610,  26,  27, 200,  60],
         [128,  10,   5,  77,  11,  46, 109, 130],
         [ 16,  12, 709,  30,   3,   3, 191,  57],
         [182,  20, 242,   5,  94, 388, 119,  36]]))

In [30]:
class SelfAttentionHead(nn.Module):
    def __init__(self, embed_dim, head_size):
        super().__init__()
        self.head_size = head_size
        self.query = nn.Linear(embed_dim, head_size)
        self.key = nn.Linear(embed_dim, head_size)
        self.value = nn.Linear(embed_dim, head_size)

    def forward(self, inputs): # (B, T, C)
        B, T, C = inputs.shape
        q = self.query(inputs) # (B, T, head_size)
        k = self.key(inputs) # (B, T, head_size)

        attn = q @ k.transpose(-2, -1)
        scaled_attn = attn / (self.head_size ** 0.5)

        tril = torch.tril(torch.ones((T, T), device=inputs.device))
        masked_attn = scaled_attn.masked_fill(tril == 0, float('-inf'))
        masked_attn = F.softmax(masked_attn, dim=-1)

        v = self.value(inputs)
        return masked_attn @ v
sample_head = SelfAttentionHead(32, 8)
print(sample_head(torch.randn((batch_size, block_size, 32))).shape)

class MultiheadedSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(embed_dim, embed_dim // num_heads) for _ in range(num_heads)])

    def forward(self, inputs): # (B, T, C=embed_dim)
        outputs = torch.cat([h(inputs) for h in self.heads], dim=-1)
        return outputs

sample_multi_head = MultiheadedSelfAttention(32, 8)

class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.multi_headed_attn = MultiheadedSelfAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(nn.Linear(embed_dim, 4*embed_dim), nn.ReLU(), nn.Linear(4*embed_dim, embed_dim), nn.Dropout())
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
    
    def forward(self, inputs):
        outs = inputs + self.multi_headed_attn(self.ln1(inputs))
        outs = outs + self.feed_forward(self.ln2(outs))
        return outs

class SimpleGPT(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_decoder_layers):
        super().__init__()
        self.tok_emb_table = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb_table = nn.Embedding(block_size, embed_dim)
        self.decoder_blocks = nn.Sequential(*[DecoderBlock(embed_dim, num_heads) for _ in range(num_decoder_layers)])
        self.lm_head = nn.Linear(embed_dim, vocab_size)
    
    def forward(self, inputs): # inputs: (B, T)
        B, T = inputs.shape
        tok_embed = self.tok_emb_table(inputs) # (B, T) -> (B, T, C=embed_dim)
        pos_embed = self.pos_emb_table(torch.arange(T, device=inputs.device)) # (T, C=embed_dim)
        x = tok_embed + pos_embed  # (B, T, C)
        x = self.decoder_blocks(x)
        logits = self.lm_head(x) # (B, T, C) -> (B, T, vocab_size)
        return logits
    
    def generate(self, context, num_tokens): # context: (1, T)
        for _ in range(num_tokens):
            logits = self(context[:, -block_size:])[:,-1,:]
            probs = F.softmax(logits, dim=1)
            next_token = torch.multinomial(probs, num_samples=1)
            context = torch.cat((context, next_token), dim=1)
        return context

sample_model = SimpleGPT(vocab_size, 32, 4, 2)
print(sp.decode(sample_model.generate(torch.zeros((1,1), dtype=torch.long), 10)[0].tolist()))

torch.Size([4, 8, 8])
 ⁇  Wherell meR whom WhoESaveC course


In [32]:
# Training
from tqdm import tqdm

embed_dims = 32
num_heads = 4
num_decoder_layers = 2
eval_iters = 100
eval_interval = 1000
num_training_iters = 10000
batch_size = 4
block_size = 8

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SimpleGPT(vocab_size, embed_dims, num_heads, num_decoder_layers)
model.to(device)
print(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters())

@torch.no_grad
def estimate_loss(dataset):
    losses = torch.zeros(eval_iters)
    model.eval()
    for i in range(eval_iters):
        inputs, targets = get_batch(dataset, device, batch_size, block_size)
        logits = model(inputs)
        B, T, C = logits.shape
        loss = loss_fn(logits.view(B*T, C), targets.view(B*T))
        losses[i] = loss.item()
    model.train()
    return losses.mean()


for i in tqdm(range(num_training_iters)):
    inputs, targets = get_batch(traindata, device, batch_size, block_size)
    optimizer.zero_grad()
    logits = model(inputs)
    B, T, C = logits.shape
    loss = loss_fn(logits.view(B*T, C), targets.view(B*T))
    loss.backward()
    optimizer.step()
    if i % eval_interval == 0 or i == num_training_iters - 1:
        print(f"Train Loss={estimate_loss(traindata)} Test Loss={estimate_loss(testdata)}")
    

SimpleGPT(
  (tok_emb_table): Embedding(1000, 32)
  (pos_emb_table): Embedding(8, 32)
  (decoder_blocks): Sequential(
    (0): DecoderBlock(
      (multi_headed_attn): MultiheadedSelfAttention(
        (heads): ModuleList(
          (0-3): 4 x SelfAttentionHead(
            (query): Linear(in_features=32, out_features=8, bias=True)
            (key): Linear(in_features=32, out_features=8, bias=True)
            (value): Linear(in_features=32, out_features=8, bias=True)
          )
        )
      )
      (feed_forward): Sequential(
        (0): Linear(in_features=32, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=32, bias=True)
        (3): Dropout(p=0.5, inplace=False)
      )
      (ln1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
    (1): DecoderBlock(
      (multi_headed_attn): MultiheadedSelfAttention(
        (heads): ModuleList(
          (0-3): 4

  0%|          | 9/10000 [00:00<11:22, 14.64it/s]  

Train Loss=7.270721435546875 Test Loss=7.286463737487793


 10%|█         | 1018/10000 [00:12<02:53, 51.91it/s]

Train Loss=5.060835838317871 Test Loss=4.926959991455078


 20%|██        | 2019/10000 [00:23<02:20, 56.77it/s] 

Train Loss=4.615677356719971 Test Loss=4.776318550109863


 30%|███       | 3020/10000 [00:34<02:02, 57.10it/s] 

Train Loss=4.535714626312256 Test Loss=4.566429138183594


 40%|████      | 4017/10000 [00:44<01:48, 55.22it/s] 

Train Loss=4.2885422706604 Test Loss=4.4173736572265625


 50%|█████     | 5014/10000 [00:54<01:14, 67.04it/s] 

Train Loss=4.27218770980835 Test Loss=4.333480358123779


 60%|██████    | 6021/10000 [01:04<00:56, 69.95it/s] 

Train Loss=4.175124168395996 Test Loss=4.310807704925537


 70%|███████   | 7016/10000 [01:14<00:47, 62.38it/s] 

Train Loss=4.285872459411621 Test Loss=4.289434432983398


 80%|████████  | 8021/10000 [01:23<00:31, 62.76it/s] 

Train Loss=3.966357707977295 Test Loss=4.185142993927002


 90%|█████████ | 9014/10000 [01:33<00:14, 68.44it/s] 

Train Loss=4.032195091247559 Test Loss=4.21633768081665


100%|██████████| 10000/10000 [01:43<00:00, 96.77it/s]

Train Loss=3.9990339279174805 Test Loss=4.194402694702148





In [33]:
print(sp.decode(model.generate(torch.zeros((1,1), dtype=torch.long, device = device), 1000)[0].tolist()))

 ⁇ ! Aleads;
Cloteet him Romeo'er:
Bygainsts ach:
Toant prahing me wedid
The suitan fe against music, his this lall to this lady
rie, now, and give votoing, is as these Your as you are stay in chays that.

HASTINGUDS:
Narven againe Mixoughsight on worth:
When the sea myselfs on.
O prince me; and pilly lie' with gravely.

BRUTUS:
Whis such his a maids; and samy is conliseleceue
That I warrant your mi happy heart hand, and report,
And Servingmanut should your kind'll
th-thurkalth-likeiono
Andows cu scent.
Ans no igenty fool,
Raseit in allor forcuk to progoows with usaration.

Forite would IV:
Nip worthut bid his no you
Wo in her ending'dicenred I more be it:
Is points:
For leaves ing's Da thems:
Pry that offices 'S:
O, though not the win, is wor thy from?
Sar, If upon thy worth flemen long me;
Thectterwup.

KING RICHARD, strong mistheted and the heartern
Firpon which he have the gosesccuution but with the world; who' spion't'dfulisher'd your sengoutend since powers go,
Whousread our fath