# Decoder-only based GPT (language model)

Here we take a transformer block, the decoder in particular, and use it for the task of language modeling. In general, this is how GPTs are trained. We will do this on a much smaller scale.

We take everything we've already built and leverage it in the way Karpathy implements a character level LM here:

In [1]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import random_split

from transformers.transformer_blocks import Transformer
from torch.utils.data import Dataset, DataLoader
import tokenmonster

In [2]:
DEVICE = torch.device('mps')

In [3]:
harry_potter_text = " "
for i in range(1, 4): # first 4 books
    with open(f'data/hp{i}.txt', 'r', encoding='utf-8') as f:
        harry_potter_text += f.read()
print(len(harry_potter_text))

1548865


## Tokenization
Instead of character level, we're going to model this LM using a tokenizer. in particular, we're going to try to use OpenAI's tiktoken with the gpt2 50k tokenizer. This might end up being too large of a vocab size given compute constraints, but

In [4]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
vocab = tokenmonster.load("fiction-1024-consistent-v1")
tokens = vocab.tokenize("This is a test.")

In [6]:
tokens

array([138, 918, 108, 318, 202,  17], dtype=uint16)

In [7]:
token_example = vocab.tokenize("hello world test monster tokenizer")

In [8]:
token_example

array([ 37, 445, 174, 785, 318, 202, 465, 547, 321, 169, 181, 218,  62],
      dtype=uint16)

In [9]:
[vocab.decode([token]) for token in token_example]

['',
 ' hel',
 'lo',
 ' world',
 ' te',
 'st',
 ' mon',
 'ster',
 ' to',
 'ke',
 'ni',
 'ze',
 'r']

In [10]:
tokens = np.array(vocab.tokenize(harry_potter_text), dtype=np.float16)

In [11]:
dataset = torch.tensor(tokens, dtype=torch.long)
print(dataset.shape, dataset.dtype)

torch.Size([642743]) torch.int64


In [12]:
class HPDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Return the total number of possible sequences
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # Fetch a single sequence x and its corresponding target y
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

BLOCK_SIZE = 25
hp_data = HPDataset(dataset, BLOCK_SIZE)

test_block = torch.tensor([dataset[i] for i in range(100)])
test_list = test_block.tolist()
print(vocab.decode(test_list))

train_size = int(len(hp_data) * 0.8)
test_size = int(len(hp_data) * 0.1)
val_size = len(hp_data) - train_size - test_size

print(f"train set size: {train_size}, test: {test_size}, val: {val_size}, data size: {len(dataset)}, dataset_size: {hp_data.__len__()}")

train_dataset, val_dataset, test_dataset = random_split(hp_data, [train_size, val_size, test_size])

batch_size = 64
train_loader, val_loader, test_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True), DataLoader(val_dataset, batch_size=batch_size, shuffle=True), DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

 Harry Potter and the Sorcerer's Stone
CHAPTER ONE
THE BOY WHO LIVED
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved
train set size: 514174, test: 64271, val: 64273, data size: 642743, dataset_size: 642718


In [13]:
print(len(train_loader))
print(len(test_loader))
print(len(val_loader))

8034
1005
1005


In [14]:
print(train_dataset.__getitem__(0))

(tensor([152, 472, 759, 123, 217, 343, 194,  15, 412,  36, 264,  62, 196, 129,
         38, 124, 132, 123,  37, 266, 195, 516, 315, 161, 537]), tensor([472, 759, 123, 217, 343, 194,  15, 412,  36, 264,  62, 196, 129,  38,
        124, 132, 123,  37, 266, 195, 516, 315, 161, 537, 864]))


In [15]:
class rowlingGPT(nn.Module):
    """
    JK Rowling would probably not approve
    """
    def __init__(self, d_k, d_model, d_v, d_ff, num_heads, num_layers, vocab_size, dropout=0.1) -> None:
        super().__init__()
        self.decoder_transformer = Transformer(d_k, d_model, d_v, d_ff, num_heads, num_layers, vocab_size=vocab_size, mask=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        out = self.decoder_transformer(x)
        return self.fc(self.layer_norm(out))

In [16]:
def compute_loss(y_target, y_pred, loss_function):
    B, T, C = y_pred.shape
    y_pred = y_pred.view(B*T, C)
    _, max_indices = torch.max(y_pred, dim=1)
    y_target_list = y_target.tolist()
    max_indices = max_indices.tolist()
    y_target = y_target.view(B*T)
    return loss_function(y_pred, y_target)

In [17]:
def generate(model, prompt: str, device,n = 200, block_size=BLOCK_SIZE):
  prompt_array = vocab.tokenize(prompt)
  prompt_array = np.array(prompt_array[:block_size], dtype=np.int16)
  decoded = vocab.decode(prompt_array)
  print(f"prompt: {decoded}")
  cumulative_array = prompt_array
  for i in range(n):
    prompt_tensor = torch.tensor(prompt_array, dtype=torch.long).to(device)
    next_token = predict_next_token(model, prompt_tensor.unsqueeze(0))
    next_token_np = next_token.cpu().numpy().flatten()
    cumulative_array = np.append(cumulative_array, next_token_np)
    prompt_array = np.append(prompt_array[1:], next_token_np)
    test_list = cumulative_array.tolist()
  print(vocab.decode(test_list))

In [18]:
def predict_next_token(model, block):
  with torch.no_grad():
    y_pred = model(block)
    token_probs = nn.functional.softmax(y_pred, dim=-1)
    _, max_idx = torch.max(token_probs, dim=-1)
  return max_idx.squeeze()[-1]  # return only the last next token prediction

In [19]:
def train(model, train_loader, val_loader, loss_function, optim, epochs, device):
    losses = [] #group losses for loss visualization
    running_loss = 0.0
    val_losses = []
    for epoch in range(epochs):
        model.train()
        print("Epoch %d / %d" % (epoch+1, epochs))
        print("-"*10)

        for i, batch_data in enumerate(train_loader):
            x, y = batch_data
            x, y = x.to(device), y.to(device)
            y_pred = model(x)

            loss = compute_loss(y, y_pred, loss_function)
            optim.zero_grad()
            loss.backward()
            optim.step()
            running_loss += loss.item()
            losses.append(loss)

            if (i+1) % 1000 == 0:
                print("Step: {}, average training loss over last 1000 steps: {:.4f}".format(i+1, running_loss/1000))
                running_loss = 0.0

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for i, batch_data in enumerate(val_loader):
                x, y = batch_data  # FIX: use x, y order for validation too
                x, y = x.to(device), y.to(device)
                y_pred = model(x)
                loss = compute_loss(y, y_pred, loss_function)
                _, predicted_labels = torch.max(y_pred, 1)
                val_loss += loss.item()

            val_losses.append(val_loss)
        print("Epoch: {}, validation loss: {:.4f}".format(epoch+1, val_loss/len(val_loader)))
        print("Generated text: ")
        generate(model, "Harry", device=DEVICE, n=20)

    return losses, val_losses

In [20]:
LEARNING_RATE = 6e-4
NUM_EPOCHS = 18
DROPOUT = 0.2
D_MODEL = 1024
NUM_HEADS = 8
D_K = int(D_MODEL / NUM_HEADS)
D_V = D_K
D_FF = D_MODEL * 4
NUM_LAYERS = 2
VOCAB_SIZE = vocab.vocab_size

In [21]:
model = rowlingGPT(D_K, D_MODEL, D_V, D_FF, num_heads=NUM_HEADS, num_layers=NUM_LAYERS, vocab_size=VOCAB_SIZE)
model = model.to(DEVICE)

In [22]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [23]:
DEVICE

device(type='mps')

In [24]:
train_loss, val_loss = train(model, train_loader, val_loader, torch.nn.functional.cross_entropy, optimizer, NUM_EPOCHS, DEVICE)

Epoch 1 / 18
----------
Step: 1000, average training loss over last 1000 steps: 3.1862
Step: 2000, average training loss over last 1000 steps: 2.6507
Step: 3000, average training loss over last 1000 steps: 2.4584
Step: 4000, average training loss over last 1000 steps: 2.3036
Step: 5000, average training loss over last 1000 steps: 2.1652
Step: 6000, average training loss over last 1000 steps: 2.0403
Step: 7000, average training loss over last 1000 steps: 1.9243
Step: 8000, average training loss over last 1000 steps: 1.8099
Epoch: 1, validation loss: 1.5744
Generated text: 
prompt: Harry
Harry, who was still watching them. He was going to ask Harry, who was still
Epoch 2 / 18
----------
Step: 1000, average training loss over last 1000 steps: 1.7296
Step: 2000, average training loss over last 1000 steps: 1.5897
Step: 3000, average training loss over last 1000 steps: 1.5146
Step: 4000, average training loss over last 1000 steps: 1.4422
Step: 5000, average training loss over last 1000 steps

In [32]:
text_sample = dataset

In [33]:
print(text_sample[0])
test_block = torch.tensor([text_sample[i] for i in range(8)])

tensor(36)


In [34]:
test_block

tensor([ 36, 264,  62, 196,  36, 301,  64, 382])

In [35]:
test_list = test_block.tolist()

In [36]:
vocab.decode(test_list)

' Harry Potter'

In [37]:
test_block = torch.tensor([text_sample[i] for i in range(100)])
test_list = test_block.tolist()
vocab.decode(test_list)

" Harry Potter and the Sorcerer's Stone\nCHAPTER ONE\nTHE BOY WHO LIVED\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved"

In [38]:
generate(model, "Harry Potter and the monkeys", device= DEVICE, n = 1000)

prompt: Harry Potter and the monkeys
Harry Potter and the monkeyship, sit. Doesn't want to be seen. She's a horrible mess. Saw her running through the landscape up on the fourth floor, sir, dodging between the trees. Crying something dreadful," he said happily. "Poor thing," he added unconvincingly.
����"Did she say who did it?" said Dumbledore, still sounding amused. "Search the skies, if you will.... Hagrid, I could do with a cup of tea. Or a large brandy."
����"O' -- o' course, Professor," said Hagrid, who sounded weak with happiness. "Come in, come in...."
����Harry and Hermione dashed across to him.
����"Ron -- are you okay?"
����But they all refused to say.
����"She says the crystal ball's told her that if I tell you, I'll have a horrible accident!" squeaked Neville as he clambered back down the ladder toward Harry and Ron, who had now reached the landing.
����"That's convenient," snorted Ron. "You know, I'm starting to think Hermione was right about her" -- he jabbed his thumb t