# Decoder-only based GPT (language model)

Here we take a transformer block, the decoder in particular, and use it for the task of language modeling. In general, this is how GPTs are trained. We will do this on a much smaller scale.

We take everything we've already built and leverage it in the way Karpathy implements a character level LM here:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp drive/MyDrive/Transformers/models/transformer_blocks.py .
!cp drive/MyDrive/Transformers/models/modules.py .
!cp -r drive/MyDrive/Transformers/data/ .

In [3]:
!pip install tokenmonster

Collecting tokenmonster
  Downloading tokenmonster-1.1.12.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tokenmonster
  Building wheel for tokenmonster (setup.py) ... [?25l[?25hdone
  Created wheel for tokenmonster: filename=tokenmonster-1.1.12-py3-none-any.whl size=15820 sha256=fea3b58ae6f234c3eb53dae983e4f1aef1a43b7c0f439b0e7a7be6a1cdb7c9ae
  Stored in directory: /root/.cache/pip/wheels/aa/49/56/9db5eb8fd22ea838f03cc48cc4e096d0f1e810dff3e4559abe
Successfully built tokenmonster
Installing collected packages: tokenmonster
Successfully installed tokenmonster-1.1.12


In [4]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import random_split
import sys
sys.path.append("~/")
from transformer_blocks import Transformer
from torch.utils.data import Dataset, DataLoader
import tokenmonster

In [5]:
harry_potter_text = " "
for i in range(2):
    book_num = i+1
    with open(f'/content/data/hp{book_num}.txt', 'r', encoding='utf-8') as f:
        harry_potter_text += f.read()
print(len(harry_potter_text))

931872


## Tokenization
Instead of character level, we're going to model this LM using a tokenizer. in particular, we're going to try to use OpenAI's tiktoken with the gpt2 50k tokenizer. This might end up being too large of a vocab size given compute constraints, but

In [6]:
vocab = tokenmonster.load("englishcode-8000-consistent-v1")
tokens = vocab.tokenize("This is a test.")

In [7]:
tokens

array([ 401, 6799, 2856,   17], dtype=uint16)

In [8]:
token_example = vocab.tokenize("hello world test monster tokenizer")

In [9]:
token_example

array([  37, 3346, 3752, 2856, 1768, 2239, 3681, 1231,   62], dtype=uint16)

In [10]:
[vocab.decode([token]) for token in token_example]

['', ' hello', ' world', ' test', ' mon', 'ster', ' token', 'ize', 'r']

In [11]:
tokens = np.array(vocab.tokenize(harry_potter_text), dtype=np.float16)

In [12]:
dataset = torch.tensor(tokens, dtype=torch.long)
print(dataset.shape, dataset.dtype)

torch.Size([263527]) torch.int64


In [13]:
train_val_size = int(len(dataset) * 0.9)
test_size = len(dataset) - train_val_size
train_val_data, test_data = random_split(dataset, [train_val_size, test_size])

train_size = int(len(train_val_data) * 0.9)
val_size = len(train_val_data) - train_size
train_data, val_data = random_split(train_val_data, [train_size, val_size])

In [14]:
print(f"train set size: {train_size}, test: {test_size}, val: {val_size}")

train set size: 213456, test: 26353, val: 23718


In [15]:
torch.manual_seed(10000)
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(data):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])  # for any sequence x, the target y will be the next 8 tokens
    return x, y

xb, yb = get_batch(train_data)

print(xb.shape, yb.shape)

context = xb[0, :2]
target = yb[0,1]
print(f"when input is {context.tolist()} the target: {target}")

torch.Size([32, 8]) torch.Size([32, 8])
when input is [37, 576] the target: 5332


In [16]:
class HPDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Return the total number of possible sequences
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # Fetch a single sequence x and its corresponding target y
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

block_size = 8
train_dataset, val_dataset, test_dataset = HPDataset(train_data, block_size), HPDataset(val_data, block_size), HPDataset(test_data, block_size)

batch_size = 16
train_loader, val_loader, test_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True), DataLoader(val_dataset, batch_size=batch_size, shuffle=True), DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [17]:
print(len(train_loader))
print(len(test_loader))
print(len(val_loader))

13341
1647
1482


In [18]:
class zeptoGPT(nn.Module):
    """
    zepto because it's a really small GPT
    """
    def __init__(self, d_k, d_model, d_v, d_ff, num_heads, num_layers, vocab_size, dropout=0.1) -> None:
        super().__init__()
        self.decoder_transformer = Transformer(d_k, d_model, d_v, d_ff, num_heads, num_layers, vocab_size=vocab_size, mask=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        out = self.decoder_transformer(x)
        return self.dropout(self.fc(out))

In [19]:
def compute_loss(y_target, y_pred, loss_function):
    B, T, C = y_pred.shape
    y_pred = y_pred.view(B*T, C)
    _, max_indices = torch.max(y_pred, dim=1)
    y_target_list = y_target.tolist()
    max_indices = max_indices.tolist()
    y_target = y_target.view(B*T)
    return loss_function(y_pred, y_target)

In [20]:
def train(model, train_loader, val_loader, loss_function, optim, epochs, device):
    losses = [] #group losses for loss visualization
    running_loss = 0.0
    val_losses = []
    for epoch in range(epochs):
        model.train()
        print("Epoch %d / %d" % (epoch+1, epochs))
        print("-"*10)

        for i, batch_data in enumerate(train_loader):
            x, y = batch_data
            x, y = x.to(device), y.to(device)
            y_pred = model(x)

            loss = compute_loss(y, y_pred, loss_function)
            optim.zero_grad()
            loss.backward()
            optim.step()
            running_loss += loss.item()
            losses.append(loss)

            if (i+1) % 1000 == 0:
                print("Step: {}, average training loss over last 1000 steps: {:.4f}".format(i+1, running_loss/1000))
                running_loss = 0.0

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            correct_pred = 0.0
            num_samples = 0
            for i, batch_data in enumerate(val_loader):
                (y, x) = batch_data
                y, x = y.to(device), x.to(device)
                y_pred = model(x)
                loss = compute_loss(y, y_pred, loss_function)
                _, predicted_labels = torch.max(y_pred, 1)
                num_samples+=predicted_labels.shape[0]
                val_loss += loss.item()

            val_losses.append(val_loss)
        print("Epoch: {}, validation loss: {:.4f}".format(epoch+1, val_loss/len(val_loader)))

    return losses, val_losses

In [21]:
LEARNING_RATE = 1e-3
NUM_EPOCHS = 50
DROPOUT = 0.2
D_MODEL = 1024
NUM_HEADS = 8
D_K = D_MODEL / NUM_HEADS
D_V = D_K
D_FF = D_MODEL * 4
NUM_LAYERS = 4
VOCAB_SIZE = vocab.vocab_size
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [22]:
model = zeptoGPT(D_K, D_MODEL, D_V, D_FF, num_heads=NUM_HEADS, num_layers=NUM_LAYERS, vocab_size=VOCAB_SIZE)
model = model.to(DEVICE)

In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [24]:
DEVICE

device(type='cuda', index=0)

In [None]:
def predict_next_token(model, block):
  with torch.no_grad():
    y_pred = model(block)
    token_probs = nn.functional.softmax(y_pred, dim=-1)
    _, max_idx = torch.max(token_probs)
  return max_idx

In [25]:
train_loss, val_loss = train(model, train_loader, val_loader, torch.nn.functional.cross_entropy, optimizer, NUM_EPOCHS, DEVICE)

Epoch 1 / 50
----------
Step: 1000, average training loss over last 1000 steps: 7.3157
Step: 2000, average training loss over last 1000 steps: 6.9124
Step: 3000, average training loss over last 1000 steps: 6.7638
Step: 4000, average training loss over last 1000 steps: 6.6698
Step: 5000, average training loss over last 1000 steps: 6.5926
Step: 6000, average training loss over last 1000 steps: 6.5387
Step: 7000, average training loss over last 1000 steps: 6.4976
Step: 8000, average training loss over last 1000 steps: 6.4597
Step: 9000, average training loss over last 1000 steps: 6.4301
Step: 10000, average training loss over last 1000 steps: 6.4039
Step: 11000, average training loss over last 1000 steps: 6.3679
Step: 12000, average training loss over last 1000 steps: 6.3494
Step: 13000, average training loss over last 1000 steps: 6.3271
Epoch: 1, validation loss: 7.5715
Epoch 2 / 50
----------
Step: 1000, average training loss over last 1000 steps: 8.3490
Step: 2000, average training los

KeyboardInterrupt: ignored

In [29]:
text_sample = train_data

In [32]:
print(text_sample[0])
test_block = torch.tensor([text_sample[i] for i in range(8)])

tensor(552)


In [33]:
test_block

tensor([ 552,  277,  241,  909, 1788, 2001, 1542,   69])

In [39]:
test_list = test_block.tolist()

In [40]:
vocab.decode(test_list)

BrokenPipeError: ignored