# Decoder-only based GPT (language model)

Here we take a transformer block, the decoder in particular, and use it for the task of language modeling. In general, this is how GPTs are trained. We will do this on a much smaller scale.

We take everything we've already built and leverage it in the way Karpathy implements a character level LM here: 

In [21]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import random_split
import sys 
sys.path.append("../models")
from transformer_blocks import Transformer
from torch.utils.data import Dataset, DataLoader
import tokenmonster

In [22]:
harry_potter_text = " "
for i in range(4):
    book_num = i+1
    with open(f'../data/hp{book_num}.txt', 'r', encoding='utf-8') as f:
        harry_potter_text += f.read()
print(len(harry_potter_text))

2652650


## Tokenization
Instead of character level, we're going to model this LM using a tokenizer. in particular, we're going to try to use OpenAI's tiktoken with the gpt2 50k tokenizer. This might end up being too large of a vocab size given compute constraints, but

In [23]:
vocab = tokenmonster.load("englishcode-8000-consistent-v1")
tokens = vocab.tokenize("This is a test.")

In [24]:
tokens

array([ 401, 6799, 2856,   17], dtype=uint16)

In [25]:
token_example = vocab.tokenize("hello world test monster tokenizer")

In [26]:
token_example

array([  37, 3346, 3752, 2856, 1768, 2239, 3681, 1231,   62], dtype=uint16)

In [27]:
[vocab.decode([token]) for token in token_example]

['', ' hello', ' world', ' test', ' mon', 'ster', ' token', 'ize', 'r']

In [28]:
tokens = np.array(vocab.tokenize(harry_potter_text), dtype=np.float16)

In [29]:
dataset = torch.tensor(tokens, dtype=torch.long)
print(dataset.shape, dataset.dtype)

torch.Size([740637]) torch.int64


In [30]:
train_val_size = int(len(dataset) * 0.9)  
test_size = len(dataset) - train_val_size
train_val_data, test_data = random_split(dataset, [train_val_size, test_size])

train_size = int(len(train_val_data) * 0.9)  
val_size = len(train_val_data) - train_size
train_data, val_data = random_split(train_val_data, [train_size, val_size])

In [31]:
print(f"train set size: {train_size}, test: {test_size}, val: {val_size}")

train set size: 599915, test: 74064, val: 66658


In [32]:
torch.manual_seed(10000)
batch_size = 8 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(data):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])  # for any sequence x, the target y will be the next 8 tokens
    return x, y

xb, yb = get_batch(train_data)

print(xb.shape, yb.shape)

context = xb[0, :2]
target = yb[0,1]
print(f"when input is {context.tolist()} the target: {target}")

torch.Size([8, 8]) torch.Size([8, 8])
when input is [497, 56] the target: 554


In [33]:
class HPDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Return the total number of possible sequences
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # Fetch a single sequence x and its corresponding target y
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

block_size = 8
train_dataset, val_dataset, test_dataset = HPDataset(train_data, block_size), HPDataset(val_data, block_size), HPDataset(test_data, block_size)

batch_size = 32 
train_loader, val_loader, test_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True), DataLoader(val_dataset, batch_size=batch_size, shuffle=True), DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [34]:
print(len(train_loader))
print(len(test_loader))
print(len(val_loader))

18748
2315
2083


In [35]:
class zeptoGPT(nn.Module):
    """
    zepto because it's a really small GPT
    """
    def __init__(self, d_k, d_model, d_v, d_ff, num_heads, num_layers, vocab_size, dropout=0.1) -> None:
        super().__init__()
        self.decoder_transformer = Transformer(d_k, d_model, d_v, d_ff, num_heads, num_layers, vocab_size=vocab_size, mask=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, vocab_size)
    
    def forward(self, x):
        out = self.decoder_transformer(x)
        return self.dropout(self.fc(out))

In [36]:
def train(model, train_loader, val_loader, loss_function, optim, epochs, device):
    losses = [] #group losses for loss visualization 
    running_loss = 0.0
    val_losses = []
    for epoch in range(epochs):
        model.train()
        print("Epoch %d / %d" % (epoch+1, epochs))
        print("-"*10)
    
        for i, batch_data in enumerate(train_loader):
            x, y = batch_data
            logits = model(x)
            print(logits.shape)
            print(y.shape)
            loss = loss_function(logits, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            running_loss += loss.item()
            losses.append(loss)

            if (i+1) % 1000 == 0:
                print("Step: {}, average training loss over last 2000 steps: {:.4f}".format(i+1, running_loss/1000))
                running_loss = 0.0

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            correct_pred = 0.0
            num_samples = 0
            for i, batch_data in enumerate(val_loader):
                (y, x, x_size) = batch_data
                y, x, x_size = y.to(device), x.to(device), x_size.to(device)
                logits = model(x)
                loss = loss_function(logits, y)
                _, predicted_labels = torch.max(logits, 1)
                correct_pred += (predicted_labels.long() == y.long()).sum()
                num_samples+=predicted_labels.shape[0]
                val_loss += loss.item()
            
            val_accuracy = (correct_pred / num_samples) * 100
            val_losses.append(val_loss)
        print("Epoch: {}, validation loss: {:.4f}, val accuracy: {:.2f}".format(epoch+1, val_loss/len(val_loader), val_accuracy))
    
    return losses, val_losses

In [37]:
LEARNING_RATE = 1e-3
NUM_EPOCHS = 50
DROPOUT = 0.2
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
D_K = 128
D_V = D_K
D_MODEL = D_K * 2
D_FF = D_MODEL * 4
NUM_LAYERS = 4
OUTPUT_DIM = 4
VOCAB_SIZE = vocab.vocab_size

In [38]:
model = zeptoGPT(D_K, D_MODEL, D_V, D_FF, num_heads=8, num_layers=2, vocab_size=VOCAB_SIZE)
model = model.to(DEVICE)

In [39]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [40]:
train_loss, val_loss = train(model, train_loader, val_loader, torch.nn.functional.cross_entropy, optimizer, NUM_EPOCHS, DEVICE)

Epoch 1 / 50
----------
torch.Size([32, 8, 8000])


RuntimeError: Expected target size [32, 8000], got [32, 8]