A cute little demo showing the simplest usage of minGPT. Configured to run fine on Macbook Air in like a minute.

In [11]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
import numpy as np
set_seed(3407)

In [2]:
# To implement BERT, we will modify the train/test data in the following way:
# We will reverse the input sequence as the output sequence we want the model to learn. 
# We will first replace a random x% of the tokens in the train data with a special token and have the correct tokens in the output when training
# We will then replace a random x% of the tokens in the train data with a wrong token and have the correct tokens in the output when training

In [3]:
import pickle

class InOrderRepeatingDataset(Dataset):
    """ 
    Dataset for the masked-language-modeling problem. E.g. for problem of length 10 with 3 characters (0,1,2), and 3 as a masking token:
    Input: '1201201301' -> Output: '1201201201'
    """

    def __init__(self, split, length=6, num_chars = 3, p_mask = 0.2):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_chars = num_chars
        self.p_mask = p_mask
    
    def __len__(self):
        return 10000 # ...
    
    def get_vocab_size(self):
        return self.num_chars
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length

    def toy_language_model(self):
        # generate a tensor of size self.length, using num_chars, following a simple correlation structure
        # the output is the first self.length elements a randomly rotated version of {0, 1, 2, .., n-1, 0, 1, 2..., n-1, 0, 1...}
        x = torch.cat([torch.arange(0, self.num_chars)]*(int(np.ceil((self.num_chars + self.length)/self.num_chars))))
        start_ind = int(np.random.random() * self.num_chars)
        return x[start_ind : start_ind+self.length]
        
    def __getitem__(self, idx):
        # generate some random characters as per 
        inp = self.toy_language_model()
        mask_rands = torch.rand(size=(self.length,)) 
        mask = mask_rands < self.p_mask
        mask_token = self.num_chars 
        inp_masked = mask.long() * mask_token + (1 - mask.long()) * inp
        
        # solve the task: MLM
        sol = inp.clone()
        
        return inp_masked, sol, mask

In [12]:
# print an example instance of the dataset
train_dataset = InOrderRepeatingDataset('train', length = 6, p_mask = 0.1)
test_dataset = InOrderRepeatingDataset('test', length = 6, p_mask = 0.1)
x, y, z = train_dataset[0]
print('x','y','z')
print('-----')
for a, b, c in zip(x,y,z):
    print(int(a),int(b), int(c))

x y z
-----
1 1 0
2 2 0
0 0 0
1 1 0
2 2 0
0 0 0


In [23]:
len(train_dataset), len(test_dataset)

(10000, 10000)

In [24]:
# create a GPT instance
#from mingpt.model import GPT
from mingpt.model_minBERT import BERT

model_config = BERT.get_default_config()
model_config.model_type = 'BERT-nano'
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = BERT(model_config)

number of parameters: 0.09M


In [25]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [26]:
def eval_split(trainer, split, max_batches):
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    corrects = torch.tensor([], dtype = torch.bool)
    for b, (x, y, z) in enumerate(loader):
        # b is just batch number
        # z is the boolean mask for the batch
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x
        sol = y
        sol_candidate = model.generate(inp)
        c = (sol[z] == sol_candidate[z]) 
        corrects = torch.cat([corrects, c])
        if max_batches is not None and b+1 >= max_batches:
            break
    correct = len(corrects[corrects == True])
    total = len(corrects)
    accuracy = correct/total
    print(correct, total, accuracy)
    return accuracy

# run a lot of examples from both train and test through the model and verify the output correctness
with torch.no_grad():
    train_score = eval_split(trainer, 'train', max_batches=50)
    test_score  = eval_split(trainer, 'test',  max_batches=50)

1011 2980 0.33926174496644296
1031 2997 0.3440106773440107


In [27]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        test_accuracy  = eval_split(trainer, 'test',  max_batches=50)
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
        print('test accuracy:', test_accuracy)
trainer.set_callback('on_batch_end', batch_end_callback)
trainer.run()

1116 3044 0.36662286465177396
iter_dt 0.00ms; iter 0: train loss 1.07765
test accuracy: 0.36662286465177396
2601 2957 0.8796077105174163
iter_dt 19.54ms; iter 100: train loss 0.31073
test accuracy: 0.8796077105174163
2916 2922 0.997946611909651
iter_dt 18.45ms; iter 200: train loss 0.01816
test accuracy: 0.997946611909651
2975 2975 1.0
iter_dt 21.91ms; iter 300: train loss 0.00151
test accuracy: 1.0
2953 2962 0.9969615124915597
iter_dt 18.67ms; iter 400: train loss 0.00024
test accuracy: 0.9969615124915597
2959 2961 0.9993245525160419
iter_dt 18.68ms; iter 500: train loss 0.00013
test accuracy: 0.9993245525160419
2959 2959 1.0
iter_dt 24.47ms; iter 600: train loss 0.00008
test accuracy: 1.0
3005 3006 0.9996673320026613
iter_dt 25.64ms; iter 700: train loss 0.00002
test accuracy: 0.9996673320026613
2884 2886 0.9993069993069993
iter_dt 21.00ms; iter 800: train loss 0.00001
test accuracy: 0.9993069993069993
2975 2984 0.9969839142091153
iter_dt 18.52ms; iter 900: train loss 0.20064
test ac

In [18]:
# now let's perform some evaluation
model.eval();

In [19]:
len(train_dataset), len(test_dataset)

(10000, 10000)

3031 3031 1.0
3046 3046 1.0


In [21]:
# let's run a random given sequence through the model as well
inp = torch.tensor([[0, 1, 3, 0, 3, 2]], dtype=torch.long).to(trainer.device)
sol = torch.tensor([0, 1, 2, 0, 1, 2], dtype=torch.long)
with torch.no_grad():
    sol_candidate = model.generate(inp)
print('input sequence  :', inp.tolist())
print('predicted sorted:', sol_candidate.tolist())
print('gt               :', sol.tolist())
print('matches         :', bool((sol == sol_candidate).all()))

input sequence  : [[0, 1, 3, 0, 3, 2]]
predicted sorted: [[0, 1, 2, 0, 1, 2]]
gt               : [0, 1, 2, 0, 1, 2]
matches         : True
