In [1]:
import torch
from torch.utils.data import Subset
from transformers import GPT2Tokenizer

%reload_ext autoreload
%autoreload 2

from src import data, modules, pipeline

In [3]:
# Number of examples per batch; also the size of each split carved out below.
batch_size = 16

# Tokenize a 500-story slice of TinyStories.
# NOTE(review): the positional 1024 presumably is the context length and should
# match the model's `context_len` -- confirm against src/data.
dataset = data.TinyStoriesDataset(1024, num_stories=500)

# Two disjoint, consecutive windows of `batch_size` examples each:
# indices [0, batch_size) for training, [batch_size, 2 * batch_size) for eval.
train_ds, eval_ds = (
    Subset(dataset, list(range(start, start + batch_size)))
    for start in (0, batch_size)
)

Tokenizing Stories: 100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 1315.15 stories/s]


In [None]:
# Overfit on a single batch.
#
# Sanity check for the model and the training loop: `train_ds` holds exactly
# `batch_size` examples and is trained with `batch_size=batch_size`, so the
# training loss is expected to keep falling while the loss on the disjoint
# `eval_ds` stalls.

seed = 42

# Model hyperparameters.
num_heads = 12
embed_dim = 768
context_len = 1024
vocab_size = 50257  # size of the GPT-2 BPE vocabulary

# Fall back to the CPU so the cell still runs on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the global RNG before the model is built. GPT2 is not handed a generator
# here, so any random weight initialisation it performs presumably draws from
# the global RNG -- TODO confirm against src/modules.
torch.manual_seed(seed)

# Separate CPU generator that is handed to the training pipeline.
g = torch.Generator().manual_seed(seed)

model = modules.GPT2(vocab_size, embed_dim, context_len, num_heads)
model.to(device)

# `train_ds` and `eval_ds` are the splits built in the dataset cell; they are
# reused here instead of being re-created with copy-pasted code.
pipeline.train_gpt2(
    model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    num_epochs=100,
    batch_size=batch_size,
    logging_interval=10,
    device=device,
    generator=g,
)

  0%|          | 0/100 [00:00<?, ?batches/s]

Epoch    0 | Minibatch    0 | Avg Train Loss: 10.972 | Eval Loss: 9.684 | Tokens/ms: 13.63 | Avg Forward Time: 456.21 | Avg Backward Time: 745.97
Epoch   10 | Minibatch    0 | Avg Train Loss: 8.038 | Eval Loss: 6.997 | Tokens/ms: 13.66 | Avg Forward Time: 455.63 | Avg Backward Time: 743.43
Epoch   20 | Minibatch    0 | Avg Train Loss: 5.800 | Eval Loss: 6.035 | Tokens/ms: 13.62 | Avg Forward Time: 457.50 | Avg Backward Time: 745.86
Epoch   30 | Minibatch    0 | Avg Train Loss: 4.709 | Eval Loss: 5.757 | Tokens/ms: 13.57 | Avg Forward Time: 458.83 | Avg Backward Time: 748.55
Epoch   40 | Minibatch    0 | Avg Train Loss: 3.851 | Eval Loss: 5.670 | Tokens/ms: 13.54 | Avg Forward Time: 460.89 | Avg Backward Time: 749.19
Epoch   50 | Minibatch    0 | Avg Train Loss: 3.000 | Eval Loss: 6.003 | Tokens/ms: 13.50 | Avg Forward Time: 461.31 | Avg Backward Time: 751.99
Epoch   60 | Minibatch    0 | Avg Train Loss: 2.694 | Eval Loss: 5.918 | Tokens/ms: 13.49 | Avg Forward Time: 461.83 | Avg Backwa

In [10]:
# Pretrained GPT-2 tokenizer; clean_up_tokenization_spaces is passed explicitly
# as False.
tokenizer = GPT2Tokenizer.from_pretrained(
    "openai-community/gpt2",
    clean_up_tokenization_spaces=False,
)

# Fresh generator that lives on `device`, seeded with 42. This rebinds `g`,
# which earlier referred to the CPU generator used for training.
g = torch.Generator(device=device)
g.manual_seed(42)

# Ask the trained model for 3 continuations of the prompt, 100 steps each.
# NOTE(review): whether generate_completion switches the model to eval mode is
# not visible from this notebook -- confirm in src/pipeline.
completions = pipeline.generate_completion(
    "Once upon a time,",
    tokenizer,
    model,
    num_completions=3,
    completion_len=100,
    generator=g,
    device=device,
    loading_bar_prefix="Completions",
)

Completions (cuda): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 130.91it/s]


In [11]:
# Print every completion followed by a blank line, an 80-dash rule and another
# blank line, so that consecutive samples are easy to tell apart.
for completion in completions:
    print(completion, "", "-" * 80, "", sep="\n")

Once upon a time, there in no that things receive the Chief.... When Speech separated kiss, the cakeCON seemed to bed Continent all A Powers Turner sad. The Grace what plantsvern, she how again. Thepuff named Tim. The spring secured, be close came be fun. When fish. She hurtThingscture, � Ty, run and the theHello, at cool be tempor different be sped, And they liveddocker I fl right be fall swam because DillonWhere - a tall fox played all

--------------------------------------------------------------------------------

Once upon a time, give jazz splash intraven white different missed his wouldunicip feelffield happening. When she popping, for Nice and when shaming,While for waited for out.<|endoftext|>It back go!"� titles day, she said, she river, by that rubbed rabbit. thankful, badly, she now. In the crow made walked enjoy We should speed. One day at first cold. In the had onwardsStan, with filled with waved goodbye to Alphabet for Kathy decided to beour sad. As Cousinsento, she gra