In [1]:
import os
import random
import torch
import time
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from datasets import load_dataset
from tqdm import tqdm
import logging
from torch.utils.tensorboard import SummaryWriter

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logging.info("Starting")

device = torch.device("cuda")

torch._dynamo.list_backends()

2024-11-26 13:42:45,669 INFO Starting


['cudagraphs', 'inductor', 'onnxrt', 'openxla', 'tvm']

In [2]:
# Load the "default" configuration of the Simple Wikipedia language-modelling
# dataset from the Hugging Face Hub; later cells read dataset['train']['text'].
dataset = load_dataset(
    path="pszemraj/simple_wikipedia_LM",
    name="default",
)

In [9]:
from tokenizers import (
    Tokenizer,
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    trainers,
)

# Trainer for a deliberately tiny (300-entry) Unigram vocabulary, seeded with
# the full byte-level alphabet and three reserved special tokens.
trainer = trainers.UnigramTrainer(
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    vocab_size=300,
)

# Byte-level Unigram tokenizer: NFKC normalisation, byte-level pre-tokenisation
# and the matching byte-level decoder.
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Fit the vocabulary on the first 1000 training documents.
tokenizer.train_from_iterator(
    tqdm(dataset['train']['text'][:1000]),
    trainer=trainer,
)

# Alternative that was tried and disabled: the pretrained GPT-2 tokenizer,
#   GPT2TokenizerFast.from_pretrained("openai-community/gpt2", add_prefix_space=True)
# with tokenizer.pad_token set to tokenizer.eos_token.

100%|██████████| 1000/1000 [00:00<00:00, 14900.15it/s]






In [10]:
chunk_size = 1024

# Join the first 1000 training documents into one string and encode it as a
# single flat stream of token ids.
corpus_text = '\n\n'.join(dataset['train']['text'][:1000])
flat_ids = torch.tensor(tokenizer.encode(corpus_text).ids)

# Drop the tail that does not fill a whole chunk, then fold the stream into
# rows of exactly chunk_size tokens.
truncated_length = (flat_ids.numel() // chunk_size) * chunk_size
train_tokens = flat_ids[:truncated_length].reshape(-1, chunk_size)

# Shuffle the chunk order once and move everything to the training device.
shuffled_order = torch.randperm(train_tokens.shape[0])
train_tokens = train_tokens[shuffled_order].to(device)
print(f"{train_tokens.shape=}")

train_tokens.shape=torch.Size([796, 1024])


In [11]:
# Build a randomly initialised GPT-2 (12 layers, 12 heads, 768-dim) sized to the
# custom tokenizer's vocabulary. The context length is tied to chunk_size so the
# position embeddings always match the length of the training chunks.
config = GPT2Config(
    vocab_size=tokenizer.get_vocab_size(),
    n_positions=chunk_size,
    n_ctx=chunk_size,
    n_embd=768,
    n_layer=12,
    n_head=12,
)

# Constructing the model from a config already initialises every weight via
# post_init(), so no extra call to the private `_init_weights` hook is needed.
model = GPT2LMHeadModel(config).to(device)
logging.info(f"Parameter count: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

2024-11-25 17:13:50,914 INFO Parameter count: 86.07M


In [12]:
# Peak learning rate; the training cell also derives the scheduler floor and the
# TensorBoard run name from this value.
initial_lr = 7e-5

# Switch the model to training mode and attach an AdamW optimizer over all of
# its parameters.
model.train()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=initial_lr)

In [13]:
# Training loop: one optimizer step per 1024-token chunk, with a single cosine
# learning-rate decay spanning the whole run and per-step TensorBoard logging.
num_epochs = 10
log_every = 1  # log scalars every `log_every` steps
steps_per_epoch = len(train_tokens)
total_steps = num_epochs * steps_per_epoch

writer = SummaryWriter(comment=f"{initial_lr:=.2e},tokenizer{tokenizer.get_vocab_size()}")

# Every position of a chunk is a real token, so the mask is all ones.
attention_mask = torch.ones(1, chunk_size).to(device)

# Create the scheduler ONCE. `last_epoch` counts scheduler steps, not epochs:
# rebuilding the scheduler each epoch with `last_epoch=epoch-1` left the LR
# pinned at `eta_min` after the first epoch, because the cosine update is
# computed recursively from the optimizer's current LR.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=total_steps,
    eta_min=initial_lr * 0.1,
)

try:
    for epoch in range(num_epochs):
        for chunk_idx in tqdm(range(steps_per_epoch), desc=f"epoch{epoch}"):
            step_start_time = time.time()
            global_step = epoch * steps_per_epoch + chunk_idx

            # Slice (rather than index) to keep an explicit batch dimension of
            # 1, matching the (1, chunk_size) attention mask.
            chunk = train_tokens[chunk_idx:chunk_idx + 1].to(device)
            outputs = model(
                input_ids=chunk,
                attention_mask=attention_mask,
                labels=chunk,  # labels == inputs; the model computes the LM loss
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if chunk_idx % log_every == 0:
                lr = scheduler.get_last_lr()[0]
                writer.add_scalar('Loss/train', loss.item(), global_step)
                writer.add_scalar('Learning Rate', lr, global_step)
                writer.add_scalar('Time/step', time.time() - step_start_time, global_step)
finally:
    # Flush and close the event file even when the run is interrupted by hand.
    writer.close()


epoch0: 100%|██████████| 796/796 [10:12<00:00,  1.30it/s]
epoch1: 100%|██████████| 796/796 [10:09<00:00,  1.31it/s]
epoch2: 100%|██████████| 796/796 [09:51<00:00,  1.35it/s]
epoch3: 100%|██████████| 796/796 [10:15<00:00,  1.29it/s]
epoch4:   1%|          | 8/796 [00:06<11:17,  1.16it/s]


KeyboardInterrupt: 