In [1]:
from typing import List

import tensorflow as tf

from tokenizers import Tokenizer

from gpt.trainer import (Trainer, TrainerConfig, )
from gpt.modeling import (GPT, GPT1Config, )

In [2]:
class TokenizedDataset(tf.data.Dataset):
    """Sliding-window next-token dataset built from a flat sequence of token ids.

    Calling ``TokenizedDataset(input_ids, block_size, batch_size)`` does not
    return an instance of this class: ``__new__`` builds and returns a
    ``tf.data`` pipeline that yields batches of ``(x, y)`` pairs, where ``y``
    is the window ``x`` shifted one position forward in ``input_ids``.
    """

    @staticmethod
    def _gen_examples_from(data: List[int], block_size: int):
        """Return a zero-argument generator function over ``(x, y)`` windows.

        Args:
            data: Flat sequence of token ids.
            block_size: Number of tokens in each input/target window.

        Returns:
            A callable that, when invoked, yields ``len(data) - block_size``
            pairs of int32 tensors, each of length ``block_size``.
        """
        def _gen():
            nb_examples = len(data) - block_size
            for idx in range(nb_examples):
                # Grab block_size + 1 consecutive ids: the first block_size
                # form the input and the last block_size form the target.
                chunk = data[idx:idx + block_size + 1]
                # The dtype is explicit so it always matches output_signature.
                x = tf.convert_to_tensor(chunk[:-1], dtype=tf.int32)
                y = tf.convert_to_tensor(chunk[1:], dtype=tf.int32)
                yield x, y
        return _gen

    def __new__(
        cls, input_ids: List[int], block_size: int, batch_size: int
    ):
        """Build the batched, repeating, prefetching pipeline.

        Args:
            input_ids: Flat sequence of token ids.
            block_size: Number of tokens per example window.
            batch_size: Number of examples per batch; incomplete batches are
                dropped.

        Returns:
            A ``tf.data.Dataset`` yielding ``(x, y)`` int32 batches of shape
            ``(batch_size, block_size)``, repeated indefinitely.

        Raises:
            ValueError: If ``block_size`` or ``batch_size`` is not positive,
                or if ``input_ids`` is too short to fill one complete batch.
        """
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        nb_examples = len(input_ids) - block_size
        if nb_examples < batch_size:
            # With drop_remainder=True the pipeline would yield no batch at
            # all; fail loudly instead of handing back an empty dataset.
            raise ValueError(
                f"input_ids has {len(input_ids)} tokens, which gives "
                f"{max(nb_examples, 0)} examples of block_size={block_size}; "
                f"at least batch_size={batch_size} examples are required"
            )

        window_spec = tf.TensorSpec(shape=(block_size,), dtype=tf.int32)
        dataset = (
            tf.data.Dataset.from_generator(
                cls._gen_examples_from(input_ids, block_size),
                output_signature=(window_spec, window_spec),
            )
            # Shuffling is currently disabled; kept for reference:
            # .shuffle(nb_examples, reshuffle_each_iteration=True)
            .batch(batch_size, drop_remainder=True)
            .repeat()
            .prefetch(tf.data.AUTOTUNE)
        )
        return dataset

In [3]:
# Number of tokens in each example window (passed as block_size below).
BLOCK_SIZE = 64
# Number of example windows per batch (passed as batch_size below).
BATCH_SIZE = 512

def encode_text_to_ids(tokenizer, text: str):
    """Encode ``text`` with ``tokenizer`` and return the resulting token ids.

    Args:
        tokenizer: Object whose ``encode(text)`` returns a value exposing an
            ``ids`` attribute.
        text: The raw text to tokenize.

    Returns:
        The ``ids`` attribute of the encoding produced by the tokenizer.
    """
    return tokenizer.encode(text).ids

# Load the tokenizer from its serialized JSON file and read its vocabulary size.
pretrained_tokenizer = Tokenizer.from_file("./data/tokenizer.json")
vocab_size = pretrained_tokenizer.get_vocab_size()

# Read the corpus inside a context manager so the file handle is closed, and
# with an explicit encoding so the result does not depend on the platform default.
with open("./data/tinyshakespeare.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Tokenize the whole corpus once and wrap it in the sliding-window dataset.
input_ids = encode_text_to_ids(pretrained_tokenizer, text)
train_dataset = TokenizedDataset(
    input_ids, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
)

# Number of sliding-window examples, and of complete batches in one pass.
nb_examples = len(input_ids) - BLOCK_SIZE
nb_optimization_steps = nb_examples // BATCH_SIZE

2022-08-04 11:10:22.356991: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Show the vocabulary size reported by the pretrained tokenizer.
vocab_size

18145

In [5]:
# Show the number of sliding-window examples (len(input_ids) - BLOCK_SIZE).
nb_examples

261907

In [6]:
# Show the number of complete batches in one pass (nb_examples // BATCH_SIZE).
nb_optimization_steps

511

In [7]:
# Training schedule.
EPOCHS = 1
LEARNING_RATE = 0.003  # alternative value noted by the author: 6.25e-5

# Optimizer steps summed over all epochs.
total_number_optimization_steps = EPOCHS * nb_optimization_steps

print(f"total number optimization steps = {total_number_optimization_steps}")

# Model hyperparameters; vocabulary and window size come from the data cell.
config = GPT1Config(
    vocab_size=vocab_size,
    block_size=BLOCK_SIZE,
    n_layer=3,
    n_head=3,
    n_embd=48,
)

# Trainer settings: learning-rate schedule, logging cadence, checkpoint location.
tconf = TrainerConfig(
    max_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    do_lr_decay=True,
    warmup_ratio=0.1,
    cosine_decay_alpha=0.0,
    total_number_optimization_steps=total_number_optimization_steps,
    log_every_steps=10,
    ckpt_path='./logs',
    trial_id='shakespeare_token_level'
    # Unused alternatives kept for reference:
    # warmup_tokens=0, final_tokens=0
    # warmup_tokens=20*512, final_tokens=2*nb_examples*BLOCK_SIZE
)

model = GPT(config)

total number optimization steps = 511


In [None]:
# Build the trainer from the model, the batched dataset, the total step budget
# and the trainer configuration, then run training.
# NOTE(review): train_dataset repeats indefinitely (.repeat()), so the trainer
# presumably relies on total_number_optimization_steps to stop — confirm in
# gpt.trainer.
trainer = Trainer(
    model, train_dataset, total_number_optimization_steps, config=tconf
)

trainer.train()

In [None]:
# Prompt text used to seed generation.
context = "O God, O God!"

# Encode the prompt to token ids and add a leading axis of size 1.
prompt_ids = pretrained_tokenizer.encode(context).ids
x = tf.convert_to_tensor(prompt_ids, dtype=tf.int32)[None, ...]

# Sample 100 steps from the model and keep the first sequence of the result.
y = model.sample(x, 100, temperature=1.0, sample=True, top_k=10)[0]

# Tokenizer.decode is documented to take a list of ints, so convert each
# sampled id explicitly instead of passing the raw sample output through.
completion = pretrained_tokenizer.decode([int(token_id) for token_id in y])
print(completion)