In [1]:
from typing import List

import tensorflow as tf

from tokenizers import Tokenizer

from gpt.trainer import (Trainer, TrainerConfig,)
from gpt.modeling import (GPT, GPT1Config,)

2022-08-04 14:18:37.380037: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Token-level GPT

In [2]:
def clean_up_tokenization(out_string: str) -> str:
    """Strip the spaces that decoding leaves before punctuation and contractions.

    Adapted from:
    https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3494

    Args:
        out_string: Decoded text whose tokens are separated by spaces.

    Returns:
        The text with e.g. " ." rewritten to "." and " n't" to "n't".
    """
    # (spaced form, tight form) pairs; they are applied one after another in
    # exactly this order.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    cleaned = out_string
    for spaced, tight in substitutions:
        cleaned = cleaned.replace(spaced, tight)
    return cleaned

In [3]:
class TokenizedDataset(tf.data.Dataset):
    """Factory for a next-token-prediction ``tf.data`` pipeline over token ids.

    ``TokenizedDataset(...)`` does not return an instance of this class:
    ``__new__`` builds and returns the pipeline created by
    ``tf.data.Dataset.from_generator(...)``, batched, repeated and prefetched.
    Each element is a pair ``(x, y)`` of int32 tensors of shape
    ``(batch_size, block_size)`` where ``y`` is ``x`` shifted by one position.
    """

    @staticmethod
    def _gen_examples_from(data: List[int], block_size: int):
        """Return a generator function yielding sliding-window examples.

        Args:
            data: Token ids of the whole corpus.
            block_size: Number of ids in each input / target sequence.

        Returns:
            A zero-argument callable suitable for
            ``tf.data.Dataset.from_generator``; it yields
            ``len(data) - block_size`` pairs ``(x, y)``.
        """
        def _gen():
            nb_examples = len(data) - block_size
            for idx in range(nb_examples):
                # grab a chunk of (block_size + 1) token ids from the data
                chunk = data[idx:idx + block_size + 1]
                # inputs are the first block_size ids, targets are the same
                # window shifted by one; dtype matches the output_signature
                x = tf.convert_to_tensor(chunk[:-1], dtype=tf.int32)
                y = tf.convert_to_tensor(chunk[1:], dtype=tf.int32)
                yield x, y
        return _gen

    def __new__(
        cls, input_ids: List[int], block_size: int, batch_size: int,
        shuffle_buffer_size: int = 0
    ):
        """Build the batched, endlessly repeating training pipeline.

        Args:
            input_ids: Token ids of the whole corpus.
            block_size: Sequence length of every example.
            batch_size: Number of examples per batch; incomplete batches are
                dropped.
            shuffle_buffer_size: When > 0, examples are shuffled with a buffer
                of this size before batching. The default (0) keeps the
                generator order, in which neighbouring examples are windows
                shifted by a single token.

        Returns:
            A ``tf.data.Dataset`` of ``(x, y)`` batches.

        Raises:
            ValueError: If the corpus is too short to fill a single batch.
        """
        nb_examples = len(input_ids) - block_size
        # With drop_remainder=True fewer than batch_size examples would give a
        # pipeline that never yields a batch; fail loudly instead.
        if nb_examples < batch_size:
            raise ValueError(
                f"Need at least block_size + batch_size = "
                f"{block_size + batch_size} token ids to build one batch, "
                f"got {len(input_ids)}."
            )
        dataset = tf.data.Dataset.from_generator(
            cls._gen_examples_from(input_ids, block_size),
            output_signature=(
                tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
                tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
            ),
        )
        if shuffle_buffer_size > 0:
            dataset = dataset.shuffle(
                shuffle_buffer_size, reshuffle_each_iteration=True
            )
        return (
            dataset
            .batch(batch_size, drop_remainder=True)
            .repeat()
            .prefetch(tf.data.AUTOTUNE)
        )

In [26]:
# Sequence length of each example and number of examples per batch
# for the token-level run.
BLOCK_SIZE = 128
BATCH_SIZE = 512

def encode_text_to_ids(tokenizer, text: str):
    """Encode `text` with `tokenizer` and return the ids of the encoding.

    `tokenizer` must provide ``encode(text)`` returning an object that has an
    ``ids`` attribute; that attribute is returned unchanged.
    """
    return tokenizer.encode(text).ids

# Load the pretrained tokenizer and keep its vocabulary size for the model config.
pretrained_tokenizer = Tokenizer.from_file("./data/tokenizer.json")
vocab_size = pretrained_tokenizer.get_vocab_size()

# Read the corpus inside a context manager so the file handle is closed as
# soon as the text is in memory, with an explicit encoding so the result does
# not depend on the platform default.
with open("./data/tinyshakespeare.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
input_ids = encode_text_to_ids(pretrained_tokenizer, text)
train_dataset = TokenizedDataset(
    input_ids, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
)
# One example per start position that still leaves BLOCK_SIZE + 1 ids;
# steps per epoch counts full batches only.
nb_examples = len(input_ids) - BLOCK_SIZE
nb_optimization_steps = nb_examples // BATCH_SIZE

In [27]:
# Vocabulary size reported by the pretrained tokenizer.
vocab_size

18146

In [28]:
# Number of sliding-window examples: len(input_ids) - BLOCK_SIZE.
nb_examples

301842

In [29]:
# Full batches in one pass over the examples: nb_examples // BATCH_SIZE.
nb_optimization_steps

589

In [36]:
# Training budget for the token-level run.
EPOCHS = 4
LEARNING_RATE = 0.003  # 6.25e-5 is kept here as a commented-out alternative

# Total optimizer steps = full batches per epoch times number of epochs.
total_number_optimization_steps = EPOCHS * nb_optimization_steps

print(f"total number optimization steps = {total_number_optimization_steps}")

# Model hyper-parameters.
config = GPT1Config(
    vocab_size=vocab_size,
    block_size=BLOCK_SIZE,
    n_layer=8,
    n_head=8,
    n_embd=512,
)
# Optimisation / logging settings handed to the trainer.
tconf = TrainerConfig(
    max_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    do_lr_decay=True,
    warmup_ratio=0.05,
    cosine_decay_alpha=0.0,
    total_number_optimization_steps=total_number_optimization_steps,
    log_every_steps=10,
    ckpt_path='./logs',
    trial_id='shakespeare_token_level',
)

model = GPT(config)

total number optimization steps = 2356


In [37]:
# Hand the token-level model, its dataset, the step budget and the trainer
# config to a Trainer, then start training.
trainer = Trainer(model, train_dataset, total_number_optimization_steps, config=tconf)

trainer.train()

step 2356: loss 4.67317 - acc 22.33% - lr 0.000000: 100%|██████████| 2356/2356 [30:59<00:00,  1.27it/s]


In [40]:
context = "O God, O God!"

# Encode the prompt and add a leading axis of size 1 in front of the ids.
context_ids = pretrained_tokenizer.encode(context).ids
x = tf.convert_to_tensor(context_ids, dtype=tf.int32)[None, ...]

# Sample from the trained model and keep element 0 of the result.
y = model.sample(x, 1000, temperature=1.0, sample=True, top_k=10)[0]

# Turn the sampled ids back into text and tidy the spacing around punctuation.
decoded = pretrained_tokenizer.decode(y, skip_special_tokens=False)
completion = clean_up_tokenization(decoded)
print(completion)

O God, O God! 
 
 
 
 KING HENRY VI shall be gone ; 
 And thou hast not be not not I must have so to the day, 
 I know 
 For I am gone : 
 I's head 
 That thou shalt you. 
 
 And he had not not not, and be, 
 To have, 
 To make you have I will the queen, 
 The rest of your own : 
 That you, that you, 
 And thou art, 
 But he was in a Duke of the prince. 
 
 
 My 
 KING RICHARD II : 
 And thou, for this man 
 And we do I know you be a day, 
 And thou wilt thou art in this : 
 That I would not, if you are a world : 
 The day is I am so, 
 But we shall be done, 
 And so, 
 To be the queen, 
 For the queen! 
 
 
 
 And, 
 But in thy heart? 
 What, 
 And not the world is the rest! 
 
 
 But we shall not. 
 ROMEO : 
 
 
 
 The crown! 
 
 
 KING HENRY VI : 
 
 
 I do be gone? Come, and I shall be so, if I must, 
 And I do not not not so to be a Duke of his heart? 
 
 O, 
 I's blood of that is the world'tis a Duke of thy heart. 
 
 
 
 
 ROMEO : 
 KING HENRY VI 
 I am to make this. 
 
 
 And t

### Character-level GPT

In [14]:
class CharDataset(tf.data.Dataset):
    """Factory for a character-level next-character-prediction ``tf.data`` pipeline.

    ``CharDataset(...)`` does not return an instance of this class:
    ``__new__`` builds and returns the pipeline created by
    ``tf.data.Dataset.from_generator(...)``, batched, repeated and prefetched.
    Each element is a pair ``(x, y)`` of int32 tensors of shape
    ``(batch_size, block_size)`` where ``y`` is ``x`` shifted by one position.
    """

    @staticmethod
    def compute_vocab_from_text(text: str, verbose: bool = True):
        """Build the character vocabulary of `text`.

        Args:
            text: The corpus.
            verbose: When True (default), print the corpus length and the
                number of distinct characters.

        Returns:
            ``(stoi, itos)``: a dict from character to integer id and the
            inverse dict; ids follow the sorted order of the distinct
            characters.
        """
        chars = sorted(set(text))
        data_size, vocab_size = len(text), len(chars)
        if verbose:
            print('data has %d characters, %d unique.' % (data_size, vocab_size))
        stoi = {ch: i for i, ch in enumerate(chars)}
        itos = {i: ch for i, ch in enumerate(chars)}
        return stoi, itos

    @staticmethod
    def _gen_examples_from(text: str, block_size: int):
        """Return a generator function yielding sliding-window examples.

        The vocabulary is computed and the text encoded once, here, rather
        than inside the generator: the generator is restarted on every pass
        over the data, and doing this work there repeats it (and its printed
        banner) each time and re-encodes every character once per overlapping
        window.

        Args:
            text: The corpus.
            block_size: Number of characters in each input / target sequence.

        Returns:
            A zero-argument callable suitable for
            ``tf.data.Dataset.from_generator``; it yields
            ``len(text) - block_size`` pairs ``(x, y)``.
        """
        stoi, _ = CharDataset.compute_vocab_from_text(text, verbose=False)
        # encode every character to an integer, once
        encoded = [stoi[ch] for ch in text]

        def _gen():
            nb_examples = len(encoded) - block_size
            for idx in range(nb_examples):
                # grab a chunk of (block_size + 1) character ids from the data
                chunk = encoded[idx:idx + block_size + 1]
                # inputs are the first block_size ids, targets are the same
                # window shifted by one; dtype matches the output_signature
                x = tf.convert_to_tensor(chunk[:-1], dtype=tf.int32)
                y = tf.convert_to_tensor(chunk[1:], dtype=tf.int32)
                yield x, y
        return _gen

    def __new__(
        cls, text: str, block_size: int, batch_size: int,
        shuffle_buffer_size: int = 0
    ):
        """Build the batched, endlessly repeating training pipeline.

        Args:
            text: The corpus.
            block_size: Sequence length of every example.
            batch_size: Number of examples per batch; incomplete batches are
                dropped.
            shuffle_buffer_size: When > 0, examples are shuffled with a buffer
                of this size before batching. The default (0) keeps the
                generator order, in which neighbouring examples are windows
                shifted by a single character.

        Returns:
            A ``tf.data.Dataset`` of ``(x, y)`` batches.

        Raises:
            ValueError: If the corpus is too short to fill a single batch.
        """
        nb_examples = len(text) - block_size
        # With drop_remainder=True fewer than batch_size examples would give a
        # pipeline that never yields a batch; fail loudly instead.
        if nb_examples < batch_size:
            raise ValueError(
                f"Need at least block_size + batch_size = "
                f"{block_size + batch_size} characters to build one batch, "
                f"got {len(text)}."
            )
        dataset = tf.data.Dataset.from_generator(
            cls._gen_examples_from(text, block_size),
            output_signature=(
                tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
                tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
            ),
        )
        if shuffle_buffer_size > 0:
            dataset = dataset.shuffle(
                shuffle_buffer_size, reshuffle_each_iteration=True
            )
        return (
            dataset
            .batch(batch_size, drop_remainder=True)
            .repeat()
            .prefetch(tf.data.AUTOTUNE)
        )

In [15]:
# Hyper-parameters for the character-level run.
BLOCK_SIZE = 128
BATCH_SIZE = 512
EPOCHS = 2

LEARNING_RATE = 0.003  # 6.25e-5 is kept here as a commented-out alternative

In [20]:
# Read the corpus inside a context manager so the file handle is closed as
# soon as the text is in memory, with an explicit encoding so the result does
# not depend on the platform default.
with open("./data/tinyshakespeare.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
train_dataset_char = CharDataset(
    text, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
)
# One example per start position that still leaves BLOCK_SIZE + 1 characters;
# steps per epoch counts full batches only.
nb_examples = len(text) - BLOCK_SIZE
nb_optimization_steps = nb_examples // BATCH_SIZE
total_number_optimization_steps = nb_optimization_steps * EPOCHS

# Character <-> id lookup tables, needed later to encode the prompt and to
# decode the sampled ids.
stoi, itos = CharDataset.compute_vocab_from_text(text)
vocab_size = len(stoi)

data has 1115393 characters, 64 unique.


In [22]:
# Summary of the character-level setup, one figure per line.
print(
    f"vocab size = {len(stoi)}",
    f"no. examples = {nb_examples}",
    f"no. optimization steps = {nb_optimization_steps}",
    f"no. total optimization steps = {total_number_optimization_steps}",
    sep="\n",
)

vocab size = 64
no. examples = 1115265
no. optimization steps = 2178
no. total optimization steps = 4356


In [23]:
# Peek at the first (inputs, targets) batch only.
for x, y in train_dataset_char.take(1):
    print(x, y)

data has 1115393 characters, 64 unique.
tf.Tensor(
[[17 46 55 ... 57 52  1]
 [46 55 56 ... 52  1 41]
 [55 56 57 ...  1 41 46]
 ...
 [ 5  1 57 ...  0 60 45]
 [ 1 57 45 ... 60 45 52]
 [57 45 42 ... 45 52 49]], shape=(512, 128), dtype=int32) tf.Tensor(
[[46 55 56 ... 52  1 41]
 [55 56 57 ...  1 41 46]
 [56 57  1 ... 41 46 42]
 ...
 [ 1 57 45 ... 60 45 52]
 [57 45 42 ... 45 52 49]
 [45 42  1 ... 52 49 42]], shape=(512, 128), dtype=int32)


In [25]:
# Model hyper-parameters for the character-level run.
config_char = GPT1Config(
    vocab_size=vocab_size, block_size=BLOCK_SIZE,
    n_layer=8, n_head=8, n_embd=512
)
# The character-level run gets its own trial_id so it is not labelled as the
# token-level run, which uses the same ckpt_path.
# NOTE(review): trial_id presumably names this run's logs/checkpoints under
# ckpt_path — confirm against gpt.trainer.
tconf_char = TrainerConfig(
    max_epochs=EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
    do_lr_decay=True, warmup_ratio=0.3, cosine_decay_alpha=0.0,
    total_number_optimization_steps=total_number_optimization_steps, log_every_steps=10,
    ckpt_path='./logs', trial_id='shakespeare_char_level'
)

model_char = GPT(config_char)

trainer_char = Trainer(
    model_char, train_dataset_char, total_number_optimization_steps, config=tconf_char
)

trainer_char.train()

  0%|          | 0/4356 [00:00<?, ?it/s]

data has 1115393 characters, 64 unique.


2022-08-04 12:16:03.626045: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
step 2176: loss 1.69441 - acc 40.61% - lr 0.000567:  50%|████▉     | 2177/4356 [17:26<17:05,  2.12it/s]

data has 1115393 characters, 64 unique.


step 4354: loss 1.82917 - acc 44.29% - lr 0.000000: 100%|█████████▉| 4355/4356 [34:32<00:00,  2.12it/s]

data has 1115393 characters, 64 unique.


step 4356: loss 1.65761 - acc 44.30% - lr 0.000000: 100%|██████████| 4356/4356 [34:33<00:00,  2.10it/s]


In [27]:
context = "O God, O God!"

# Map each prompt character to its id and add a leading axis of size 1.
context_ids = [stoi[ch] for ch in context]
x = tf.convert_to_tensor(context_ids, dtype=tf.int32)[None, ...]

# Sample from the trained character model and keep element 0 of the result.
y = model_char.sample(x, 2000, temperature=1.0, sample=True, top_k=10)[0]

# Map every sampled id back to its character and concatenate.
completion = "".join(itos[int(token_id)] for token_id in y)
print(completion)

O God, O God! thou clep hurle of hysure of,
For that heave, fault the when boy of the btoth?

BRNOLAN:
No brings? and with thereful not of mastand
The pasit boy, that belive father: slack.

KING RALY CORNOMHERDND II:
As to his dest in extech'd tear the citys hold
Why, would a fair beging his with brood:
Ime have mind, and to from in my mortor:
Here to servent may, and ascountage is strive is mercy
I do the cousing so be hence
Who bown with fractice one of my sands:
And of weet this hatharge.

DY ANRY:
And to sempoke backs:
O, that this soons bats haition sovight,
The sucherence shallest as incred the pride and my crreasond:
With they strause and with waye sin abistion
Freep that so desin me the doth our husbmit
Age be have is which othan one, seem the servest:
My loves feeky say the to he, split,
What than a the come me of mighter more be was their to here think of in
Angen, with be had angeranly of the deserved
And the head the for wreturn be her will had of whichier:
Their foals be s