In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from mingptf.model import GPT
import numpy as np
import tensorflow as tf
from mingptf.utils import set_seed
from mingptf.bpe import BPETokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from mingptf.utils import get_default_train_config
from random import shuffle
set_seed(3407)
tokenizer = BPETokenizer()

2023-01-10 01:27:46.813634: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
max_seq_len = 128
batch_size = 8
epoch = 25

In [14]:
"""
sample_text has transfoerm code
"""
def get_sent_list(filename):
    with open(filename) as file:
        lines = [line.rstrip() for line in file]
    return lines

def get_ids(seq):
    return tokenizer(seq).numpy().tolist()[0]

def pad_seq(inp):
    return pad_sequences(inp, padding='post', maxlen=max_seq_len)

In [15]:
lines = get_sent_list("./sample_text.txt")
shuffle(lines)
print(len(lines))

train = lines[:20]
test = lines[20:]

312


In [16]:
get_ids("<|endoftext|>")

[27, 91, 437, 1659, 5239, 91, 29]

In [17]:
train_input = pad_seq([get_ids("<|endoftext|>"+seq) for seq in train])
train_target = pad_seq([get_ids(seq+"<|endoftext|>") for seq in train])

test_input = pad_seq([get_ids("<|endoftext|>"+seq) for seq in test])
test_target = pad_seq([get_ids(seq+"<|endoftext|>") for seq in test])

In [18]:
train_data = (tf.data.Dataset.from_tensor_slices((train_input, train_target)).
                shuffle(buffer_size=25000).
                batch(batch_size).
                repeat(epoch))
test_data = (tf.data.Dataset.from_tensor_slices((test_input, test_target)).
                shuffle(buffer_size=25000).
                batch(batch_size).
                repeat(epoch))

In [19]:
model_config = GPT.get_default_config()
train_config = GPT.get_default_config()

model_config.model_type = 'gpt-micro'


model_config.vocab_size = 50257
model_config.block_size = 128
model = GPT(model_config)

train_config = get_default_train_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000


model.configure_optimizers(train_config)

number of parameters: 7.24M


In [25]:
model.fit(train_data, test_data, test_freq=5)

Steps 80: train loss 47.20955: test loss 51.71170 : perplexity 1899033791622039541383168.00000
Steps 85: train loss 52.81493: test loss 57.11445 : perplexity 35349099063452029006381056.00000
Steps 90: train loss 16.32367: test loss 39.56030 : perplexity 195427420059205632.00000
Steps 95: train loss 43.51852: test loss 68.83904 : perplexity 1650259242642114740603445650338611200.00000
Steps 100: train loss 42.31688: test loss 50.71311 : perplexity 11115445704403898597376.00000
Steps 105: train loss 28.92765: test loss 67.16721 : perplexity 294531313116797317694137084936192.00000
Steps 110: train loss 56.56803: test loss 54.34906 : perplexity 486127875236799249907712.00000
Steps 115: train loss 47.35986: test loss 60.51856 : perplexity 8111255750170134660017958158336.00000
Steps 120: train loss 15.88312: test loss 73.68176 : perplexity 1734187061300400210174363440720642048.00000
Steps 125: train loss 45.62486: test loss 54.42824 : perplexity 64432601089382143828126859264.00000
Steps 130: 

In [26]:
def generate(prompt='', num_samples=10, no_gen_tokens=20, no_samples=5, temperature=1, top_k=8):
    x = tokenizer(prompt)
    samples = []
    for _ in range(no_samples):
        y = model.generate(x, max_new_tokens=no_gen_tokens, 
                           temperature=temperature,
                           top_k=top_k)
        samples.append(tokenizer.decode(y.numpy()))
    return samples

In [27]:
generate(prompt='def')

['def<    <     |     of  ',
 'def    =<               ',
 'def<     .< <   _      ',
 'def  of < =      .       ',
 'def                    ']