In [34]:
import pandas as pd
import numpy as np
import h5py
from datasets import Dataset
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

First, we need to load data and tokenize it.  This could, in  principle, be done in the datasets __getitem__, but why add work to IO when we can make it so much faster?


In [51]:
tokenizer = Tokenizer.from_pretrained('gpt2') #using a prior trained tokenizer
print(tokenizer.get_vocab_size())
split = 'test'
data = pd.read_csv(f'data/shakespeare/{split}.csv')
tokenized = tokenizer.encode(data['text'].values[0])

50257


Now, simply change the shaping of the array to form input sequences, and save as an H5 file for later.  Since this is GPT, we dont really need labels or attention masks.  We do make the input 1-token longer than necessary so that we can have the input be `tokens[:-1]`  and the label be `tokens[1:]`.


In [47]:
# get tokens from the string (its just one long string...)
input_ids = np.array(tokenized.ids)
# Pre-set sequence length (aka block size)
seq_len = 256
nsamples = input_ids.size // (seq_len+1)
input_ids = input_ids[:nsamples*seq_len].reshape((nsamples, seq_len))
print(input_ids.shape)
# save as h5 file for easy use later
with h5py.File(f'data/shakespeare/{split}.h5', 'w') as f:
    f.create_dataset('input_ids', data=input_ids, dtype=np.uint16) # need uint16 to accomodate 50K tokens



(70, 256)


In [53]:
thing = np.random.randint(0, 50257, (10,))
tokenizer.decode([198])

'\n'