In [111]:
import pathlib
import random

from datasets import load_dataset
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

from tokenizer import load_tokenizer, eod_token

In [112]:
# Tokenizer artefacts (BPE vocabulary and merge rules), located relative to
# the notebook's working directory.
vocab = pathlib.Path('data', 'bpe-normal-number-preservation-vocab.json')
merges = pathlib.Path('data', 'bpe-normal-number-preservation-merges.txt')

# Upper bound on tokens per example; used as max_length when tokenizing.
context_size = 1024

In [113]:
# Build the BPE tokenizer from the vocab / merges files.
tokenizer = load_tokenizer(vocab, merges)

# Pad on the right-hand side, so padding ids are appended after the real tokens.
tokenizer.padding_side = 'right'

print(tokenizer)

GPT2TokenizerFast(name_or_path='', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [114]:
# Fetch the SST-2 configuration of GLUE and peek at one training row.
dataset = load_dataset('nyu-mll/glue', 'sst2')
train_split = dataset['train']

# randrange(n) draws an index in 0..n-1 (same range as randint(0, n-1)).
# NOTE: the RNG is not seeded, so a different row is shown on every run.
row = train_split[random.randrange(len(train_split))]
print(row)

{'sentence': "we 're wrapped up in the characters , how they make their choices , and why ", 'label': 1, 'idx': 57001}


In [115]:
# Encode the sampled sentence, truncating / padding it to exactly
# `context_size` ids so the effect of fixed-length padding is visible.
tokens = tokenizer(
    row['sentence'],
    max_length=context_size,
    padding='max_length',
    truncation=True,
)

# Full encoding first, then just the id sequence on its own.
print(tokens)
print(tokens['input_ids'])

{'input_ids': [352, 1097, 260, 12654, 513, 286, 261, 3619, 1619, 688, 475, 783, 496, 7207, 1619, 294, 1416, 220, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000, 50000

In [116]:
def process(examples):
    """Tokenize the 'sentence' field of a batch of examples.

    Sequences are truncated to `context_size` tokens. No padding is
    requested here, so each encoded example keeps its own length.

    Args:
        examples: mapping with a 'sentence' entry holding the text(s) to
            encode (a batch of rows when used with `map(..., batched=True)`).

    Returns:
        Whatever the module-level `tokenizer` returns for those sentences.
    """
    sentences = examples['sentence']
    return tokenizer(sentences, max_length=context_size, truncation=True)

# Tokenize every split in batches. All original columns of the train split
# are removed from the result, leaving only the tokenizer's outputs.
# NOTE(review): this drops 'label' (and 'idx') as well as 'sentence' --
# confirm the labels are not needed downstream.
encoded_dataset = dataset.map(
    process,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

Map: 100%|██████████| 67349/67349 [00:00<00:00, 103363.56 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 67001.91 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 77523.29 examples/s]


In [117]:
# Collator that pads the examples of a batch to a common length when they
# are stacked (the encoded dataset itself was tokenized without padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# create a DataLoader just like Trainer would
# NOTE: shuffle=True with no seed set, so batch contents differ between runs.
train_loader = DataLoader(
    dataset=encoded_dataset['train'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

# print one batch -- only its first field (name, shape, values)
batch = next(iter(train_loader))
for k, v in list(batch.items())[:1]:
    print(k, v.shape, v)

input_ids torch.Size([32, 33]) tensor([[ 1091,   257,  4040,  ..., 50000, 50000, 50000],
        [ 1748,  4746,   341,  ..., 50000, 50000, 50000],
        [  363,  1097,   260,  ..., 50000, 50000, 50000],
        ...,
        [ 1782,    12, 11549,  ..., 50000, 50000, 50000],
        [10615,  1522,   220,  ..., 50000, 50000, 50000],
        [  975,   320,  2906,  ..., 50000, 50000, 50000]])
