In [3]:
from tqdm import tqdm
import torch
from datasets import concatenate_datasets, load_dataset
from transformers import BertTokenizerFast

In [4]:
# First corpus: the "train" split of BookCorpus.
bookcorpus = load_dataset("bookcorpus", split="train")

# Second corpus: the "train" split of the 20220301 English Wikipedia config,
# reduced to its "text" column.
wiki = load_dataset("wikipedia", "20220301.en", split="train")
wiki = wiki.remove_columns(
    [name for name in wiki.column_names if name != "text"]
)

# Guard: both datasets must expose the same feature types before they are
# joined into the single training corpus used by the cells below.
assert wiki.features.type == bookcorpus.features.type
dataset = concatenate_datasets([bookcorpus, wiki])

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [7]:
def batch_iterator(dataset, batch_size=10000, text_column="text"):
    """Yield one column of ``dataset`` in consecutive batches, with a progress bar.

    Args:
        dataset: Sized, slice-indexable dataset; ``dataset[i:j]`` must return a
            mapping from column name to the values of that slice.
        batch_size: Maximum number of rows per batch; must be positive.
            The final batch may be shorter.
        text_column: Name of the column to read from every slice.

    Yields:
        The ``text_column`` values of each slice, in dataset order.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step makes range() empty, which would silently yield nothing
    # and hand the consumer an empty stream; reject it explicitly instead.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in tqdm(range(0, len(dataset), batch_size)):
        yield dataset[start : start + batch_size][text_column]

In [8]:
# Load the pretrained "bert-base-uncased" fast tokenizer; a later cell uses it
# as the starting point for training a new tokenizer on the combined corpus.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [14]:
# Train a new tokenizer from the combined corpus, streamed in batches by
# batch_iterator, and rebind `tokenizer` to the result.
# NOTE(review): vocab_size=30522 presumably mirrors the vocabulary size of the
# stock bert-base-uncased checkpoint — confirm.
# NOTE(review): this cell overwrites the `tokenizer` it starts from, so
# re-running it on its own trains from the already-retrained tokenizer rather
# than from the pretrained checkpoint loaded earlier.
tokenizer = tokenizer.train_new_from_iterator(text_iterator=batch_iterator(dataset), vocab_size=30522)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8047/8047 [07:56<00:00, 16.89it/s]







In [27]:
# Persist the trained tokenizer under ../tokenizer (a relative path, so it
# resolves against the current working directory). The call returns the paths
# of the files it wrote, which is what this cell displays.
tokenizer.save_pretrained("../tokenizer")

('../tokenizer/tokenizer_config.json',
 '../tokenizer/special_tokens_map.json',
 '../tokenizer/vocab.txt',
 '../tokenizer/added_tokens.json',
 '../tokenizer/tokenizer.json')

In [37]:
# Spot-check the trained tokenizer: take row 1002 of the combined dataset,
# split it into tokens, and print the raw text next to the token list.
# `text` is reused by the cells that follow.
text = dataset[1002]["text"]
tokens = tokenizer.tokenize(text)
print(f"{text}\ntokens: {tokens}")

aidan appeared then , carrying all the drinks on a tray .
tokens: ['aid', '##an', 'appeared', 'then', ',', 'carry', '##ing', 'all', 'the', 'drink', '##s', 'on', 'a', 'tra', '##y', '.']


In [38]:
# Run the full tokenizer call on the sample text; the resulting encoding is
# inspected in the following cells.
encoding = tokenizer(text)

In [39]:
# Display the encoding's fields (input_ids, token_type_ids, attention_mask).
encoding

{'input_ids': [2, 29859, 25035, 27159, 25450, 16, 28625, 25047, 25236, 25031, 29110, 19877, 25072, 43, 30460, 19871, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [40]:
# Token strings behind each input id, including the [CLS] and [SEP] markers
# that surround the sample.
encoding.tokens()

['[CLS]',
 'aid',
 '##an',
 'appeared',
 'then',
 ',',
 'carry',
 '##ing',
 'all',
 'the',
 'drink',
 '##s',
 'on',
 'a',
 'tra',
 '##y',
 '.',
 '[SEP]']

In [41]:
# For every token, the index of the word it belongs to; the special tokens at
# both ends map to None, and pieces of one word share the same index.
encoding.word_ids()

[None, 0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, None]

In [47]:
# Map word index 4 (the word split into 'carry' + '##ing' above) back to its
# character span in the original text, then slice that span out.
start, end = encoding.word_to_chars(4)
text[start:end]

'carrying'

In [43]:
# Manual slice of the first five characters, i.e. the first word of the sample.
text[0:5]

'aidan'