### Training a BPE-Based Tokenizer

In [10]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
# Build an untrained tokenizer around a BPE model. "<UNK>" is the token emitted
# for input the trained vocabulary cannot represent (the emoji in the encode
# demo further down comes out as "<UNK>").
tokenizer = Tokenizer(BPE(unk_token="<UNK>"))

In [11]:
# Load a previously saved tokenizer from its JSON file so its vocabulary can be
# inspected in the next cell.
# NOTE(review): this is an absolute, machine-specific path; the cell only works
# where the repository lives at /workspaces/multi-tokenizer. Consider a
# repo-relative path.
t = Tokenizer.from_file('/workspaces/multi-tokenizer/multi_tokenizer/pretrained/english_tokenizer.json')

In [13]:
# Display the token -> id mapping of the loaded tokenizer.
# NOTE(review): this dumps the entire vocabulary into the cell output; a slice
# such as list(t.get_vocab().items())[:10] would keep the notebook readable.
t.get_vocab()

{'ĠMead': 16495,
 'ĠBud': 11957,
 'Ġhotter': 8919,
 'Ġposed': 15320,
 'Ġinter': 7928,
 'Marta': 19882,
 'eline': 10056,
 'George': 5709,
 'Ġpoun': 14096,
 'rey': 7511,
 'Ġhall': 3909,
 'Ġfrosty': 18958,
 'Ġpyram': 15947,
 'dd': 554,
 'Ġships': 8719,
 'Chuck': 12993,
 'Ġoverjoyed': 5555,
 'Ġbaaed': 14712,
 'ĠThatÃ¢': 7750,
 'Ġshivers': 12969,
 'Ġdevoured': 13792,
 'Ġmuff': 3809,
 'osquitoes': 13926,
 'Ġpaddock': 15767,
 'Ġsympathet': 17171,
 'ched': 696,
 'llies': 18295,
 'Ġhissy': 18360,
 'Ġsaltier': 19403,
 'ą': 204,
 'Ġumbrella': 3704,
 'ature': 3131,
 'Ġtaller': 4659,
 'Ġnext': 1013,
 'ĠKarl': 12875,
 'Ġsquirm': 12850,
 'Ġthin': 2268,
 'Ġsquinted': 8806,
 'Ġseated': 15216,
 'Ġmommie': 18379,
 'k': 85,
 'ÅĵWeÃ¢': 9079,
 'Ġtid': 2370,
 ".'": 5708,
 'Ġparade': 3842,
 'Pain': 14954,
 'ĠKaylee': 11899,
 'Ġbackup': 15240,
 'Stacey': 17016,
 'Ġapologised': 8174,
 'Ġknee': 2286,
 'Ġeither': 3999,
 'Ġtwenty': 7289,
 'ĠMixing': 12729,
 'Ġtaps': 11199,
 'Win': 17966,
 'Ġscowl': 17420,
 'Ġdefea

In [36]:
from tokenizers.trainers import BpeTrainer
# Trainer for the BPE model. The order of the special tokens matters: the
# post-processor configured in a later cell refers to "<CLS>" and "<SEP>" by
# the ids 1 and 2, and the encoded sample further down shows "<UNK>" as id 0.
trainer = BpeTrainer(
    special_tokens=[
        "<UNK>",
        "<CLS>",
        "<SEP>",
        "<PAD>",
        "<MASK>",
    ]
)

In [37]:
from tokenizers.pre_tokenizers import Whitespace, ByteLevel, BertPreTokenizer, Metaspace
from tokenizers.processors import TemplateProcessing, BertProcessing
# Pre-tokenize on spaces only, leaving each space attached to the word that
# follows it (see the pre_tokenize_str demo in the next cell). Whitespace(),
# ByteLevel() and BertPreTokenizer(), imported above, are the alternatives
# that were tried here.
tokenizer.pre_tokenizer = Metaspace(replacement=" ", prepend_scheme="never")

# Wrap every encoding in <CLS> ... <SEP>.
# Type ids: a piece without a ":n" suffix gets type id 0. The first sequence
# and its markers are segment 0; the second sequence and its closing <SEP> are
# segment 1, so the two halves of a pair can be told apart. (The previous
# template tagged both $A and $B with type id 1, making them indistinguishable.)
# The ids 1 and 2 must match the positions of "<CLS>" and "<SEP>" in the
# trainer's special_tokens list.
tokenizer.post_processor = TemplateProcessing(
    single="<CLS> $A <SEP>",
    pair="<CLS> $A <SEP> $B:1 <SEP>:1",
    special_tokens=[("<CLS>", 1), ("<SEP>", 2)],
)

In [38]:
# Sanity-check the pre-tokenizer on its own: the result is a list of
# (piece, (start, end)) pairs, one per space-delimited chunk.
tokenizer.pre_tokenizer.pre_tokenize_str("Hello, y'all! How are you 😁 ?")

[('Hello,', (0, 6)),
 (" y'all!", (6, 13)),
 (' How', (13, 17)),
 (' are', (17, 21)),
 (' you', (21, 25)),
 (' 😁', (25, 27)),
 (' ?', (27, 29))]

In [39]:
from datasets import load_dataset
# Training corpus: the "train" split of the raw WikiText-2 configuration.
dataset = load_dataset(
    "wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)

# Number of rows handed to the tokenizer trainer per batch.
batch_size = 1000

# Eager list of every batch of the "text" column (the same slices that
# batch_iterator yields lazily).
# NOTE(review): not referenced by any visible cell; confirm it is still needed.
all_texts = [
    dataset[start : start + batch_size]["text"]
    for start in range(0, len(dataset), batch_size)
]

def batch_iterator():
    """Lazily yield the "text" column of ``dataset`` in consecutive slices.

    Reads the module-level ``dataset`` and ``batch_size``; each yielded item is
    whatever ``dataset[lo : lo + batch_size]["text"]`` returns, and the final
    slice may be shorter than ``batch_size``.
    """
    offsets = range(0, len(dataset), batch_size)
    yield from (dataset[lo : lo + batch_size]["text"] for lo in offsets)

In [40]:
# Train the tokenizer on the batches streamed by batch_iterator(), using the
# BpeTrainer (and its special tokens) configured above.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)






In [41]:
# tokenizer.save("tokenizer-wiki.json")

In [42]:
# tokenizer = Tokenizer.from_file("tokenizer-wiki.json")

In [43]:
# Encode a sample sentence that mixes punctuation, an apostrophe and an emoji.
output = tokenizer.encode("Hello, y'all! How are you 😁?")

In [44]:
# Token strings of the encoding, including the "<CLS>" / "<SEP>" markers added
# by the post-processor; the emoji shows up as "<UNK>".
print(output.tokens)

['<CLS>', 'H', 'ello', ',', ' y', "'", 'all', '!', ' How', ' are', ' you', ' ', '<UNK>', '?', '<SEP>']


In [45]:
# Vocabulary ids for the same encoding, one per entry of output.tokens.
print(output.ids)

[1, 46, 15853, 18, 1230, 13, 1191, 7, 4081, 1215, 2403, 6, 0, 37, 2]


In [46]:
# Map the ids back to text.
# NOTE(review): the result below is not the original sentence (pieces are
# separated by extra spaces). No tokenizer.decoder is assigned in the visible
# cells, which is presumably the cause; confirm and set a matching decoder.
tokenizer.decode(output.ids)

"H ello ,  y ' all !  How  are  you   ?"

### Retraining a Pretrained Tokenizer on a New Corpus

In [1]:
from datasets import load_dataset

# Corpus for retraining: the "train" split of the raw WikiText-2 configuration.
dataset = load_dataset(
    "wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)

# Rows per batch fed to the tokenizer trainer.
batch_size = 1000

# Every batch of the "text" column, materialised up front.
# NOTE(review): not referenced by any visible cell; confirm it is still needed.
all_texts = [
    dataset[offset : offset + batch_size]["text"]
    for offset in range(0, len(dataset), batch_size)
]
def batch_iterator():
    """Generate the "text" column of ``dataset`` one batch at a time.

    Uses the module-level ``dataset`` and ``batch_size``. Batches are produced
    in order, and the last one holds whatever rows remain.
    """
    for start in range(0, len(dataset), batch_size):
        stop = start + batch_size
        rows = dataset[start:stop]
        yield rows["text"]

In [4]:
from huggingface_hub import login
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): presumably required because the model repository loaded in the
# next cell is access-restricted; confirm. The widget makes this cell
# non-reproducible under "Run All" without user input.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from transformers import AutoTokenizer

# Fetch the tokenizer published with the CohereForAI/aya-23-8B checkpoint; it
# serves as the template that is retrained on WikiText below.
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-23-8B")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


tokenizer_config.json:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.5M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Confirm the loaded tokenizer is a "fast" tokenizer before retraining it with
# train_new_from_iterator below.
tokenizer.is_fast

True

In [8]:
# Vocabulary size of the pretrained tokenizer, for comparison with the
# 25000-entry vocabulary requested for the retrained one.
tokenizer.vocab_size

255000

In [9]:
# Train a new tokenizer on the WikiText batches with a 25000-entry vocabulary.
# The result is a separate object: `tokenizer` itself is still used unchanged
# in the comparison cells that follow.
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=25000)

In [11]:
# Round-trip the sample sentence through the original tokenizer: the text comes
# back intact (emoji included), prefixed with "<BOS_TOKEN>".
tokenizer.decode(tokenizer.encode("Hello, y'all! How are you 😁?"))

"<BOS_TOKEN>Hello, y'all! How are you 😁?"

In [12]:
# Same round trip through the retrained tokenizer: the emoji comes back as
# replacement characters.
# NOTE(review): presumably the emoji's bytes are not covered by the vocabulary
# learned from WikiText; confirm.
new_tokenizer.decode(new_tokenizer.encode("Hello, y'all! How are you 😁?"))

"<BOS_TOKEN>Hello, y'all! How are you ���?"

In [18]:
# Inspect the retrained tokenizer's repr: vocabulary size, special tokens and
# the added-token table.
new_tokenizer

CohereTokenizerFast(name_or_path='CohereForAI/aya-23-8B', vocab_size=25000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<BOS_TOKEN>', 'eos_token': '<|END_OF_TURN_TOKEN|>', 'pad_token': '<PAD>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<PAD>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<UNK>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<CLS>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<SEP>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<MASK_TOKEN>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<BOS_TOKEN>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: AddedTo

In [20]:
# Check which tokenizer class the retrained tokenizer is an instance of.
type(new_tokenizer)

transformers.models.cohere.tokenization_cohere_fast.CohereTokenizerFast

### ByteLevelBPE Tokenizer

In [5]:
from tokenizers import ByteLevelBPETokenizer

In [9]:
from datasets import load_dataset

# Corpus for this section: the "train" split of the raw WikiText-103
# configuration.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# the download did not finish in the recorded run.
dataset = load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split="train",
)

# Rows per batch fed to the tokenizer trainer.
batch_size = 1000

# Every batch of the "text" column, built eagerly in memory.
# NOTE(review): not referenced by any visible cell; confirm it is still needed.
all_texts = [
    dataset[begin : begin + batch_size]["text"]
    for begin in range(0, len(dataset), batch_size)
]
def batch_iterator():
    """Stream the "text" column of ``dataset`` in ``batch_size``-row windows.

    Depends on the module-level ``dataset`` and ``batch_size``. Windows are
    emitted in order and the trailing window may be shorter.
    """
    for offset in range(0, len(dataset), batch_size):
        window = slice(offset, offset + batch_size)
        yield dataset[window]["text"]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Fresh, untrained byte-level BPE tokenizer; it is trained in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train the byte-level BPE tokenizer on the streamed batches, requesting a
# 25000-entry vocabulary and registering five special tokens.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=25000,
    min_frequency=2,
    special_tokens=["<UNK>", "<CLS>", "<SEP>", "<PAD>", "<MASK>"],
)






In [40]:
# Encode the sample sentence with the byte-level tokenizer.
output = tokenizer.encode("Hello, y'all! How are you 😁?")


In [41]:
# Show the Encoding summary: token count and the attributes it exposes.
output

Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [32]:
# Decode the ids back to text; the printed result matches the input sentence,
# emoji included.
print(tokenizer.decode(output.ids))

Hello, y'all! How are you 😁?


In [33]:
# Inspect the byte-level pre-tokenization: (piece, (start, end)) pairs, where a
# leading space appears as "Ġ" and the emoji as its byte-level characters.
tokenizer.pre_tokenizer.pre_tokenize_str("Hello, y'all! How are you 😁?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('Ġy', (6, 8)),
 ("'", (8, 9)),
 ('all', (9, 12)),
 ('!', (12, 13)),
 ('ĠHow', (13, 17)),
 ('Ġare', (17, 21)),
 ('Ġyou', (21, 25)),
 ('ĠðŁĺģ?', (25, 28))]

In [39]:
# Peek at ten (token, id) pairs instead of dumping the whole vocabulary.
list(tokenizer.get_vocab().items())[:10]

[('vogns', 17925),
 ('Ġ272', 24025),
 ('omes', 2346),
 ('Ġpunishment', 9756),
 ('ĠTric', 21199),
 ('ĠRelief', 21856),
 ('ĠTrotternish', 23346),
 ('enant', 4555),
 ('Ġabsor', 6931),
 ('adm', 3761)]

In [34]:
# Write the trained tokenizer to a JSON file; the relative path resolves
# against the kernel's working directory.
# NOTE(review): execution counts in this section are out of order (this cell is
# In [34] but sits after In [39]); re-run top to bottom before relying on it.
tokenizer.save("bytelevelbpe-tokenizer-wiki.json")