In [1]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [2]:
from datasets import load_dataset

ds = load_dataset('billingsmoore/LotsawaHouse-bo-en', split='train')['bo']

In [4]:
print(ds[0])
enc = tokenizer.encode(ds[0])
print(enc)
dec = tokenizer.decode(enc)
print(dec)

བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།
[3, 2, 1]
<unk></s>


In [5]:
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

255

In [6]:
print(ds[0])
enc = tokenizer.encode(ds[0])
print(enc)
dec = tokenizer.decode(enc)
print(dec)

བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།
[32186, 32279, 32111, 32188, 32111, 32181, 32168, 32111, 32199, 32283, 32166, 32111, 32184, 32196, 32214, 32111, 32199, 32283, 32111, 32199, 32111, 32185, 32277, 32166, 32111, 32196, 32190, 32199, 32111, 32199, 32224, 32113, 32113, 1]
བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།</s>


## Inspect lengths

In [8]:
enc_lst = [tokenizer.encode(elt) for elt in ds]

Token indices sequence length is longer than the specified maximum sequence length for this model (1181 > 512). Running this sequence through the model will result in indexing errors


In [9]:
enc_lengths = [len(elt) for elt in enc_lst]

In [10]:
import numpy as np

np.mean(enc_lengths)

np.float64(37.767579632643574)

## Now Custom Tokenizer

In [14]:
dataset = load_dataset('billingsmoore/LotsawaHouse-bo-en')

In [16]:
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

# Initialize and train the tokenizer
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(
    (dataset['train']['bo'] + dataset['train']['en']),
    vocab_size=32_000,
    min_frequency=5,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]", "<unk>"]
)

# Wrap the tokenizer with PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="<unk>",
    pad_token="[PAD]",  # Set padding token
    bos_token="[BOS]",
    eos_token="[EOS]"
)


# Encode and decode example
enc = tokenizer.encode(ds[0])
dec = tokenizer.decode(enc)
print(dec)




བླ་མ་དང་ལྷག་པའི་ལྷ་ལ་ཕྱག་འཚལ་ལོ།།




In [17]:
enc_lst = [tokenizer.encode(elt) for elt in ds]
enc_lengths = [len(elt) for elt in enc_lst]
np.mean(enc_lengths)

np.float64(4.6495419669844225)