In [1]:
from multi_tokenizer import MultiTokenizer, PretrainedTokenizers
from transformers import AutoTokenizer

Use `split_text=True` to split the text into sentences, which helps improve
language detection accuracy.

In [4]:
# Tokenizer passed as the second MultiTokenizer argument
# (presumably used when no language-specific tokenizer applies — confirm).
fallback_tokenizer = PretrainedTokenizers.ENGLISH

# One pretrained tokenizer per language this demo covers.
lang_tokenizers = [
    PretrainedTokenizers.ENGLISH,
    PretrainedTokenizers.CHINESE,
    PretrainedTokenizers.HINDI,
]

# split_text=True splits the input into sentences before language detection.
tokenizer = MultiTokenizer(
    lang_tokenizers,
    fallback_tokenizer,
    split_text=True,
)

In [17]:
# Alternative inputs kept for quick experimentation (uncomment one to try it):
# sentence = "The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है."
# sentence = "Translate this hindi sentence to english - बिल्ली बहुत प्यारी है."
# Active input: English, Chinese and Hindi sentences followed by a sentence in a
# script (Sinhala) for which no language tokenizer is configured in this notebook.
sentence = "The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ."
# Bare last expression so the notebook displays the (token, (start, end)) pairs.
# NOTE(review): in the recorded output the spans around closing tags overlap
# (e.g. '.' at (16, 17) is followed by '</EN>' at (15, 16)) — confirm whether the
# offsets refer to the original text or to the tagged text.
tokenizer.pre_tokenize(sentence)

[('<EN>', (0, 1)),
 ('The', (1, 4)),
 ('Ġcat', (4, 8)),
 ('Ġis', (8, 11)),
 ('Ġcute', (11, 16)),
 ('.', (16, 17)),
 ('</EN>', (15, 16)),
 (' ', (16, 17)),
 ('<ZH>', (17, 18)),
 ('çĮ«å¾Īåı¯çĪ±', (18, 22)),
 ('.', (22, 23)),
 ('</ZH>', (21, 22)),
 (' ', (22, 23)),
 ('<HI>', (23, 24)),
 ('à¤¬', (24, 25)),
 ('à¤¿', (25, 26)),
 ('à¤²', (26, 27)),
 ('à¥į', (27, 28)),
 ('à¤²', (28, 29)),
 ('à¥Ģ', (29, 30)),
 ('Ġà¤¬à¤¹', (30, 33)),
 ('à¥ģ', (33, 34)),
 ('à¤¤', (34, 35)),
 ('Ġà¤ª', (35, 37)),
 ('à¥į', (37, 38)),
 ('à¤¯', (38, 39)),
 ('à¤¾', (39, 40)),
 ('à¤°', (40, 41)),
 ('à¥Ģ', (41, 42)),
 ('Ġà¤¹', (42, 44)),
 ('à¥Ī.', (44, 46)),
 ('</HI>', (44, 45)),
 (' නර්තනය ඉතා ආදරේ.', (45, 62))]

In [18]:
# Character length of the input; the recorded value (62) matches the end offset
# of the last pre-token shown by pre_tokenize.
len(sentence)

62

In [19]:
# Vocabulary size reported by the MultiTokenizer.
tokenizer.get_vocab_size()

25000

In [20]:
# Run the MultiTokenizer on the same sentence twice: once for the token strings
# and once for the integer ids. Both are inspected in the following cells.
tokens = tokenizer.tokenize(sentence)
ids = tokenizer.encode(sentence)

In [21]:
# Show the token strings and their ids on consecutive lines.
print(tokens, ids, sep="\n")

['<EN>', 'The', 'Ġcat', 'Ġis', 'Ġcute', '.', '</EN>', 'Ġ', '<ZH>', 'çĮ', '«', 'å¾Ī', 'åı¯', 'çĪ', '±', '.', '</ZH>', 'Ġ', '<HI>', 'à¤¬', 'à¤¿', 'à¤²', 'à¥į', 'à¤²', 'à¥Ģ', 'Ġà¤¬à¤¹', 'à¥ģ', 'à¤¤', 'Ġà¤ª', 'à¥į', 'à¤¯', 'à¤¾', 'à¤°', 'à¥Ģ', 'Ġà¤¹', 'à¥Ī.', '</HI>', 'Ġ', 'à', '¶', '±', 'à', '¶', '»', 'à', '·', 'Ĭ', 'à', '¶', 'Ń', 'à', '¶', '±', 'à', '¶', 'º', 'Ġ', 'à', '¶', 'ī', 'à', '¶', 'Ń', 'à', '·', 'ı', 'Ġ', 'à', '¶', 'Ĩ', 'à', '¶', '¯', 'à', '¶', '»', 'à', '·', 'ļ', '.']
[3, 383, 714, 416, 2065, 24, 4, 231, 7, 1512, 115, 9849, 368, 439, 120, 24, 8, 231, 9, 329, 277, 285, 282, 285, 273, 342, 286, 283, 294, 282, 292, 270, 272, 273, 287, 919, 10, 231, 167, 125, 120, 167, 125, 130, 167, 126, 243, 167, 125, 266, 167, 125, 120, 167, 125, 129, 231, 167, 125, 242, 167, 125, 266, 167, 126, 248, 231, 167, 125, 239, 167, 125, 118, 167, 125, 130, 167, 126, 259, 24]


In [22]:
# Number of tokens the MultiTokenizer produced, language tags such as
# '<EN>' / '</EN>' included.
len(tokens)

80

In [23]:
# Round-trip check: decode the MultiTokenizer ids and print the result next to
# the original sentence for a visual comparison.
roundtrip_text = tokenizer.decode(ids)
print("Decoded String:", roundtrip_text)
print("Original String:", sentence)

Decoded String: The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.
Original String: The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.


In [24]:
# Baseline for comparison: load the pretrained tokenizer published under the
# "CohereForAI/aya-23-8B" model id. AutoTokenizer is imported in the notebook's
# first (imports) cell so that all dependencies are declared in one place.
aya_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-23-8B")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
# Tokenize the same sentence with the Aya tokenizer for comparison. The result
# gets its own name so the MultiTokenizer's `tokens` from the earlier cell is
# not overwritten (re-running `len(tokens)` keeps reporting the same value).
aya_tokens = aya_tokenizer.tokenize(sentence)
print(len(aya_tokens))
print(aya_tokens)

54
['The', 'Ġcat', 'Ġis', 'Ġcute', '.', 'Ġ', 'çĮ«', 'å¾Ī', 'åı¯', 'çĪ±', '.', 'Ġà¤¬', 'à¤¿', 'à¤²', 'à¥į', 'à¤²', 'à¥Ģ', 'Ġà¤¬à¤¹', 'à¥ģ', 'à¤¤', 'Ġà¤ª', 'à¥į', 'à¤¯', 'à¤¾', 'à¤°', 'à¥Ģ', 'Ġà¤¹', 'à¥Ī.', 'Ġà¶', '±', 'à¶', '»', 'à·Ĭ', 'à¶', 'Ń', 'à¶', '±', 'à¶', 'º', 'Ġà¶', 'ī', 'à¶', 'Ń', 'à·', 'ı', 'Ġà¶', 'Ĩ', 'à¶', '¯', 'à¶', '»', 'à·', 'ļ', '.']


In [26]:
# Number of entries in the Aya tokenizer's vocabulary mapping, for comparison
# with the MultiTokenizer vocabulary size reported earlier.
len(aya_tokenizer.get_vocab())

255029

In [27]:
# Encode the sentence with the Aya tokenizer.
# NOTE(review): this rebinds `ids`, replacing the MultiTokenizer ids computed
# earlier. Re-running the earlier decode cell after this one would feed Aya ids
# to the MultiTokenizer — consider a dedicated name (and update the next cell).
ids = aya_tokenizer.encode(sentence)
print(ids)

[5, 2162, 8592, 1801, 44997, 21, 228, 50826, 14441, 4996, 15069, 21, 5144, 2337, 2973, 2173, 2973, 2491, 34215, 3695, 2560, 3468, 2173, 2857, 2054, 2169, 2491, 3372, 208810, 187055, 117, 77103, 127, 200914, 77103, 263, 77103, 117, 77103, 126, 187055, 239, 77103, 263, 71791, 245, 187055, 236, 77103, 115, 77103, 127, 71791, 256, 21]


In [28]:
# Round-trip check for the Aya tokenizer. The recorded decoded text starts with
# "<BOS_TOKEN>"; presumably encode() prepended it (the printed id list starts
# with 5) — confirm.
aya_roundtrip_text = aya_tokenizer.decode(ids)
print("Decoded String:", aya_roundtrip_text)
print("Original String:", sentence)

Decoded String: <BOS_TOKEN>The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.
Original String: The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.


In [29]:
# Measure how long the Aya tokenizer takes to encode the single test sentence.
%timeit aya_tokenizer.encode(sentence)

243 μs ± 29.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [31]:
# Same measurement for the MultiTokenizer, to compare against the Aya timing.
%timeit tokenizer.encode(sentence)

892 μs ± 24.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
