# 🔵 **Import Libraries**

In [1]:
import os
import time

import datasets
import numpy as np
import tiktoken
import tokenizers
import transformers
from datasets import load_dataset
from IPython.display import Markdown, display
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
from transformers import AutoTokenizer

In [2]:
# Record the version of every library this notebook depends on.
for lib in (transformers, datasets, tokenizers, tiktoken, np):
    print(f'{lib.__name__}: ', lib.__version__)

transformers:  4.52.3
datasets:  3.6.0
tokenizers:  0.21.1
tiktoken:  0.9.0
numpy:  1.26.4


# 🔵 **TinyStories Dataset**

## 🔵 **Load Dataset**

In [3]:
# Load the TinyStories dataset by its repository id; the trailing expression
# displays the returned DatasetDict so the available splits are visible.
dataset = load_dataset("roneneldan/TinyStories")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [4]:
# Pull the two splits out of the DatasetDict and report their sizes in millions of rows.
train_dataset, valid_dataset = dataset['train'], dataset['validation']
for split_name, split in (('train', train_dataset), ('valid', valid_dataset)):
    print(f'Number of {split_name} rows(stories): ', split.num_rows/1e6, 'Million rows(stories)')

Number of train rows(stories):  2.119719 Million rows(stories)
Number of valid rows(stories):  0.02199 Million rows(stories)


In [5]:
# Preview the first five training stories. A plain loop is used instead of a
# list comprehension so the cell does not build (and display) a list of None
# values returned by print().
for i in range(5):
    print(f'story{i+1}: \n\n', train_dataset[i]['text'], '\n')

story1: 

 One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together. 

story2: 

 Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.

One day, Beep was driving in the park when he saw a big tree. The tree had many lea

[None, None, None, None, None]

## 🔵 **Tokenize**

In [15]:
# Special tokens shared by the tokenizer, the trainer and the post-processor.
special_tokens = ['[UNK]', '[CLS]', '[SEP]']

# BPE tokenizer with byte-level pre-tokenization and a matching byte-level decoder.
model = models.BPE(unk_token='[UNK]')
tokenizer = Tokenizer(model=model)
tokenizer.add_special_tokens(special_tokens)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False)

# Wrap every encoded sequence as "[CLS] ... [SEP]". The ids are looked up only
# after the special tokens have been registered on the tokenizer above.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $0 [SEP]",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

# Trainer for a 50,000-entry vocabulary; used by the training cell below.
trainer = trainers.BpeTrainer(
    vocab_size=50000,
    min_frequency=0,
    show_progress=True,
    special_tokens=special_tokens,
)

In [17]:
# Join every validation story into one space-separated string; later cells use
# `sample` as the text for the encode/decode benchmarks. Show its first 100 characters.
sample = " ".join(valid_dataset['text'])
sample[0:100]

'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled a'

### 🔵 **Train Tokenizer**

In [19]:
# Load the cached 50k-vocabulary tokenizer when it exists; otherwise train it on
# the TinyStories training split and cache the result for later runs.
tokenizer_path = '../PretrainedBPETokenizers/bpe_tokenizer-50k.json'
if os.path.exists(tokenizer_path):
    # An explicit existence check replaces the former bare `except:`, which also
    # swallowed unrelated failures (including KeyboardInterrupt) and silently
    # fell back to a full retraining run. from_file is called on the Tokenizer
    # class rather than on whatever object `tokenizer` currently refers to.
    tokenizer = Tokenizer.from_file(tokenizer_path)
else:
    tokenizer.train_from_iterator(train_dataset['text'], trainer)
    # Create the target directory first so a finished training run is not lost
    # to a failing save.
    os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
    tokenizer.save(tokenizer_path)

In [20]:
# Encode the whole validation text and show the first 10 produced tokens.
encoded_text = tokenizer.encode(sample)
encoded_text.tokens[0:10]

['[CLS]',
 'Spot',
 '.',
 'ĠSpot',
 'Ġsaw',
 'Ġthe',
 'Ġshiny',
 'Ġcar',
 'Ġand',
 'Ġsaid']

In [22]:
sample[0:100]

'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled a'

In [23]:
tokenizer.decode(encoded_text.ids)[0:100]

'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled a'

### 🔵 **tokenizer-20k**

In [27]:
# Load the pretrained 20k-vocabulary BPE tokenizer from disk. from_file is called
# on the Tokenizer class (not on the current `tokenizer` object) so the cell does
# not depend on what `tokenizer` happens to be bound to when it runs.
tokenizer = Tokenizer.from_file('../PretrainedBPETokenizers/bpe_tokenizer-20k.json')

In [29]:
len(tokenizer.get_vocab())

20000

In [31]:
# Order the vocabulary by the value each token maps to and show the first 13 tokens.
vocab = tokenizer.get_vocab()
sorted_vocab = {token: vocab[token] for token in sorted(vocab, key=vocab.get)}
list(sorted_vocab)[:13]

['[UNK]', '[CLS]', '[SEP]', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [33]:
list(sorted_vocab.keys())[-20:]

['Ġglancing',
 'Ġlists',
 'Ġuncles',
 'ffled',
 'Ġdisliked',
 'Ġamusing',
 'ĠRy',
 'ĠRiya',
 'Ġbaffled',
 'ĠDoc',
 'ĠDovey',
 'sterious',
 'shade',
 'Ġshopped',
 'Ġsquirting',
 'ĠKiss',
 'clear',
 'Ġwelled',
 'Ġcauses',
 'ĠOxy']

In [168]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample).ids  # keep only the integer ids of the encoding
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 16986.18 ± 273.01 (ms)
decoding time: 1465.94 ± 31.05 (ms)


### 🔵 **tokenizer-10k**

In [175]:
# Load the pretrained 10k-vocabulary BPE tokenizer from disk. from_file is called
# on the Tokenizer class (not on the current `tokenizer` object) so the cell does
# not depend on what `tokenizer` happens to be bound to when it runs.
tokenizer = Tokenizer.from_file('../PretrainedBPETokenizers/bpe_tokenizer-10k.json')

In [176]:
len(tokenizer.get_vocab())

10000

In [179]:
# Order the vocabulary by the value each token maps to and show the first 13 tokens.
vocab = tokenizer.get_vocab()
sorted_vocab = {token: vocab[token] for token in sorted(vocab, key=vocab.get)}
list(sorted_vocab)[:13]

['[UNK]', '[CLS]', '[SEP]', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [181]:
list(sorted_vocab.keys())[-20:]

['ĠCarol',
 'Ġsnuggling',
 'Ġplumber',
 'Ġrestoring',
 'Ġwinners',
 'Ġpounding',
 'ĠStanley',
 'elve',
 'Ġswoop',
 'Ġfeathered',
 'Ġorganised',
 'Ġtaps',
 'Ġfashionable',
 'Ġstinky',
 'Ġgorgeous',
 'Ġrider',
 'Ġdeserves',
 'ĠPlay',
 'Ġmacaroni',
 'ĠNell']

In [183]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample).ids  # keep only the integer ids of the encoding
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 17835.52 ± 732.87 (ms)
decoding time: 1461.94 ± 16.32 (ms)


### 🔵 **tokenizer-5k**

In [184]:
# Load the pretrained 5k-vocabulary BPE tokenizer from disk. from_file is called
# on the Tokenizer class (not on the current `tokenizer` object) so the cell does
# not depend on what `tokenizer` happens to be bound to when it runs.
tokenizer = Tokenizer.from_file('../PretrainedBPETokenizers/bpe_tokenizer-5k.json')

In [185]:
len(tokenizer.get_vocab())

5000

In [186]:
# Order the vocabulary by the value each token maps to and show the first 13 tokens.
vocab = tokenizer.get_vocab()
sorted_vocab = {token: vocab[token] for token in sorted(vocab, key=vocab.get)}
list(sorted_vocab)[:13]

['[UNK]', '[CLS]', '[SEP]', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [187]:
list(sorted_vocab.keys())[-20:]

['Ġsneezed',
 'Ġsport',
 'Ġwizard',
 'ĠEach',
 'Ġupstairs',
 'Ġthrilled',
 'Ġprovide',
 'Ġbrain',
 'Ġawe',
 'Ġdiscuss',
 'Jake',
 'Ġpanic',
 'Ġtwisted',
 'Ġglowed',
 'Ġuniverse',
 'Ġfiref',
 'Ġtherm',
 'Ġvacation',
 'Ġworse']

In [197]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample).ids  # keep only the integer ids of the encoding
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 19151.33 ± 1104.25 (ms)
decoding time: 1553.28 ± 62.15 (ms)


### 🔵 **tokenizer-1k**

In [199]:
# Load the pretrained 1k-vocabulary BPE tokenizer from disk. from_file is called
# on the Tokenizer class (not on the current `tokenizer` object) so the cell does
# not depend on what `tokenizer` happens to be bound to when it runs.
tokenizer = Tokenizer.from_file('../PretrainedBPETokenizers/bpe_tokenizer-1k.json')

In [200]:
len(tokenizer.get_vocab())

1000

In [201]:
# Order the vocabulary by the value each token maps to and show the first 13 tokens.
vocab = tokenizer.get_vocab()
sorted_vocab = {token: vocab[token] for token in sorted(vocab, key=vocab.get)}
list(sorted_vocab)[:13]

['[UNK]', '[CLS]', '[SEP]', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [202]:
list(sorted_vocab.keys())[-20:]

['Ġde',
 'able',
 'Ġimportant',
 'Ġremember',
 'Ġfish',
 'llow',
 'Ġsound',
 'Ġslide',
 'Ġus',
 'maz',
 'Ġreplied',
 'Ġamaz',
 'Ġac',
 'lease',
 'Ġwork',
 'Ġwatch',
 'Ġrain',
 'Ġshowed',
 'Ġrabbit',
 'Ġokay']

In [203]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample).ids  # keep only the integer ids of the encoding
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 21763.89 ± 595.26 (ms)
decoding time: 1851.72 ± 52.17 (ms)


### 🔵 **tokenizer-500**

In [205]:
# Load the pretrained 500-entry BPE tokenizer from disk. from_file is called
# on the Tokenizer class (not on the current `tokenizer` object) so the cell does
# not depend on what `tokenizer` happens to be bound to when it runs.
tokenizer = Tokenizer.from_file('../PretrainedBPETokenizers/bpe_tokenizer-500.json')

In [206]:
len(tokenizer.get_vocab())

500

In [207]:
# Order the vocabulary by the value each token maps to and show the first 13 tokens.
vocab = tokenizer.get_vocab()
sorted_vocab = {token: vocab[token] for token in sorted(vocab, key=vocab.get)}
list(sorted_vocab)[:13]

['[UNK]', '[CLS]', '[SEP]', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [208]:
list(sorted_vocab.keys())[-20:]

['Ġmu',
 'ook',
 'ach',
 'Ġagain',
 'Ġhome',
 'Ġwhen',
 'Ġgood',
 'hat',
 'ep',
 'Ġwho',
 'est',
 'ried',
 'Ġfound',
 'Ġfl',
 'Ġthen',
 'Ġch',
 'Ġdec',
 'pped',
 'Ġwal',
 'as']

In [210]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample).ids  # keep only the integer ids of the encoding
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 22945.48 ± 857.94 (ms)
decoding time: 2191.75 ± 116.26 (ms)


### 🔵 **Pretrained-Tokenizer**

#### 🔵 **Bert**

In [None]:
# Load the pretrained tokenizer identified by 'bert-base-cased' and display it.
# From here on `tokenizer` is a transformers tokenizer, not the custom BPE one.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer

In [214]:
tokenizer.vocab_size

28996

In [217]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 18388.68 ± 1615.23 (ms)
decoding time: 7171.89 ± 395.4 (ms)


#### 🔵 **Bert-Fast**

In [None]:
# Load the pretrained tokenizer identified by 'bert-base-uncased' and display it.
# NOTE(review): the section heading says "Bert-Fast", but the only visible
# difference from the previous "Bert" section is the uncased checkpoint —
# confirm whether a fast/slow comparison (e.g. use_fast) was intended.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer

In [220]:
tokenizer.vocab_size

30522

In [221]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

Token indices sequence length is longer than the specified maximum sequence length for this model (4583858 > 512). Running this sequence through the model will result in indexing errors


encoding time: 19460.61 ± 594.38 (ms)
decoding time: 6955.56 ± 295.61 (ms)


#### 🔵 **GPT-2**

In [None]:
# Load the pretrained tokenizer identified by 'gpt2' and display it.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer

In [224]:
tokenizer.vocab_size

50257

In [225]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

Token indices sequence length is longer than the specified maximum sequence length for this model (4741881 > 1024). Running this sequence through the model will result in indexing errors


encoding time: 18704.84 ± 1108.64 (ms)
decoding time: 1768.74 ± 91.4 (ms)


#### 🔵 **GPT-Neo-2.7B**

In [None]:
# Load the pretrained tokenizer identified by 'EleutherAI/gpt-neo-2.7B' and display it.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
tokenizer

In [228]:
tokenizer.vocab_size

50257

In [229]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

Token indices sequence length is longer than the specified maximum sequence length for this model (4741881 > 2048). Running this sequence through the model will result in indexing errors


encoding time: 22150.48 ± 4800.37 (ms)
decoding time: 2202.27 ± 1282.19 (ms)


#### 🔵 **GPT-Neo-125m**

In [None]:
# Load the pretrained tokenizer identified by 'EleutherAI/gpt-neo-125m' and display it.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer

In [232]:
tokenizer.vocab_size

50257

In [233]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

Token indices sequence length is longer than the specified maximum sequence length for this model (4741881 > 2048). Running this sequence through the model will result in indexing errors


encoding time: 18917.33 ± 516.09 (ms)
decoding time: 2041.46 ± 27.37 (ms)


#### 🔵 **DeepSeek-v3**

In [None]:
# Load the pretrained tokenizer identified by 'deepseek-ai/DeepSeek-V3-0324' and display it.
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-V3-0324')
tokenizer

In [236]:
tokenizer.vocab_size

128000

In [253]:
# Benchmark 10 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(10):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 20907.09 ± 189.09 (ms)
decoding time: 1638.41 ± 24.24 (ms)


#### 🔵 **tiktoken**

In [47]:
# Get tiktoken's 'gpt2' encoding and display it; `tokenizer` now refers to a
# tiktoken Encoding rather than a transformers tokenizer.
tokenizer = tiktoken.get_encoding('gpt2')
tokenizer

<Encoding 'gpt2'>

In [49]:
# Benchmark 100 encode/decode round trips of `sample` with the current `tokenizer`.
en = []  # encode durations in seconds, one per run
de = []  # decode durations in seconds, one per run
for i in range(100):
    # perf_counter() is monotonic and high-resolution, so the measured intervals
    # cannot be skewed by wall-clock adjustments the way time.time() deltas can.
    en_s = time.perf_counter()
    ids = tokenizer.encode(sample)
    en.append(time.perf_counter() - en_s)
    de_s = time.perf_counter()
    out = tokenizer.decode(ids)
    de.append(time.perf_counter() - de_s)

# Report mean ± standard deviation, converted from seconds to milliseconds.
print(f"encoding time: {round(np.mean(en).item()*1e3, 2)} ± {round(np.std(en)*1e3, 2)} (ms)")
print(f"decoding time: {round(np.mean(de).item()*1e3, 2)} ± {round(np.std(de)*1e3, 2)} (ms)")

encoding time: 2437.79 ± 462.11 (ms)
decoding time: 200.31 ± 34.38 (ms)


In [106]:
# Load pretrained word vectors by name. Alternative model names to try:
# glove-wiki-gigaword-300
# fasttext-wiki-news-subwords-300
# NOTE(review): `api` is not imported in the visible cells — presumably
# `import gensim.downloader as api`; confirm and add it to the import cell.
word2vec_model = api.load("word2vec-google-news-300")

In [107]:
word2vec_model

<gensim.models.keyedvectors.KeyedVectors at 0x1879ef77bf0>

In [109]:
word2vec_model.vectors.shape

(3000000, 300)

In [110]:
word2vec_model['king'].shape

(300,)

In [111]:
word2vec_model.most_similar('king', topn = 5)

[('kings', 0.7138045430183411),
 ('queen', 0.6510957479476929),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474)]

In [112]:
# king - man + woman = queen
word2vec_model.most_similar(positive = ['king', 'woman'], negative = ['man'], topn = 2)

[('queen', 0.7118193507194519), ('monarch', 0.6189674735069275)]

In [113]:
word2vec_model.most_similar(positive = ['bird', 'water'], negative = ['sky'], topn = 2)

[('birds', 0.5283597111701965), ('freshwater', 0.5271045565605164)]

In [114]:
word2vec_model.most_similar(positive = ['school', 'teacher'], negative = ['student'], topn = 2)

[('elementary', 0.7351498603820801), ('teachers', 0.6516026854515076)]

In [115]:
word2vec_model.most_similar(positive = ['night', 'light'], negative = ['dark'], topn = 2)

[('evening', 0.5596994757652283), ('afternoon', 0.5337650775909424)]

In [116]:
word2vec_model.similarity('night', 'light')

0.103139475

In [117]:
word2vec_model.similarity('night', 'dark')

0.22546582

In [118]:
word2vec_model.similarity('night', 'black')

0.082372606

In [119]:
word2vec_model.similarity('school', 'teacher')

0.6382406

In [120]:
word2vec_model.similarity('school', 'student')

0.6055627

In [121]:
word2vec_model.similarity('teacher', 'student')

0.63013655

In [122]:
word2vec_model.similarity('white', 'black')

0.80922145

In [123]:
word2vec_model.similarity('white', 'light')

0.28512627

In [150]:
# Toy corpus of three pre-tokenized sentences used to train a small Word2Vec model.
# NOTE(review): 'gereat' looks like a typo for 'great'; it is left unchanged here
# because it is training data and the displayed outputs were produced with it.
sentences = [['I', 'love', 'natural', 'language', 'processing'], 
             ['Word2vec', 'is' ,'a', 'gereat', 'tool'],
             ['I', 'enjoy', 'working', 'with', 'text', 'data']]
sentences

[['I', 'love', 'natural', 'language', 'processing'],
 ['Word2vec', 'is', 'a', 'gereat', 'tool'],
 ['I', 'enjoy', 'working', 'with', 'text', 'data']]

In [152]:
# Train a Word2Vec model with 100-dimensional vectors on `sentences`.
# This rebinds `model`, which the tokenizer setup cell used for the BPE model.
# NOTE(review): `Word2Vec` is not imported in the visible cells — presumably
# `from gensim.models import Word2Vec`; confirm and add it to the import cell.
# NOTE(review): no seed is passed and workers = 2, so the similarity values shown
# below may not reproduce exactly between runs — confirm whether that matters.
model = Word2Vec(sentences, vector_size = 100, window = 5, min_count = 1, workers = 2)

In [154]:
model

<gensim.models.word2vec.Word2Vec at 0x18a5ce4fd10>

In [170]:
model.wv['tool'].shape

(100,)

In [176]:
len(model.wv), model.wv.vectors.shape

(15, (15, 100))

In [182]:
model.wv.most_similar('I', topn = 2)

[('is', 0.21617145836353302), ('language', 0.09310111403465271)]

In [188]:
model.wv.similarity('is', 'a')

0.044689223

In [223]:
# Tokenize every validation story into a list of words and peek at the first
# ten words of the first story. Note that `sample` is rebound here from the
# joined string used by the benchmarks to the list of individual stories.
sample = valid_dataset['text']
sentences = list(map(simple_preprocess, sample))
sentences[0][:10]

['spot', 'spot', 'saw', 'the', 'shiny', 'car', 'and', 'said', 'wow', 'kitty']

In [224]:
# Train a Word2Vec model with 100-dimensional vectors on `sentences`, replacing
# whatever `model` was bound to before.
# NOTE(review): `Word2Vec` is not imported in the visible cells — presumably
# `from gensim.models import Word2Vec`; confirm and add it to the import cell.
# NOTE(review): no seed is passed and workers = 2, so the neighbours and
# similarities shown below may not reproduce exactly between runs — confirm.
model = Word2Vec(sentences, vector_size = 100, window = 5, min_count = 1, workers = 2)

In [225]:
model.wv.vectors.shape

(11705, 100)

In [226]:
model.wv.most_similar('dark', topn = 5)

[('gloomy', 0.7080708146095276),
 ('lightning', 0.6315677165985107),
 ('darker', 0.6087536215782166),
 ('cloud', 0.5847439765930176),
 ('storm', 0.5832608938217163)]

In [227]:
model.wv.most_similar('school', topn = 5)

[('meeting', 0.7267405986785889),
 ('restaurant', 0.7121162414550781),
 ('zoo', 0.6936815977096558),
 ('class', 0.6633730530738831),
 ('airport', 0.6617036461830139)]

In [228]:
model.wv.similarity('king', 'queen')

0.8407515

In [229]:
model.wv['king'].shape

(100,)