# Tokenizers

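In this notebook we explore the world of tokenizers: how text is encoded into token IDs, decoded back into text, and wrapped in model-specific chat templates.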


In [1]:
!pip install -q transformers

In [2]:
from transformers import AutoTokenizer

In [5]:
from google.colab import userdata
from huggingface_hub import login
# Fetch the Hugging Face access token from Colab's `userdata` store instead of
# hardcoding it in the notebook, then authenticate with the Hub.
# add_to_git_credential=True asks `login` to also register the token for git.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)


In [20]:
# Load the tokenizer published with the Llama 3.1 8B base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B"
)

# Sample sentence that is tokenized throughout the notebook.
text = "I am excited to show Tokenizers in action to my LLM engineers"

# encode() turns the string into a list of integer token ids.
tokens = tokenizer.encode(text)
print(tokens)

[128000, 40, 1097, 12304, 311, 1501, 9857, 12509, 304, 1957, 311, 856, 445, 11237, 25175]


In [22]:
# Map the token ids back to a single string; special tokens that encode()
# inserted (such as the begin-of-text marker) show up in the result.
decoded_text = tokenizer.decode(tokens)
print(decoded_text)

<|begin_of_text|>I am excited to show Tokenizers in action to my LLM engineers


In [23]:
# Passing the flat list of ids to batch_decode() decodes every id on its own,
# giving one text fragment per token rather than one joined string.
individual_tokens = tokenizer.batch_decode(tokens)
print(individual_tokens)

['<|begin_of_text|>', 'I', ' am', ' excited', ' to', ' show', ' Token', 'izers', ' in', ' action', ' to', ' my', ' L', 'LM', ' engineers']


In [24]:
# Report the tokenizer's vocabulary size.
# NOTE(review): vocab_size appears to count only the base vocabulary and not
# the added special tokens; use len(tokenizer) if the full count is wanted.
print(f"vocab size is {tokenizer.vocab_size}")

vocab size is 128000


In [25]:
# get_added_vocab() returns a dict mapping each added token string to its id;
# for this tokenizer those are the special/reserved control tokens.
special_tokens = tokenizer.get_added_vocab()
# Prints the whole mapping, which is long for this tokenizer.
print(special_tokens)

{'<|begin_of_text|>': 128000, '<|end_of_text|>': 128001, '<|reserved_special_token_0|>': 128002, '<|reserved_special_token_1|>': 128003, '<|finetune_right_pad_id|>': 128004, '<|reserved_special_token_2|>': 128005, '<|start_header_id|>': 128006, '<|end_header_id|>': 128007, '<|eom_id|>': 128008, '<|eot_id|>': 128009, '<|python_tag|>': 128010, '<|reserved_special_token_3|>': 128011, '<|reserved_special_token_4|>': 128012, '<|reserved_special_token_5|>': 128013, '<|reserved_special_token_6|>': 128014, '<|reserved_special_token_7|>': 128015, '<|reserved_special_token_8|>': 128016, '<|reserved_special_token_9|>': 128017, '<|reserved_special_token_10|>': 128018, '<|reserved_special_token_11|>': 128019, '<|reserved_special_token_12|>': 128020, '<|reserved_special_token_13|>': 128021, '<|reserved_special_token_14|>': 128022, '<|reserved_special_token_15|>': 128023, '<|reserved_special_token_16|>': 128024, '<|reserved_special_token_17|>': 128025, '<|reserved_special_token_18|>': 128026, '<|rese

In [26]:
# Conversation in the role/content list format that apply_chat_template()
# is given further down: one system instruction followed by one user turn.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell a light-hearted joke for Data scientists"},
]

In [28]:
# Switch `tokenizer` to the instruction-tuned Llama 3.1 checkpoint, which is
# the one used with apply_chat_template() below.
# trust_remote_code is deliberately left at its default: the base Llama 3.1
# tokenizer loads without it, and enabling it would permit code shipped in the
# model repository to run locally.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

In [29]:
# Render the conversation with the model's chat template.
#   tokenize=False              -> return the formatted string, not token ids
#   add_generation_prompt=True  -> end with the header that opens the assistant turn
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [30]:
# Show the fully formatted prompt, including the template's special tokens.
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for Data scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [31]:
# Sentence used to compare how different tokenizers split the same input.
text = "I am excited to show Tokenizers in action to my LLM engineers"

In [35]:
# Hugging Face Hub model ids of the other tokenizers compared in this notebook;
# each one is passed to AutoTokenizer.from_pretrained() in a later cell.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [33]:
# Load the Phi-3 tokenizer and encode the same sample sentence with it,
# so its token ids can be compared with the other models' output.
phi3_tokenizer = AutoTokenizer.from_pretrained(
    PHI3_MODEL_NAME
)
tokens = phi3_tokenizer.encode(text)
print(tokens)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

[306, 626, 24173, 304, 1510, 25159, 19427, 297, 3158, 304, 590, 365, 26369, 6012, 414]


In [36]:
# Qwen2: load the tokenizer and encode the shared sample sentence.
qwen2_tokenizer = AutoTokenizer.from_pretrained(
    QWEN2_MODEL_NAME
)
tokens = qwen2_tokenizer.encode(text)
print(tokens)

# StarCoder2: same sentence through a tokenizer from a code model.
# `tokens` is rebound here, so after this cell it holds the StarCoder2 ids.
star_coder2_tokenizer = AutoTokenizer.from_pretrained(
    STARCODER2_MODEL_NAME
)
tokens = star_coder2_tokenizer.encode(text)
print(tokens)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[40, 1079, 12035, 311, 1473, 9660, 12230, 304, 1917, 311, 847, 444, 10994, 24198]


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

[78, 3874, 13511, 13625, 391, 2539, 6520, 9266, 347, 2146, 391, 1690, 14325, 82, 6894, 502]


In [38]:
# Render the same conversation with each model's chat template (Qwen2, Phi-3,
# then the Llama 3.1 Instruct tokenizer) and print the prompts in that order,
# separated by a blank line.
print(
    "\n\n".join(
        chat_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        for chat_tokenizer in (qwen2_tokenizer, phi3_tokenizer, tokenizer)
    )
)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Tell a light-hearted joke for Data scientists<|im_end|>
<|im_start|>assistant


<|system|>
You are a helpful assistant.<|end|>
<|user|>
Tell a light-hearted joke for Data scientists<|end|>
<|assistant|>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for Data scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [39]:
# Tiny Python snippet used to see how the StarCoder2 tokenizer splits source code.
code = """
def add(a,b):
  return a+b
"""

# Encode the snippet, then print every token id beside the text it decodes to.
# decode() is given a one-element list so each id is decoded on its own.
tokens = star_coder2_tokenizer.encode(code)
for token in tokens:
    print(f"{token} -> {star_coder2_tokenizer.decode([token])}")

222 -> 

610 -> def
1035 ->  add
45 -> (
102 -> a
49 -> ,
103 -> b
731 -> ):
353 -> 
 
461 ->  return
331 ->  a
48 -> +
103 -> b
222 -> 

