In [39]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer
from basic.utils import dict_to_table
# Load settings from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [10]:
# Authenticate against the Hugging Face Hub with the token read from the
# HF_TOKEN environment variable, and also store it as a git credential.
login(
    token=os.getenv('HF_TOKEN'),
    add_to_git_credential=True,
)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Encoding

## OpenAI tokenization
Interactive tokenizer playground: https://platform.openai.com/tokenizer

In [30]:
import tiktoken

# Pick the encoding that tiktoken associates with the gpt-4o-mini model.
enc = tiktoken.encoding_for_model("gpt-4o-mini")
text = "What is the capital of Poland?"

# Text -> list of integer token IDs.
tokens = enc.encode(text)
print("Token IDs:", tokens)

# Decode one ID at a time to see which piece of text each token covers.
decoded_pieces = [enc.decode([token_id]) for token_id in tokens]
print("Decoded pieces:", decoded_pieces)

Token IDs: [4827, 382, 290, 9029, 328, 50029, 30]
Decoded pieces: ['What', ' is', ' the', ' capital', ' of', ' Poland', '?']


In [31]:
# The same word can map to different token IDs depending on its context:
# here the leading "of" and the space-prefixed " of" come out as two
# distinct tokens.
tokens = enc.encode("of of")
print("Token IDs:", tokens)
decoded_pieces = [enc.decode([token_id]) for token_id in tokens]
print("Decoded pieces:", decoded_pieces)

Token IDs: [1440, 328]
Decoded pieces: ['of', ' of']


## Llama tokenization

In [48]:
# Load the tokenizer published with the Llama 3.2 Instruct checkpoint.
# trust_remote_code is deliberately NOT enabled: that flag allows Python code
# from the Hub repository to run locally, and it is not needed here because
# the checkpoint uses a stock tokenizer class shipped with transformers.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B-Instruct')
text = "What is the capital of Poland?"

# encode() returns a list of integer token IDs; leaving `tokens` as the last
# expression displays it as the cell output.
tokens = tokenizer.encode(text)
tokens

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'LlamaTokenizerFast'.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


[128000, 3923, 374, 279, 6864, 315, 28702, 30]

In [49]:
# Round-trip the IDs back to text. The leading ID 128000 decodes to the
# <|begin_of_text|> marker, which is why it shows up in front of the sentence.
tokenizer.decode(tokens)

'<|begin_of_text|>What is the capital of Poland?'

In [50]:
# Display the tokenizer's vocabulary mapping (token string -> integer ID)
# as a table using the project's dict_to_table helper.
dict_to_table(tokenizer.vocab)

Unnamed: 0,Key,Value
0,Ġheroic,56375
1,.Today,73772
2,ĠSurveillance,76458
3,Known,49306
4,ĠFontAwesome,60011
...,...,...
128252,/site,48981
128253,zim,118301
128254,!(:,50216
128255,.Query,16060


In [51]:
def token_to_text(t, value):
    """Return every key of the mapping ``t`` whose value equals ``value``.

    Args:
        t: A dict-like object exposing ``items()``, e.g. a tokenizer
            vocabulary mapping token strings to integer IDs.
        value: The value to look up (e.g. a token ID).

    Returns:
        A list of matching keys in the mapping's iteration order; the list
        is empty when no key maps to ``value``.
    """
    matching_keys = []
    for key, mapped in t.items():
        if mapped == value:
            matching_keys.append(key)
    return matching_keys

In [52]:
# Reverse lookup in the vocabulary: 3923 is the ID that follows the
# <|begin_of_text|> marker (128000) in the encoded question.
token_to_text(tokenizer.vocab, 3923)

['What']

In [53]:
# Display the tokens added on top of the base vocabulary (the special
# <|...|> control tokens) together with their IDs.
dict_to_table(tokenizer.get_added_vocab())

Unnamed: 0,Key,Value
0,<|begin_of_text|>,128000
1,<|end_of_text|>,128001
2,<|reserved_special_token_0|>,128002
3,<|reserved_special_token_1|>,128003
4,<|finetune_right_pad_id|>,128004
...,...,...
252,<|reserved_special_token_244|>,128252
253,<|reserved_special_token_245|>,128253
254,<|reserved_special_token_246|>,128254
255,<|reserved_special_token_247|>,128255


## Tokenizing the chat conversation

In [54]:
# A short multi-turn conversation in the role/content chat format:
# one system instruction, then alternating user and assistant turns,
# ending on a user question that still awaits an answer.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that is good with geography.",
    },
    {
        "role": "user",
        "content": "What is the capital of Poland?",
    },
    {
        "role": "assistant",
        "content": "The capital of Poland is Warsaw.",
    },
    {
        "role": "user",
        "content": "And of Germany?",
    },
]

In [55]:
# Render the conversation with the model's chat template.
# tokenize=False returns the formatted text instead of token IDs, and
# add_generation_prompt=True appends the opening assistant header so the
# prompt ends where the model's reply should begin.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Sep 2025

You are a helpful assistant that is good with geography.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Poland?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of Poland is Warsaw.<|eot_id|><|start_header_id|>user<|end_header_id|>

And of Germany?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


