In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer
from basic.utils import dict_to_table
# Load variables from a .env file into the process environment before anything
# reads them (HF_TOKEN is looked up via os.getenv in the login cell).
# override=True asks python-dotenv to let .env values replace variables that
# are already set in the environment.
load_dotenv(override=True)

In [None]:
# Authenticate with the Hugging Face Hub using the token stored in the
# HF_TOKEN environment variable (os.getenv yields None when it is unset).
# add_to_git_credential=True is forwarded unchanged to huggingface_hub.login.
login(
    token=os.getenv("HF_TOKEN"),
    add_to_git_credential=True,
)

# Encoding

## OpenAI tokenization
https://platform.openai.com/tokenizer

https://tiktokenizer.vercel.app/

In [None]:
import tiktoken

# Ask tiktoken for the encoding it associates with this model name.
enc = tiktoken.encoding_for_model("gpt-4o-mini")
text = "What is the capital of Poland?"

# Text -> list of integer token IDs.
tokens = enc.encode(text)
print("Token IDs:", tokens)

# Decode every ID on its own so each token's text is shown separately.
print("Decoded pieces:", [enc.decode([token_id]) for token_id in tokens])

In [None]:
# The same word can be tokenized differently depending on where it appears:
# compare the IDs printed for the two occurrences of "of" (the second one is
# preceded by a space).
tokens = enc.encode("of of")
print("Token IDs:", tokens)
print("Decoded pieces:", [enc.decode([token_id]) for token_id in tokens])

## Llama tokenization

In [None]:
# Load the tokenizer published under the Llama 3.2 3B Instruct repo id.
# trust_remote_code is intentionally left at its default (off): enabling it
# allows Python code downloaded from the Hub repo to run locally, and the stock
# Transformers tokenizer classes are expected to handle this checkpoint.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B-Instruct')
text = "What is the capital of Poland?"

# Encode the prompt to token IDs and display them as the cell output.
tokens = tokenizer.encode(text)
tokens

In [None]:
# Round trip: turn the token IDs produced by tokenizer.encode back into text.
# NOTE(review): the result may contain special tokens that encode() added —
# compare it with `text` to see the difference.
tokenizer.decode(tokens)

In [None]:
# Display the tokenizer's vocabulary (token string -> token ID mapping).
# dict_to_table is imported from basic.utils; presumably it renders the dict
# as a table — confirm against that helper.
dict_to_table(tokenizer.get_vocab())

In [None]:
def token_to_text(t, value):
    """Reverse lookup in a mapping: collect the keys whose value equals ``value``.

    Args:
        t: Mapping to search (anything exposing ``.items()``).
        value: Value to look for; compared with ``==``.

    Returns:
        A list of matching keys in the mapping's iteration order; empty when
        nothing matches.
    """
    matching_keys = []
    for key, candidate in t.items():
        if candidate == value:
            matching_keys.append(key)
    return matching_keys

In [None]:
# Reverse lookup: which token string(s) in the vocabulary map to ID 3923?
# NOTE(review): 3923 is hard-coded, presumably copied from the token IDs
# displayed for the prompt earlier — confirm.
token_to_text(tokenizer.get_vocab(), value=3923)

In [None]:
# Display only what tokenizer.get_added_vocab() returns (per the Transformers
# docs, the tokens added on top of the base vocabulary, as token -> index).
# dict_to_table is imported from basic.utils; presumably it renders the dict
# as a table — confirm against that helper.
dict_to_table(tokenizer.get_added_vocab())

## Tokenizing the chat conversation

In [None]:
# Conversation history as a list of {"role", "content"} dicts, oldest turn
# first. It ends on a user turn, so the next turn is the assistant's.
messages = [
    {"role": speaker, "content": utterance}
    for speaker, utterance in (
        ("system", "You are a helpful assistant that is good with geography."),
        ("user", "What is the capital of Poland?"),
        ("assistant", "The capital of Poland is Warsaw."),
        ("user", "And of Germany?"),
    )
]

In [None]:
# Render the whole conversation through the tokenizer's chat template and
# print the resulting prompt so the inserted markers are visible.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # ask for the formatted text, not token IDs
    add_generation_prompt=True,  # end the prompt where the assistant's reply starts
)
print(prompt)