In [20]:
# Import Libraries
import os
from huggingface_hub import login
from transformers import AutoTokenizer

hf_token = os.getenv('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [21]:
# Accessing Llama 3.1 from Meta
llama_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='meta-llama/Meta-Llama-3.1-8B', trust_remote_code = True)
text = "Tokenization, in the realm of Artificial Intelligence (AI), refers to the process of converting input text into smaller units or 'tokens' such as words or subwords."

In [22]:
# Encode from text to Tokens
text_tokens = llama_tokenizer.encode(text=text)
print(f"Tokens : {text_tokens}")

Tokens : [128000, 3404, 2065, 11, 304, 279, 22651, 315, 59294, 22107, 320, 15836, 705, 19813, 311, 279, 1920, 315, 34537, 1988, 1495, 1139, 9333, 8316, 477, 364, 31666, 6, 1778, 439, 4339, 477, 1207, 5880, 13]


In [23]:
# Decode from Tokens into original Text
decoded_text = llama_tokenizer.decode(token_ids=text_tokens)
print(f"Decoded Text : {decoded_text}")

Decoded Text : <|begin_of_text|>Tokenization, in the realm of Artificial Intelligence (AI), refers to the process of converting input text into smaller units or 'tokens' such as words or subwords.


In [24]:
# Can use Batch Decode to get list
batch_decoded_text = llama_tokenizer.batch_decode(sequences=text_tokens)
print(f"Batch Decoded Text : {batch_decoded_text}")

Batch Decoded Text : ['<|begin_of_text|>', 'Token', 'ization', ',', ' in', ' the', ' realm', ' of', ' Artificial', ' Intelligence', ' (', 'AI', '),', ' refers', ' to', ' the', ' process', ' of', ' converting', ' input', ' text', ' into', ' smaller', ' units', ' or', " '", 'tokens', "'", ' such', ' as', ' words', ' or', ' sub', 'words', '.']


In [25]:
# Tokenizer Vocabulary
print(f"Llama 3.1 8B Tokenizer Vocabulary : {llama_tokenizer.vocab}")
print(f"Llama 3.1 8B Tokenizer Added Vocabulary : {llama_tokenizer.get_added_vocab()}")

Llama 3.1 8B Tokenizer Added Vocabulary : {'<|begin_of_text|>': 128000, '<|end_of_text|>': 128001, '<|reserved_special_token_0|>': 128002, '<|reserved_special_token_1|>': 128003, '<|finetune_right_pad_id|>': 128004, '<|reserved_special_token_2|>': 128005, '<|start_header_id|>': 128006, '<|end_header_id|>': 128007, '<|eom_id|>': 128008, '<|eot_id|>': 128009, '<|python_tag|>': 128010, '<|reserved_special_token_3|>': 128011, '<|reserved_special_token_4|>': 128012, '<|reserved_special_token_5|>': 128013, '<|reserved_special_token_6|>': 128014, '<|reserved_special_token_7|>': 128015, '<|reserved_special_token_8|>': 128016, '<|reserved_special_token_9|>': 128017, '<|reserved_special_token_10|>': 128018, '<|reserved_special_token_11|>': 128019, '<|reserved_special_token_12|>': 128020, '<|reserved_special_token_13|>': 128021, '<|reserved_special_token_14|>': 128022, '<|reserved_special_token_15|>': 128023, '<|reserved_special_token_16|>': 128024, '<|reserved_special_token_17|>': 128025, '<|res

In [26]:
# Instruct Variants - meta-llama/Meta-Llama-3.1-8B-Instruct Model
instruct_llama_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="meta-llama/Meta-Llama-3.1-8B-Instruct", trust_remote_code=True)
openai_prompts = [
    {"role": "system", "content": "You are a helpful AI Assistant."},
    {"role": "user", "content": "Tell me a light-hearted joke about Chess."}
    ]

instruct_llama_prompts = instruct_llama_tokenizer.apply_chat_template(conversation=openai_prompts, tokenize=False, add_generation_prompt=True)
print(f"OpenAI Prompt : {openai_prompts}")
print(f"Llama-3.1-8B-Instruct Prompt : {instruct_llama_prompts}")

OpenAI Prompt : [{'role': 'system', 'content': 'You are a helpful AI Assistant.'}, {'role': 'user', 'content': 'Tell me a light-hearted joke about Chess.'}]
Llama-3.1-8B-Instruct Prompt : <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful AI Assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell me a light-hearted joke about Chess.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [27]:
# Different Model Tokenizers - PHI3
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
phi3_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=PHI3_MODEL_NAME, trust_remote_code=True)
phi3_encoded_tokens = phi3_tokenizer.encode(text=text)
phi3_batch_decoded_text = phi3_tokenizer.batch_decode(sequences=phi3_encoded_tokens)
instruct_phi3_prompts = phi3_tokenizer.apply_chat_template(conversation=openai_prompts, tokenize=False, add_generation_prompt=True)

print(f"MODEL NAME : {PHI3_MODEL_NAME}")
print(f"ENCODED TOKENS : {phi3_encoded_tokens}")
print(f"DECODED BATCH TEXT : {phi3_batch_decoded_text}")
print(f"INSTRUCT PROMPT : {instruct_phi3_prompts}")


MODEL NAME : microsoft/Phi-3-mini-4k-instruct
ENCODED TOKENS : [25159, 2133, 29892, 297, 278, 1855, 29885, 310, 3012, 928, 616, 3159, 28286, 313, 23869, 511, 14637, 304, 278, 1889, 310, 17415, 1881, 1426, 964, 7968, 10340, 470, 525, 517, 12360, 29915, 1316, 408, 3838, 470, 1014, 9303, 29889]
DECODED BATCH TEXT : ['Token', 'ization', ',', 'in', 'the', 'real', 'm', 'of', 'Art', 'ific', 'ial', 'Int', 'elligence', '(', 'AI', '),', 'refers', 'to', 'the', 'process', 'of', 'converting', 'input', 'text', 'into', 'smaller', 'units', 'or', "'", 'to', 'kens', "'", 'such', 'as', 'words', 'or', 'sub', 'words', '.']
INSTRUCT PROMPT : <|system|>
You are a helpful AI Assistant.<|end|>
<|user|>
Tell me a light-hearted joke about Chess.<|end|>
<|assistant|>



In [28]:
# Different Model Tokenizers - QWEN2
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
qwen2_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=QWEN2_MODEL_NAME, trust_remote_code=True)
qwen2_encoded_tokens = qwen2_tokenizer.encode(text=text)
qwen2_batch_decoded_text = qwen2_tokenizer.batch_decode(sequences=qwen2_encoded_tokens)
instruct_qwen2_prompts = qwen2_tokenizer.apply_chat_template(conversation=openai_prompts, tokenize=False, add_generation_prompt=True)

print(f"MODEL NAME : {QWEN2_MODEL_NAME}")
print(f"ENCODED TOKENS : {qwen2_encoded_tokens}")
print(f"DECODED BATCH TEXT : {qwen2_batch_decoded_text}")
print(f"INSTRUCT PROMPT : {instruct_qwen2_prompts}")


MODEL NAME : Qwen/Qwen2-7B-Instruct
ENCODED TOKENS : [3323, 2022, 11, 304, 279, 21889, 315, 58194, 21392, 320, 15469, 701, 19257, 311, 279, 1882, 315, 33437, 1946, 1467, 1119, 9155, 8153, 476, 364, 30566, 6, 1741, 438, 4244, 476, 1186, 5761, 13]
DECODED BATCH TEXT : ['Token', 'ization', ',', ' in', ' the', ' realm', ' of', ' Artificial', ' Intelligence', ' (', 'AI', '),', ' refers', ' to', ' the', ' process', ' of', ' converting', ' input', ' text', ' into', ' smaller', ' units', ' or', " '", 'tokens', "'", ' such', ' as', ' words', ' or', ' sub', 'words', '.']
INSTRUCT PROMPT : <|im_start|>system
You are a helpful AI Assistant.<|im_end|>
<|im_start|>user
Tell me a light-hearted joke about Chess.<|im_end|>
<|im_start|>assistant



In [29]:
# Different Model Tokenizers - STARCODER2
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"
code_text = """
def hello_world(person):
  print("Hello", person)
"""
sc2_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=STARCODER2_MODEL_NAME, trust_remote_code=True)
sc2_encoded_tokens = sc2_tokenizer.encode(text=code_text)
print(f"MODEL NAME : {STARCODER2_MODEL_NAME}")
print(f"ENCODED TOKENS : {sc2_encoded_tokens}")
for token in sc2_encoded_tokens:
  print(f"{token}={sc2_tokenizer.decode(token)}")


MODEL NAME : bigcode/starcoder2-3b
ENCODED TOKENS : [222, 610, 17966, 100, 5879, 45, 6427, 731, 353, 1489, 459, 8302, 411, 4944, 46, 222]
222=

610=def
17966= hello
100=_
5879=world
45=(
6427=person
731=):
353=
 
1489= print
459=("
8302=Hello
411=",
4944= person
46=)
222=

