<a href="https://colab.research.google.com/github/akankshakusf/LLM-Mini-Projects/blob/master/4.Hugging_FacePipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

In [2]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'
# and authenticate this session with the Hub.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [3]:
# Load the tokenizer that ships with the base Llama 3 8B checkpoint.
# trust_remote_code is deliberately not passed: it opts in to executing Python
# code hosted in the Hub repo, and loading this tokenizer does not need it
# (the Phi-3 and Qwen2 tokenizers in this notebook are loaded the same way).
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')

In [4]:
text = "I am excited to see the how tokenizers work in LLMs"

# Encode the sentence into a list of integer token ids and display it.
tokens = tokenizer.encode(text)
tokens

[128000,
 40,
 1097,
 12304,
 311,
 1518,
 279,
 1268,
 4037,
 12509,
 990,
 304,
 445,
 11237,
 82]

In [6]:
# Number of token ids produced for `text`; the count includes the leading
# <|begin_of_text|> id (128000) that encode() placed at the front.
len(tokens)

15

In [7]:
# Turn the whole id list back into a single string. The special
# <|begin_of_text|> token is kept in the decoded text.
tokenizer.decode(tokens)

'<|begin_of_text|>I am excited to see the how tokenizers work in LLMs'

In [8]:
# batch_decode decodes each element of its input separately, so passing the
# flat id list yields one string per token -- a quick way to see exactly how
# the sentence was split (note the leading spaces inside the pieces).
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'I',
 ' am',
 ' excited',
 ' to',
 ' see',
 ' the',
 ' how',
 ' token',
 'izers',
 ' work',
 ' in',
 ' L',
 'LM',
 's']

In [9]:
# Tokenizer vocab: show the tokens added on top of the base vocabulary
# (here the special/reserved tokens), mapped to their ids.
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|reserved_special_token_2|>': 128004,
 '<|reserved_special_token_3|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|reserved_special_token_4|>': 128008,
 '<|eot_id|>': 128009,
 '<|reserved_special_token_5|>': 128010,
 '<|reserved_special_token_6|>': 128011,
 '<|reserved_special_token_7|>': 128012,
 '<|reserved_special_token_8|>': 128013,
 '<|reserved_special_token_9|>': 128014,
 '<|reserved_special_token_10|>': 128015,
 '<|reserved_special_token_11|>': 128016,
 '<|reserved_special_token_12|>': 128017,
 '<|reserved_special_token_13|>': 128018,
 '<|reserved_special_token_14|>': 128019,
 '<|reserved_special_token_15|>': 128020,
 '<|reserved_special_token_16|>': 128021,
 '<|reserved_special_token_17|>': 128022,
 '<|reserved_special_token_18|>': 128023,
 '<|reserved_special_token_19|>': 128024,
 '<|reserved_special_token_

# Instruct variants of models

In [18]:
# Rebind `tokenizer` to the Instruct variant of Llama 3 8B; from here on the
# name no longer refers to the base-model tokenizer loaded earlier.
# trust_remote_code is deliberately not passed: it opts in to executing Python
# code hosted in the Hub repo, and loading this tokenizer does not need it.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')

In [19]:
# Conversation expressed as a list of {"role", "content"} messages.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-heighted joke for a room of Data Scientist"},
]

# Render the conversation with the tokenizer's chat template as plain text
# (tokenize=False), ending with the header that cues the assistant's reply
# (add_generation_prompt=True).
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-heighted joke for a room of Data Scientist<|eot_id|><|start_header_id|>assistant<|end_header_id|>




# Trying new models

In [20]:
# Model identifiers passed to AutoTokenizer.from_pretrained in the cells below
# to compare how different model families tokenize text and format chats.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [21]:
#instantiate phi model
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)
text = "I am excited to see how the tokenizers work"

tokens= phi3_tokenizer.encode(text)
print(phi3_tokenizer.batch_decode(tokens))

['I', 'am', 'excited', 'to', 'see', 'how', 'the', 'token', 'izers', 'work']


In [14]:
# Render the same conversation with the Llama 3 Instruct and Phi-3 chat
# templates and print them one after the other, separated by a blank line.
llama_and_phi3_prompts = [
    chat_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    for chat_tokenizer in (tokenizer, phi3_tokenizer)
]
print(*llama_and_phi3_prompts, sep="\n\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-heighted joke for a room of Data Scientist<|eot_id|><|start_header_id|>assistant<|end_header_id|>



<|system|>
You are a helpful assistant<|end|>
<|user|>
Tell a light-heighted joke for a room of Data Scientist<|end|>
<|assistant|>



In [22]:
# Load the Qwen2 tokenizer so all three tokenizers can be compared.
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)

text = "I am excited to see how the tokenizers work"

# Encode the same sentence with each tokenizer (Llama 3, Phi-3, Qwen2) and
# print the id lists in that order, separated by blank lines.
ids_per_tokenizer = [
    each_tokenizer.encode(text)
    for each_tokenizer in (tokenizer, phi3_tokenizer, qwen2_tokenizer)
]
print(*ids_per_tokenizer, sep="\n\n")

[128000, 40, 1097, 12304, 311, 1518, 1268, 279, 4037, 12509, 990]

[306, 626, 24173, 304, 1074, 920, 278, 5993, 19427, 664]

[40, 1079, 12035, 311, 1490, 1246, 279, 3950, 12230, 975]


In [16]:
# Render the same conversation with each model's chat template
# (Llama 3 Instruct, Phi-3, Qwen2) and print them separated by blank lines.
prompts_per_tokenizer = [
    chat_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    for chat_tokenizer in (tokenizer, phi3_tokenizer, qwen2_tokenizer)
]
print(*prompts_per_tokenizer, sep="\n\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-heighted joke for a room of Data Scientist<|eot_id|><|start_header_id|>assistant<|end_header_id|>



<|system|>
You are a helpful assistant<|end|>
<|user|>
Tell a light-heighted joke for a room of Data Scientist<|end|>
<|assistant|>


<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-heighted joke for a room of Data Scientist<|im_end|>
<|im_start|>assistant



In [23]:
# Load the StarCoder2 tokenizer to see how a code-oriented model splits source.
# trust_remote_code is deliberately not passed: it opts in to executing Python
# code hosted in the Hub repo, and loading this tokenizer does not need it
# (the Phi-3 and Qwen2 tokenizers in this notebook are loaded the same way).
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)
code = """
def hello_world(person):
  print("Hello", person)
"""
tokens = starcoder2_tokenizer.encode(code)
# Print every token id next to the text it decodes to, one pair per line.
for token in tokens:
  print(f"{token}={starcoder2_tokenizer.decode(token)}")

222=

610=def
17966= hello
100=_
5879=world
45=(
6427=person
731=):
353=
 
1489= print
459=("
8302=Hello
411=",
4944= person
46=)
222=

