In [1]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

import torch
import os,sys
import gc

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
hf_token = os.getenv("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### Llama base model 

Load tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

In [4]:
text= "AI Engineering is amazing"
tokens= tokenizer.encode(text)
print(tokens)

[128000, 15836, 17005, 374, 8056]


In [5]:
tokenizer.decode(tokens)

'<|begin_of_text|>AI Engineering is amazing'

In [6]:
tokenizer.batch_decode(tokens)

['<|begin_of_text|>AI Engineering is amazing']

### Llama instruct model

Chat Templates

In [7]:
# Llama instruct model
# instruct models - base models fine-tuned to act in form of question-answering
# like in the openai api, we can also apply chat template with the instruct model

In [8]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

In [9]:
messages = [
    {"role": "system", "content": "You're a helpful assistant"},
    {"role": "user", "content": "Tell a joke about monthly salary earners"}
]

In [10]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You're a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a joke about monthly salary earners<|eot_id|><|start_header_id|>assistant<|end_header_id|>




##### Models and their templates.

When we apply chat templates on messages, we are letting it assume the structure of training data used when fine-tuning base models to instruct models. 

Essentially, base models were pre-trained to predict next words. Instruct models are a variant of the base model fine-tuned for end users to have the ability to interact with LLMs and the models producing meaningful responses like ChatGPT style.

The Llama chat template applied on the message above has added extra start of sentence, end of sentence tokens, tags to separate users and assistant in a way the models understand. 

We will see what this looks like with other models.

#### Other Models

In [11]:
phi4 = "microsoft/Phi-4-mini-instruct"
deepseek = "deepseek-ai/DeepSeek-V3.1"
qwen_coder = "Qwen/Qwen2.5-7B-Instruct"

In [12]:
phi_tokenizer = AutoTokenizer.from_pretrained(phi4)
ds_tokenizer = AutoTokenizer.from_pretrained(deepseek)
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_coder)

This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.
`rope_parameters`'s factor field must be a float >= 1, got 40
`rope_parameters`'s beta_fast field must be a float, got 32
`rope_parameters`'s beta_slow field must be a float, got 1


In [13]:
prompt = phi_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("phi4: \n", prompt)

phi4: 
 <|system|>You're a helpful assistant<|end|><|user|>Tell a joke about monthly salary earners<|end|><|assistant|>


In [14]:
prompt = ds_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("deepseek: \n",prompt)

deepseek: 
 <｜begin▁of▁sentence｜>You're a helpful assistant<｜User｜>Tell a joke about monthly salary earners<｜Assistant｜></think>


In [15]:
prompt = qwen_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("qwen_coder: \n",prompt)

qwen_coder: 
 <|im_start|>system
You're a helpful assistant<|im_end|>
<|im_start|>user
Tell a joke about monthly salary earners<|im_end|>
<|im_start|>assistant



### Generating with HF Models

In [16]:
LLAMA = "meta-llama/Llama-3.1-8B-Instruct"

In [17]:
# quantize the model to load and use less memory
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4", 
)

In [18]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

In [19]:
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

In [20]:
inputs

{'input_ids': tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,   2351,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  41551,
            264,  22380,    922,  15438,  16498,  97063, 128009]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [21]:
# Load Model
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

In [22]:
# model

In [23]:
inputs.input_ids

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,   2351,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  41551,
            264,  22380,    922,  15438,  16498,  97063, 128009]],
       device='cuda:0')

In [24]:
outputs = model.generate(inputs.input_ids, max_new_tokens=120)
outputs

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,   2351,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  41551,
            264,  22380,    922,  15438,  16498,  97063, 128009, 128006,  78191,
         128007,    271,  10445,   1550,    279,  15438,  16498,   2487,   1215,
           4546,    264,  36865,    311,    279,   6201,   1980,  18433,    814,
           4934,    311,   1935,    872,  16498,    311,    279,   1828,   2237,
             13, 128009]], device='cuda:0')

In [1]:
tokenizer.decode(outputs[0])

NameError: name 'tokenizer' is not defined

In [26]:
# clean up disk

# del model, inputs, tokenizer, outputs
# gc.collect()
# torch.cuda.empty_cache()

In [31]:
# del model, inputs, tokenizer, outputs
# torch.cuda.empty_cache()

In [30]:
outputs

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,   2675,   2351,
            264,  11190,  18328, 128009, 128006,    882, 128007,    271,  41551,
            264,  22380,    922,  15438,  16498,  97063, 128009, 128006,  78191,
         128007,    271,  10445,   1550,    279,  15438,  16498,   2487,   1215,
           4546,    264,  36865,    311,    279,   6201,   1980,  18433,    814,
           4934,    311,   1935,    872,  16498,    311,    279,   1828,   2237,
             13, 128009]], device='cuda:0')

In [27]:
def generate(model, messages, quant=True, max_new_tokens=80):
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.pad_token = tokenizer.eos_token
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")

    attention_mask = torch.ones