In [1]:
from transformers import LlamaForCausalLM, BitsAndBytesConfig, AutoTokenizer

# quantization config
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

llama_model = LlamaForCausalLM.from_pretrained(MODEL_NAME, quantization_config=quantization_config, device_map="auto")

In [7]:
import sys
sys.path.append("../src")
from models.modeling_no_grid import LlamaNoGridForCausalLM
from models.modeling_table_llama import TableLlamaConfig
import torch

In [21]:
import torch
if "no_grid_model" in globals():
    del no_grid_model
    torch.cuda.empty_cache()
    
config = TableLlamaConfig.from_pretrained(MODEL_NAME)
config.rope_table_llama = {
    "x_channels_start": 0,
    "x_channels_end": 128,
    "x_channels_step": 2,
    "y_channels_start": 1,
    "y_channels_end": 128,
    "y_channels_step": 2,
}

no_grid_model = LlamaNoGridForCausalLM.from_pretrained(
    MODEL_NAME, 
    quantization_config=quantization_config, 
    device_map="auto",
    config=config
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [22]:
message = [
    {"role": "user", "content": "Hello, how are you?"}
]

message_text = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(message_text, return_tensors="pt").to(no_grid_model.device)

# Prepare the column ids
column_ids = torch.ones(inputs.input_ids.shape[0], inputs.input_ids.shape[1], dtype=torch.long).to(no_grid_model.device)
row_ids = torch.zeros(inputs.input_ids.shape[0], inputs.input_ids.shape[1], dtype=torch.long).to(no_grid_model.device)
inputs["column_ids"] = column_ids
inputs["row_ids"] = row_ids

print(inputs)

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   1627,   4723,    220,   2366,     19,    271, 128009,
         128006,    882, 128007,    271,   9906,     11,   1268,    527,    499,
             30, 128009, 128006,  78191, 128007,    271]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0'), 'column_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0'), 'row_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')}


In [23]:

outputs = no_grid_model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[[[1 1 1 ... 1 1 1]
  [0 0 0 ... 0 0 0]
  [1 1 1 ... 1 1 1]
  ...
  [0 0 0 ... 0 0 0]
  [1 1 1 ... 1 1 1]
  [0 0 0 ... 0 0 0]]]
[[[42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]
  [42]]]
[[[43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]
  [43]]]
[[[44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]
  [44]]]
[[[45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45]
  [45

In [None]:
def test_no_grid(user_message):
    message = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_message}
    ]
    # Tokenize the message
    message_text = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(message_text, return_tensors="pt").to(llama_model.device)
    # Generate the response with llama3.2
    llama_outputs = llama_model.generate(**inputs, max_new_tokens=64, do_sample=False)
    llama_response = tokenizer.decode(llama_outputs[0], skip_special_tokens=True)
    no_grid_outputs = no_grid_model.generate(**inputs, max_new_tokens=64, do_sample=False)
    no_grid_response = tokenizer.decode(no_grid_outputs[0], skip_special_tokens=True)
    return llama_response, no_grid_response
    
user_messages = [
    "What is the capital of France?",
    "Why is the sky blue?",
    "What is the weather in San Francisco?",
]

for user_message in user_messages:
    llama_response, no_grid_response = test_no_grid(user_message)
    print("-" * 100)
    print(f"User message: {user_message}")
    print(f"Llama response: {llama_response}")
    print(f"No grid response: {no_grid_response}")