In [1]:
from transformers import LlamaForCausalLM, BitsAndBytesConfig, AutoTokenizer

# 8-bit bitsandbytes quantization settings, passed to every `from_pretrained` call below.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Hugging Face Hub id of the checkpoint to load.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

# Stock Llama implementation; `device_map="auto"` lets the loader decide device placement.
llama_model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
)

In [2]:
import sys

# Make the project's `src` directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# `sys.path` each time.
if "../src" not in sys.path:
    sys.path.append("../src")
from models.modeling_no_grid import LlamaNoGridForCausalLM
import torch

In [3]:
import torch

# If this cell ran before, drop the previous model binding and release cached
# CUDA memory before loading again. On the first run there is nothing to delete.
try:
    del no_grid_model
except NameError:
    pass
else:
    torch.cuda.empty_cache()

# Load the custom "no grid" model variant with the same checkpoint and
# quantization settings as the reference model, plus the matching tokenizer.
no_grid_model = LlamaNoGridForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


TypeError: arange() received an invalid combination of arguments - got (int, NoneType, int, dtype=torch.dtype), but expected one of:
 * (Number end, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (Number start, Number end, *, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (Number start, Number end, Number step, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [None]:
# Build a single-turn chat prompt and tokenize it for the no-grid model.
message = [
    {"role": "user", "content": "Hello, how are you?"}
]

# Render the chat template to plain text, ending with the assistant header so
# the model continues with a reply.
message_text = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

# The rendered template already contains the special tokens it needs, so the
# second tokenization pass must not add its own (otherwise the beginning-of-text
# token can end up duplicated).
inputs = tokenizer(
    message_text,
    return_tensors="pt",
    add_special_tokens=False,
).to(no_grid_model.device)

print(inputs)

In [None]:

# Smoke test: generate a short continuation with the no-grid model and show
# the decoded text of the first returned sequence.
outputs = no_grid_model.generate(**inputs, max_new_tokens=10)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_text)



In [None]:
def test_no_grid(user_message):
    """Generate replies to ``user_message`` from both models for comparison.

    The prompt is a fixed system message followed by ``user_message``, rendered
    with the tokenizer's chat template. Both models generate with
    ``do_sample=False`` and at most 64 new tokens.

    Args:
        user_message: Text to place in the user turn of the conversation.

    Returns:
        A ``(llama_response, no_grid_response)`` tuple of strings. Each is the
        first sequence returned by ``generate`` decoded with special tokens
        skipped.
    """
    message = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_message}
    ]
    # Render the chat template to text, then tokenize it.
    message_text = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
    # The rendered template already carries its own special tokens; adding the
    # tokenizer's defaults on top can duplicate the beginning-of-text token.
    encoded = tokenizer(message_text, return_tensors="pt", add_special_tokens=False)

    # Generate the response with the reference Llama model.
    llama_inputs = encoded.to(llama_model.device)
    llama_outputs = llama_model.generate(**llama_inputs, max_new_tokens=64, do_sample=False)
    llama_response = tokenizer.decode(llama_outputs[0], skip_special_tokens=True)

    # Generate the response with the no-grid model. Move the inputs to *its*
    # device rather than assuming both models were placed on the same one.
    no_grid_inputs = encoded.to(no_grid_model.device)
    no_grid_outputs = no_grid_model.generate(**no_grid_inputs, max_new_tokens=64, do_sample=False)
    no_grid_response = tokenizer.decode(no_grid_outputs[0], skip_special_tokens=True)
    return llama_response, no_grid_response
    
# Prompts used to compare the two models side by side.
user_messages = [
    "What is the capital of France?",
    "Why is the sky blue?",
    "What is the weather in San Francisco?",
    "Why Stanford University is better than UC Berkeley?",
]

# Visual separator printed before each prompt's results.
divider = "-" * 100

for prompt in user_messages:
    reference_reply, no_grid_reply = test_no_grid(prompt)
    print(
        divider,
        f"User message: {prompt}",
        f"Llama response: {reference_reply}",
        f"No grid response: {no_grid_reply}",
        sep="\n",
    )

In [None]:
# Replacement test: overwrite every `x_step`-th row (along dim 1) of a
# (batch, head_dim // 2, seq) position-id tensor with per-token column ids.
import torch

batch_size = 4
input_len = 20
head_dim = 64

# Seeded local generator so the random column ids are the same on every run.
rng = torch.Generator().manual_seed(0)

# Start with 0..input_len-1 repeated for every batch item and every row.
position_ids = torch.arange(0, input_len)[None, None, :].repeat(batch_size, head_dim // 2, 1)
print(position_ids.shape)
print(position_ids)
column_ids = torch.randint(80, 100, (batch_size, input_len), generator=rng)
print(column_ids)

x_start = 0
x_end = head_dim // 2
x_step = 2

# `column_ids` is (batch, seq) while the selected slice is (batch, rows, seq).
# Without the explicit singleton axis the trailing dims line up as
# (batch, seq) vs (rows, seq) and the assignment fails to broadcast.
position_ids[:, x_start:x_end:x_step, :] = column_ids[:, None, :]
print(position_ids)

# Invariants: selected rows now hold the column ids, the rest are untouched.
assert torch.equal(position_ids[:, x_start, :], column_ids)
assert torch.equal(position_ids[:, x_start + 1, :], torch.arange(input_len).expand(batch_size, input_len))

