In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

use_finetuned_model = False

# Define model paths
finetuned_model_path = "fine_tuned_model"
qwen_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Select model
model_name = finetuned_model_path if use_finetuned_model else qwen_model_name
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto"
)  # Use auto selection for model.
tok = AutoTokenizer.from_pretrained(model_name)  # Tokenizer doesn't need a device.

# If you're on a Mac with M1/M2 and want to use MPS explicitly:
if torch.backends.mps.is_available():
    model.to("mps")  # Move model to MPS after loading.

# Define stop tokens
if use_finetuned_model:
    stop_token_ids = tok("<|im_end|>", return_tensors="pt")["input_ids"]
else:
    stop_token_ids = tok("<|endoftext|>", return_tensors="pt")["input_ids"]

# Prompt setup
prompt = "How many instances of the letter ‘r’ are in the word ‘raspberry’?"
prompt = (
    "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
    + prompt
    + "<|im_end|>\n<|im_start|>assistant\n"
)

inputs = tok(prompt, return_tensors="pt").to(model.device)

max_new_tokens = 200
generated_ids = inputs["input_ids"].clone()

for _ in range(max_new_tokens):
    outputs = model(input_ids=generated_ids)
    next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(0)
    generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

    token_text = tok.decode(next_token_id[0], skip_special_tokens=True)
    print(token_text, end="", flush=True)

    # Stop if generated stop token
    if next_token_id[0].item() in stop_token_ids:
        break

print("\nDone.")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define model paths
model_name = "fine_tuned_model"

# Select model
# Load model with FP16 precision
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,  # Set precision to FP16
)
tok = AutoTokenizer.from_pretrained(model_name)  # Tokenizer doesn't need a device.

# If you're on a Mac with M1/M2 and want to use MPS explicitly:
if torch.backends.mps.is_available():
    model.to("mps")  # Move model to MPS after loading.

# Define stop tokens
if use_finetuned_model:
    stop_token_ids = tok("<|im_end|>", return_tensors="pt")["input_ids"]
else:
    stop_token_ids = tok("<|endoftext|>", return_tensors="pt")["input_ids"]

# Prompt setup
prompt = "How many instances of the letter ‘r’ are in the word ‘raspberry’?"
prompt = (
    "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
    + prompt
    + "<|im_end|>\n<|im_start|>assistant\n"
)

inputs = tok(prompt, return_tensors="pt").to(model.device)

max_new_tokens = 200
generated_ids = inputs["input_ids"].clone()

for _ in range(max_new_tokens):
    outputs = model(input_ids=generated_ids)
    next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(0)
    generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

    token_text = tok.decode(next_token_id[0], skip_special_tokens=True)
    print(token_text, end="", flush=True)

    # Stop if generated stop token
    if next_token_id[0].item() in stop_token_ids:
        break

print("\nDone.")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time

use_finetuned_model = False

# Define model paths
finetuned_model_path = "fine_tuned_model"
qwen_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Select model
model_name = finetuned_model_path if use_finetuned_model else qwen_model_name
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto"
)  # Use auto selection for model.
tok = AutoTokenizer.from_pretrained(model_name)  # Tokenizer doesn't need a device.

# If you're on a Mac with M1/M2 and want to use MPS explicitly:
if torch.backends.mps.is_available():
    model.to("mps")  # Move model to MPS after loading.

# Define stop tokens
if use_finetuned_model:
    stop_token_ids = tok("<|im_end|>", return_tensors="pt")["input_ids"]
else:
    stop_token_ids = tok("<|endoftext|>", return_tensors="pt")["input_ids"]

# Prompt setup
prompt = "I have two brothers, Joel and Mark. From what you know, what's the probability that I'm older than Joel?"
prompt = (
    "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
    + prompt
    + "<|im_end|>\n<|im_start|>assistant\n"
)

inputs = tok(prompt, return_tensors="pt").to(model.device)

max_new_tokens = 200
generated_ids = inputs["input_ids"].clone()

timeout = 120  # Set timeout in seconds
start_time = time.time()

for _ in range(max_new_tokens):
    if time.time() - start_time > timeout:
        print("Timeout reached, stopping.")
        break
    outputs = model(input_ids=generated_ids)
    next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(0)
    generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

    token_text = tok.decode(next_token_id[0], skip_special_tokens=True)
    print(token_text, end="", flush=True)

    # Check for stop token
    if next_token_id[0].item() in stop_token_ids:
        print("Stop token reached, stopping generation.")
        break

print("\nDone.")

Some parameters are on the meta device because they were offloaded to the disk.
You shouldn't move a model that is dispatched using accelerate hooks.


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "TomasLaz/t0-s1.1-1.5B"

# Load the correct model type
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Verify it's the right model
print(model.__class__.__name__)  # Should print "Qwen2ForCausalLM"

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 2/2 [10:28<00:00, 314.26s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 11.00it/s]


Qwen2ForCausalLM


In [None]:
from transformers import AutoTokenizer

text = "Once upon a time,"
inputs = tokenizer(text, return_tensors="pt")

output = model.generate(**inputs, max_length=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Once upon a time, there was a young girl named Lily who loved to read. She had a special bookshelf in her room where she kept all her favorite books. One day, she decided to rearrange her bookshelf. She wanted to make


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"  # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

prompt = "Give me a short introduction to large language model."

messages = [{"role": "user", "content": prompt}]

text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids, max_new_tokens=512, do_sample=True
)

generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]