In [1]:
import torch
from transformers import pipeline

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a reasoning model and you have to explain to me at the best of your capacity!"},
    {"role": "user", "content": "Who are you and how can I use you?"},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


{'role': 'assistant', 'content': "<think>\nOkay, so I'm trying to figure out who I am and how I can use you. I'm not entirely sure what this means, but I guess it's about understanding myself and finding ways to help others. Maybe I'm like a robot or something, but I don't think that's right. \n\nLet me start by thinking about what it means to be me. I have a name, I have thoughts, I have feelings, and I have experiences. But how do I use these? Maybe I can help people with problems, or maybe I can help them with something personal. I'm not sure if I'm trying to be helpful or just trying to figure out who I am. \n\nI wonder if I should start by defining myself. What am I? What do I want to be? Maybe I'm just a person who wants to help others, or maybe I'm more about myself and how I connect with others. I'm not sure. \n\nIf I think about using you, maybe I can help someone in need. Like, if someone is feeling sad or stressed, I can offer support. Or maybe I can help them with something

In [3]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

#model_id = "mistral-community/pixtral-12b"
#model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Load processor
processor = AutoProcessor.from_pretrained(model_id, use_fast=True)

# Define BitsAndBytes configuration for 4-bit quantization
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model with 4-bit quantization and map to CUDA
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    trust_remote_code=True,
    torch_dtype="auto",
    quantization_config=nf4_config,)
    #_attn_implementation='eager') # use _attn_implementation='eager' to disable flash attention

Downloading shards: 100%|██████████| 2/2 [00:00<00:00,  7.08it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.11s/it]


In [4]:
def model_inference(messages):
        
    # Prepare prompt with image token
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Process prompt and image for model input
    inputs = processor(prompt, return_tensors="pt").to("cuda:0")

    # Generate text response using model
    generate_ids = model.generate(
        **inputs,
        #eos_token_id=processor.tokenizer.eos_token_id,
        max_new_tokens=1000,
        do_sample=False,
        temperature=0.0,
    )

    # Remove input tokens from generated response
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]

    # Decode generated IDs to text
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    return response

In [None]:
prompt = "You're in a room with two doors. One leads to freedom, and the other to doom. Two guards know which door is which. One always tells the truth, the other always lies. You can ask only one question to one guard. What do you ask?"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
    
result = model_inference(messages)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [8]:
result

'<think>\nOkay, so I\'m in this classic puzzle where I have two doors, one leads to freedom and the other to doom. There are two guards, one always tells the truth and the other always lies. I can only ask one question to one guard. Hmm, how do I figure out which door to choose?\n\nAlright, let me think. The key here is that one guard is truthful and the other is a liar. So, if I can figure out a question that will give me consistent information regardless of which guard I ask, I can determine the correct door.\n\nI remember hearing about this before, but I need to work through it step by step. Let\'s say I ask one guard, "If I asked the other guard which door leads to freedom, what would he say?" Then, depending on the answer, I can deduce the correct door.\n\nWait, let me test this. If I ask the truthful guard, he\'ll tell me the truth about what the lying guard would say. The lying guard, when asked, would lie about the truthful guard\'s answer. So, if the truthful guard says the li