In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import TextIteratorStreamer
from threading import Thread
import threading

# Hub identifier of the checkpoint used throughout this notebook.
model_name = "Qwen/Qwen3-0.6B"

# Tokenizer first, then the causal LM from the same checkpoint.
# "auto" is passed for both the dtype and the device map, leaving those
# choices to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

In [None]:
def stream_chat(messages, tokenizer, model, max_new_tokens=32768, enable_thinking=True, skip_prompt=True):
    """
    Stream a chat model's reply as text chunks while it is being generated.

    The conversation is rendered with the tokenizer's chat template,
    ``model.generate`` runs in a background thread, and the decoded text
    pieces produced by a ``TextIteratorStreamer`` are yielded as they arrive.

    Args:
        messages: Conversation passed to ``tokenizer.apply_chat_template``
            (the callers in this notebook use a list of
            ``{"role": ..., "content": ...}`` dicts).
        tokenizer: Tokenizer used for templating, encoding and streaming.
        model: Object exposing ``device`` and ``generate``.
        max_new_tokens: Forwarded to ``model.generate``.
        enable_thinking: Forwarded to the chat template.
        skip_prompt: Forwarded to ``TextIteratorStreamer``. When True
            (default) the rendered prompt is not echoed back, so only the
            model's reply is yielded. Pass False to also receive the prompt.

    Yields:
        str: Text chunks in generation order. Special tokens are kept
        (``skip_special_tokens=False``).

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here after the chunks streamed so far were yielded.

    Note:
        Generation is not cancelled if the caller stops iterating early; the
        worker thread keeps running until ``model.generate`` returns.
    """
    # Render the conversation to a single prompt string and encode it.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # The streamer is the hand-off point between the generation thread
    # (producer) and this generator (consumer).
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=skip_prompt,
        skip_special_tokens=False
    )

    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        streamer=streamer
    )

    # Filled by the worker if generation fails, so the error can be raised
    # in the caller's thread instead of being lost on the background thread.
    errors = []

    def _generate():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            # Without an end-of-stream signal the consumer loop below would
            # wait forever for the next chunk.
            streamer.end()

    thread = threading.Thread(target=_generate)
    thread.start()

    # Yield chunks in real time.
    for new_text in streamer:
        yield new_text

    # The stream is exhausted, so the worker is finished or about to be;
    # joining makes sure any recorded error is visible before we return.
    thread.join()
    if errors:
        raise errors[0]

In [None]:
# Demo: decimal-comparison question, with thinking mode switched off.
prompt = "Is 9.10 bigger than 9.9 ?"
messages = [{"role": "user", "content": prompt}]

print("Assistant: ", end="", flush=True)
response = ""
for token in stream_chat(messages, tokenizer, model, enable_thinking=False):
    # Show each chunk the moment it arrives and keep it for the final transcript.
    print(token, end="", flush=True)
    response = response + token
print("\n\nFinal Response:", response)

In [None]:
# Demo: same decimal-comparison question, using stream_chat's default settings.
prompt = "Is 9.10 bigger than 9.9 ?"
messages = [{"role": "user", "content": prompt}]

print("Assistant: ", end="", flush=True)
response = ""
for token in stream_chat(messages, tokenizer, model):
    # Show each chunk the moment it arrives and keep it for the final transcript.
    print(token, end="", flush=True)
    response = response + token
print("\n\nFinal Response:", response)

In [None]:
# Demo: a longer, list-style question, using stream_chat's default settings.
prompt = "How many countries are there in Africa , please give me their names ?"
messages = [{"role": "user", "content": prompt}]

print("Assistant: ", end="", flush=True)
response = ""
for token in stream_chat(messages, tokenizer, model):
    # Show each chunk the moment it arrives and keep it for the final transcript.
    print(token, end="", flush=True)
    response = response + token
print("\n\nFinal Response:", response)