In [1]:
import torch

if torch.cuda.is_available():
    print("CUDA (GPU) is available.")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU device: {torch.cuda.current_device()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}") # Get name of the first GPU
else:
    print("CUDA (GPU) is not available. Using CPU.")

CUDA (GPU) is available.
Number of GPUs: 1
Current GPU device: 0
GPU Name: NVIDIA GeForce RTX 5090 Laptop GPU


NVIDIA GeForce RTX 5090 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_89 sm_90 compute_90.
If you want to use the NVIDIA GeForce RTX 5090 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "Qwen/Qwen3-14B"
# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [3]:
# prepare the model input
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

model_inputs


{'input_ids': tensor([[151644,    872,    198,  35127,    752,    264,   2805,  16800,    311,
           3460,   4128,   1614,     13, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [4]:
# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 

output_ids

[151667,
 198,
 32313,
 11,
 279,
 1196,
 374,
 10161,
 369,
 264,
 2805,
 16800,
 311,
 3460,
 4128,
 4119,
 13,
 6771,
 752,
 1191,
 553,
 26312,
 1128,
 264,
 3460,
 4128,
 1614,
 374,
 13,
 358,
 1265,
 6286,
 429,
 807,
 2299,
 15235,
 5942,
 16176,
 389,
 12767,
 14713,
 315,
 1467,
 821,
 13,
 10696,
 10339,
 862,
 1376,
 4419,
 1075,
 8660,
 323,
 23163,
 3738,
 12681,
 1467,
 382,
 40,
 1184,
 311,
 3421,
 862,
 8357,
 11,
 1075,
 35764,
 4755,
 11,
 4378,
 7343,
 11,
 476,
 10822,
 13,
 7281,
 11,
 432,
 594,
 2989,
 311,
 5185,
 862,
 16928,
 11,
 1741,
 438,
 2745,
 49823,
 1824,
 323,
 11589,
 6351,
 9079,
 13,
 1988,
 358,
 13133,
 944,
 10667,
 311,
 6286,
 279,
 11513,
 11,
 1075,
 54480,
 4963,
 323,
 4650,
 49083,
 13,
 13655,
 432,
 63594,
 714,
 38219,
 13,
 6771,
 752,
 5944,
 432,
 1119,
 264,
 2421,
 2797,
 3501,
 2041,
 3709,
 2238,
 10916,
 624,
 151668,
 271,
 32,
 3070,
 16767,
 4128,
 1614,
 320,
 4086,
 44,
 32295,
 374,
 458,
 10847,
 20443,
 11229,
 1849,

In [None]:
# parsing thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)


NVIDIA GeForce RTX 5090 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_89 sm_90 compute_90.
If you want to use the NVIDIA GeForce RTX 5090 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


thinking content: <think>
Okay, the user wants a short introduction to large language models. Let me start by defining what they are. I should mention that they're AI models trained on vast amounts of text data. Maybe explain their capabilities, like generating text, answering questions, and understanding context.

I need to highlight their applicationsâ€”like chatbots, content creation, and translation. Also, it's important to note their ability to handle multiple languages and adapt to different tasks. But I should keep it concise. Don't forget to mention the underlying technology, such as deep learning and neural networks. Make sure it's easy to understand without too much jargon. Let me check if I covered the key points: definition, training data, capabilities, applications, and technology. Yeah, that should work. Keep it under a paragraph or two.
</think>
content: A **large language model (LLM)** is an advanced artificial intelligence system trained on vast amounts of text data to