In [None]:
!pip uninstall transformers --y
!pip install transformers==4.51.3

# Installing collected packages: transformers
# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
# neuronx-distributed-inference 0.3.5591+f50feae2 requires transformers==4.48.*, but you have transformers 4.51.3 which is incompatible.
# Successfully installed transformers-4.51.3

### You can ignore the dependency-conflict error above: neuronx-distributed-inference pins transformers==4.48.*, but this notebook requires transformers 4.51.3.

In [1]:
# Show the installed Neuron packages and the transformers version so the
# environment used for the runs below is recorded in the notebook output.
!pip list | grep neuron
!pip list | grep transformers

libneuronxla                  2.2.3493.0+78c3e78c
neuronx-cc                    2.18.121.0+9e31e41a
neuronx-distributed           0.12.12111+cdd84048
neuronx-distributed-inference 0.3.5591+f50feae2
torch-neuronx                 2.6.0.2.7.5413+113e6810
transformers                  4.51.3


In [None]:
import torch
from transformers import AutoTokenizer, GenerationConfig
from neuronx_distributed_inference.models.config import NeuronConfig, OnDeviceSamplingConfig
from neuronx_distributed_inference.utils.hf_adapter import HuggingFaceGenerationAdapter, load_pretrained_config

# Model Download

In [3]:
# Local directory that receives the Hugging Face checkpoint download.
model_path = "/home/ubuntu/model_hf_qwen/qwen/"
# Directory where the compiled (traced) model and its tokenizer are saved,
# and from which they are loaded again in the Testing section.
traced_model_path = "/home/ubuntu/traced_model_qwen3/qwen3/"

In [None]:
from huggingface_hub import snapshot_download

# Fetch the Qwen3-8B repository from the Hugging Face Hub into model_path.
snapshot_download(
    repo_id="Qwen/Qwen3-8B",
    local_dir=model_path,
)

# Compilation

In [None]:
from modeling_qwen3 import Qwen3InferenceConfig, NeuronQwen3ForCausalLM

def run_qwen3_compile():
    """Compile the Qwen3 checkpoint for Neuron and save it with its tokenizer.

    Reads the notebook-level ``model_path`` (Hugging Face checkpoint directory)
    and writes the compiled model and the tokenizer to the notebook-level
    ``traced_model_path``.

    Generation settings are configured at inference time, so no
    ``GenerationConfig`` is built here.
    """
    # The tokenizer is saved next to the compiled artifacts so inference can
    # load everything from traced_model_path.
    tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
    tokenizer.pad_token = tokenizer.eos_token

    # One fixed bucket per phase, matching max_context_length and seq_len.
    neuron_config = NeuronConfig(
        tp_degree=8,
        batch_size=1,
        max_context_length=1024,
        seq_len=2048,
        on_device_sampling_config=OnDeviceSamplingConfig(top_k=5),
        enable_bucketing=True,
        context_encoding_buckets=[1024],
        token_generation_buckets=[2048],
        flash_decoding_enabled=False,
        torch_dtype=torch.bfloat16,
        fused_qkv=False,
        attn_kernel_enabled=True,
        attn_cls="NeuronQwen3Attention"
    )
    # Combine the Neuron settings with the checkpoint's own Hugging Face config.
    config = Qwen3InferenceConfig(
        neuron_config,
        load_config=load_pretrained_config(model_path),
    )

    # Compile and save model.
    print("\nCompiling and saving model...")
    model = NeuronQwen3ForCausalLM(model_path, config)
    model.compile(traced_model_path)
    tokenizer.save_pretrained(traced_model_path)

In [None]:
# Run the compilation; artifacts are written to traced_model_path.
run_qwen3_compile()

# Testing

In [None]:
# Re-imported here so the Testing section can run without the Compilation cells.
from modeling_qwen3 import Qwen3InferenceConfig, NeuronQwen3ForCausalLM

# Construct the model from the compiled-artifact directory, then load the
# compiled model from that same directory.
model = NeuronQwen3ForCausalLM(traced_model_path)
model.load(traced_model_path)

In [5]:
# Ask the model for its config class, then display which neuron-config class
# that config class reports (presumably both are classes, not instances).
config = model.get_config_cls()
config.get_neuron_config_cls()

neuronx_distributed_inference.models.config.NeuronConfig

In [6]:
# Display the attention-head count stored on the loaded model's config.
model.config.num_attention_heads

32

In [7]:
# Display the key/value-head count stored on the loaded model's config.
model.config.num_key_value_heads

8

In [8]:
# Display the hidden size stored on the loaded model's config.
model.config.hidden_size

4096

In [None]:
# Load the tokenizer saved with the compiled model and reuse EOS as the pad token.
tokenizer = AutoTokenizer.from_pretrained(traced_model_path)
tokenizer.pad_token = tokenizer.eos_token

# Start from the checkpoint's generation defaults and override decoding settings.
# NOTE(review): with do_sample=False, Hugging Face ignores temperature/top_k and
# warns about them; confirm whether greedy decoding is the intent.
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config_kwargs = {
    "do_sample": False,
    "temperature": 0.9,
    "top_k": 5,
    "pad_token_id": tokenizer.pad_token_id,
}
generation_config.update(**generation_config_kwargs)

# Wrap the Neuron model so it exposes the Hugging Face generate() API.
generation_model = HuggingFaceGenerationAdapter(model)

messages = [{'role': 'user', 'content': "What's your name?"}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
)
inputs = tokenizer([text], return_tensors="pt")
input_ids = inputs['input_ids']

# Pass the generation config and the attention mask explicitly. Without them
# the settings built above are never applied, and generate() has to infer the
# mask even though the pad token equals the EOS token.
outputs = generation_model.generate(
    input_ids=input_ids,
    attention_mask=inputs['attention_mask'],
    generation_config=generation_config,
    max_new_tokens=512
)

In [10]:
# Drop the prompt tokens so only the newly generated ids remain.
prompt_length = len(inputs.input_ids[0])
output_ids = outputs[0][prompt_length:].tolist()

# Split point: one past the last 151668 (</think>) token, or 0 when it is absent.
index = next(
    (pos + 1 for pos in reversed(range(len(output_ids))) if output_ids[pos] == 151668),
    0,
)

# Everything up to the split is reasoning; the rest is the final answer.
thinking_ids, answer_ids = output_ids[:index], output_ids[index:]
thinking_content = tokenizer.decode(thinking_ids, skip_special_tokens=True).strip("\n")
content = tokenizer.decode(answer_ids, skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

thinking content: 
content: My name is Qwen, and I'm a large language model developed by Alibaba Cloud. How can I assist you today?


# Thinking example

In [11]:
# Reset the model before the next, independent generation run
# (presumably clears cached generation state; confirm against the NxDI docs).
model.reset()

In [None]:
# Load the tokenizer saved with the compiled model and reuse EOS as the pad token.
tokenizer = AutoTokenizer.from_pretrained(traced_model_path)
tokenizer.pad_token = tokenizer.eos_token

# Start from the checkpoint's generation defaults and override decoding settings.
# NOTE(review): with do_sample=False, Hugging Face ignores temperature/top_k and
# warns about them; confirm whether greedy decoding is the intent.
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config_kwargs = {
    "do_sample": False,
    "temperature": 0.9,
    "top_k": 5,
    "pad_token_id": tokenizer.pad_token_id,
}
generation_config.update(**generation_config_kwargs)

# Wrap the Neuron model so it exposes the Hugging Face generate() API.
generation_model = HuggingFaceGenerationAdapter(model)

messages = [{'role': 'system', 'content': "Only think through one example before providing the correct answer"},
             {'role': 'user', 'content': "What is 83 * 110 + 34?"}
        ]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
inputs = tokenizer([text], return_tensors="pt")
input_ids = inputs['input_ids']

# Pass the generation config and the attention mask explicitly. Without them
# the settings built above are never applied, and generate() has to infer the
# mask even though the pad token equals the EOS token.
outputs = generation_model.generate(
    input_ids=input_ids,
    attention_mask=inputs['attention_mask'],
    generation_config=generation_config,
    max_new_tokens=1024
)

In [14]:
# Drop the prompt tokens so only the newly generated ids remain.
prompt_length = len(inputs.input_ids[0])
output_ids = outputs[0][prompt_length:].tolist()

# Split point: one past the last 151668 (</think>) token, or 0 when it is absent.
index = next(
    (pos + 1 for pos in reversed(range(len(output_ids))) if output_ids[pos] == 151668),
    0,
)

# Everything up to the split is reasoning; the rest is the final answer.
thinking_ids, answer_ids = output_ids[:index], output_ids[index:]
thinking_content = tokenizer.decode(thinking_ids, skip_special_tokens=True).strip("\n")
content = tokenizer.decode(answer_ids, skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print('####'*80)
print("content:", content)

thinking content: <think>
Okay, let's see. I need to calculate 83 multiplied by 110 and then add 34 to the result. Hmm, let me break this down step by step. First, I should handle the multiplication part: 83 times 110. 

Wait, multiplying by 110 might be easier if I think of it as multiplying by 100 and then adding 10 times the number. Because 110 is 100 + 10. So, 83 times 100 is 8300, and 83 times 10 is 830. Then adding those two together: 8300 + 830. Let me check that. 8300 plus 800 is 9100, and then plus 30 more would be 9130. So, 83 * 110 equals 9130?

Wait, let me verify that another way. Maybe using the standard multiplication method. Let's write it out:

   83
x110
------
First, multiply 83 by 0 (the units place of 110), which gives 0.
Then multiply 83 by 1 (the tens place of 110), which is 83, but since it's in the tens place, it's actually 830.
Then multiply 83 by 1 (the hundreds place of 110), which is 83, but since it's in the hundreds place, it's 8300.
Adding those together

In [15]:
# Reference value for the arithmetic prompt, to compare with the model's answer.
ans = 83*110+34
ans

9164

# Run Benchmarks

In [None]:
dir = '/opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/lib/python3.10/site-packages/neuronx_distributed_inference/'
!cp modeling_qwen3.py {dir}

#Edit the inference_demo.py file to include the following:

```python
from .modeling_qwen3 import NeuronQwen3ForCausalLM

MODEL_TYPES = {
    "llama": {"causal-lm": NeuronLlamaForCausalLM},
    "mixtral": {"causal-lm": NeuronMixtralForCausalLM},
    "dbrx": {"causal-lm": NeuronDbrxForCausalLM},
    'qwen3': {"causal-lm": NeuronQwen3ForCausalLM}
}
```

In [2]:
# Run the inference_demo CLI for the qwen3 model type with a short prompt,
# requesting a logit-matching accuracy check and a benchmark.
# It uses its own small context/sequence lengths (16/32) and its own
# compiled-model directory, separate from traced_model_path used earlier.
!inference_demo \
    --model-type qwen3 \
    --task-type causal-lm \
    run \
    --model-path /home/ubuntu/model_hf_qwen/qwen/ \
    --compiled-model-path /home/ubuntu/traced_model_qwen/qwen/logit \
    --torch-dtype bfloat16 \
    --tp-degree 8 \
    --batch-size 1 \
    --max-context-length 16 \
    --seq-len 32 \
    --enable-bucketing \
    --pad-token-id 151645 \
    --prompt "To be, or not to be" \
    --check-accuracy-mode logit-matching \
    --benchmark

  from neuronx_distributed.modules.moe.blockwise import (
  from neuronx_distributed.modules.moe.blockwise import (
  from neuronx_distributed.modules.moe.blockwise import (
  from neuronx_distributed_inference.modules.custom_calls import neuron_cumsum
  return fn(*args, **kwargs)
  from neuronx_distributed_inference.modules.attention.gqa import GQA, GroupQueryAttention_QKV
  from neuronx_distributed_inference.modules.attention.gqa import GQA, GroupQueryAttention_QKV
  from neuronx_distributed_inference.modules.attention.gqa import GQA, GroupQueryAttention_QKV
  from neuronx_distributed_inference.modules.attention.attention_base import NeuronAttentionBase
  from neuronx_distributed_inference.modules.attention.attention_base import NeuronAttentionBase
  from neuronx_distributed_inference.models.dbrx.modeling_dbrx import NeuronDbrxForCausalLM
  from neuronx_distributed_inference.models.mixtral.modeling_mixtral import NeuronMixtralForCausalLM
  from .modeling_mllama_vision import NeuronMl