### Classification example inference with Ryzen AI

This example demonstrates the 5 steps of classification model inference on the embedded Neural Processing Unit (NPU) in your AMD Ryzen AI enabled PC. The steps are as follows:

In [1]:
# Import necessary libraries
import os
import torch
import torch.nn as nn
import torchvision
import subprocess
import onnxruntime
import numpy as np
import onnx
import shutil
import time 
from timeit import default_timer as timer
from quark.onnx import ModelQuantizer  
from quark.onnx.quantization.config import Config, get_default_config  
from utils_custom import ImageDataReader, evaluate_onnx_model 
import json  
import shutil
import sys

[32m
[QUARK-INFO]: Checking custom ops library ...[0m
[32m
[QUARK-INFO]: The CPU version of custom ops library already exists.[0m
[32m
[QUARK-INFO]: Checked custom ops library.[0m


#### 1. Get Model

Here, we'll use the resnet50 model as an example. You may choose any classification models train with Imagenet from torchvision.

In [4]:
# ---------------- Model Setup ---------------- #

# Define directories
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

# Load pre-trained ResNet50 model
model = torchvision.models.resnet50(weights="IMAGENET1K_V2")

# Save the model
model.to("cpu")
torch.save(model, os.path.join(models_dir, "resnet50.pt"))


#### 2. Export to ONNX

The model inference with Ryzen AI is based on onnxruntime. The following code is used for exporting a PyTorch model to the ONNX (Open Neural Network Exchange) format. The ONNX file is needed to use the AMD Quark Quantizer.

In [5]:
# Export model to ONNX
dummy_inputs = torch.randn(1, 3, 224, 224)
input_names = ['input']
output_names = ['output']
dynamic_axes = {'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
tmp_model_path = os.path.join(models_dir, "resnet50.onnx")

torch.onnx.export(
    model,
    dummy_inputs,
    tmp_model_path,
    export_params=True,
    opset_version=13,  # Recommended opset
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
)

print(f"✅ Model exported to ONNX at: {tmp_model_path}")

✅ Model exported to ONNX at: models\resnet50.onnx


#### 3. Quantize Model

Using the AMD Quark Quantizer and providing the newly exported ONNX model, we'll quantize the model. The quantization progress will need the calibration data from Imagenet. Download the data from [here](https://huggingface.co/datasets/imagenet-1k/tree/main/data) to download it.
You need to register on Hugging Face and download the following file:
**val_images.tar.gz**.
This file contains a subset of ImageNet images used specifically for calibration.

Once downloaded, move the file to your working directory (val_images) and extract the dataset into the calib_data directory..
Below code will read the images from val_image folder and create a calib_data folder.

In [6]:
#
# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
#
# if len(sys.argv) < 3:
#     print("Usage: python prepare_val_data.py <val_data_path> <calib_data_path>")
#     sys.exit(1)

source_folder = 'val_images'
calib_data_path = 'calib_data'

if not os.path.exists(source_folder):
    print("The provided data path does not exist.")
    sys.exit(1)

files = os.listdir(source_folder)

for filename in files:
    if not filename.startswith('ILSVRC2012_val_') or not filename.endswith(
            '.JPEG'):
        continue

    n_identifier = filename.split('_')[-1].split('.')[0]
    folder_name = n_identifier
    folder_path = os.path.join(source_folder, folder_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    file_path = os.path.join(source_folder, filename)
    destination = os.path.join(folder_path, filename)
    shutil.move(file_path, destination)

print("File organization complete.")

if not os.path.exists(calib_data_path):
    os.makedirs(calib_data_path)

destination_folder = calib_data_path

subfolders = os.listdir(source_folder)

for subfolder in subfolders:
    source_subfolder = os.path.join(source_folder, subfolder)
    destination_subfolder = os.path.join(destination_folder, subfolder)
    os.makedirs(destination_subfolder, exist_ok=True)

    files = os.listdir(source_subfolder)

    if files:
        file_to_copy = files[0]
        source_file = os.path.join(source_subfolder, file_to_copy)
        destination_file = os.path.join(destination_subfolder, file_to_copy)

        shutil.copy(source_file, destination_file)

print("Creating calibration dataset complete.")


File organization complete.
Creating calibration dataset complete.


In [7]:
# ---------------- Quark Quantization ---------------- #

# Define dataset directory
calib_dir = "calib_data" 

# Set input & output ONNX model paths
input_model_path = tmp_model_path
output_model_path = os.path.join(models_dir, "resnet50_quantized.onnx")

# Preprocessing transformations
preprocess = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load dataset
calib_dataset = torchvision.datasets.ImageFolder(root=calib_dir, transform=preprocess)

#Data set 
num_calib_data = 54  
calib_dataset = torch.utils.data.Subset(calib_dataset, range(num_calib_data))

# Define DataLoader for Calibration
calibration_dataloader = torch.utils.data.DataLoader(calib_dataset, batch_size=6, shuffle=False)

# Configure Quark Quantization
quant_config = get_default_config("XINT8")  # Use XINT8 quantization  
config = Config(global_quant_config=quant_config)

# Create an ONNX Quantizer  
quantizer = ModelQuantizer(config)  

# Perform Quark Quantization  
quant_model = quantizer.quantize_model(
    model_input=input_model_path,   
    model_output=output_model_path,   
    calibration_data_reader=ImageDataReader(calibration_dataloader)  # Use ImageDataReader from utils_custom
)

print(f"✅ Quark Quantized model saved at: {output_model_path}")

[32m
[QUARK-INFO]: The input ONNX model models\resnet50.onnx can create InferenceSession successfully[0m


[QUARK_INFO]: Time information:
2025-06-30 11:53:36.817912
[QUARK_INFO]: OS and CPU information:
                                        system --- Windows
                                          node --- xhdnucstr12
                                       release --- 10
                                       version --- 10.0.26100
                                       machine --- AMD64
                                     processor --- AMD64 Family 26 Model 36 Stepping 0, AuthenticAMD
[QUARK_INFO]: Tools version information:
                                        python --- 3.10.0
                                          onnx --- 1.18.0
                                   onnxruntime --- 1.22.0.dev20250626
                                    quark.onnx --- 0.9+1a74724+1a74724
[QUARK_INFO]: Quantized Configuration information:
                                   model_input --- models\resnet50.onnx
                                  model_output --- models\resnet50_quantized.onnx
    

[32m
[QUARK-INFO]: Obtained calibration data with 9 iters[0m
[32m
[QUARK-INFO]: Removed initializers from input[0m
[32m
[QUARK-INFO]: Simplified model sucessfully[0m
[32m
[QUARK-INFO]: Loading model...[0m
[32m
[QUARK-INFO]: The input ONNX model C:/Users/akumar23/AppData/Local/Temp/vai.simp.5bw9altr/model_simp.onnx can run inference successfully[0m
[32m
[QUARK-INFO]: Start CrossLayerEqualization...[0m
[32m
[QUARK-INFO]: CrossLayerEqualization pattern num: 32[0m
[32m
[QUARK-INFO]: Total CrossLayerEqualization steps: 1[0m
[32m
[QUARK-INFO]: CrossLayerEqualization Done.[0m
[32m
[QUARK-INFO]: optimize the model for better hardware compatibility.[0m
[33m
[33m
[32m
[QUARK-INFO]: Start calibration...[0m
[32m
[QUARK-INFO]: Start collecting data, runtime depends on your model size and the number of calibration dataset.[0m
[32m
[QUARK-INFO]: Finding optimal threshold for each tensor using PowerOfTwoMethod.MinMSE algorithm ...[0m
[32m
[QUARK-INFO]: Use all calibration 

[32m
[QUARK-INFO]: The quantized information for all operation types is shown in the table below.[0m
[32m
[QUARK-INFO]: The discrepancy between the operation types in the quantized model and the float model is due to the application of graph optimization.[0m


✅ Quark Quantized model saved at: models\resnet50_quantized.onnx


#### 4. Model inference on CPU / iGPU / NPU with single image

Now we have successfully quantized the model, and we will use the onnxruntime to do the inference on CPU, iGPU and NPU.

In [8]:
from PIL import Image

def load_labels(path):
    with open(path) as f:
        data = json.load(f)
    return np.asarray(data)

def preprocess_image(input):
    normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
  
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Resize((224, 224)),
        normalize,
    ])
    img_tensor = transform(input).unsqueeze(0)
    return img_tensor.numpy()

def softmax(x):
    x = x.reshape(-1)
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def postprocess(result):
    return softmax(np.array(result)).tolist()

labels = load_labels('data/imagenet-simple-labels.json')
image = Image.open('data/dog.jpg')

print("Image size: ", image.size)
input_data = preprocess_image(image)

Image size:  (224, 224)


#### CPU Inference

In [9]:
# Run inference on CPU
onnx_model_path = output_model_path
cpu_options = onnxruntime.SessionOptions()

cpu_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=['CPUExecutionProvider'],
    sess_options=cpu_options,
)

start = timer()
cpu_outputs = cpu_session.run(None, {'input': input_data})
end = timer()

cpu_results = postprocess(cpu_outputs)
inference_time = np.round((end - start) * 1000, 2)
idx = np.argmax(cpu_results)

print('----------------------------------------')
print(f'Final top prediction is: {labels[idx]}')
print('----------------------------------------')
print(f'Inference time: {inference_time} ms')
print('----------------------------------------')

sort_idx = np.flip(np.squeeze(np.argsort(cpu_results)))
print('------------ Top 5 labels are: ----------------------------')
print(labels[sort_idx[:5]])
print('-----------------------------------------------------------')

----------------------------------------
Final top prediction is: Golden Retriever
----------------------------------------
Inference time: 37.91 ms
----------------------------------------
------------ Top 5 labels are: ----------------------------
['Golden Retriever' 'Labrador Retriever' 'Norwich Terrier'
 'Curly-coated Retriever' 'Flat-Coated Retriever']
-----------------------------------------------------------


#### iGPU Inference

In [10]:
#iGPU inference
dml_options = onnxruntime.SessionOptions()

# Create Inference Session to run the quantized model on the iGPU
dml_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers = ['DmlExecutionProvider'],
    provider_options = [{"device_id": "0"}]
)
start = time.time()
dml_outputs = dml_session.run(None, {'input': input_data})
end = time.time()

dml_results = postprocess(dml_outputs)
inference_time = np.round((end - start) * 1000, 2)
idx = np.argmax(dml_results)

print('----------------------------------------')
print('Final top prediction is: ' + labels[idx])
print('----------------------------------------')

print('----------------------------------------')
print('Inference time: ' + str(inference_time) + " ms")
print('----------------------------------------')

sort_idx = np.flip(np.squeeze(np.argsort(dml_results)))
print('------------ Top 5 labels are: ----------------------------')
print(labels[sort_idx[:5]])
print('-----------------------------------------------------------') 

----------------------------------------
Final top prediction is: Golden Retriever
----------------------------------------
----------------------------------------
Inference time: 101.22 ms
----------------------------------------
------------ Top 5 labels are: ----------------------------
['Golden Retriever' 'Labrador Retriever' 'Norwich Terrier'
 'Curly-coated Retriever' 'Flat-Coated Retriever']
-----------------------------------------------------------


#### NPU Inference

In [11]:
# set the RYZEN_AI_INSTALLATION_PATH location.
# Reader can find the installation path either under C:\Program Files or the path defined at the time of installation.
# example
os.environ['RYZEN_AI_INSTALLATION_PATH']='C:\Program Files\RyzenAI\1.5.0-0627'

In [None]:
#NPU inference

# Before running, we need to set the ENV variable for the specific NPU we have
# Run pnputil as a subprocess to enumerate PCI devices
command = r'pnputil /enum-devices /bus PCI /deviceids '
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
# Check for supported Hardware IDs
npu_type = ''
if 'PCI\\VEN_1022&DEV_1502&REV_00' in stdout.decode(): npu_type = 'PHX/HPT'
if 'PCI\\VEN_1022&DEV_17F0&REV_00' in stdout.decode(): npu_type = 'STX'
if 'PCI\\VEN_1022&DEV_17F0&REV_10' in stdout.decode(): npu_type = 'STX'
if 'PCI\\VEN_1022&DEV_17F0&REV_11' in stdout.decode(): npu_type = 'STX'

print(f"APU Type: {npu_type}")

install_dir = os.environ['RYZEN_AI_INSTALLATION_PATH']
print(install_dir)

match npu_type:
    case 'PHX/HPT':
        print("Setting provider options for PHX/HPT")
        xclbin_file = os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'phoenix', '4x4.xclbin')
        provider_options = [{
              'target': 'X1',
              'xclbin': xclbin_file,
              'ai_analyzer_visualization': True,
              'ai_analyzer_profiling': True,
          }]
    case 'STX':
        print("Setting provider options for STX")
        provider_options = [{
              'ai_analyzer_visualization': True,
              'ai_analyzer_profiling': True,
          }]
    case _:
        print("Unrecognized APU type. Exiting.")
        exit()

npu_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers = ['VitisAIExecutionProvider'],
    provider_options = provider_options
)

start = time.time()
npu_outputs = npu_session.run(None, {'input': input_data})
end = time.time()

npu_results = postprocess(npu_outputs)
inference_time = np.round((end - start) * 1000, 2)
idx = np.argmax(npu_results)

print('----------------------------------------')
print('Final top prediction is: ' + labels[idx])
print('----------------------------------------')

print('----------------------------------------')
print('Inference time: ' + str(inference_time) + " ms")
print('----------------------------------------')

sort_idx = np.flip(np.squeeze(np.argsort(npu_results)))
print('------------ Top 5 labels are: ----------------------------')
print(labels[sort_idx[:5]])
print('-----------------------------------------------------------')

APU Type: STX
C:\Program Files\RyzenAI.5.0-0627
Setting xclbin file for STX
C:\Program Files\RyzenAI\1.5.0-0627\voe-4.0-win_amd64\xclbins\strix\AMD_AIE2P_4x4_Overlay.xclbin
----------------------------------------
Final top prediction is: Golden Retriever
----------------------------------------
----------------------------------------
Inference time: 12.82 ms
----------------------------------------
------------ Top 5 labels are: ----------------------------
['Golden Retriever' 'Labrador Retriever' 'Norwich Terrier'
 'Curly-coated Retriever' 'Flat-Coated Retriever']
-----------------------------------------------------------


#### 5. Model Analysis on NPU

After NPU inference, there are several '.json' files generated by the Ryzen AI tracing tool, which could be open by the AI Analyzer for further optimization.

In [1]:
!aianalyzer ./ -p 8001

^C
