In [48]:
!python -m pip install -r requirements.txt



In [49]:
import torch
import torch.nn as nn
import os
import subprocess
import onnxruntime
import numpy as np
import onnx
import shutil
from timeit import default_timer as timer
import vai_q_onnx

In [50]:
def get_apu_info():
    # Run pnputil as a subprocess to enumerate PCI devices
    command = r'pnputil /enum-devices /bus PCI /deviceids '
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    # Check for supported Hardware IDs
    apu_type = ''
    if 'PCI\\VEN_1022&DEV_1502&REV_00' in stdout.decode(): apu_type = 'PHX/HPT'
    if 'PCI\\VEN_1022&DEV_17F0&REV_00' in stdout.decode(): apu_type = 'STX'
    if 'PCI\\VEN_1022&DEV_17F0&REV_10' in stdout.decode(): apu_type = 'STX'
    if 'PCI\\VEN_1022&DEV_17F0&REV_11' in stdout.decode(): apu_type = 'STX'
    return apu_type
apu_type = get_apu_info()
print(f"APU Type: {apu_type}")

APU Type: PHX/HPT


In [51]:
def set_environment_variable(apu_type):

    install_dir = os.environ['RYZEN_AI_INSTALLATION_PATH']
    match apu_type:
        case 'PHX/HPT':
            print("Setting environment for PHX/HPT")
            os.environ['XLNX_VART_FIRMWARE']= os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'phoenix', '1x4.xclbin')
            os.environ['NUM_OF_DPU_RUNNERS']='1'
            os.environ['XLNX_TARGET_NAME']='AMD_AIE2_Nx4_Overlay'
        case 'STX':
            print("Setting environment for STX")
            os.environ['XLNX_VART_FIRMWARE']= os.path.join(install_dir, 'voe-4.0-win_amd64', 'xclbins', 'strix', 'AMD_AIE2P_Nx4_Overlay.xclbin')
            os.environ['NUM_OF_DPU_RUNNERS']='1'
            os.environ['XLNX_TARGET_NAME']='AMD_AIE2_Nx4_Overlay'
        case _:
            print("Unrecognized APU type. Exiting.")
            exit()
    print('XLNX_VART_FIRMWARE=', os.environ['XLNX_VART_FIRMWARE'])
    print('NUM_OF_DPU_RUNNERS=', os.environ['NUM_OF_DPU_RUNNERS'])
    print('XLNX_TARGET_NAME=', os.environ['XLNX_TARGET_NAME'])

os.environ['RYZEN_AI_INSTALLATION_PATH'] = "/home/user/ryzen_ai-1.4.0"

set_environment_variable(apu_type)

Setting environment for PHX/HPT
XLNX_VART_FIRMWARE= /home/user/ryzen_ai-1.4.0\voe-4.0-win_amd64\xclbins\phoenix\1x4.xclbin
NUM_OF_DPU_RUNNERS= 1
XLNX_TARGET_NAME= AMD_AIE2_Nx4_Overlay


Model

In [52]:
class WeightedSumConv1D(nn.Module):
    def __init__(self, weights: torch.Tensor):
        super().__init__()
        P = weights.shape[1]
        self.conv1d = nn.Conv1d(3, P, kernel_size=1, bias=False)
        self.conv1d.weight.data = weights.T.unsqueeze(-1)
        self.conv1d.weight.requires_grad = False

    def forward(self, x):
        out = self.conv1d(x)          # (1, P, M)
        return out.squeeze(0) # torch.mul(out, scale).squeeze(0)  # (P, M)

In [53]:
class CosineLayer(nn.Module):
    def forward(self, x):
        return torch.cos(x)

class SineLayer(nn.Module):
    def forward(self, x):
        return torch.sin(x)

In [54]:
class ElementwiseMultiply(nn.Module):
    def forward(self, x, y):
        return torch.mul(x, y)

class Subtract(nn.Module):
    def forward(self, x, y):
        return torch.sub(x, y)

class Average(nn.Module):
    def __init__(self, M):
        super().__init__()
        self.pool = nn.AvgPool1d(kernel_size=M)

    def forward(self, x):
        out = self.pool(x) # (M, 1)
        out = out.squeeze(-1) # (M,)
        return out

In [55]:
class RealImagDiffAverage(nn.Module):
    def __init__(self, M):
        super().__init__()
        self.cos = CosineLayer()
        self.sin = SineLayer()
        self.mul = ElementwiseMultiply()
        self.sub = Subtract()
        # self.avg = Average(M)

    def forward(self, x, input1, input2, scale):
        """
        x: Tensor of shape (P, M)
        input1: Tensor of shape (M) - multiplier for the real part (cos)
        input2: Tensor of shape (M) - multiplier for the imaginary part (sin)

        Output: Tensor of shape (P) - final result after the operations
        """
        # Multiply by the scale factor
        A = self.mul(x, scale.view(-1, 1))

        # Apply cosine and sine element-wise to each row of x
        real = self.cos(A)  # shape: (P, M)
        imag = self.sin(A)   # shape: (P, M)

        # Multiply element-wise with input1 and input2 respectively
        R = self.mul(real, input1)  # shape: (P, M)
        I = self.mul(imag, input2)  # shape: (P, M)

        # Subtract R and I element-wise
        diff = self.sub(R, I)  # shape: (P, M)

        # Average the result over the M dimension (mean of each row)
        return torch.mean(diff, dim=1)  # shape: (P,)

In [56]:
class CombinedModule(nn.Module):
    def __init__(self, weights: torch.Tensor, M):
        super().__init__()
        self.weighted_sum = WeightedSumConv1D(weights)
        self.real_imag = RealImagDiffAverage(M)

    def forward(self, x, input1, input2, scale):
        out = self.weighted_sum(x)
        return self.real_imag(out, input1, input2, scale)

In [57]:
import os
import zipfile

extract_dir = "unzipped_files"  # or any directory you want to extract to
output = "unzipped_files/inputfiles.zip"

# Unzip the file
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Files extracted to: {extract_dir}")

Files extracted to: unzipped_files


In [58]:
import numpy as np
SPEED_OF_LIGHT = 299792458.0

def read_npy_data(path):
    baselines = np.load(f"{path}/baselines.npy")
    visibilities = np.load(f"{path}/vis.npy")[0]
    frequency = np.load(f"{path}/freq.npy")
    return (frequency, visibilities, baselines)
npix_l, npix_m = 128, 128
frequency, visibilities, baselines = read_npy_data(path=extract_dir)
visR, visI = np.real(visibilities), np.imag(visibilities)
u, v, w = [baselines[:, :, i] for i in range(3)]
l, m = np.meshgrid(np.linspace(-1, 1, npix_l), np.linspace(1, -1, npix_m))
with np.errstate(all='ignore'):
    n = np.sqrt(1 - l**2 - m**2) - 1
    nan_mask = np.isnan(n)
    n = np.nan_to_num(n) # or else it doesnt work
print("Visibilities")
print("visR:", visR.shape, "visI:",  visI.shape)
print("Baselines")
print("u:", u.shape, "v:", v.shape, "w:", w.shape)
print("Frequency")
print("freq:", frequency.shape, "=", frequency[0])
print("LMN")
print("l:", l.shape, "m:", m.shape, "n:", n.shape)

Visibilities
visR: (96, 96) visI: (96, 96)
Baselines
u: (96, 96) v: (96, 96) w: (96, 96)
Frequency
freq: (1,) = 58593750.0
LMN
l: (128, 128) m: (128, 128) n: (128, 128)


In [59]:
# Define dimensions
M = 96**2  # Size of each input vector
P = npix_l*npix_m  # Number of projections

# Create random NumPy inputs
baselinesInputNP = np.array([u.flatten(), v.flatten(), w.flatten()]).astype(np.float32) # (3, M)
realVisInputNP = visR.flatten().astype(np.float32)       # (M,)
imagVisInputNP = visI.flatten().astype(np.float32)       # (M,)
factor = -2*frequency[0]*np.pi/SPEED_OF_LIGHT            # scalar
scaleNP = np.repeat(factor, P).astype(np.float32)        # (P,)
weightsNP = np.array([l.flatten(), m.flatten(), n.flatten()]).astype(np.float32)   # (3, P)

# Convert NumPy arrays to PyTorch tensors
baselinesInput = torch.from_numpy(baselinesInputNP)
realVisInput = torch.from_numpy(realVisInputNP)
imagVisInput = torch.from_numpy(imagVisInputNP)
scale = torch.from_numpy(scaleNP)
weights = torch.from_numpy(weightsNP)

Declaring Model

In [60]:
torch_model = CombinedModule(weights, M)
torch_model.eval()

CombinedModule(
  (weighted_sum): WeightedSumConv1D(
    (conv1d): Conv1d(3, 16384, kernel_size=(1,), stride=(1,), bias=False)
  )
  (real_imag): RealImagDiffAverage(
    (cos): CosineLayer()
    (sin): SineLayer()
    (mul): ElementwiseMultiply()
    (sub): Subtract()
  )
)

Export to ONNX

In [61]:
x, input1, input2, scale = (torch.randn(1, 3, M), torch.randn(M), torch.randn(M), torch.randn(P))
inputs = {"x": x, "input1": input1, "input2": input2, "scale": scale}
dynamic_axes = {"input": {0: "batch_size"}, "output": {0: "batch_size"}}

model_path = "models/lofty.onnx"

# Call export function
torch.onnx.export(
        torch_model,
        inputs,
        model_path,
        export_params=True,
        opset_version=13,  # Recommended opset
        input_names=['input'],
        output_names=['output'],
        dynamic_axes=dynamic_axes,
    )

In [62]:
# pip install cmake amd-quark (this is how you install quark)
from quark.onnx.quantization.config import Config, get_default_config

from quark.onnx import ModelQuantizer

# `input_model_path` is the path to the original, unquantized ONNX model.
input_model_path = "models/lofty.onnx"

# `output_model_path` is the path where the quantized model will be saved.
output_model_path = "models/lofty_quantized.onnx"

# Use default quantization configuration
quant_config = get_default_config("XINT8")
quant_config.extra_options["UseRandomData"] = True
# Defines the quantization configuration for the whole model
config = Config(global_quant_config=quant_config)
print("The configuration of the quantization is {}".format(config))

# Create an ONNX Quantizer
quantizer = ModelQuantizer(config)

# Quantize the ONNX model
quant_model = quantizer.quantize_model(model_input = input_model_path,
                                       model_output = output_model_path,
                                       calibration_data_path = None)

print('Calibrated and quantized model saved at:', output_model_path)

[32m
[QUARK-INFO]: The input ONNX model models/lofty.onnx can create InferenceSession successfully[0m
[32m
[QUARK-INFO]: Random input name input shape [1, 3, 9216] type <class 'numpy.float32'> [0m
[32m
[QUARK-INFO]: Random input name onnx::Mul_1 shape [9216] type <class 'numpy.float32'> [0m
[32m
[QUARK-INFO]: Random input name onnx::Mul_2 shape [9216] type <class 'numpy.float32'> [0m
[32m
[QUARK-INFO]: Random input name onnx::Reshape_3 shape [16384] type <class 'numpy.float32'> [0m
[32m
[QUARK-INFO]: Obtained calibration data with 1 iters[0m
[32m
[QUARK-INFO]: Removed initializers from input[0m
[32m
[QUARK-INFO]: Simplified model sucessfully[0m
[32m
[QUARK-INFO]: Duplicate the shared initializers in the model for separate quantization use across different nodes![0m
[32m
[QUARK-INFO]: Loading model...[0m


[QUARK_INFO]: Time information:
2025-06-03 13:03:53.646577
[QUARK_INFO]: OS and CPU information:
                                        system --- Windows
                                          node --- xir-xup-w25
                                       release --- 10
                                       version --- 10.0.26100
                                       machine --- AMD64
                                     processor --- AMD64 Family 25 Model 116 Stepping 1, AuthenticAMD
[QUARK_INFO]: Tools version information:
                                        python --- 3.10.17
                                          onnx --- 1.16.1
                                   onnxruntime --- 1.20.1
                                    quark.onnx --- 0.8+2fc870b
[QUARK_INFO]: Quantized Configuration information:
                                   model_input --- models/lofty.onnx
                                  model_output --- models/lofty_quantized.onnx
                       calib

[32m
[QUARK-INFO]: The input ONNX model C:/Users/mruiz/AppData/Local/Temp/vai.cpinit.33lo1k8z/model_cpinit.onnx can run inference successfully[0m
[32m
[QUARK-INFO]: optimize the model for better hardware compatibility.[0m
[33m
[33m
[32m
[QUARK-INFO]: Start calibration...[0m
[32m
[QUARK-INFO]: Start collecting data, runtime depends on your model size and the number of calibration dataset.[0m
[32m
[QUARK-INFO]: Finding optimal threshold for each tensor using PowerOfTwoMethod.MinMSE algorithm ...[0m
[32m
[QUARK-INFO]: Use all calibration data to calculate min mse[0m
Computing range: 100%|██████████| 14/14 [01:31<00:00,  6.53s/tensor]
[32m
[QUARK-INFO]: Finished the calibration of PowerOfTwoMethod.MinMSE which costs 94.4s[0m
[32m
[QUARK-INFO]: Remove QuantizeLinear & DequantizeLinear on certain operations(such as conv-relu).[0m
[33m
[32m
[QUARK-INFO]: Adjust the quantize info to meet the compiler constraints[0m


The operation types and their corresponding quantities of the input float model is shown in the table below.


The quantized information for all operation types is shown in the table below.
The discrepancy between the operation types in the quantized model and the float model is due to the application of graph optimization.


Calibrated and quantized model saved at: models/lofty_quantized.onnx


Run in CPU

In [63]:
# Specify the path to the quantized ONNZ Model
quantized_model_path = r'models/lofty_quantized.onnx' # quatized model doesnt work, but the normal does, weird
model = onnx.load(quantized_model_path)

# Create some random input data for testing
input_data = {"input": baselinesInputNP[np.newaxis, :, :], "onnx::Mul_1": realVisInputNP, "onnx::Mul_2": imagVisInputNP, "onnx::Reshape_3": scaleNP}

cpu_options = onnxruntime.SessionOptions()

# Create Inference Session to run the quantized model on the CPU
cpu_session = onnxruntime.InferenceSession(
    model.SerializeToString(),
    providers = ['CPUExecutionProvider'],
    sess_options=cpu_options,
)

for input_info in cpu_session.get_inputs():
    print(f"Input name: {input_info.name}, shape: {input_info.shape}")

# Run Inference
start = timer()
cpu_results = cpu_session.run(None, input_data)
cpu_total = timer() - start

Input name: input, shape: ['batch_size', 3, 9216]
Input name: onnx::Mul_1, shape: [9216]
Input name: onnx::Mul_2, shape: [9216]
Input name: onnx::Reshape_3, shape: [16384]


Run in NPU

In [64]:
# We want to make sure we compile everytime, otherwise the tools will use the cached version
# Get the current working directory
current_directory = os.getcwd()
directory_path = os.path.join(current_directory,  r'cache\hello_cache')
cache_directory = os.path.join(current_directory,  r'cache')

# Check if the directory exists and delete it if it does.
if os.path.exists(directory_path):
    shutil.rmtree(directory_path)
    print(f"Directory deleted successfully. Starting Fresh.")
else:
    print(f"Directory '{directory_path}' does not exist.")

Directory 'C:\Users\mruiz\npucloud_userdata\antonio-fortanet-capetillo-tudelft\ryzenaisw\cache\hello_cache' does not exist.


In [65]:
install_dir = os.environ['RYZEN_AI_INSTALLATION_PATH']
config_file_path = os.path.join(install_dir, 'voe-4.0-win_amd64', 'vaip_config.json') # Path to the NPU config file

aie_options = onnxruntime.SessionOptions()

aie_session = onnxruntime.InferenceSession(
    model.SerializeToString(),
    providers=['VitisAIExecutionProvider'],
    sess_options=aie_options,
    provider_options = [{'config_file': config_file_path,
                         'cacheDir': cache_directory,
                         'cacheKey': 'hello_cache'}]
)

RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Exception during initialization: private: static void __cdecl google::protobuf::FieldDescriptor::TypeOnceInit(class google::protobuf::FieldDescriptor const * __ptr64)
public: virtual unsigned char * __ptr64 __cdecl google::protobuf::internal::ZeroFieldsBase::_InternalSerialize(unsigned char * __ptr64,class google::protobuf::io::EpsCopyOutputStream * __ptr64)const __ptr64
__CxxFrameHandler4
(unknown)
RtlCaptureContext2
public: static class vaip_core::ConfigProto __cdecl vaip_core::Config::parse_from_string(char const * __ptr64)
class std::vector<class std::unique_ptr<class vaip_core::ExecutionProvider,struct std::default_delete<class vaip_core::ExecutionProvider> >,class std::allocator<class std::unique_ptr<class vaip_core::ExecutionProvider,struct std::default_delete<class vai
class std::vector<class std::unique_ptr<class vaip_core::ExecutionProvider,struct std::default_delete<class vaip_core::ExecutionProvider> >,class std::allocator<class std::unique_ptr<class vaip_core::ExecutionProvider,struct std::default_delete<class vai
compile_onnx_model_vitisai_ep_with_error_handling
(unknown)
(unknown)
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
PyInit_onnxruntime_pybind11_state
public: void __cdecl pybind11::error_already_set::discard_as_unraisable(class pybind11::object) __ptr64
PyCFunction_GetFlags
_PyObject_MakeTpCall
PyMethod_Self
_PyOS_URandomNonblock
PyEval_GetFuncDesc
_PyEval_EvalFrameDefault
_PyEval_EvalFrameDefault
_PyFunction_Vectorcall
_PyOS_URandomNonblock
PyEval_GetFuncDesc
_PyEval_EvalFrameDefault


In [44]:
# Run Inference
start = timer()
npu_results = aie_session.run(None, input_data)
npu_total = timer() - start

NameError: name 'aie_session' is not defined