In [1]:
import numpy as np
import onnxruntime
import onnx
from onnxruntime.quantization import QuantFormat, QuantType, StaticQuantConfig, quantize, CalibrationMethod
from onnxruntime.quantization import CalibrationDataReader
from PIL import Image

# Record the library versions this notebook was executed with.
print(f"onnx version:  {onnx.__version__}")
print(f"onnxruntime version:  {onnxruntime.__version__}")


onnx version:  1.14.1
onnxruntime version:  1.17.0


In [2]:
# Requires the `onnxsim` command-line tool (pip install onnxsim).
# Simplify model.onnx into model-sim.onnx, overwriting the model input shape
# with the fixed shape 1,3,128,128 (this input shape was tested by `test_size.py`).
!onnxsim model.onnx model-sim.onnx --overwrite-input-shape 1,3,128,128

Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m                  [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add                │ 15             │ 15               │
│ BatchNormalization │ 49             │ [1;32m1               [0m │
│ Concat             │ 1              │ 1                │
│ Constant           │ 300            │ [1;32m148             [0m │
│ Conv               │ 72             │ 72               │
│ ConvTranspose      │ 2              │ 2                │
│ GlobalAveragePool  │ 8              │ 8                │
│ HardSigmoid        │ 8              │ 8                │
│ HardSwish          │ 20             │ 20               │
│ Mul                │ 8              │ 8                │
│ Relu               │ 22             │ 22               │
│ Resize             │ 6

In [3]:
# Pre-process the simplified model (model-sim.onnx) with onnxruntime's
# quantization preprocess module; the result is written to model-preprocessed.onnx.
!python -m onnxruntime.quantization.preprocess --input model-sim.onnx --output model-preprocessed.onnx

In [4]:
# Model that is read for calibration and quantization.
input_model_path = "model-preprocessed.onnx"
# File the quantized model is written to.
output_model_path = "model-quant.onnx"

# No calibration image folder is configured.
calibration_dataset_path = None

In [5]:
# Reference: X-CUBE-AI 8.1.0 documentation (quantization.html, anchor ref_onnx_static_quant), local copy:
# file:///Users/tony/STM32Cube/Repository/Packs/STMicroelectronics/X-CUBE-AI/8.1.0/Documentation/quantization.html#ref_onnx_static_quant
# Note: for a quick evaluation of inference time and memory footprint, the XXXDataReader object can be changed to generate fake images from random data.
# IMPORTANT! In a real scenario, calibrate with real data instead of random data.

class DetDataReader(CalibrationDataReader):
    """Calibration data reader that feeds random tensors to the quantizer.

    The reader opens ``model_path`` once to discover the name and shape of the
    model's first input, then pre-generates ``num_samples`` random float32
    tensors of shape ``(1, channel, height, width)`` with values in ``[0.0, 1.0)``.

    Random data is only suitable for a quick evaluation of inference time and
    memory footprint. Use real data for calibration in a real scenario.

    Args:
        calibration_image_folder: Accepted for interface compatibility but not
            used; no images are read from disk by this reader.
        model_path: Path of the ONNX model whose first input is inspected.
        num_samples: Number of random calibration tensors to generate
            (defaults to 20). Must be at least 1.
        seed: Optional seed for the random generator. ``None`` (the default)
            gives non-reproducible data; pass an integer for repeatable
            calibration runs.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1, if the first model
            input is not 4-dimensional, or if its channel/height/width
            dimensions are not fixed positive integers.
    """

    def __init__(self, calibration_image_folder: str, model_path: str,
                 num_samples: int = 20, seed=None):
        if num_samples < 1:
            raise ValueError(f"num_samples must be >= 1, got {num_samples}")

        self.enum_data = None

        # The session is only used to read input metadata. Naming the CPU
        # provider explicitly avoids the "providers must be set" error raised
        # by onnxruntime builds that ship more than one execution provider.
        session = onnxruntime.InferenceSession(
            model_path, providers=["CPUExecutionProvider"]
        )
        model_input = session.get_inputs()[0]

        dims = model_input.shape
        if len(dims) != 4:
            raise ValueError(
                f"expected a 4-D model input, got shape {dims!r}"
            )
        # The leading (batch) dimension is ignored; every sample has batch size 1.
        _, channel, height, width = dims
        if not all(isinstance(d, int) and d > 0 for d in (channel, height, width)):
            # Symbolic dimensions show up as strings or None and cannot be
            # used to allocate calibration tensors.
            raise ValueError(
                f"model input shape {dims!r} has non-static dimensions; "
                "fix the input shape before calibration"
            )

        # Random float32 data in the half-open interval [0.0, 1.0).
        # NOTE: the attribute keeps its historical "nhwc" name, but the tensors
        # are laid out as (1, channel, height, width).
        rng = np.random.default_rng(seed)
        self.nhwc_data_list = [
            rng.random((1, channel, height, width), dtype=np.float32)
            for _ in range(num_samples)
        ]

        self.input_name = model_input.name
        self.datasize = len(self.nhwc_data_list)

    def get_next(self):
        """Return the next ``{input_name: tensor}`` feed, or ``None`` when exhausted."""
        if self.enum_data is None:
            # Lazily (re)create the iterator so rewind() only has to reset it.
            self.enum_data = iter(
                {self.input_name: tensor} for tensor in self.nhwc_data_list
            )
        return next(self.enum_data, None)

    def rewind(self):
        """Restart iteration so the next get_next() call yields the first sample again."""
        self.enum_data = None

In [6]:
# Calibration reader for the model being quantized. No image folder is
# configured; the reader generates its own random input tensors.
dr = DetDataReader(
    calibration_image_folder=calibration_dataset_path,
    model_path=input_model_path,
)

# Static quantization in QDQ format with MinMax calibration, signed 8-bit
# activations and weights, and per-channel quantization disabled.
# Individual nodes can be selected or skipped with
# nodes_to_quantize=[...] / nodes_to_exclude=[...] (e.g. 'resnetv17_dense0_fwd').
conf = StaticQuantConfig(
    dr,
    calibrate_method=CalibrationMethod.MinMax,
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    per_channel=False,
)

# Quantize the input model and write the result to the output path.
quantize(input_model_path, output_model_path, conf)

  quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32)
