In [1]:

# Install dependencies
!pip install -q torch torchvision onnx onnxruntime-gpu onnxruntime-tools onnxconverter-common

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m121.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Import packages
import torch
import torchvision.models as models
import numpy as np
import onnx
import os
import time
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxconverter_common import float16
import onnxruntime as ort

In [3]:
# Fetch the pretrained ResNet18 and switch it to inference mode
# (eval() returns the module itself, so the chained form binds the same object).
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).eval()

# Example input handed to the exporter: one 3x224x224 image with random values.
dummy_input = torch.randn(1, 3, 224, 224)

# Mark dimension 0 of both the input and the output as a named dynamic axis,
# so the exported graph is not tied to the batch size of the example input.
dynamic_batch_axes = {tensor_name: {0: "batch_size"} for tensor_name in ("input", "output")}

torch.onnx.export(
    model,
    dummy_input,
    "resnet-18.onnx",
    opset_version=11,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes=dynamic_batch_axes,
)

print("Exported resnet-18.onnx")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 186MB/s]


Exported resnet-18.onnx


In [4]:
# Source (FP32) and destination (FP16) model files.
# NOTE: despite the "_quantized" file name, the step below is a float32 -> float16
# cast of the graph, not integer quantization.
fp32_path = "resnet-18.onnx"
fp16_path = "resnet-18_quantized.onnx"

# Read the FP32 ONNX graph from disk
model = onnx.load(fp32_path)

# Cast it to half precision for lower inference latency
model_fp16 = float16.convert_float_to_float16(model)

# Persist the half-precision model
onnx.save(model_fp16, fp16_path)



In [5]:
# Benchmark function
def benchmark(model_path, runs=100, provider="CUDAExecutionProvider", warmup=15):
    """Measure the average latency of a single inference of an ONNX model.

    A random input of shape (1, 3, 224, 224) is fed to the model's first input.
    The input dtype is taken from the model's declared input type (float16 for
    ``tensor(float16)``, float32 otherwise) rather than guessed from the file
    name, so any FP16 or FP32-input model is fed correctly regardless of how
    the file is called.

    Args:
        model_path: Path to the ``.onnx`` file to load.
        runs: Number of timed inferences to average over. Must be >= 1.
        provider: ONNX Runtime execution provider to request.
        warmup: Number of untimed inferences executed before timing starts.

    Returns:
        The average latency per inference, in milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs}")

    print(f"Benchmarking {model_path}")

    sess = ort.InferenceSession(model_path, providers=[provider])

    # The session may end up on a different provider than the one requested
    # (e.g. CPU when CUDA is unavailable). Surface that, otherwise the printed
    # latency would be attributed to the wrong device.
    active_providers = sess.get_providers()
    if provider not in active_providers:
        print(f"WARNING: {provider} is not active; session is using {active_providers}")

    input_meta = sess.get_inputs()[0]
    input_name = input_meta.name

    # Match the dummy input to the dtype the model actually declares.
    input_dtype = np.float16 if input_meta.type == "tensor(float16)" else np.float32
    dummy = np.random.randn(1, 3, 224, 224).astype(input_dtype)
    feed = {input_name: dummy}

    # Initial runs to get overhead/optimizations out of the way
    for _ in range(warmup):
        sess.run(None, feed)

    # Actual benchmarking
    start = time.perf_counter()
    for _ in range(runs):
        sess.run(None, feed)
    end = time.perf_counter()

    avg_ms = (end - start) / runs * 1000
    print(f"{model_path} avg latency: {avg_ms:.2f} ms\n")
    return avg_ms

In [6]:
# Time the FP32 export first, then the FP16 file saved as "resnet-18_quantized.onnx",
# using benchmark()'s default execution provider.
for onnx_path in ("resnet-18.onnx", "resnet-18_quantized.onnx"):
    benchmark(onnx_path)

Benchmarking resnet-18.onnx
resnet-18.onnx avg latency: 3.70 ms

Benchmarking resnet-18_quantized.onnx
resnet-18_quantized.onnx avg latency: 1.90 ms

