---
title: "Speeding up Inference with Onnx and TensorRT"
author: "chris"
date: 2024-05-27
draft: false
---

This is a follow-up to the post on PyTorch quantization: can we make inference even faster with a GPU and TensorRT?

First, build the same Faster R-CNN as before, with a ResNet-101 backbone.

In [None]:
%%capture
import torch
from torchvision.models.resnet import ResNet, Bottleneck, ResNet101_Weights
from torchvision.models._utils import IntermediateLayerGetter
from torchvision.models.detection.backbone_utils import BackboneWithFPN
from torchvision.models.detection.faster_rcnn import FasterRCNN


def resnet_101():
    """Build a torchvision ResNet-101 with its default pretrained weights.

    Returns:
        A ``ResNet`` made of ``Bottleneck`` blocks with stage depths
        3/4/23/3, loaded with ``ResNet101_Weights.DEFAULT``.
    """
    model = ResNet(block=Bottleneck, layers=[3, 4, 23, 3])
    pretrained_state = ResNet101_Weights.DEFAULT.get_state_dict(progress=True)
    model.load_state_dict(pretrained_state)
    return model


resnet = resnet_101()

# Same as before: expose the four residual stages to the FPN,
# mapping "layer1".."layer4" to the output names "0".."3".
returned_layers = [1, 2, 3, 4]
return_layers = {
    f"layer{stage}": str(idx) for idx, stage in enumerate(returned_layers)
}
# Output width of each stage, read from the last batch norm (bn3) of the
# stage's final block.
in_channels_list = [
    stage_module[-1].bn3.num_features
    for child_name, stage_module in resnet.named_children()
    if 'layer' in child_name
]

fpn_backbone = BackboneWithFPN(
    backbone=resnet,
    return_layers=return_layers,
    in_channels_list=in_channels_list,
    out_channels=256,
    extra_blocks=None,
    norm_layer=None,
)
rcnn = FasterRCNN(fpn_backbone, num_classes=2)

# Inference mode for everything that follows.
rcnn.eval()

Time the R-CNN on both the CPU and the GPU. I don't recall what the hardware specs were the last time I used Colab to profile inference time, so I'll document them here as well. I'm using a T4 GPU and the following CPU:

In [None]:
# !cat /proc/cpuinfo  | grep 'name' | uniq
!lscpu | grep 'name'

model name	: Intel(R) Xeon(R) CPU @ 2.20GHz


In [None]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-8e4bcaf6-e88c-6df2-f337-32e33d96a494)


In [None]:
# Benchmark input: a single random 3x200x200 image.
image = torch.rand(3, 200, 200)
# Make sure both the model and the input are on the CPU for this measurement.
rcnn.to(torch.device('cpu'))
image_cpu = image.to(torch.device('cpu'))

# Time the forward pass with gradients disabled; `-o` makes %timeit return
# its result object, which is kept in `cpu_time`.
with torch.no_grad():
    cpu_time = %timeit -o rcnn([image_cpu])

1.86 s ± 190 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
from copy import deepcopy

# Time a deep copy on the GPU rather than moving `rcnn` itself, so `rcnn`
# stays untouched for the later cells that copy/export it.
rcnn_gpu = deepcopy(rcnn).to(torch.device('cuda'))
image_gpu = image.to(torch.device('cuda'))

# NOTE(review): CUDA work is launched asynchronously; this timing assumes the
# model call blocks until its results are ready -- confirm, or add an explicit
# torch.cuda.synchronize() to the timed statement.
with torch.no_grad():
    gpu_time = %timeit -o rcnn_gpu([image_gpu])

98.8 ms ± 260 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


We can also test with half precision on the GPU.

In [None]:
rcnn_gpu_half = rcnn_gpu.half().to(torch.device('cuda'))
input_half = image_gpu.half()

with torch.no_grad():
    gpu_half_time = %timeit -o rcnn_gpu_half([input_half])

42.7 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Also re-time the quantized model (built with FX Graph Mode quantization), since its performance is also CPU-specific.

In [None]:
%%capture
from torch.ao.quantization import quantize_fx
from torch.ao.quantization.qconfig_mapping import get_default_qconfig_mapping


# Quantize a copy so the float `rcnn` stays available for the other cells.
quant_rcnn = deepcopy(rcnn)

qconfig_mapping = get_default_qconfig_mapping("fbgemm")  # "qnnpack"
# NOTE: no calibration data is run through the prepared backbone between
# prepare_fx and convert_fx below (calibration is assumed to have been
# covered already), so this model is only used here to measure latency.
quant_rcnn.eval()
quant_rcnn.to(torch.device('cpu'))
# Prepare and quantize the backbone only; the rest of the detector stays float.
example_input = torch.randn(1, 3, 200, 200)
# prepare_fx documents `example_inputs` as a tuple of positional arguments,
# so wrap the single tensor in a 1-tuple.
quant_rcnn.backbone = quantize_fx.prepare_fx(quant_rcnn.backbone, qconfig_mapping, (example_input,))
quant_rcnn.backbone = quantize_fx.convert_fx(quant_rcnn.backbone)

# Script the whole detector, save it, and reload it on the CPU; the reloaded
# TorchScript module is the one that gets timed.
script_module = torch.jit.script(quant_rcnn)
script_module.save("./quant_rcnn.pt")
quant_rcnn_jit = torch.jit.load("./quant_rcnn.pt", map_location=torch.device('cpu'))

In [None]:
import warnings

# Warm-up: run the scripted model a few times before timing, with warnings
# silenced. `__` keeps the last warm-up output; a later cell reads it to
# compare output shapes against the ONNX model.
# NOTE(review): the warm-up runs outside torch.no_grad() while the timed call
# runs inside it -- confirm this does not affect what gets warmed up.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for _ in range(3):
        __ = quant_rcnn_jit([image_cpu])

# Time the quantized, scripted model on the CPU; `-o` keeps the %timeit
# result object in `quant_time`.
with torch.no_grad():
    quant_time = %timeit -o quant_rcnn_jit([image_cpu])

1.35 s ± 223 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Convert the float model to ONNX.

In [None]:
!pip install onnx
!pip install onnxruntime



In [None]:
import onnx

# Export from the CPU: move the float model there first.
rcnn.to(torch.device('cpu'))
# HACK: the exporter wants a 2-tuple of args here and fails otherwise, but it
# accepts None as the second element, so the image list is padded with None.
# Known issue: https://github.com/zhiqwang/yolort/issues/485
torch.onnx.export(
    rcnn,
    ([image], None),
    "rcnn.onnx",
    opset_version=11,
)
# Reload the exported file and validate the ONNX proto.
rcnn_onnx = onnx.load("rcnn.onnx")
onnx.checker.check_model(rcnn_onnx)

  (torch.floor((input.size(i + 2).float() * torch.tensor(scale_factors[i], dtype=torch.float32)).float()))
  boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  assert condition, message
  torch.tensor(s, dtype=torch.float32, device=boxes.device)
  / torch.tensor(s_orig, dtype=torch.float32, device=boxes.device)


Run inference on the ONNX model, make sure the outputs are as expected, then time it.

In [None]:
import onnxruntime
import numpy as np

# Load the exported model into an ONNX Runtime session that uses only the
# CPU execution provider.
ort_session = onnxruntime.InferenceSession("rcnn.onnx", providers=["CPUExecutionProvider"])
# good to make sure inputs are as expected with: 'ort_session.get_inputs()'

# onnx wants numpy tensor not torch tensor
def to_numpy(tensor):
    """Return `tensor` as a NumPy array on the CPU.

    A tensor that requires grad is detached from the autograd graph first,
    since ``.numpy()`` cannot be called on it directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Get a prediction. Note the ONNX model takes the image array directly, not
# wrapped in a list like the torch model did. The feed dict is keyed by the
# name of the session's first input.
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(image)}
# Passing None for the output names asks the session for all outputs.
ort_outs = ort_session.run(None, ort_inputs)

In [None]:
# ONNX outputs are a list of three arrays corresponding to 'boxes', 'labels', and 'scores'
print("onnx out shapes: ", [arr.shape for arr in ort_outs])
# `__` is the last warm-up output of the scripted quantized model, which is a
# tuple of (losses, outputs); [1][0] selects the outputs for the single image
torch_outs = __[1][0]
print("torch out shapes: ", [torch_outs[k].shape for k in torch_outs])

onnx out shapes:  [(100, 4), (100,), (100,)]
torch out shapes:  [torch.Size([100, 4]), torch.Size([100]), torch.Size([100])]


In [None]:
# Time ONNX Runtime inference on the CPU with the inputs prepared above;
# `-o` keeps the %timeit result object in `onnx_time`.
onnx_time = %timeit -o ort_session.run(None, ort_inputs)

1.62 s ± 148 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Convert from ONNX to TensorRT (still to do).

In [None]:
# TODO ...
# import tensorrt

In [None]:
#| code-fold: true
# plot latency for all methods (bar chart)