```
The MIT License (MIT)

Copyright (c) 2020 NVIDIA

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

```

# Model Conversion to NVIDIA TensorRT & Inference

Walkthrough of a generic pipeline for:
- Converting a Pytorch network to TensorRT (via ONNX)
- With and without dynamic batch
- Steps for Running inference using a TensorRT engine in Python

#### Environment
All steps executed using **NGC Pytorch Docker (v 20.06)**
* [GPU Dashboards](https://medium.com/rapids-ai/gpu-dashboards-in-jupyter-lab-757b17aae1d5) installed using
```
pip install jupyterlab-nvdashboard
jupyter labextension install jupyterlab-nvdashboard
```
* [Netron](https://github.com/lutzroeder/netron) for network visualization

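#### Dataset
* [NIH ChestXray 14 dataset](https://www.nih.gov/news-events/news-releases/nih-clinical-center-provides-one-largest-publicly-available-chest-x-ray-datasets-scientific-community)
* 112,120 frontal view chest xrays from 30,805 unique subjects
* X-ray images are available as bitmaps extracted from the DICOM file

> Note (to confirm): the cells below load 1-D samples of shape `(1, 1, 32000)` from `covid.npy` and `healthy.npy`, not chest X-ray images, so this dataset description may not apply to the model used in this notebook.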

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from novograd import Novograd
import os


In [2]:
# Maps a predicted class index to its human-readable label.
shirnked_labels={0: 'healthy', 1: 'unknow_rep_illness', 2: 'covid_positive'}
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Load the two sample inputs from .npy files; the last expression displays
# both array shapes as the cell output.
covid_np=np.load('covid.npy')
healthy_np=np.load('healthy.npy')
covid_np.shape, healthy_np.shape

((1, 1, 32000), (1, 1, 32000))

### Model

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
#import torchaudio
import pandas as pd
import numpy as np
from novograd import Novograd
class COVID(nn.Module):
    """1-D convolutional network producing log-probabilities over 3 outputs.

    The network is four Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d stages,
    followed by an average pool, a channel/length swap, a linear layer with
    3 outputs, and a log-softmax over the last axis.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: wide strided convolution over the single input channel.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=128, kernel_size=80, stride=4)
        self.bn1 = nn.BatchNorm1d(num_features=128)
        self.pool1 = nn.MaxPool1d(kernel_size=4)
        # Stage 2
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(num_features=128)
        self.pool2 = nn.MaxPool1d(kernel_size=4)
        # Stage 3
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(num_features=256)
        self.pool3 = nn.MaxPool1d(kernel_size=4)
        # Stage 4
        self.conv4 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(num_features=512)
        self.pool4 = nn.MaxPool1d(kernel_size=4)
        # Input here should be 512x30, so this pool outputs 512x1.
        self.avgPool = nn.AvgPool1d(kernel_size=30)
        self.fc1 = nn.Linear(in_features=512, out_features=3)

    def forward(self, x):
        """Return log-softmax scores (over the last axis) for input ``x``."""
        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
        )
        for conv, norm, pool in stages:
            x = pool(F.relu(norm(conv(x))))

        pooled = self.avgPool(x)
        # Swap the channel and length axes: 512x1 becomes 1x512 for the linear layer.
        features = pooled.permute(0, 2, 1)
        logits = self.fc1(features)
        return F.log_softmax(logits, dim=2)

# Instantiate the network and move its parameters to the selected device.
model = COVID()
model.to(device)
# NovoGrad optimizer over all model parameters, plus a step schedule that
# scales the learning rate by gamma=0.1 every 20 scheduler steps.
optimizer = Novograd(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)



In [4]:
import onnx
# Load the ONNX model from disk
model_onnx = onnx.load("./saved_model/relabelled_covid.onnx")

# Check that the IR is well formed
onnx.checker.check_model(model_onnx)

# Optional: print a human-readable representation of the graph
# onnx.helper.printable_graph(model_onnx.graph)

In [5]:
# List the ONNX file(s) under ./saved_model with human-readable sizes.
!ls -lh ./saved_model/*.onnx

-rw-r--r-- 1 root root 2.2M Nov  3 09:19 ./saved_model/relabelled_covid.onnx


**Check the ONNX network in Netron (at http://localhost:8080/)**

### Test inference

### Pytorch

### ONNX

In [9]:
#Run ONNX inference
import onnxruntime as ort

def onnx_prediction(onnx_model, input_numpy):
    """Run one ONNX Runtime inference and translate the result into a label.

    Args:
        onnx_model: Model handed to ``ort.InferenceSession`` (the notebook
            passes a path to an ``.onnx`` file).
        input_numpy: Array fed to the graph input named ``'input'``.

    Returns:
        The ``shirnked_labels`` entry for the arg-max of the first output.
    """
    session = ort.InferenceSession(onnx_model)

    # The graph input name is hard-coded to 'input'.
    outputs = session.run(None, {'input': input_numpy})
    print(type(outputs))

    first_scores = outputs[0][0]
    print(first_scores, first_scores.shape)

    class_index = np.argmax(first_scores, axis=-1)[0]
    return shirnked_labels[class_index]

# Run the ONNX Runtime prediction on the sample loaded from covid.npy.
output=onnx_prediction('./saved_model/relabelled_covid.onnx', covid_np)

# Show the prediction next to the label expected for this sample.
print("onnx runtime prediction :{} | actual label :{}".format(output, 'covid_positive'))


<class 'list'>
[[-8.9343977e+00 -7.0125051e+00 -1.0328917e-03]] (1, 3)
onnx runtime prediction :covid_positive | actual label :covid_positive


(8, 7)

# TensorRT

### Key Concepts
**Network Definition**: interface provides methods for the application to specify the definition of a network.

**Builder Configuration**: interface specifies details for creating an engine.

**Builder**: allows the creation of an optimized engine from a network definition and a builder configuration.

**Engine**: allows the application to execute inference. It supports synchronous and asynchronous execution, profiling, and enumeration and querying of the bindings for the engine inputs and outputs.

An **Optimization profile** specifies constraints on dynamic dimensions. It describes a range of dimensions for each network input and the dimensions that the auto-tuner should use for optimization. When using runtime dimensions, you must create at least one optimization profile at build time. Two profiles can specify disjoint or overlapping ranges.

For example, one profile might specify a minimum size of [3,100,200], a maximum size of [3,200,300], and optimization dimensions of [3,150,250] while another profile might specify min, max and optimization dimensions of [3,200,100], [3,300,400], and [3,250,250].



> Note, if your TensorRT engine has fixed batch size and input shapes, then you **do not** need to worry about optimization profile(s)



### Note

*Implicit batch* networks were previously the standard up until TensorRT 6. They supported variable batch size through the use of the builder.maxBatchSize attribute, but do not support variable shapes for any of the other dimensions.

*Explicit Batch* networks introduced a few changes to the TensorRT API. 
First, inference is instead performed using execute_v2(bindings) and execute_async_v2(bindings, stream) , which no longer require a batch_size argument since it is taken from the context binding dimensions explicitly.

## Convert Model to TRT

Sample commands

Simple network with no dynamic batch dimension
```
trtexec --explicitBatch \
            --onnx=model_weights.onnx \
            --saveEngine=trt_export.engine 
```

With dynamic batch
``` 
trtexec --explicitBatch \
            --onnx=model_weights_dynamicbatch.onnx \
            --minShapes=input:1x1x48x48 \
            --optShapes=input:4x1x48x48 \
            --maxShapes=input:8x1x48x48 \
            --shapes=input:4x1x48x48 \
            --saveEngine=trt_export_dynamicbatch.engine 
```

For generating an engine in FP16
``` 
trtexec --explicitBatch \
            --onnx=model_weights_dynamicbatch.onnx \
            --minShapes=input:1x1x48x48 \
            --optShapes=input:4x1x48x48 \
            --maxShapes=input:8x1x48x48 \
            --shapes=input:4x1x48x48 \
            --saveEngine=trt_export_dynamicbatch_fp16.engine \
            --fp16
```

In [24]:
# Build a TensorRT engine from the ONNX model with trtexec. The min/opt/max
# shapes are all pinned to 1x1x32000 and the engine is written to
# ./saved_model/covid.engine.
cmd = 'trtexec --explicitBatch --onnx=./saved_model/relabelled_covid.onnx --minShapes=input:1x1x32000 --optShapes=input:1x1x32000 --maxShapes=input:1x1x32000  --shapes=input:1x1x32000 --saveEngine=./saved_model/covid.engine'
print(cmd)
# The value returned by os.system is displayed as the cell output.
os.system(cmd)

trtexec --explicitBatch --onnx=./saved_model/relabelled_covid.onnx --minShapes=input:1x1x32000 --optShapes=input:1x1x32000 --maxShapes=input:1x1x32000  --shapes=input:1x1x32000 --saveEngine=./saved_model/covid.engine


0

In [22]:
# Alternative conversion command using the onnx_to_tensorrt7.py script.
# The command is only printed here; the os.system call below is commented out.
cmd = 'python onnx_to_tensorrt7.py --model=./saved_model/relabelled_covid.onnx --output=covid.engine'
print(cmd)
#os.system(cmd)

python onnx_to_tensorrt7.py --model=./saved_model/relabelled_covid.onnx --output=covid.engine


512

#### Inspect Engine

In [18]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

In [19]:
def load_engine(filename):
    """Deserialize a TensorRT engine from a serialized engine file.

    Args:
        filename: Path of the serialized engine file, opened in binary mode.

    Returns:
        Whatever ``runtime.deserialize_cuda_engine`` returns for the file's bytes.
    """
    with open(filename, 'rb') as engine_file:
        # The runtime logs at WARNING severity.
        trt_logger = trt.Logger(trt.Logger.WARNING)
        with trt.Runtime(trt_logger) as runtime:
            serialized_engine = engine_file.read()
            return runtime.deserialize_cuda_engine(serialized_engine)

def inspect_engine(engine):
    """Pretty-print the binding metadata of ``engine`` grouped by profile.

    Bindings are split evenly across the engine's optimization profiles. For
    each binding the profile, index, shape, dtype, name and INPUT/OUTPUT type
    are reported; input bindings also report their profile shape.

    Args:
        engine: Engine object exposing the binding/profile query methods used below.

    Returns:
        None. The report is printed with ``pprint``.
    """
    from pprint import pprint

    profile_count = engine.num_optimization_profiles
    bindings_per_profile = engine.num_bindings // profile_count

    report = {}
    for profile_index in range(profile_count):
        first = profile_index * bindings_per_profile
        entries = {}
        for binding_index in range(first, first + bindings_per_profile):
            entry = {
                "profile": profile_index,
                "binding_index": binding_index,
                "binding_shape": engine.get_binding_shape(binding_index),
                "binding_dtype": engine.get_binding_dtype(binding_index),
                "binding_name": engine.get_binding_name(binding_index),
            }
            if engine.binding_is_input(binding_index):
                entry["binding_type"] = "INPUT"
                # Only inputs are queried for their profile shape.
                entry["profile_shape"] = engine.get_profile_shape(profile_index, binding_index)
            else:
                entry["binding_type"] = "OUTPUT"
            entries["Binding {}".format(binding_index)] = entry

        report["Profile {}".format(profile_index)] = entries

    pprint(report)

In [25]:
engine_path = './saved_model/covid.engine'
# Deserialize the engine file into an in-memory engine object
engine = load_engine(engine_path)
# Pretty-print per-profile binding metadata (shape, dtype, name, input/output)
inspect_engine(engine)

{'Profile 0': {'Binding 0': {'binding_dtype': DataType.FLOAT,
                             'binding_index': 0,
                             'binding_name': 'input',
                             'binding_shape': (1, 1, 32000),
                             'binding_type': 'INPUT',
                             'profile': 0,
                             'profile_shape': [(1, 1, 32000),
                                               (1, 1, 32000),
                                               (1, 1, 32000)]},
               'Binding 1': {'binding_dtype': DataType.FLOAT,
                             'binding_index': 1,
                             'binding_name': 'output',
                             'binding_shape': (1, 1, 3),
                             'binding_type': 'OUTPUT',
                             'profile': 0}}}


In [26]:
# Create an execution context for the engine; it can be re-used across inferences
context = engine.create_execution_context()
# Profile 0 (first profile) is used by default; it is selected explicitly here
context.active_optimization_profile = 0
print("Active Optimization Profile: {}".format(context.active_optimization_profile))

Active Optimization Profile: 0


#### Binding Indices and Shape

In [27]:
def get_binding_idxs(engine, profile_index):
    """Return the input and output binding indices that belong to one profile.

    The engine's bindings are assumed to be split evenly across its
    optimization profiles, so profile ``p`` owns a contiguous index range.

    Args:
        engine: Engine exposing ``num_bindings``, ``num_optimization_profiles``
            and ``binding_is_input``.
        profile_index: Index of the optimization profile of interest.

    Returns:
        Tuple ``(input_binding_idxs, output_binding_idxs)`` of index lists.
    """
    per_profile = engine.num_bindings // engine.num_optimization_profiles
    first = profile_index * per_profile
    stop = first + per_profile

    print("Engine/Binding Metadata")
    print("\tNumber of optimization profiles: {}".format(engine.num_optimization_profiles))
    print("\tNumber of bindings per profile: {}".format(per_profile))
    print("\tFirst binding for profile {}: {}".format(profile_index, first))
    print("\tLast binding for profile {}: {}".format(profile_index, stop - 1))

    # Bucket each index by whether the engine reports it as an input.
    buckets = {True: [], False: []}
    for idx in range(first, stop):
        buckets[bool(engine.binding_is_input(idx))].append(idx)

    return buckets[True], buckets[False]

In [28]:
# These binding_idxs can change if either the context or the
# active_optimization_profile are changed, so recompute them in that case.
input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, context.active_optimization_profile)
# Binding names of the inputs, in the same order as input_binding_idxs.
input_names = [engine.get_binding_name(binding_idx) for binding_idx in input_binding_idxs]

Engine/Binding Metadata
	Number of optimization profiles: 1
	Number of bindings per profile: 2
	First binding for profile 0: 0
	Last binding for profile 0: 1


In [29]:
def is_dynamic(shape):
    """Return True when any dimension of ``shape`` is ``None`` or negative."""
    for dim in shape:
        if dim is None or dim < 0:
            return True
    return False

def get_image_inputs(engine, context, input_binding_idxs,seed=42):
    """Generate seeded random float32 host arrays, one per input binding.

    Args:
        engine: Engine used to look up binding names and profile shapes.
        context: Execution context used to read binding shapes and the active profile.
        input_binding_idxs: Indices of the input bindings to generate data for.
        seed: Seed passed to ``np.random.seed`` before any data is drawn.

    Returns:
        List of ``np.float32`` arrays in the order of ``input_binding_idxs``.
    """
    print("Generating Random Inputs")
    print("\tUsing random seed: {}".format(seed))
    np.random.seed(seed)

    def _inference_shape(binding_index):
        # A fixed shape reported by the context is used as-is.
        shape = context.get_binding_shape(binding_index)
        name = engine.get_binding_name(binding_index)
        print("\tInput [{}] shape: {}".format(name, shape))
        if not is_dynamic(shape):
            return shape

        # For a dynamic shape, pick one of the min/opt/max shapes of the
        # active optimization profile.
        profile_shapes = engine.get_profile_shape(context.active_optimization_profile, binding_index)
        print("\tProfile Shapes for [{}]: [kMIN {} | kOPT {} | kMAX {}]".format(name, *profile_shapes))
        # 0=min, 1=opt, 2=max, or choose any shape, (min <= shape <= max)
        shape = profile_shapes[1]
        print("\tInput [{}] shape was dynamic, setting inference shape to {}".format(name, shape))
        return shape

    return [
        np.random.random(_inference_shape(binding_index)).astype(np.float32)
        for binding_index in input_binding_idxs
    ]


In [30]:
# Generate seeded random float32 inputs; shapes come from the context's binding
# shapes, or from the profile's opt shape when a binding shape is dynamic
host_inputs = get_image_inputs(engine, context, input_binding_idxs)

Generating Random Inputs
	Using random seed: 42
	Input [input] shape: (1, 1, 32000)


In [31]:
# Allocate one device buffer per host input, sized to the host array's byte
# count. These can be re-used as long as the input shapes don't change.
device_inputs = [cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs]

# Copy host inputs to device; this needs to be done for each new input
for h_input, d_input in zip(host_inputs, device_inputs):
    cuda.memcpy_htod(d_input, h_input)

# Summarize the input side of the bindings.
print("Input Metadata")
print("\tNumber of Inputs: {}".format(len(input_binding_idxs)))
print("\tInput Bindings for Profile {}: {}".format(context.active_optimization_profile, input_binding_idxs))
print("\tInput names: {}".format(input_names))
print("\tInput shapes: {}".format([inp.shape for inp in host_inputs]))

Input Metadata
	Number of Inputs: 1
	Input Bindings for Profile 0: [0]
	Input names: ['input']
	Input shapes: [(1, 1, 32000)]


In [32]:
def setup_binding_shapes(engine, context, host_inputs, input_binding_idxs,
                  output_binding_idxs, has_input_shape_changed=False):
    """Set the input binding shapes on ``context`` and allocate output buffers.

    Args:
        engine: Not used by this function.
        context: Execution context whose binding shapes are set and queried.
        host_inputs: Host arrays; each array's ``.shape`` is applied to the
            matching entry of ``input_binding_idxs``.
        input_binding_idxs: Binding indices of the inputs.
        output_binding_idxs: Binding indices of the outputs.
        has_input_shape_changed: Not used by this function.

    Returns:
        Tuple ``(host_outputs, device_outputs)``: uninitialized float32 host
        arrays and device allocations of matching byte size, one per output.
    """
    # Setting the input shapes explicitly lets the output shapes be computed
    # internally by the context.
    for binding_index, host_input in zip(input_binding_idxs, host_inputs):
        context.set_binding_shape(binding_index, host_input.shape)

    assert(context.all_binding_shapes_specified)

    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        # NOTE(review): the host buffer dtype is hard-coded to float32 rather
        # than derived from the binding's dtype.
        host_buffer = np.empty(context.get_binding_shape(binding_index), dtype=np.float32)
        host_outputs.append(host_buffer)
        # Device buffer with the same byte size as the host buffer.
        device_outputs.append(cuda.mem_alloc(host_buffer.nbytes))

    return host_outputs, device_outputs

In [33]:
# This needs to be called every time your input shapes change.
# If your inputs are always the same shape (same batch size, etc.),
# then you will only need to call this once
host_outputs, device_outputs = setup_binding_shapes(
    engine, context, host_inputs, input_binding_idxs, output_binding_idxs,
)
# Binding names of the outputs, in the same order as output_binding_idxs.
output_names = [engine.get_binding_name(binding_idx) for binding_idx in output_binding_idxs]

# Summarize the output side of the bindings.
print("Output Metadata")
print("\tNumber of Outputs: {}".format(len(output_binding_idxs)))
print("\tOutput names: {}".format(output_names))
print("\tOutput shapes: {}".format([out.shape for out in host_outputs]))
print("\tOutput Bindings for Profile {}: {}".format(context.active_optimization_profile, output_binding_idxs))

Output Metadata
	Number of Outputs: 1
	Output names: ['output']
	Output shapes: [(1, 1, 3)]
	Output Bindings for Profile 0: [1]


### Run TRT inference!

In [34]:
# Bindings are a list of device pointers: inputs first, then outputs
bindings = device_inputs + device_outputs

# Run inference; no batch size argument is passed to execute_v2
context.execute_v2(bindings)

# Copy outputs back to host to view results
for h_output, d_output in zip(host_outputs, device_outputs):
    cuda.memcpy_dtoh(h_output, d_output)

# View the shape of the first output
print("Inference Outputs Shape:", host_outputs[0].shape)

Inference Outputs Shape: (1, 1, 3)


In [35]:
def get_trt_inference_outputs(context,host_inputs,host_outputs,device_inputs, device_outputs):
    """Copy inputs to the device, run the engine, and copy outputs back.

    Args:
        context: Execution context providing ``execute_v2``.
        host_inputs: A single numpy array, or a list/tuple of numpy arrays,
            paired positionally with ``device_inputs``.
        host_outputs: Pre-allocated host arrays that receive the results.
        device_inputs: List of device buffers for the inputs.
        device_outputs: List of device buffers for the outputs.

    Returns:
        ``host_outputs``, filled in place with the copied-back results.

    Raises:
        ValueError: If ``host_inputs`` is neither a numpy array nor a
            list/tuple of numpy arrays.
    """
    # A bare ndarray is shorthand for a single-input list. The previous check
    # rejected real lists, even though the error message asks for one.
    if isinstance(host_inputs, np.ndarray):
        host_inputs = [host_inputs]
    elif not isinstance(host_inputs, (list, tuple)) or not all(
            isinstance(h_input, np.ndarray) for h_input in host_inputs):
        raise ValueError('host inputs must be list of numpy-nd arrays')

    for h_input, d_input in zip(host_inputs, device_inputs): # Copy new inputs
        cuda.memcpy_htod(d_input, h_input)

    # Bindings are ordered inputs first, then outputs.
    bindings = device_inputs + device_outputs
    context.execute_v2(bindings)

    for h_output, d_output in zip(host_outputs, device_outputs):
        cuda.memcpy_dtoh(h_output, d_output)  # Get outputs from device

    return host_outputs


In [46]:
def trt_prediction(outputs):
    """Map inference outputs to a label from ``shirnked_labels``.

    Takes ``outputs[0][0]``, applies arg-max along its last axis, and looks up
    the first resulting index in ``shirnked_labels``.
    """
    scores = outputs[0][0]
    best = np.argmax(scores, axis=-1)
    return shirnked_labels[best[0]]

In [40]:
# Run the engine on the covid.npy sample and show the predicted label.
result_trt_fp32 = get_trt_inference_outputs(context,covid_np,host_outputs,device_inputs,device_outputs)
trt_prediction(result_trt_fp32)


'covid_positive'

In [41]:
# Run the engine on the healthy.npy sample and show the predicted label.
result_trt_fp32 = get_trt_inference_outputs(context,healthy_np,host_outputs,device_inputs,device_outputs)
trt_prediction(result_trt_fp32)

'healthy'

In [42]:
# Drop the names bound to the first context and engine; the FP16 section
# below rebinds both.
del context
del engine

### FP16 inference

In [43]:
# Same trtexec build as the earlier engine, with --fp16 added and the result
# saved to ./saved_model/covid_fp16.engine.
cmd = 'trtexec --explicitBatch --onnx=./saved_model/relabelled_covid.onnx --minShapes=input:1x1x32000 --optShapes=input:1x1x32000 --maxShapes=input:1x1x32000  --shapes=input:1x1x32000 --saveEngine=./saved_model/covid_fp16.engine --fp16'
print(cmd)
# The value returned by os.system is displayed as the cell output.
os.system(cmd)

trtexec --explicitBatch --onnx=./saved_model/relabelled_covid.onnx --minShapes=input:1x1x32000 --optShapes=input:1x1x32000 --maxShapes=input:1x1x32000  --shapes=input:1x1x32000 --saveEngine=./saved_model/covid_fp16.engine --fp16


0

In [44]:
# List the engine files under ./saved_model with human-readable sizes.
!ls -lh ./saved_model/*.engine

-rw-r--r-- 1 root root 2.3M Nov  4 08:15 ./saved_model/covid.engine
-rw-r--r-- 1 root root 1.2M Nov  4 08:22 ./saved_model/covid_fp16.engine


In [45]:
# Load and inspect the FP16 engine.
engine = load_engine('./saved_model/covid_fp16.engine')
inspect_engine(engine)

# New execution context for this engine, using the first optimization profile.
context = engine.create_execution_context()
context.active_optimization_profile = 0

input_binding_idxs, output_binding_idxs = get_binding_idxs(engine, context.active_optimization_profile)

# Random host inputs are generated only to size the device input buffers.
host_inputs = get_image_inputs(engine, context, input_binding_idxs)
device_inputs = [cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs]

for h_input, d_input in zip(host_inputs, device_inputs):
    cuda.memcpy_htod(d_input, h_input)

# Set the input binding shapes and allocate host/device output buffers sized
# from the context's output binding shapes
host_outputs, device_outputs = setup_binding_shapes(engine, context, host_inputs,
                                                    input_binding_idxs, output_binding_idxs)
bindings = device_inputs + device_outputs

{'Profile 0': {'Binding 0': {'binding_dtype': DataType.FLOAT,
                             'binding_index': 0,
                             'binding_name': 'input',
                             'binding_shape': (1, 1, 32000),
                             'binding_type': 'INPUT',
                             'profile': 0,
                             'profile_shape': [(1, 1, 32000),
                                               (1, 1, 32000),
                                               (1, 1, 32000)]},
               'Binding 1': {'binding_dtype': DataType.FLOAT,
                             'binding_index': 1,
                             'binding_name': 'output',
                             'binding_shape': (1, 1, 3),
                             'binding_type': 'OUTPUT',
                             'profile': 0}}}
Engine/Binding Metadata
	Number of optimization profiles: 1
	Number of bindings per profile: 2
	First binding for profile 0: 0
	Last binding for profile 0: 1
Generati

In [48]:
# Run the FP16 engine on the healthy.npy sample and show the predicted label.
result_trt_fp16 = get_trt_inference_outputs(context,healthy_np,host_outputs,device_inputs,device_outputs)
trt_prediction(result_trt_fp16)

'healthy'

In [49]:
# Run the FP16 engine on the covid.npy sample and show the predicted label.
result_trt_fp16 = get_trt_inference_outputs(context,covid_np,host_outputs,device_inputs,device_outputs)
trt_prediction(result_trt_fp16)

'covid_positive'

### Profiling

`nsys profile -y 0 -w true -t cudnn,cuda,osrt,nvtx -o Report.qdrep python run_inference.py`