# TensorRT Python Workflow

## Part I. Model preparation

This part does not depend on the TensorFlow version.

Load the dependencies.

In [None]:
# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
print("Tensorflow version: ", tf.version.VERSION)

# remember the tf version for further use
TF_VER = tf.version.VERSION.split(".")[0]

# Other supporting libraries
import os
import datetime
import matplotlib.pyplot as plt
import numpy as np
import time

### Loading dataset

In this example, we will be working with the <a href="https://research.zalando.com/welcome/mission/research-projects/fashion-mnist/">Fashion-MNIST</a> dataset.

In [None]:
# Human-readable label names. The position in the list is the integer class id:
# predictions are mapped back to names via class_names[<argmax index>].
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Number of target classes; used for the one-hot encoding and the size of the output layer.
num_classes = len(class_names)


def load_data():
    """Load Fashion-MNIST and return it in the layout used by the model.

    Returns:
        Tuple ``(train_images, train_labels, test_images, test_labels)``.
        Images are float32 arrays scaled by 1/255 with the channel axis at
        position 1; labels are one-hot encoded with ``num_classes`` columns.
    """
    def _prepare_images(raw_images):
        # Append a trailing channel axis, convert to float32 and scale by 1/255.
        scaled = raw_images[:, :, :, np.newaxis].astype('float32') / 255
        # Move the channel axis to position 1 (channels-first), matching the
        # data_format of the model's first convolution.
        return np.rollaxis(scaled, 3, 1)

    (raw_train_x, raw_train_y), (raw_test_x, raw_test_y) = keras.datasets.fashion_mnist.load_data()

    # Class vectors become binary class matrices.
    return (_prepare_images(raw_train_x),
            to_categorical(raw_train_y, num_classes),
            _prepare_images(raw_test_x),
            to_categorical(raw_test_y, num_classes))

# Load data
train_images,train_labels, test_images, test_labels = load_data()

print(train_images.shape)
print(train_labels.shape)

### Model definition

TensorRT supported operations: [https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html](https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html)

In [None]:
# LeNet-5 style model
class LeNet(keras.Sequential):
    """LeNet-5 style CNN for channels-first input (channels, height, width).

    Args:
        input_shape: Shape of a single sample without the batch dimension.
        nb_classes: Number of output classes (size of the softmax layer).

    The model is compiled in the constructor with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.
    """
    def __init__(self, input_shape, nb_classes):
        super().__init__()

        # All spatial layers must agree on the layout. Leaving the pooling layers
        # and the second convolution at the Keras default would make them treat
        # the channel axis as a spatial axis (and vice versa).
        data_format = 'channels_first'

        self.add(keras.layers.InputLayer(input_shape=input_shape, name="InputLayer"))
        self.add(keras.layers.Conv2D(filters=32, kernel_size=(5,5), padding='same', activation=tf.nn.tanh, data_format=data_format))
        self.add(keras.layers.MaxPool2D(strides=2, data_format=data_format))
        self.add(keras.layers.Conv2D(filters=48, kernel_size=(5,5), padding='valid', activation=tf.nn.tanh, data_format=data_format))
        self.add(keras.layers.MaxPool2D(strides=2, data_format=data_format))

        self.add(keras.layers.Flatten(name="flatten"))
        self.add(keras.layers.Dense(120, activation=tf.nn.tanh))
        self.add(keras.layers.Dense(84, activation=tf.nn.tanh))
        self.add(keras.layers.Dense(nb_classes))
        # The layer name is referenced later as the graph output ("OutputLayer/Softmax").
        self.add(keras.layers.Activation(activation=tf.nn.softmax, name="OutputLayer"))

        self.compile(optimizer='adam',
                    loss=tf.keras.losses.categorical_crossentropy,
                    metrics=['accuracy'])

Initialize the model.

In [None]:
model = LeNet(train_images[0].shape, num_classes)
model.summary()

### Model training

In [None]:
model.fit(x=train_images, y=train_labels, 
            epochs=5, 
            validation_data=(test_images, test_labels), 
            verbose=1)
    
# Evaluate the model on test data
test_loss, test_acc = model.evaluate(test_images, test_labels)
print("Test loss: {}\nTest accuracy: {}".format(test_loss, test_acc))

### Saving the model

Prepare the output directory.

In [None]:
def maybe_mkdir(dir_path):
    """Create ``dir_path`` (including missing parents) unless it already exists.

    Args:
        dir_path: Directory to create.
    """
    # exist_ok=True avoids the race between a separate existence check and makedirs.
    os.makedirs(dir_path, exist_ok=True)
        
MODEL_DIR = "./models"  

# Make directory to save model in if it doesn't exist already
maybe_mkdir(MODEL_DIR)

Depending on the TensorFlow version, save the model as a frozen graph for TF 1.x, or as a SavedModel (plus a frozen graph) for TF 2.x.

In [None]:
def save_model_tf1(model, model_path):
    """Freeze a TF 1.x Keras model and write it to ``model_path`` as a serialized graph.

    Args:
        model: Keras model whose output op names the graph output.
        model_path: Destination file for the serialized frozen graph.
    """
    output_node = model.output.op.name
    session = tf.keras.backend.get_session()

    # Turn variables into constants for the sub-graph feeding the output node,
    # then strip the nodes that are only needed for training.
    constant_graph = tf.graph_util.convert_variables_to_constants(
        session, session.graph.as_graph_def(), [output_node])
    inference_graph = tf.graph_util.remove_training_nodes(constant_graph)

    with open(model_path, "wb") as graph_file:
        graph_file.write(inference_graph.SerializeToString())

        
def saved_frozen_graph_tf2(model, logdir, name, verbose=True):
    """Freeze a TF 2.x Keras model and write it to disk in binary and text form.

    Args:
        model: Keras model to freeze.
        logdir: Directory the graph files are written to.
        name: File name of the binary frozen graph. The text version is written
            next to it under the same base name with a ``.pbtxt`` extension.
        verbose: When true, print the frozen graph's operations, inputs and outputs.
    """
    from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2

    # Convert Keras model to ConcreteFunction
    full_model = tf.function(lambda x: model(x))
    full_model = full_model.get_concrete_function(
        tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype, name="InputLayer"))

    # Get frozen ConcreteFunction
    frozen_func = convert_variables_to_constants_v2(full_model)

    if verbose:
        print("-" * 50)
        print("Frozen model layers: ")
        for op in frozen_func.graph.get_operations():
            print(op.name)

        print("-" * 50)
        print("Frozen model inputs: ")
        print(frozen_func.inputs)
        print("Frozen model outputs: ")
        print(frozen_func.outputs)

    # Save frozen graph from frozen ConcreteFunction to hard drive
    tf.io.write_graph(graph_or_graph_def=frozen_func.graph,
                        logdir=logdir,
                        name=name,
                        as_text=False)
    # Same graph as text; the file name follows `name` instead of being hard-coded.
    text_name = os.path.splitext(name)[0] + ".pbtxt"
    tf.io.write_graph(graph_or_graph_def=frozen_func.graph,
                        logdir=logdir,
                        name=text_name,
                        as_text=True)
        
        
def save_model_tf2(model, model_path):
    """Save ``model`` to ``model_path`` by delegating to ``model.save``."""
    model.save(model_path)

    
# Stays empty on TF 1.x, where only a frozen graph is written.
OUT_MODEL_PATH = ''

if (TF_VER=='1'):
    # TF 1.x: freeze the session graph into a single .pb file.
    FROZEN_GRAPH_PATH_TF1 = os.path.join(MODEL_DIR, "frozen_graph.pb")
    save_model_tf1(model, FROZEN_GRAPH_PATH_TF1)
else:
    # TF 2.x: write a SavedModel and, additionally, a frozen graph
    # (binary plus text) into MODEL_DIR.
    OUT_MODEL_PATH = os.path.join(MODEL_DIR, 'saved_model')
    save_model_tf2(model, OUT_MODEL_PATH)
    name = "frozen_graph_tf2.pb"
    FROZEN_GRAPH_PATH_TF2 = os.path.join(MODEL_DIR, name)
    saved_frozen_graph_tf2(model, MODEL_DIR, name)

### Performing inference

![Simple case](img/deploy-trained-model.png)

Setting the batch size for the whole notebook

In [None]:
BATCH_SIZE = 16

Visualizing some results

In [None]:
def visualize():
    """Plot the first 50 test images in a 5x10 grid, labelled with the predicted class.

    Reads the notebook globals ``test_images``, ``test_labels``,
    ``prediction_values`` and ``class_names``. A label is drawn in blue when the
    prediction matches the ground truth and in red otherwise.
    """
    n_rows, n_cols = 5, 10

    # set up the figure
    fig = plt.figure(figsize=(15, 7))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

    for img_idx in range(n_rows * n_cols):
        ax = fig.add_subplot(n_rows, n_cols, img_idx + 1, xticks=[], yticks=[])
        # each image is 28x28 pixels
        ax.imshow(test_images[img_idx,:].reshape((28,28)), cmap=plt.cm.gray_r, interpolation='nearest')

        predicted = prediction_values[img_idx]
        is_correct = predicted == np.argmax(test_labels[img_idx])
        ax.text(0, 7, class_names[predicted], color='blue' if is_correct else 'red')
            
            
inf = model.predict(x=test_images)
prediction_values = np.argmax(inf, axis=-1)
visualize()            

Let's measure the inference time.

In [None]:
# Pick a random batch by randomly selecting the index of the first image in the batch
image_index = int(np.random.randint(0, test_images.shape[0] - BATCH_SIZE, size=1)[0])
random_batch = test_images[image_index:(image_index + BATCH_SIZE)]

print("Randomly selected index:", image_index)

Visualize the randomly selected batch.

In [None]:
import math

def visualize_batch(rnd_idx, batch_size, predictions = False):
    """Plot ``batch_size`` consecutive test images starting at ``rnd_idx``.

    Args:
        rnd_idx: Index of the first test image to show.
        batch_size: Number of images to show (10 per row).
        predictions: ``False`` to label each image with its ground-truth class
            in green; otherwise a sequence of predicted class names, one per
            image, drawn in blue when correct and red when wrong.

    Reads the notebook globals ``test_images``, ``test_labels`` and ``class_names``.
    """
    n_cols = 10
    n_rows = math.ceil(batch_size / 10)

    # set up the figure
    fig = plt.figure(figsize=(15, n_rows * 1.5))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

    for slot, img_idx in enumerate(range(rnd_idx, rnd_idx + batch_size)):
        ax = fig.add_subplot(n_rows, n_cols, slot + 1, xticks=[], yticks=[])
        # each image is 28x28 pixels
        ax.imshow(test_images[img_idx,:].reshape((28,28)), cmap=plt.cm.gray_r, interpolation='nearest')

        # Ground-truth name: position of the first non-zero entry of the one-hot label.
        true_name = class_names[np.nonzero(test_labels[img_idx])[0][0]]
        if predictions is False:
            caption, colour = true_name, 'green'
        elif predictions[slot] == true_name:
            caption, colour = predictions[slot], 'blue'
        else:
            caption, colour = predictions[slot], 'red'
        ax.text(0, 7, caption, color=colour)
        
            
        
visualize_batch(image_index, BATCH_SIZE)

Run inference multiple times to estimate average time.

In [None]:
# Warm-up runs are excluded from the measurement.
warmup_iter = 50
for i in range(warmup_iter):
    prediction = model.predict(x=random_batch)

loop_count = 200
# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock and may jump.
start_time = time.perf_counter()
for i in range(loop_count):
    prediction = model.predict(x=random_batch)
elapsed_ms = (time.perf_counter() - start_time) * 1000

print("Unoptimized inference time: %s ms in average" %(elapsed_ms / loop_count))

# Compute the argmax once for the whole batch instead of once per element.
predicted_labels = [class_names[class_id] for class_id in np.argmax(prediction, axis=-1)]

visualize_batch(image_index, BATCH_SIZE, predicted_labels)

Remember the random image index, the batch size and the version of TensorFlow used to save the frozen graph for future comparison.

In [None]:
%%capture cap --no-stderr
print(image_index)
print(BATCH_SIZE)
print(TF_VER)

In [None]:
# Persist the output captured by the previous cell (image index, batch size and
# TF version, one per line) so it can be read back later.
TMP_NUMBER_PATH = os.path.join(MODEL_DIR, "rnd_idx.txt")
with open(TMP_NUMBER_PATH, 'w') as f:
    f.write(cap.stdout)

## Part II. TensorRT optimized inference with ONNX format

### Converting the model to ONNX format

This part does not depend on the TensorFlow version.

In [None]:
# install the onnx converters from the source
!pip install --quiet -U git+https://github.com/microsoft/onnxconverter-common
!pip install --quiet -U git+https://github.com/onnx/keras-onnx

Load dependencies.

In [None]:
import keras2onnx
print("keras2onnx version is "+keras2onnx.__version__)

Convert the model to ONNX and save it.

In [None]:
onnx_model = keras2onnx.convert_keras(model, 'fashion-mnist-onnx', debug_mode=1)

OUT_ONNX_MODEL = MODEL_DIR + "/model.onnx"
keras2onnx.save_model(onnx_model, OUT_ONNX_MODEL)

Note: if you are not using Keras, but plain TensorFlow, you can use the [tensorflow-onnx](https://github.com/onnx/tensorflow-onnx) for the conversion. If you are training in PyTorch you can use the [torch.onnx](https://pytorch.org/docs/master/onnx.html).

### CPU execution

Prepare ONNX runtime.

In [None]:
!pip install --quiet -U onnxruntime

In [None]:
import onnxruntime

# Session for the exported ONNX model, created with default session options.
sess_options = onnxruntime.SessionOptions()
sess = onnxruntime.InferenceSession(OUT_ONNX_MODEL, sess_options)

# One array per model input, in the order reported by the session.
data = [random_batch.astype(np.float32)]
input_names = sess.get_inputs()
feed = {node.name: data[pos] for pos, node in enumerate(sess.get_inputs())}

Let's measure the inference time. Compare this execution time with the unoptimized inference. Execute this cell several times to exclude the GPU 'warming up' effect.

In [None]:
# Warm-up runs are excluded from the measurement; they perform the same
# per-sample argmax as the measured loop.
warmup_iter = 50
for i in range(warmup_iter):
    onnx_predicted_label = sess.run(None, feed)[0].argmax(axis=1)

loop_count = 200

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock and may jump.
start_time = time.perf_counter()
for i in range(loop_count):
    onnx_predicted_label = sess.run(None, feed)[0].argmax(axis=1)
elapsed_ms = (time.perf_counter() - start_time) * 1000
print("ONNX inference time: %s ms in average" %(elapsed_ms / loop_count))

predicted_labels = [class_names[class_id] for class_id in onnx_predicted_label]

visualize_batch(image_index, BATCH_SIZE, predicted_labels)

In [None]:
sess.get_providers()

### GPU execution 

At the time of writing, the default GPU build (version 1.5.3) requires the CUDA runtime libraries to be installed on the system:
Version: CUDA 10.2 and cuDNN 8.0.3. If you have a newer installation, it won't work.

Just for the reference, to use the GPU accelerated onnxruntime, install it as follows 

```
!pip uninstall --quiet onnxruntime -y
!pip install --quiet -U onnxruntime-gpu
```

That will lead to `sess.get_providers()` showing further options, like `'TensorrtExecutionProvider'`. By default the execution context will be switched to the GPU.

### Creating the TensorRT engine from ONNX

If you skipped the previous part, install ONNX.

In [None]:
!pip install onnx

Again, if you are starting just from this part, execute the following cells to load the data and dependencies.

In [None]:
from IPython.display import display_html
def restartkernel() :
    """Ask the notebook front end to restart the kernel by emitting a small <script> snippet."""
    # Relies on the global `Jupyter` JavaScript object being available in the page.
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
    
restartkernel()

In [None]:
%run kernel_reload.py

In [None]:
import tensorrt

TRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
trt_runtime = tensorrt.Runtime(TRT_LOGGER)

# ONNX parser only supports networks with an explicit batch dimension
explicit_batch = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine(onnx_path, shape):
    """
    Build a TensorRT engine from an ONNX file.

    Args:
       onnx_path : Path to the ONNX file.
       shape : Shape assigned to the first input of the parsed network.

    Returns:
       The engine returned by ``builder.build_cuda_engine``.

    Raises:
       RuntimeError : If the ONNX parser rejects the model.
    """
    with tensorrt.Builder(TRT_LOGGER) as builder, builder.create_network(explicit_batch) as network, tensorrt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 8000000000
        with open(onnx_path, 'rb') as onnx_file:
            parsed = parser.parse(onnx_file.read())
        if not parsed:
            # Surface the parser diagnostics instead of building from an incomplete network.
            errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError("Failed to parse ONNX file {}:\n{}".format(onnx_path, "\n".join(errors)))
        # Fix the input shape (including the batch dimension) before building.
        network.get_input(0).shape = shape
        return builder.build_cuda_engine(network)
    
    
def load_engine(trt_runtime, plan_path):
    """Deserialize a TensorRT engine from a serialized plan file.

    Args:
        trt_runtime: Object providing ``deserialize_cuda_engine(bytes)``.
        plan_path: Path of the serialized engine file.

    Returns:
        Whatever ``trt_runtime.deserialize_cuda_engine`` returns for the file content.
    """
    # Read from the `plan_path` argument; the previous body referenced an
    # undefined `engine_path` name and raised NameError.
    with open(plan_path, 'rb') as plan_file:
        engine_data = plan_file.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)

In [None]:
OUT_ONNX_MODEL = MODEL_DIR + "/model.onnx"

In [None]:
import argparse
from onnx import ModelProto
 
ONNX_TRT_ENGINE_PATH = MODEL_DIR + "/trt_onnx_out.plan"
onnx_path = OUT_ONNX_MODEL

# Parse the ONNX file only to discover the input dimensions. A dedicated name is
# used so the Keras `model` variable is not overwritten, and the file is closed
# before the engine build starts.
onnx_proto = ModelProto()
with open(onnx_path, "rb") as onnx_file:
    onnx_proto.ParseFromString(onnx_file.read())

# Dimension 0 of the stored input shape is replaced by BATCH_SIZE; the
# remaining three dimensions are taken from the file.
input_dims = onnx_proto.graph.input[0].type.tensor_type.shape.dim
d0 = input_dims[1].dim_value
d1 = input_dims[2].dim_value
d2 = input_dims[3].dim_value
shape = [BATCH_SIZE , d0, d1 ,d2]

engine = build_engine(onnx_path, shape = shape)
if engine is None:
    # Fail with a clear message rather than an AttributeError on `with None`.
    raise RuntimeError("TensorRT did not return an engine for {}".format(onnx_path))

with engine:
    # save to file
    serialized_engine = engine.serialize()

    with open(ONNX_TRT_ENGINE_PATH, "wb") as plan_file:
        plan_file.write(serialized_engine)

### Running inference from the TensorRT engine

In [None]:
import pycuda.driver as cuda
import numpy as np
import pycuda.autoinit 


# Small record type pairing a host buffer with its device counterpart.
class HostDeviceMem(object):
    """Holds a host buffer (``host``) together with the matching device buffer (``device``)."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Two labelled sections, one per buffer.
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        return self.__str__()


# Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk)
# to hold host inputs/outputs
def allocate_buffers(engine):
    """Allocate one host/device buffer pair per engine binding.

    Args:
        engine: TensorRT engine whose bindings are iterated.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: lists of ``HostDeviceMem``
        for input and output bindings, the device addresses of all bindings in
        iteration order, and a newly created CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        element_count = tensorrt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = tensorrt.nptype(engine.get_binding_dtype(binding))
        # Host buffer first, then a device buffer of the same byte size.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # The bindings list carries the device address of every binding.
        bindings.append(int(device_mem))
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


# This function is generalized for multiple inputs/outputs for full dimension networks.
# (for TensorRT 7.x or higher)
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run one asynchronous inference and return the host output buffers.

    Args:
        context: Execution context providing ``execute_async_v2``.
        bindings: Device addresses passed to the execution call.
        inputs: ``HostDeviceMem`` objects copied host -> device before execution.
        outputs: ``HostDeviceMem`` objects copied device -> host after execution.
        stream: CUDA stream on which copies and execution are enqueued.

    Returns:
        List with the ``host`` buffer of every output.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait until everything enqueued on the stream has finished.
    stream.synchronize()
    return [out.host for out in outputs]


def load_normalized_test_case(case_num, pagelocked_buffer, batch=1):
    """Copy test image(s) into ``pagelocked_buffer`` and return the matching label(s).

    Args:
        case_num: Index of the (first) test image.
        pagelocked_buffer: Destination buffer receiving the flattened image data.
        batch: Number of consecutive images to copy. Values greater than 1 copy
            ``batch`` images; otherwise a single image is copied.

    Returns:
        The one-hot label of the image, or the slice of labels for the batch.
    """
    _, _, x_test, y_test = load_data()

    # Use the `batch` argument for the slice length; it used to act only as a
    # flag while the global BATCH_SIZE determined the size.
    selection = slice(case_num, case_num + batch) if batch > 1 else case_num
    np.copyto(pagelocked_buffer, x_test[selection].ravel())
    return y_test[selection]


def run_trt_inference(context, loop_count = 200, warmup=50):
    """Time TensorRT inference on the stored batch and plot the predictions.

    Args:
        context: Execution context handed to ``do_inference_v2``.
        loop_count: Number of timed inference runs.
        warmup: Number of untimed runs executed first.

    Reads the notebook globals ``inputs``, ``outputs``, ``bindings``, ``stream``,
    ``BATCH_SIZE``, ``class_names`` and ``idx`` (``idx`` is not defined in this
    notebook; presumably provided by kernel_reload.py - verify).
    """
    # Fill the input buffer with the batch starting at idx; the returned labels
    # are not needed here because visualize_batch looks them up itself.
    load_normalized_test_case(idx, inputs[0].host, BATCH_SIZE)

    # The do_inference function returns a list of outputs - we only have one in this case.
    for _ in range(warmup):
        [pred] = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

    # perf_counter is a monotonic, high-resolution clock intended for short durations.
    start_time = time.perf_counter()
    for _ in range(loop_count):
        [pred] = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    elapsed_ms = (time.perf_counter() - start_time) * 1000

    print("TRT inference time: %s ms in average" %(elapsed_ms / loop_count))

    # The output arrives flattened; restore (batch, classes) before the argmax.
    pred_resh = np.reshape(pred, (BATCH_SIZE, len(class_names)))

    predicted_labels = [class_names[class_id] for class_id in np.argmax(pred_resh, axis=-1)]
    visualize_batch(idx, BATCH_SIZE, predicted_labels)

Load the generated engine and execute inference.

In [None]:
# Deserialize the engine written earlier. The runtime, the engine and the
# execution context are all used as context managers.
with open(ONNX_TRT_ENGINE_PATH, "rb") as f, tensorrt.Runtime(TRT_LOGGER) as trt_runtime:
     with trt_runtime.deserialize_cuda_engine(f.read()) as loaded_engine:
        # allocate buffers (stored in notebook globals that run_trt_inference reads)
        inputs, outputs, bindings, stream = allocate_buffers(loaded_engine)

        # create execution context
        with loaded_engine.create_execution_context() as context:
            print("\n=== Testing ===")
            run_trt_inference(context)

## Part III. TF-TRT in TensorFlow 2.x (SavedModel) 

![saved model](img/saved-model.png)

You shall be in the TensorFlow 2.x environment to perform this part.

In [None]:
from IPython.display import display_html
def restartkernel() :
    """Ask the notebook front end to restart the kernel by emitting a small <script> snippet."""
    # Relies on the global `Jupyter` JavaScript object being available in the page.
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
    
restartkernel()

In [None]:
%run kernel_reload.py

In [None]:
if TF_VER == '1':
    print("This part of the tutorial is designed for the TensorFlow 2.x. Please switch the environment.")

from tensorflow.python.compiler.tensorrt import trt_convert as trt 

Check the contents of the saved model directory.

In [None]:
!saved_model_cli show --all --dir $OUT_MODEL_PATH

Convert the model to TRT.

In [None]:
TRT_OUTPUT_PATH = os.path.join(MODEL_DIR, "optimized_model")

# Here we overwrite the default conversion parameters to suit our needs.
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP32,
    #minimum_segment_size = 3,
    #max_batch_size=1,
    max_workspace_size_bytes=8000000000
)

converter = trt.TrtGraphConverterV2(input_saved_model_dir=OUT_MODEL_PATH, conversion_params=conversion_params)
converter.convert()
converter.save(TRT_OUTPUT_PATH)

Test the optimized model.

In [None]:
def run_trt_test(loaded_model, loop_count = 200, warmup_inter = 50):
    """Time inference of a loaded TF-TRT model on ``random_batch`` and plot the predictions.

    Args:
        loaded_model: Callable model; it is invoked with ``tf.constant(random_batch)``.
        loop_count: Number of timed inference runs.
        warmup_inter: Number of untimed runs executed first.

    Reads the notebook globals ``random_batch``, ``idx``, ``BATCH_SIZE`` and ``class_names``.
    """
    for _ in range(warmup_inter):
        prediction = loaded_model(tf.constant(random_batch))

    # perf_counter is a monotonic, high-resolution clock intended for short durations.
    start_time = time.perf_counter()
    for _ in range(loop_count):
        prediction = loaded_model(tf.constant(random_batch))
    elapsed_ms = (time.perf_counter() - start_time) * 1000

    print("TF-TRT inferences with %s ms in average" %(elapsed_ms / loop_count))

    # Compute the argmax once for the whole batch instead of once per element.
    predicted_labels = [class_names[class_id] for class_id in np.argmax(prediction, axis=-1)]

    visualize_batch(idx, BATCH_SIZE, predicted_labels)
    
    
random_batch = test_images[idx:(idx + BATCH_SIZE)]  
loaded = tf.saved_model.load(TRT_OUTPUT_PATH)
run_trt_test(loaded)

### TF-TRT FP16

In [None]:
from IPython.display import display_html
def restartkernel() :
    """Ask the notebook front end to restart the kernel by emitting a small <script> snippet."""
    # Relies on the global `Jupyter` JavaScript object being available in the page.
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
    
restartkernel()

In [None]:
%run kernel_reload.py

In [None]:
print('Converting to TF-TRT FP16...')
TRT_OUTPUT_PATH_TFTRT_FP16 = os.path.join(MODEL_DIR, "TFTRT_FP16")

conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.FP16,
    #minimum_segment_size = 3,
    #max_batch_size=1,
    maximum_cached_engines=8,
    max_workspace_size_bytes=8000000000)
converter = trt.TrtGraphConverterV2(
   input_saved_model_dir=OUT_MODEL_PATH, conversion_params=conversion_params)
converter.convert()
converter.save(output_saved_model_dir=TRT_OUTPUT_PATH_TFTRT_FP16)

print('Done Converting to TF-TRT FP16')

Execute inference.

In [None]:
loaded = tf.saved_model.load(TRT_OUTPUT_PATH_TFTRT_FP16)

In [None]:
run_trt_test(loaded)

### TF-TRT INT8

In [None]:
from IPython.display import display_html
def restartkernel() :
    """Ask the notebook front end to restart the kernel by emitting a small <script> snippet."""
    # Relies on the global `Jupyter` JavaScript object being available in the page.
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
    
restartkernel()

In [None]:
%run kernel_reload.py

In [None]:
batched_input = tf.constant(test_images)
print('batched_input shape: ', batched_input.shape)

In [None]:
print('Converting to TF-TRT INT8...')
TRT_OUTPUT_PATH_TFTRT_INT8 = os.path.join(MODEL_DIR, "TFTRT_INT8")

conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    precision_mode=trt.TrtPrecisionMode.INT8, 
    #minimum_segment_size = 3,
    #max_batch_size=1,
    max_workspace_size_bytes=8000000000, 
    use_calibration=True)

converter = trt.TrtGraphConverterV2(
    input_saved_model_dir=OUT_MODEL_PATH, 
    conversion_params=conversion_params)

def calibration_input_fn():
    """Generator passed to ``converter.convert`` as the INT8 calibration input."""
    # Yields a single item: a one-element tuple holding `batched_input`
    # (the complete test set as one tensor).
    yield (batched_input, )
converter.convert(calibration_input_fn=calibration_input_fn)

converter.save(output_saved_model_dir=TRT_OUTPUT_PATH_TFTRT_INT8)
print('Done Converting to TF-TRT INT8')

In [None]:
loaded = tf.saved_model.load(TRT_OUTPUT_PATH_TFTRT_INT8)

In [None]:
run_trt_test(loaded)

### Experimenting with different segment size values 

In [None]:
from IPython.display import display_html
def restartkernel() :
    """Ask the notebook front end to restart the kernel by emitting a small <script> snippet."""
    # Relies on the global `Jupyter` JavaScript object being available in the page.
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
    
restartkernel()

In [None]:
%run kernel_reload.py

Let's check the default parameters of the converter.

In [None]:
trt.DEFAULT_TRT_CONVERSION_PARAMS

<b>Exercise:</b> in your example, try varying the values of `minimum_segment_size` and `max_batch_size`.

### If you have not noticed any difference in performance after applying quantization

The first reason might be that the model is too small to see any difference (like in the given example). 

Furthermore, not all hardware can support quantization. Refer to this table to check, if your GPU is capable of this operation: [https://docs.nvidia.com/deeplearning/sdk/tensorrt-support-matrix/index.html#hardware-precision-matrix](https://docs.nvidia.com/deeplearning/sdk/tensorrt-support-matrix/index.html#hardware-precision-matrix)

And finally, not every layer supports different precisions. Check the list of supported precision modes per TensorRT layer here: [https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#layers-precision-matrix](https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#layers-precision-matrix)

## Part IV. Converting to UFF in TensorFlow 1.x and exporting the TRT engine (optional part)

In TensorFlow 2.x the UFF format has been deprecated, however, it's still being widely used with TensorRT, especially when it comes to custom layer plugins. 

In order to perform the following part of this tutorial, switch your environment to the one with an older TensorFlow version. Refer to the README.md to learn how to open the jupyter notebook in a different environment.

In the newly opened notebook, open the same `trt_python_workflow.ipynb` and continue from this place. You may also keep this document for the reference, but make sure to shutdown its kernel (Kernel -> Shutdown) to free the GPU resources.

In [None]:
from IPython.display import display_html
def restartkernel():
    """Ask the notebook front end to restart the kernel by emitting a small <script> snippet."""
    # Relies on the global `Jupyter` JavaScript object being available in the page.
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
    
restartkernel()

In [None]:
%run kernel_reload.py

In [None]:
if TF_VER == '2':
    print("This part of the tutorial is designed for the TensorFlow 1.x. Please switch the environment.")

### Transforming to UFF

In [None]:
import graphsurgeon as gs 
import uff
import tensorrt 
import pycuda.driver as cuda
import pycuda.autoinit
import time

Convert to UFF.

In [None]:
UFF_MODEL_PATH = './models/out_model.uff'

In [None]:
# TF_VER_used_for_train is not defined in this notebook (presumably restored by
# kernel_reload.py - verify). The version is handled as a string elsewhere
# ('1' / '2'), so normalise before comparing; this accepts both 2 and '2'.
trained_with_tf2 = str(TF_VER_used_for_train).strip() == '2'

if trained_with_tf2:
    output_nodes = ['Identity']
    dynamic_graph = gs.DynamicGraph(FROZEN_GRAPH_PATH_TF2)
else:
    output_nodes = ['OutputLayer/Softmax']
    dynamic_graph = gs.DynamicGraph(FROZEN_GRAPH_PATH_TF1)

uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(), output_nodes=output_nodes, output_filename=UFF_MODEL_PATH, text=True)
print("converted to UFF")

### Building the TRT engine

The following part should be performed on target hardware. 

It's very important to specify the model data correctly.

In [None]:
class ModelData(object):
    """Names, shape and data type describing the UFF model's input and output."""
    # Name of the graph input registered with the UFF parser.
    INPUT_NAME = "InputLayer"
    # Shape of a single sample without the batch dimension.
    INPUT_SHAPE = (1, 28, 28)
    # The output node name depends on the TensorFlow major version used for
    # training. Normalise to a string so both 2 and '2' select the TF 2.x name.
    OUTPUT_NAME = "Identity" if str(TF_VER_used_for_train).strip() == '2' else "OutputLayer/Softmax"
    DATA_TYPE = tensorrt.float32

Check if the model file exists.

In [None]:
# Fail early when the UFF file is missing. The message template previously had
# three placeholders but only one argument, which raised IndexError instead of
# the intended IOError.
if not os.path.isfile(UFF_MODEL_PATH):
    raise IOError("\n{}\n".format(
        "Failed to find the model file ({}).".format(UFF_MODEL_PATH)
    ))

Necessary code to build and use the TRT engine:

In [None]:
def build_engine(model_path):
    """Build a TensorRT engine from a UFF model file.

    Args:
        model_path: Path to the UFF file.

    Returns:
        The engine returned by ``builder.build_engine``.

    Raises:
        RuntimeError: If the UFF parser fails to parse the model.

    Reads the notebook globals ``TRT_LOGGER``, ``BATCH_SIZE`` and ``ModelData``.
    """
    builder = tensorrt.Builder(TRT_LOGGER)
    builder.max_batch_size = BATCH_SIZE
    network = builder.create_network()

    builder_config = builder.create_builder_config()
    builder_config.max_workspace_size = 8000000000
    #builder_config.set_flag(tensorrt.BuilderFlag.FP16)

    parser = tensorrt.UffParser()
    parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE)
    parser.register_output(ModelData.OUTPUT_NAME)
    # Stop here on a parse failure instead of building from an incomplete network.
    if not parser.parse(model_path, network):
        raise RuntimeError("Failed to parse the UFF model: {}".format(model_path))

    return builder.build_engine(network, builder_config)



# Small record type pairing a host buffer with its device counterpart.
class HostDeviceMem(object):
    """Holds a host buffer (``host``) together with the matching device buffer (``device``)."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Two labelled sections, one per buffer.
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

    def __repr__(self):
        return self.__str__()

    
# Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk)
# to hold host inputs/outputs
def allocate_buffers(engine):
    """Allocate one host/device buffer pair per engine binding.

    Args:
        engine: TensorRT engine whose bindings are iterated.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: lists of ``HostDeviceMem``
        for input and output bindings, the device addresses of all bindings in
        iteration order, and a newly created CUDA stream.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # Per-sample volume multiplied by the engine's maximum batch size.
        element_count = tensorrt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        np_dtype = tensorrt.nptype(engine.get_binding_dtype(binding))
        # Host buffer first, then a device buffer of the same byte size.
        host_mem = cuda.pagelocked_empty(element_count, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # The bindings list carries the device address of every binding.
        bindings.append(int(device_mem))
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference and return the host output buffers.

    Args:
        context: Execution context providing ``execute_async``.
        bindings: Device addresses passed to the execution call.
        inputs: ``HostDeviceMem`` objects copied host -> device before execution.
        outputs: ``HostDeviceMem`` objects copied device -> host after execution.
        stream: CUDA stream on which copies and execution are enqueued.
        batch_size: Batch size forwarded to ``execute_async``.

    Returns:
        List with the ``host`` buffer of every output.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait until everything enqueued on the stream has finished.
    stream.synchronize()
    return [out.host for out in outputs]



def load_normalized_test_case(case_num, pagelocked_buffer, batch=1):
    """Copy test image(s) into ``pagelocked_buffer`` and return the matching label(s).

    Args:
        case_num: Index of the (first) test image.
        pagelocked_buffer: Destination buffer receiving the flattened image data.
        batch: Number of consecutive images to copy. Values greater than 1 copy
            ``batch`` images; otherwise a single image is copied.

    Returns:
        The one-hot label of the image, or the slice of labels for the batch.
    """
    _, _, x_test, y_test = load_data()

    # Use the `batch` argument for the slice length; it used to act only as a
    # flag while the global BATCH_SIZE determined the size.
    selection = slice(case_num, case_num + batch) if batch > 1 else case_num
    np.copyto(pagelocked_buffer, x_test[selection].ravel())
    return y_test[selection]


def run_trt_inference(context, loop_count = 200, warmup=50):
    """Time TensorRT inference on the stored batch and plot the predictions.

    Args:
        context: Execution context handed to ``do_inference``.
        loop_count: Number of timed inference runs.
        warmup: Number of untimed runs executed first.

    Reads the notebook globals ``inputs``, ``outputs``, ``bindings``, ``stream``,
    ``BATCH_SIZE``, ``class_names`` and ``idx`` (``idx`` is not defined in this
    notebook; presumably provided by kernel_reload.py - verify).
    """
    # Fill the input buffer with the batch starting at idx; the returned labels
    # are not needed here because visualize_batch looks them up itself.
    load_normalized_test_case(idx, inputs[0].host, BATCH_SIZE)

    # The do_inference function returns a list of outputs - we only have one in this case.
    for _ in range(warmup):
        [pred] = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=BATCH_SIZE)

    # perf_counter is a monotonic, high-resolution clock intended for short durations.
    start_time = time.perf_counter()
    for _ in range(loop_count):
        [pred] = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=BATCH_SIZE)
    elapsed_ms = (time.perf_counter() - start_time) * 1000

    print("TRT inference time: %s ms in average" %(elapsed_ms / loop_count))

    # The output arrives flattened; restore (batch, classes) before the argmax.
    pred_resh = np.reshape(pred, (BATCH_SIZE, len(class_names)))

    predicted_labels = [class_names[class_id] for class_id in np.argmax(pred_resh, axis=-1)]
    visualize_batch(idx, BATCH_SIZE, predicted_labels)

Build and use the engine for inference

In [None]:
# Logger read by build_engine (as a global) and by the runtime created in the next cell.
TRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE)

with build_engine(UFF_MODEL_PATH) as engine:
    
    # save to file
    serialized_engine = engine.serialize()

    TRT_ENGINE_PATH = "models/out_model.engine"

    with open(TRT_ENGINE_PATH, "wb") as f:
        f.write(serialized_engine)
    
    # allocate buffers (stored in notebook globals that run_trt_inference reads)
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    
    # create execution context
    with engine.create_execution_context() as context:
        print("\n=== Testing ===")
        run_trt_inference(context)

### Load the engine from file.

In [None]:
# Deserialize the engine written by the previous cell and repeat the inference test.
with open(TRT_ENGINE_PATH, "rb") as f, tensorrt.Runtime(TRT_LOGGER) as runtime:
     with runtime.deserialize_cuda_engine(f.read()) as loaded_engine:
        # allocate buffers (stored in notebook globals that run_trt_inference reads)
        inputs, outputs, bindings, stream = allocate_buffers(loaded_engine)

        # create execution context
        with loaded_engine.create_execution_context() as context:
            print("\n=== Testing ===")
            run_trt_inference(context)