# How to build a TensorRT graph using the saved_model API and use it for prediction

## Necessary imports

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gzip
import os, sys
import multiprocessing
from requests import get
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from mnist import mnist
import tensorflow.contrib.tensorrt as trt
import time

## Class to work with saved_model

In [2]:
class trtUsingSavedModel(object):
    """Convert a TensorFlow SavedModel to a TF-TRT graph and predict with it.

    On construction the SavedModel is converted with
    ``trt.create_inference_graph``, the resulting GraphDef is imported into a
    fresh ``tf.Graph`` and a ``tf.Session`` is opened on that graph.

    The instance owns the session. Call :meth:`close` when done, or use the
    instance as a context manager (``with trtUsingSavedModel(...) as model:``).
    """

    # Precision modes accepted by the constructor.
    ALLOWED_PRECISION_MODES = ('FP32', 'FP16', 'INT8')

    def __init__(self, saved_model_dir, saved_model_tag_list, max_batch_size,
                 trt_gpu_allocation_size_in_bytes, precision_mode,
                 input_tensor_name='input:0', output_tensor_name='output:0'):
        """Build the TensorRT graph and open a session on it.

        Args:
            saved_model_dir: directory containing the SavedModel to convert.
            saved_model_tag_list: tags identifying the MetaGraph to load
                (passed as ``input_saved_model_tags``).
            max_batch_size: passed as ``max_batch_size`` to the converter.
            trt_gpu_allocation_size_in_bytes: passed as
                ``max_workspace_size_bytes`` to the converter.
            precision_mode: one of ``ALLOWED_PRECISION_MODES``.
            input_tensor_name: name of the tensor fed with input data.
            output_tensor_name: name of the tensor fetched for predictions.

        Raises:
            ValueError: if ``precision_mode`` is not an allowed value.
        """
        # No session yet; lets close() run safely if construction fails below.
        self.sess = None

        # Validate before any conversion work. Raising (instead of the former
        # print + sys.exit()) keeps the notebook kernel alive and lets callers
        # handle the error.
        if precision_mode not in self.ALLOWED_PRECISION_MODES:
            raise ValueError(
                'Incorrect precision mode string provided: {!r}; '
                'please choose one from: {} and try again.'.format(
                    precision_mode, list(self.ALLOWED_PRECISION_MODES)))

        # config for tensorrt
        self.max_batch_size = max_batch_size
        self.max_workspace_size_bytes = trt_gpu_allocation_size_in_bytes
        self.precision_mode = precision_mode

        # config for tensorflow
        config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 3})
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.30

        # from saved_model to trt graph
        trt_graph_def = self.saved_modelToTRTGraph(saved_model_dir, saved_model_tag_list)
        self.graph = self.import_from_graph_def(trt_graph_def)

        self.sess = tf.Session(graph=self.graph, config=config)

        self.x = self.graph.get_tensor_by_name(input_tensor_name)
        self.y = self.graph.get_tensor_by_name(output_tensor_name)

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the session on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def close(self):
        """Close the owned session, if any. Safe to call more than once."""
        sess = getattr(self, 'sess', None)
        if sess is not None:
            sess.close()
            self.sess = None

    def saved_modelToTRTGraph(self, saved_model_dir, saved_model_tag_list):
        """Convert the SavedModel to a TensorRT inference GraphDef.

        Prints a short summary (TRTEngineOp count, total node count, node
        names) and returns the converted GraphDef.
        """
        # Create a TensorRT inference graph from a SavedModel. input_graph_def
        # and outputs are None because the graph comes from the SavedModel.
        trt_graph = trt.create_inference_graph(
            input_graph_def=None,
            outputs=None,
            input_saved_model_dir=saved_model_dir,
            input_saved_model_tags=saved_model_tag_list,
            max_batch_size=self.max_batch_size,
            max_workspace_size_bytes=self.max_workspace_size_bytes,
            precision_mode=self.precision_mode
        )

        print("saved_model successfully converted to TensorRT model!")
        print()

        print('=== INFO ===')
        # check how many ops were converted to TensorRT engines
        trt_engine_nodes = sum(1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp')
        print("numb. of trt_engine_nodes in TensorRT graph:", trt_engine_nodes)
        all_nodes = len(trt_graph.node)
        print("numb. of all_nodes in TensorRT graph:", all_nodes)
        print('Node names:', [n.name for n in trt_graph.node])

        return trt_graph

    def import_from_graph_def(self, trt_graph):
        """Import ``trt_graph`` into a new ``tf.Graph`` and return that graph."""
        with tf.Graph().as_default() as graph:
            # name='' keeps the original node names (no import prefix), so
            # tensors can be looked up by the names used in the SavedModel.
            tf.import_graph_def(trt_graph, name='')
        return graph

    def calculate_accuracy(self, y_pred, y_test):
        """Print the accuracy (in percent) and return it as a fraction.

        Args:
            y_pred: predicted labels.
            y_test: ground-truth labels, same shape as ``y_pred``.

        Returns:
            The fraction of positions where ``y_pred`` equals ``y_test``.

        Raises:
            ValueError: if the shapes differ or the inputs are empty.
        """
        y_pred = np.asarray(y_pred)
        y_test = np.asarray(y_test)
        # Mismatched shapes would broadcast (or compare as a single False) and
        # silently yield a meaningless number, so reject them explicitly.
        if y_pred.shape != y_test.shape:
            raise ValueError(
                'y_pred and y_test must have the same shape, got {} and {}.'.format(
                    y_pred.shape, y_test.shape))
        total_instances = y_test.size
        if total_instances == 0:
            raise ValueError('Cannot compute accuracy on empty input.')

        correct_instances = int(np.count_nonzero(y_test == y_pred))
        accuracy = float(correct_instances)/total_instances
        print('Accuracy: {}'.format(accuracy*100.0))
        return accuracy

    def predict_using_loaded_graph(self, X_test, batch_size=None):
        """Run the graph on ``X_test`` and return the argmax over axis 1.

        Args:
            X_test: input array fed to the input tensor.
            batch_size: optional positive int. When given, ``X_test`` is fed
                in consecutive chunks of at most this many rows and the
                results are concatenated. When ``None`` (default) the whole
                array is fed in a single run.

        Returns:
            1-D array of predicted class indices, one per input row.

        Raises:
            ValueError: if ``batch_size`` is given but not positive.
        """
        # NOTE(review): TF-TRT engines are built for at most max_batch_size
        # rows; feeding more in one run presumably falls back to the native
        # TensorFlow segment. Pass batch_size=self.max_batch_size to stay
        # within the engine limit (confirm against the TF-TRT docs).
        if batch_size is not None and batch_size <= 0:
            raise ValueError('batch_size must be a positive integer, got {!r}.'.format(batch_size))

        total = len(X_test)
        if batch_size is None or total <= batch_size:
            return np.argmax(self.sess.run(self.y, feed_dict={self.x: X_test}), axis=1)

        predictions = [
            np.argmax(
                self.sess.run(self.y, feed_dict={self.x: X_test[start:start + batch_size]}),
                axis=1)
            for start in range(0, total, batch_size)
        ]
        return np.concatenate(predictions)

## Load MNIST data

In [3]:
# Fetch the MNIST arrays through the project-local `mnist` helper.
obj = mnist()
X_train, y_train, X_test, y_test = obj.load_data()

# Carve the validation set out of the original test set only; the training
# data is left untouched. random_state=0 fixes the split.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test,
    y_test,
    random_state=0,
)

In [4]:
# Report the feature/label shapes of each split, one line per split.
print('==== Printing shapes of data ===')
print()
for split_label, split_features, split_targets in (
    ('Train data:      ', X_train, y_train),
    ('Test data:       ', X_test, y_test),
    ('Validation data: ', X_validation, y_validation),
):
    print(split_label, split_features.shape, split_targets.shape)

==== Printing shapes of data ===

Train data:       (60000, 32, 32, 1) (60000,)
Test data:        (7500, 32, 32, 1) (7500,)
Validation data:  (2500, 32, 32, 1) (2500,)


## Instantiate class with necessary parameters
* A few parameters are very important when freezing a TensorRT graph:
    * input_saved_model_dir
    * input_saved_model_tags
    * max_batch_size
    * trt_gpu_allocation_size_in_bytes (passed to *create_inference_graph* as max_workspace_size_bytes)
    * precision_mode
* __Note:__ In this implementation we also need to set the following positional arguments of the *create_inference_graph* function to *__None__* (the Python value, not the string):
    * input_graph_def
    * outputs

In [5]:
# Convert the LeNet SavedModel (tag 'serve') into an FP32 TensorRT graph.
# 1 << 25 bytes = 32 MiB is handed to the converter as its workspace size.
# NOTE(review): max_batch_size=2 is far below the 7500-row array fed to
# predict_using_loaded_graph later; confirm the TRT engine is really used for
# that call rather than a native TensorFlow fallback.
model = trtUsingSavedModel(
    saved_model_dir='./models/mnist/lenet/using_SavedModelBuilder/',
    saved_model_tag_list=['serve'],
    max_batch_size=2,
    trt_gpu_allocation_size_in_bytes=1 << 25,
    precision_mode='FP32',
)

INFO:tensorflow:Running against TensorRT version 5.0.2
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./models/mnist/lenet/using_SavedModelBuilder/variables/variables
Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
INFO:tensorflow:Froze 10 variables.
INFO:tensorflow:Converted 10 variables to const ops.
saved_model successfully converted to TensorRT model!

=== INFO ===
numb. of trt_engine_nodes in TensorRT graph: 1
numb. of all_nodes in TensorRT graph: 3
Node names: ['input', 'TRTEngineOp_0', 'output']


## Predict using TensorRT model

In [6]:
# Wall-clock the prediction pass. The measured interval also covers the
# accuracy report, since both calls sit between the two timestamps.
t1 = time.time()
y_pred = model.predict_using_loaded_graph(X_test)
model.calculate_accuracy(y_pred, y_test)
t2 = time.time()

print('Exec time for TensorRT: ', t2 - t1)

Accuracy: 95.94666666666667
Exec time for TensorRT:  1.1846463680267334


## References 

* https://docs.nvidia.com/deeplearning/dgx/tf-trt-user-guide/index.html#capabilities