# How to build a TensorRT graph from a frozen model and use it for prediction

## Necessary imports

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gzip
import os, sys
import multiprocessing
from requests import get
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from mnist import mnist
import tensorflow.contrib.tensorrt as trt
import time

## Class to convert TensorFlow model to TensorRT model

In [2]:
class convertTFGraphToTRT(object):
    """Convert a frozen TensorFlow graph into a TensorRT-optimized graph.

    The TensorRT settings (max batch size, workspace size and precision mode)
    are stored at construction time and applied by `frozen_modelToTRTGraph`.
    """

    def __init__(self, max_batch_size, trt_gpu_allocation_size_in_bytes, precision_mode):
        """Validate and store the TensorRT conversion settings.

        Args:
            max_batch_size: forwarded to TensorRT as `max_batch_size`.
            trt_gpu_allocation_size_in_bytes: forwarded to TensorRT as
                `max_workspace_size_bytes`.
            precision_mode: one of 'FP32', 'FP16' or 'INT8'.

        Raises:
            SystemExit: (via `sys.exit()`) when `precision_mode` is not one of
                the allowed strings.
        """
        # config for tensorrt
        allowed_precision_modes = ['FP32', 'FP16', 'INT8']
        if precision_mode not in allowed_precision_modes:
            # adjacent literals are concatenated, so the message carries no
            # stray run of indentation spaces
            print('Incorrect precision mode string provided, '
                  'please choose one from: {} and try again.'.format(allowed_precision_modes))
            sys.exit()

        self.max_batch_size = max_batch_size
        self.max_workspace_size_bytes = trt_gpu_allocation_size_in_bytes
        self.precision_mode = precision_mode

    def import_graph_def(self, frozen_graph_path):
        """Read the protobuf file at `frozen_graph_path` and return it as a `tf.GraphDef`."""
        with tf.gfile.GFile(frozen_graph_path, 'rb') as f:
            # a GraphDef is the serialized version of the graph
            graph_def = tf.GraphDef()
            # load graphdef from protobuf file
            graph_def.ParseFromString(f.read())
        return graph_def

    def frozen_modelToTRTGraph(self, frozen_graph_path, output_node_list):
        """Optimize a frozen model with TensorRT and store the result on disk.

        The optimized graph is written next to the input file as
        `<name>_trt_<precision_mode><ext>` (`.pb` is used when the input path
        has no extension), so the input file is never overwritten. A short
        node-count summary is printed afterwards.

        Args:
            frozen_graph_path: path of the frozen model protobuf file.
            output_node_list: list of output node names.

        Raises:
            SystemExit: (via `sys.exit()`) when `output_node_list` is not a list.
        """
        if not isinstance(output_node_list, list):
            print('please provide a list of output nodes instead of a string')
            sys.exit()

        # load frozen tf graph/model
        frozen_graph = self.import_graph_def(frozen_graph_path)

        # convert (optimize) frozen model to TensorRT model using the
        # settings given to the constructor
        trt_graph = trt.create_inference_graph(
            # frozen model
            input_graph_def=frozen_graph,
            outputs=output_node_list,
            # max batch size chosen at construction time
            max_batch_size=self.max_batch_size,
            # max workspace size (bytes) chosen at construction time
            max_workspace_size_bytes=self.max_workspace_size_bytes,
            # precision: "FP32", "FP16" or "INT8"
            precision_mode=self.precision_mode)

        # Build the output path from the split extension rather than
        # str.replace('.pb', ...): replace() returns the path unchanged when
        # it has no '.pb', which would overwrite the input model.
        path_root, path_ext = os.path.splitext(frozen_graph_path)
        trt_graph_path = '{}_trt_{}{}'.format(path_root, self.precision_mode, path_ext or '.pb')

        # Finally we serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(trt_graph_path, 'wb') as f:
            f.write(trt_graph.SerializeToString())

        print("TensorRT model is successfully stored!")
        print()

        # number of ops in the original frozen model
        all_nodes = len(frozen_graph.node)
        print("numb. of all_nodes in frozen graph:", all_nodes)
        # number of nodes that are TensorRT engines
        trt_engine_nodes = sum(1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp')
        print("numb. of trt_engine_nodes in TensorRT graph:", trt_engine_nodes)
        # number of ops in the converted graph
        all_nodes = len(trt_graph.node)
        print("numb. of all_nodes in TensorRT graph:", all_nodes)

## Instantiate class with necessary parameters
* A few very important parameters must be set when converting a frozen model to a TensorRT graph:
    * max_batch_size
    * trt_gpu_allocation_size_in_bytes
    * precision_mode

In [3]:
# settings passed to the converter: batch size 2, 1 << 25 bytes (32 MiB), FP32 precision
obj = convertTFGraphToTRT(
    max_batch_size=2,
    trt_gpu_allocation_size_in_bytes=1 << 25,
    precision_mode='FP32',
)

## Convert model
* This requires the following:
    * frozen model that needs to be converted
    * list of output nodes

In [4]:
# convert the frozen model at this path; 'output' is the only output node requested
obj.frozen_modelToTRTGraph(
    frozen_graph_path='./models/mnist/lenet/frozen_model.pb',
    output_node_list=['output'],
)

INFO:tensorflow:Running against TensorRT version 5.0.2
TensorRT model is successfully stored!

numb. of all_nodes in frozen graph: 40
numb. of trt_engine_nodes in TensorRT graph: 1
numb. of all_nodes in TensorRT graph: 3


## Class to load frozen models
* A TensorRT model can be loaded and used just like a normal TensorFlow model
* The part below here is taken from the notebook on [Working with Frozen TensorFlow model](Working-with-Frozen-TensorFlow-model.ipynb)

In [5]:
class loadFrozenGraph(object):
    """Load a frozen graph from a protobuf file and run predictions with it."""

    def __init__(self, frozen_graph_path):
        """Import the graph at `frozen_graph_path` and open a session bound to it."""
        # import frozen graph; the session below is created on this graph
        self.graph = self.import_graph(frozen_graph_path)

        # session config: device counts plus the GPU memory options
        session_config = tf.ConfigProto(device_count={'GPU':1, 'CPU':3})
        session_config.gpu_options.allow_growth = True
        session_config.gpu_options.per_process_gpu_memory_fraction = 0.30

        # MOST IMPORTANT - the loaded graph is handed to the session explicitly
        self.sess = tf.Session(config=session_config, graph=self.graph)

        # input and output tensors; names carry the 'prefix' scope used on import
        self.x = self.graph.get_tensor_by_name('prefix/input:0')
        self.y = self.graph.get_tensor_by_name('prefix/output:0')

    def import_graph(self, frozen_graph_path):
        """Parse the protobuf file into a GraphDef and import it into a new graph.

        Returns:
            The new `tf.Graph`, with every imported node under the 'prefix' scope.
        """
        # GraphDef is the serialized version of the graph
        serialized_graph = tf.GraphDef()
        # tf.gfile is the TF way to open a file; the plain python file api
        # also works when loading from the local FS
        with tf.gfile.GFile(frozen_graph_path, 'rb') as pb_file:
            serialized_graph.ParseFromString(pb_file.read())

        # create an empty graph and import the graph def into it
        loaded_graph = tf.Graph()
        with loaded_graph.as_default():
            tf.import_graph_def(serialized_graph, name='prefix')

        return loaded_graph

    def get_tensor_names(self):
        """Print the name of every operation in the loaded graph, one per line."""
        for operation in self.graph.get_operations():
            print(operation.name)

    def predict_from_frozen_graph(self, X_test):
        """Run the graph on `X_test` and return the argmax along axis 1 of the output."""
        model_output = self.sess.run(self.y, feed_dict={self.x: X_test})
        return np.argmax(model_output, axis=1)

    def calculate_accuracy(self, y_pred, y_test):
        """Print the percentage of positions where `y_pred` equals `y_test`."""
        matching_positions = np.where(y_test == y_pred)[0]
        n_correct = matching_positions.shape[0]
        n_total = y_test.shape[0]
        accuracy = float(n_correct) / n_total
        print('Accuracy: {}'.format(accuracy*100.0))

## Class to work with saved_model

In [6]:
class trtUsingSavedModel(object):
    """Build a TensorRT-optimized graph from a SavedModel and predict with it.

    The TensorRT settings and the input/output tensor names are configurable
    through keyword arguments; their defaults are the values that were
    previously hard-coded.
    """

    def __init__(self, saved_model_dir, saved_model_tag_list,
                 max_batch_size=2, max_workspace_size_bytes=2*(10**9),
                 precision_mode='FP32',
                 input_tensor_name='input:0', output_tensor_name='output:0'):
        """Convert the SavedModel, import the result and open a session on it.

        Args:
            saved_model_dir: directory of the SavedModel to convert.
            saved_model_tag_list: tags forwarded as `input_saved_model_tags`.
            max_batch_size: forwarded to TensorRT as `max_batch_size`.
            max_workspace_size_bytes: forwarded to TensorRT as
                `max_workspace_size_bytes`.
            precision_mode: forwarded to TensorRT as `precision_mode`.
            input_tensor_name: name of the tensor fed with the input data.
            output_tensor_name: name of the tensor fetched for predictions.
        """
        # the settings must be stored before the conversion below reads them
        self.max_batch_size = max_batch_size
        self.max_workspace_size_bytes = max_workspace_size_bytes
        self.precision_mode = precision_mode

        self.graph = self.import_from_graph_def(self.saved_modelToTRTGraph(saved_model_dir, saved_model_tag_list))

        self.x = self.graph.get_tensor_by_name(input_tensor_name)
        self.y = self.graph.get_tensor_by_name(output_tensor_name)

        self.sess = tf.Session(graph=self.graph)

    def saved_modelToTRTGraph(self, saved_model_dir, saved_model_tag_list):
        """Create a TensorRT inference graph from a SavedModel.

        Prints the names of the nodes in the converted graph and returns the
        converted graph.
        """
        trt_graph = trt.create_inference_graph(
            input_graph_def=None,
            outputs=None,
            input_saved_model_dir=saved_model_dir,
            input_saved_model_tags=saved_model_tag_list,
            max_batch_size=self.max_batch_size,
            max_workspace_size_bytes=self.max_workspace_size_bytes,
            precision_mode=self.precision_mode
        )

        print([n.name for n in trt_graph.node])
        return trt_graph

    def import_from_graph_def(self, trt_graph):
        """Import `trt_graph` into a new graph (no name prefix) and return that graph."""
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(trt_graph, name='')
        return graph

    def calculate_accuracy(self, y_pred, y_test):
        """Print the percentage of positions where `y_pred` equals `y_test`."""
        correct_instances = np.where(y_test == y_pred)[0].shape[0]
        total_instances = y_test.shape[0]
        accuracy = float(correct_instances)/total_instances
        print('Accuracy: {}'.format(accuracy*100.0))

    def predict_using_loaded_graph(self, X_test):
        """Run the graph on `X_test` and return the argmax along axis 1 of the output."""
        return np.argmax(self.sess.run(self.y, feed_dict={self.x: X_test}), axis=1)

## Load MNIST data

In [7]:
# load the MNIST arrays through the imported `mnist` helper
obj = mnist()
X_train, y_train, X_test, y_test = obj.load_data()

# carve the validation set out of the test set only; the training data is not split
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test,
    y_test,
    random_state=0,
)

In [8]:
# header followed by one blank line, then one row per data split
print('==== Printing shapes of data ===', end='\n\n')
print('Train data:      ', X_train.shape, y_train.shape)
print('Test data:       ', X_test.shape, y_test.shape)
print('Validation data: ', X_validation.shape, y_validation.shape)

==== Printing shapes of data ===

Train data:       (60000, 32, 32, 1) (60000,)
Test data:        (7500, 32, 32, 1) (7500,)
Validation data:  (2500, 32, 32, 1) (2500,)


## Load TensorRT frozen model

In [9]:
# load the graph file written by the conversion step (FP32 variant)
model = loadFrozenGraph(
    frozen_graph_path='./models/mnist/lenet/frozen_model_trt_FP32.pb',
)

## Predict using TensorRT model
* I could have included a comparative study here to benchmark the TensorRT model against the TensorFlow model, but I already know how well the former works. There is also plenty of existing work on the internet that compares the two.
* Check links in the reference section

In [10]:
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and may jump if the system clock is adjusted.
# Only the prediction call is timed, so the reported number matches its label
# and does not include the accuracy computation.
# NOTE(review): the first call may include one-time graph/engine start-up cost - confirm
t1 = time.perf_counter()
y_pred = model.predict_from_frozen_graph(X_test)
t2 = time.perf_counter()

model.calculate_accuracy(y_pred, y_test)

print('Exec time for TensorRT frozen model: ', t2-t1)

Accuracy: 95.94666666666667
Exec time for TensorRT frozen model:  2.6548843383789062


## References

* Original source: https://docs.nvidia.com/deeplearning/dgx/tf-trt-user-guide/index.html - this contains a detailed list of parameters that need to be explored further.
* Others
    * https://github.com/ardianumam/Tensorflow-TensorRT
    * http://litaotju.github.io/2019/01/24/Tensorflow-Tutorial-6,-Using-TensorRT-to-speedup-inference/
    * https://tsmatz.wordpress.com/2018/07/07/tensorrt-tensorflow-python-on-azure-tutorial/
    * https://developers.googleblog.com/2018/03/tensorrt-integration-with-tensorflow.html