# Get Pretrained Model:

In [1]:
from tensorflow.keras.applications import ResNet50

In [2]:
# Build ResNet50 with include_top=True and the 'imagenet' weights, then list its layers.
model = ResNet50(
    include_top=True,
    weights='imagenet',
)
model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1_conv[0][0]                 
___________________________________________________________________________________________

In [3]:
# Export the Keras model to the 'resnet50_saved_model' directory; the TF-TRT
# converters below read it from there.
model.save(filepath='resnet50_saved_model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: resnet50_saved_model/assets


In [4]:
from tensorflow.python.compiler.tensorrt import trt_convert as trt

In [5]:
# Start from TF-TRT's stock conversion parameters and display them for reference.
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS
conversion_params

TrtConversionParams(rewriter_config_template=None, max_workspace_size_bytes=1073741824, precision_mode='FP32', minimum_segment_size=3, is_dynamic_op=True, maximum_cached_engines=1, use_calibration=True, max_batch_size=1, allow_build_at_runtime=True)

In [6]:
# FP32 and FP16 variants of the default parameters. Both raise the TensorRT
# workspace limit to 8e9 bytes (the default shown above is 1073741824).
conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    max_workspace_size_bytes=8 * 10**9,
    precision_mode=trt.TrtPrecisionMode.FP32,
)
conversion_params_fp16 = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    max_workspace_size_bytes=8 * 10**9,
    precision_mode=trt.TrtPrecisionMode.FP16,
)

In [7]:
# INT8 variant of the default parameters, with the same 8e9-byte workspace limit
# as the FP32/FP16 settings.
conversion_params_int8 = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
    max_workspace_size_bytes=8 * 10**9,
    precision_mode=trt.TrtPrecisionMode.INT8,
)

# Run the conversion and save it

### FP32

In [8]:
print('Converting to TF-TRT FP32...')

# Read the SavedModel exported earlier and convert it with the FP32 settings.
converter = trt.TrtGraphConverterV2(
    conversion_params=conversion_params,
    input_saved_model_dir='resnet50_saved_model',
)
# build() is not called here, so (as the conversion log reports) the TensorRT
# engines are built and cached at runtime instead.
converter.convert()
converter.save(output_saved_model_dir='resnet50_saved_model_TFTRT_FP32')

print('Done Converting to TF-TRT FP32')

Converting to TF-TRT FP32...
INFO:tensorflow:Linked TensorRT version: (6, 0, 1)
INFO:tensorflow:Loaded TensorRT version: (6, 0, 1)
INFO:tensorflow:Could not find TRTEngineOp_0_0 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.
INFO:tensorflow:Assets written to: resnet50_saved_model_TFTRT_FP32/assets
Done Converting to TF-TRT FP32


### FP16

In [9]:
print('Converting to TF-TRT FP16...')

# Same source SavedModel as the FP32 run, converted with the FP16 settings.
converter = trt.TrtGraphConverterV2(
    conversion_params=conversion_params_fp16,
    input_saved_model_dir='resnet50_saved_model',
)
converter.convert()
converter.save(output_saved_model_dir='resnet50_saved_model_TFTRT_FP16')

print('Done Converting to TF-TRT FP16')

Converting to TF-TRT FP16...
INFO:tensorflow:Linked TensorRT version: (6, 0, 1)
INFO:tensorflow:Loaded TensorRT version: (6, 0, 1)
INFO:tensorflow:Could not find TRTEngineOp_1_0 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime.
INFO:tensorflow:Assets written to: resnet50_saved_model_TFTRT_FP16/assets
Done Converting to TF-TRT FP16


### INT8

In [10]:
import numpy as np
import tensorflow as tf

# Placeholder calibration input: one batch of 8 all-zero 224x224x3 images.
# NOTE(review): INT8 calibration presumably needs representative images to pick
# sensible ranges; the INT8 outputs further down differ noticeably from the
# FP32/FP16 ones -- confirm before relying on INT8 accuracy.
dummy_calibration_batch = np.zeros(shape=(8, 224, 224, 3))

In [11]:
print('Converting to TF-TRT INT8...')


def calibration_input_fn():
    """Yield the calibration data: a single 1-tuple holding the float32 dummy batch."""
    calibration_tensor = tf.constant(dummy_calibration_batch.astype('float32'))
    yield (calibration_tensor, )


# Same source SavedModel as before, converted with the INT8 settings; convert()
# is handed the generator above as its calibration input.
converter = trt.TrtGraphConverterV2(
    conversion_params=conversion_params_int8,
    input_saved_model_dir='resnet50_saved_model',
)
converter.convert(calibration_input_fn=calibration_input_fn)
converter.save(output_saved_model_dir='resnet50_saved_model_TFTRT_INT8')

print('Done Converting to TF-TRT INT8')

Converting to TF-TRT INT8...
INFO:tensorflow:Linked TensorRT version: (6, 0, 1)
INFO:tensorflow:Loaded TensorRT version: (6, 0, 1)
INFO:tensorflow:Assets written to: resnet50_saved_model_TFTRT_INT8/assets
Done Converting to TF-TRT INT8


# Time original model:

In [12]:
import tensorflow as tf

In [13]:
# Reload the unconverted Keras model from disk; it is the baseline for the timings.
model_unoptimized = tf.keras.models.load_model(filepath='resnet50_saved_model')



In [14]:
import numpy as np
# One batch of 32 all-zero 224x224x3 inputs, reused by every timing cell below.
dummy_input_batch = np.zeros(shape=(32, 224, 224, 3))

# Warm-up call made before the timed loop.
result = model_unoptimized.predict(x=dummy_input_batch)

In [15]:
%%time

# Ten back-to-back predictions on the same batch; only the last result is kept.
for i in range(10):
    result = model_unoptimized.predict(dummy_input_batch)

# Show the final predictions followed by their shape.
print(result, result.shape, sep='\n')

[[1.6964244e-04 3.3007504e-04 6.1350605e-05 ... 1.4622412e-05
  1.4449906e-04 6.6087098e-04]
 [1.6964244e-04 3.3007504e-04 6.1350605e-05 ... 1.4622412e-05
  1.4449906e-04 6.6087098e-04]
 [1.6964244e-04 3.3007504e-04 6.1350605e-05 ... 1.4622412e-05
  1.4449906e-04 6.6087098e-04]
 ...
 [1.6964244e-04 3.3007504e-04 6.1350605e-05 ... 1.4622412e-05
  1.4449906e-04 6.6087098e-04]
 [1.6964244e-04 3.3007504e-04 6.1350605e-05 ... 1.4622412e-05
  1.4449906e-04 6.6087098e-04]
 [1.6964244e-04 3.3007504e-04 6.1350605e-05 ... 1.4622412e-05
  1.4449906e-04 6.6087098e-04]]
(32, 1000)
CPU times: user 719 ms, sys: 151 ms, total: 870 ms
Wall time: 752 ms


# Load TRT FP32 model

In [16]:
from tensorflow.python.saved_model import tag_constants
import tensorflow as tf

In [17]:
# Load the FP32 TF-TRT SavedModel and take its default serving signature as the
# callable used for inference.
saved_model_loaded = tf.saved_model.load(
    'resnet50_saved_model_TFTRT_FP32',
    tags=[tag_constants.SERVING],
)
wrapper_fp32 = saved_model_loaded.signatures['serving_default']

In [18]:
def predict(x, wrapper, output_key='predictions'):
    """Run one batch through a loaded SavedModel signature and return NumPy output.

    Args:
        x: Input batch as a NumPy array or array-like. It is converted to
            float32 before being handed to TensorFlow.
        wrapper: Callable signature (for example
            ``loaded.signatures['serving_default']``) that takes one tensor and
            returns a dict of output tensors.
        output_key: Name of the entry to read from the returned dict. Defaults
            to ``'predictions'``, the key used throughout this notebook.

    Returns:
        The selected output, converted with ``.numpy()``.

    Raises:
        KeyError: If ``output_key`` is not among the signature's outputs. The
            message lists the keys that are available.
    """
    # np.asarray accepts lists as well as arrays and skips the copy when the
    # input is already float32.
    batch = tf.constant(np.asarray(x, dtype='float32'))
    outputs = wrapper(batch)
    if output_key not in outputs:
        raise KeyError(
            'output %r not found; available outputs: %s'
            % (output_key, sorted(outputs))
        )
    return outputs[output_key].numpy()

# Warm up (the first call builds the TensorRT engines at runtime, so keep it out of the timing)

In [19]:
# Untimed first call on the FP32 TF-TRT model, made before the timed loop.
predict(x=dummy_input_batch, wrapper=wrapper_fp32)

array([[1.6964250e-04, 3.3007396e-04, 6.1350263e-05, ..., 1.4622303e-05,
        1.4449868e-04, 6.6086580e-04],
       [1.6964250e-04, 3.3007396e-04, 6.1350263e-05, ..., 1.4622303e-05,
        1.4449868e-04, 6.6086580e-04],
       [1.6964250e-04, 3.3007396e-04, 6.1350263e-05, ..., 1.4622303e-05,
        1.4449868e-04, 6.6086580e-04],
       ...,
       [1.6964250e-04, 3.3007396e-04, 6.1350263e-05, ..., 1.4622303e-05,
        1.4449868e-04, 6.6086580e-04],
       [1.6964250e-04, 3.3007396e-04, 6.1350263e-05, ..., 1.4622303e-05,
        1.4449868e-04, 6.6086580e-04],
       [1.6964250e-04, 3.3007396e-04, 6.1350263e-05, ..., 1.4622303e-05,
        1.4449868e-04, 6.6086580e-04]], dtype=float32)

# Time FP32

In [20]:
%%time

# Ten back-to-back FP32 TF-TRT predictions on the same batch; only the last result is kept.
for i in range(10):
    result = predict(dummy_input_batch, wrapper_fp32)

# Show the final predictions followed by their shape.
print(result, result.shape, sep='\n')

[[1.6964250e-04 3.3007396e-04 6.1350263e-05 ... 1.4622303e-05
  1.4449868e-04 6.6086580e-04]
 [1.6964250e-04 3.3007396e-04 6.1350263e-05 ... 1.4622303e-05
  1.4449868e-04 6.6086580e-04]
 [1.6964250e-04 3.3007396e-04 6.1350263e-05 ... 1.4622303e-05
  1.4449868e-04 6.6086580e-04]
 ...
 [1.6964250e-04 3.3007396e-04 6.1350263e-05 ... 1.4622303e-05
  1.4449868e-04 6.6086580e-04]
 [1.6964250e-04 3.3007396e-04 6.1350263e-05 ... 1.4622303e-05
  1.4449868e-04 6.6086580e-04]
 [1.6964250e-04 3.3007396e-04 6.1350263e-05 ... 1.4622303e-05
  1.4449868e-04 6.6086580e-04]]
(32, 1000)
CPU times: user 243 ms, sys: 60.8 ms, total: 304 ms
Wall time: 298 ms


# Time FP16

In [21]:
# Load the FP16 TF-TRT SavedModel and take its default serving signature as the
# callable used for inference.
saved_model_loaded = tf.saved_model.load(
    'resnet50_saved_model_TFTRT_FP16',
    tags=[tag_constants.SERVING],
)
wrapper_fp16 = saved_model_loaded.signatures['serving_default']

In [22]:
# Untimed first call on the FP16 TF-TRT model, made before the timed loop.
predict(x=dummy_input_batch, wrapper=wrapper_fp16)

array([[1.7087246e-04, 3.2928490e-04, 6.0941329e-05, ..., 1.4803631e-05,
        1.4508840e-04, 6.5903127e-04],
       [1.7087246e-04, 3.2928490e-04, 6.0941329e-05, ..., 1.4803631e-05,
        1.4508840e-04, 6.5903127e-04],
       [1.7087246e-04, 3.2928490e-04, 6.0941329e-05, ..., 1.4803631e-05,
        1.4508840e-04, 6.5903127e-04],
       ...,
       [1.7087246e-04, 3.2928490e-04, 6.0941329e-05, ..., 1.4803631e-05,
        1.4508840e-04, 6.5903127e-04],
       [1.7087246e-04, 3.2928490e-04, 6.0941329e-05, ..., 1.4803631e-05,
        1.4508840e-04, 6.5903127e-04],
       [1.7087246e-04, 3.2928490e-04, 6.0941329e-05, ..., 1.4803631e-05,
        1.4508840e-04, 6.5903127e-04]], dtype=float32)

In [23]:
%%time

# Ten back-to-back FP16 TF-TRT predictions on the same batch; only the last result is kept.
for i in range(10):
    result = predict(dummy_input_batch, wrapper_fp16)

# Show the final predictions followed by their shape.
print(result, result.shape, sep='\n')

[[1.7087246e-04 3.2928490e-04 6.0941329e-05 ... 1.4803631e-05
  1.4508840e-04 6.5903127e-04]
 [1.7087246e-04 3.2928490e-04 6.0941329e-05 ... 1.4803631e-05
  1.4508840e-04 6.5903127e-04]
 [1.7087246e-04 3.2928490e-04 6.0941329e-05 ... 1.4803631e-05
  1.4508840e-04 6.5903127e-04]
 ...
 [1.7087246e-04 3.2928490e-04 6.0941329e-05 ... 1.4803631e-05
  1.4508840e-04 6.5903127e-04]
 [1.7087246e-04 3.2928490e-04 6.0941329e-05 ... 1.4803631e-05
  1.4508840e-04 6.5903127e-04]
 [1.7087246e-04 3.2928490e-04 6.0941329e-05 ... 1.4803631e-05
  1.4508840e-04 6.5903127e-04]]
(32, 1000)
CPU times: user 147 ms, sys: 24.3 ms, total: 171 ms
Wall time: 167 ms


# Time INT8:

In [24]:
# Load the INT8 TF-TRT SavedModel and take its default serving signature as the
# callable used for inference.
saved_model_loaded = tf.saved_model.load(
    'resnet50_saved_model_TFTRT_INT8',
    tags=[tag_constants.SERVING],
)
wrapper_int8 = saved_model_loaded.signatures['serving_default']

In [25]:
# Untimed first call on the INT8 TF-TRT model, made before the timed loop.
predict(x=dummy_input_batch, wrapper=wrapper_int8)

array([[8.2323175e-05, 6.4522948e-04, 1.3345230e-04, ..., 2.4260997e-05,
        1.4522210e-04, 3.0344336e-03],
       [8.2323175e-05, 6.4522948e-04, 1.3345230e-04, ..., 2.4260997e-05,
        1.4522210e-04, 3.0344336e-03],
       [8.2323175e-05, 6.4522948e-04, 1.3345230e-04, ..., 2.4260997e-05,
        1.4522210e-04, 3.0344336e-03],
       ...,
       [8.2323175e-05, 6.4522948e-04, 1.3345230e-04, ..., 2.4260997e-05,
        1.4522210e-04, 3.0344336e-03],
       [8.2323175e-05, 6.4522948e-04, 1.3345230e-04, ..., 2.4260997e-05,
        1.4522210e-04, 3.0344336e-03],
       [8.2323175e-05, 6.4522948e-04, 1.3345230e-04, ..., 2.4260997e-05,
        1.4522210e-04, 3.0344336e-03]], dtype=float32)

In [26]:
%%time

# Ten back-to-back INT8 TF-TRT predictions on the same batch; only the last result is kept.
for i in range(10):
    result = predict(dummy_input_batch, wrapper_int8)

# Show the final predictions followed by their shape.
print(result, result.shape, sep='\n')

[[8.2323175e-05 6.4522948e-04 1.3345230e-04 ... 2.4260997e-05
  1.4522210e-04 3.0344336e-03]
 [8.2323175e-05 6.4522948e-04 1.3345230e-04 ... 2.4260997e-05
  1.4522210e-04 3.0344336e-03]
 [8.2323175e-05 6.4522948e-04 1.3345230e-04 ... 2.4260997e-05
  1.4522210e-04 3.0344336e-03]
 ...
 [8.2323175e-05 6.4522948e-04 1.3345230e-04 ... 2.4260997e-05
  1.4522210e-04 3.0344336e-03]
 [8.2323175e-05 6.4522948e-04 1.3345230e-04 ... 2.4260997e-05
  1.4522210e-04 3.0344336e-03]
 [8.2323175e-05 6.4522948e-04 1.3345230e-04 ... 2.4260997e-05
  1.4522210e-04 3.0344336e-03]]
(32, 1000)
CPU times: user 134 ms, sys: 11.6 ms, total: 146 ms
Wall time: 142 ms
