# Inference of Pydnet-Pytorch pretrained model on an image

Run inference on an image, display the output, and measure the runtime on a local machine (PyTorch 1.6, CUDA 10.1, SuperServer with a GTX 2080 Ti GPU, TensorFlow 2.4.0-dev).

Also optimize the model for faster inference using:

- [x] JIT from PyTorch
- [ ] JIT from NVIDIA (TRTorch)
- [ ] Mobile optimizer of PyTorch --> Gives an error. Try blacklisting each optimization pass in turn and check whether the error disappears
- [x] ONNX and its optimizer
- [x] ONNX Runtime
- [ ] TensorRT
- [ ] TVM
- [x] Convert the ONNX model to TensorFlow and compare runtime. The upsample layers must use `align_corners=True` for this

In [None]:
#!pip install tf-nightly

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import torch # v 1.6
import torchvision.transforms.functional as TF
from pydnet import PyddepthInference, Pydnet
import torch.utils.mobile_optimizer as mobile_optimizer
import numpy as np
import tensorflow as tf # 2.4.0-dev

device = "cpu"

In [None]:
# ONNXRuntime-GPU requires CUDA 10.1
#!pip install onnxruntime
#!conda install -c conda-forge onnx --yes
import onnx
import onnxruntime as ort

Load input data

In [None]:
# Load the test image and resize it to 640x192 (width x height).
# convert('RGB') guarantees three channels even when the PNG carries an alpha
# channel or is grayscale.
# Image.LANCZOS is the same filter as the old Image.ANTIALIAS alias, which was
# removed in Pillow 10.
img1 = Image.open('test/1.png').convert('RGB')
img1 = img1.resize((640, 192), Image.LANCZOS)

# PyTorch input: to_tensor() output with a leading batch axis added.
example1 = TF.to_tensor(img1).unsqueeze(0).to(device)

# float32 NumPy copy of the same tensor, reused by the TensorFlow / TFLite cells.
example1_tf = example1.to('cpu').detach().numpy().astype(np.float32)
# Layout conversion is intentionally left disabled; the array keeps the tensor's layout.
#example1_tf = np.transpose(example1_tf, [0, 2, 3, 1])

print(example1.shape)
plt.imshow(img1);

Load models

In [None]:
# bla=Pydnet(mobile_version=True, my_version=False)
# loaded_dict = torch.utils.model_zoo.load_url("https://github.com/zshn25/Pydnet-Pytorch/blob/forMonodepth2/mobile_pydnet.pth", 
#                                              map_location= lambda storage, loc: storage)

#bla.load_state_dict(loaded_dict)


In [None]:
# pyddepth = PyddepthInference(mobile_version=True, my_version=False, pretrained=False)

# loaded_dict_enc = torch.load("mobile_pydnet.pth", map_location=device)

# Build the inference wrapper with pretrained=True.
# NOTE(review): where the pretrained weights come from is decided inside
# pydnet.PyddepthInference and is not visible here — confirm.
pyddepth = PyddepthInference(pretrained=True)

# Disabled manual alternative, kept for reference: load a local checkpoint,
# strip the "module." prefix from every key, then load it non-strictly.
# loaded_dict_enc = torch.load("mobile_pydnet.pth", map_location=device)

# new_dict_enc = {}
# for k,v in loaded_dict_enc.items():
#     new_dict_enc[k.replace("module.", "")] = loaded_dict_enc[k]

# pyddepth.load_state_dict(new_dict_enc, strict=False)
# Move the model to the configured device and switch it to evaluation mode.
pyddepth.to(device)
pyddepth.eval();

In [None]:
%timeit with torch.no_grad(): pyddepth(example1)

output=pyddepth(example1)
output1 = output.to('cpu').detach().numpy()
output1 = output1.squeeze()

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

Model pruning

In [None]:
import torch.nn.utils.prune as prune

# Register the 'weight' attribute of every sub-module that exposes one;
# the result is a tuple of (module, 'weight') pairs.
parameters_to_prune = tuple(
    (layer, 'weight')
    for _, layer in pyddepth.named_modules()
    if hasattr(layer, "weight")
)

In [None]:
# Prune 35% of all collected weights by L1 magnitude, ranked globally
# across the registered modules rather than per layer.
prune.global_unstructured(parameters_to_prune, pruning_method=prune.L1Unstructured, amount=0.35)

# Drop the pruning re-parametrization so each module keeps only its pruned 'weight'.
for pruned_module, param_name in parameters_to_prune:
    prune.remove(pruned_module, name=param_name)

In [None]:
%timeit with torch.no_grad(): pyddepth(example1)

output=pyddepth(example1)
output1 = output.to('cpu').detach().numpy()
output1 = output1.squeeze()

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

In [None]:
# Verify the global sparsity: exactly-zero weight entries over all weight entries,
# counted across every sub-module that exposes a 'weight' attribute.
weighted_layers = [layer for _, layer in pyddepth.named_modules() if hasattr(layer, "weight")]
num = sum(torch.sum(layer.weight == 0) for layer in weighted_layers)
den = sum(layer.weight.nelement() for layer in weighted_layers)

print("Global sparsity: {:.2f}%".format(100. * float(num) / float(den)))

In [None]:
# Serialize the whole pruned module object (not just its state_dict) to disk.
# NOTE(review): loading this file later presumably requires the pydnet class
# definitions to be importable — confirm before sharing the checkpoint.
torch.save(pyddepth, "mobile_pydnet_pruned35.pth")



## Optimization for faster inference

### Jit Trace

In [None]:
traced_model = torch.jit.trace(pyddepth, example1)
%timeit traced_model(example1)

output=traced_model(example1)
output1 = output.to('cpu').detach().numpy()
output1 = output1.squeeze()

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

In [None]:
#scripted_model = torch.jit.script(pyddepth, example1)
#%timeit scripted_model(example1)

JIT Traced model is faster than original model. Using JIT traced from now on.

### Pytorch's mobile optimizer:

In [None]:
# Run PyTorch's mobile optimizer on the traced (TorchScript) model.
optimized_traced_model = mobile_optimizer.optimize_for_mobile(traced_model)

# Timing is left disabled; the linked thread is about mobile-optimized models failing on GPU.
#%timeit with torch.no_grad(): optimized_traced_model(example1) # check https://discuss.pytorch.org/t/runtimeerror-mobile-optimized-model-cannot-be-inferenced-on-gpu/94098

### ONNX

In [None]:
# Export the traced model to "mobile_pydnet.onnx".
# Using opset version 11 as the model contains nn.Upsample (which is supported by opset version >=11)
# `output` is the prediction kept by the JIT-trace cell; it is passed as example_outputs.
# NOTE(review): torch.onnx.export writes the file itself; `onnx_model` presumably
# ends up as None — confirm before relying on this variable.
onnx_model = torch.onnx.export(traced_model,               # model being run
                              example1,                         # model input (or a tuple for multiple inputs)
                              "mobile_pydnet.onnx",   # where to save the model (can be a file or file-like object)
                              example_outputs=output,
                              export_params=True,        # store the trained parameter weights inside the model file
                              opset_version=11,          # the ONNX version to export the model to
                              keep_initializers_as_inputs=True,
                              do_constant_folding=True,  # whether to execute constant folding for optimization
                              input_names = ['input'],   # the model's input names
                              output_names = ['output'], # the model's output names
                              dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                            'output' : {0 : 'batch_size'}})

In [None]:
# ONNXRuntime-GPU requires CUDA 10.1
#!pip install onnxruntime
#!conda install -c conda-forge onnx --yes
import onnx
import onnxruntime as ort

In [None]:
# Reload the exported graph from disk as an ONNX model object.
model = onnx.load("mobile_pydnet.onnx")

# Check that the IR is well formed
onnx.checker.check_model(model)

# Print a human readable representation of the graph
#onnx.helper.printable_graph(model.graph)


Model visualization

In [None]:
#!pip install netron

In [None]:
import netron
#netron.start("mobile_pydnet.onnx")

ONNX Optimizer: See [this](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md#optimizing-an-onnx-model)

In [None]:
# NOTE(review): `onnx.optimizer` is presumably only available in older onnx
# releases (later split out into a separate package) — confirm against the
# installed onnx version.
from onnx import optimizer

# Validate the model, strip doc strings in place, then annotate inferred shapes.
onnx.checker.check_model(model)
onnx.helper.strip_doc_string(model)
optimized_model = onnx.shape_inference.infer_shapes(model)

# Explicit list of optimization passes to apply (instead of all available passes).
optimizers_list = ['eliminate_deadend', 'eliminate_nop_dropout',
                                            'eliminate_nop_monotone_argmax', 'eliminate_nop_pad',
                                            'extract_constant_to_initializer', 'eliminate_unused_initializer',
                                            'eliminate_nop_transpose', 
                                            # disable this optimizer until https://github.com/onnx/optimizer/issues/3 gets fixed
                                            # NOTE(review): the line above says to disable the pass, yet
                                            # 'fuse_add_bias_into_conv' is still active in this list — confirm intent.
                                            'fuse_add_bias_into_conv',
                                            'fuse_consecutive_concats',
                                            'fuse_consecutive_log_softmax',
                                            'fuse_consecutive_reduce_unsqueeze', 'fuse_consecutive_squeezes',
                                            'fuse_consecutive_transposes', 'fuse_matmul_add_bias_into_gemm',
                                            'fuse_pad_into_conv', 'fuse_transpose_into_gemm']
# fixed_point=True is passed so the passes are re-applied (presumably until the graph stops changing).
optimized_model = optimizer.optimize(optimized_model, optimizers_list,#optimizer.get_available_passes(),
                                     fixed_point=True)
# Re-validate after optimization and write the optimized graph to its own file.
onnx.checker.check_model(optimized_model)

onnx.save(optimized_model, "optimized_mobile_pydnet.onnx")

In [None]:
import numpy as np
ort_input = example1.to('cpu').detach().numpy().astype(np.float32)

ort_session = ort.InferenceSession('optimized_mobile_pydnet.onnx')

#print("This is on CPU as ONNXRuntime is on CPU")
%timeit outputs = ort_session.run(None,  {ort_session.get_inputs()[0].name: ort_input})[0]

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

In [None]:
## Pytorch to Keras
#!pip install onnx2keras
# from onnx2keras import onnx_to_keras
# k_model = onnx_to_keras(onnx_model=model, input_names=['input'])

### Convert to Tensorflow

In [None]:
# Rebind `model` to the optimized ONNX graph; the TensorFlow conversion below uses it.
model = onnx.load("optimized_mobile_pydnet.onnx")

In [None]:
#!pip install tensorflow --force-reinstall
#!pip install tensorflow-addons
# Install onnx-tensorflow as follows in terminal
#!pip uninstall onnx-tf --yes
#!pip install git+https://github.com/onnx/onnx-tensorflow.git
#!git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow
#!pip install -e .

In [None]:
from onnx_tf.backend import prepare
import onnx
#model = onnx.load("mobile_pydnet_interp.onnx")

# Wrap the ONNX model in an onnx-tf backend representation that can be run and exported.
tf_rep = prepare(model)

# Input nodes to the model
print('inputs:', tf_rep.inputs)

# Output nodes from the model
print('outputs:', tf_rep.outputs)

# All nodes in the model
print('tensor_dict:')
print(tf_rep.tensor_dict)

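TensorFlow natively prefers inputs in the `NHWC` format, whereas our inputs are in the `NCHW` format. The model converted from ONNX keeps the `NCHW` input layout, so the same `NCHW` array (`example1_tf`) is fed in below without transposing.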

In [None]:
# NOTE(review): the message mentions ONNXRuntime, but this cell runs the onnx-tf
# backend — presumably copied from the ONNX Runtime cell; confirm the wording.
print("This is on CPU as ONNXRuntime is on CPU")
%timeit tf_rep.run(example1_tf)

# Run once outside %timeit to keep the prediction; the first returned output is used.
output1 = tf_rep.run(example1_tf)[0].squeeze()
print(output1.shape)

# Input image (left) next to the prediction (right).
fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

# # export tensorFlow backend to tensorflow tf file
# The exported directory is what the SavedModel and TFLite cells load afterwards.
tf_rep.export_graph('mobile_pydnet_pruned35/')

In [None]:
# Inference using Tensorflow # must be v2

# Run inference on SavedModel using Tensorflow
# Loads the directory exported by tf_rep.export_graph in the previous cell.
imported = tf.saved_model.load("mobile_pydnet_pruned35/") # tf.keras.models.load_model('mobile_pydnet_pruned35')
%timeit imported(input=example1_tf)

# The array built inline here is the same conversion that produced example1_tf.
output1 = imported(input=example1.to('cpu').detach().numpy().astype(np.float32))[0].numpy().squeeze()
print(output1.shape)

# Input image (left) next to the prediction (right).
fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

In [None]:
## TensorFlow model optimization
# from tensorflow.python.tools import optimize_for_inference_lib
# input_graph_def = graph_pb2.GraphDef()
# output_graph_def = optimize_for_inference_lib.optimize_for_inference(

#       input_graph_def,
#       FLAGS.input_names.split(","),
#       FLAGS.output_names.split(","), FLAGS.placeholder_type_enum)

### Convert to Tensorflow Lite

In [None]:
# Convert the exported SavedModel directory to a TFLite flatbuffer.
converter = tf.lite.TFLiteConverter.from_saved_model("mobile_pydnet_pruned35")
# Enable the converter's default optimizations.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Allow both TFLite built-in ops and selected TensorFlow ops in the converted model.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                       tf.lite.OpsSet.SELECT_TF_OPS]
tflite_model = converter.convert()

# Write the converted model bytes to disk.
with tf.io.gfile.GFile('mobile_pydnet_pruned35.tflite', 'wb') as f:
    f.write(tflite_model)

In [None]:
# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="mobile_pydnet_pruned35.tflite")
interpreter.allocate_tensors()

# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Test the model on random input data.
input_shape = input_details[0]['shape']
#input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32) # example1.to('cpu').detach().numpy().astype(np.float32)
interpreter.set_tensor(input_details[0]['index'], examplee1_tf)

print(input_shape)
%timeit interpreter.invoke()
print("done")
# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.

output1 = interpreter.get_tensor(output_details[0]['index']).squeeze()
print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

___
### Legacy code

In [None]:
# Convert from pb to SavedModel
# (the SavedModelBuilder steps are commented out; what actually runs is a
# TF1-style session inference on the imported graph)

# NOTE(review): this rebinds the notebook-wide name `tf` to the compat.v1 API and
# disables v2 behavior, so earlier TF2 cells presumably stop working if re-run
# afterwards — confirm.
import tensorflow.compat.v1 as tf 
tf.disable_v2_behavior()
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants
export_dir = 'saved_nopad'
graph_pb = 'mobile_pydnet_pruned35/saved_model.pb'

# builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

# Parse the file as a GraphDef.
# NOTE(review): saved_model.pb is presumably a SavedModel protobuf rather than a
# bare GraphDef — confirm that parsing it this way is valid.
with tf.gfile.GFile(graph_pb, "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

# Only used by the commented-out builder code below.
sigs = {}
with tf.Session(graph=tf.Graph()) as sess:
    # Import the parsed graph without a name prefix and look up the I/O tensors by name.
    tf.import_graph_def(graph_def, name="")
    g = tf.get_default_graph()
    inp = g.get_tensor_by_name('input:0')
    out = g.get_tensor_by_name("output:0")

#     sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
#         tf.compat.v1.saved_model.signature_def_utils.predict_signature_def(
#             {"input": inp}, {"output": out})

#     builder.add_meta_graph_and_variables(sess,
#                                          [tag_constants.SERVING],
#                                          signature_def_map=sigs)
#     builder.save()
    #input_tensor_shape = sess.graph.get_tensor_by_name('input:0').shape.as_list()
    # Overwrites the notebook-level `output` with the result of the session run.
    output = sess.run(out, {"input:0": example1.to('cpu').detach().numpy()}) # --> Inference
    
    g.finalize()


#converter = tf.lite.TFLiteConverter.from_saved_model("saved")
#tflite_model = converter.convert()
    
#     [n.name for n in tf.get_default_graph().as_graph_def().node]

# for i in tf.get_default_graph().get_operations():
#     print(i)
# print(tf.get_default_graph())

In [None]:
# Display `output` — presumably the array produced by the session run in the cell above.
plt.imshow(output.squeeze())

In [None]:
# NOTE(review): `x_test` is not defined anywhere in the visible notebook, and
# `model` was last bound to an object returned by onnx.load — this cell
# presumably fails on a fresh kernel; confirm whether it can be removed.
model.test(data = x_test)

In [None]:
#tf.enable_control_flow_v2()
# Build a TFLite converter from a frozen-graph .pb file, using the tensor names
# 'input' / 'output'.
# NOTE(review): no visible cell writes 'mobile_pydnet_pruned35.pb' — confirm the file exists.
converter = tf.lite.TFLiteConverter.from_frozen_graph('mobile_pydnet_pruned35.pb', #TensorFlow freezegraph .pb model file
                                                      input_arrays=['input'], # name of input arrays as defined in torch.onnx.export function before.
                                                      output_arrays=['output']  # name of output arrays defined in torch.onnx.export function before.
                                                      )
#converter.experimental_new_converter = True
#converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]

# tell converter which type of optimization techniques to use
# converter.optimizations = [tf.lite.Optimize.DEFAULT]
# to view the best option for optimization read documentation of tflite about optimization
# go to this link https://www.tensorflow.org/lite/guide/get_started#4_optimize_your_model_optional

# convert the model 
tf_lite_model = converter.convert()

In [None]:
# Load a SavedModel from the current working directory and call it with a scalar constant.
# NOTE(review): the scalar input looks like a leftover from an unrelated example —
# confirm it matches this model's expected input.
func = tf.saved_model.load('.')
#.signatures["serving_default"] 
out = func( tf.constant(10,tf.float32) )

In [None]:
# Parse the frozen graph file into a GraphDef.
with tf.compat.v1.gfile.GFile('mobile_pydnet_pruned35.pb', "rb") as f:
    graph_def = tf.compat.v1.GraphDef()
    graph_def.ParseFromString(f.read())
# Import the GraphDef into a fresh graph and dump it as text to tmp/hashtable.pbtxt.
with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name='')
    tf.io.write_graph(graph_def, 'tmp/', 'hashtable.pbtxt')
    
#data_input = tf.placeholder(name='input', dtype=tf.float32, shape=[None, 192, 640, 3])
# NOTE(review): these lookups run outside the `with ... as graph` block, so
# tf.get_default_graph() is presumably not the graph imported above — confirm.
inpu = tf.get_default_graph().get_tensor_by_name("input:0")
# NOTE(review): "embeddings:0" and "phase_train:0" are not names used anywhere else
# in this notebook — presumably copied from a different model's example; confirm.
emb = tf.get_default_graph().get_tensor_by_name("embeddings:0")
phase = tf.get_default_graph().get_tensor_by_name("phase_train:0")
# NOTE(review): `sess` is not created in this cell; the only visible session was
# closed by its `with` block in an earlier cell. The output path uses
# Windows-style separators.
tf.saved_model.simple_save(sess,"..\\teste_model_2\\",inputs={"input":inpu,"phase":phase},outputs={"output":emb})

In [None]:
# Load a SavedModel from the current directory and fetch its default serving signature.
model = tf.saved_model.load(".")
concrete_func = model.signatures[
  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
# Pin the first input to a fixed shape — presumably (batch, height, width, channels); confirm.
concrete_func.inputs[0].set_shape([1, 192, 640, 3])
# NOTE(review): bare `TFLiteConverter` is not imported anywhere in the visible
# notebook — presumably tf.lite.TFLiteConverter was intended; confirm.
converter = TFLiteConverter.from_concrete_functions([concrete_func])

In [None]:
# Display whatever `model` is currently bound to (the name is reused by several cells).
model

In [None]:
#graph_def = tf.get_default_graph().as_graph_def()

# NOTE(review): `int64_to_string_table` and `string_to_int64_table` are not defined
# anywhere in the visible notebook — this cell presumably comes from a lookup-table
# example unrelated to the depth model; confirm whether it can be removed.
with tf.control_dependencies([tf.compat.v1.initializers.tables_initializer()]):
      input_int64_tensor = tf.compat.v1.placeholder(tf.int64, shape=[1])
      input_string_tensor = tf.compat.v1.placeholder(tf.string, shape=[1])
      out_string_tensor = int64_to_string_table.lookup(input_int64_tensor)
      out_int64_tensor = string_to_int64_table.lookup(input_string_tensor)

# `graph` is the object bound by the `with tf.Graph().as_default() as graph` cell above.
converter = tf.lite.TFLiteConverter(graph,
                                  [input_int64_tensor, input_string_tensor],
                                  [out_string_tensor, out_int64_tensor])

# Restrict to TFLite built-in ops but allow custom ops to pass through conversion.
supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
converter.target_spec.supported_ops = supported_ops
converter.allow_custom_ops = True
tflite_model = converter.convert()

In [None]:

# Get frozen ConcreteFunction
# NOTE(review): `convert_variables_to_constants_v2` is not imported anywhere in the
# visible notebook, and `graph` is a tf.Graph from an earlier cell rather than a
# ConcreteFunction — confirm this cell is still meant to run.
frozen_func = convert_variables_to_constants_v2(graph)
frozen_func.graph.as_graph_def()

In [None]:
# Same frozen-graph conversion as the earlier legacy cell, but through the explicit
# compat.v1 converter and with the default optimizations enabled.
# NOTE(review): no visible cell writes 'mobile_pydnet_pruned35.pb' — confirm the file exists.
converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph('mobile_pydnet_pruned35.pb', #TensorFlow freezegraph .pb model file
                                                      input_arrays=['input'], # name of input arrays as defined in torch.onnx.export function before.
                                                      output_arrays=['output']  # name of output arrays defined in torch.onnx.export function before.
                                                      )
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

In [None]:
# NOTE(review): `os`, `cwd` and bare `TFLiteConverter` are not defined or imported
# anywhere in the visible notebook, and tf.saved_model.load is given a .pb file path
# here — presumably it expects a SavedModel directory; confirm.
model = tf.saved_model.load(os.path.join(cwd,"mobile_pydnet_pruned35.pb"))
concrete_func = model.signatures[
  tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
# Pin the first input to a fixed shape — presumably (batch, height, width, channels); confirm.
concrete_func.inputs[0].set_shape([1, 192, 640, 3])
converter = TFLiteConverter.from_concrete_functions([concrete_func])

In [None]:
# Convert the model to Tensorflow Lite.
# NOTE(review): from_saved_model is given a '.pb' file name here, whereas the working
# cell earlier passes the exported directory "mobile_pydnet_pruned35" — presumably a
# directory is required; confirm.
converter = tf.lite.TFLiteConverter.from_saved_model('mobile_pydnet_pruned35.pb')
tflite_model = converter.convert()