# Inference of Pydnet-Pytorch pretrained model on an image

Run inference on an image, display the output, and measure the runtime on a local machine: PyTorch 1.6, CUDA 10.2, SuperServer with an RTX 2080 Ti GPU.

Also optimize the model for faster inference using 

- [x] JIT from Pytorch
- [ ] JIT from NVIDIA (TRTorch)
- [ ] PyTorch mobile optimizer  --> Gives an error. Try blacklisting each optimization in turn and check whether the error disappears
- [ ] ONNX and its optimizer
- [ ] ONNX runtime
- [ ] TensorRT
- [ ] TVM
- [ ] Convert the ONNX model to TensorFlow and compare runtime. The upsample layers must use align_corners=True for this (requires retraining)

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torchvision.transforms.functional as TF
from pydnet import PyddepthInference, Pydnet
import torch.utils.mobile_optimizer as mobile_optimizer

# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

Load input data

In [None]:
# Load the test image and resize it to 640x192 (PIL's resize takes (width, height)).
img1 = Image.open('test/1.png')
# Image.ANTIALIAS was only an alias of Image.LANCZOS and is no longer available in
# recent Pillow releases; LANCZOS selects the same resampling filter on old and new versions.
img1 = img1.resize((640, 192), Image.LANCZOS)
# to_tensor gives a float tensor; unsqueeze_ adds a leading batch dimension in place.
example1 = TF.to_tensor(img1).unsqueeze_(0).to(device)

print(example1.shape)
plt.imshow(img1);

Load models

In [None]:
# bla=Pydnet(mobile_version=True, my_version=False)
# loaded_dict = torch.utils.model_zoo.load_url("https://github.com/zshn25/Pydnet-Pytorch/blob/forMonodepth2/mobile_pydnet.pth", 
#                                              map_location= lambda storage, loc: storage)

#bla.load_state_dict(loaded_dict)


In [None]:
# Build the network without pretrained weights; they are loaded from a local checkpoint below.
pyddepth = PyddepthInference(mobile_version=True, my_version=False, pretrained=False)

loaded_dict_enc = torch.load("mobile_pydnet.pth", map_location=device)

# pyddepth = PyddepthInference(mobile_version=True, my_version=True, pretrained=False)

# loaded_dict_enc = torch.load("my_mobile_pydnet.pth", map_location=device)

# Drop the "module." fragment from the checkpoint keys
# (presumably left over from a DataParallel wrapper - TODO confirm).
new_dict_enc = {k.replace("module.", ""): v for k, v in loaded_dict_enc.items()}

# strict=False silently skips keys that do not match, so report them explicitly:
# otherwise a badly matching checkpoint would leave layers randomly initialised unnoticed.
load_result = pyddepth.load_state_dict(new_dict_enc, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print("Missing keys:", load_result.missing_keys)
    print("Unexpected keys:", load_result.unexpected_keys)

pyddepth.to(device)
pyddepth.eval();

In [None]:
%timeit with torch.no_grad(): pyddepth(example1)

output=pyddepth(example1)
output1 = output.to('cpu').detach().numpy()
output1 = output1.squeeze()

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

Model pruning

In [None]:
import torch.nn.utils.prune as prune

# Collect parameters to prune: one (module, "weight") pair per sub-module that really
# owns a weight tensor. hasattr() alone is also true for modules whose weight is
# registered as None, and such entries cannot be pruned.
parameters_to_prune = tuple(
    (module, "weight")
    for module in pyddepth.modules()
    if getattr(module, "weight", None) is not None
)

In [None]:
# Prune 35% of all collected weights, ranked globally by L1 magnitude.
prune.global_unstructured(
    parameters=parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.35,
)

# Make the pruning permanent on every module that was pruned.
for pruned_module, _param_name in parameters_to_prune:
    prune.remove(pruned_module, name="weight")

In [None]:
%timeit with torch.no_grad(): pyddepth(example1)

output=pyddepth(example1)
output1 = output.to('cpu').detach().numpy()
output1 = output1.squeeze()

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

In [None]:
# Serialise the whole pruned module object (not only its state_dict) to disk.
torch.save(obj=pyddepth, f="mobile_pydnet_pruned35.pth")



## Optimization for faster inference

### Jit Trace

In [None]:
traced_model = torch.jit.trace(pyddepth, example1)
%timeit traced_model(example1)

output=traced_model(example1)
output1 = output.to('cpu').detach().numpy()
output1 = output1.squeeze()

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

In [None]:
#scripted_model = torch.jit.script(pyddepth, example1)
#%timeit scripted_model(example1)

JIT Traced model is faster than original model. Using JIT traced from now on.

### Pytorch's mobile optimizer:

In [None]:
# Run PyTorch's mobile optimisation passes on the traced model.
# The timing line below is left disabled: going by the linked forum thread, a
# mobile-optimised model cannot be run for inference on the GPU.
optimized_traced_model = mobile_optimizer.optimize_for_mobile(traced_model)

#%timeit with torch.no_grad(): optimized_traced_model(example1) # check https://discuss.pytorch.org/t/runtimeerror-mobile-optimized-model-cannot-be-inferenced-on-gpu/94098

ONNX

In [None]:
# Export the traced model to ONNX.
# Opset 11 is used because the model contains nn.Upsample (supported by opset version >= 11).
# NOTE(review): export() writes the file as a side effect; confirm whether its return
# value (kept in `onnx_model`) is meaningful for the installed PyTorch version.
onnx_model = torch.onnx.export(
    traced_model,                      # model being run
    example1,                          # model input (or a tuple for multiple inputs)
    "mobile_pydnet.onnx",              # where to save the model (file or file-like object)
    example_outputs=output,
    export_params=True,                # store the trained parameter weights inside the model file
    opset_version=11,                  # the ONNX version to export the model to
    keep_initializers_as_inputs=True,
    do_constant_folding=True,          # whether to execute constant folding for optimization
    input_names=['input'],             # the model's input names
    output_names=['output'],           # the model's output names
    dynamic_axes={                     # variable length axes
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
)

In [None]:
# Installing onnxruntime without GPU as it requires CUDA 10.1 but 10.2 is installed
!pip install onnxruntime onnx --upgrade
#!conda install -c conda-forge onnx --yes
import onnx
import onnxruntime as ort

In [None]:
# Reload the exported graph from disk so the checks below run on the serialised file.
model = onnx.load("mobile_pydnet.onnx")

# Check that the IR is well formed
onnx.checker.check_model(model)

# Print a human readable representation of the graph
#onnx.helper.printable_graph(model.graph)


Model visualization

In [None]:
#!pip install netron

In [None]:
import netron
#netron.start("mobile_pydnet.onnx")

ONNX Optimizer: See [this](https://github.com/onnx/onnx/blob/master/docs/PythonAPIOverview.md#optimizing-an-onnx-model)

In [None]:
from onnx import optimizer

# Validate the loaded graph, strip doc strings (in place) and annotate tensor shapes
# before handing the model to the optimizer.
onnx.checker.check_model(model)
onnx.helper.strip_doc_string(model)
optimized_model = onnx.shape_inference.infer_shapes(model)

# Optimisation passes to apply, one per line.
optimizers_list = [
    'eliminate_deadend',
    'eliminate_nop_dropout',
    'eliminate_nop_monotone_argmax',
    'eliminate_nop_pad',
    'extract_constant_to_initializer',
    'eliminate_unused_initializer',
    'eliminate_nop_transpose',
    'fuse_add_bias_into_conv',
    'fuse_consecutive_concats',
    'eliminate_identity',
    'fuse_consecutive_log_softmax',
    'fuse_consecutive_reduce_unsqueeze',
    'fuse_consecutive_squeezes',
    'fuse_consecutive_transposes',
    'fuse_matmul_add_bias_into_gemm',
    'fuse_pad_into_conv',
    'fuse_transpose_into_gemm',
    'fuse_bn_into_conv',
]

# Alternative pass list: optimizer.get_available_passes()
optimized_model = optimizer.optimize(
    optimized_model,
    optimizers_list,
    fixed_point=True,
)
onnx.checker.check_model(optimized_model)

onnx.save(optimized_model, "optimized_mobile_pydnet.onnx")

In [None]:
import numpy as np
ort_input = example1.to('cpu').detach().numpy().astype(np.float32)

ort_session = ort.InferenceSession('optimized_mobile_pydnet.onnx')

print("This is on CPU as ONNXRuntime is on CPU")
%timeit outputs = ort_session.run(None,  {ort_session.get_inputs()[0].name: ort_input})[0]

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);

### Convert to Tensorflow

In [None]:
#!pip install tensorflow 
#!pip install tensorflow-addons
# Install onnx-tensorflow as follows in terminal
#!git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow
#!pip install -e .

In [None]:
from onnx_tf.backend import prepare
import onnx

# NOTE(review): "mobile_pydnet_interp.onnx" is not written by any cell visible in this
# notebook - confirm where this file is exported before running the cell.
model = onnx.load("mobile_pydnet_interp.onnx")

# Build the TensorFlow representation of the ONNX graph.
tf_rep = prepare(model)

# Report the input nodes, the output nodes and finally every node of the model.
print('inputs:', tf_rep.inputs)
print('outputs:', tf_rep.outputs)
print('tensor_dict:', tf_rep.tensor_dict, sep='\n')

In [None]:
print("This is on CPU as ONNXRuntime is on CPU")
%timeit output1 = tf_rep.run(example1.to('cpu').detach().numpy().astype(np.float32))

print(output1.shape)

fig, axes = plt.subplots(1,2, figsize=(20,3))
axes[0].imshow(img1)
depthmap=axes[1].imshow(output1)
fig.colorbar(depthmap);


In [None]:
# Export the TensorFlow representation (prepared from the ONNX model above) to 'mobile_pydnet.pb'.
tf_rep.export_graph('mobile_pydnet.pb')