In [1]:
import tensorrt as trt
trt.__version__

'7.1.2.8'

In [None]:
import os
os.makedirs('./model_repository/custom_plan/1/',exist_ok=True)

In [2]:
!pip install -U tf2onnx

Collecting tf2onnx
  Downloading tf2onnx-1.6.3-py3-none-any.whl (185 kB)
[K     |████████████████████████████████| 185 kB 19.3 MB/s eta 0:00:01
[?25hCollecting onnx>=1.4.1
  Downloading onnx-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 19.5 MB/s eta 0:00:01
Installing collected packages: onnx, tf2onnx
Successfully installed onnx-1.7.0 tf2onnx-1.6.3
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!python -m tf2onnx.convert --saved-model ./model_ckpt --output model.onnx

2020-08-24 16:02:04.625489: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
2020-08-24 16:02:05.975260: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcuda.so.1'; dlerror: /usr/lib/x86_64-linux-gnu/libcuda.so.1: file too short; LD_LIBRARY_PATH: /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2020-08-24 16:02:05.975283: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: UNKNOWN ERROR (303)
2020-08-24 16:02:05.975348: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
2020-08-24 16:02:06.009264: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2194990000 Hz
2020-08-24 16:02:06.015654: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f8a84000b20 initialized for platform Host (

In [4]:
%%writefile engine.py 
import tensorrt as trt
import pycuda.autoinit

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)
def build_engine(onnx_path, shape = [1,224,224,3]):

    """
    This is the function to create the TensorRT engine
    Args:
      onnx_path : Path to onnx_file. 
      shape : Shape of the input of the ONNX file. 
    """
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = (1 << 30)
        with open(onnx_path, 'rb') as model:
            parser.parse(model.read())
        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine

def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)
def load_engine(trt_runtime, engine_path):
    with open(engine_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

Overwriting engine.py


In [3]:
import engine as eng
import argparse
from onnx import ModelProto 
import tensorrt as trt

engine_name = "model.plan"
onnx_path = "model.onnx"
batch_size = 1 

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

model = ModelProto()
with open(onnx_path, "rb") as f:
    model.ParseFromString(f.read())

d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
shape = [batch_size , d0, d1 ,d2]
engine = eng.build_engine(onnx_path, shape= shape)
eng.save_engine(engine, engine_name)

In [4]:
%%writefile inference.py
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda

def allocate_buffers(engine, batch_size, data_type):

    """
    This is the function to allocate buffers for input and output in the device
    Args:
      engine : The path to the TensorRT engine. 
      batch_size : The batch size for execution time.
      data_type: The type of the data for input and output, for example trt.float32. 

    Output:
      h_input_1: Input in the host.
      d_input_1: Input in the device. 
      h_output_1: Output in the host. 
      d_output_1: Output in the device. 
      stream: CUDA stream.

    """

    # Determine dimensions and create page-locked memory buffers (which won't be swapped to disk) to hold host inputs/outputs.
    h_input_1 = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(data_type))
    h_output = cuda.pagelocked_empty(batch_size * trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(data_type))
    # Allocate device memory for inputs and outputs.
    d_input_1 = cuda.mem_alloc(h_input_1.nbytes)

    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input_1, d_input_1, h_output, d_output, stream 

def load_images_to_buffer(pics, pagelocked_buffer):

    preprocessed = np.asarray(pics).ravel()
    np.copyto(pagelocked_buffer, preprocessed)


def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width):

    """
    This is the function to run the inference
    Args:
      engine : Path to the TensorRT engine. 
      pics_1 : Input images to the model.  
      h_input_1: Input in the host. 
      d_input_1: Input in the device. 
      h_output_1: Output in the host. 
      d_output_1: Output in the device. 
      stream: CUDA stream.
      batch_size : Batch size for execution time.
      height: Height of the output image.
      width: Width of the output image.

    Output:
      The list of output images.

    """

    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)

        # Run inference.

        context.profiler = trt.Profiler()
        context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)])

        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # Return the host output.
        out = h_output.reshape((batch_size,1))
        return out

Overwriting inference.py


In [5]:
import engine as eng
import inference as inf
import skimage
from PIL import Image
import numpy as np
import pycuda
import pycuda.autoinit
## construct pre_processing for testing model prediction 
def inference_processing(img_path):
    img = Image.open(img_path)
    img = img.convert('RGB')
    img = img.resize((224, 224), Image.BILINEAR)
    img = np.array(img)

    img=img/255
    return img.reshape(1,224,224,3)
def get_prediction(pred):
    
    pred=np.argmax(pred)
    return np.int32(pred)
outputlayer_name = "outputs"
input_file_path = "parkinson.png"
onnx_file = "model.onnx"
serialized_plan_fp32 = "model.plan"
CHANNEL = 3
HEIGHT = 224
WIDTH = 224


im = inference_processing(input_file_path)

engine = eng.load_engine(trt_runtime, serialized_plan_fp32)
h_input, d_input, h_output, d_output, stream = inf.allocate_buffers(engine, 1, trt.float32)
out = inf.do_inference(engine, im, h_input, d_input, h_output, d_output, stream, 1, HEIGHT, WIDTH)
print(out.shape), get_prediction(out)

(1, 1)


(None, 0)

In [None]:
%%writefile ./model_repository/custom_plan/config.pbtxt
name: "custom_plan"
platform: "tensorrt_plan"
max_batch_size : 1
input [
  {
    name: "hand_drawing:0"
    data_type: TYPE_FP32
    format: FORMAT_NHWC
    dims: [ 224,224 ,3]
    reshape {shape : [1,224,224,3]}
  }
]
output [
  {
    name: "Identity:0"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape{shape:[1,1]}
    label_filename: "labels.txt"
  }
]
instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]
dynamic_batching {
  preferred_batch_size: [ 1 ]
}

In [None]:
%%writefile ./model_repository/custom_plan/labels.txt 
healthy
parkinson

In [None]:
!cp model.plan ./model_repository/custom_plan/1/

In [7]:
import os
os.listdir('./model_repository/custom_plan/1/')

['model.plan']