```
The MIT License (MIT)

Copyright (c) 2020 NVIDIA

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

```

# Interacting with TRITON Inference Server

Walkthrough of a generic pipeline for:
- Using client libraries to run inference on models on the TRITON server
- Modifying model configs

#### Environment
**1 - Client**
All steps executed using **NGC PyTorch Docker (v 20.06)**
* TRITON Client libraries installed within container

**2 - Server**
using **NGC tritonserver Docker (20.06-v1)**

![TRITON Workflow](./NB_images/TRITON_image.png)

--------------------------------------------------------------------------------------------------------------------------------------------------------------
### ----------    Triton serving for our **first model** which we trained with 1a_triton_server_own_model.ipynb    ----------

In [5]:
import argparse
import numpy as np
import os
import json
from builtins import range
import tensorrtserver.api.model_config_pb2 as model_config
from tensorrtserver.api import *
import os
# The TensorRT plan directory is not created here; a later cell creates ./model_repo/covid_plan/1/.
#os.makedirs('./model_repo/covid_plan/1/',exist_ok=True)
# Create the version "1" directory for the ONNX model inside the local model repository
# (exist_ok=True makes the cell safe to re-run).
os.makedirs('./model_repo/covid_onnx/1/',exist_ok=True)

In [2]:
%%writefile engine.py 
import tensorrt as trt
import pycuda.autoinit

# Module-level TensorRT logger (WARNING severity); build_engine below uses it for the builder and parser.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Ready-made runtime bound to that logger; load_engine takes a runtime as its first argument.
trt_runtime = trt.Runtime(TRT_LOGGER)
def build_engine(onnx_path, shape = [1,1,32000]):

    """
    Build a TensorRT engine from an ONNX file.

    Args:
      onnx_path : Path to the ONNX file.
      shape : Shape assigned to the first network input before building.
              The default list is only read, never mutated.

    Returns:
      The object returned by ``builder.build_cuda_engine`` for the parsed network.

    Raises:
      RuntimeError : If the ONNX parser rejects the model. The message lists
                     every error reported by the parser.
    """
    # create_network(1): flag value kept from the original code — presumably the
    # EXPLICIT_BATCH bit; TODO confirm against the TensorRT version in use.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = (256 << 20)  # 256 MiB
        with open(onnx_path, 'rb') as model:
            parsed_ok = parser.parse(model.read())

        # Fail loudly here: continuing with an unparsed network only produces a
        # confusing error on the next line (no input 0 to assign a shape to).
        if not parsed_ok:
            parser_errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError(
                "failed to parse ONNX file '{}': {}".format(
                    onnx_path, "; ".join(parser_errors) or "no details reported by the parser"))

        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine

def save_engine(engine, file_name):
    """Serialize ``engine`` and write the resulting bytes to ``file_name``."""
    # Serialize before opening the file so a failing serialize() leaves no empty file behind.
    serialized = engine.serialize()
    with open(file_name, 'wb') as plan_file:
        plan_file.write(serialized)
def load_engine(trt_runtime, engine_path):
    """Read the file at ``engine_path`` and return ``trt_runtime.deserialize_cuda_engine`` applied to its bytes."""
    with open(engine_path, 'rb') as plan_file:
        serialized = plan_file.read()
    return trt_runtime.deserialize_cuda_engine(serialized)

def inspect_engine(engine):
    """
    Pretty-print the metadata of every binding of ``engine``, grouped by
    optimization profile.

    Bindings are split evenly across profiles (``num_bindings`` divided by
    ``num_optimization_profiles``). For each binding the index, shape, dtype
    and name are recorded; input bindings additionally carry the profile
    shape. Nothing is returned — the report is only printed.
    """
    from pprint import pprint

    profile_count = engine.num_optimization_profiles
    bindings_per_profile = engine.num_bindings // profile_count

    report = {}
    for profile_idx in range(profile_count):
        first = profile_idx * bindings_per_profile
        bindings = {}
        for idx in range(first, first + bindings_per_profile):
            entry = {
                "profile": profile_idx,
                "binding_index": idx,
                "binding_shape": engine.get_binding_shape(idx),
                "binding_dtype": engine.get_binding_dtype(idx),
                "binding_name": engine.get_binding_name(idx),
            }
            # Only inputs have a profile shape to report.
            if engine.binding_is_input(idx):
                entry["binding_type"] = "INPUT"
                entry["profile_shape"] = engine.get_profile_shape(profile_idx, idx)
            else:
                entry["binding_type"] = "OUTPUT"
            bindings["Binding {}".format(idx)] = entry

        report["Profile {}".format(profile_idx)] = bindings

    pprint(report)

Overwriting engine.py


In [8]:
# Show what is available in ./saved_model; the ONNX file read by the conversion cell lives here.
os.listdir('./saved_model')

['model.plan',
 'covid.engine',
 'relabelled_covid.onnx',
 'convid.pt',
 'covid_fp16.engine',
 'relabelled_convid.pt',
 '.ipynb_checkpoints']

In [2]:
import engine as eng
import argparse
from onnx import ModelProto 
import tensorrt as trt

# Where the serialized TensorRT plan is written, and the ONNX model it is built from.
engine_name = "./saved_model/model.plan"
onnx_path = "./saved_model/relabelled_covid.onnx"
batch_size = 1

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

# Parse the ONNX file only to read the declared shape of its first graph input.
model = ModelProto()
with open(onnx_path, "rb") as f:
    model.ParseFromString(f.read())

# Dimension 0 of the ONNX input is replaced by batch_size; dimensions 1 and 2 are taken as declared.
input_dims = model.graph.input[0].type.tensor_type.shape.dim
d0 = input_dims[1].dim_value
d1 = input_dims[2].dim_value

# A dimension that is symbolic or unset in the ONNX file reads back as 0; building an
# engine with such a shape cannot be what is wanted, so stop with a clear message.
if d0 <= 0 or d1 <= 0:
    raise ValueError(
        "ONNX input of '{}' has non-static dimensions ({}, {}); "
        "a fixed shape is required to build the engine".format(onnx_path, d0, d1))

shape = [batch_size , d0, d1]
engine = eng.build_engine(onnx_path, shape= shape)

# Guard against a failed build handing back None, which would otherwise surface
# as an AttributeError inside save_engine.
if engine is None:
    raise RuntimeError("TensorRT did not produce an engine for '{}'".format(onnx_path))
eng.save_engine(engine, engine_name)

In [7]:
import os
# Version directory that will receive the TensorRT plan; makedirs also creates
# the parent ./model_repo/covid_plan/ folder when it is missing.
plan_version_dir = './model_repo/covid_plan/1/'
os.makedirs(plan_version_dir, exist_ok=True)
os.listdir(plan_version_dir)

[]

In [9]:
# Move the freshly built plan into the version directory of the model repository.
# Because this is a move, re-running the cell fails until the plan has been rebuilt in ./saved_model.
!mv ./saved_model/model.plan ./model_repo/covid_plan/1/model.plan

In [22]:
# Lists ./empty_dir/covid_plan/ — the destination of the `cp -R` cell further down — not ./model_repo/.
!ls ./empty_dir/covid_plan/

1  config.pbtxt  label.txt  labels.txt


In [10]:
%%writefile ./model_repo/covid_plan/label.txt 
healthy
unknow_rep_illness
covid_positive

Writing ./model_repo/covid_plan/label.txt


In [11]:
%%writefile ./model_repo/covid_plan/config.pbtxt
# Triton model configuration for the TensorRT plan "covid_plan".
name: "covid_plan"
platform: "tensorrt_plan"
max_batch_size : 1
input [
  {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    # Same 1 x 1 x 32000 shape as the .npy samples loaded later in the notebook.
    dims: [ 1,1,32000 ]    
  }
]
output [
  {
    name: "output"
    data_type: TYPE_FP32
    # Three scores, one per line of the label file.
    dims: [ 1,1,3 ]
    # Label file written by the notebook next to this config.
    label_filename: "label.txt"
  }
]
instance_group [
  {
    # Two model instances of kind GPU.
    count: 2
    kind: KIND_GPU
  }
]
dynamic_batching {
  preferred_batch_size: [ 1 ]
}



Writing ./model_repo/covid_plan/config.pbtxt


In [24]:
# Print the config file currently on disk to verify what the server will read.
!cat './model_repo/covid_plan/config.pbtxt'

name: "covid_plan"
platform: "tensorrt_plan"
max_batch_size : 1
input [
  {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 1,1,32000 ]    
  }
]
output [
  {
    name: "output"
    data_type: TYPE_FP32
    dims: [ 1,1,3 ]
    label_filename: "labels.txt"
  }
]
instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]
dynamic_batching {
  preferred_batch_size: [ 1 ]
}



In [12]:
# Copy the *contents* of the model directory (note the trailing "/.") into a destination
# that is created first. Re-running the cell then refreshes ./empty_dir/covid_plan/ instead
# of nesting a second covid_plan/ folder inside it, and it also works when ./empty_dir
# does not exist yet.
!mkdir -p ./empty_dir/covid_plan/ && cp -R ./model_repo/covid_plan/. ./empty_dir/covid_plan/

## Check status of Inference Server/Specific Model

In [None]:
# !curl localhost:8000/api/status

In [13]:
# Server endpoint and protocol shared by every client context created in this notebook.
url = 'localhost:8000'
protocol = ProtocolType.HTTP

# Query the server's liveness and readiness flags.
health_ctx = ServerHealthContext(url, protocol, verbose=True)
print(f"Live: {health_ctx.is_live()}")
print(f"Ready: {health_ctx.is_ready()}")

Live: True
Ready: True


In [14]:
# Model whose status is requested; this name is also what the parse_model call below receives.
model_name = "covid_onnx"
# Build a status context restricted to that model and dump the server's reply.
status_ctx = ServerStatusContext(url, protocol, model_name, verbose=True)
print(f"Status for model {model_name}")
print(status_ctx.get_server_status())

Status for model covid_onnx
id: "inference:0"
version: "1.14.0"
uptime_ns: 301180952943
model_status {
  key: "covid_onnx"
  value {
    config {
      name: "covid_onnx"
      platform: "onnxruntime_onnx"
      version_policy {
        latest {
          num_versions: 1
        }
      }
      input {
        name: "input"
        data_type: TYPE_FP32
        format: FORMAT_NHWC
        dims: 1
        dims: 1
        dims: 32000
      }
      output {
        name: "output"
        data_type: TYPE_FP32
        dims: 1
        dims: 1
        dims: 3
        label_filename: "label.txt"
      }
      instance_group {
        name: "covid_onnx_0"
        count: 2
        gpus: 0
        kind: KIND_GPU
      }
      default_model_filename: "model.onnx"
      optimization {
        input_pinned_memory {
          enable: true
        }
        output_pinned_memory {
          enable: true
        }
      }
    }
    version_status {
      key: 1
      value {
        ready_state: MODEL_RE

In [15]:
# Fetch the server status once more and keep the entry belonging to `model_name`.
ctx = ServerStatusContext(url, protocol, model_name, True)
server_status = ctx.get_server_status()

if model_name in server_status.model_status:
    status = server_status.model_status[model_name]
    config = status.config
else:
    raise Exception("unable to get status for '" + model_name + "'")

In [16]:
def parse_model(url, protocol, model_name, batch_size, verbose=False):
    """
    Fetch the server-side configuration of ``model_name`` and check that it
    fits what the client code in this notebook expects.

    The model must have exactly one input and one output, must accept the
    requested ``batch_size``, and its input must declare three dimensions.

    Returns:
        Tuple ``(input_name, output_name, c, h, w, input_format, input_data_type)``.

    Raises:
        Exception: when the server reports no status for the model or when
            any of the checks above fails.
    """
    status_ctx = ServerStatusContext(url, protocol, model_name, verbose)
    model_statuses = status_ctx.get_server_status().model_status

    if model_name not in model_statuses:
        raise Exception("unable to get status for '" + model_name + "'")

    cfg = model_statuses[model_name].config

    n_inputs = len(cfg.input)
    if n_inputs != 1:
        raise Exception("expecting 1 input, got {}".format(n_inputs))
    n_outputs = len(cfg.output)
    if n_outputs != 1:
        raise Exception("expecting 1 output, got {}".format(n_outputs))

    model_input, model_output = cfg.input[0], cfg.output[0]

    # A maximum batch size of 0 means the model does not batch: inputs carry
    # no "N" dimension and only one instance may be sent per request.
    batch_limit = cfg.max_batch_size
    if batch_limit == 0 and batch_size != 1:
        raise Exception("batching not supported for model '" + model_name + "'")
    if batch_limit != 0 and batch_size > batch_limit:
        raise Exception("expecting batch size <= {} for model {}".format(batch_limit, model_name))

    # The input must have 3 dims, laid out either as CHW or as HWC.
    dims = model_input.dims
    if len(dims) != 3:
        raise Exception(
            "expecting input to have 3 dimensions, model '{}' input has {}".format(
                model_name, len(dims)))

    if model_input.format == model_config.ModelInput.FORMAT_NHWC:
        h, w, c = dims[0], dims[1], dims[2]
    else:
        c, h, w = dims[0], dims[1], dims[2]

    return (model_input.name, model_output.name, c, h, w, model_input.format, model_input.data_type)

In [17]:
# -1 is the model version handed to every InferContext created below.
model_version = -1
batch_size = 1

# When cells run top to bottom, model_name is still "covid_onnx" here; the input/output
# names obtained from it are reused for the "covid_plan" requests further down.
# NOTE(review): the target `format` shadows the Python builtin of the same name for the rest of the notebook.
input_name, output_name, c, h, w, format, dtype = parse_model(url, protocol, model_name, batch_size, verbose=True)

## Setup Data for inference

In [18]:
# Class index -> label name; trt_prediction looks predictions up in this mapping.
shirnked_labels = dict(enumerate(('healthy', 'unknow_rep_illness', 'covid_positive')))

# Load the two example arrays used for the inference requests below.
covid_np = np.load('covid.npy')
healthy_np = np.load('healthy.npy')
(covid_np.shape, healthy_np.shape)

((1, 1, 32000), (1, 1, 32000))

## Setup inference on images

In [19]:
def triton_inferer(ctx, input_name, output_name, batch):
    """
    Run one inference request through ``ctx`` and return the raw result
    stored under ``output_name``.

    ``batch`` is split along its first axis; each slice is cast to float32
    and the list of slices is sent as a single request whose batch size is
    the number of slices.
    """
    samples = [batch[idx].astype(np.float32) for idx in range(batch.shape[0])]
    results = ctx.run(
        inputs={input_name: samples},
        outputs={output_name: (InferContext.ResultFormat.RAW)},
        batch_size=len(samples),
    )
    return results[output_name]

def trt_prediction(outputs, labels=None):
    """
    Turn the raw result of ``triton_inferer`` into a label name.

    Args:
        outputs: Sequence whose first element holds the scores of the first
            request item; ``outputs[0][0]`` is reduced with argmax over its
            last axis and the first resulting index is used.
        labels: Optional mapping from class index to label name. When omitted
            the notebook-level ``shirnked_labels`` mapping is used, which is
            the original behaviour.

    Returns:
        The label associated with the highest score.
    """
    if labels is None:
        # Resolved at call time so the function still follows the notebook global.
        labels = shirnked_labels
    best = np.argmax(outputs[0][0], axis=-1)
    # argmax yields a bare scalar when the scores are 1-D; atleast_1d lets both
    # the (1, 3) and the (3,) layouts share the same indexing.
    return labels[int(np.atleast_1d(best)[0])]

In [20]:
# Send the COVID sample to the TensorRT plan model; input_name/output_name come from the earlier parse_model call.
model_name = "covid_plan"
model_version = -1
ctx = InferContext(url, protocol, model_name, model_version, verbose=False)
# The leading axis of size 1 is the one triton_inferer splits on, so a single 1 x 1 x 32000 item is sent.
out = triton_inferer(ctx, input_name, output_name, covid_np.reshape(1,1,1,32000))
print(out)
# `out` is rebound from the raw scores to the predicted label name.
out=trt_prediction(out)
out

[array([[[-8.9344082e+00, -7.0125093e+00, -1.0328293e-03]]], dtype=float32)]


'covid_positive'

In [21]:
# Send the healthy sample to the TensorRT plan model; input_name/output_name come from the earlier parse_model call.
model_name = "covid_plan"
model_version = -1
ctx = InferContext(url, protocol, model_name, model_version, verbose=False)
# The leading axis of size 1 is the one triton_inferer splits on, so a single 1 x 1 x 32000 item is sent.
out = triton_inferer(ctx, input_name, output_name, healthy_np.reshape(1,1,1,32000))
print(out)
# `out` is rebound from the raw scores to the predicted label name.
out=trt_prediction(out)
out

[array([[[-1.803875e-03, -7.310710e+00, -6.782256e+00]]], dtype=float32)]


'healthy'

#  ---------------------- move on to onnx inference --------------------------------

In [36]:
%%writefile ./empty_dir/covid_onnx/config.pbtxt
# Triton model configuration for the ONNX Runtime model "covid_onnx".
name: "covid_onnx"
platform: "onnxruntime_onnx"
# 0 = batching not supported, so input tensors carry no extra "N" dimension.
max_batch_size : 0
input [
  {
    name: "input"
    data_type: TYPE_FP32
    format: FORMAT_NHWC
    # Same 1 x 1 x 32000 shape as the .npy samples used for the requests.
    dims: [ 1,1,32000 ]   
      
  }
]
output [
  {
    name: "output"
    data_type: TYPE_FP32
    # Three scores, one per line of the label file.
    dims: [ 1,1,3 ]
    label_filename: "label.txt"
  }
]
instance_group [
  {
    # Two model instances of kind GPU.
    count: 2
    kind: KIND_GPU
  }
]


Overwriting ./empty_dir/covid_onnx/config.pbtxt


In [8]:
%%writefile ./model_repo/covid_onnx/label.txt 
healthy
unknow_rep_illness
covid_positive


Writing ./model_repo/covid_onnx/label.txt


In [11]:
# Copy the exported ONNX file into the version directory of the covid_onnx model.
!cp ./saved_model/relabelled_covid.onnx ./model_repo/covid_onnx/1/

In [12]:
# Rename the copied file to model.onnx, the file name the server status reports as default_model_filename.
!mv ./model_repo/covid_onnx/1/relabelled_covid.onnx ./model_repo/covid_onnx/1/model.onnx 

In [13]:
# Copy the *contents* of the model directory (note the trailing "/.") into a destination
# that is created first. ./empty_dir/covid_onnx/ may already exist (the config cell writes
# into it), in which case a plain `cp -R dir/ dest/` would nest covid_onnx/ inside it.
!mkdir -p ./empty_dir/covid_onnx/ && cp -R ./model_repo/covid_onnx/. ./empty_dir/covid_onnx/

In [22]:
# Same request as for the plan model, now addressed to the ONNX Runtime model.
model_name = "covid_onnx"
model_version = -1
ctx = InferContext(url, protocol, model_name, model_version, verbose=False)
# The leading axis of size 1 is the one triton_inferer splits on, so a single 1 x 1 x 32000 item is sent.
out = triton_inferer(ctx, input_name, output_name, covid_np.reshape(1,1,1,32000))
print(out)
# `out` is rebound from the raw scores to the predicted label name.
out=trt_prediction(out)
out

[array([[[-8.9344053e+00, -7.0125089e+00, -1.0328917e-03]]], dtype=float32)]


'covid_positive'

In [23]:
# Healthy sample against the ONNX Runtime model.
model_name = "covid_onnx"
model_version = -1
ctx = InferContext(url, protocol, model_name, model_version, verbose=False)
# The leading axis of size 1 is the one triton_inferer splits on, so a single 1 x 1 x 32000 item is sent.
out = triton_inferer(ctx, input_name, output_name, healthy_np.reshape(1,1,1,32000))
print(out)
# `out` is rebound from the raw scores to the predicted label name.
out=trt_prediction(out)
out

[array([[[-1.8036779e-03, -7.3107162e+00, -6.7822638e+00]]], dtype=float32)]


'healthy'