In [46]:
#  Copyright 2022, Lefebvre Dalloz Services
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import argparse

import numpy as np
import tritonclient.http

from benchmark_utils import print_timings, setup_logging, track_infer_time
from tqdm import tqdm

In [47]:
# length = 16
# Helper imported from benchmark_utils; presumably configures logging output
# for the rest of the notebook (its body is not visible here).
setup_logging()

# Backend label; it is interpolated into the name of the model queried below.
model = "tensorrt"
model_name = f"transformer_{model}_generate"
model_version = "1"

# Server address (host:port) and verbosity flag passed to the Triton HTTP client.
url = "127.0.0.1:8000"
verbose = False

# Number of copies of the prompt packed into each inference request.
batch_size = 1

In [61]:
# Prompt sent as the "TEXT" input; it is repeated batch_size times per request.
text = "Anish Shah's Documentary is about"

In [62]:
# Create the Triton HTTP client for the server address configured above.
triton_client = tritonclient.http.InferenceServerClient(
    url=url,
    verbose=verbose,
)

# Stop here unless the server reports the requested model/version as ready.
model_is_ready = triton_client.is_model_ready(model_name=model_name, model_version=model_version)
assert model_is_ready, f"model {model_name} not yet ready"

# Fetch what the server reports about this model: metadata first, then config.
# Neither value is read again in the cells visible here.
model_metadata = triton_client.get_model_metadata(
    model_name=model_name,
    model_version=model_version,
)
model_config = triton_client.get_model_config(
    model_name=model_name,
    model_version=model_version,
)

In [63]:
# Request input named "TEXT": a BYTES tensor holding one element per batch item.
query = tritonclient.http.InferInput(
    name="TEXT",
    shape=(batch_size,),
    datatype="BYTES",
)

# Requested output named "output", asked for with binary_data disabled.
# NOTE(review): both names presumably have to match the deployed model's
# configuration on the server -- confirm against the model repository.
model_score = tritonclient.http.InferRequestedOutput(
    name="output",
    binary_data=False,
)

In [64]:
# Buffer handed to track_infer_time on every iteration; presumably it receives
# one duration per pass (the helper's body is not visible here).
time_buffer = []

for _ in tqdm(range(100)):
    # Both the input preparation and the request run inside the timed context.
    with track_infer_time(time_buffer):
        prompt_batch = np.asarray([text] * batch_size, dtype=object)
        query.set_data_from_numpy(prompt_batch)
        response = triton_client.infer(
            inputs=[query],
            outputs=[model_score],
            model_name=model_name,
            model_version=model_version,
        )

100%|██████████| 100/100 [00:15<00:00,  6.52it/s]


In [65]:
# Report on the timings gathered by the benchmark loop.
print_timings(timings=time_buffer, name="triton transformers")

# Show the "output" tensor of the last response received in the loop.
generated = response.as_numpy("output")
print(generated)

[triton transformers] mean=152.57ms, sd=6.74ms, min=149.44ms, max=216.74ms, median=151.47ms, 95p=155.65ms, 99p=164.53ms
Anish Shah's Documentary is about the people, places and things that fascinate him. From the deserts of Arabia to the streets of Mumbai, he explores
