In [21]:
import tritonclient.grpc as grpcclient
import numpy as np
import numpy.typing as npt
import tritonclient.grpc as grpcclient
import torchvision.transforms as transforms
import asyncio
import time
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial
from typing import Callable, List
import numpy.typing as npt
from tritonclient.utils import InferenceServerException
# Create a gRPC client for the inference server listening on localhost:13331.
# This module-level `client` is the one used by the inference helpers defined below.
client = grpcclient.InferenceServerClient(url="localhost:13331")

In [31]:
# Log at INFO level with a timestamp prefix; the benchmark summaries produced by
# `calculate_throughput` are emitted through this module-level `logger`.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)


def _async_infer_callback(all_results: List, result, error: InferenceServerException):
    """Completion callback that records the outcome of one inference request.

    Args:
        all_results: List that collects one entry per completed request.
        result: Inference result delivered to the callback.
        error: Error delivered to the callback; when truthy it is stored
            instead of ``result``.
    """
    # Exactly one entry is appended per invocation: the error wins over the result.
    outcome = error if error else result
    all_results.append(outcome)


def _async_infer_callback_with_latency(
    all_results: List,
    result,
    error: InferenceServerException,
    start_time: float,
    latencies: List = None,
):
    """Completion callback that stores the outcome and measures request latency.

    Args:
        all_results: List that collects one entry (result or error) per
            completed request.
        result: Inference result delivered to the callback.
        error: Error delivered to the callback, if any.
        start_time: ``time.perf_counter()`` value captured just before the
            request was sent.
        latencies: Optional list; when provided, the elapsed time in seconds
            between ``start_time`` and this callback is appended to it.
            Omitting it keeps the previous behaviour (outcome stored only).
    """
    # Take the timestamp first so the bookkeeping below is not counted as latency.
    elapsed = time.perf_counter() - start_time

    # Record the latency before the outcome: code that waits for `all_results`
    # to fill up can then rely on the matching latency already being present.
    if latencies is not None:
        latencies.append(elapsed)

    # Delegate to the plain callback to store the result (or the error).
    _async_infer_callback(all_results, result, error)


def default_infer(
    model_name: str,
    image_numpy: npt.NDArray,
    request_id: str = "0",
    results: List = None,
) -> "grpcclient.InferResult":
    """Run one blocking inference request and return the server's response.

    Args:
        model_name: Name of the model to query.
        image_numpy: Input array; it is sent as the FP32 tensor named "input"
            with the array's own shape.
        request_id: Identifier attached to the request.
        results: Unused by this function; kept so the signature matches the
            asynchronous ``infer`` helper.

    Returns:
        The object returned by ``client.infer`` for the requested "output" tensor.
    """
    inputs = [grpcclient.InferInput("input", image_numpy.shape, "FP32")]
    inputs[0].set_data_from_numpy(image_numpy)
    outputs = [grpcclient.InferRequestedOutput("output")]
    return client.infer(
        model_name=model_name,
        inputs=inputs,
        outputs=outputs,
        request_id=request_id,
    )



async def infer(
    model_name: str,
    image_numpy: npt.NDArray,
    request_id: str = "0",
    results: List = None,
) -> bool:
    """Submit one asynchronous inference request without waiting for its completion.

    Args:
        model_name: Name of the model to query.
        image_numpy: Input array; it is sent as the FP32 tensor named "input"
            with the array's own shape.
        request_id: Identifier attached to the request.
        results: List that the completion callback appends the result (or the
            error) to. When omitted, a throwaway list is used and the outcome
            is not visible to the caller.

    Returns:
        True once the request has been handed to the client. This matches the
        declared ``bool`` return type and the behaviour of ``infer_bert``.
    """
    if results is None:
        results = []

    # Create input tensor
    inputs = [grpcclient.InferInput("input", image_numpy.shape, "FP32")]
    inputs[0].set_data_from_numpy(image_numpy)

    # Create output tensor
    outputs = [grpcclient.InferRequestedOutput("output")]

    # Record the start time for this inference request
    start_time = time.perf_counter()

    # Send the request; the callback receives the shared results list and the
    # start time of this request.
    client.async_infer(
        callback=partial(
            _async_infer_callback_with_latency,
            all_results=results,
            start_time=start_time,
        ),
        model_name=model_name,
        inputs=inputs,
        outputs=outputs,
        request_id=request_id,
    )

    # The request is only submitted here; results are collected in the callback.
    return True


async def calculate_throughput(
    infer_func: Callable, num_cycles: int, *args, **kwargs
) -> float:
    """Fire ``num_cycles`` requests through ``infer_func`` and report the throughput.

    Args:
        infer_func: Coroutine function that submits one request. It is called
            as ``infer_func(*args, **kwargs, request_id=..., results=...)`` and
            must arrange for one entry to be appended to ``results`` per
            request.
        num_cycles: Number of requests to send; must be positive.
        *args: Positional arguments forwarded to ``infer_func``. ``args[0]`` is
            logged as the model name and ``args[1].shape[0]`` is used as the
            batch size.
        **kwargs: Keyword arguments forwarded to ``infer_func``.

    Returns:
        Throughput in samples per second: batch size divided by the average
        wall-clock time per request.

    Raises:
        ValueError: If ``num_cycles`` is not positive (the per-cycle average
            would otherwise be a division by zero).
    """
    if num_cycles <= 0:
        raise ValueError(f"num_cycles must be a positive integer, got {num_cycles}")

    start_time = time.perf_counter()

    # Shared list that every request's callback appends its outcome to
    results: List = []

    # Create and gather all async tasks
    tasks = [
        infer_func(
            *args,
            **kwargs,
            request_id=str(i),
            results=results,
        )
        for i in range(num_cycles)
    ]
    await asyncio.gather(*tasks)

    # Wait until all callbacks have been executed
    while len(results) < num_cycles:
        await asyncio.sleep(0.01)

    end_time = time.perf_counter()

    # Failed requests are stored as exception objects in `results`; surface them
    # so that a run full of errors is not mistaken for a valid measurement.
    failed = sum(isinstance(item, Exception) for item in results)
    if failed:
        logger.warning(f"{failed} of {num_cycles} requests failed")

    # Extract model name and batch size from inputs
    model_name = args[0]
    batch_size = args[1].shape[0]

    execution_time_per_cycle = (end_time - start_time) * 1000 / num_cycles  # ms
    throughput = batch_size / (execution_time_per_cycle / 1000)  # samples per second

    logger.info(f"Model: {model_name}")
    logger.info(f"Batch size: {batch_size}")
    logger.info(f"Execution time per cycle: {execution_time_per_cycle:.2f} ms")
    # The value is scaled by the batch size, so it counts samples, not requests.
    logger.info(f"Throughput: {throughput:.2f} samples/second")
    logger.info("-" * 50)

    return throughput


# Throughput measured for each benchmark, keyed by a label for the benchmarked
# configuration; filled in by the benchmark cells below.
speed_results: dict[str, float] = {}


# Test the speed of inference

# Torch script

In [44]:
# Smoke test: send one random float32 image of shape (1, 3, 224, 224) to
# `pytorch_model` through the blocking helper.
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
res = default_infer("pytorch_model", image_numpy)
# Display the raw response through the public accessor instead of reaching into
# the private `_result` attribute. `res.as_numpy("output")` decodes the tensor
# into a NumPy array when the values themselves are needed.
res.get_response()

model_name: "pytorch_model"
model_version: "1"
id: "0"
outputs {
  name: "output"
  datatype: "FP32"
  shape: 1
  shape: 1000
}
raw_output_contents: "\252:\314\277QEN?%\266\266?4\234\216?q\323Z?PS\370\276\231C;?\276A\024>\030\227\244\277q\370\217\276g\003\273?\203ZE@\242S\343?TaJ@\032P9@d\330\272?\327{\002@\266F\016?=\323\330?\365\267\021@\333\002M?R \023@\242\234\023@\006S\370?\200\033q?n8H>\256\004 ?#_3?\301\036h\276\306\234\206\277\030\236-\277\357\n\263?\27377\277\266O&?V\014\371?\313\001-\2772\2251\277m\3773\276\232\313\266?\362m\321>\322\256,@\332\245z?\375\256\361?as\315>*\013\342?\023ip>\263b\021@Ts\200\276\017M\250\275\376\010\355\275G+\272?\003\320\265\277\351$\305?2W\265?B\213\203>\272r\201?EF\305>\343\321C>8\007\257?\\I\363?\276j\214?@\333\033?9\n\253\275\367eZ\277\315\336.?\221p\007@eIX\276^\336\202\277\263\027\003\275\315\260\032@\037\360\r@\225Q\"@8\210V?/[,@\341\254{?\375\037-@\215\037V?\034\017\001@\014\004\206@\004kb@6m\002\276A\366\315?I`P?\t\200\277\277\005e\016\277

In [23]:
# Benchmark `pytorch_model`: 100 asynchronous requests, each carrying one random
# float32 image of shape (1, 3, 224, 224); the throughput is stored in `speed_results`.
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
speed_results["pytorch_model"] = await calculate_throughput(infer, 100, "pytorch_model", image_numpy)

2025-02-19 20:13:23,986 - Model: pytorch_model
2025-02-19 20:13:23,999 - Batch size: 1
2025-02-19 20:13:24,003 - Execution time per cycle: 135.77 ms
2025-02-19 20:13:24,006 - Throughput: 7.37 requests/second
2025-02-19 20:13:24,007 - --------------------------------------------------


# Onnx

## Batch size 1

In [20]:
# Benchmark `onnx_model`: 10 asynchronous requests, each carrying one random
# float32 image of shape (1, 3, 224, 224); the throughput is stored in `speed_results`.
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
speed_results["onnx_model"] = await calculate_throughput(infer, 10, "onnx_model", image_numpy)

2025-02-17 23:06:13,504 - Model: onnx_model
2025-02-17 23:06:13,507 - Batch size: 1
2025-02-17 23:06:13,509 - Execution time per cycle: 162.49 ms
2025-02-17 23:06:13,510 - Throughput: 6.15 requests/second
2025-02-17 23:06:13,511 - --------------------------------------------------


# External batching

## Batch size 16

In [21]:
# External (client-side) batching: each of the 1000 requests carries a batch of
# 16 random float32 images, so the input shape is (16, 3, 224, 224).
# search for "shape: [" in the logs
image_numpy = np.random.rand(16, 3, 224, 224).astype(np.float32)
speed_results["onnx_model_external_bs16"] = await calculate_throughput(infer, 1000, "onnx_model", image_numpy)

2025-02-15 12:50:44,835 - Model: onnx_model
2025-02-15 12:50:44,838 - Batch size: 16
2025-02-15 12:50:44,838 - Execution time per cycle: 15.48 ms
2025-02-15 12:50:44,839 - Throughput: 1033.43 images/second
2025-02-15 12:50:44,839 - --------------------------------------------------


# Dynamic batching

In [22]:
# Dynamic batching: 1000 single-image requests are sent to the `dynamic_batching`
# model; the client itself sends batch size 1.
# search for "executing " in the logs
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
speed_results["onnx_model_dynamic_batching"] = await calculate_throughput(infer, 1000, "dynamic_batching", image_numpy)

# TODO: investigate why this is slower than `onnx_model` with a client-side batch of 16.

2025-02-15 12:50:46,033 - Model: dynamic_batching
2025-02-15 12:50:46,035 - Batch size: 1
2025-02-15 12:50:46,036 - Execution time per cycle: 1.18 ms
2025-02-15 12:50:46,036 - Throughput: 846.81 images/second
2025-02-15 12:50:46,036 - --------------------------------------------------


# Multi-instance

In [26]:
# Benchmark the `multi_instance` model with 1000 single-image requests; the
# throughput is stored under the "onnx_model_multi_instance" label.
# search for "shape: [" in the logs
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
speed_results["onnx_model_multi_instance"] = await calculate_throughput(infer, 1000, "multi_instance", image_numpy)

2025-02-15 12:51:39,199 - Model: multi_instance
2025-02-15 12:51:39,201 - Batch size: 1
2025-02-15 12:51:39,201 - Execution time per cycle: 0.96 ms
2025-02-15 12:51:39,202 - Throughput: 1036.53 images/second
2025-02-15 12:51:39,204 - --------------------------------------------------


# Quantized model

In [None]:
# Benchmark `quantized_model` with 1000 single-image requests. The input is still
# sent as float32, matching the FP32 input tensor that `infer` declares.
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
speed_results["quantized_model"] = await calculate_throughput(infer, 1000, "quantized_model", image_numpy)

2025-02-15 12:51:17,778 - Model: quantized_model
2025-02-15 12:51:17,779 - Batch size: 1
2025-02-15 12:51:17,780 - Execution time per cycle: 2.97 ms
2025-02-15 12:51:17,780 - Throughput: 337.22 images/second
2025-02-15 12:51:17,781 - --------------------------------------------------


# TensorRT model

In [32]:
# Benchmark `tensorrt_model` with 1000 single-image requests of shape
# (1, 3, 224, 224); the throughput is stored in `speed_results`.
# search for "shape: [" in the logs
image_numpy = np.random.rand(1, 3, 224, 224).astype(np.float32)
speed_results["tensorrt_model"] = await calculate_throughput(infer, 1000, "tensorrt_model", image_numpy)

2025-02-15 13:04:22,347 - Model: tensorrt_model
2025-02-15 13:04:22,348 - Batch size: 1
2025-02-15 13:04:22,348 - Execution time per cycle: 0.65 ms
2025-02-15 13:04:22,349 - Throughput: 1536.34 images/second
2025-02-15 13:04:22,349 - --------------------------------------------------


# Bert model

In [9]:
async def infer_bert(
    model_name: str,
    input_ids: npt.NDArray,
    token_type_ids: npt.NDArray,
    attention_mask: npt.NDArray,
    request_id: str = "0",
    results: List = None,
) -> bool:
    """Submit one asynchronous BERT-style inference request.

    The three arrays are sent as INT64 tensors named "input_ids",
    "token_type_ids" and "attention_mask", and the tensor named "output" is
    requested back.

    Args:
        model_name: Name of the model to query.
        input_ids: Array sent as the "input_ids" tensor.
        token_type_ids: Array sent as the "token_type_ids" tensor.
        attention_mask: Array sent as the "attention_mask" tensor.
        request_id: Identifier attached to the request.
        results: List handed to the completion callback; a fresh list is used
            when omitted.

    Returns:
        Always True; the request is only submitted here and its outcome is
        collected by the callback.
    """
    sink = [] if results is None else results

    # Tensor name -> source array, in the order the tensors are sent.
    named_arrays = (
        ("input_ids", input_ids),
        ("token_type_ids", token_type_ids),
        ("attention_mask", attention_mask),
    )

    # Declare every input tensor first, then attach the data to each of them.
    request_inputs = [
        grpcclient.InferInput(tensor_name, array.shape, "INT64")
        for tensor_name, array in named_arrays
    ]
    for tensor, (_, array) in zip(request_inputs, named_arrays):
        tensor.set_data_from_numpy(array)

    requested_outputs = [grpcclient.InferRequestedOutput("output")]

    # The send time is captured right before submission and handed to the callback.
    on_complete = partial(
        _async_infer_callback_with_latency,
        all_results=sink,
        start_time=time.perf_counter(),
    )
    client.async_infer(
        callback=on_complete,
        model_name=model_name,
        inputs=request_inputs,
        outputs=requested_outputs,
        request_id=request_id,
    )

    return True


In [19]:
input_ids = np.random.randint(0, 10000, (1, 512))
token_type_ids = np.random.randint(0, 1, (1, 512))
attention_mask = np.random.randint(0, 1, (1, 512))
speed_results["bert_onnx_model"] = await calculate_throughput(
    infer_bert, 100, "bert_onnx_model", input_ids, token_type_ids, attention_mask
)
speed_results["bert_onnx_model_fp16"] = await calculate_throughput(
    infer_bert, 100, "bert_onnx_model_fp16", input_ids, token_type_ids, attention_mask
)

2025-02-17 21:43:59,361 - Model: bert_onnx_model
2025-02-17 21:43:59,364 - Batch size: 1
2025-02-17 21:43:59,366 - Execution time per cycle: 114.31 ms
2025-02-17 21:43:59,367 - Throughput: 8.75 requests/second
2025-02-17 21:43:59,369 - --------------------------------------------------
2025-02-17 21:44:11,687 - Model: bert_onnx_model_fp16
2025-02-17 21:44:11,689 - Batch size: 1
2025-02-17 21:44:11,690 - Execution time per cycle: 123.16 ms
2025-02-17 21:44:11,691 - Throughput: 8.12 requests/second
2025-02-17 21:44:11,692 - --------------------------------------------------
