In [1]:
import os
import sys
# Add the parent of the current working directory to the module search path
# before the `voltaml` imports below run. Presumably the notebook is launched
# from a sub-folder of the repository checkout -- TODO confirm.
sys.path.append(os.path.abspath(os.path.pardir))
# NOTE(review): ArgumentParser is not referenced by any cell visible here.
from argparse import ArgumentParser

import torch
from voltaml.compile import VoltaGPUCompiler
from voltaml.inference import gpu_performance
# NOTE(review): torchvision is not referenced by any cell visible here.
import torchvision
from voltaml.models.common import DetectMultiBackend
import warnings
# Suppress every Python warning raised from this point on. Warnings emitted
# while the imports above were executing are not affected by this filter.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### Load Model 

In [2]:
# Location of the YOLOv5n PyTorch weights file, relative to the working directory.
torch_model_dir = "voltaml/yolov5n.pt"

# Load the weights through the DetectMultiBackend wrapper; `model` is reused
# by the compile and benchmark cells further down.
model = DetectMultiBackend(torch_model_dir)

INFO:yolov5:Fusing layers... 
Fusing layers... 
Fusing layers... 
INFO:yolov5:YOLOv5n summary: 270 layers, 1872157 parameters, 0 gradients
YOLOv5n summary: 270 layers, 1872157 parameters, 0 gradients
YOLOv5n summary: 270 layers, 1872157 parameters, 0 gradients


### Set parameters for FP16

In [3]:
# --- FP16 build configuration ---------------------------------------------

# Shape of the input tensor the engine is built for.
input_shape = (1, 3, 1280, 1280)

# Precision mode handed to the compiler.
precision = "fp16"

# File the compiled engine is serialized to.
compiled_model_dir = "yolov5n_1280_16.engine"

# Forwarded to gpu_performance() when benchmarking this model.
is_yolo = True

# Tensor names and simplify switch handed to the compiler.
input_name = "images"
output_name = "output"
simplify = True

### Compile Model

In [4]:
# Configure the GPU compiler with the FP16 settings chosen above.
compiler = VoltaGPUCompiler(
    model=model,
    input_shape=input_shape,
    precision=precision,
    input_name=input_name,
    output_name=output_name,
    simplify=simplify,
    output_dir=compiled_model_dir,
)

# Run the build; the engine is serialized to `compiled_model_dir`.
compiled_model = compiler.compile()



-------- Loading ONNX ---------------


INFO:EngineBuilder:Network Description
Network Description
Network Description
INFO:EngineBuilder:Input 'images' with shape (1, 3, 1280, 1280) and dtype DataType.FLOAT
Input 'images' with shape (1, 3, 1280, 1280) and dtype DataType.FLOAT
Input 'images' with shape (1, 3, 1280, 1280) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'output' with shape (1, 100800, 85) and dtype DataType.FLOAT
Output 'output' with shape (1, 100800, 85) and dtype DataType.FLOAT
Output 'output' with shape (1, 100800, 85) and dtype DataType.FLOAT
INFO:EngineBuilder:Building fp16 Engine in /workspace/voltav0.3/voltaML/demo/yolov5n_1280_16.engine
Building fp16 Engine in /workspace/voltav0.3/voltaML/demo/yolov5n_1280_16.engine
Building fp16 Engine in /workspace/voltav0.3/voltaML/demo/yolov5n_1280_16.engine


[10/20/2022-09:32:26] [TRT] [I] [MemUsageChange] Init CUDA: CPU +313, GPU +0, now: CPU 489, GPU 603 (MiB)
[10/20/2022-09:32:26] [TRT] [I] [MemUsageSnapshot] Begin constructing builder kernel library: CPU 489 MiB, GPU 603 MiB
[10/20/2022-09:32:26] [TRT] [I] [MemUsageSnapshot] End constructing builder kernel library: CPU 624 MiB, GPU 637 MiB
[10/20/2022-09:32:26] [TRT] [W] parsers/onnx/onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[10/20/2022-09:32:27] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +513, GPU +224, now: CPU 1171, GPU 865 (MiB)
[10/20/2022-09:32:27] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +115, GPU +49, now: CPU 1286, GPU 914 (MiB)
[10/20/2022-09:32:27] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[10/20/2022-09:35:49] [TRT] [I] Some tactics do not have sufficient workspace memory to run. Increasing w

INFO:EngineBuilder:Serializing engine to file: /workspace/voltav0.3/voltaML/demo/yolov5n_1280_16.engine
Serializing engine to file: /workspace/voltav0.3/voltaML/demo/yolov5n_1280_16.engine
Serializing engine to file: /workspace/voltav0.3/voltaML/demo/yolov5n_1280_16.engine


[10/20/2022-09:35:58] [TRT] [I] Detected 1 inputs and 1 output network tensors.
[10/20/2022-09:35:58] [TRT] [I] Total Host Persistent Memory: 134320
[10/20/2022-09:35:58] [TRT] [I] Total Device Persistent Memory: 5668352
[10/20/2022-09:35:58] [TRT] [I] Total Scratch Memory: 33955200
[10/20/2022-09:35:58] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 5 MiB, GPU 4298 MiB
[10/20/2022-09:35:58] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 5.10551ms to assign 6 blocks to 77 nodes requiring 71161856 bytes.
[10/20/2022-09:35:58] [TRT] [I] Total Activation Memory: 71161856
[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 1833, GPU 1153 (MiB)
[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 1833, GPU 1161 (MiB)
[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +3, GPU +14, now: CPU 3, GPU 14 (MiB)


In [5]:
# Benchmark the compiled FP16 engine against the original PyTorch model;
# the call reports latency and throughput for both.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    is_yolo=is_yolo,
)

INFO:yolov5:YOLOv5 🚀 2022-10-14 torch 1.12.0+cu102 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11016MiB)

YOLOv5 🚀 2022-10-14 torch 1.12.0+cu102 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11016MiB)

YOLOv5 🚀 2022-10-14 torch 1.12.0+cu102 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11016MiB)

INFO:yolov5:Loading yolov5n_1280_16.engine for TensorRT inference...
Loading yolov5n_1280_16.engine for TensorRT inference...
Loading yolov5n_1280_16.engine for TensorRT inference...


[10/20/2022-09:35:58] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 1780, GPU 1224 (MiB)
[10/20/2022-09:35:58] [TRT] [I] Loaded engine size: 24 MiB
[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 1819, GPU 1247 (MiB)
[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 1819, GPU 1257 (MiB)
[10/20/2022-09:35:58] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +14, now: CPU 0, GPU 101 (MiB)
[10/20/2022-09:36:01] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 3536, GPU 2037 (MiB)
[10/20/2022-09:36:01] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 3536, GPU 

100%|███████████████████████████████████████████| 20/20 [00:00<00:00, 75.37it/s]
INFO:yolov5:Speed: 1.1ms pre-process, 2.5ms inference, 1.2ms NMS per image at shape (1, 3, 1280, 1280)
Speed: 1.1ms pre-process, 2.5ms inference, 1.2ms NMS per image at shape (1, 3, 1280, 1280)
Speed: 1.1ms pre-process, 2.5ms inference, 1.2ms NMS per image at shape (1, 3, 1280, 1280)
INFO:yolov5:Speed: 21.3ms pre-process, 49.1ms inference, 24.8ms NMS per image at shape (1, 3, 1280, 1280)
Speed: 21.3ms pre-process, 49.1ms inference, 24.8ms NMS per image at shape (1, 3, 1280, 1280)
Speed: 21.3ms pre-process, 49.1ms inference, 24.8ms NMS per image at shape (1, 3, 1280, 1280)
INFO:yolov5:Speed: 0.0s pre-process, 0.0s inference, 0.0s NMS per image at shape (1, 3, 1280, 1280)
Speed: 0.0s pre-process, 0.0s inference, 0.0s NMS per image at shape (1, 3, 1280, 1280)
Speed: 0.0s pre-process, 0.0s inference, 0.0s NMS per image at shape (1, 3, 1280, 1280)
INFO:yolov5:Results saved to [1m../voltaml/utils/runs/detect/ex

Latency:
--------------------------------------------------
VoltaML GPU Inference Latency: 2.45 ms / sample
PyTorch Inference Latency: 5.90 ms / sample


FPS:
--------------------------------------------------
VoltaML GPU Inference Throughput: 407.42 fps
PyTorch Inference Throughput: 169.42 fps





### Set parameters for INT8

In [None]:
# --- INT8 build configuration ---------------------------------------------
# The empty strings below are placeholders to fill in before running.

# Shape of the input tensor the engine is built for.
input_shape = (1, 3, 224, 224)

# Precision mode handed to the compiler.
precision = "int8"

# Where the compiled model is written.
compiled_model_dir = ""

# Batch size forwarded to gpu_performance().
throughput_batch_size = 1

# Calibration settings.
calib_input = ""  # path to the calibration input images
calib_cache = ""  # calibration cache name
calib_num_images = 25000
calib_batch_size = 8
calib_preprocessor = "V2"

### Compile Model

In [None]:
# Configure the GPU compiler with the INT8 settings chosen above, including
# the calibration inputs.
compiler = VoltaGPUCompiler(
    model=model,
    input_shape=input_shape,
    precision=precision,
    output_dir=compiled_model_dir,
    calib_input=calib_input,
    calib_cache=calib_cache,
    calib_num_images=calib_num_images,
    calib_batch_size=calib_batch_size,
    calib_preprocessor=calib_preprocessor,
)

# Run the build and keep the result for later cells.
compiled_model = compiler.compile()

In [None]:
# Benchmark the compiled INT8 engine against the original PyTorch model.
# `model` is the same YOLOv5 model used in the FP16 section, so `is_yolo` is
# forwarded here exactly as in the FP16 benchmark call; without it this call
# would not match how the same model was benchmarked earlier.
# NOTE(review): assumes gpu_performance accepts `throughput_batch_size` and
# `is_yolo` together -- confirm against voltaml.inference.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    throughput_batch_size=throughput_batch_size,
    is_yolo=is_yolo,
)