In [1]:
import os
import sys
# Make the parent of the current working directory, and the `voltaml` folder
# inside it, importable. Paths are built with os.path.join instead of string
# concatenation, and each one is appended only if it is not already present so
# that re-running this cell does not pile up duplicate sys.path entries.
_parent_dir = os.path.abspath(os.path.pardir)
for _import_dir in (_parent_dir, os.path.join(_parent_dir, 'voltaml')):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)
from argparse import ArgumentParser

import torch
from voltaml.compile import VoltaGPUCompiler
from voltaml.inference import gpu_performance
import torchvision
from voltaml.models.common import DetectMultiBackend
from voltaml.yolov6.utils.checkpoint import load_checkpoint
from voltaml.yolov6.layers.common import DetectBackend

import warnings
# Silence every Python warning raised from this point on. The imports above
# have already executed, so warnings they emitted are not affected.
warnings.filterwarnings(action='ignore')
# Work from the sibling `voltaml` folder: the relative file names used in the
# following cells (checkpoint, engine file) resolve against this directory.
os.chdir('../voltaml')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the pretrained YOLOv6-M checkpoint (release 0.2.0) into the current
# working directory. The file is roughly 70 MB, so the download is skipped when
# a copy is already present; delete the file to force a fresh download.
pretrained_url = 'https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6m.pt'
pretrained_file = 'yolov6m.pt'
if not os.path.exists(pretrained_file):
    torch.hub.download_url_to_file(pretrained_url, pretrained_file)

100%|████████████████████████████████████████████████████████████| 71.5M/71.5M [00:07<00:00, 10.3MB/s]


### Load Model 

In [3]:
# Path of the PyTorch checkpoint to load. Despite the `_dir` suffix this is a
# single .pt file, resolved relative to the current working directory.
torch_model_dir = 'yolov6m.pt'
# Name the CUDA backend explicitly instead of passing a bare ordinal: with
# torch.device(0) the device type is picked implicitly by PyTorch, whereas the
# TensorRT compilation below needs the first CUDA GPU specifically.
device = torch.device('cuda:0')

In [4]:
# Load the PyTorch (FP32) model from the checkpoint file onto `device`.
# The resulting `model` is passed both to the compiler and to the benchmark
# in the cells that follow. With fuse=True the loader logs "Fusing model...".
model = load_checkpoint(
    torch_model_dir,
    map_location=device,
    inplace=True,
    fuse=True,
)

INFO:voltaml.yolov6.utils.events:Loading checkpoint from yolov6m.pt
Loading checkpoint from yolov6m.pt
Loading checkpoint from yolov6m.pt
INFO:voltaml.yolov6.utils.events:
Fusing model...

Fusing model...

Fusing model...


## Set parameters for FP16

In [5]:
# --- What to build -----------------------------------------------------------
# Shape of the engine's input tensor; presumably (batch, channels, height,
# width) -- confirm against the model's expected layout.
input_shape = (16, 3, 640, 640)
# Numeric precision requested from the compiler.
precision = 'fp16'
# NOTE(review): presumably enables simplification of the exported ONNX graph
# before the engine is built -- confirm against VoltaGPUCompiler.
simplify = True

# --- Tensor names used for the exported network -------------------------------
input_name = 'images'
output_name = 'output'

# --- Where the result goes / how it is benchmarked ----------------------------
# Path of the serialized engine FILE (not a directory, despite the name),
# relative to the current working directory.
compiled_model_dir = 'yolov6m_16_640.engine'
# Forwarded to gpu_performance() when benchmarking.
is_yolo = True

### Compile Model

In [6]:
# Collect every compiler setting in one mapping so the configuration can be
# inspected before the (slow) build is started.
compiler_kwargs = dict(
    model=model,
    output_dir=compiled_model_dir,
    input_shape=input_shape,
    precision=precision,
    input_name=input_name,
    output_name=output_name,
    simplify=simplify,
)
compiler = VoltaGPUCompiler(**compiler_kwargs)

# Build the engine and serialize it to `compiled_model_dir`. In the recorded
# run below the TensorRT build took roughly three minutes.
compiled_model = compiler.compile()

-------- Loading ONNX ---------------


INFO:EngineBuilder:Network Description
Network Description
Network Description
INFO:EngineBuilder:Input 'images' with shape (16, 3, 640, 640) and dtype DataType.FLOAT
Input 'images' with shape (16, 3, 640, 640) and dtype DataType.FLOAT
Input 'images' with shape (16, 3, 640, 640) and dtype DataType.FLOAT
INFO:EngineBuilder:Output 'output' with shape (16, 8400, 85) and dtype DataType.FLOAT
Output 'output' with shape (16, 8400, 85) and dtype DataType.FLOAT
Output 'output' with shape (16, 8400, 85) and dtype DataType.FLOAT
INFO:EngineBuilder:Building fp16 Engine in /workspace/voltaML/voltaml/yolov6m_16_640.engine


[10/14/2022-13:13:35] [TRT] [I] [MemUsageChange] Init CUDA: CPU +314, GPU +0, now: CPU 2596, GPU 1817 (MiB)
[10/14/2022-13:13:35] [TRT] [I] [MemUsageSnapshot] Begin constructing builder kernel library: CPU 2596 MiB, GPU 1817 MiB
[10/14/2022-13:13:35] [TRT] [I] [MemUsageSnapshot] End constructing builder kernel library: CPU 2731 MiB, GPU 1851 MiB
[10/14/2022-13:13:35] [TRT] [W] parsers/onnx/onnx2trt_utils.cpp:364: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.


Building fp16 Engine in /workspace/voltaML/voltaml/yolov6m_16_640.engine
Building fp16 Engine in /workspace/voltaML/voltaml/yolov6m_16_640.engine


[10/14/2022-13:13:37] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +513, GPU +226, now: CPU 3402, GPU 2085 (MiB)
[10/14/2022-13:13:38] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +115, GPU +52, now: CPU 3517, GPU 2137 (MiB)
[10/14/2022-13:13:38] [TRT] [I] Local timing cache in use. Profiling results in this builder pass will not be stored.
[10/14/2022-13:15:09] [TRT] [I] Some tactics do not have sufficient workspace memory to run. Increasing workspace size may increase performance, please check verbose output.
[10/14/2022-13:16:29] [TRT] [I] Detected 1 inputs and 4 output network tensors.


INFO:EngineBuilder:Serializing engine to file: /workspace/voltaML/voltaml/yolov6m_16_640.engine
Serializing engine to file: /workspace/voltaML/voltaml/yolov6m_16_640.engine
Serializing engine to file: /workspace/voltaML/voltaml/yolov6m_16_640.engine


[10/14/2022-13:16:29] [TRT] [I] Total Host Persistent Memory: 362656
[10/14/2022-13:16:29] [TRT] [I] Total Device Persistent Memory: 74990592
[10/14/2022-13:16:29] [TRT] [I] Total Scratch Memory: 24576000
[10/14/2022-13:16:29] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 90 MiB, GPU 7693 MiB
[10/14/2022-13:16:29] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 107.692ms to assign 13 blocks to 288 nodes requiring 596377605 bytes.
[10/14/2022-13:16:29] [TRT] [I] Total Activation Memory: 596377605
[10/14/2022-13:16:29] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 4125, GPU 2319 (MiB)
[10/14/2022-13:16:29] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +1, GPU +8, now: CPU 4126, GPU 2327 (MiB)
[10/14/2022-13:16:29] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +71, GPU +73, now: CPU 71, GPU 73 (MiB)


In [7]:
# Benchmark the serialized engine against the original PyTorch model. The
# call prints a latency (ms / sample) and throughput (fps) figure for each.
gpu_performance(
    compiled_model_dir,
    model,
    input_shape=input_shape,
    is_yolo=is_yolo,
)

INFO:yolov5:YOLOv5 🚀 2022-9-10 torch 1.12.0+cu102 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11016MiB)

YOLOv5 🚀 2022-9-10 torch 1.12.0+cu102 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11016MiB)

YOLOv5 🚀 2022-9-10 torch 1.12.0+cu102 CUDA:0 (NVIDIA GeForce RTX 2080 Ti, 11016MiB)

INFO:yolov5:Loading yolov6m_16_640.engine for TensorRT inference...
Loading yolov6m_16_640.engine for TensorRT inference...
Loading yolov6m_16_640.engine for TensorRT inference...


[10/14/2022-13:16:30] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.

[10/14/2022-13:16:30] [TRT] [I] [MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 3812, GPU 2964 (MiB)
[10/14/2022-13:16:30] [TRT] [I] Loaded engine size: 75 MiB
[10/14/2022-13:16:30] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 3893, GPU 3048 (MiB)
[10/14/2022-13:16:30] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 3893, GPU 3058 (MiB)
[10/14/2022-13:16:30] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +1, GPU +73, now: CPU 1, GPU 785 (MiB)
[10/14/2022-13:16:30] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 3818, GPU 3170 (MiB)
[10/14/2022-13:16:30] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +1, GPU +8, now: CPU 3819, GPU 

100%|███████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.66it/s]
INFO:yolov5:Speed: 5.3ms pre-process, 32.4ms inference, 25.5ms NMS per image at shape (1, 3, 640, 640)
Speed: 5.3ms pre-process, 32.4ms inference, 25.5ms NMS per image at shape (1, 3, 640, 640)
Speed: 5.3ms pre-process, 32.4ms inference, 25.5ms NMS per image at shape (1, 3, 640, 640)
INFO:yolov5:Speed: 5.3ms pre-process, 32.4ms inference, 25.5ms NMS per image at shape (1, 3, 640, 640)
Speed: 5.3ms pre-process, 32.4ms inference, 25.5ms NMS per image at shape (1, 3, 640, 640)
Speed: 5.3ms pre-process, 32.4ms inference, 25.5ms NMS per image at shape (1, 3, 640, 640)
INFO:yolov5:Speed: 0.0s pre-process, 0.0s inference, 0.0s NMS per image at shape (1, 3, 640, 640)
Speed: 0.0s pre-process, 0.0s inference, 0.0s NMS per image at shape (1, 3, 640, 640)
Speed: 0.0s pre-process, 0.0s inference, 0.0s NMS per image at shape (1, 3, 640, 640)
INFO:yolov5:Results saved to [1m../voltaml/runs/detect/e

Latency:
--------------------------------------------------
VoltaML GPU Inference Latency: 32.36 ms / sample
PyTorch Inference Latency: 164.42 ms / sample


FPS:
--------------------------------------------------
VoltaML GPU Inference Throughput: 30.91 fps
PyTorch Inference Throughput: 6.08 fps
