In [4]:
import os
import time
import contextlib
from collections import namedtuple, OrderedDict

import torch
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda


# Ask the CUDA runtime for lazy module loading. This is set at import time, before the
# class below creates any engine or context.
# NOTE(review): presumably this must be set before CUDA is first initialized to take effect — confirm.
os.environ['CUDA_MODULE_LOADING'] = 'LAZY'

class TRTInference:
    """Run a serialized TensorRT engine using either torch tensors or pycuda buffers.

    The instance deserializes the engine, pre-allocates one buffer per engine
    tensor (see ``get_bindings``) and exposes ``__call__`` which dispatches on
    ``backend``:

    * ``'torch'`` -> ``run_torch`` (inputs are tensors exposing ``data_ptr()``)
    * ``'cuda'``  -> ``async_run_cuda`` (inputs are numpy arrays copied via pycuda)
    """

    def __init__(self, engine_path, ouput_names_mapping: dict = None, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
        """Load the engine and allocate all input/output bindings.

        Args:
            engine_path: path of the serialized engine file.
            ouput_names_mapping: optional ``{engine_output_name: friendly_name}``
                used to rename keys of the returned output dict. (The parameter
                name is misspelled; kept as-is for caller compatibility.)
            device: torch device on which the torch-backend buffers are created.
            backend: ``'torch'`` or ``'cuda'``.
            max_batch_size: value substituted for a leading ``-1`` dimension
                when buffers are allocated.
            verbose: use the VERBOSE TensorRT logger instead of INFO.
        """
        self.engine_path = engine_path
        self.output_names_mapping = ouput_names_mapping or {}
        self.device = device
        self.backend = backend
        self.max_batch_size = max_batch_size

        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)

        self.engine = self.load_engine(engine_path)

        if self.backend == 'cuda':
            # The pycuda path always uses CUDA device 0, regardless of ``device``.
            cuda.init()
            cuda_device = cuda.Device(0)
            # NOTE(review): here ``self.context`` is the pycuda context returned by
            # make_context(), yet get_bindings/async_run_cuda call set_input_shape /
            # execute_async_v2 on it. It looks like a TensorRT execution context is
            # also needed for this backend — confirm before relying on 'cuda'.
            self.context = cuda_device.make_context()
        else:
            self.context = self.engine.create_execution_context()

        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
        # Addresses are kept in engine iteration order; run_torch swaps in the
        # caller's input pointers before each execution.
        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())

        self.input_names = self.get_input_names()
        self.output_names = self.get_output_names()

        if self.backend == 'cuda':
            self.stream = cuda.Stream()

    def init(self, ):
        """Set ``self.dynamic`` to False. Not called from ``__init__``."""
        self.dynamic = False

    def load_engine(self, path):
        """Register the TensorRT plugins and deserialize the engine stored at ``path``.

        Returns:
            The object produced by ``Runtime.deserialize_cuda_engine``.
        """
        trt.init_libnvinfer_plugins(self.logger, '')
        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def get_input_names(self):
        """Return the engine tensor names whose IO mode is INPUT, in engine order."""
        return [
            name for name in self.engine
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
        ]

    def get_output_names(self):
        """Return the engine tensor names whose IO mode is OUTPUT, in engine order."""
        return [
            name for name in self.engine
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
        ]

    def get_bindings(self, engine, context, max_batch_size=32, device=None):
        """Allocate one buffer per engine tensor.

        Args:
            engine: engine whose tensors are enumerated.
            context: context on which ``set_input_shape`` is called for inputs
                with a dynamic leading dimension.
            max_batch_size: replaces a leading ``-1`` dimension.
            device: torch device for the buffers (torch backend only).

        Returns:
            ``OrderedDict`` mapping tensor name to
            ``Binding(name, dtype, shape, data, ptr)``.
        """
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        bindings = OrderedDict()

        for name in engine:
            shape = engine.get_tensor_shape(name)
            dtype = trt.nptype(engine.get_tensor_dtype(name))
            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT

            if shape[0] == -1:
                # Dynamic leading dimension: size the buffer for the largest batch.
                shape[0] = max_batch_size
                if is_input:
                    context.set_input_shape(name, shape)

            if self.backend == 'cuda':
                if is_input:
                    # Host-side placeholder; only its byte size is used for the allocation.
                    data = np.random.randn(*shape).astype(dtype)
                else:
                    # Flat page-locked host array that receives the device output.
                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
                ptr = cuda.mem_alloc(data.nbytes)
                bindings[name] = Binding(name, dtype, shape, data, ptr)
            else:
                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())

        return bindings

    def run_torch(self, blob):
        """Execute the engine reading inputs directly from the given tensors.

        Args:
            blob: mapping ``{input_name: tensor}``; every name in
                ``self.input_names`` must be present and expose ``shape`` and
                ``data_ptr()``.

        Returns:
            ``{output_name: buffer}`` with names passed through
            ``output_names_mapping``. The buffers are the pre-allocated ones held
            in ``self.bindings``, so the same objects are returned on every call.
        """
        for n in self.input_names:
            if self.bindings[n].shape != blob[n].shape:
                self.context.set_input_shape(n, blob[n].shape)
                self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)

        # Point the input slots at the caller's memory; output slots keep our buffers.
        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
        self.context.execute_v2(list(self.bindings_addr.values()))
        outputs = {self.output_names_mapping.get(n, n): self.bindings[n].data for n in self.output_names}

        return outputs

    def async_run_cuda(self, blob):
        """Execute the engine with numpy inputs copied through the pycuda stream.

        Args:
            blob: mapping ``{input_name: array}``; each value is made contiguous
                before being copied to the device.

        Returns:
            ``{output_name: host_array}`` with names passed through
            ``output_names_mapping``. The arrays are the flat page-locked buffers
            allocated in ``get_bindings`` and are reused across calls.
        """
        blob = {n: np.ascontiguousarray(v) for n, v in blob.items()}
        for n in self.input_names:
            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)

        bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)

        outputs = {}
        for n in self.output_names:
            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
            outputs[self.output_names_mapping.get(n, n)] = self.bindings[n].data

        # Wait for the copies queued above before handing the host arrays back.
        self.stream.synchronize()

        return outputs

    def __call__(self, blob):
        """Run inference on ``blob`` with the configured backend.

        Raises:
            ValueError: if ``self.backend`` is neither ``'torch'`` nor ``'cuda'``.
        """
        if self.backend == 'torch':
            return self.run_torch(blob)

        if self.backend == 'cuda':
            return self.async_run_cuda(blob)

        raise ValueError(f"unsupported backend {self.backend!r}; expected 'torch' or 'cuda'")

    def synchronize(self, ):
        """Block until pending work finishes (torch: device sync; cuda: stream sync)."""
        if self.backend == 'torch' and torch.cuda.is_available():
            torch.cuda.synchronize()

        elif self.backend == 'cuda':
            self.stream.synchronize()

    def warmup(self, blob, n):
        """Call the model ``n`` times on ``blob`` and discard the results."""
        for _ in range(n):
            _ = self(blob)

In [30]:
import os
import glob

import torch
import torch.utils.data as data
import torchvision
import torchvision.transforms as T
import torchvision.transforms.functional as F


class ToTensor(T.ToTensor):
    """``T.ToTensor`` variant that leaves inputs which are already tensors untouched."""

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, pic):
        """Return ``pic`` unchanged when it is a ``torch.Tensor``; otherwise defer to the parent."""
        already_tensor = isinstance(pic, torch.Tensor)
        return pic if already_tensor else super().__call__(pic)


class PadToSize(T.Pad):
    """Pad an image up to a target ``(width, height)``.

    The difference between ``size`` and the image size is passed as the last
    two entries of the padding tuple; the first two entries are always 0.
    """

    def __init__(self, size, fill=0, padding_mode='constant'):
        # The parent is initialised with zero padding; the real amount is
        # computed per image in __call__.
        super().__init__(0, fill, padding_mode)
        self.size = size
        self.fill = fill

    def __call__(self, img):
        """
        Args:
            img (PIL Image or Tensor): Image to be padded.

        Returns:
            PIL Image or Tensor: Padded image.
        """
        cur_w, cur_h = F.get_image_size(img)
        target_w, target_h = self.size[0], self.size[1]
        extra_w = target_w - cur_w
        extra_h = target_h - cur_h
        return F.pad(img, (0, 0, extra_w, extra_h), self.fill, self.padding_mode)


class Dataset(data.Dataset):
    """Folder-of-JPEGs dataset that yields ready-to-run input dicts for the model.

    Each item is a dict with the keys ``'image'``, ``'im_shape'``,
    ``'scale_factor'`` and ``'orig_size'``. Values are numpy arrays when
    ``backend == 'cuda'`` and torch tensors otherwise.
    """

    def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0', backend='torch') -> None:
        """
        Args:
            img_dir: directory searched (non-recursively) for ``*.jpg`` files.
            preprocess: transform applied to the decoded image; when ``None`` a
                resize + pad-to-square + float conversion pipeline is used.
            device: device the decoded image is moved to before preprocessing.
            backend: ``'cuda'`` for numpy outputs, anything else for torch outputs.
        """
        super().__init__()

        self.device = device
        self.backend = backend
        self.size = 640

        # glob yields paths in arbitrary, filesystem-dependent order; sorting makes
        # the index -> file mapping reproducible across runs and machines.
        self.im_path_list = sorted(glob.glob(os.path.join(img_dir, '*.jpg')))

        if preprocess is None:
            self.preprocess = T.Compose([
                    # `size` is passed one below `max_size`, so the values are 639 / 640.
                    T.Resize(size=self.size - 1, max_size=self.size),
                    PadToSize(size=(self.size, self.size), fill=114),
                    ToTensor(),
                    T.ConvertImageDtype(torch.float),
            ])
        else:
            self.preprocess = preprocess

    def __len__(self, ):
        """Number of ``*.jpg`` files found in ``img_dir``."""
        return len(self.im_path_list)


    def __getitem__(self, index):
        """Read, decode and preprocess image ``index`` and build the model input dict.

        Returns:
            dict with ``'image'`` (preprocessed image), ``'im_shape'``
            (``[size, size]``), ``'scale_factor'`` (``[h / size, w / size]`` of
            the decoded image) and ``'orig_size'`` (``[w, h]``).
        """
        im = torchvision.io.read_file(self.im_path_list[index])
        im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB).to(self.device)
        _, h, w = im.shape # c,h,w

        im = self.preprocess(im)
        if self.backend == 'cuda':  # numpy input
            blob = {
            'image': im.cpu().numpy(),
            'im_shape': np.array([self.size, self.size]),
            'scale_factor': np.array([h / self.size, w / self.size]),
            'orig_size': np.array([w, h]),
            }
        else:
            blob = {
            'image': im,
            'im_shape': torch.tensor([self.size, self.size]).to(im.device),
            'scale_factor': torch.tensor([h / self.size, w / self.size]).to(im.device),
            'orig_size': torch.tensor([w, h]).to(im.device),
            }
        return blob


    @staticmethod
    def post_process():
        """Placeholder; not implemented."""
        pass

    @staticmethod
    def collate_fn():
        """Placeholder; not implemented."""
        pass

In [31]:
# Build the TensorRT wrapper; the mapping renames the engine's raw output
# tensor names to the keys used later in the notebook.
model = TRTInference(
    'rtdetr_r50vd_6x_coco.trt',
    ouput_names_mapping={
        'tile_3.tmp_0': 'bbox_num',
        'reshape2_83.tmp_0': 'bbox',
    },
    backend='torch',
    verbose=True,
)

# Dataset over every *.jpg in the current directory.
# NOTE: this rebinds `data`, which the dataset cell imported as an alias for torch.utils.data.
data = Dataset(img_dir='.', backend='torch')

In [33]:
# Inspect the first sample: a dict with 'image', 'im_shape', 'scale_factor' and 'orig_size' entries.
data[0]

{'image': tensor([[[0.4471, 0.4471, 0.4510,  ..., 0.8588, 0.8627, 0.8588],
          [0.4588, 0.4588, 0.4471,  ..., 0.8588, 0.8588, 0.8627],
          [0.4588, 0.4667, 0.4549,  ..., 0.8627, 0.8588, 0.8627],
          ...,
          [0.4471, 0.4471, 0.4471,  ..., 0.4471, 0.4471, 0.4471],
          [0.4471, 0.4471, 0.4471,  ..., 0.4471, 0.4471, 0.4471],
          [0.4471, 0.4471, 0.4471,  ..., 0.4471, 0.4471, 0.4471]],
 
         [[0.6745, 0.6902, 0.6824,  ..., 0.8784, 0.8824, 0.8784],
          [0.6863, 0.6863, 0.6784,  ..., 0.8784, 0.8784, 0.8784],
          [0.6824, 0.6824, 0.6784,  ..., 0.8784, 0.8784, 0.8784],
          ...,
          [0.4471, 0.4471, 0.4471,  ..., 0.4471, 0.4471, 0.4471],
          [0.4471, 0.4471, 0.4471,  ..., 0.4471, 0.4471, 0.4471],
          [0.4471, 0.4471, 0.4471,  ..., 0.4471, 0.4471, 0.4471]],
 
         [[0.9922, 0.9961, 0.9961,  ..., 0.8510, 0.8549, 0.8510],
          [0.9961, 0.9961, 0.9922,  ..., 0.8510, 0.8510, 0.8510],
          [0.9961, 0.9961, 0.99

In [34]:
# Read, decode and preprocess the image once and reuse it, instead of
# re-evaluating data[0] for the warm-up and again for the real call.
sample = data[0]
model.warmup(sample, 100)  # 100 warm-up forward passes
result = model(sample)
if model.backend == 'cuda':
    model.context.pop()  # clears CUDA context; to use the model again, reinitialize the model instance

In [39]:
# bbox_num: number of bbox in each image
# bbox: shape=[bbox_num, 6], 6 is [class_id, score, x1, y1, x2, y2]
# Keep the row indices of detections whose score (column 1) is not NaN.
scores = result['bbox'][:, 1]
has_score = ~torch.isnan(scores)
indices = torch.where(has_score)[0]
indices

torch.Size([300, 6])