In [1]:
import pycuda.autoinit  #负责数据初始化，内存管理，销毁等
import pycuda.driver as cuda  #GPU CPU之间的数据传输

import numpy as np

In [2]:
import tensorrt as trt  # https://developer.nvidia.com/nvidia-tensorrt-download
from collections import OrderedDict, namedtuple

w = 'D:/Nvidia/TensorRT/bin/fire0808.engine'
Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
logger = trt.Logger(trt.Logger.INFO)


In [3]:
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
    model = runtime.deserialize_cuda_engine(f.read())
    

In [4]:
print(model)

<tensorrt.tensorrt.ICudaEngine object at 0x000001B987152470>


In [5]:
context = model.create_execution_context()
bindings = OrderedDict()
output_names = []
fp16 = False  # default updated below
dynamic = False
for i in range(model.num_bindings):
    name = model.get_binding_name(i)
    dtype = trt.nptype(model.get_binding_dtype(i))
    if model.binding_is_input(i):
        if -1 in tuple(model.get_binding_shape(i)):  # dynamic
            dynamic = True
            context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
        if dtype == np.float16:
            fp16 = True
    else:  # output
        output_names.append(name)
    shape = tuple(context.get_binding_shape(i))
    im = np.empty(shape, dtype=dtype)
    bindings[name] = Binding(name, dtype, shape)
batch_size = bindings['images'].shape[0]  # if dynamic, this is instead max batch size

  name = model.get_binding_name(i)
  dtype = trt.nptype(model.get_binding_dtype(i))
  if model.binding_is_input(i):
  if -1 in tuple(model.get_binding_shape(i)):  # dynamic
  context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
  context.set_binding_shape(i, tuple(model.get_profile_shape(0, i)[2]))
  shape = tuple(context.get_binding_shape(i))


In [6]:
bindings

OrderedDict([('images',
              Binding(name='images', dtype=<class 'numpy.float32'>, shape=(1, 3, 2048, 2048))),
             ('output0',
              Binding(name='output0', dtype=<class 'numpy.float32'>, shape=(1, 6, 86016)))])

In [7]:
import cv2
img = cv2.imread('./test.png')
img0 = cv2.imread('./test.png')

# img = np.ones((1, 3, 1024, 1024), dtype=np.float32)


In [8]:
img.shape

(1024, 1024, 3)

In [15]:
img = img.reshape((1,3,1024,1024))
img.shape

(1, 3, 1024, 1024)

In [16]:
# cv2.imshow("img", img)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

In [17]:
if dynamic and img.shape != bindings['images'].shape:
    i = model.get_binding_index('images')
    context.set_binding_shape(i, img.shape)  # reshape if dynamic
    bindings['images'] = bindings['images']._replace(shape=context.get_binding_shape(i))
    for name in output_names:
        i = model.get_binding_index(name)
        bindings[name] = bindings[name]._replace(shape=context.get_binding_shape(i))
        #bindings[name].data.resize_(tuple(context.get_binding_shape(i)))

  i = model.get_binding_index('images')
  context.set_binding_shape(i, img.shape)  # reshape if dynamic
  bindings['images'] = bindings['images']._replace(shape=context.get_binding_shape(i))
  i = model.get_binding_index(name)
  bindings[name] = bindings[name]._replace(shape=context.get_binding_shape(i))


In [18]:
bindings

OrderedDict([('images',
              Binding(name='images', dtype=<class 'numpy.float32'>, shape=(1, 3, 1024, 1024))),
             ('output0',
              Binding(name='output0', dtype=<class 'numpy.float32'>, shape=(1, 6, 21504)))])

In [19]:
h_input = cuda.pagelocked_empty(trt.volume(bindings['images'].shape), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(bindings['output0'].shape), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)

In [20]:
h_input[:] = img.flatten()

In [21]:
h_input.shape

(3145728,)

In [22]:
stream = cuda.Stream()

In [23]:
cuda.memcpy_htod_async(d_input, h_input, stream)

In [24]:
context.execute_v2(bindings=[int(d_input), int(d_output)])

True

In [25]:
cuda.memcpy_dtoh_async(h_output, d_output, stream)
# Synchronize the stream
stream.synchronize()

In [26]:
h_output.shape

(129024,)

In [27]:
pred = h_output.reshape((21504, 6))

In [28]:
pred.shape

(21504, 6)

In [29]:
pred

array([[8.98131561e+00, 1.69300346e+01, 1.61375771e+01, 3.04497414e+01,
        3.57867813e+01, 4.22653923e+01],
       [5.47671967e+01, 6.18253822e+01, 6.65333099e+01, 7.96277771e+01,
        8.77415466e+01, 8.99921341e+01],
       [9.73884430e+01, 1.08479965e+02, 1.19422745e+02, 1.23437210e+02,
        1.30983429e+02, 1.43108612e+02],
       ...,
       [7.59959221e-06, 1.84774399e-05, 1.12950802e-05, 2.65240669e-06,
        5.96046448e-07, 3.87430191e-07],
       [5.96046448e-07, 2.68220901e-07, 1.19209290e-07, 2.98023224e-07,
        3.18884850e-06, 3.24845314e-06],
       [8.94069672e-08, 1.49011612e-07, 9.83476639e-07, 2.47359276e-06,
        5.06639481e-07, 1.13248825e-06]], dtype=float32)

In [30]:
scores = pred[:, 4]

In [31]:
scores.shape

(21504,)

In [32]:
import numpy as np
import random
import cv2

def non_max_suppress(predict, iou_threshold):
    
    bbox_array = np.array(predict, dtype=np.float32)
    #下面分别获取框的左上角坐标（x1，y1），右下角坐标（x2，y2）及此框的置信度；这里需要注意的是图像左上角可以看做坐标点（0,0），右下角可以看做坐标点（1,1），也就是说从左往右x值增大，从上往下y值增大
    x1 = bbox_array[:, 0]
    y1 = bbox_array[:, 1]
    x2 = bbox_array[:, 2]
    y2 = bbox_array[:, 3]
    scores = bbox_array[:, 4]
    order = scores.argsort()[::-1]  # argsort函数返回的是数组值从小到大的索引值,[::-1]表示取反。即这里返回的是数组值从大到小的索引值
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)  # 当前类所有框的面积(python会自动使用广播机制，相当于MATLAB中的.*即两矩阵对应元素相乘)；x1=3,x2=5,习惯上计算x方向长度就是x=3、4、5这三个像素，即5-3+1=3，而不是5-3=2，所以需要加1
    keep = []
    
    #按confidence从高到低遍历bbx，移除所有与该矩形框的IoU值大于threshold的矩形框
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # 保留当前最大confidence对应的bbx索引
        # 获取所有与当前bbx的交集对应的左上角和右下角坐标，并计算IoU（注意这里是同时计算一个bbx与其他所有bbx的IoU）
        xx1 = np.maximum(x1[i], x1[order[1:]])#最大置信度的左上角坐标分别与剩余所有的框的左上角坐标进行比较，分别保存较大值；因此这里的xx1的维数应该是当前类的框的个数减1
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2-xx1+1) * np.maximum(0.0, yy2-yy1+1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)#注意这里都是采用广播机制，同时计算了置信度最高的框与其余框的IoU
        inds = np.where(iou <= iou_threshold)[0]#保留iou小于等于阙值的框的索引值
        order = order[inds + 1]  # 将order中的第inds+1处的值重新赋值给order；即更新保留下来的索引，加1是因为因为没有计算与自身的IOU，所以索引相差１，需要加上
    bbox = bbox_array[keep]
	#predicts_dict = predicts_dict
    return bbox
    

In [33]:
res = non_max_suppress(pred, iou_threshold=0.1)

In [34]:
res.shape

(7546, 6)

In [35]:
# img_cp = np.zeros((1024,1024), np.uint8)
predict = non_max_suppress(pred, 0.1)
predict.shape

(7546, 6)

In [38]:
for box in predict:
    # print(box)
    x1, y1, x2, y2, score = int(box[0]), int(box[1]), int(box[2]), int(box[3]), box[4]
    y_text = int(random.uniform(y1, y2))# uniform()是不能直接访问的，需要导入 random 模块，然后通过 random 静态对象调用该方法。uniform() 方法将随机生成下一个实数，它在 [x, y) 范围内
cv2.rectangle(img0, (x1, y1), (x2, y2), (0, 255, 255), 2)
cv2.putText(img0, str(score), (x2 - 30, y_text), 2, 1, (255, 255, 0))
cv2.namedWindow("black1_nms")# 创建一个显示图像的窗口
cv2.imshow("black1_nms", img0)# 在窗口中显示图像;注意这里的窗口名字如果不是刚刚创建的窗口的名字则会自动创建一个新的窗口并将图像显示在这个窗口
cv2.waitKey(0)# 如果不添这一句，在IDLE中执行窗口直接无响应。在命令行中执行的话，则是一闪而过。
cv2.destroyAllWindows()  # 最后释放窗口是个好习惯！

In [None]:
 if self.dynamic and im.shape != self.bindings['images'].shape:
    i = self.model.get_binding_index('images')
    self.context.set_binding_shape(i, im.shape)  # reshape if dynamic
    self.bindings['images'] = self.bindings['images']._replace(shape=im.shape)
    for name in self.output_names:
        i = self.model.get_binding_index(name)
        self.bindings[name].data.resize_(tuple(self.context.get_binding_shape(i)))
s = self.bindings['images'].shape
assert im.shape == s, f"input size {im.shape} {'>' if self.dynamic else 'not equal to'} max model size {s}"
self.binding_addrs['images'] = int(im.data_ptr())
self.context.execute_v2(list(self.binding_addrs.values()))
y = [self.bindings[x].data for x in sorted(self.output_names)]