In [1]:
import numpy as np
import time
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import IPython.display
from ais_bench.infer.interface import InferSession
import onnxruntime as rt
from det_utils import letterbox, scale_coords, nms

## 前处理代码

In [2]:
def resize_image(image, size, letterbox_image):
    """
        对输入图像进行resize
    Args:
        size:目标尺寸
        letterbox_image: bool 是否进行letterbox变换
    Returns:指定尺寸的图像
    """
    ih, iw, _ = image.shape
    print(ih, iw)
    h, w = size
    # letterbox_image = False
    if letterbox_image:
        scale = min(w/iw, h/ih)
        nw = int(iw*scale)
        nh = int(ih*scale)
        image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_LINEAR)
        # cv2.imshow("img", img)
        # cv2.waitKey()
        # print(image.shape)
        # 生成画布
        image_back = np.ones((h, w, 3), dtype=np.uint8) * 128
        # 将image放在画布中心区域-letterbox
        image_back[(h-nh)//2: (h-nh)//2 + nh, (w-nw)//2:(w-nw)//2+nw , :] = image
    else:
        image_back = image
        # cv2.imshow("img", image_back)
        # cv2.waitKey()
    return image_back  

In [3]:
def img2input(img):
    img = np.transpose(img, (2, 0, 1))
    img = img/255
    return np.expand_dims(img, axis=0).astype(np.float32)

In [4]:
def xywh2xyxy(*box):
    """
    将xywh转换为左上角点和左下角点
    Args:
        box:
    Returns: x1y1x2y2
    """
    ret = [box[0] - box[2] // 2, box[1] - box[3] // 2, \
          box[0] + box[2] // 2, box[1] + box[3] // 2]
    return ret

In [5]:
def cod_trf(result, pre, after):
    """
    因为预测框是在经过letterbox后的图像上做预测所以需要将预测框的坐标映射回原图像上
    Args:
        result:  [x,y,w,h,conf(最大类别概率), x1,y1,conf1, x2,y2,conf2, ..., x21,y21,conf21]
        pre:    原尺寸图像
        after:  经过letterbox处理后的图像
    Returns: 坐标变换后的结果,
    """
    res = np.array(result)
    # 提取预测框的坐标和置信度
    x, y, w, h, conf = res[:5]
    # 提取关键点的坐标和置信度
    keypoints = res[5:].reshape(-1, 3)
    
    # 将[x, y, w, h]转换为[x1, y1, x2, y2]
    x1, y1, x2, y2 = xywh2xyxy(x, y, w, h)
    
    # 获取原图像和经过letterbox处理后的图像的尺寸
    h_pre, w_pre, _ = pre.shape
    h_after, w_after, _ = after.shape
    
    # 计算缩放比例和平移量
    scale = max(w_pre/w_after, h_pre/h_after)
    h_pre, w_pre = h_pre/scale, w_pre/scale
    x_move, y_move = abs(w_pre-w_after)//2, abs(h_pre-h_after)//2
    
    # 变换预测框的坐标
    ret_x1, ret_x2 = (x1 - x_move) * scale, (x2 - x_move) * scale
    ret_y1, ret_y2 = (y1 - y_move) * scale, (y2 - y_move) * scale
    ret_box = [int(ret_x1), int(ret_y1), int(ret_x2), int(ret_y2), conf]
    
    # 变换关键点的坐标
    ret_keypoints = []
    for kp in keypoints:
        kp_x, kp_y, kp_conf = kp
        ret_kp_x = (kp_x - x_move) * scale
        ret_kp_y = (kp_y - y_move) * scale
        ret_keypoints.extend([int(ret_kp_x), int(ret_kp_y), kp_conf])
    
    # 组合预测框和关键点的结果
    ret = np.array(ret_box + ret_keypoints)
    return ret

# 假设 pre 和 after 是已经定义的图像
# highest_conf_pred = np.array(...)  # 您的68维numpy数组
# transformed_result = cod_trf(highest_conf_pred, pre, after)

In [6]:
def draw(res, image, color):
    """
    将预测框和关键点绘制在image上
    Args:
        res: 预测框数据
        image: 原图
    Returns:
        image: 绘制了预测框和关键点的图像
    """
    # 假设 res 的结构是 [x1, y1, x2, y2, conf, x1_kp, y1_kp, conf1_kp, ..., x21_kp, y21_kp, conf21_kp]
    # 其中前五个元素是预测框的坐标和置信度，后面是21个关键点的坐标和置信度
    
    # 提取预测框的坐标和置信度
    x1, y1, x2, y2, conf = res[:5]
    
    if conf < Conf_threshold:
        return image
    # 画框 后两个参数：颜色和线宽
    image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 5)
    
    # 计算预测框的长宽
    h, w = int(y2) - int(y1), int(x2) - int(x1)
    
    # 计算字体大小（随框大小调整）

    
    # 绘制文本（这里我们使用分数作为文本，因为没有类别名称）
    text = "{:.2f}".format(float(conf))
    
    # 绘制文本
    image = cv2.putText(image, text, (max(10, int(x1)), max(10, int(y1) - 5)), cv2.FONT_HERSHEY_COMPLEX, Rec_fontsize, (0, 255, 0), Rec_thickness)  # 绿色文本
    
    # 提取关键点的坐标和置信度
    keypoints = res[5:].reshape(-1, 3)
    
    # 绘制关键点
    for i, kp in enumerate(keypoints):
        kp_x, kp_y, kp_conf = kp
        # color
        kpt_color = color[i]['color']
        # 画关键点：图片、XY坐标、半径、颜色、线宽（-1为填充）
        image = cv2.circle(image, (int(kp_x), int(kp_y)), 5, kpt_color, -1)
        
        # 标注关键点编号 字体大小 字体颜色 线宽
        image = cv2.putText(image, str(i), (int(kp_x) + 5, int(kp_y) + 5), cv2.FONT_HERSHEY_SIMPLEX, Kpt_fontsize, kpt_color, Kpt_thickness)
    
    return image

# 假设 bbox_res 是经过 cod_trf 函数处理后的数据
# img = cv2.imread('DSC_5384.jpg')
# drawn_image = draw(bbox_res, img)


In [7]:
# 关键点的名称，id，颜色
keypoint_info = {
    0: {'name': '肾上腺', 'id': 0, 'color': [101, 205, 228]},
    1: {'name': '耳尖', 'id': 1, 'color': [240, 128, 128]},
    2: {'name': '胃', 'id': 2, 'color': [154, 205, 50]},
    3: {'name': '眼', 'id': 3, 'color': [34, 139, 34]},
    4: {'name': '口', 'id': 4, 'color': [139, 0, 0]},
    5: {'name': '肝', 'id': 5, 'color': [255, 165, 0]},
    6: {'name': '对屏尖', 'id': 6, 'color': [255, 0, 255]},
    7: {'name': '心', 'id': 7, 'color': [255, 255, 0]},
    8: {'name': '肺', 'id': 8, 'color': [29, 123,243]},
    9: {'name': '肺2', 'id': 9, 'color': [0, 255, 255]},
    10: {'name': '膀胱', 'id': 10, 'color': [128, 0, 128]},
    11: {'name': '脾', 'id': 11, 'color': [74, 181, 57]},
    12: {'name': '角窝中', 'id': 12, 'color': [165, 42, 42]},
    13: {'name': '神门', 'id': 13, 'color': [128, 128, 0]},
    14: {'name': '肾', 'id': 14, 'color': [255, 0, 0]},
    15: {'name': '耳门', 'id': 15, 'color': [34, 139, 34]},
    16: {'name': '听宫', 'id': 16, 'color': [255, 129, 0]},
    17: {'name': '听会', 'id': 17, 'color': [70, 130, 180]},
    18: {'name': '肩', 'id': 18, 'color': [63, 103,165]},
    19: {'name': '扁桃体', 'id': 19, 'color': [66, 77, 229]},
    20: {'name': '腰骶椎', 'id': 20, 'color': [255, 105, 180]}
}


In [8]:
# 绘制FPS参数

# FPS的坐标位置（左上角）
FPS_offset = (25, 60)

# FPS的字体大小
FPS_fontsize = 3

# FPS的线宽
FPS_thickness = 3

# FPS的颜色
FPS_color = (255, 0, 255)

# 绘制检测框参数

Rec_fontsize = 3

Rec_thickness = 3


# 绘制关键点参数

Kpt_fontsize = 3

Kpt_thickness = 3

# 检测置信度（设置置信度阈值，如果检测出来的框的置信度低于该阈值，则不显示）

Conf_threshold = 0.5

## 推理

### 图像加载

In [9]:
image = cv2.imread('DSC_5384.jpg')

In [10]:
# 图片resize
img_after = resize_image(image, (640, 640), True)

# 将图像处理成输入的格式
data = img2input(img_after)

3712 5568


In [11]:
data.shape

(1, 3, 640, 640)

### 1.onnx推理

In [12]:
model_path_onnx = 'best.onnx'

In [13]:
# 输入模型
model_onnx = rt.InferenceSession('best.onnx')  # yolov8模型onnx格式
input_name = model_onnx.get_inputs()[0].name
label_name = model_onnx.get_outputs()[0].name
output_onnx = model_onnx.run([label_name], {input_name: data})[0]  

[1;31m2024-05-21 14:31:12.694566425 [E:onnxruntime:Default, env.cc:228 ThreadMain] pthread_setaffinity_np failed for thread: 15083, index: 2, mask: {3, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.[m


In [87]:
input_format_onnx = model_onnx.get_inputs()[0].shape
print(input_format_onnx)

[1, 3, 640, 640]


In [14]:
output_onnx.shape

(1, 68, 8400)

In [15]:
# 将模型输出调整为 (8400, 68) 形状，以便于处理
output_onnx_2 = output_onnx.squeeze().transpose((1, 0))
output_onnx_2.shape

(8400, 68)

In [18]:
confidences_onnx = output_onnx_2[:, 4]
max_conf_index_onnx = np.argmax(confidences_onnx)

In [19]:
max_conf_index_onnx

8191

In [22]:
highest_conf_pred_onnx = output_onnx_2[max_conf_index_onnx]
highest_conf_pred_onnx.shape

(68,)

推理结果解读：highest_conf_pred_onnx是长为68的一维数组，其中前5个数据是框的坐标及置信度：(x1, y1) (x2, y2) conf  
后63个输入是21*3(21个关键点，每个关键点都是(x, y, conf)的形式)

In [23]:
highest_conf_pred_onnx

array([358.81714   , 304.8056    , 164.8894    , 281.01233   ,
         0.9364102 , 308.37076   , 334.44568   ,   0.9648454 ,
       357.027     , 187.1751    ,   0.8896568 , 359.2514    ,
       308.93823   ,   0.98701835, 326.8548    , 391.9261    ,
         0.926921  , 320.68762   , 303.62936   ,   0.98595   ,
       365.13074   , 294.82727   ,   0.9863063 , 338.59222   ,
       344.29416   ,   0.9753784 , 330.05737   , 336.07306   ,
         0.98130774, 343.1875    , 332.04938   ,   0.98469883,
       327.50018   , 347.45752   ,   0.9713981 , 324.77847   ,
       260.68372   ,   0.9726881 , 365.33713   , 323.09622   ,
         0.9831027 , 331.59705   , 237.72813   ,   0.9545015 ,
       347.08047   , 238.52997   ,   0.9551488 , 335.76587   ,
       272.6271    ,   0.98179895, 285.73486   , 279.4752    ,
         0.9445071 , 279.20816   , 334.70853   ,   0.93409157,
       279.62207   , 391.72958   ,   0.8845742 , 388.25107   ,
       326.4947    ,   0.96180403, 326.2549    , 411.81

### 2.om推理

In [24]:
model_path_om = 'yolov8n-2.om'

In [26]:
model_om = InferSession(0, model_path_om)

In [79]:
model_om.get_inputs()[0].format

0

In [35]:
output_om = model_om.infer(data)[0]

In [37]:
output_om.shape

(1, 68, 8400)

In [38]:
# 将模型输出调整为 (8400, 68) 形状，以便于处理
output_om_2 = output_om.squeeze().transpose((1, 0))
output_om_2.shape

(8400, 68)

In [68]:
confidences_om = output_om_2[:, 4]
max_conf_index_om = np.argmax(confidences_om)

In [69]:
max_conf_index_om

8180

In [74]:
highest_conf_pred_om = output_om_2[max_conf_index_om]
highest_conf_pred_om.shape

(68,)

In [75]:
highest_conf_pred_om

array([ 1.87187500e+01,  3.23500000e+02,  3.71875000e+01,  5.93000000e+02,
        1.64222717e-03, -1.13750000e+01,  3.68500000e+02,  2.78930664e-02,
        6.61328125e+00,  1.90625000e+02,  3.82385254e-02, -1.00625000e+01,
        3.25000000e+02,  1.01699829e-02, -1.30312500e+01,  4.29000000e+02,
        2.90222168e-02, -1.23359375e+01,  3.32250000e+02,  1.16882324e-02,
       -7.11328125e+00,  3.05000000e+02,  1.10778809e-02, -1.30156250e+01,
        3.71500000e+02,  9.15527344e-03, -1.41015625e+01,  3.63750000e+02,
        7.87353516e-03, -1.31093750e+01,  3.60250000e+02,  6.77490234e-03,
       -1.67500000e+01,  3.76750000e+02,  1.04446411e-02, -5.21093750e+00,
        2.79250000e+02,  1.76391602e-02, -1.06406250e+01,  3.39500000e+02,
        1.20544434e-02, -1.00312500e+01,  2.44375000e+02,  2.31933594e-02,
        1.83593750e+00,  2.38750000e+02,  2.37731934e-02, -8.71875000e+00,
        2.82750000e+02,  1.14135742e-02,  5.19921875e+00,  2.97000000e+02,
        7.25097656e-02,  