# 样例介绍
* YOLOv5是一种单阶段目标检测算法，在这个样例中，我们选取了YOLOv5s，它是YOLOv5系列中较为轻量的网络，适合在边缘设备部署，进行实时目标检测。

# 前期准备
* 基础镜像的样例目录中已包含转换后的om模型以及测试图片，如果直接运行，可跳过此步骤。如果需要重新转换模型，可以参考下面的步骤。
* 首先我们可以在[这个链接](https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Atlas%20200I%20DK%20A2/DevKit/downloads/23.0.RC1/Ascend-devkit_23.0.RC1_downloads.xlsx)的表格中找到本样例的依赖文件，下载我们已经准备好了的ONNX模型，ONNX是开源的离线推理模型框架。

* 为了能进一步优化模型推理性能，我们需要将其转换为om模型进行使用，以下为转换指令：  
    ```shell
    atc --model=yolov5s.onnx --framework=5 --output=yolo --input_format=NCHW --input_shape="input_image:1,3,640,640" --log=error --soc_version=Ascend310B1
    ```
    * 其中转换参数的含义为：  
        * --model：输入模型路径
        * --framework：原始网络模型框架类型，5表示ONNX
        * --output：输出模型路径
        * --input_format：输入Tensor的内存排列方式
        * --input_shape：指定模型输入数据的shape
        * --log：日志级别
        * --soc_version：昇腾AI处理器型号
        * --input_fp16_nodes：指定输入数据类型为FP16的输入节点名称
        * --output_type：指定网络输出数据类型或指定某个输出节点的输出类型

# 模型推理实现

In [1]:
# 导入代码依赖
import cv2
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import torch
from skvideo.io import vreader, FFmpegWriter
import IPython.display
from ais_bench.infer.interface import InferSession

from det_utils import letterbox, scale_coords, nms

In [2]:
def preprocess_image(image, cfg, bgr2rgb=True):
    """图片预处理"""
    img, scale_ratio, pad_size = letterbox(image, new_shape=cfg['input_shape'])
    if bgr2rgb:
        img = img[:, :, ::-1]
    img = img.transpose(2, 0, 1)  # HWC2CHW
    img = np.ascontiguousarray(img, dtype=np.float32)
    return img, scale_ratio, pad_size


def draw_bbox(bbox, img0, color, wt, names):
    """在图片上画预测框"""
    det_result_str = ''
    for idx, class_id in enumerate(bbox[:, 5]):
        if float(bbox[idx][4] < float(0.05)):
            continue
        img0 = cv2.rectangle(img0, (int(bbox[idx][0]), int(bbox[idx][1])), (int(bbox[idx][2]), int(bbox[idx][3])),
                             color, wt)
        img0 = cv2.putText(img0, str(idx) + ' ' + names[int(class_id)], (int(bbox[idx][0]), int(bbox[idx][1] + 16)),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
        img0 = cv2.putText(img0, '{:.4f}'.format(bbox[idx][4]), (int(bbox[idx][0]), int(bbox[idx][1] + 32)),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
        det_result_str += '{} {} {} {} {} {}\n'.format(
            names[bbox[idx][5]], str(bbox[idx][4]), bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3])
    return img0


def get_labels_from_txt(path):
    """从txt文件获取图片标签"""
    labels_dict = dict()
    with open(path) as f:
        for cat_id, label in enumerate(f.readlines()):
            labels_dict[cat_id] = label.strip()
    return labels_dict


def draw_prediction(pred, image, labels):
    """在图片上画出预测框并进行可视化展示"""
    imgbox = widgets.Image(format='jpg', height=720, width=1280)
    img_dw = draw_bbox(pred, image, (0, 255, 0), 2, labels)
    imgbox.value = cv2.imencode('.jpg', img_dw)[1].tobytes()
    display(imgbox)


def infer_image(img_path, model, class_names, cfg):
    """图片推理"""
    # 图片载入
    image = cv2.imread(img_path)
    # 数据预处理
    img, scale_ratio, pad_size = preprocess_image(image, cfg)
    # 模型推理
    output = model.infer([img])[0]

    output = torch.tensor(output)
    # 非极大值抑制后处理
    boxout = nms(output, conf_thres=cfg["conf_thres"], iou_thres=cfg["iou_thres"])
    pred_all = boxout[0].numpy()
    # 预测坐标转换
    scale_coords(cfg['input_shape'], pred_all[:, :4], image.shape, ratio_pad=(scale_ratio, pad_size))
    # 图片预测结果可视化
    draw_prediction(pred_all, image, class_names)


def infer_frame_with_vis(image, model, labels_dict, cfg, bgr2rgb=True):
    # 数据预处理
    img, scale_ratio, pad_size = preprocess_image(image, cfg, bgr2rgb)
    # 模型推理
    output = model.infer([img])[0]

    output = torch.tensor(output)
    # 非极大值抑制后处理
    boxout = nms(output, conf_thres=cfg["conf_thres"], iou_thres=cfg["iou_thres"])
    pred_all = boxout[0].numpy()
    # 预测坐标转换
    scale_coords(cfg['input_shape'], pred_all[:, :4], image.shape, ratio_pad=(scale_ratio, pad_size))
    # 图片预测结果可视化
    img_vis = draw_bbox(pred_all, image, (0, 255, 0), 2, labels_dict)
    return img_vis


def img2bytes(image):
    """将图片转换为字节码"""
    return bytes(cv2.imencode('.jpg', image)[1])


def infer_video(video_path, model, labels_dict, cfg):
    """视频推理"""
    image_widget = widgets.Image(format='jpeg', width=800, height=600)
    display(image_widget)

    # 读入视频
    cap = cv2.VideoCapture(video_path)
    while True:
        ret, img_frame = cap.read()
        if not ret:
            break
        # 对视频帧进行推理
        image_pred = infer_frame_with_vis(img_frame, model, labels_dict, cfg, bgr2rgb=True)
        image_widget.value = img2bytes(image_pred)


def infer_camera(model, labels_dict, cfg):
    """外设摄像头实时推理"""
    def find_camera_index():
        max_index_to_check = 10  # Maximum index to check for camera

        for index in range(max_index_to_check):
            cap = cv2.VideoCapture(index)
            if cap.read()[0]:
                cap.release()
                return index

        # If no camera is found
        raise ValueError("No camera found.")

    # 获取摄像头
    camera_index = find_camera_index()
    cap = cv2.VideoCapture(camera_index)
    # 初始化可视化对象
    image_widget = widgets.Image(format='jpeg', width=1280, height=720)
    display(image_widget)
    while True:
        # 对摄像头每一帧进行推理和可视化
        _, img_frame = cap.read()
        image_pred = infer_frame_with_vis(img_frame, model, labels_dict, cfg)
        image_widget.value = img2bytes(image_pred)

In [24]:
# 前处理
def resize_image(image, size, letterbox_image):
    """
        对输入图像进行resize
    Args:
        size:目标尺寸
        letterbox_image: bool 是否进行letterbox变换
    Returns:指定尺寸的图像
    """
    ih, iw, _ = image.shape
    print(ih, iw)
    h, w = size
    # letterbox_image = False
    if letterbox_image:
        scale = min(w/iw, h/ih)
        nw = int(iw*scale)
        nh = int(ih*scale)
        image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_LINEAR)
        # cv2.imshow("img", img)
        # cv2.waitKey()
        # print(image.shape)
        # 生成画布
        image_back = np.ones((h, w, 3), dtype=np.uint8) * 128
        # 将image放在画布中心区域-letterbox
        image_back[(h-nh)//2: (h-nh)//2 + nh, (w-nw)//2:(w-nw)//2+nw , :] = image
    else:
        image_back = image
        # cv2.imshow("img", image_back)
        # cv2.waitKey()
    return image_back  

In [25]:
def img2input(img):
    img = np.transpose(img, (2, 0, 1))
    img = img/255
    return np.expand_dims(img, axis=0).astype(np.float32)

In [26]:
def xywh2xyxy(*box):
    """
    将xywh转换为左上角点和左下角点
    Args:
        box:
    Returns: x1y1x2y2
    """
    ret = [box[0] - box[2] // 2, box[1] - box[3] // 2, \
          box[0] + box[2] // 2, box[1] + box[3] // 2]
    return ret

In [27]:
def cod_trf(result, pre, after):
    """
    因为预测框是在经过letterbox后的图像上做预测所以需要将预测框的坐标映射回原图像上
    Args:
        result:  [x,y,w,h,conf(最大类别概率), x1,y1,conf1, x2,y2,conf2, ..., x21,y21,conf21]
        pre:    原尺寸图像
        after:  经过letterbox处理后的图像
    Returns: 坐标变换后的结果,
    """
    res = np.array(result)
    # 提取预测框的坐标和置信度
    x, y, w, h, conf = res[:5]
    # 提取关键点的坐标和置信度
    keypoints = res[5:].reshape(-1, 3)
    
    # 将[x, y, w, h]转换为[x1, y1, x2, y2]
    x1, y1, x2, y2 = xywh2xyxy(x, y, w, h)
    
    # 获取原图像和经过letterbox处理后的图像的尺寸
    h_pre, w_pre, _ = pre.shape
    h_after, w_after, _ = after.shape
    
    # 计算缩放比例和平移量
    scale = max(w_pre/w_after, h_pre/h_after)
    h_pre, w_pre = h_pre/scale, w_pre/scale
    x_move, y_move = abs(w_pre-w_after)//2, abs(h_pre-h_after)//2
    
    # 变换预测框的坐标
    ret_x1, ret_x2 = (x1 - x_move) * scale, (x2 - x_move) * scale
    ret_y1, ret_y2 = (y1 - y_move) * scale, (y2 - y_move) * scale
    ret_box = [int(ret_x1), int(ret_y1), int(ret_x2), int(ret_y2), conf]
    
    # 变换关键点的坐标
    ret_keypoints = []
    for kp in keypoints:
        kp_x, kp_y, kp_conf = kp
        ret_kp_x = (kp_x - x_move) * scale
        ret_kp_y = (kp_y - y_move) * scale
        ret_keypoints.extend([int(ret_kp_x), int(ret_kp_y), kp_conf])
    
    # 组合预测框和关键点的结果
    ret = np.array(ret_box + ret_keypoints)
    return ret

# 假设 pre 和 after 是已经定义的图像
# highest_conf_pred = np.array(...)  # 您的68维numpy数组
# transformed_result = cod_trf(highest_conf_pred, pre, after)

In [28]:
def draw(res, image, color):
    """
    将预测框和关键点绘制在image上
    Args:
        res: 预测框数据
        image: 原图
    Returns:
        image: 绘制了预测框和关键点的图像
    """
    # 假设 res 的结构是 [x1, y1, x2, y2, conf, x1_kp, y1_kp, conf1_kp, ..., x21_kp, y21_kp, conf21_kp]
    # 其中前五个元素是预测框的坐标和置信度，后面是21个关键点的坐标和置信度
    
    # 提取预测框的坐标和置信度
    x1, y1, x2, y2, conf = res[:5]
    
    # 画框 后两个参数：颜色和线宽
    image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 5)
    
    # 计算预测框的长宽
    h, w = int(y2) - int(y1), int(x2) - int(x1)
    
    # 计算字体大小（随框大小调整）
    font_size = min(h/640, w/640) * 3
    
    # 确保字体大小不小于3
    font_size = max(font_size, 3)
    
    # 绘制文本（这里我们使用分数作为文本，因为没有类别名称）
    text = "{:.2f}".format(float(conf))
    
    # 绘制文本
    image = cv2.putText(image, text, (max(10, int(x1)), max(10, int(y1) - 5)), cv2.FONT_HERSHEY_COMPLEX, font_size, (0, 255, 0), 10)  # 绿色文本
    
    # 提取关键点的坐标和置信度
    keypoints = res[5:].reshape(-1, 3)
    
    # 绘制关键点
    for i, kp in enumerate(keypoints):
        kp_x, kp_y, kp_conf = kp
        # color
        kpt_color = color[i]['color']
        # 画关键点：图片、XY坐标、半径、颜色、线宽（-1为填充）
        image = cv2.circle(image, (int(kp_x), int(kp_y)), 20, kpt_color, -1)
        
        # 标注关键点编号 字体大小 字体颜色 线宽
        image = cv2.putText(image, str(i), (int(kp_x) + 5, int(kp_y) + 5), cv2.FONT_HERSHEY_SIMPLEX, 5, kpt_color, 5)
    
    return image

# 假设 bbox_res 是经过 cod_trf 函数处理后的数据
# img = cv2.imread('DSC_5384.jpg')
# drawn_image = draw(bbox_res, img)


# 样例运行

* 初始化相关参数

In [29]:
cfg = {
    'conf_thres': 0.4,  # 模型置信度阈值，阈值越低，得到的预测框越多
    'iou_thres': 0.5,  # IOU阈值，高于这个阈值的重叠预测框会被过滤掉
    'input_shape': [640, 640],  # 模型输入尺寸
}

model_path = 'yolov8n.om'
label_path = './coco_names.txt'
# 初始化推理模型
model = InferSession(0, model_path)

In [31]:
img = cv2.imread('fch_ear.jpg')

In [33]:
std_h, std_w = 640, 640  # 标准输入尺寸

In [34]:
# 前处理
img_after = resize_image(img, (std_w, std_h), True)  # （640， 640， 3）

3557 2836


In [35]:
img_after.shape

(640, 640, 3)

In [36]:
# 将图像处理成输入的格式
data = img2input(img_after)

In [38]:
data.shape

(1, 3, 640, 640)

In [39]:
output = model.infer([data])[0]

In [41]:
output.shape

(1, 68, 8400)

In [42]:
# 将模型输出调整为 (8400, 68) 形状，以便于处理
pred_reshaped = output.squeeze().transpose((1, 0))
pred_reshaped.shape

(8400, 68)

In [43]:
confidences = pred_reshaped[:, 4]
max_conf_index = np.argmax(confidences)

In [44]:
max_conf_index

8262

In [45]:
highest_conf_pred = pred_reshaped[max_conf_index]
highest_conf_pred.shape

(68,)

In [46]:
highest_conf_pred

array([ 6.6750000e+01,  4.5600000e+02,  1.3550000e+02,  3.2700000e+02,
        2.2621155e-03, -3.9375000e+00,  4.2425000e+02,  3.6224365e-02,
        5.8437500e+00,  4.2825000e+02,  4.9755859e-01,  1.9968750e+01,
        4.5075000e+02,  3.7414551e-02,  1.7593750e+01,  4.2625000e+02,
        5.2825928e-02,  7.0625000e+00,  4.3450000e+02,  3.1799316e-02,
       -6.2500000e-01,  4.4375000e+02,  5.6762695e-02,  1.0343750e+01,
        4.3150000e+02,  2.0248413e-02,  1.6437500e+01,  4.3775000e+02,
        1.8615723e-02, -4.6875000e+00,  4.5450000e+02,  2.5939941e-02,
       -1.9375000e+00,  4.3125000e+02,  2.1789551e-02,  4.9062500e+00,
        4.5825000e+02,  7.2265625e-02,  1.5625000e-01,  4.6475000e+02,
        4.7851562e-02,  1.3343750e+01,  4.5250000e+02,  1.5234375e-01,
        1.4062500e+01,  4.6025000e+02,  1.4257812e-01,  3.8437500e+00,
        4.4250000e+02,  5.0292969e-02,  5.6250000e+00,  4.4050000e+02,
        1.1010742e-01,  2.1031250e+01,  4.1775000e+02,  7.5317383e-02,
      

In [47]:
bbox_res = cod_trf(highest_conf_pred, img, img_after)

In [48]:
bbox_res

array([-3.57000000e+02,  1.62800000e+03,  3.87000000e+02,  3.44000000e+03,
        2.26211548e-03, -3.77000000e+02,  2.35700000e+03,  3.62243652e-02,
       -3.23000000e+02,  2.38000000e+03,  4.97558594e-01, -2.44000000e+02,
        2.50500000e+03,  3.74145508e-02, -2.57000000e+02,  2.36900000e+03,
        5.28259277e-02, -3.16000000e+02,  2.41400000e+03,  3.17993164e-02,
       -3.59000000e+02,  2.46600000e+03,  5.67626953e-02, -2.98000000e+02,
        2.39800000e+03,  2.02484131e-02, -2.64000000e+02,  2.43200000e+03,
        1.86157227e-02, -3.81000000e+02,  2.52600000e+03,  2.59399414e-02,
       -3.66000000e+02,  2.39600000e+03,  2.17895508e-02, -3.28000000e+02,
        2.54600000e+03,  7.22656250e-02, -3.54000000e+02,  2.58200000e+03,
        4.78515625e-02, -2.81000000e+02,  2.51400000e+03,  1.52343750e-01,
       -2.77000000e+02,  2.55700000e+03,  1.42578125e-01, -3.34000000e+02,
        2.45900000e+03,  5.02929688e-02, -3.24000000e+02,  2.44800000e+03,
        1.10107422e-01, -

In [49]:
keypoint_info = {
    0: {'name': '肾上腺', 'id': 0, 'color': [101, 205, 228]},
    1: {'name': '耳尖', 'id': 1, 'color': [240, 128, 128]},
    2: {'name': '胃', 'id': 2, 'color': [154, 205, 50]},
    3: {'name': '眼', 'id': 3, 'color': [34, 139, 34]},
    4: {'name': '口', 'id': 4, 'color': [139, 0, 0]},
    5: {'name': '肝', 'id': 5, 'color': [255, 165, 0]},
    6: {'name': '对屏尖', 'id': 6, 'color': [255, 0, 255]},
    7: {'name': '心', 'id': 7, 'color': [255, 255, 0]},
    8: {'name': '肺', 'id': 8, 'color': [29, 123,243]},
    9: {'name': '肺2', 'id': 9, 'color': [0, 255, 255]},
    10: {'name': '膀胱', 'id': 10, 'color': [128, 0, 128]},
    11: {'name': '脾', 'id': 11, 'color': [74, 181, 57]},
    12: {'name': '角窝中', 'id': 12, 'color': [165, 42, 42]},
    13: {'name': '神门', 'id': 13, 'color': [128, 128, 0]},
    14: {'name': '肾', 'id': 14, 'color': [255, 0, 0]},
    15: {'name': '耳门', 'id': 15, 'color': [34, 139, 34]},
    16: {'name': '听宫', 'id': 16, 'color': [255, 129, 0]},
    17: {'name': '听会', 'id': 17, 'color': [70, 130, 180]},
    18: {'name': '肩', 'id': 18, 'color': [63, 103,165]},
    19: {'name': '扁桃体', 'id': 19, 'color': [66, 77, 229]},
    20: {'name': '腰骶椎', 'id': 20, 'color': [255, 105, 180]}
}

In [50]:
image = draw(bbox_res, img, keypoint_info)

In [53]:
cv2.imwrite("output.jpg", image)

True

In [52]:
cv2.namedWindow("result", cv2.WINDOW_NORMAL)
cv2.imshow("result", image)
key = cv2.waitKey(0)

error: OpenCV(4.9.0) /io/opencv/modules/highgui/src/window.cpp:1272: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvShowImage'


In [5]:
scale_ratio

(0.17992690469496767, 0.17992690469496767)

In [6]:
pad_size

(65.0, 0.0)

In [7]:
# 模型推理
output = model.infer([img])[0]

In [9]:
output.shape

(1, 68, 8400)

In [10]:
# 将模型输出调整为 (8400, 68) 形状，以便于处理
pred_reshaped = output.squeeze().transpose((1, 0))
pred_reshaped.shape

(8400, 68)

In [11]:
confidences = pred_reshaped[:, 4]
max_conf_index = np.argmax(confidences)

1.5258789e-05

In [21]:
max_conf_index

6240

In [22]:
highest_conf_pred = pred_reshaped[max_conf_index]
highest_conf_pred.shape

(68,)

In [23]:
highest_conf_pred

array([ 5.48437500e+00,  6.35000000e+02,  9.90625000e+00,  1.24500000e+02,
        1.00000000e+00, -1.37890625e+01,  6.36500000e+02,  8.37707520e-03,
        2.03125000e+01,  6.27500000e+02,  2.42187500e-01,  9.59375000e+00,
        6.33500000e+02,  4.71496582e-03,  4.05517578e-01,  6.27500000e+02,
        9.82666016e-03, -1.90156250e+01,  6.47000000e+02,  1.04904175e-02,
       -4.14843750e+00,  6.30500000e+02,  1.29241943e-02, -1.06328125e+01,
        6.37000000e+02,  1.10778809e-02, -1.53359375e+01,  6.36000000e+02,
        1.01318359e-02, -1.11796875e+01,  6.22500000e+02,  4.55474854e-03,
       -1.22343750e+01,  6.43500000e+02,  4.96292114e-03, -1.01875000e+01,
        6.43500000e+02,  8.78143311e-03, -6.18164062e-01,  6.17000000e+02,
        1.15051270e-02, -3.60546875e+00,  6.25500000e+02,  1.88903809e-02,
        6.27734375e+00,  6.26000000e+02,  6.74438477e-03,  1.23828125e+00,
        6.39500000e+02,  2.14080811e-02, -1.54140625e+01,  6.28000000e+02,
        2.00195312e-02, -

In [29]:
output = torch.tensor(output)
# 非极大值抑制后处理
boxout = nms(output, conf_thres=cfg["conf_thres"], iou_thres=cfg["iou_thres"])
pred_all = boxout[0].numpy()
# 预测坐标转换
scale_coords(cfg['input_shape'], pred_all[:, :4], image.shape, ratio_pad=(scale_ratio, pad_size))
# 图片预测结果可视化
draw_prediction(pred_all, image, class_names)

NameError: name 'class_names' is not defined

* 选择推理模式。"infer_mode"有三个取值：image, camera, video，分别对应图片推理、摄像头实时推理和视频推理。默认使用视频推理模式。
* 我们选取的样例是一个赛车视频，执行下面的代码后可以看到模型会对视频的每一帧进行推理，并将预测结果展示在画面上。

In [9]:
infer_mode = 'camera'

if infer_mode == 'image':
    img_path = 'world_cup.jpg'
    infer_image(img_path, model, labels_dict, cfg)
elif infer_mode == 'camera':
    infer_camera(model, labels_dict, cfg)
elif infer_mode == 'video':
    video_path = 'racing.mp4'
    infer_video(video_path, model, labels_dict, cfg)

[INFO] acl init success
[INFO] open device 0 success
[INFO] load model yolo.om success
[INFO] create model description success
[INFO] load model yolov8n.om success
[INFO] create model description success


[ WARN:0@63.830] global cap_v4l.cpp:997 open VIDEOIO(V4L2:/dev/video0): can't open camera by index
[ERROR:0@63.832] global obsensor_uvc_stream_channel.cpp:159 getStreamChannelGroup Camera index out of range
[ WARN:0@63.834] global cap_v4l.cpp:997 open VIDEOIO(V4L2:/dev/video1): can't open camera by index
[ERROR:0@63.835] global obsensor_uvc_stream_channel.cpp:159 getStreamChannelGroup Camera index out of range
[ WARN:0@63.836] global cap_v4l.cpp:997 open VIDEOIO(V4L2:/dev/video2): can't open camera by index
[ERROR:0@63.837] global obsensor_uvc_stream_channel.cpp:159 getStreamChannelGroup Camera index out of range
[ WARN:0@63.837] global cap_v4l.cpp:997 open VIDEOIO(V4L2:/dev/video3): can't open camera by index
[ERROR:0@63.838] global obsensor_uvc_stream_channel.cpp:159 getStreamChannelGroup Camera index out of range
[ WARN:0@63.839] global cap_v4l.cpp:997 open VIDEOIO(V4L2:/dev/video4): can't open camera by index
[ERROR:0@63.839] global obsensor_uvc_stream_channel.cpp:159 getStreamChan

ValueError: No camera found.

# 样例总结与扩展
以上就是这个样例的全部内容了，值得关注的是在模型推理后有一步非常重要的后处理，就是非极大值抑制，即NMS，由于模型的原始预测结果会有非常多无效或重叠的预测框，我们需要通过NMS来进行过滤。再者，模型预测框的表示往往是一个标准化的结果，比如0到1之间，我们需要通过坐标转换将结果与原始图片的宽高对应上。