## 1. Imports and Create Team

In [1]:
import sys
import os

sys.path.append(os.path.abspath("../common"))

import math
import time
import numpy as np
from PIL import Image
from matplotlib import pyplot
import cv2
from datetime import datetime

# import pynq
import dac_sdc
from IPython.display import display

import onnxruntime as ort
import torch
from torchvision.ops import nms
from PIL import ImageDraw, ImageFont, Image

#! from shapely.geometry import Polygon
import cv2
import json

# Team identifier handed to the dac_sdc framework.
team_name = 'T-IMI'
# One image per callback invocation (the run log below reports "1 images" per batch).
dac_sdc.BATCH_SIZE = 1
# NOTE(review): presumably sets up per-team state/paths inside dac_sdc — confirm against that package.
team = dac_sdc.Team(team_name)

## 2. Preparing the library and model

In [2]:
# Optional dependency installation (uncomment and run once if the packages are missing)
# !pip install onnxruntime
# !conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=10.2 -c pytorch

In [3]:
# Path of the exported ONNX model, relative to the notebook's working directory.
onnx_model_path = "./nanoprune05ep70.onnx"
# Alternative execution providers, kept for reference:
# session = ort.InferenceSession(onnx_model_path, providers=['CUDAExecutionProvider'])
# session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])
# Global inference session used by my_callback.
# NOTE(review): only the TensorRT provider is requested; confirm how onnxruntime falls back
# (and whether a CUDA fallback should be listed) when TensorRT cannot handle part of the graph.
session = ort.InferenceSession(onnx_model_path, providers=['TensorrtExecutionProvider'])



## 3. Python Callback Function and Helper Functions

In [4]:
# Network input resolution as [height, width]; preprocess_image passes
# (input_shape[1], input_shape[0]) to resize_image as the (width, height) target.
input_shape = [1280, 1280]
# Number of class-score columns read per prediction in non_max_suppression.
num_classes = 7

def preprocess_input(image):
    """Scale pixel values by 1/255 without modifying the caller's array.

    Args:
        image: Array-like of pixel values. Floating-point arrays keep their
            dtype; any other dtype (e.g. uint8 image data) is converted to
            float32 first, since an in-place division would fail on it.

    Returns:
        A new numpy array holding ``image / 255``.
    """
    pixels = np.asarray(image)
    if not np.issubdtype(pixels.dtype, np.floating):
        pixels = pixels.astype(np.float32)
    # Out-of-place division: the input buffer is left untouched.
    return pixels / 255.0

def resize_image(image, size, letterbox_image):
    """Resize a PIL image to ``size``, optionally preserving aspect ratio.

    Args:
        image: PIL image to resize.
        size: Target ``(width, height)``.
        letterbox_image: When truthy, the image is scaled to fit inside
            ``size`` with its aspect ratio kept and centred on a grey
            (128, 128, 128) RGB canvas. Otherwise it is stretched to ``size``.

    Returns:
        A new PIL image of dimensions ``size``.
    """
    src_w, src_h = image.size
    dst_w, dst_h = size

    if not letterbox_image:
        return image.resize((dst_w, dst_h), Image.BICUBIC)

    # Largest uniform scale at which the whole image still fits in the target.
    ratio = min(dst_w / src_w, dst_h / src_h)
    fit_w = int(src_w * ratio)
    fit_h = int(src_h * ratio)

    fitted = image.resize((fit_w, fit_h), Image.BICUBIC)
    canvas = Image.new('RGB', size, (128, 128, 128))
    # Centre the fitted image; any remaining border stays grey.
    canvas.paste(fitted, ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2))
    return canvas

def preprocess_image(image_path):
    """Load an image file and turn it into a batched network input.

    The image is letterboxed to the global ``input_shape`` ([height, width]),
    converted to float32, scaled by ``preprocess_input`` and rearranged from
    HWC to a ``(1, C, H, W)`` array. Adjust this if the model expects a
    different input layout.

    Args:
        image_path: Path (or anything ``PIL.Image.open`` accepts) of the image.

    Returns:
        numpy array with a leading batch axis of size 1.
    """
    # The context manager closes the file as soon as the letterboxed copy
    # exists; resize_image returns a new, independent image.
    with Image.open(image_path) as img:
        letterboxed = resize_image(img, (input_shape[1], input_shape[0]), True)
    pixels = preprocess_input(np.array(letterboxed, dtype='float32'))
    # HWC -> CHW, then add the batch axis.
    return np.expand_dims(np.transpose(pixels, (2, 0, 1)), 0)

def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """Convert anchor-relative side distances into boxes.

    Args:
        distance: Tensor whose ``dim`` axis splits into two chunks of size 2:
            the (left, top) distances followed by the (right, bottom) ones.
        anchor_points: Anchor coordinates, broadcastable against each chunk.
        xywh: If true return (centre_x, centre_y, width, height); otherwise
            return the corners (x1, y1, x2, y2).
        dim: Axis along which distances are split and results concatenated.

    Returns:
        Tensor of boxes concatenated along ``dim``.
    """
    # First chunk: left/top offsets; second chunk: right/bottom offsets.
    dist_lt, dist_rb = distance.split(2, dim)
    top_left = anchor_points - dist_lt
    bottom_right = anchor_points + dist_rb
    if not xywh:
        return torch.cat((top_left, bottom_right), dim)  # xyxy bbox
    centre = (top_left + bottom_right) / 2
    extent = bottom_right - top_left
    return torch.cat((centre, extent), dim)  # xywh bbox


def decode_box(num_classes, input_shape, dbox, cls, anchors, strides):
    """Decode raw head outputs into normalised boxes plus class scores.

    Args:
        num_classes: Unused here; kept for interface compatibility.
        input_shape: Network input size as [height, width].
        dbox: Side-distance tensor, split along axis 1 by ``dist2bbox``.
        cls: Class logits; a sigmoid is applied here.
        anchors: Anchor points; a leading batch axis is added before decoding.
        strides: Per-anchor stride multiplied onto the decoded boxes.

    Returns:
        Tensor with axes 1 and 2 swapped relative to the inputs, whose first
        four channels are (cx, cy, w, h) divided by the input width/height
        and whose remaining channels are the sigmoid class scores.
    """
    boxes = dist2bbox(dbox, anchors.unsqueeze(0), xywh=True, dim=1) * strides
    scores = cls.sigmoid()
    y = torch.cat((boxes, scores), 1).permute(0, 2, 1)

    in_h, in_w = input_shape[0], input_shape[1]
    # x and width are divided by the input width; y and height by the input height.
    norm = torch.Tensor([in_w, in_h, in_w, in_h]).to(y.device)
    y[:, :, :4] = y[:, :, :4] / norm
    return y

def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image):
    """Map normalised centre/size boxes back to original-image coordinates.

    The inputs are not modified.

    Args:
        box_xy: ndarray ``(..., 2)`` of normalised box centres as (x, y).
        box_wh: ndarray ``(..., 2)`` of normalised box sizes as (w, h).
        input_shape: Network input size, in the same axis order as ``image_shape``.
        image_shape: Original image size; the (y, x)-ordered result is
            multiplied by it, so it is used as (height, width).
        letterbox_image: If truthy, undo the centred letterbox padding that
            was applied when the image was fitted into ``input_shape``.

    Returns:
        ndarray ``(..., 4)`` of (y_min, x_min, y_max, x_max) scaled by
        ``image_shape``.
    """
    # Work in (y, x) / (h, w) order so the values line up with image_shape.
    box_yx = box_xy[..., ::-1]
    # Copy: the reversed slice is a view, and scaling it in place would
    # silently overwrite the caller's box_wh.
    box_hw = box_wh[..., ::-1].copy()
    input_shape = np.array(input_shape)
    image_shape = np.array(image_shape)

    if letterbox_image:
        # Size of the image content inside the letterboxed network input.
        new_shape = np.round(image_shape * np.min(input_shape/image_shape))
        # Normalised padding on each side, and the factor that stretches the
        # content area back to the full [0, 1] range.
        offset = (input_shape - new_shape)/2./input_shape
        scale = input_shape/new_shape

        box_yx = (box_yx - offset) * scale
        box_hw *= scale

    box_mins = box_yx - (box_hw / 2.)
    box_maxes = box_yx + (box_hw / 2.)
    boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
    # boxes is a fresh array, so scaling it in place is safe.
    boxes *= np.concatenate([image_shape, image_shape], axis=-1)
    return boxes

def non_max_suppression(prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
    """Filter decoded predictions by confidence and apply per-class NMS.

    The ``prediction`` tensor is left unmodified.

    Args:
        prediction: Tensor ``(batch, n, 4 + num_classes)``; the first four
            channels are (cx, cy, w, h), the rest are per-class scores.
        num_classes: Number of class-score channels to consider.
        input_shape: Network input size, forwarded to ``yolo_correct_boxes``.
        image_shape: Original image size, forwarded to ``yolo_correct_boxes``.
        letterbox_image: Whether letterboxing has to be undone.
        conf_thres: Minimum best-class score for a candidate to be kept.
        nms_thres: IoU threshold passed to ``nms``.

    Returns:
        List with one entry per batch element: ``None`` when no candidate
        reaches ``conf_thres``, otherwise a numpy array ``(k, 6)`` whose first
        four columns are the box returned by ``yolo_correct_boxes`` followed
        by the class score and the class index.
    """
    # Build the corner-form boxes on a copy so the caller's tensor is not
    # overwritten.
    corners = prediction.clone()
    half_wh = prediction[:, :, 2:4] / 2
    corners[:, :, 0:2] = prediction[:, :, 0:2] - half_wh
    corners[:, :, 2:4] = prediction[:, :, 0:2] + half_wh

    output = [None for _ in range(len(corners))]
    for i, image_pred in enumerate(corners):
        class_conf, class_pred = torch.max(image_pred[:, 4:4 + num_classes], 1, keepdim=True)
        # Keep the mask 1-D: squeezing it would produce a 0-D mask when there
        # is exactly one candidate and break the row indexing below.
        conf_mask = class_conf[:, 0] >= conf_thres
        image_pred = image_pred[conf_mask]
        class_conf = class_conf[conf_mask]
        class_pred = class_pred[conf_mask]
        if not image_pred.size(0):
            continue

        # Columns: x1, y1, x2, y2, class score, class index.
        detections = torch.cat((image_pred[:, :4], class_conf.float(), class_pred.float()), 1)

        # NMS is run separately per class so boxes of different classes never
        # suppress each other; unique() yields the labels in sorted order.
        kept = []
        for c in detections[:, -1].unique():
            detections_class = detections[detections[:, -1] == c]
            keep = nms(detections_class[:, :4], detections_class[:, 4], nms_thres)
            kept.append(detections_class[keep])

        result = torch.cat(kept).cpu().numpy()
        # Back to centre/size form, which is what yolo_correct_boxes expects.
        box_xy = (result[:, 0:2] + result[:, 2:4]) / 2
        box_wh = result[:, 2:4] - result[:, 0:2]
        result[:, :4] = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
        output[i] = result
    return output

def my_callback(rgb_imgs):
    """Run detection and mask extraction on a batch of images.

    For every image the ONNX ``session`` is run once. Boxes are decoded from
    the outputs at indices 0, 1, 5 and 6 and filtered with
    ``non_max_suppression``; the output at index 7 is reduced with an argmax
    over axis 1 to a label map whose external contours are reported as
    segmentation entries.

    Args:
        rgb_imgs: Iterable of ``(img_path, img)`` pairs. Only ``img_path`` is
            used (it is opened with PIL and its ``.name`` is the result key);
            ``img`` is ignored.

    Returns:
        Dict mapping ``img_path.name`` to a list of dicts with the keys
        ``type``, ``x``, ``y``, ``width``, ``height`` and ``segmentation``.

    NOTE(review): the two branches below emit different schemas. Without
    detections, "type" and the contour coordinates are strings and the points
    are [x, y] pairs; with detections, "type" is an int and the coordinates
    are one flat float list. Confirm which form the consumer expects.
    """
    preds = {}
    # Detector class index (as string) -> emitted type id.
    type_mapping = {"0": 1, "1": 2, "2": 3, "3": 4, "4": 5, "5": 6, "6": 7}
    # Mask label (as string) -> emitted type id.
    type_mapping_mask = {"0": 0, "1": 8, "2": 9, "3": 10}
    for (img_path, img) in rgb_imgs:
        input_image = preprocess_image(img_path)
        # Only the original (height, width) is needed; read it from the image
        # header instead of decoding the full image a second time, and close
        # the file right away.
        with Image.open(img_path) as image:
            image_shape = np.array([image.height, image.width])
        # The model input is assumed to be named 'input'.
        outputs = session.run(None, {'input': input_image})
        outputs = [torch.tensor(arr) for arr in outputs]

        # ---- segmentation mask ----
        # NOTE(review): .long() truncates the scores before the argmax, which
        # can create ties if outputs[7] holds fractional values — confirm that
        # this is intended.
        fea_img = torch.argmax(outputs[7].long(), 1)
        fea_img = fea_img[0, :, :].cpu().detach().numpy().astype(np.uint8)
        # NOTE(review): contour coordinates are in the mask's own pixel grid
        # and are not mapped back to the original image — confirm.
        contours, hierarchy = cv2.findContours(fea_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

        # ---- detection boxes ----
        # Original author's note on output indices (translated): 0 is correct,
        # 1 is correct, 6 corresponds to 4, 5 corresponds to 3.
        outputs = decode_box(num_classes, input_shape, outputs[0], outputs[1], outputs[5], outputs[6])
        results = non_max_suppression(outputs, num_classes, input_shape,
                    image_shape, True, conf_thres = 0.5, nms_thres = 0.3)

        if results[0] is None:
            # No detection above the threshold: emit a zero-size placeholder
            # entry, then the mask contours.
            pred = [{
                "type": '1',
                "x": 0,
                "y": 0,
                "width": 0,
                "height": 0,
                "segmentation": []
            }]
            for contour in contours:
                points = contour.reshape(-1, 2).tolist()

                # Contour points as strings with three decimals.
                points_str = [[f"{x:.3f}", f"{y:.3f}"] for x, y in points]  #!

                # Mean mask label along the contour (truncated to int), plus 7.
                type_value = int(fea_img[contour[:, 0, 1], contour[:, 0, 0]].mean()) + 7

                pred.append({
                    "type": str(type_value),
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "segmentation": [points_str]
                })
        else:
            top_label   = np.array(results[0][:, 5], dtype = 'int32')
            top_boxes   = results[0][:, :4]
            # Box columns come from yolo_correct_boxes as
            # (y_min, x_min, y_max, x_max), hence the index order below.
            pred = [{
                "type": type_mapping[str(int(top_label[idx]))],
                "x": int(top_boxes[idx, 1]),                            #!
                "y": int(top_boxes[idx, 0]),                            #!
                "width": int((top_boxes[idx, 3] - top_boxes[idx, 1])),  #!
                "height": int((top_boxes[idx, 2] - top_boxes[idx, 0])), #!
                "segmentation": []
            } for idx in range(len(top_label))]
            for contour in contours:
                # Flat [x0, y0, x1, y1, ...] list of floats. The coordinates
                # are integers, so a direct cast gives the same values as the
                # earlier format-to-string-and-parse round trip.
                float_list = contour.reshape(-1, 2).astype(float).ravel().tolist()     #!
                # Mean mask label along the contour (truncated to int); the
                # type id is looked up in type_mapping_mask, no offset added.
                # NOTE(review): raises KeyError for labels outside 0-3.
                type_value = int(fea_img[contour[:, 0, 1], contour[:, 0, 0]].mean())

                pred.append({
                    "type": type_mapping_mask[str(type_value)],
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "segmentation": [float_list]
                })
        preds[img_path.name] = pred

    return preds

In [5]:
# Hand the callback to the dac_sdc team runner.
# NOTE(review): debug=True presumably enables the per-batch log shown in the output below — confirm.
team.run(my_callback, debug=True)

Batch 1 starting. 1 images.
Batch 1 done. Runtime = 0.5617876052856445 seconds.
Batch 2 starting. 1 images.
Batch 2 done. Runtime = 0.5045149326324463 seconds.
Batch 3 starting. 1 images.
Batch 3 done. Runtime = 0.5538954734802246 seconds.
Batch 4 starting. 1 images.
Batch 4 done. Runtime = 0.5854434967041016 seconds.
Batch 5 starting. 1 images.
Batch 5 done. Runtime = 1.3425841331481934 seconds.
Batch 6 starting. 1 images.
Batch 6 done. Runtime = 0.7064619064331055 seconds.
Batch 7 starting. 1 images.
Batch 7 done. Runtime = 0.5590658187866211 seconds.
Batch 8 starting. 1 images.
Batch 8 done. Runtime = 1.0665011405944824 seconds.
Batch 9 starting. 1 images.
Batch 9 done. Runtime = 0.7906808853149414 seconds.
Batch 10 starting. 1 images.
Batch 10 done. Runtime = 0.699420690536499 seconds.
Batch 11 starting. 1 images.
Batch 11 done. Runtime = 0.7039165496826172 seconds.
Batch 12 starting. 1 images.
Batch 12 done. Runtime = 0.892493724822998 seconds.
Batch 13 starting. 1 images.
Batch 1