In [1]:
import pandas as pd
import os
import numpy as np
from sklearn import model_selection
from tqdm import tqdm_gui
from glob import glob

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader

import cv2
import random
from tqdm import tqdm

torch.manual_seed(100)
random.seed(100)

In [2]:
class Config(object):
    def __init__(self) -> None:
        self.datasetdir = "/media/beyond/70f23ead-fa6d-4628-acf7-c82133c03245/home/beyond/Documents/ml/data/dataset/image/VOCdevkit/VOC2012"
        
        self.size = 512
        self.batch_size = 1

        self.lr = 1e-3
        self.epochs = 100

        self.max_interset_iou = 0.1
        self.min_confidence = 0.4

        self.classlist = [
            'aeroplane', 'bicycle', 'bird', 'boat',
            'bottle', 'bus', 'car', 'cat', 'chair',
            'cow', 'diningtable', 'dog', 'horse',
            'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']
        
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



config = Config()

In [3]:

import xml.etree.ElementTree as ET
import os

def parse_rec(filename:str):
    """ Parse a PASCAL VOC xml file """
    tree = ET.parse(filename)
    result = {}
    result["filename"] = filename[filename.rindex("/")+1:-4]
    result["width"] = int(tree.find("size").find("width").text)
    result["height"] = int(tree.find("size").find("height").text)
    result["boxes"] = []
    for object in tree.findall('object'):
        obj = {}
        # difficult = int(object.find('difficult').text)
        # if difficult == 1:
        #     continue
        obj['classname'] = object.find('name').text
        bbox = object.find('bndbox')
        obj['bbox'] = [int(float(bbox.find('xmin').text)),
                              int(float(bbox.find('ymin').text)),
                              int(float(bbox.find('xmax').text)),
                              int(float(bbox.find('ymax').text))]
        result["boxes"].append(obj)
    return result


axmlpath = os.path.join(config.datasetdir, "Annotations","2010_006026.xml")
parse_rec(axmlpath)

{'filename': '2010_006026',
 'width': 332,
 'height': 500,
 'boxes': [{'classname': 'sheep', 'bbox': [87, 412, 147, 463]},
  {'classname': 'sheep', 'bbox': [100, 353, 144, 382]},
  {'classname': 'sheep', 'bbox': [141, 373, 183, 389]}]}

In [4]:


class MyDataset(Dataset):
    image_size = 448

    def __init__(self, config:Config, transform = [], istrain=True):
        print('loading annotations')
        self.config = config
        self.istrain = istrain
        self.transform = transform
        self.fnames = []
        self.boxes = []
        self.labels = []
        self.S = 7  # grid number 7*7 normally
        self.B = 2  # bounding box number in each grid
        self.C = 20  # how many classes
        self.mean = (123, 117, 104)  # RGB
        annodir = os.path.join(config.datasetdir, "Annotations")
        i = 0
        for filename in tqdm(os.listdir(annodir)):
            if i > 1:
                break
            rect = parse_rec(os.path.join(annodir, filename))
            self.fnames.append(rect["filename"] + ".jpg") 
            boxes = []
            label = []
            for box in rect["boxes"]:
                x1 = box["bbox"][0]
                y1 = box["bbox"][1]
                x2 = box["bbox"][2]
                y2 = box["bbox"][3]
                boxes.append([x1, y1, x2, y2])
                label.append(config.classlist.index(box["classname"]))
            self.boxes.append(torch.Tensor(boxes))
            self.labels.append(torch.LongTensor(label))
            i = i + 1
        # self.num_samples = len(self.boxes)
        self.num_samples = min(len(self.fnames),i)

    def __getitem__(self, idx):
        fname = self.fnames[idx]
        img = cv2.imread(os.path.join(config.datasetdir, "JPEGImages", fname))
        boxes = self.boxes[idx].clone()
        labels = self.labels[idx].clone()
        if self.istrain:  # 数据增强里面的各种变换用torch自带的transform是做不到的，因为对图片进行旋转、随即裁剪等会造成bbox的坐标也会发生变化，所以需要自己来定义数据增强
            img, boxes = self.random_flip(img, boxes)
            img, boxes = self.randomScale(img, boxes)
            img = self.randomBlur(img)
            img = self.RandomBrightness(img)
            img = self.RandomHue(img)
            img = self.RandomSaturation(img)
            img, boxes, labels = self.randomShift(img, boxes, labels)
            img, boxes, labels = self.randomCrop(img, boxes, labels)
        h, w, _ = img.shape
        boxes /= torch.Tensor([w, h, w, h]).expand_as(boxes)         # 坐标 归一化 处理，为了方便训练
        img = self.BGR2RGB(img)                                                                           # because pytorch pretrained model use RGB
        # img = self.subMean(img, self.mean)                          # 减去均值
        img = cv2.resize(img, (self.image_size, self.image_size))   # 将所有图片都resize到指定大小
        img = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ])(img)
        target = self.encoder(boxes, labels)                        # 将图片标签编码到7x7*30的向量

        for t in self.transform:
            img = t(img)
        return img, target

    def __len__(self):
        return self.num_samples

    def encoder(self, boxes, labels):
        '''
        boxes (tensor) [[x1,y1,x2,y2],[]]  归一化后的结果
        labels (tensor) [...]
        return 7x7x30
        '''
        grid_num = 7
        target = torch.zeros((grid_num, grid_num, 30))
        cell_size = 1. / grid_num                         # 每个格子的大小

        # 右下坐标        左上坐标
        # x2,y2           x1,y1
        wh = boxes[:, 2:] - boxes[:, :2]

        # 物体中心坐标集合
        cxcy = (boxes[:, 2:] + boxes[:, :2]) / 2
        for i in range(cxcy.size()[0]):
            # 物体中心坐标
            cxcy_sample = cxcy[i]

            # 指示落在那网格，如[0,0]
            ij = cxcy_sample // cell_size  # 中心点对应格子的坐标
            #    0 1    2 3   4      5 6   7 8   9
            # [中心坐标,长宽,置信度,中心坐标,长宽,置信度, 20个类别] x 7x7   因为一个框预测两个物体

            xy = ij * cell_size
            delta_xy = (cxcy_sample - xy) / cell_size
            target[int(ij[1]), int(ij[0]), :2] = delta_xy
            target[int(ij[1]), int(ij[0]), 2:4] = wh[i]
            target[int(ij[1]), int(ij[0]), 5:7] = delta_xy
            target[int(ij[1]), int(ij[0]), 7:9] = wh[i]

            # 第一/二个框的置信度
            target[int(ij[1]), int(ij[0]), 4] = 1
            target[int(ij[1]), int(ij[0]), 9] = 1

            target[int(ij[1]), int(ij[0]), int(labels[i]) + 10] = 1
        return target

    def BGR2RGB(self, img):
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def BGR2HSV(self, img):
        return cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    def HSV2BGR(self, img):
        return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)

    def RandomBrightness(self, bgr):
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h, s, v = cv2.split(hsv)
            adjust = random.choice([0.5, 1.5])
            v = v * adjust
            v = np.clip(v, 0, 255).astype(hsv.dtype)
            hsv = cv2.merge((h, s, v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def RandomSaturation(self, bgr):
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h, s, v = cv2.split(hsv)
            adjust = random.choice([0.5, 1.5])
            s = s * adjust
            s = np.clip(s, 0, 255).astype(hsv.dtype)
            hsv = cv2.merge((h, s, v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def RandomHue(self, bgr):
        if random.random() < 0.5:
            hsv = self.BGR2HSV(bgr)
            h, s, v = cv2.split(hsv)
            adjust = random.choice([0.5, 1.5])
            h = h * adjust
            h = np.clip(h, 0, 255).astype(hsv.dtype)
            hsv = cv2.merge((h, s, v))
            bgr = self.HSV2BGR(hsv)
        return bgr

    def randomBlur(self, bgr):
        if random.random() < 0.5:
            bgr = cv2.blur(bgr, (5, 5))
        return bgr

    def randomShift(self, bgr, boxes, labels):
        # 平移变换
        center = (boxes[:, 2:] + boxes[:, :2]) / 2
        if random.random() < 0.5:
            height, width, c = bgr.shape
            after_shfit_image = np.zeros((height, width, c), dtype=bgr.dtype)
            after_shfit_image[:, :, :] = (104, 117, 123)  # bgr
            shift_x = random.uniform(-width * 0.2, width * 0.2)
            shift_y = random.uniform(-height * 0.2, height * 0.2)
            # print(bgr.shape,shift_x,shift_y)
            # 原图像的平移
            if shift_x >= 0 and shift_y >= 0:
                after_shfit_image[int(shift_y):,
                                  int(shift_x):,
                                  :] = bgr[:height - int(shift_y),
                                           :width - int(shift_x),
                                           :]
            elif shift_x >= 0 and shift_y < 0:
                after_shfit_image[:height + int(shift_y),
                                  int(shift_x):,
                                  :] = bgr[-int(shift_y):,
                                           :width - int(shift_x),
                                           :]
            elif shift_x < 0 and shift_y >= 0:
                after_shfit_image[int(shift_y):, :width +
                                  int(shift_x), :] = bgr[:height -
                                                         int(shift_y), -
                                                         int(shift_x):, :]
            elif shift_x < 0 and shift_y < 0:
                after_shfit_image[:height + int(shift_y), :width + int(
                    shift_x), :] = bgr[-int(shift_y):, -int(shift_x):, :]

            shift_xy = torch.FloatTensor(
                [[int(shift_x), int(shift_y)]]).expand_as(center)
            center = center + shift_xy
            mask1 = (center[:, 0] > 0) & (center[:, 0] < width)
            mask2 = (center[:, 1] > 0) & (center[:, 1] < height)
            mask = (mask1 & mask2).view(-1, 1)
            boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)
            if len(boxes_in) == 0:
                return bgr, boxes, labels
            box_shift = torch.FloatTensor(
                [[int(shift_x), int(shift_y), int(shift_x), int(shift_y)]]).expand_as(boxes_in)
            boxes_in = boxes_in + box_shift
            labels_in = labels[mask.view(-1)]
            return after_shfit_image, boxes_in, labels_in
        return bgr, boxes, labels

    def randomScale(self, bgr, boxes):
        # 固定住高度，以0.8-1.2伸缩宽度，做图像形变
        if random.random() < 0.5:
            scale = random.uniform(0.8, 1.2)
            height, width, c = bgr.shape
            bgr = cv2.resize(bgr, (int(width * scale), height))
            scale_tensor = torch.FloatTensor(
                [[scale, 1, scale, 1]]).expand_as(boxes)
            boxes = boxes * scale_tensor
            return bgr, boxes
        return bgr, boxes

    def randomCrop(self, bgr, boxes, labels):
        if random.random() < 0.5:
            center = (boxes[:, 2:] + boxes[:, :2]) / 2
            height, width, c = bgr.shape
            h = random.uniform(0.6 * height, height)
            w = random.uniform(0.6 * width, width)
            x = random.uniform(0, width - w)
            y = random.uniform(0, height - h)
            x, y, h, w = int(x), int(y), int(h), int(w)

            center = center - torch.FloatTensor([[x, y]]).expand_as(center)
            mask1 = (center[:, 0] > 0) & (center[:, 0] < w)
            mask2 = (center[:, 1] > 0) & (center[:, 1] < h)
            mask = (mask1 & mask2).view(-1, 1)

            boxes_in = boxes[mask.expand_as(boxes)].view(-1, 4)
            if(len(boxes_in) == 0):
                return bgr, boxes, labels
            box_shift = torch.FloatTensor([[x, y, x, y]]).expand_as(boxes_in)

            boxes_in = boxes_in - box_shift
            boxes_in[:, 0] = boxes_in[:, 0].clamp_(min=0, max=w)
            boxes_in[:, 2] = boxes_in[:, 2].clamp_(min=0, max=w)
            boxes_in[:, 1] = boxes_in[:, 1].clamp_(min=0, max=h)
            boxes_in[:, 3] = boxes_in[:, 3].clamp_(min=0, max=h)

            labels_in = labels[mask.view(-1)]
            img_croped = bgr[y:y + h, x:x + w, :]
            return img_croped, boxes_in, labels_in
        return bgr, boxes, labels

    def subMean(self, bgr, mean):
        mean = np.array(mean, dtype=np.float32)
        bgr = bgr - mean
        return bgr

    def random_flip(self, im, boxes):
        if random.random() < 0.5:
            im_lr = np.fliplr(im).copy()
            h, w, _ = im.shape
            xmin = w - boxes[:, 2]
            xmax = w - boxes[:, 0]
            boxes[:, 0] = xmin
            boxes[:, 2] = xmax
            return im_lr, boxes
        return im, boxes

    def random_bright(self, im, delta=16):
        alpha = random.random()
        if alpha > 0.3:
            im = im * alpha + random.randrange(-delta, delta)
            im = im.clip(min=0, max=255).astype(np.uint8)
        return im


dataset = MyDataset(config, istrain=False)
from matplotlib import pyplot as plt


loading annotations


 29%|██▉       | 5001/17125 [00:50<02:03, 98.26it/s] 


In [5]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import torch
import torchinfo
import torchvision.models as models
from torch.autograd import Variable

__all__ = ['ResNet', 'resnet50']


model_urls = {'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth'}


def conv3x3(in_planes, out_planes, stride=1):
    "3x3 convolution with padding"
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class detnet_bottleneck(nn.Module):
    # no expansion
    # dilation = 2
    # type B use 1x1 conv
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, block_type='A'):
        super(detnet_bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=2,
            bias=False,
            dilation=2)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(
            planes,
            self.expansion *
            planes,
            kernel_size=1,
            bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        self.downsample = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes or block_type == 'B':
            self.downsample = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion *
                    planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False),
                nn.BatchNorm2d(
                    self.expansion *
                    planes))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.downsample(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1470):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # self.layer5 = self._make_layer(block, 512, layers[3], stride=2)
        self.layer5 = self._make_detnet_layer(in_channels=2048)
        self.avgpool = nn.AvgPool2d(2)  # fit 448 input size
        # self.fc = nn.Linear(512 * block.expansion, num_classes)
        self.conv_end = nn.Conv2d(
            256,
            30,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False)
        self.bn_end = nn.BatchNorm2d(30)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _make_detnet_layer(self, in_channels):
        layers = []
        layers.append(
            detnet_bottleneck(
                in_planes=in_channels,
                planes=256,
                block_type='B'))
        layers.append(
            detnet_bottleneck(
                in_planes=256,
                planes=256,
                block_type='A'))
        layers.append(
            detnet_bottleneck(
                in_planes=256,
                planes=256,
                block_type='A'))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.avgpool(x)
        # x = x.view(x.size(0), -1)
        # x = self.fc(x)
        x = self.conv_end(x)
        x = self.bn_end(x)
        x = torch.sigmoid(x)
        # x = x.view(-1,7,7,30)
        x = x.permute(0, 2, 3, 1)  # (-1,7,7,30)

        return x


def resnet50(pretrained=False):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3])
    if pretrained:
        # model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
        model.load_state_dict(torch.load("/media/beyond/70f23ead-fa6d-4628-acf7-c82133c03245/home/beyond/Documents/ml/data/my/图像识别/yolo/checkpoints/model-20e.pth",map_location=torch.device('cpu'))
                          ,strict=False)
    return model


model = resnet50(pretrained=True)
model.to(config.device)
torchinfo.summary(model, input_size=(1, 3, 448, 448))

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [1, 7, 7, 30]             --
├─Conv2d: 1-1                            [1, 64, 224, 224]         9,408
├─BatchNorm2d: 1-2                       [1, 64, 224, 224]         128
├─ReLU: 1-3                              [1, 64, 224, 224]         --
├─MaxPool2d: 1-4                         [1, 64, 112, 112]         --
├─Sequential: 1-5                        [1, 256, 112, 112]        --
│    └─Bottleneck: 2-1                   [1, 256, 112, 112]        --
│    │    └─Conv2d: 3-1                  [1, 64, 112, 112]         4,096
│    │    └─BatchNorm2d: 3-2             [1, 64, 112, 112]         128
│    │    └─ReLU: 3-3                    [1, 64, 112, 112]         --
│    │    └─Conv2d: 3-4                  [1, 64, 112, 112]         36,864
│    │    └─BatchNorm2d: 3-5             [1, 64, 112, 112]         128
│    │    └─ReLU: 3-6                    [1, 64, 112, 112]         --
│ 

In [16]:


VOC_CLASSES = [   # always index 0
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor'] 

Color = [[0, 0, 0],
         [128, 0, 0],
         [0, 128, 0],
         [128, 128, 0],
         [0, 0, 128],
         [128, 0, 128],
         [0, 128, 128],
         [128, 128, 128],
         [64, 0, 0],
         [192, 0, 0],
         [64, 128, 0],
         [192, 128, 0],
         [64, 0, 128],
         [192, 0, 128],
         [64, 128, 128],
         [192, 128, 128],
         [0, 64, 0],
         [128, 64, 0],
         [0, 192, 0],
         [128, 192, 0],
         [0, 64, 128]]

def compute_iou_matrix(a):
    iou_m = torch.ones((a.size(0), a.size(0)))
    for j in range(a.size(0)):
        for k in range(j+1, a.size(0)):
            xmin = torch.max(a[j,0] - a[j,2]/2, a[k,0]- a[j,2]/2) 
            xmax = torch.min(a[j,0] + a[j,2]/2, a[k,0]+ a[j,2]/2) 
            ymin = torch.max(a[j,1] - a[j,3]/2, a[k,1]- a[j,3]/2) 
            ymax = torch.min(a[j,1] + a[j,3]/2, a[k,1]+ a[j,3]/2) 
            interset = (xmax - xmin) * (ymax - ymin)
            union = a[j,2] * a[j,3] + a[k,2] * a[k,3] - interset
            iou = interset / union
            iou_m[j,k] = iou
            iou_m[k,j] = iou
    return iou_m
            
import time
def decode(outputs):
    grid_num = 7
    cell_size = 1. / grid_num

    prob1 = outputs[:,:,:,4].unsqueeze(-1).expand_as(outputs[:,:,:,10:])  * outputs[:,:,:,10:] # softmax?
    prob2 = outputs[:,:,:,9].unsqueeze(-1).expand_as(outputs[:,:,:,10:])  * outputs[:,:,:,10:]
   
    x = torch.arange(0,7).to(config.device).unsqueeze(0).unsqueeze(0).expand_as(outputs[:,:,:,0])* cell_size + outputs[:,:,:,0] * cell_size
    y = torch.arange(0,7).to(config.device).unsqueeze(1).unsqueeze(0).expand_as(outputs[:,:,:,1])* cell_size + outputs[:,:,:,1] * cell_size
    wh  = outputs[:,:,:,2:4]
    real_coord = torch.concat([x.unsqueeze(-1),y.unsqueeze(-1), wh],dim=-1)
    

    x2 = torch.arange(0,7).to(config.device).unsqueeze(0).unsqueeze(0).expand_as(outputs[:,:,:,5])* cell_size + outputs[:,:,:,5] * cell_size
    y2 = torch.arange(0,7).to(config.device).unsqueeze(1).unsqueeze(0).expand_as(outputs[:,:,:,6])* cell_size + outputs[:,:,:,6] * cell_size
    wh2  = outputs[:,:,:,7:9]
    real_coord2 = torch.concat([x2.unsqueeze(-1),y2.unsqueeze(-1), wh2],dim=-1)
    
    
    a = torch.concat([real_coord[:,:,:,0:4], prob1], dim=-1)
    b = torch.concat([real_coord2[:,:,:,0:4], prob2], dim=-1)

    # a = torch.concat([outputs[:,:,:,0:4], prob1], dim=-1)
    # b = torch.concat([outputs[:,:,:,5:9], prob2], dim=-1)
    print(f"a : {a}")
    print(f"{time.time()}")
    a = a.view(-1,24)
    b = b.view(-1,24)
    a = torch.cat([a,b], dim=0)
    print(f"{time.time()}")
    # iou_m = compute_iou_matrix(a)
    print(f"{time.time()}")
    _,indices = torch.sort(a,dim=0, descending=True)
    print(f"{time.time()}")
    a[:,4:][a[:,4:] < config.min_confidence] = 0
    for i in range(4, 24):
        if a[:,i].sum() == 0:
            continue
        index = indices[:,i]
        for m in range(index.size(0)-1):
            j = index[m]
            if a[j,i] == 0:
                continue
            for n in range(m+1, index.size(0)):
                k = index[n]
                if a[k,i] == 0:
                    continue
                xmin = torch.max(a[j,0] - a[j,2]/2, a[k,0]- a[j,2]/2) 
                xmax = torch.min(a[j,0] + a[j,2]/2, a[k,0]+ a[j,2]/2) 
                ymin = torch.max(a[j,1] - a[j,3]/2, a[k,1]- a[j,3]/2) 
                ymax = torch.min(a[j,1] + a[j,3]/2, a[k,1]+ a[j,3]/2) 
                interset = (xmax - xmin) * (ymax - ymin)
                union = a[j,2] * a[j,3] + a[k,2] * a[k,3] - interset
                iou = interset / union
                # iou = iou_m[j,k]
                if iou > config.max_interset_iou:
                    a[k,i] = 0

    print(f"{time.time()}")
    maxprob, maxindex  = torch.max(a[:,4:],dim=1)
    print(maxprob)
    print(maxindex)
    c = torch.concat([a[:,0:4], maxindex.unsqueeze(-1), maxprob.unsqueeze(-1)], dim=1)
    # c[c[:,5] < config.min_confidence] = 0
    d = c[c[:,5] != 0]
    print(f"{time.time()}")
    return d

def predict(model, image_name):

    result = []
    image = cv2.imread(image_name)
    h, w, _ = image.shape
    img = cv2.resize(image, (448, 448))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    mean = (123, 117, 104)  # RGB
    img = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])(img)
    img = Variable(img[None, :, :, :], volatile=True)
    img = img.to(config.device)
    pred = model(img)  # 1x7x7x30
    print(pred)
    d = decode(pred)
    print(f"d : {d}")
    for i in range(d.size(0)):
        box = d[i, :4]
        x1 = int((box[0] - box[2]/2) * w)
        x2 = int((box[0] + box[2]/2) * w)
        y1 = int((box[1] - box[3]/2) * h)
        y2 = int((box[1] + box[3]/2) * h)
        cls_index = int(d[i,4].item())
        prob = float(d[i,5].item())
        result.append(
            [(x1, y1), (x2, y2), VOC_CLASSES[cls_index], image_name, prob])
    return result


if __name__ == '__main__':
    # model = resnet50(pretrained=True)
    print('load model...')
    model.eval()
    model.to(config.device)
    # image_name = os.path.join(config.datasetdir, "JPEGImages", dataset.fnames[21])
    # image_name = "/media/beyond/70f23ead-fa6d-4628-acf7-c82133c03245/home/beyond/Documents/ml/data/my/图像识别/yolo/target.jpg"
    image_name =  "/media/beyond/70f23ead-fa6d-4628-acf7-c82133c03245/home/beyond/Documents/ml/data/my/图像识别/yolo/R-C.jpeg"
    image = cv2.imread(image_name)
    print('predicting...')
    result = predict(model, image_name)
    print(result)
    for left_up, right_bottom, class_name, _, prob in result:
        color = Color[VOC_CLASSES.index(class_name)]
        cv2.rectangle(image, left_up, right_bottom, color, 2)
        label = class_name + str(round(prob, 2))
        text_size, baseline = cv2.getTextSize(
            label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
        p1 = (left_up[0], left_up[1] - text_size[1])
        cv2.rectangle(image, (p1[0] - 2, p1[1] - 2 - baseline), (p1[0] + text_size[0], p1[1] + text_size[1]), color, - 1)
        cv2.putText(
            image,
            label,
            (p1[0],
             p1[1] +
                baseline),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.4,
            (255,
             255,
             255),
            1,
            8)
    # plt.imshow(image)
    cv2.imwrite('result.jpg', image)


load model...
predicting...


  img = Variable(img[None, :, :, :], volatile=True)


tensor([[[[0.1596, 0.4948, 0.2584,  ..., 0.4555, 0.4669, 0.5033],
          [0.6655, 0.7219, 0.6589,  ..., 0.4572, 0.6218, 0.4867],
          [0.3864, 0.7451, 0.6957,  ..., 0.3787, 0.4395, 0.4624],
          ...,
          [0.6735, 0.7859, 0.6114,  ..., 0.5226, 0.5779, 0.5514],
          [0.4831, 0.7628, 0.5898,  ..., 0.5501, 0.5114, 0.3790],
          [0.4542, 0.5874, 0.4995,  ..., 0.5352, 0.4172, 0.5330]],

         [[0.6073, 0.7305, 0.3532,  ..., 0.6820, 0.7047, 0.6878],
          [0.9307, 0.7722, 0.6431,  ..., 0.3262, 0.5214, 0.6216],
          [0.8156, 0.6101, 0.5429,  ..., 0.5703, 0.4781, 0.3470],
          ...,
          [0.8126, 0.8261, 0.6679,  ..., 0.6111, 0.6956, 0.7345],
          [0.7457, 0.7521, 0.6366,  ..., 0.7686, 0.5819, 0.6124],
          [0.4782, 0.5324, 0.6625,  ..., 0.8393, 0.5972, 0.5915]],

         [[0.7138, 0.7916, 0.4971,  ..., 0.6835, 0.5715, 0.5670],
          [0.7936, 0.8616, 0.4400,  ..., 0.5473, 0.5030, 0.3847],
          [0.4977, 0.6484, 0.4296,  ..., 0