# 数据提取和转换

In [2]:
import os 
import cv2
import xml.etree.ElementTree as ET

# The 20 PASCAL VOC object categories. The position of a name in this list is
# the integer class id written to the label files and used to index the class
# channels of the label tensor.
# The names are compared against the <name> tag of every annotated object, so
# they have to be spelled exactly as in the VOC XML files: the two-word
# categories are written without a space ('diningtable', 'pottedplant').
CLASSES = ['person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
           'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
           'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor']
# Number of bounding boxes stored per grid cell.
num_bbox = 2

# Root directory of the VOC2012 dataset.
dataset = r'/home/yuzhengbo/文档/VOCdevkit/VOC2012'

def convert(size,box):
    """Turn a pixel-space corner box into a normalised centre/size box.

    size: (image_width, image_height).
    box:  (xmin, ymin, xmax, ymax).

    Returns (x_center, y_center, width, height), where the x values are
    divided by the image width and the y values by the image height.
    """
    img_w, img_h = size[0], size[1]
    xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
    center_x = (xmin+xmax)/2.
    center_y = (ymin+ymax)/2.
    return center_x/img_w, center_y/img_h, (xmax-xmin)/img_w, (ymax-ymin)/img_h
    
def convert_annotation(image_id):
    """Convert one VOC annotation XML file into a plain-text label file.

    image_id: file name of the annotation inside `<dataset>/Annotations`
              (e.g. '2007_000027.xml').

    Writes `<dataset>/labels/<stem>.txt` with one line per kept object:
    `class_id x_center y_center width height`, all box values normalised by
    the image size (see `convert`). Objects whose name is not in CLASSES and
    objects flagged as difficult are skipped.
    """
    xml_path = dataset+'/Annotations'+'/%s'%image_id
    image_id = image_id.split('.')[0]
    # ET.parse opens and closes the file itself when given a path.
    root = ET.parse(xml_path).getroot()
    size = root.find('size')
    width = float(size.find('width').text)
    height = float(size.find('height').text)

    lines = []
    for obj in root.iter('object'):
        name = obj.find('name').text
        # find() returns an Element (or None); the flag itself is its text.
        difficult = obj.find('difficult')
        is_difficult = (difficult is not None and difficult.text is not None
                        and int(difficult.text) == 1)
        if name not in CLASSES or is_difficult:
            continue
        cls_id = CLASSES.index(name)
        bndbox = obj.find('bndbox')
        box = (float(bndbox.find('xmin').text),float(bndbox.find('ymin').text),
               float(bndbox.find('xmax').text),float(bndbox.find('ymax').text))
        text_ = convert([width,height],box)
        lines.append(str(cls_id)+' '+' '.join([str(i) for i in text_])+'\n')

    # The labels directory is not part of the VOC layout; create it on demand.
    os.makedirs(dataset+'/labels',exist_ok=True)
    with open(dataset+'/labels/%s.txt'%image_id,'w') as out_file:
        out_file.writelines(lines)
# convert_annotation('2007_000027.xml')

# for i in os.listdir(dataset+'/Annotations'):
#     convert_annotation(i)
# print('convert finish!')

In [3]:
# def show_labels_img(imgname):
#     print(dataset + "JPEGImages/" + imgname + ".jpg")
#     img = cv2.imread(dataset + "/JPEGImages/" + imgname + ".jpg")
#     h, w = img.shape[:2]
#     print(w,h)
#     label = []
#     with open(dataset+'/labels/'+imgname+".txt",'r') as flabel:
#         for label in flabel:
#             label = label.split(' ')
#             label = [float(x.strip()) for x in label]
#             print(CLASSES[int(label[0])])
#             pt1 = (int(label[1] * w - label[3] * w / 2), int(label[2] * h - label[4] * h / 2))
#             pt2 = (int(label[1] * w + label[3] * w / 2), int(label[2] * h + label[4] * h / 2))
#             cv2.putText(img,CLASSES[int(label[0])],pt1,cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255))
#             cv2.rectangle(img,pt1,pt2,(0,0,255,2))

#     cv2.imshow("img",img)
#     cv2.waitKey(0)
# #     cv2.destroyAllWindows()
# show_labels_img('2007_000027')

In [4]:
from torch.utils.data import Dataset
import numpy as np
from torchvision import transforms
import torch
import matplotlib.pyplot as plt
class VOC2012(Dataset):
    """VOC2012 detection dataset producing square 448x448 images and grid labels.

    The image ids are read from `ImageSets/Main/train.txt` or `val.txt`,
    images from `JPEGImages/` and the normalised box labels from `labels/`
    (all below the module-level `dataset` directory).
    """

    def __init__(self,is_train=True,is_aug=True):
        """
        is_train: use the train split when True, otherwise the val split.
        is_aug:   when True the image is passed through transforms.ToTensor();
                  when False the resized numpy image is returned unchanged.
        """
        split = 'train' if is_train else 'val'
        with open(dataset+'/ImageSets/Main/%s.txt'%split,'r') as f:
            # Skip blank lines so a trailing newline does not create an empty id.
            self.filenames = [x.strip() for x in f if x.strip()]

        self.imgpath = dataset+'/JPEGImages/'
        self.labelpath = dataset+'/labels/'
        self.is_aug = is_aug

    def __len__(self):
        """Number of images in the selected split."""
        return len(self.filenames)

    def __getitem__(self,item):
        """Return (image, labels) for the item-th image id.

        The image is zero-padded to a square, resized to 448x448 and, when
        `is_aug` is set, converted with ToTensor. The labels are the output of
        `convert_bbox2labels` converted with ToTensor.

        Raises FileNotFoundError if the image cannot be read and ValueError if
        the label file does not contain a multiple of five numbers.
        """
        name = self.filenames[item]
        img_file = self.imgpath+name+'.jpg'
        img = cv2.imread(img_file)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('cannot read image: '+img_file)
        h,w = img.shape[:2]
        input_size = 448
        padw,padh = 0,0
        # Pad the shorter side up to the longer one. The remainder of an odd
        # difference goes to the right/bottom so the result is exactly square,
        # which is what the box re-normalisation below assumes.
        if h>w:
            padw = (h-w)//2
            img = np.pad(img,((0,0),(padw,h-w-padw),(0,0)),'constant',constant_values=0)
        elif w>h:
            padh = (w-h)//2
            img = np.pad(img,((padh,w-h-padh),(0,0),(0,0)),'constant',constant_values=0)
        img = cv2.resize(img,(input_size,input_size))

        if self.is_aug:
            img = transforms.ToTensor()(img)

        label_file = self.labelpath+name+'.txt'
        with open(label_file) as f:
            # Flat list: [cls, x, y, w, h, cls, x, y, w, h, ...]
            bbox = [float(v) for v in f.read().split()]
        if len(bbox)%5!=0:
            raise ValueError('file:'+label_file+'has some questions!')

        # Boxes were normalised by the original image size; re-express them
        # relative to the padded square image.
        for i in range(len(bbox)//5):
            if h>w:
                bbox[i*5+1] = (bbox[i*5+1]*w+padw)/h
                bbox[i*5+3] = (bbox[i*5+3]*w)/h
            elif w>h:
                bbox[i*5+2] = (bbox[i*5+2]*h+padh)/w
                bbox[i*5+4] = (bbox[i*5+4]*h)/w

        labels = convert_bbox2labels(bbox)
        labels = transforms.ToTensor()(labels)
        return img,labels
def convert_bbox2labels(bbox):
    """Encode a flat list of normalised boxes as a 7x7 grid label array.

    bbox: flat sequence [cls, x_center, y_center, w, h, ...] with the box
          values normalised to the image size.

    Returns a numpy array of shape (7, 7, 5*num_bbox + len(CLASSES)). For the
    cell containing a box centre, every one of the num_bbox box slots holds
    (x offset inside the cell, y offset inside the cell, w, h, 1) and the
    channel of the box's class is set to 1. All box slots of a cell therefore
    carry the same target.
    """
    num_grid = 7
    gridsize = 1.0/num_grid
    cls_offset = 5*num_bbox      # class channels start after the box slots
    labels = np.zeros((num_grid,num_grid,cls_offset+len(CLASSES)))
    for i in range(len(bbox)//5):
        cls_id,cx,cy,bw,bh = bbox[i*5:i*5+5]
        # Grid cell that contains the box centre. Clamp so that a centre lying
        # exactly on the right/bottom image border (coordinate 1.0) falls into
        # the last cell instead of indexing past the grid.
        gridx = min(max(int(cx//gridsize),0),num_grid-1)
        gridy = min(max(int(cy//gridsize),0),num_grid-1)
        # Offset of the centre relative to the top-left corner of its cell,
        # measured in cell units.
        gridpx = cx/gridsize-gridx
        gridpy = cy/gridsize-gridy

        for b in range(num_bbox):
            labels[gridy,gridx,5*b:5*b+5] = np.array([gridpx,gridpy,bw,bh,1])
        labels[gridy,gridx,int(cls_id)+cls_offset] = 1
    return labels
    
# voc = VOC2012()
# img,labels = voc.__getitem__(1)
# print(img.shape,labels.shape)

In [5]:
class Loss_yolov1(torch.nn.Module):
    """Sum-of-squares YOLOv1 loss over a batch of grid predictions."""

    def __init__(self):
        # nn.Module.__init__ must really run, otherwise calling the module
        # (criterion(pred, labels)) fails on the missing hook tables.
        super(Loss_yolov1,self).__init__()

    @staticmethod
    def _cell_box_xyxy(t,i,offset,m,n,num_gridx,num_gridy):
        """Corner form (x1, y1, x2, y2) of the box stored at channels offset..offset+3 of cell (m, n)."""
        cx = (t[i,offset,m,n]+n)/num_gridx
        cy = (t[i,offset+1,m,n]+m)/num_gridy
        half_w = t[i,offset+2,m,n]/2
        half_h = t[i,offset+3,m,n]/2
        return (cx-half_w,cy-half_h,cx+half_w,cy+half_h)

    def forward(self,pred,labels):
        """Return the summed loss for a batch.

        pred, labels: tensors of shape (batch_size, 30, rows, cols) where the
        30 channels are (x, y, w, h, confidence) * 2 followed by the class
        scores.

        The result is the sum of the coordinate loss (weight 5), the
        confidence loss of the responsible box, the confidence loss of the
        non-responsible / empty boxes (weight 0.5) and the class loss. It is
        not divided by the batch size.
        """
        # Second-to-last dimension indexes rows (y), last dimension columns (x).
        num_gridy,num_gridx = labels.shape[-2:]
        noobj_confi_loss = 0     # confidence loss of boxes without an object
        coor_loss = 0            # coordinate loss of responsible boxes
        obj_confi_loss = 0       # confidence loss of responsible boxes
        class_loss = 0           # class score loss of cells with an object
        n_batch = labels.shape[0]

        for i in range(n_batch):
            for n in range(num_gridx):        # x direction
                for m in range(num_gridy):    # y direction
                    if labels[i,4,m,n] == 1:
                        bbox1_pred_xyxy = self._cell_box_xyxy(pred,i,0,m,n,num_gridx,num_gridy)
                        bbox2_pred_xyxy = self._cell_box_xyxy(pred,i,5,m,n,num_gridx,num_gridy)
                        bbox_gt_xyxy = self._cell_box_xyxy(labels,i,0,m,n,num_gridx,num_gridy)

                        # NOTE(review): the IoU values are used as confidence
                        # targets without being detached, so gradients also
                        # flow through them - confirm this is intended.
                        iou1 = calculate_iou(bbox1_pred_xyxy,bbox_gt_xyxy)
                        iou2 = calculate_iou(bbox2_pred_xyxy,bbox_gt_xyxy)
                        # The predicted box with the larger IoU is responsible
                        # for the object; the other one is pushed with the
                        # reduced no-object weight.
                        if iou1>iou2:
                            coor_loss += 5*(torch.sum((pred[i,0:2,m,n]-labels[i,0:2,m,n])**2)+
                                           torch.sum((pred[i,2:4,m,n].sqrt()-labels[i,2:4,m,n].sqrt())**2))
                            obj_confi_loss += (pred[i,4,m,n]-iou1)**2
                            noobj_confi_loss += 0.5*((pred[i,9,m,n]-iou2)**2)
                        else:
                            coor_loss += 5*(torch.sum((pred[i,5:7,m,n]-labels[i,5:7,m,n])**2)+
                                           torch.sum((pred[i,7:9,m,n].sqrt()-labels[i,7:9,m,n].sqrt())**2))
                            obj_confi_loss += (pred[i,9,m,n]-iou2)**2
                            noobj_confi_loss += 0.5*((pred[i,4,m,n]-iou1)**2)
                        class_loss += torch.sum((pred[i,10:,m,n]-labels[i,10:,m,n])**2)
                    else:
                        # Empty cell: both confidences should be zero.
                        noobj_confi_loss += 0.5*torch.sum(pred[i,[4,9],m,n]**2)
        loss =  coor_loss + obj_confi_loss + noobj_confi_loss + class_loss
        return loss
    
    
#计算IOU函数的设计

def calculate_iou(bbox1,bbox2):
    """Intersection over union of two boxes given as (x1, y1, x2, y2).

    Returns 0 when the boxes are disjoint or only touch (intersection area
    not positive), otherwise intersection / (area1 + area2 - intersection).
    """
    ax1,ay1,ax2,ay2 = bbox1[0],bbox1[1],bbox1[2],bbox1[3]
    bx1,by1,bx2,by2 = bbox2[0],bbox2[1],bbox2[2],bbox2[3]

    # Guard: one box lies completely to one side of the other.
    if ax2<bx1 or ay2<by1 or ax1>bx2 or ay1>by2:
        return 0

    # Corners of the overlapping rectangle.
    left = max(ax1,bx1)
    top = max(ay1,by1)
    right = min(ax2,bx2)
    bottom = min(ay2,by2)

    area1 = (ax2-ax1)*(ay2-ay1)
    area2 = (bx2-bx1)*(by2-by1)
    area_inter = (right-left)*(bottom-top)

    if not area_inter>0:
        return 0
    return area_inter/(area1+area2-area_inter)
    
    
# pred = torch.from_numpy(np.ones((1,30,7,7)))
# labels = torch.from_numpy(np.expand_dims(labels,axis=0))
# type(pred),type(labels),pred.shape,labels.shape

# loss = Loss_yolov1()
# loss.forward(pred,labels)

In [6]:
#网络模型搭建
import torchvision.models as tvmodel
from torch import nn
import torch

class YOLOv1_resnet(nn.Module):
    """YOLOv1 detector: ResNet-34 convolutional backbone + conv head + FC head."""

    def __init__(self):
        super().__init__()
        backbone = tvmodel.resnet34(pretrained=True)
        feat_channels = backbone.fc.in_features
        # Keep only the convolutional part of ResNet (drop its last two children).
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])

        # Four Conv-BN-LeakyReLU stages; only the second one uses stride 2.
        head = []
        for in_ch,stride in ((feat_channels,1),(1024,2),(1024,1),(1024,1)):
            head.append(nn.Conv2d(in_ch,1024,3,stride=stride,padding=1))
            head.append(nn.BatchNorm2d(1024))
            head.append(nn.LeakyReLU())
        self.conv_layers = nn.Sequential(*head)

        # Fully connected head mapping the flattened 1024x7x7 features to
        # 7*7*30 sigmoid outputs.
        self.fc = nn.Sequential(
            nn.Linear(1024*7*7,4096),
            nn.LeakyReLU(),
            nn.Linear(4096,7*7*30),
            nn.Sigmoid()
        )

    def forward(self,x):
        """Run the network and return a (batch, 5*num_bbox+len(CLASSES), 7, 7) tensor."""
        features = self.conv_layers(self.resnet(x))
        flat = features.view(features.size()[0],-1)
        out_channels = 5*num_bbox+len(CLASSES)
        return self.fc(flat).reshape(-1,out_channels,7,7)
        
# YOLO = YOLOv1_resnet()
# YOLO(torch.ones((1,3,448,448))).shape

In [None]:
#train
import visdom
from torch.utils.data import DataLoader
# Training hyper-parameters.
epoch = 50
batchsize = 5
lr = 0.0001
is_vis = False      # plot the running loss with visdom when True

# Build the dataset once and share it with the loader.
train_data = VOC2012()
train_dataloader = DataLoader(train_data,batch_size=batchsize,shuffle=True)

model = YOLOv1_resnet().cuda()

# Freeze the ResNet backbone. The network has three parts (ResNet, the custom
# conv layers and the fully connected layers); only the last two are trained.
# requires_grad has to be cleared on the parameters - setting it on the
# module object has no effect.
for param in model.resnet.parameters():
    param.requires_grad = False

criterion = Loss_yolov1()
optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad],
                            lr=lr,momentum=0.9,weight_decay=0.0005)
if is_vis:
    vis = visdom.Visdom()
    viswin1 = vis.line(np.array([0.]),np.array([0.]),opts=dict(title='Loss/Step',xlabel='100*step',ylabel='Loss'))

# torch.save does not create missing directories.
os.makedirs('./models_pkl',exist_ok=True)

for e in range(epoch):
    model.train()
    y1 = torch.Tensor([0.]).cuda()      # running sum of the loss in this epoch
    for i,(inputs,labels) in enumerate(train_dataloader):
        inputs = inputs.cuda()
        labels = labels.float().cuda()
        pred = model(inputs)
        loss = criterion.forward(pred,labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print("Epoch %d/%d| Step %d/%d| Loss: %.2f"%(e,epoch,i,len(train_data)//batchsize,loss.item()))
        # Detach so the running sum does not keep every step's graph alive.
        y1 += loss.detach()
        if is_vis and (i+1)%100==0:
            vis.line(np.array([y1.cpu().item()/(i+1)]),np.array([i+e*len(train_data)//batchsize]),win=viswin1,update='append')
    if (e+1)%10 == 0:
        torch.save(model,'./models_pkl/YOLOv1_epoch'+str(e+1)+'.pkl')
            

Epoch 0/50| Step 0/1143| Loss: 147.40
Epoch 0/50| Step 1/1143| Loss: 108.09
Epoch 0/50| Step 2/1143| Loss: 116.86
Epoch 0/50| Step 3/1143| Loss: 88.00
Epoch 0/50| Step 4/1143| Loss: 78.12
Epoch 0/50| Step 5/1143| Loss: 94.34
Epoch 0/50| Step 6/1143| Loss: 80.31
Epoch 0/50| Step 7/1143| Loss: 88.14
Epoch 0/50| Step 8/1143| Loss: 33.84
Epoch 0/50| Step 9/1143| Loss: 67.07
Epoch 0/50| Step 10/1143| Loss: 65.99
Epoch 0/50| Step 11/1143| Loss: 26.87
Epoch 0/50| Step 12/1143| Loss: 22.47
Epoch 0/50| Step 13/1143| Loss: 23.53
Epoch 0/50| Step 14/1143| Loss: 53.30
Epoch 0/50| Step 15/1143| Loss: 61.59
Epoch 0/50| Step 16/1143| Loss: 24.81
Epoch 0/50| Step 17/1143| Loss: 50.04
Epoch 0/50| Step 18/1143| Loss: 55.48
Epoch 0/50| Step 19/1143| Loss: 23.82
Epoch 0/50| Step 20/1143| Loss: 48.11
Epoch 0/50| Step 21/1143| Loss: 36.19
Epoch 0/50| Step 22/1143| Loss: 29.16
Epoch 0/50| Step 23/1143| Loss: 69.67
Epoch 0/50| Step 24/1143| Loss: 28.88
Epoch 0/50| Step 25/1143| Loss: 50.33
Epoch 0/50| Step 26

# 网络预测


In [None]:
def labels2bbox(matrix):
    """Decode a (7, 7, 30) prediction grid into boxes and run NMS on them.

    matrix: tensor of shape (7, 7, 30); per cell two (x, y, w, h, confidence)
            groups followed by the class scores.

    Builds a (98, 25) tensor with one row per predicted box, holding
    (x1, y1, x2, y2, confidence, class scores...), where both boxes of a cell
    share that cell's class scores, and returns `NMS` applied to it.

    Raises ValueError if the first two dimensions are not (7, 7).
    """
    if matrix.size()[:2] != (7,7):
        raise ValueError('Error:Wrong with labels size:',matrix.size())
    # Decoding is inference only: drop the autograd graph and work on the CPU,
    # where the result buffer lives.
    matrix = matrix.detach().cpu()
    bbox = torch.zeros((98,25))
    for i in range(7):          # y direction
        for j in range(7):      # x direction
            cell = matrix[i,j]
            for b in range(2):
                row = 2*(i*7+j)+b
                x,y,w,h,conf = cell[5*b:5*b+5]
                # Cell-relative centre -> image-relative centre.
                cx = (x+j)/7
                cy = (y+i)/7
                bbox[row,:4] = torch.stack([cx-w/2,cy-h/2,cx+w/2,cy+h/2])
                bbox[row,4] = conf
                bbox[row,5:] = cell[10:]
    return NMS(bbox)


def NMS(bbox,conf_tresh=0.1,iou_thresh=0.3):
    """Per-class non-maximum suppression.

    bbox: tensor of shape (n, 5 + num_classes) with rows
          (x1, y1, x2, y2, confidence, class scores...).
    conf_tresh: class-specific scores (confidence * class score) below this
                value are discarded.
    iou_thresh: a box is suppressed for a class when its IoU with a higher
                scoring surviving box of that class exceeds this value.

    Returns a tensor of shape (k, 6) with rows
    (class id, x1, y1, x2, y2, score) for the k boxes that keep a non-zero
    score for at least one class; class id and score refer to the highest
    surviving class-specific score of the box.
    """
    num_cls = bbox.size()[1]-5
    bbox_prob = bbox[:,5:].clone()
    bbox_confi = bbox[:,4].clone().unsqueeze(1).expand_as(bbox_prob)
    bbox_cls_spec_conf = bbox_confi*bbox_prob
    bbox_cls_spec_conf[bbox_cls_spec_conf<conf_tresh] = 0
    for c in range(num_cls):
        scores = bbox_cls_spec_conf[:,c]
        alive = (scores!=0).tolist()
        # Candidate rows for this class, best score first.
        order = [k for k in torch.sort(scores,descending=True).indices.tolist() if alive[k]]
        for pos,keep_idx in enumerate(order):
            if not alive[keep_idx]:
                continue
            for other_idx in order[pos+1:]:
                # Skip (do not stop at) boxes that were already suppressed:
                # live boxes may still follow them in the ranking.
                if not alive[other_idx]:
                    continue
                iou = calculate_iou(bbox[keep_idx,:4],bbox[other_idx,:4])
                if iou>iou_thresh:
                    alive[other_idx] = False
                    bbox_cls_spec_conf[other_idx,c] = 0
    keep = torch.max(bbox_cls_spec_conf,dim=1).values>0
    bbox = bbox[keep]
    bbox_cls_spec_conf = bbox_cls_spec_conf[keep]
    result = torch.zeros((bbox.size()[0],6))
    result[:,1:5] = bbox[:,:4]
    # Take class id and score from the same (post-suppression) table so they
    # always describe the same class.
    result[:,0] = torch.argmax(bbox_cls_spec_conf,dim=1)
    result[:,5] = torch.max(bbox_cls_spec_conf,dim=1).values
    return result

def draw_bbox(img,bbox):
    """Draw detection boxes with their class names on an image and show it.

    img:  image array with height and width as its first two dimensions.
    bbox: tensor with rows (class id, x1, y1, x2, y2, score), box values
          normalised to the image size.

    Opens an OpenCV window and blocks until a key is pressed.
    """
    h,w = img.shape[:2]
    # OpenCV drawing needs a contiguous buffer; the caller may pass a
    # transposed view.
    img = np.ascontiguousarray(img)
    for i in range(bbox.size()[0]):
        cls_id = int(bbox[i,0])
        # OpenCV expects integer pixel coordinates.
        p1 = (int(bbox[i,1]*w),int(bbox[i,2]*h))
        p2 = (int(bbox[i,3]*w),int(bbox[i,4]*h))
        cv2.rectangle(img,p1,p2,color=COLOR[cls_id])
        cv2.putText(img,CLASSES[cls_id],p1,cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255))
    cv2.imshow('bbox',img)
    cv2.waitKey(0)



# Drawing colours, one 3-tuple per class id (indexed like CLASSES).
COLOR = [
    (255, 0, 0), (255, 125, 0), (255, 255, 0), (255, 0, 125),
    (255, 0, 250), (255, 125, 125), (255, 125, 250), (125, 125, 0),
    (0, 255, 125), (255, 0, 0), (0, 0, 255), (125, 0, 255),
    (0, 125, 255), (0, 255, 255), (125, 125, 255), (0, 255, 0),
    (125, 255, 125), (255, 255, 255), (100, 100, 100), (0, 0, 0),
]


In [None]:
val_dataloader = DataLoader(VOC2012(is_train=False),batch_size=1,shuffle=False)

# NOTE(review): this unpickles a whole model object; only load checkpoint
# files you produced yourself.
model = torch.load('./models_pkl/YOLOv1_epoch50.pkl')
# Inference mode: use the BatchNorm running statistics and skip autograd.
model.eval()
with torch.no_grad():
    for i,(inputs,labels) in enumerate(val_dataloader):
        inputs = inputs.cuda()
        pred = model(inputs)
        pred = pred.squeeze(dim=0)     # (1, 30, 7, 7) -> (30, 7, 7)
        pred = pred.permute((1,2,0))   # -> (7, 7, 30)

        bbox = labels2bbox(pred)
        inputs = inputs.squeeze(dim=0)
        inputs = inputs.permute((1,2,0))  # -> (448, 448, 3)
        img = inputs.cpu().numpy()
        img = 255*img
        # np.uint8 is the dtype; np.uint(8) would be the scalar value 8.
        img = img.astype(np.uint8)
        draw_bbox(img,bbox.cpu())
#     print(bbox.size(),bbox)
#     with open(dataset+'/pre_labels')