In [None]:
### contents of utils.py

In [70]:
from __future__ import division

import torch 
import torch.nn as nn
import torch.nn.functional as F 
from torch.autograd import Variable
import numpy as np
import cv2 

def predict_transform(prediction,inp_dim,anchors,num_classes,CUDA=True):
    
    batch_size = prediction.size(0)
    stride =  inp_dim // prediction.size(2)
    grid_size = inp_dim // stride
    bbox_attrs = 5 + num_classes
    num_anchors = len(anchors)
    
    prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size)
    prediction = prediction.transpose(1,2).contiguous()
    prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs)
    
    anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
    
    prediction[:,:,0]=torch.sigmoid(prediction[:,:,0])
    prediction[:,:,1]=torch.sigmoid(prediction[:,:,1])
    prediction[:,:,4]=torch.sigmoid(prediction[:,:,4])
    
    grid=np.arange(grid_size)
    a,b=np.meshgrid(grid,grid)
    
    x_offset=torch.FloatTensor(a).view(-1,1)
    y_offset=torch.FloatTensor(b).view(-1,1)
    
    if CUDA:
        x_offset = x_offset.cuda()
        y_offset = y_offset.cuda()
    
    x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0)

    prediction[:,:,:2] += x_y_offset
    
    #log space transform height and the width
    anchors = torch.FloatTensor(anchors)

    if CUDA:
        anchors = anchors.cuda()

    anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0)
    prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors

    prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes]))
    
    prediction[:,:,:4] *= stride
    
    return prediction
    
    

In [78]:
def write_results(predictions,confidence,num_classes,nms_conf=0.4):
    #Our prediction tensor contains information about B x 10647 bounding boxes
    conf_mask=(predictions[:,:,4] > confidence).float().unsqueeze(2)
    prediction=prediction*conf_mask
    box_corner = prediction.new(prediction.shape)
    box_corner[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
    box_corner[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
    box_corner[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 
    box_corner[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
    prediction[:,:,:4] = box_corner[:,:,:4]
    
    batch_size = prediction.size(0)

    write = False
    
    for ind in range(batch_size):
        image_pred=prediction[ind]
        max_conf,max_conf_score=torch.max(image_pred[:,5:5+num_classes],1)
        max_conf = max_conf.float().unsqueeze(1)
        max_conf_score = max_conf_score.float().unsqueeze(1)
        seq = (image_pred[:,:5], max_conf, max_conf_score)
        image_pred = torch.cat(seq, 1)
        
        non_zero_ind =  (torch.nonzero(image_pred[:,4]))
        try:
            image_pred=image_pred[non_zero_ind.squeeze(),:].view(-1,7)
        except:
            continue
            
        if image_pred_.shape[0] == 0:
            continue 
            
        img_classes=unique(image_pred[:,-1])
        
        for cls in img_classes:
            

def unique(tensor):
    tensor_np=tensor.cpu().numpy()
    unique_np=np.unique(tensor_np)
    unique_tensor=torch.from_numpy(unique_np)
    tensor_res=tensor.new(unique_tensor.shape)
    tensor_res.copy_(unique_tensor)
    return tensor_res

In [77]:
# # t = torch.randn(2,3)
# # t.size(0)
# # # t.view?
# # t.view(1,6)
# grid=np.arange(5)
# a,b = np.meshgrid(grid,grid)

# x_offset=torch.FloatTensor(a).view(-1,1)
# y_offset=torch.FloatTensor(b).view(-1,1)

# torch.cat((x_offset,y_offset),1).repeat(1,3).view(-1,2)

In [49]:
### contents of darknet.py

In [50]:
from __future__ import division
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
# from util import * 


In [51]:
def parse_cfg(cfgfile):
    # parse config to store every block as a dict
    file=open(cfgfile,'r')
    lines = file.read().split('\n')
    lines = [x for x in lines if len(x)>0]
    lines = [x for x in lines if x[0] != '#']
    lines = [x.rstrip().lstrip() for x in lines]
    
    block={}
    blocks=[]
    
    for line in lines:
        if line[0]=='[':
            if len(block)!=0:
                blocks.append(block)
                block={}
            block['type']=line[1:-1].rstrip()
        else:
            key,value = line.split('=')
            block[key.rstrip()]=value.lstrip()
    blocks.append(block)
    return blocks
            


<hr>
<i><b> We have 5 types of layers in the list (mentioned above). PyTorch provides pre-built layers for types convolutional and upsample. We will have to write our own modules for the rest of the layers by extending the nn.Module class.</b></i>
<hr>

### nn.ModuleList

Our function will return a nn.ModuleList. This class is almost like a normal list containing nn.Module objects. However, when we add nn.ModuleList as a member of a nn.Module object (i.e. when we add modules to our network), all the parameters of nn.Module objects (modules) inside the nn.ModuleList are added as parameters of the nn.Module object (i.e. our network, which we are adding the nn.ModuleList as a member of) as well.

In [52]:
class EmptyLayer(nn.Module):
    def __init__(self):
        super(EmptyLayer, self).__init__()
        
class DetectionLayer(nn.Module):
    def __init__(self,anchors):
        super().__init__()
        self.anchors = anchors

In [53]:
def create_modules(blocks):
    net_info = blocks[0]
    module_list = nn.ModuleList()
    prev_filters=3
    output_filters=[]
    for index,x in enumerate(blocks[1:]):
        module = nn.Sequential()
        # check the type of block 
        # create a new module for block and append it to module list
        """
        nn.Sequential class is used to sequentially execute a number of nn.Module objects. 
        If you look at the cfg, you will realize a block may contain more than one layer. 
        For example, a block of type convolutional has a batch norm layer as well as leaky ReLU activation layer in addition to a convolutional layer. 
        We string together these layers using the nn.Sequential and it's the add_module function. 
        For example, this is how we create the convolutional and the upsample layers.
        """
        if x['type']=='convolutional':
            activation=x['activation']
            try:
                batch_normalize = int(x['batch_normalize'])
                bias = False
            except:
                batch_normalize = 0
                bias = True
            
            filters = int(x['filters'])
            padding = int(x["pad"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])
            
            if padding:
                pad=(kernel_size-1)//2
            else:
                pad=0
            
            # add conv layer
            conv = nn.Conv2d(prev_filters,filters, kernel_size, stride,pad,bias=bias)
            module.add_module("conv_{0}".format(index),conv)
            
            # add batch norm
            if batch_normalize:
                bn=nn.BatchNorm2d(filters)
                module.add_module("batch_norm_{0}".format(index),bn)
            
            # check the activation
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1, inplace = True)
                module.add_module("leaky_{0}".format(index), activn)
                
        # if it's an upsample layer, we use vilinear2dupsampling
        elif x['type']=='upsample':
            stride=int(x['stride'])
            upsample = nn.Upsample(scale_factor=2,mode='bilinear')
            module.add_module("upsample_{}".format(index),upsample)
            
        # let's work on route / shortcut layer
        elif x['type']=='route':
            x['layers']=x['layers'].split(',')
            start=int(x['layers'][0])
            try:
                end=int(x['layers'][1])
            except:
                end=0
            
            #Positive annotation
            if start>0:
                start=start-index
            if end>0:
                end=end-index
                
            
            route = EmptyLayer()
            module.add_module("route_{0}".format(index),route)
            if end<0:
                filters=output_filters[index+start] + output_filters[index+end]
            else:
                filters=output_filters[index+start]
        
        #shortcut
        elif x['type']=='shortcut':
            shortcut=EmptyLayer()
            module.add_module("shortcut_{}".format(index), shortcut)
        
        elif x['type']=='yolo':
            mask = list(map(int,x['mask'].split(',')))
            # mask = [int(x) for x in mask]
            anchors = list(map(int,x['anchors'].split(',')))
            anchors = [(anchors[i],anchors[i+1]) for i in range(0,len(anchors),2)]
            
            # use only ones indexed in mask
            anchors = [anchors[i] for i in mask]
            
            # we define a custom Detection Layer
            detection = DetectionLayer(anchors)
            module.add_module("Detection_{}".format(index),detection)
            
        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)
        
    return (net_info,module_list) 

        
            
            

In [54]:
m = nn.Sequential(nn.Linear(3,30))
print(m)
m.add_module("1",nn.Linear(30,3))
print(m)
l=nn.ModuleList()
l.append(m)
print(l)

Sequential(
  (0): Linear(in_features=3, out_features=30, bias=True)
)
Sequential(
  (0): Linear(in_features=3, out_features=30, bias=True)
  (1): Linear(in_features=30, out_features=3, bias=True)
)
ModuleList(
  (0): Sequential(
    (0): Linear(in_features=3, out_features=30, bias=True)
    (1): Linear(in_features=30, out_features=3, bias=True)
  )
)


In [55]:
## test create_module function

In [56]:
block = parse_cfg('./cfg/yolov3.cfg')    

In [57]:
temp_l = create_modules(block)

In [79]:
# let's work on forward pass
cfgfile='./cfg/yolov3.cfg'

class Darknet(nn.Module):
    def __init__(self,cfgfile):
        super().__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list=create_modules(self.blocks)
        
    def forward(self,x,CUDA):
        modules=self.blocks[1:]
        outputs={}
        """
        Since route and shortcut layers need output maps from previous layers, 
        we cache the output feature maps of every layer in a dict outputs
        """
        
        #iterate over modules for forward pass
        write=0
        for i,module in enumerate(modules):
            module_type=(module['type'])
            if module_type=='convolutional' or module_type=='upsample':
                x=self.module_list[i](x)
                
            elif module_type=='route':
                #handle for route
                layers=list(map(int,module['layers']))
                if layers[0]>0:
                    layers[0]-=i
                if len(layers)==1:
                    x=outputs[i+(layers[0])]
                else:
                    if layers[1]>0:
                        layers[1]-=i
                    map1=outputs[i+layers[0]]
                    map2=outputs[i+layers[1]]
                    
                    x=torch.cat((map1,map2),1)
            elif  module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[i-1] + outputs[i+from_]
                
            elif module_type=='yolo':
                anchors=self.module_list[i][0].anchors
                inp_dim=int(self.net_info['height'])
                
                num_classes=int(module['classes'])
                
                x = predict_transform(x,inp_dim,anchors,num_classes,CUDA)
                print("x=",x.shape)
                if not write:
                    detections = x
                    write = 1
                else:
                    detections = torch.cat((detections,x),1)
            outputs[i]=x
        return detections
    
    def load_weights(self,weightfile):
        fp=open(weightfile,'rb')
        
        #The first 5 values are header information 
        # 1. Major version number
        # 2. Minor Version Number
        # 3. Subversion number 
        # 4,5. Images seen by the network (during training)
        
        header = np.fromfile(fp,dtype=np.int32,count=5)
        self.header = torch.from_numpy(header)
        self.seen = self.header[3]
        
        
        weights=np.fromfile(fp,dtype=np.float32)
        ptr=0
        for i in range(len(self.module_list)):
            module_type=self.blocks[i+1]['type']
            #If module_type is convolutional load weights
            #Otherwise ignore.
            if module_type == "convolutional":
                model = self.module_list[i]
                try:
                    batch_normalize = int(self.blocks[i+1]["batch_normalize"])
                except:
                    batch_normalize = 0

                conv = model[0]
                
                if batch_normalize:
                    bn = model[1]
                    num_bn_biases = bn.bias.numel()
                    bn_biases=torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                    ptr += num_bn_biases
                    
                    bn_weights=torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr  += num_bn_biases
                    
                    bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr  += num_bn_biases
                
                    bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr  += num_bn_biases
                    
                    bn_biases = bn_biases.view_as(bn.bias.data)
                    bn_weights = bn_weights.view_as(bn_weights.data)
                    bn_running_mean = bn_running_mean.view_as(bn.running_mean)
                    bn_running_var = bn_running_var.view_as(bn.running_var)
                    
                    
                    # copy the data to a model
                    bn.bias.data.copy_(bn_biases)
                    bn.weight.data.copy_(bn_weights)
                    bn.running_mean.copy_(bn_running_mean)
                    bn.running_var.copy_(bn_running_var)
                    
                else:
                    num_biases = conv.bias.numel()
                    
                    conv_biases = torch.from_numpy(weights[ptr:ptr+num_biases])
                    ptr+=num_biases
                    
                    conv_biases = conv_biases.view_as(conv.bias.data)
                    conv.bias.data.copy_(conv_biases)
                
                
                # load weights of conv layer
                num_weights = conv.weight.numel()
                
                conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
                ptr = ptr + num_weights

                conv_weights = conv_weights.view_as(conv.weight.data)
                conv.weight.data.copy_(conv_weights)

In [80]:
def get_test_input():
    img = cv2.imread("dog-cycle-car.png")
    img = cv2.resize(img, (608,608))          #Resize to the input dimension
    img_ =  img[:,:,::-1].transpose((2,0,1))  # BGR -> RGB | H X W C -> C X H X W 
    img_ = img_[np.newaxis,:,:,:]/255.0       #Add a channel at 0 (for batch) | Normalise
    img_ = torch.from_numpy(img_).float()     #Convert to float
    img_ = Variable(img_)                     # Convert to Variable
    return img_

In [81]:
get_test_input()

tensor([[[[0.2392, 0.2392, 0.2392,  ..., 0.6588, 0.2824, 0.2588],
          [0.2392, 0.2392, 0.2392,  ..., 0.6196, 0.2510, 0.2431],
          [0.2392, 0.2392, 0.2392,  ..., 0.5569, 0.2196, 0.2431],
          ...,
          [0.6275, 0.6275, 0.6275,  ..., 0.4078, 0.2431, 0.2078],
          [0.6235, 0.6235, 0.6235,  ..., 0.3804, 0.2392, 0.1922],
          [0.6235, 0.6235, 0.6235,  ..., 0.3569, 0.2392, 0.1804]],

         [[0.2392, 0.2392, 0.2392,  ..., 0.6902, 0.2941, 0.2588],
          [0.2392, 0.2392, 0.2392,  ..., 0.6510, 0.2627, 0.2392],
          [0.2392, 0.2392, 0.2392,  ..., 0.5882, 0.2314, 0.2353],
          ...,
          [0.6627, 0.6627, 0.6627,  ..., 0.3922, 0.2275, 0.1922],
          [0.6588, 0.6588, 0.6588,  ..., 0.3647, 0.2235, 0.1804],
          [0.6588, 0.6588, 0.6588,  ..., 0.3412, 0.2235, 0.1647]],

         [[0.2235, 0.2235, 0.2235,  ..., 0.4314, 0.1098, 0.1137],
          [0.2235, 0.2235, 0.2235,  ..., 0.3961, 0.0863, 0.1020],
          [0.2235, 0.2235, 0.2235,  ..., 0

In [82]:
model = Darknet("cfg/yolov3.cfg")
model.load_weights("yolov3.weights")
inp = get_test_input()
pred = model(inp, torch.cuda.is_available())
print(pred)

x= torch.Size([1, 1083, 85])
x= torch.Size([1, 4332, 85])
x= torch.Size([1, 17328, 85])
tensor([[[1.5619e+01, 1.7705e+01, 1.1273e+02,  ..., 1.3319e-03,
          4.0802e-04, 3.6722e-04],
         [1.7995e+01, 9.8058e+00, 9.4117e+01,  ..., 3.6005e-04,
          5.4206e-04, 9.4166e-04],
         [2.2240e+01, 1.3196e+01, 4.0238e+02,  ..., 6.6262e-03,
          4.6335e-03, 5.4024e-03],
         ...,
         [6.0482e+02, 6.0185e+02, 3.5539e+00,  ..., 1.8485e-06,
          2.6420e-06, 5.1433e-07],
         [6.0296e+02, 6.0191e+02, 7.3237e+00,  ..., 1.8137e-05,
          2.7303e-05, 1.1349e-05],
         [6.0279e+02, 6.0502e+02, 4.2829e+01,  ..., 8.3372e-06,
          1.4732e-05, 1.6987e-05]]], grad_fn=<CatBackward>)
