# Assignment 3

# Instructions

1. You have to use only this notebook for all your code.
2. All the results and plots should be mentioned in this notebook.
3. For final submission, submit this notebook along with the report (usually 2-4 pages, LaTeX typeset, which includes the challenges faced and details of additional steps, if any).
4. Marking scheme
    -  **60%**: Your code should be able to detect bounding boxes using ResNet-18, with correct data loading and preprocessing. Plot any 5 correct and 5 incorrect sample detections from the test set in this notebook for both the approaches (1 layer and 2 layer detection), so a total of 20 plots.
    -  **20%**: Use two layers (multi-scale feature maps) to detect objects independently as in SSD (https://arxiv.org/abs/1512.02325).  In this method, 1st detection will be through the last layer of Resnet18 and the 2nd detection could be through any layer before the last layer. SSD uses lower resolution layers to detect larger scale objects. 
    -  **20%**: Implement Non-maximum suppression (NMS) (should not be imported from any library) on the candidate bounding boxes.
    
5. Report the AP for each of the three classes and the mAP score for the complete test set.

## Build the data
Use the following links to locally download the data:
<br/>Training and validation:
<br/>http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
<br/>Testing data:
<br/>http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
<br/>The dataset consists of images from 20 classes, with detection annotations included. The JPEGImages folder houses the images, and the Annotations folder has the object-wise labels for the objects in one xml file per image. You have to extract the object information, i.e. the [xmin, ymin] (the top left x,y co-ordinates) and the [xmax, ymax] (the bottom right x,y co-ordinates) of only the objects belonging to the three classes (aeroplane, bottle, chair). For parsing the xml file, you can use xml.etree.ElementTree. <br/>
<br/> Organize the data as follows:
<br/> For every image in the dataset, extract/crop the object patch from the image one by one using their respective co-ordinates:[xmin, ymin, xmax, ymax], resize the image to resnet_input, and store it with its class label information. Do the same for training/validation and test datasets. <br/>
##### Important
You also have to collect data for an extra background class which stands for the class of an object which is not a part of any of the 20 classes. For this, you can crop and resize any random patches from an image. A good idea is to extract patches that have low "intersection over union" with any object present in the image frame from the 20 Pascal VOC classes. The number of background images should be roughly around those of other class objects' images. Hence the total classes turn out to be four. This is important for applying the sliding window method later.


In [None]:
from __future__ import division, print_function, unicode_literals

import os
import pickle
import random
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.data
import torchvision.models as models
import torchvision.transforms as transforms
from skimage import io
from skimage.transform import resize
from torch.autograd import Variable

%matplotlib inline
plt.ion()

# Import other modules if required
# Can use other libraries as well

# Target patch size: the first two entries are the (rows, cols) every crop is
# resized to; the third entry is presumably the channel count (unused below).
resnet_input = [224, 224, 3]
# Working directory captured once at start-up; dataset paths are built from it.
c_dir = os.getcwd()
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [12]:
# Class names used by the patch classifier. The position in this tuple matches
# the index of the 1 in the one-hot labels produced by voc_dataset
# (0 = background, 1 = aeroplane, 2 = bottle, 3 = chair).
classes = ('__background__',
           'aeroplane',
           'bottle',
           'chair'
           )

In [27]:

class voc_dataset(torch.utils.data.Dataset):
    """Dataset of pre-cropped VOC patches with one-hot class labels.

    The dataset expects the following layout under ``root_dir``::

        train.pkl / test.pkl      pickled mapping {sample index: class name}
        train/img<idx>.jpg        patch images for the training split
        test/img<idx>.jpg         patch images for the test split

    Args:
        root_dir: directory containing the pickle files and image folders.
        train: ``True`` selects the training split; anything else selects
            the test split.
        transform: optional callable applied to each loaded image.
    """

    # Order of the entries in the one-hot label returned by __getitem__.
    _LABEL_ORDER = ('__background__', 'aeroplane', 'bottle', 'chair')

    def __init__(self, root_dir, train, transform=None):
        self.train = train
        self.root_dir = root_dir
        self.transform = transform
        dict_file = os.path.join(self.root_dir, self._split_name() + ".pkl")
        # NOTE: pickle must only be used on files this notebook generated
        # itself; never point root_dir at untrusted data.
        with open(dict_file, "rb") as filehandler:
            self.dict = pickle.load(filehandler)

    def _split_name(self):
        """Return the folder / pickle base name of the selected split."""
        return "train" if self.train is True else "test"

    def __len__(self):
        return len(self.dict)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, one_hot_label)``.

        Raises:
            ValueError: if the stored class name is not one of the four
                known classes.
        """
        img_name = os.path.join(self.root_dir, self._split_name(),
                                "img" + str(idx) + ".jpg")
        img = io.imread(img_name)

        if self.transform is not None:
            img = self.transform(img)

        name = self.dict[idx]
        if name not in self._LABEL_ORDER:
            raise ValueError("unknown class name %r for sample %r" % (name, idx))
        label = [0] * len(self._LABEL_ORDER)
        label[self._LABEL_ORDER.index(name)] = 1

        return img, np.array(label)
        


        
        
    

In [None]:

def get_iou(bb1, bb2):
    """Return the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        bb1: box given as ``(xmin, ymin, xmax, ymax)``.
        bb2: box given as ``(xmin, ymin, xmax, ymax)``.

    Returns:
        IoU as a float. Boxes that do not overlap, that only touch along an
        edge or corner, or that are degenerate (zero area) yield ``0.0``.
    """
    # Corners of the intersection rectangle.
    x_left = max(bb1[0], bb2[0])
    y_top = max(bb1[1], bb2[1])
    x_right = min(bb1[2], bb2[2])
    y_bottom = min(bb1[3], bb2[3])

    # An empty or zero-area intersection means IoU is 0. Returning early also
    # avoids dividing by a zero union when both boxes are degenerate.
    if x_right <= x_left or y_bottom <= y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    bb1_area = (bb1[2] - bb1[0]) * (bb1[3] - bb1[1])
    bb2_area = (bb2[2] - bb2[0]) * (bb2[3] - bb2[1])

    return intersection_area / float(bb1_area + bb2_area - intersection_area)


def sliding_window(image_s, stepSize, windowSize):
    """Yield ``(x, y, (windowSize[0], windowSize[1]))`` for every grid position.

    ``image_s`` is an image shape: ``image_s[0]`` bounds ``y`` and
    ``image_s[1]`` bounds ``x``. Positions advance by ``stepSize`` and are
    produced row by row. Only the shape is consulted; no pixels are read.
    """
    for top in range(0, image_s[0], stepSize):
        yield from (
            (left, top, (windowSize[0], windowSize[1]))
            for left in range(0, image_s[1], stepSize)
        )
            
def background(img, patches):
    """Return candidate background boxes that barely overlap any object.

    Windows of a few fixed sizes are placed on a regular grid over ``img``
    (stride = the shorter window side). A window is kept when it lies fully
    inside the image and its IoU with every box in ``patches`` is at most 0.2.

    Args:
        img: image array; only ``img.shape[0]`` (height) and
            ``img.shape[1]`` (width) are used.
        patches: iterable of object boxes ``(xmin, ymin, xmax, ymax)``.

    Returns:
        List of ``(xmin, ymin, xmax, ymax)`` tuples; may be empty.
    """
    window_sizes = [(400, 400), (224, 224), (120, 300), (250, 100)]  # (height, width)
    img_h, img_w = img.shape[0], img.shape[1]
    candidates = []
    for (winH, winW) in window_sizes:
        step = min(winH, winW)
        # The grid is iterated inline: the sliding_window helper is redefined
        # later in this notebook with a different contract (it takes an image,
        # not a shape), so relying on it here would depend on cell order.
        for y in range(0, img_h, step):
            for x in range(0, img_w, step):
                box = (x, y, x + winW, y + winH)
                # Skip windows that overhang the image; cropping them would
                # silently give a smaller patch than requested.
                if box[2] > img_w or box[3] > img_h:
                    continue
                if all(get_iou(box, patch) <= 0.2 for patch in patches):
                    candidates.append(box)

    return candidates


def build_dataset(typ="train"):
    """Build the patch-classification dataset for one split of PASCAL VOC.

    For every image in ``<cwd>/VOC_<typ>/JPEGImages`` the matching annotation
    is parsed. Each object whose name is in ``classes`` is cropped, resized to
    ``resnet_input`` and saved as ``<cwd>/data/<typ>/img<id>.jpg``. One
    background patch is additionally saved for every image that contains a
    target object and for every fourth image processed that contains none.
    Background patches are also copied to ``<cwd>/data/back``.
    The ``{id: class name}`` mapping is pickled to ``<cwd>/data/<typ>.pkl``.

    Args:
        typ: split name, ``"train"`` or ``"test"``.

    Returns:
        ``[n_aeroplane, n_bottle, n_chair, n_background_from_object_images,
        n_background_from_other_images]``.
    """
    print("Building dataset from PASCAL VOC DATASET")
    train_img_addr = os.path.join(c_dir, "VOC_" + typ, "JPEGImages")
    train_ann_addr = os.path.join(c_dir, "VOC_" + typ, "Annotations")
    out_dir = os.path.join(c_dir, "data", typ)
    back_dir = os.path.join(c_dir, "data", "back")
    for folder in (out_dir, back_dir):
        os.makedirs(folder, exist_ok=True)

    count_slot = {"aeroplane": 0, "bottle": 1, "chair": 2}
    patch_size = (resnet_input[0], resnet_input[1])

    train_id = 0
    train_dict = {}
    g_take_back = 0
    count = [0, 0, 0, 0, 0]
    # Sorted so that sample ids are reproducible between runs.
    for each in sorted(os.listdir(train_img_addr)):
        stem = os.path.splitext(each)[0]
        root = ET.parse(os.path.join(train_ann_addr, stem + ".xml")).getroot()
        # Read the image once per file, not once per annotated object; this
        # also guarantees the background crop uses the current image.
        img = io.imread(os.path.join(train_img_addr, each))

        objects = []
        for obj in root.findall('object'):
            name = obj.find('name').text
            if name not in classes:
                continue
            box = obj.find('bndbox')
            xmin = int(box.find('xmin').text)
            ymin = int(box.find('ymin').text)
            xmax = int(box.find('xmax').text)
            ymax = int(box.find('ymax').text)

            objects.append((xmin, ymin, xmax, ymax))
            r_img = resize(img[ymin:ymax, xmin:xmax], patch_size)
            io.imsave(os.path.join(out_dir, "img" + str(train_id) + ".jpg"), r_img)
            train_dict[train_id] = name
            train_id = train_id + 1
            print(train_id)
            if name in count_slot:
                count[count_slot[name]] += 1

        g_take_back = g_take_back + 1

        # Pick which counter a background patch from this image belongs to.
        if objects:
            slot = 3
        elif g_take_back % 4 == 0:
            slot = 4
        else:
            continue

        # NOTE(review): only target-class boxes are avoided here; the
        # assignment text asks for low IoU with objects of all 20 VOC classes.
        candidates = background(img, objects)
        if not candidates:
            # Image too small for any window, or fully covered by objects.
            continue
        # background() returns boxes, not pixels: crop one of them at random.
        bx0, by0, bx1, by1 = random.choice(candidates)
        r_img = resize(img[by0:by1, bx0:bx1], patch_size)
        io.imsave(os.path.join(out_dir, "img" + str(train_id) + ".jpg"), r_img)
        io.imsave(os.path.join(back_dir, "img" + str(train_id) + ".jpg"), r_img)
        train_dict[train_id] = '__background__'
        train_id = train_id + 1
        count[slot] += 1
        print("back", train_id)

    with open(os.path.join(c_dir, "data", typ + ".pkl"), "wb") as filehandler:
        pickle.dump(train_dict, filehandler)
    return count
            
# Build the cropped patch sets for both splits (writes images and pickles
# under ./data) and print the per-class counts returned for each.
count1 = build_dataset("train")
count2 = build_dataset("test")
print(count1,count2)


## The networks
<br/> Training the network on the created dataset. This will yield a classification network on the 4 classes of the VOC dataset. 

#### One layer network


In [30]:
# one layer network using pretrained resnet
def resnetOneLayer():
    """Return a pretrained ResNet-18 with a 4-way classification head.

    The final fully connected layer is replaced by a 4-output linear layer,
    and the parameters of the first seven child modules are frozen so that
    only the remaining children (including the new head) are trained.
    """
    net = models.resnet18(pretrained=True)
    net.fc = torch.nn.Linear(net.fc.in_features, 4)
    frozen_children = list(net.children())[:7]
    for module in frozen_children:
        for weight in module.parameters():
            weight.requires_grad = False
    return net

#### Two layer network

In [None]:
# two layer network using pretrained resnet
class resnetTwoLayer(torch.nn.Module):
    """ResNet-18 classifier that combines features from two depths.

    Everything up to (but excluding) the last three children of the pretrained
    ResNet-18 forms a frozen feature extractor. The third-from-last child is
    kept as a trainable block. The globally pooled outputs of both stages are
    concatenated (256 + 512 values) and fed to a 4-way linear layer.
    """

    def __init__(self):
        super().__init__()
        backbone_parts = list(models.resnet18(pretrained=True).children())

        # Frozen trunk: all children before the last residual stage.
        self.features = torch.nn.Sequential(*backbone_parts[:-3])
        for weight in self.features.parameters():
            weight.requires_grad = False

        # Trainable last residual stage of the backbone.
        self.last = backbone_parts[-3]
        self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
        # Head over the concatenated pooled features of both stages.
        self.fc = torch.nn.Sequential(torch.nn.Linear(256 + 512, 4))

    def forward(self, x):
        shallow = self.features(x)
        deep = self.last(shallow)
        pooled = [self.avgpool(shallow), self.avgpool(deep)]
        # (N, C, 1, 1) -> (N, C) after joining along the channel axis.
        merged = torch.cat(pooled, dim=1).squeeze(dim=3).squeeze(dim=2)
        return self.fc(merged)
        

### Model Training and testing function
Using the newly made pre-trained network to fine-tune the network.

In [36]:
def train(model, criterion, optimizer, num_epochs, batch_size):
    """Fine-tune ``model`` on the cropped training patches.

    Args:
        model: network returning one score per class for each input patch.
        criterion: loss taking ``(outputs, class_indices)``.
        optimizer: optimizer already bound to the model parameters.
        num_epochs: number of passes over the training set.
        batch_size: mini-batch size for the data loader.

    Prints a running loss/accuracy every 20 mini-batches and an average over
    all mini-batches at the end of each epoch.
    """
    c_dir = os.getcwd()
    composed_transform = transforms.Compose([
        transforms.ToPILImage(),
        # The size must be a single (h, w) tuple; a second positional argument
        # would be interpreted as the interpolation mode.
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    train_dataset = voc_dataset(root_dir=c_dir + '/data', train=True, transform=composed_transform)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

    # Make sure dropout / batch-norm layers are in training mode even if the
    # caller evaluated the model beforehand.
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        running_accuracy = 0.0
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        n_batches = 0

        for i, (inputs, labels) in enumerate(train_loader, 0):
            inputs = inputs.to(device)
            # Labels arrive one-hot; the loss expects class indices.
            targets = torch.max(labels.to(device), 1)[1]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            predictions = torch.max(outputs, 1)[1]
            batch_accuracy = (predictions == targets).float().mean().item()
            batch_loss = loss.item()

            running_loss += batch_loss
            running_accuracy += batch_accuracy
            # Epoch totals cover every batch, so short epochs (< 20 batches)
            # and trailing batches are included in the summary.
            epoch_loss += batch_loss
            epoch_accuracy += batch_accuracy
            n_batches += 1

            if i % 20 == 19:    # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f accurcy: %.3f' %
                      (epoch + 1, i + 1, running_loss / 20, running_accuracy / 20))
                running_loss = 0.0
                running_accuracy = 0.0

        if n_batches > 0:
            print("EPOCH SUMMARY: loss ", epoch_loss / n_batches, " Accuracy ", epoch_accuracy / n_batches)

    print('Finished Training')


def test_accuracy(model, model_state, batch_size):
    """Report overall and per-class accuracy on the cropped test patches.

    Args:
        model: network whose architecture matches the saved weights.
        model_state: path of the state dict written by ``torch.save``.
        batch_size: mini-batch size for the data loader.
    """
    c_dir = os.getcwd()
    # Evaluation uses a deterministic transform: no random flip, and the size
    # is passed as one (h, w) tuple.
    composed_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    test_dataset = voc_dataset(root_dir=c_dir + '/data', train=False, transform=composed_transform)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load(model_state, map_location=device))
    model.eval()

    # Per-class tallies, indexed by class id (0..3).
    class_correct = [0.0] * 4
    class_total = [0.0] * 4

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            # Labels arrive one-hot; convert to class indices.
            label = torch.max(labels.to(device), 1)[1]
            outputs = model(inputs)

            _, predicted = torch.max(outputs, 1)
            hits = (predicted == label)

            total += label.size(0)
            correct += hits.sum().item()

            for cls, hit in zip(label.tolist(), hits.tolist()):
                class_correct[cls] += int(hit)
                class_total[cls] += 1

    if total == 0:
        print('No test samples found; accuracy is undefined.')
        return

    print('Accuracy of the network on the test images: %d %%' % (100 * correct / total))

    for i in range(4):
        if class_total[i] == 0:
            # Avoid a division by zero for classes absent from the test set.
            print('Accuracy of %5s : N/A (no samples)' % i)
            continue
        print('Accuracy of %5s : %2d %%' % (
            i, 100 * class_correct[i] / class_total[i]))


#### One Layer Training

#### Two layer Training

In [None]:
#One Layer Detection
def train_and_test_one_leyer_model():
    """Fine-tune the one-layer ResNet-18 classifier, save it, and evaluate it.

    The weights are written to ``./model/one_layer_t.pt``.
    """
    model = resnetOneLayer().to(device)
    saved_name = "./model/one_layer_t.pt"

    num_epochs = 10
    learning_rate = 0.001
    hyp_momentum = 0.9
    batch_size = 30

    # Follow the global device instead of forcing CUDA, so CPU-only hosts work.
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=hyp_momentum)

    # Report total and trainable parameter counts.
    total_params = sum(p.numel() for p in model.parameters())
    print("total_params:", total_params)
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("total_trainable_params:", total_trainable_params)

    train(model, criterion, optimizer, num_epochs, batch_size)
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(saved_name), exist_ok=True)
    torch.save(model.state_dict(), saved_name)

    test_accuracy(model, saved_name, batch_size)


In [None]:
#Two Layer Detection (SSD)
def train_and_test_two_layer():
    """Fine-tune the two-layer ResNet-18 classifier, save it, and evaluate it.

    The weights are written to ``./model/two_layer_t.pt``, the checkpoint the
    two-layer detection test loads into ``resnetTwoLayer``.
    """
    # Must be the two-layer architecture: a one-layer state dict saved under
    # this name cannot be loaded by resnetTwoLayer later on.
    model = resnetTwoLayer().to(device)
    saved_name = "./model/two_layer_t.pt"

    num_epochs = 10
    learning_rate = 0.001
    hyp_momentum = 0.9
    batch_size = 30

    # Follow the global device instead of forcing CUDA, so CPU-only hosts work.
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=hyp_momentum)

    # Report total and trainable parameter counts.
    total_params = sum(p.numel() for p in model.parameters())
    print("total_params:", total_params)
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("total_trainable_params:", total_trainable_params)

    train(model, criterion, optimizer, num_epochs, batch_size)
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(saved_name), exist_ok=True)
    torch.save(model.state_dict(), saved_name)

    test_accuracy(model, saved_name, batch_size)


# Testing and Accuracy Calculation
For applying detection, use a sliding window method to test the above trained network on the detection task:<br/>
Take some windows of varying sizes and aspect ratios and slide them through the test image (with some stride in pixels) from left to right and top to bottom, compute the class scores for each window, and keep only those that are above a certain threshold value. A similar approach is used in the Faster R-CNN paper (Ren, He, Girshick and Sun), which uses three different scales/sizes and three different aspect ratios, making a total of nine windows per sliding position. You need to write the code and use it in the testing code to find the predicted boxes and their classes.

In [None]:

def pyramid(image, scale, minSize=(60, 60)):
    """Yield ``image`` followed by successively smaller copies of it.

    Each level is the previous one shrunk by ``scale`` in both dimensions and
    converted back to ``uint8``. Generation stops before the first level whose
    height is below ``minSize[0]`` or whose width is below ``minSize[1]``.

    Args:
        image: image array; ``shape[0]`` is the height, ``shape[1]`` the width.
        scale: shrink factor between consecutive levels; must be > 1.
        minSize: ``(min_height, min_width)`` of the smallest level to yield.

    Raises:
        ValueError: if ``scale`` is not greater than 1, because the pyramid
            would never terminate.
    """
    if scale <= 1:
        raise ValueError("scale must be greater than 1, got %r" % (scale,))

    yield image
    while True:
        # Dimensions of the next level.
        h = int(image.shape[0] / scale)
        w = int(image.shape[1] / scale)

        # Check the size before resizing: this avoids a wasted resize for the
        # level that would be discarded, and never resizes to an empty image.
        if h < minSize[0] or w < minSize[1] or h < 1 or w < 1:
            break

        image = resize(image, (h, w))
        # The resized image is rescaled by 255 and stored as uint8 so that
        # every level has the same dtype as a freshly loaded image.
        image = (image * 255).astype('uint8')

        yield image

def sliding_window(image, stepSize, windowSize):
    """Slide a window across ``image`` and yield ``(x, y, crop)`` per position.

    NOTE: this definition replaces the earlier shape-based helper of the same
    name once this cell has run; it takes the image itself, not its shape.

    Args:
        image: array indexed as ``image[row, col, ...]``.
        stepSize: stride in pixels along both axes.
        windowSize: ``(height, width)`` of the window.

    Yields:
        ``(x, y, window)`` where ``window`` is
        ``image[y:y + height, x:x + width]``. Windows at the right and bottom
        borders are truncated by the slice, so callers that need full-size
        windows must check ``window.shape``.
    """
    for y in range(0, image.shape[0], stepSize):
        for x in range(0, image.shape[1], stepSize):
            yield (x, y, image[y:y + windowSize[0], x:x + windowSize[1]])
    

In [None]:
def non_maximum_suppression(dets, thresh):
    """Greedy non-maximum suppression over scored boxes.

    Args:
        dets: 2-D array whose first five columns are
            ``xmin, ymin, xmax, ymax, score``; extra columns are carried along.
        thresh: boxes whose IoU with an already selected box exceeds this
            value are discarded.

    Returns:
        The selected rows of ``dets`` in descending score order, or an empty
        list when ``dets`` is empty. Suppression ignores any class column.
    """
    if len(dets) == 0:
        return []

    left, top, right, bottom, conf = (dets[:, col] for col in range(5))
    # Areas use the inclusive-pixel (+1) convention.
    box_area = (right - left + 1) * (bottom - top + 1)

    remaining = conf.argsort()[::-1]
    selected = []
    while remaining.size > 0:
        best, others = remaining[0], remaining[1:]
        selected.append(best)

        # Overlap of the best box with every box still in play.
        overlap_w = np.maximum(
            0.0,
            np.minimum(right[best], right[others]) - np.maximum(left[best], left[others]) + 1)
        overlap_h = np.maximum(
            0.0,
            np.minimum(bottom[best], bottom[others]) - np.maximum(top[best], top[others]) + 1)
        inter = overlap_w * overlap_h
        iou = inter / (box_area[best] + box_area[others] - inter)

        # Keep only boxes that do not overlap the best one too much.
        remaining = others[iou <= thresh]

    return dets[selected]

Test the trained model on the test dataset.

In [None]:
def get_ground_truth(xml_file):
    """Read the target-class boxes from one VOC annotation file.

    Only objects named ``aeroplane``, ``bottle`` or ``chair`` are returned;
    they get the labels 1, 2 and 3 respectively.

    Args:
        xml_file: path of the annotation XML.

    Returns:
        Tuple ``(actual_boxes, boxes, labels, difficulties)``:
        ``actual_boxes`` is a list of ``[xmin, ymin, xmax, ymax, label]``;
        ``boxes`` is a float tensor of shape ``(n, 4)`` (``(0, 4)`` when no
        target object is present); ``labels`` and ``difficulties`` are float
        tensors of length ``n``. Every difficulty is 0.
    """
    label_ids = {'aeroplane': 1, 'bottle': 2, 'chair': 3}
    root = ET.parse(xml_file).getroot()

    coords = []
    labels = []
    difficulties = []
    actual_boxes = []
    for obj in root.findall('object'):
        label = label_ids.get(obj.find('name').text)
        if label is None:
            continue  # not one of the three target classes
        box = obj.find('bndbox')
        corners = [int(box.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]

        coords.append(corners)
        labels.append(float(label))
        difficulties.append(0.0)
        actual_boxes.append(corners + [label])

    if coords:
        r_boxes = torch.tensor(coords, dtype=torch.float32)
    else:
        # Keep the (n, 4) layout even when there is nothing to return, so the
        # result can be concatenated with other box tensors.
        r_boxes = torch.zeros((0, 4), dtype=torch.float32)
    return actual_boxes, r_boxes, torch.tensor(labels), torch.tensor(difficulties)

In [None]:
def give_bounding_box(model, img_name, actual_box):
    """Detect objects in one image with sliding windows plus NMS.

    The whole image (inset by 10 px) and every full-size window from an image
    pyramid are classified by ``model``. Windows predicted as background,
    low-confidence bottle/chair windows, and bottle/chair windows with an
    implausible aspect ratio are dropped. The rest go through non-maximum
    suppression.

    Args:
        model: patch classifier (one- or two-layer), already in eval mode.
        img_name: path of the test image.
        actual_box: ground-truth boxes of the image. Not used for detection;
            kept so callers can pass it for debugging or plotting.

    Returns:
        Tuple ``(boxes, labels, scores)`` of float32 tensors with shapes
        ``(n, 4)``, ``(n,)`` and ``(n,)``; ``n`` may be 0.
    """
    image = io.imread(img_name)

    composed_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    # The full frame is always a candidate, reported slightly inset.
    image_batch = [composed_transform(image)]
    image_shape = [(10, 10, image.shape[1] - 10, image.shape[0] - 10)]

    window_sizes = [(200, 200), (96, 256), (156, 96)]  # (height, width)
    for resized in pyramid(image, scale=1.8):
        # Map pyramid-level coordinates back to the original image. Each axis
        # gets its own factor because integer rounding of the level size can
        # make the two ratios differ slightly.
        y_scale = image.shape[0] / resized.shape[0]
        x_scale = image.shape[1] / resized.shape[1]
        for (winH, winW) in window_sizes:
            for (x, y, window) in sliding_window(resized, stepSize=50, windowSize=(winH, winW)):
                # Border windows are truncated by the slice; skip them.
                if window.shape[0] != winH or window.shape[1] != winW:
                    continue
                image_batch.append(composed_transform(window))
                image_shape.append((x * x_scale, y * y_scale,
                                    (x + winW) * x_scale, (y + winH) * y_scale))

    with torch.no_grad():
        outputs = model(torch.stack(image_batch).to(device))
        # One vectorised softmax over the batch instead of one call per row.
        top_scores, top_labels = torch.softmax(outputs, dim=1).max(dim=1)
    top_scores = top_scores.tolist()
    top_labels = top_labels.tolist()

    candidates = []
    for (xmin, ymin, xmax, ymax), score, label in zip(image_shape, top_scores, top_labels):
        if label == 0:
            continue  # background
        if score < 0.7 and (label == 2 or label == 3):
            continue  # low-confidence bottle / chair
        if label == 2 and (ymax - ymin) < (xmax - xmin):
            continue  # bottles are expected to be taller than wide
        if label == 3 and (1.1 * (ymax - ymin)) < (xmax - xmin):
            continue  # chairs are expected not to be much wider than tall
        candidates.append((xmin, ymin, xmax, ymax, score, label))

    pick = non_maximum_suppression(np.array(candidates), 0.1)

    if len(pick) == 0:
        return (torch.zeros((0, 4), dtype=torch.float32),
                torch.zeros(0, dtype=torch.float32),
                torch.zeros(0, dtype=torch.float32))

    pick = np.asarray(pick, dtype=np.float32)
    r_boxes = torch.tensor(pick[:, :4], dtype=torch.float32)
    labels = torch.tensor(pick[:, 5], dtype=torch.float32)
    scores = torch.tensor(pick[:, 4], dtype=torch.float32)
    return r_boxes, labels, scores

In [None]:
#One Layer Detection
def test():
    """Run one-layer detection over the VOC test images and print AP / mAP.

    Loads ``./model/one_layer_t.pt`` into ``resnetOneLayer``, detects objects
    in every test image that contains a target-class object, and prints the
    running result every 50 images plus the final result.
    """
    model = resnetOneLayer().to(device)
    model.load_state_dict(torch.load("./model/one_layer_t.pt", map_location=device))
    model.eval()

    map_th = 0.5

    c_dir = os.getcwd()
    typ = "test"
    train_img_addr = os.path.join(c_dir, "VOC_" + typ, "JPEGImages")
    train_ann_addr = os.path.join(c_dir, "VOC_" + typ, "Annotations")
    train_images = os.listdir(train_img_addr)

    det_boxes = list()
    det_labels = list()
    det_scores = list()
    true_boxes = list()
    true_labels = list()
    true_difficulties = list()

    ct = 0

    with torch.no_grad():
        for each in train_images:
            ct += 1
            # Progress report; skipped while nothing has been collected,
            # because the metric cannot be computed on empty lists.
            if ct % 50 == 0 and true_boxes:
                APs, mAP = calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties, map_th)
                print(ct, APs, mAP)

            img_name = os.path.join(train_img_addr, each)
            xml_file = os.path.join(train_ann_addr, os.path.splitext(each)[0] + '.xml')
            actual_boxes, act_boxes, act_labels, actual_difficulties = get_ground_truth(xml_file)
            if len(actual_boxes) == 0:
                continue  # image has no aeroplane / bottle / chair
            p_boxes, p_labels, p_scores = give_bounding_box(model, img_name, actual_boxes)
            true_boxes.append(act_boxes)
            true_labels.append(act_labels)
            true_difficulties.append(actual_difficulties)
            det_boxes.append(p_boxes)
            det_labels.append(p_labels)
            det_scores.append(p_scores)

        if not true_boxes:
            print(ct, "no test image contains a target-class object")
            return
        APs, mAP = calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties, map_th)
        print(ct, APs, mAP)

In [None]:
%time test(resnet18)

In [None]:
#Two Layer Detection
def test():
    """Run two-layer detection over the VOC test images and print AP / mAP.

    Loads ``./model/two_layer_t.pt`` into ``resnetTwoLayer``, detects objects
    in every test image that contains a target-class object, and prints the
    running result every 50 images plus the final result.
    """
    model = resnetTwoLayer().to(device)
    model.load_state_dict(torch.load("./model/two_layer_t.pt", map_location=device))
    model.eval()

    map_th = 0.5

    c_dir = os.getcwd()
    typ = "test"
    train_img_addr = os.path.join(c_dir, "VOC_" + typ, "JPEGImages")
    train_ann_addr = os.path.join(c_dir, "VOC_" + typ, "Annotations")
    train_images = os.listdir(train_img_addr)

    det_boxes = list()
    det_labels = list()
    det_scores = list()
    true_boxes = list()
    true_labels = list()
    true_difficulties = list()

    ct = 0

    with torch.no_grad():
        for each in train_images:
            ct += 1
            # Progress report; skipped while nothing has been collected,
            # because the metric cannot be computed on empty lists.
            if ct % 50 == 0 and true_boxes:
                APs, mAP = calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties, map_th)
                print(ct, APs, mAP)

            img_name = os.path.join(train_img_addr, each)
            xml_file = os.path.join(train_ann_addr, os.path.splitext(each)[0] + '.xml')
            actual_boxes, act_boxes, act_labels, actual_difficulties = get_ground_truth(xml_file)
            if len(actual_boxes) == 0:
                continue  # image has no aeroplane / bottle / chair
            p_boxes, p_labels, p_scores = give_bounding_box(model, img_name, actual_boxes)
            true_boxes.append(act_boxes)
            true_labels.append(act_labels)
            true_difficulties.append(actual_difficulties)
            det_boxes.append(p_boxes)
            det_labels.append(p_labels)
            det_scores.append(p_scores)

        if not true_boxes:
            print(ct, "no test image contains a target-class object")
            return
        APs, mAP = calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties, map_th)
        print(ct, APs, mAP)


### Map Calculation script
This script is taken from a third-party source (a GitHub repository's utils) and changed accordingly as needed.

In [None]:


# Label map
# Foreground class names; their ids start at 1 so that 0 is left for background.
voc_labels = ('aeroplane', 'bottle', 'chair')
label_map = dict(zip(voc_labels, range(1, len(voc_labels) + 1)))
label_map['background'] = 0
# Reverse lookup: class id -> class name.
rev_label_map = dict((class_id, class_name) for class_name, class_id in label_map.items())

def find_jaccard_overlap(set_1, set_2):
    """
    Compute the Jaccard overlap (IoU) between every pair of boxes drawn from two sets.
    Boxes are in boundary coordinates (xmin, ymin, xmax, ymax).
    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: IoU of each box in set 1 with each box in set 2, a tensor of dimensions (n1, n2)
    """
    overlap = find_intersection(set_1, set_2)  # (n1, n2)

    # Box areas as width * height
    widths_1 = set_1[:, 2] - set_1[:, 0]
    heights_1 = set_1[:, 3] - set_1[:, 1]
    widths_2 = set_2[:, 2] - set_2[:, 0]
    heights_2 = set_2[:, 3] - set_2[:, 1]
    area_1 = widths_1 * heights_1  # (n1)
    area_2 = widths_2 * heights_2  # (n2)

    # Broadcast (n1, 1) against (1, n2) to get the pairwise union
    total = area_1[:, None] + area_2[None, :] - overlap  # (n1, n2)

    return overlap / total  # (n1, n2)



def find_intersection(set_1, set_2):
    """
    Compute the intersection area between every pair of boxes drawn from two sets.
    Boxes are in boundary coordinates (xmin, ymin, xmax, ymax).
    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: intersection of each box in set 1 with each box in set 2, a tensor of dimensions (n1, n2)
    """
    # Broadcast (n1, 1, 2) against (1, n2, 2) to pair every box with every other
    top_left = torch.max(set_1[:, None, :2], set_2[None, :, :2])  # (n1, n2, 2)
    bottom_right = torch.min(set_1[:, None, 2:], set_2[None, :, 2:])  # (n1, n2, 2)
    # Negative extents mean the boxes do not overlap along that axis
    extent = (bottom_right - top_left).clamp(min=0)  # (n1, n2, 2)
    return extent[..., 0] * extent[..., 1]  # (n1, n2)

def calculate_mAP(det_boxes, det_labels, det_scores, true_boxes, true_labels, true_difficulties, map_th=0.5):
    """
    Calculate the Mean Average Precision (mAP) of detected objects.

    See https://medium.com/@jonathan_hui/map-mean-average-precision-for-object-detection-45c121a31173 for an explanation

    For every class except background, detections are visited in order of decreasing score and matched to the
    true object of the same class in the same image that they overlap the most. The average precision of a class
    is the mean, over the 11 recall thresholds 0, 0.1, ..., 1, of the highest precision reached at a recall at or
    above that threshold. A class with no detections, or with no non-difficult true objects, gets an AP of 0.

    Relies on the module-level 'label_map', 'rev_label_map' and 'device'.

    :param det_boxes: list of tensors, one tensor for each image containing detected objects' bounding boxes
    :param det_labels: list of tensors, one tensor for each image containing detected objects' labels
    :param det_scores: list of tensors, one tensor for each image containing detected objects' labels' scores
    :param true_boxes: list of tensors, one tensor for each image containing actual objects' bounding boxes
    :param true_labels: list of tensors, one tensor for each image containing actual objects' labels
    :param true_difficulties: list of tensors, one tensor for each image containing actual objects' difficulty (0 or 1)
    :param map_th: Jaccard overlap a detection must strictly exceed to be matched with a true object (default 0.5)
    :return: dictionary of average precisions keyed by class name, mean average precision (mAP)
    """
    assert len(det_boxes) == len(det_labels) == len(det_scores) == len(true_boxes) == len(
        true_labels) == len(
        true_difficulties)  # these are all lists of tensors of the same length, i.e. number of images
    n_classes = len(label_map)

    # Store all (true) objects in a single continuous tensor while keeping track of the image it is from
    true_images = list()
    for i, image_labels in enumerate(true_labels):
        true_images.extend([i] * image_labels.size(0))
    true_images = torch.LongTensor(true_images).to(
        device)  # (n_objects), n_objects is the total no. of objects across all images
    true_boxes = torch.cat(true_boxes, dim=0)  # (n_objects, 4)
    true_labels = torch.cat(true_labels, dim=0)  # (n_objects)
    true_difficulties = torch.cat(true_difficulties, dim=0)  # (n_objects)

    assert true_images.size(0) == true_boxes.size(0) == true_labels.size(0)

    # Store all detections in a single continuous tensor while keeping track of the image it is from
    det_images = list()
    for i, image_labels in enumerate(det_labels):
        det_images.extend([i] * image_labels.size(0))
    det_images = torch.LongTensor(det_images).to(device)  # (n_detections)
    det_boxes = torch.cat(det_boxes, dim=0)  # (n_detections, 4)
    det_labels = torch.cat(det_labels, dim=0)  # (n_detections)
    det_scores = torch.cat(det_scores, dim=0)  # (n_detections)

    assert det_images.size(0) == det_boxes.size(0) == det_labels.size(0) == det_scores.size(0)

    # The recall thresholds are the same for every class, so build them once
    recall_thresholds = torch.arange(start=0, end=1.1, step=.1).tolist()  # (11)

    # Calculate APs for each class (except background); classes skipped below keep an AP of 0
    average_precisions = torch.zeros((n_classes - 1), dtype=torch.float)  # (n_classes - 1)
    for c in range(1, n_classes):
        # Extract only objects with this class
        true_class_mask = true_labels == c
        true_class_images = true_images[true_class_mask]  # (n_class_objects)
        true_class_boxes = true_boxes[true_class_mask]  # (n_class_objects, 4)
        true_class_difficulties = true_difficulties[true_class_mask]  # (n_class_objects)
        # Difficult objects are ignored; counting zeros avoids arithmetic on the flags
        n_easy_class_objects = (true_class_difficulties == 0).sum().item()

        # Extract only detections with this class
        det_class_mask = det_labels == c
        det_class_images = det_images[det_class_mask]  # (n_class_detections)
        det_class_boxes = det_boxes[det_class_mask]  # (n_class_detections, 4)
        det_class_scores = det_scores[det_class_mask]  # (n_class_detections)
        n_class_detections = det_class_boxes.size(0)

        # Nothing to score, or recall would be 0 / 0: leave this class's AP at 0
        if n_class_detections == 0 or n_easy_class_objects == 0:
            continue

        # Keep track of which true objects with this class have already been 'detected'
        # So far, none
        n_class_objects = true_class_images.size(0)
        true_class_boxes_detected = torch.zeros(n_class_objects, dtype=torch.bool,
                                                device=true_class_images.device)  # (n_class_objects)
        # Positions of the objects in the class-level tensors; kept on the same device as the
        # masks that index it, since a CPU tensor cannot be indexed with a mask on another device
        class_object_indices = torch.arange(n_class_objects, device=true_class_images.device)  # (n_class_objects)

        # Sort detections in decreasing order of confidence/scores
        det_class_scores, sort_ind = torch.sort(det_class_scores, dim=0, descending=True)  # (n_class_detections)
        det_class_images = det_class_images[sort_ind]  # (n_class_detections)
        det_class_boxes = det_class_boxes[sort_ind]  # (n_class_detections, 4)

        # In the order of decreasing scores, check if true or false positive
        true_positives = torch.zeros((n_class_detections), dtype=torch.float).to(device)  # (n_class_detections)
        false_positives = torch.zeros((n_class_detections), dtype=torch.float).to(device)  # (n_class_detections)
        for d in range(n_class_detections):
            this_detection_box = det_class_boxes[d].unsqueeze(0)  # (1, 4)

            # Find objects in the same image with this class and their difficulties
            in_this_image = true_class_images == det_class_images[d]  # (n_class_objects)
            object_boxes = true_class_boxes[in_this_image]  # (n_class_objects_in_img, 4)
            object_difficulties = true_class_difficulties[in_this_image]  # (n_class_objects_in_img)
            # If no such object in this image, then the detection is a false positive
            if object_boxes.size(0) == 0:
                false_positives[d] = 1
                continue

            # Find maximum overlap of this detection with objects in this image of this class
            overlaps = find_jaccard_overlap(this_detection_box, object_boxes)  # (1, n_class_objects_in_img)
            max_overlap, ind = torch.max(overlaps.squeeze(0), dim=0)  # (), () - scalars

            # 'ind' indexes the image-level tensors 'object_boxes', 'object_difficulties';
            # 'original_ind' is the same object's index in the class-level tensors,
            # which is what 'true_class_boxes_detected' is indexed by
            original_ind = class_object_indices[in_this_image][ind]

            # If the maximum overlap is greater than the threshold 'map_th', it's a match
            if max_overlap.item() > map_th:
                # If the object it matched with is 'difficult', ignore it (neither true nor false positive)
                if object_difficulties[ind] == 0:
                    if not true_class_boxes_detected[original_ind]:
                        # First detection of this object: a true positive
                        true_positives[d] = 1
                        true_class_boxes_detected[original_ind] = True  # this object is now accounted for
                    else:
                        # This object is already accounted for: a false positive
                        false_positives[d] = 1
            # Otherwise, the detection occurs in a different location than the actual object, and is a false positive
            else:
                false_positives[d] = 1

        # Compute cumulative precision and recall at each detection in the order of decreasing scores
        cumul_true_positives = torch.cumsum(true_positives, dim=0)  # (n_class_detections)
        cumul_false_positives = torch.cumsum(false_positives, dim=0)  # (n_class_detections)
        cumul_precision = cumul_true_positives / (
                cumul_true_positives + cumul_false_positives + 1e-10)  # (n_class_detections)
        cumul_recall = cumul_true_positives / n_easy_class_objects  # (n_class_detections)

        # Find the mean of the maximum of the precisions corresponding to recalls above the threshold 't'
        precisions = torch.zeros((len(recall_thresholds)), dtype=torch.float).to(device)  # (11)
        for i, t in enumerate(recall_thresholds):
            recalls_above_t = cumul_recall >= t
            if recalls_above_t.any():
                precisions[i] = cumul_precision[recalls_above_t].max()
            else:
                precisions[i] = 0.
        average_precisions[c - 1] = precisions.mean()  # c is in [1, n_classes - 1]

    # Calculate Mean Average Precision (mAP)
    mean_average_precision = average_precisions.mean().item()

    # Keep class-wise average precisions in a dictionary keyed by class name
    average_precisions = {rev_label_map[c]: v for c, v in enumerate(average_precisions.tolist(), start=1)}

    return average_precisions, mean_average_precision