In [1]:
import torch
from torch.utils.data import DataLoader,Dataset
from torchvision import models
import PIL
from torchvision import transforms

In [2]:
import os

In [3]:
path_imgs="/Users/gunnvantsaini/OneDrive/project_codes/content/dl_basics/sony/data/guns-object-detection/Images"
# Skip hidden entries (e.g. macOS ".DS_Store"): the numeric sort applied to
# img_names afterwards calls int() on each file stem and would raise on them.
img_names=[name for name in os.listdir(path_imgs) if not name.startswith(".")]

In [4]:
# Order the image names in place by the integer before the first ".",
# so that e.g. "10.jpeg" comes after "2.jpeg".
img_names.sort(key=lambda file_name: int(file_name.partition(".")[0]))

In [5]:
img_names[0:4]

['1.jpeg', '2.jpeg', '3.jpeg', '4.jpeg']

In [6]:
path_labels="/Users/gunnvantsaini/OneDrive/project_codes/content/dl_basics/sony/data/guns-object-detection/Labels"
# Skip hidden entries (e.g. macOS ".DS_Store"): the numeric sort applied to
# label_names afterwards calls int() on each file stem and would raise on them.
label_names=[name for name in os.listdir(path_labels) if not name.startswith(".")]

In [7]:
# Order the label names in place by the integer before the first ".",
# mirroring the ordering applied to the image names.
label_names.sort(key=lambda file_name: int(file_name.partition(".")[0]))

In [8]:
label_names[0:4]

['1.txt', '2.txt', '3.txt', '4.txt']

In [9]:
# Dataset root directory; the "Images" and "Labels" folders listed above live directly under it.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this directory exists.
base_path="/Users/gunnvantsaini/OneDrive/project_codes/content/dl_basics/sony/data/guns-object-detection"

In [10]:
# Image preprocessing used by GunsData.__getitem__: a single ToTensor step
# (no resizing, normalisation or augmentation is configured here).
pipeline=transforms.Compose([transforms.ToTensor()])

In [11]:
## https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
class GunsData(Dataset):
    """Map-style dataset pairing gun images with their bounding-box label files.

    Item ``idx`` is built from ``img_names[idx]`` and ``label_names[idx]``, so the
    two lists must be ordered identically (matching files at matching indices).

    Each label file is read as text. Its first line is skipped (presumably a box
    count -- confirm against the data); every following non-blank line supplies
    one box as four whitespace-separated numbers ``x0 y0 x1 y1``.

    Args:
        img_names: Image file names inside ``img_folder``. The text before the
            first "." must parse as an integer; it becomes ``image_id``.
        label_names: Label file names inside ``label_folder``, aligned with
            ``img_names`` index by index.
        base_path: Directory that contains ``img_folder`` and ``label_folder``.
        img_folder: Name of the image sub-folder.
        label_folder: Name of the label sub-folder.
        transform: Optional callable applied to the RGB PIL image. When ``None``
            (the default) the notebook-level ``pipeline`` is used, which is the
            behaviour this class had before the parameter existed.
    """

    def __init__(self,img_names,label_names,base_path,img_folder,label_folder,transform=None):
        self.img_names=img_names
        self.label_names=label_names
        self.base_path=base_path
        self.img_folder=img_folder
        self.label_folder=label_folder
        self.transform=transform

    def __len__(self):
        """Return the number of images (one sample per image name)."""
        return len(self.img_names)

    @staticmethod
    def _parse_boxes(label_lines):
        """Turn the lines of one label file into a float32 tensor of shape (N, 4).

        The first line is skipped and blank lines are ignored. A file without
        any box yields an empty ``(0, 4)`` tensor rather than a 1-D one, so
        column indexing such as ``boxes[:, 3]`` keeps working.
        """
        boxes=[]
        for line in label_lines[1:]:
            fields=line.split()
            if not fields:
                # Tolerate blank/trailing lines instead of failing in float().
                continue
            boxes.append([float(fields[i]) for i in range(4)])
        return torch.tensor(boxes,dtype=torch.float32).reshape(-1,4)

    def __getitem__(self,idx):
        """Load sample ``idx`` and return ``(img, target)``.

        ``img`` is the transformed RGB image. ``target`` is a dict with keys
        ``boxes`` (float32, N x 4), ``labels`` (int64 ones, N), ``area``
        (float32, N), ``image_id`` (int64 scalar) and ``iscrowd`` (int8 zeros, N).
        """
        # Import the submodule explicitly: a bare `import PIL` does not by
        # itself guarantee that `PIL.Image` has been loaded.
        from PIL import Image

        img_name=self.img_names[idx]
        img_path=os.path.join(self.base_path,self.img_folder,img_name)
        with Image.open(img_path) as raw_img:
            # convert() loads the pixel data and returns a new image, so the
            # file can be closed as soon as this block exits.
            img=raw_img.convert("RGB")
        to_tensor=pipeline if self.transform is None else self.transform
        img=to_tensor(img)

        label_path=os.path.join(self.base_path,self.label_folder,self.label_names[idx])
        with open(label_path) as label_file:
            label_lines=label_file.readlines()

        boxes=self._parse_boxes(label_lines)
        num_boxes=boxes.shape[0]
        # (y1 - y0) * (x1 - x0); already float32 because boxes is float32.
        area=(boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target={}
        target['boxes']=boxes
        # Every annotated box belongs to the single foreground class, label 1.
        target['labels']=torch.ones(num_boxes,dtype=torch.int64)
        target['area']=area
        target['image_id']=torch.tensor(int(img_name.split(".")[0]),dtype=torch.int64)
        target['iscrowd']=torch.zeros(num_boxes,dtype=torch.int8)
        return img,target

In [12]:
# Build the dataset over the sorted image/label name lists; "Images" and "Labels"
# are the sub-folders of base_path that those names were listed from.
guns=GunsData(img_names,label_names,base_path,"Images","Labels")

In [13]:
# Sanity check: fetch the first sample (image tensor and its target dict).
im,lab=guns[0]



In [15]:
im.shape

torch.Size([3, 145, 347])

In [16]:
lab

{'boxes': tensor([[ 76.,  45., 146.,  87.]]),
 'labels': tensor([1]),
 'area': tensor([2940.]),
 'image_id': tensor(1),
 'iscrowd': tensor([0], dtype=torch.int8)}

In [17]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [18]:
# Faster R-CNN with a ResNet-50 FPN backbone, created with pretrained weights.
# NOTE(review): newer torchvision releases replace `pretrained=` with `weights=` -- confirm
# against the installed version before changing this call.
model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

In [19]:
# Output size of the replacement box predictor. GunsData labels every box as class 1;
# presumably index 0 is reserved for background -- confirm against the torchvision docs.
num_classes = 2 

In [20]:
# Input width of the existing classification layer, reused to size the new predictor head.
in_features = model.roi_heads.box_predictor.cls_score.in_features

In [21]:
# Replace the box predictor with a freshly constructed one sized for num_classes outputs.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [22]:
def collate_fn(batch):
    """Regroup a batch of samples into one tuple per field.

    ``batch`` is a sequence of equally structured samples, e.g. ``(image, target)``
    pairs. The result holds one tuple per field -- ``((img0, img1, ...),
    (tgt0, tgt1, ...))`` -- and nothing is stacked into a single tensor.
    An empty batch gives an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [23]:
# Despite its name this is a DataLoader: it yields batches of 2 samples, regrouped by
# collate_fn into (images, targets) tuples. No shuffle argument is passed.
dataset=DataLoader(guns,batch_size=2,collate_fn=collate_fn)

In [24]:
# Pull the first batch from the loader: a tuple of images and a tuple of target dicts.
images,targets = next(iter(dataset))



In [25]:
# Turn the batch tuples into lists; each target dict is shallow-copied.
images = list(images)
targets = [dict(target) for target in targets]

In [29]:
# model.eval() has not been called yet at this point, so passing targets makes the
# forward call return the dict of losses shown in the output below.
output = model(images,targets)

In [30]:
output

{'loss_classifier': tensor(0.5880, grad_fn=<NllLossBackward>),
 'loss_box_reg': tensor(0.1038, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.0355, grad_fn=<BinaryCrossEntropyWithLogitsBackward>),
 'loss_rpn_box_reg': tensor(0.0135, grad_fn=<DivBackward0>)}

In [31]:
# Switch to evaluation mode; the later model(images) call then returns predictions instead of losses.
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

In [33]:
# Inference only: disable autograd so no graph is recorded for the predictions,
# which saves memory and returns tensors without grad history.
with torch.no_grad():
    op=model(images)

In [34]:
op

[{'boxes': tensor([[1.6639e+01, 6.6881e+01, 3.1453e+01, 8.9683e+01],
          [1.0813e+02, 0.0000e+00, 2.2660e+02, 1.3818e+02],
          [8.1177e+01, 4.5300e+01, 1.1537e+02, 6.9265e+01],
          [2.2537e+02, 4.0688e+01, 2.4909e+02, 5.8959e+01],
          [0.0000e+00, 1.2541e+02, 4.8426e+01, 1.4243e+02],
          [1.6041e+02, 2.4790e+01, 2.5426e+02, 1.4497e+02],
          [2.7189e+02, 6.4824e+01, 2.7801e+02, 7.7154e+01],
          [2.2542e+02, 4.0598e+01, 2.5874e+02, 6.6811e+01],
          [7.0784e+01, 3.2557e+00, 1.8182e+02, 1.3194e+02],
          [2.0487e+02, 7.9900e+01, 2.3240e+02, 1.4500e+02],
          [5.8428e+01, 6.2614e+01, 1.1998e+02, 1.3725e+02],
          [3.2718e+01, 1.3282e+02, 1.1844e+02, 1.4400e+02],
          [5.5095e+00, 1.3074e+02, 3.9444e+01, 1.4320e+02],
          [1.7285e+00, 1.2859e+02, 3.7661e+01, 1.3796e+02],
          [2.6715e+02, 1.2598e+01, 2.9280e+02, 4.0571e+01],
          [2.3930e+02, 5.8096e+01, 2.4435e+02, 6.8577e+01],
          [3.8433e+01, 1.0037e+