In [35]:
from torchvision.io.image import decode_image
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image

import os

An example Torchvision Object Detection Pipeline.
- Inference on DOTA dataset
- Finetuning training on DOTA dataset

In [36]:
# Step 0: Load and decode input image and label image
img_dir = r"C:\Users\andre\Desktop\mlcvprac\objdetectionpipeline\data\images"
label_dir = r"C:\Users\andre\Desktop\mlcvprac\objdetectionpipeline\data\labels"

for img_name in os.listdir(path=img_dir):
    img_path = os.path.join(img_dir, img_name)
    img_tensor = decode_image(img_path)
    print(f"image path: {img_path}")
    print(f"image tensor shape {img_tensor.shape}")
    break


print(img_tensor.shape)

# Step 1: Initialize model with the best weights
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn_v2(weights=weights)
model.eval()

# Step 2: Initialize and Apply image preprocessing transforms to match the pretrained image format
# weights.transforms() automatically does this
preprocess = weights.transforms()
batch_img_tensor = [preprocess(img_tensor)] #unsqueeze to add batch dimension
#torchvision detection models expect [[3, 5502, 3875]] list of tensors

# Step 3: Inference
prediction = model(batch_img_tensor)[0]
# convert prediction class indices to actual class words
labels = [weights.meta["categories"][i] for i in prediction["labels"]]
img_tensor_with_bbs = draw_bounding_boxes(image=img_tensor, 
                                     boxes=prediction["boxes"], 
                                     labels=labels, 
                                     colors="red",
                                     width=4)
image = to_pil_image(img_tensor_with_bbs.detach())
image.show()

image path: C:\Users\andre\Desktop\mlcvprac\objdetectionpipeline\data\images\P0000.png
image tensor shape torch.Size([3, 5502, 3875])
torch.Size([3, 5502, 3875])


In [47]:
# Custom torch Dataset subclass
import os
import torch

from torchvision.io import decode_image


class DOTADataset(torch.utils.data.Dataset):
    def __init__(self, directory, transforms):
        self.directory = directory
        self.transforms = transforms
        self.imgs = sorted(os.listdir(os.path.join(directory, "images")))
        self.labels = sorted(os.listdir(os.path.join(directory, "labels")))
        
        #Faster R-CNN expects integer label IDs instead of string class names
        self.class_map = {
            "plane": 1,
            "ship": 2,
            "storage-tank": 3,
            "baseball-diamond": 4,
            "tennis-court": 5,
            "basketball-court": 6,
            "ground track-field": 7,
            "harbor": 8,
            "bridge": 9,
            "large-vehicle": 10,
            "small-vehicle": 11,
            "helicopter": 12,
            "roundabout": 13,
            "soccer-ball-field": 14,
            "swimming-pool": 15
        }

    def __getitem__(self, idx):
        img_path = os.path.join(self.directory, "images", self.imgs[idx])
        label_path = os.path.join(self.directory, "labels", self.labels[idx])
        
        image = decode_image(img_path)
        target = {}
        boxes, labels = self.parse_dota_labels(label_path)
        target['boxes'] = torch.tensor(boxes)
        target['labels'] = torch.tensor(labels)

        if self.transforms:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self):
        return len(self.imgs)
    
    def parse_dota_labels(self, txt_path):
        boxes = []
        labels = []

        with open(txt_path, 'r') as label_file:
            print(f"label_file is: {label_file}")
            for i, line in enumerate(label_file):
                if i<2:
                    continue #skip the first two descriptor lines
                line_items = line.split()
                coordinates = list(map(float, line_items[:8])) #map every string line item to a float
                class_name = line_items[8]

                xs = coordinates[0::2] #starting at idx 0, every 2nd item is x coordinate
                ys = coordinates[1::2] #starting at idx 1, every 2nd item is y coordinate

                x_min = min(xs)
                x_max = max(xs)
                y_min = min(ys)
                y_max = max(ys)

                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(self.class_map[class_name])
        
        return boxes, labels

In [None]:
# WARNING: RUN THIS ONLY ONCE
# Download these helper scripts, we need them to train and evaluate detection models.
import urllib.request

urls = [
    "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py",
    "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py",
    "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py",
    "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py",
    "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py"
]

for url in urls:
    filename = url.split("/")[-1]
    print(f"Downloading {filename}...")
    urllib.request.urlretrieve(url, filename)


Downloading engine.py...
Downloading utils.py...
Downloading coco_utils.py...
Downloading coco_eval.py...
Downloading transforms.py...


In [None]:
# TRAINING AND VALIDATION
from engine import train_one_epoch, evaluate
import utils

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.transforms import v2


# Define Transformations and Augmentations
def get_transform(train):
    transforms = []
    if train:
        transforms.append(v2.RandomHorizontalFlip(0.5))
    transforms.append(v2.ToDtype(torch.float, scale=True))
    transforms.append(v2.ToPureTensor())
    return v2.Compose(transforms)


################################################################################
############### CREATE DATASET OBJECT INSTANCE AND LOAD THEM ###################
################################################################################
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_classes = 16

train_dataset = DOTADataset(r"C:\Users\andre\Desktop\DOTA", get_transform(train=True))
test_dataset = DOTADataset(r"C:\Users\andre\Desktop\DOTA", get_transform(train=False))

indices = torch.randperm(len(train_dataset)).tolist() # list of random indices length of dataset
train_dataset = torch.utils.data.Subset(train_dataset, indices[:-50])
test_dataset = torch.utils.data.Subset(test_dataset, indices[-50:])

trainloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=utils.collate_fn
)

testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=utils.collate_fn
)


##################################################################################
####################### FINETUNING A PRETRAINED MODEL ############################
##################################################################################
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn_v2(weights=weights) #instantiate a FasterRCNN class object with a ResNet50_FPN_v2 backbone
model.to(device)
# Replace the Classifier with a new one that has num_classes defined by dataset you downloaded
num_classes = 16 #15 DOTA Classes + background class
# roi_heads (region-of-interest heads) --> box_predictor (final classifier and box regressor) --> 
# cls_score (linear layer that predicts class scores) --> 
# in_features (number of input features/neurons this cls_score layer expects)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)


################################## TRAINING ########################################
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

num_epochs = 2

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, trainloader, device, epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, testloader, device=device)

label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0831.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0697.txt' mode='r' encoding='utf-8'>


  with torch.cuda.amp.autocast(enabled=scaler is not None):


Epoch: [0]  [  0/210]  eta: 1:12:32  lr: 0.000029  loss: 5.0708 (5.0708)  loss_classifier: 2.9185 (2.9185)  loss_box_reg: 0.0910 (0.0910)  loss_objectness: 1.9223 (1.9223)  loss_rpn_box_reg: 0.1390 (0.1390)  time: 20.7253  data: 0.1089
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0020.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0656.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0464.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0013.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0387.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\\andre\\Desktop\\DOTA\\labels\\P0098.txt' mode='r' encoding='utf-8'>
label_file is: <_io.TextIOWrapper name='C:\\Users\

KeyboardInterrupt: 