# Application of Artificial Intelligence Q4

In [1]:
import pandas as pd
import os

#### Set up the dataset and annotation directories

In [None]:
# Resolve every directory first: the dataset lives in <cwd>/dataset2, with
# images and annotations in sibling sub-folders.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'dataset2')
image_dir = os.path.join(data_dir, 'images')
annotations_dir = os.path.join(data_dir, 'annotations')

# Report the resolved locations in the same order they were derived.
for caption, location in (
    ("Base directory: ", base_dir),
    ("Data directory: ", data_dir),
    ("Image directory: ", image_dir),
    ("Annotations directory: ", annotations_dir),
):
    print(caption, location)

#### Create the Test and Train set (100-28)

In [3]:
# Split dataset into train (100) and test (28).
# os.listdir() returns names in arbitrary, platform-dependent order, so the
# listing is sorted first; without this the train/test membership can change
# between machines or runs and results are not reproducible.
all_images = sorted(os.listdir(image_dir))
train_images = all_images[:100]
test_images = all_images[100:]

## Annotation files are read later, when each train/test image is loaded, so every image is paired with its own annotation at that point.

### Load and Transform the Dataset

In [4]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F
from torchvision import transforms as T
from PIL import Image
import xml.etree.ElementTree as ET
import json

#### Parse the VOC-style XML annotation files and bring them in line with COCO-style annotations

In [5]:
# Class name (as written in the XML <name> tag) -> integer label.
# Label 0 is left free for the detector's background class.
_VOC_NAME_TO_LABEL = {'obj1': 1, 'obj2': 2, 'obj3': 3, 'obj4': 4}


def parse_voc_annotation(annotation_path):
    """Parse one VOC-style XML annotation file into boxes and labels.

    Args:
        annotation_path: Path (or open file object) accepted by ``ET.parse``.

    Returns:
        dict with two parallel lists, in document order:
        ``'boxes'``  - ``[xmin, ymin, xmax, ymax]`` integer lists,
        ``'labels'`` - integer class labels (1-4).

    Raises:
        ValueError: if an ``<object>`` carries a class name other than
            ``obj1`` .. ``obj4``.
    """
    root = ET.parse(annotation_path).getroot()

    boxes = []
    labels = []
    for obj in root.findall('object'):
        name = obj.find('name').text
        # Reject unknown class names explicitly. With a plain if/elif chain an
        # unknown name either crashes with UnboundLocalError (first object) or,
        # worse, silently reuses the label of the previous object.
        if name not in _VOC_NAME_TO_LABEL:
            raise ValueError(
                f"Unknown object class {name!r} in annotation {annotation_path!r}; "
                f"expected one of {sorted(_VOC_NAME_TO_LABEL)}"
            )

        bndbox = obj.find('bndbox')
        box = [int(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]

        boxes.append(box)
        labels.append(_VOC_NAME_TO_LABEL[name])

    return {'boxes': boxes, 'labels': labels}

# Example usage: parse a single annotation file and print the resulting dict
# as a quick check of the parser.
example_path = os.path.join(annotations_dir, '33.xml')  # 33.xml has 3 objects
parsed_annotation = parse_voc_annotation(example_path)
print(parsed_annotation)

{'boxes': [[72, 150, 172, 399], [192, 229, 308, 534], [324, 246, 415, 410]], 'labels': [4, 3, 1]}


In [6]:
class CustomDataset(torch.utils.data.Dataset):
    """Detection dataset pairing each image file with its XML annotation.

    Each item is ``(img, target)`` where ``target`` is a dict with the keys
    ``boxes`` (float32, shape ``(N, 4)`` as ``[xmin, ymin, xmax, ymax]``),
    ``labels`` (int64, shape ``(N,)``), ``image_id`` (the dataset index),
    ``area`` (box width * height) and ``iscrowd`` (all zeros).

    Args:
        image_dir: Directory containing the image files.
        annotation_dir: Directory containing one ``<stem>.xml`` per image.
        image_list: File names (relative to ``image_dir``) in this split.
        transforms: Optional callable applied to the image ONLY. The target
            is never transformed, so geometric transforms (flips, crops,
            resizes) passed here would leave the boxes out of sync with
            the pixels.
    """

    def __init__(self, image_dir, annotation_dir, image_list, transforms=None):
        self.image_dir = image_dir
        self.annotation_dir = annotation_dir
        self.image_list = image_list
        self.transforms = transforms

    def _annotation_path(self, idx):
        """Return the annotation path for item ``idx`` (same stem, ``.xml``)."""
        # splitext swaps only the real extension; str.replace('.jpg', '.xml')
        # would miss '.jpeg'/'.JPG'/'.png' and rewrite '.jpg' anywhere in the name.
        stem, _ext = os.path.splitext(self.image_list[idx])
        return os.path.join(self.annotation_dir, stem + '.xml')

    @staticmethod
    def _build_target(annot, image_id):
        """Convert a parsed annotation dict into the detection target dict."""
        # reshape(-1, 4) keeps the (N, 4) layout even when N == 0; without it an
        # image with no objects yields a 1-D tensor and boxes[:, 3] raises IndexError.
        boxes = torch.as_tensor(annot['boxes'], dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(annot['labels'], dtype=torch.int64)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

        return {
            'boxes': boxes,
            'labels': labels,
            'image_id': image_id,
            'area': area,
            'iscrowd': iscrowd
        }

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_list[idx])
        annotation_path = self._annotation_path(idx)

        img = Image.open(img_path).convert("RGB")
        annot = parse_voc_annotation(annotation_path)

        # The dataset index doubles as the image_id.
        target = self._build_target(annot, idx)

        # Apply transformations to the image only
        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        return len(self.image_list)

# Transformations
# CustomDataset applies `transform` to the image only and never touches the
# target boxes. A RandomHorizontalFlip here would mirror the pixels while the
# boxes keep their un-flipped coordinates, feeding wrong targets to the model,
# so only box-preserving transforms are used.
transform = T.Compose([
    T.ToTensor(),
])

#### Creating the datasets and Loaders

In [7]:
def collate_detection_batch(batch):
    """Regroup ``[(img, target), ...]`` into ``((img, ...), (target, ...))``.

    Images and targets are kept as tuples rather than stacked into one tensor.
    """
    return tuple(zip(*batch))


# Create datasets
train_dataset = CustomDataset(image_dir, annotations_dir, train_images, transform)
# Evaluation must be deterministic, so the test split gets only the tensor
# conversion and none of the random augmentation in the training transform.
test_dataset = CustomDataset(image_dir, annotations_dir, test_images, T.ToTensor())

# Data loaders
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=collate_detection_batch
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=2, shuffle=False, collate_fn=collate_detection_batch
)

#### Load the Pretrained Model
Load a pretrained Faster R-CNN model and modify it to fit our dataset.

In [None]:
def get_model_instance_segmentation(num_classes):
    """Return a pretrained Faster R-CNN whose box predictor outputs ``num_classes`` classes.

    Args:
        num_classes: Number of output classes for the new box predictor
            (callers in this notebook pass the object classes plus background).

    Returns:
        The torchvision detection model with its ``roi_heads.box_predictor``
        replaced by a fresh ``FastRCNNPredictor``.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    roi_heads = detector.roi_heads
    # Size the replacement head from the feature width the existing classifier
    # consumes, so it plugs straight into the pretrained network.
    roi_heads.box_predictor = FastRCNNPredictor(
        roi_heads.box_predictor.cls_score.in_features,
        num_classes,
    )
    return detector

# Four object classes (Water Bottle, Milk Bottle, Tetra Pack, Can) plus background.
num_classes = 5
model = get_model_instance_segmentation(num_classes)

In [9]:
## Supporting python files for torchvision and coco
from engine_detection import train_one_epoch, evaluate
import utils_detection as utils
from coco_utils_detection import get_coco_api_from_dataset
from coco_eval_detection import CocoEvaluator

#### Fine-tuning the Faster R-CNN model with our data

In [10]:
# Build a fresh detector: 4 object classes + background.
num_classes = 5
model = get_model_instance_segmentation(num_classes)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Hand only the trainable parameters to the optimizer.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

# Step schedule: multiply the learning rate by 0.1 every 3 scheduler steps
# (the scheduler is stepped once per epoch below).
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)

# Train, advance the schedule, then evaluate on the test loader each epoch.
num_epochs = 5

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, test_loader, device=device)

Epoch: [0]  [ 0/50]  eta: 0:43:50  lr: 0.000107  loss: 2.1990 (2.1990)  loss_classifier: 1.9727 (1.9727)  loss_box_reg: 0.2160 (0.2160)  loss_objectness: 0.0018 (0.0018)  loss_rpn_box_reg: 0.0084 (0.0084)  time: 52.6128  data: 0.2190
Epoch: [0]  [10/50]  eta: 0:34:36  lr: 0.001126  loss: 1.1907 (1.3750)  loss_classifier: 0.9729 (1.0818)  loss_box_reg: 0.2703 (0.2658)  loss_objectness: 0.0171 (0.0177)  loss_rpn_box_reg: 0.0084 (0.0097)  time: 51.9122  data: 0.2629
Epoch: [0]  [20/50]  eta: 0:24:31  lr: 0.002146  loss: 0.7239 (1.0505)  loss_classifier: 0.4186 (0.7490)  loss_box_reg: 0.2703 (0.2711)  loss_objectness: 0.0099 (0.0214)  loss_rpn_box_reg: 0.0078 (0.0090)  time: 48.8638  data: 0.5069
Epoch: [0]  [30/50]  eta: 0:15:07  lr: 0.003165  loss: 0.5157 (0.8722)  loss_classifier: 0.2851 (0.5799)  loss_box_reg: 0.2339 (0.2614)  loss_objectness: 0.0118 (0.0214)  loss_rpn_box_reg: 0.0092 (0.0094)  time: 41.7812  data: 0.9619
Epoch: [0]  [40/50]  eta: 0:07:33  lr: 0.004184  loss: 0.4490 (0

KeyboardInterrupt: 