In [None]:
import torch
import torchvision

from torchvision.transforms import v2 as T
from torchvision import datapoints

from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader
import PIL
from torchvision.transforms.v2 import functional as F
import random




In [None]:
# Pick the compute device once; the rest of the notebook reuses this global
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset root used
# later in the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class mytransform():
    """Joint image / bounding-box transform for a VOC-style detection dataset.

    An instance is meant to be passed as the ``transforms`` callable of
    ``VOCDetection``: it receives the image and the parsed annotation
    dictionary and returns ``(image, target)`` in the layout torchvision
    detection models consume (``boxes`` and ``labels`` tensors).
    """

    def __init__(self):
        # Resize to 640x640, then flip horizontally and vertically, each
        # independently with a 25% chance. The v2 transforms update the
        # bounding boxes together with the image.
        self.transform = T.Compose([T.Resize((640, 640)),
                                    T.RandomHorizontalFlip(p=0.25),
                                    T.RandomVerticalFlip(p=0.25)])

    def __call__(self, img, target):
        """Transform one sample.

        Args:
            img: Image to transform (anything ``datapoints.Image`` accepts).
            target: Annotation dictionary with ``target['annotation']['object']``
                (a list of objects, each holding a ``bndbox``) and
                ``target['annotation']['filename']``.

        Returns:
            Tuple ``(img, d)`` where ``img`` is the transformed float32 image on
            ``device`` and ``d`` has the keys ``image_id`` (str), ``boxes``
            (float32, one XYXY row per annotated object) and ``labels``
            (int64, all ones: a single foreground class).
        """
        # One [xmin, ymin, xmax, ymax] row per annotated object, so images with
        # several objects keep all of their ground-truth boxes.
        boxes = [
            [int(obj['bndbox'][key]) for key in ('xmin', 'ymin', 'xmax', 'ymax')]
            for obj in target['annotation']['object']
        ]

        # Convert image to a float32 tensor.
        # NOTE(review): pixel values are not rescaled here (they stay in the
        # 0-255 range); torchvision detection models document 0-1 inputs.
        # Confirm before changing - the plotting cell casts back to uint8.
        img = datapoints.Image(img, dtype=torch.float32)

        bounding_box = datapoints.BoundingBox(boxes,
                                              format=datapoints.BoundingBoxFormat.XYXY,
                                              spatial_size=F.get_spatial_size(img))

        # Move image and boxes to the notebook-level device (CPU or GPU)
        img, bounding_box = img.to(device), bounding_box.to(device)

        # Apply resize + random flips to image and boxes together
        img, bounding_box = self.transform(img, bounding_box)

        d = {}
        # Drop the first 4 and the last 4 characters of the file name
        # (presumably a fixed prefix and the extension, e.g. 'Cars222.png' -> '222';
        # verify against the dataset's naming scheme).
        d['image_id'] = target['annotation']['filename'][4:-4]
        d['boxes'] = bounding_box.type(torch.float32)
        # Every annotated object belongs to the single foreground class (label 1)
        d['labels'] = torch.ones(len(boxes), dtype=torch.int64, device=device)

        return img, d

# Custom collate function to prepare a batch for the DataLoader
def collate(batch):
    """Turn a list of ``(image, target)`` samples into two parallel lists.

    Detection samples differ in their number of boxes, so they are kept as
    Python lists instead of being stacked into a single tensor.

    Returns:
        Tuple ``(images, targets)``, each a list with one entry per sample.
    """
    images, targets = zip(*batch)
    return list(images), list(targets)

# Instantiate the custom transform; it is handed to VOCDetection as its
# `transforms` callable when the datasets are built.
transform = mytransform()

In [None]:
# Root folder of the VOC-formatted dataset on the mounted Google Drive
DATA_ROOT = '/content/drive/MyDrive/Vision and learning/Dataset'

# Evaluation should be deterministic: reuse mytransform's box/label handling
# but replace its pipeline with a resize-only one (no random flips).
eval_transform = mytransform()
eval_transform.transform = T.Resize((640, 640))

# Training split, randomly augmented by `transform`
voc_dataset_train = VOCDetection(root=DATA_ROOT,
                                 year='2012',
                                 image_set='train',
                                 download=False,     # the data must already be under DATA_ROOT
                                 transforms=transform)

# Validation split, used as the test set in this notebook
voc_dataset_test = VOCDetection(root=DATA_ROOT,
                                year='2012',
                                image_set='val',
                                download=False,     # the data must already be under DATA_ROOT
                                transforms=eval_transform)


# `collate` keeps images/targets as lists because samples have varying box counts
train_loader = DataLoader(voc_dataset_train, batch_size=2, shuffle=True, collate_fn=collate)

test_loader = DataLoader(voc_dataset_test, batch_size=4, shuffle=False, collate_fn=collate)

In [65]:
# Pyramid (FPN) architecture? -- unused draft; everything in this cell is commented out
#import torchvision
#from torchvision.models.detection import FasterRCNN
#from torchvision.models.detection.rpn import AnchorGenerator
#from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
#
#def get_model_object_detection(num_classes):
#    # Load a pre-trained ResNet-50 model without pretrained weights
#    backbone = torchvision.models.resnet50(pretrained=False)
#    backbone.out_channels = 256  # Adjust the out_channels to match FPN
#
#    # Define the FPN module
#    fpn = torchvision.ops.FeaturePyramidNetwork(
#        in_channels_list=[256, 512, 1024, 2048],
#        out_channels=256,
#        extra_blocks=None
#    )
#
#    # Create the anchor generator for the RPN
#    rpn_anchor_generator = AnchorGenerator(
#        sizes=((32, 64, 128, 256, 512),),
#        aspect_ratios=((0.5, 1.0, 2.0),) * 5
#    )
#
#    # Create the Faster R-CNN model with FPN integration
#    model = FasterRCNN(
#        backbone,
#        num_classes=num_classes,  # Number of classes (license plates + background)
#        rpn_anchor_generator=rpn_anchor_generator,
#        box_predictor=FastRCNNPredictor(256, num_classes)
#    )
#
#    return model
#


In [64]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
def get_model_object_detection(num_classes=2, weights=None):
    """Build a Faster R-CNN (ResNet-50 FPN v2) detector with a fresh box head.

    Args:
        num_classes: Number of output classes including background. Defaults
            to 2 (license plate + background).
        weights: Forwarded to ``fasterrcnn_resnet50_fpn_v2``. With the default
            ``None`` no pretrained detection weights are requested.

    Returns:
        The detection model; the caller is responsible for moving it to a device.
    """
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights)
    # Size the new head from the input width of the existing classification layer
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # Replace the default box predictor with one that outputs `num_classes` classes
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

In [None]:
# Build the detector and place it on the selected device
model = get_model_object_detection().to(device)

# SGD over the trainable parameters only
params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Epoch-level schedule: multiply the learning rate by 0.1 every 3 scheduler steps
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)


In [None]:
import math
import sys
from tqdm.auto import tqdm

def train_one_epoch(model, optimizer, data_loader, device, epoch, scaler=None):
    """Train `model` for one pass over `data_loader`.

    Args:
        model: Model that returns a dict of loss tensors when called as
            ``model(images, targets)`` in training mode.
        optimizer: Optimizer updating the model parameters.
        data_loader: Iterable yielding ``(images, targets)`` batches, where
            ``images`` is a list of tensors and ``targets`` a list of dicts.
        device: Device every batch is moved to before the forward pass.
        epoch: Epoch index; during epoch 0 a linear learning-rate warm-up is
            stepped after every batch.
        scaler: Optional gradient scaler (e.g. ``torch.cuda.amp.GradScaler``).
            When given, the forward pass runs under autocast and the scaler
            drives the backward pass and the optimizer step.

    Exits the process via ``sys.exit(1)`` if the loss becomes non-finite.
    """
    model.train()

    # Per-iteration warm-up, only for the first epoch. Kept under its own name
    # so it is not confused with the epoch-level scheduler stepped by the caller.
    warmup_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )

    for images, targets in tqdm(data_loader, desc=f"Epoch {epoch}"):
        # Honour the `device` argument; a no-op for data that is already there.
        images = [image.to(device) for image in images]
        targets = [
            {key: value.to(device) if isinstance(value, torch.Tensor) else value
             for key, value in target.items()}
            for target in targets
        ]

        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        loss_value = losses.item()

        if not math.isfinite(loss_value):
            print(f"Loss is {loss_value}, stopping training")
            sys.exit(1)

        optimizer.zero_grad()

        if scaler is not None:
            # Mixed precision: scale the loss before backward and let the
            # scaler unscale/skip the step and adapt its scale factor.
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()

        if warmup_scheduler is not None:
            warmup_scheduler.step()

In [None]:
@torch.inference_mode()
def evaluate(model, data_loader, device):
    """Run `model` in eval mode over `data_loader` and return its predictions.

    Args:
        model: Model that, in eval mode, maps a list of images to a sequence
            of per-image prediction dicts.
        data_loader: Iterable yielding ``(images, targets)`` batches; the
            targets are not used.
        device: Device the images are moved to before inference.

    Returns:
        List with one prediction dict per image, in loader order. Tensor
        values are moved to the CPU so the results do not pin device memory.
    """
    model.eval()
    predictions = []
    for images, _targets in data_loader:
        images = [image.to(device) for image in images]
        for output in model(images):
            predictions.append({
                key: value.cpu() if isinstance(value, torch.Tensor) else value
                for key, value in output.items()
            })
    return predictions

In [None]:
# 5 epochs --> 16 min
num_epochs = 5

for epoch in range(num_epochs):
    # train for one epoch (progress is shown by the tqdm bar inside train_one_epoch)
    train_one_epoch(model, optimizer, train_loader, device, epoch)
    # step the epoch-level StepLR scheduler created together with the optimizer
    lr_scheduler.step()
    # evaluation is disabled; NOTE(review): `data_loader_test` is not defined in
    # the visible cells - the validation loader is called `test_loader`
    #evaluate(model, data_loader_test, device=device)

print("That's it!")

Epoch 0:   0%|          | 0/173 [00:00<?, ?it/s]



Epoch 1:   0%|          | 0/173 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/173 [00:00<?, ?it/s]

Epoch 3:   0%|          | 0/173 [00:00<?, ?it/s]

Epoch 4:   0%|          | 0/173 [00:00<?, ?it/s]

That's it!


In [None]:
@torch.inference_mode()
def qualitative_results(model, data_loader, score_threshold=0.5):
    """Collect, per image, the predicted boxes whose score exceeds a threshold.

    Args:
        model: Detection model; in eval mode it must return one dict with
            ``boxes`` and ``scores`` per input image.
        data_loader: Iterable yielding ``(images, targets)`` batches where each
            target dict carries an ``image_id``.
        score_threshold: Predictions with a score not strictly above this
            value are dropped. Defaults to 0.5.

    Returns:
        Dict mapping ``image_id`` to ``[image, boxes]``; both tensors are moved
        to the CPU so the collected results do not stay in device memory.
    """
    results = {}
    model.eval()

    # Fixed label: the progress bar must not depend on a leftover global such
    # as the training loop's `epoch`.
    for images, targets in tqdm(data_loader, desc="Qualitative results"):
        predictions = model(images)

        for image, target, pred in zip(images, targets, predictions):
            keep = pred['scores'] > score_threshold
            results[target['image_id']] = [image.cpu(), pred['boxes'][keep].cpu()]

    return results

In [None]:
from torchvision.utils import draw_bounding_boxes
import matplotlib.pyplot as plt

def plot_qualitatitve_results(results, start = 0, end = 10):
    """Show stored images with all of their predicted boxes drawn in red.

    One figure is produced per image (with every kept box on it); images
    without any kept box are shown undecorated instead of being skipped.

    Args:
        results: Mapping ``image_id -> [image, boxes]`` as produced by
            ``qualitative_results``. The image is cast to uint8 as-is, so it is
            assumed to hold 0-255 values in (C, H, W) layout - confirm if the
            preprocessing changes.
        start: Index of the first entry (in insertion order) to plot.
        end: Index one past the last entry to plot.
    """
    for img_id in list(results.keys())[start:end]:
        img, bounding_boxes = results[img_id]
        # draw_bounding_boxes expects a uint8 image; keep everything on the CPU
        # because the result is handed to matplotlib as a NumPy array.
        img = img.type(torch.uint8).cpu()
        bounding_boxes = bounding_boxes.cpu()

        if len(bounding_boxes) > 0:
            img = draw_bounding_boxes(img, bounding_boxes, colors='red', width=3)

        fig, ax = plt.subplots()
        ax.imshow(img.permute(1, 2, 0).numpy())
        ax.set(title=f"Image {img_id}", xticklabels=[], yticklabels=[], xticks=[], yticks=[])
        fig.tight_layout()

        plt.show()
        # Release the figure so plotting many images does not accumulate memory
        plt.close(fig)

In [None]:
# Run the model on the validation loader and keep, per image id, the image and
# the predicted boxes that pass the score threshold
results = qualitative_results(model, test_loader)

Epoch 4:   0%|          | 0/22 [00:00<?, ?it/s]



In [None]:
# Image ids for which results were stored
results.keys()

dict_keys(['35', '157', '259', '228', '39', '78', '30', '260', '348', '145', '357', '169', '272', '345', '207', '241', '197', '232', '144', '331', '131', '10', '397', '335', '313', '237', '95', '318', '258', '242', '120', '210', '205', '108', '54', '50', '413', '25', '189', '152', '24', '129', '7', '409', '191', '65', '389', '115', '80', '274', '421', '61', '349', '156', '373', '360', '243', '353', '412', '238', '315', '82', '292', '147', '109', '417', '15', '73', '338', '17', '281', '92', '300', '12', '388', '222', '307', '139', '400', '200', '87', '289', '44', '163', '134', '216', '167'])

In [None]:
# Inspect a single entry ([image, kept boxes]); the key must be one of results.keys()
results["222"]

[Image([[[224.0000, 222.2500, 220.0000,  ...,  82.1875,  80.8750,  80.0000],
         [223.9219, 222.1035, 219.7583,  ...,  82.2266,  80.8921,  80.0000],
         [223.2031, 220.7559, 217.5347,  ...,  82.5859,  81.0493,  80.0000],
         ...,
         [ 61.8047,  58.4824,  55.0986,  ...,  12.0298,  10.2271,   9.0000],
         [ 62.8828,  60.1895,  57.6592,  ...,  13.1304,  10.6987,   9.0000],
         [ 63.0000,  60.3750,  57.9375,  ...,  13.2500,  10.7500,   9.0000]],
 
        [[235.0000, 233.2500, 231.0000,  ..., 104.1250, 102.8750, 102.0000],
         [235.0391, 233.2207, 230.8755,  ..., 104.1641, 102.8921, 102.0000],
         [235.3984, 232.9512, 229.7300,  ..., 104.5234, 103.0493, 102.0000],
         ...,
         [ 61.8047,  58.4824,  55.0986,  ...,  21.0298,  19.2271,  18.0000],
         [ 62.8828,  60.1895,  57.6592,  ...,  22.1304,  19.6987,  18.0000],
         [ 63.0000,  60.3750,  57.9375,  ...,  22.2500,  19.7500,  18.0000]],
 
        [[237.0000, 235.2500, 233.0000,  .

In [None]:
# Plot the first 10 stored results
plot_qualitatitve_results(results, 0, 10)

UnboundLocalError: ignored