In [1]:
import os
import torch
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [2]:
# ---------------------------------------------------------------------------
# Global configuration. Every value has a long and a short alias because both
# spellings are used by the cells below.
# ---------------------------------------------------------------------------

# Detection-head layout
N = NO_OF_ANCHOR_BOX = 3
C = NO_OF_CLASS = 3
CLASS = 1
BOX = 3

# Network input resolution (square)
IMAGE_SIZE = IMG_SIZE = 416
H = HEIGHT = 416
W = WIDTH = 416

# Down-sampling strides and the grid sizes they produce
STRIDE1, STRIDE2, STRIDE3 = 32, 16, 8
SCALE = [32, 16, 8]
stride = 32

GRID1 = IMG_SIZE // 32
GRID2 = IMG_SIZE // 16
GRID3 = IMG_SIZE // 8
S = [13, 26, 52]  # Three output prediction scales of YOLOv3
grid = 13

# Compute device
device = DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Batch size.
# NOTE(review): the two aliases intentionally keep the values the original
# cell ended up with (batch_size == 16, BATCH_SIZE == 4) - confirm which one
# is meant and make them agree.
batch_size = 16
BATCH_SIZE = 4

# ANCHORS
# NOTE(review): no ANCHOR_BOXES constant is defined in this cell although later
# cells reference one - presumably it comes from `config`; confirm.

#################
CONF_THRESHOLD = 0.5
IOU_THRESHOLD = 0.5

#################


In [3]:
import torch

def convert_to_corners(bboxes):
    """Convert centre-format boxes to corner-format boxes.

    Parameters:
    - bboxes (torch.Tensor): 2-D tensor whose columns 0..3 are read as
      (cx, cy, w, h).

    Returns:
    - torch.Tensor: tensor with one row per input row holding
      (x1, y1, x2, y2) = (cx - w/2, cy - h/2, cx + w/2, cy + h/2).
    """
    centre_x, centre_y = bboxes[:, 0], bboxes[:, 1]
    half_w, half_h = bboxes[:, 2] / 2, bboxes[:, 3] / 2
    corners = (
        centre_x - half_w,
        centre_y - half_h,
        centre_x + half_w,
        centre_y + half_h,
    )
    return torch.stack(corners, dim=1)

def intersection_over_union(bb1, bb2):
    """Row-wise IoU between two sets of centre-format boxes.

    Parameters:
    - bb1 (torch.Tensor): boxes as rows of (cx, cy, w, h).
    - bb2 (torch.Tensor): boxes as rows of (cx, cy, w, h); row k is compared
      with row k of ``bb1``.

    Returns:
    - torch.Tensor: one IoU value per row. Pairs whose union area is not
      positive (e.g. two zero-sized boxes) yield 0 instead of NaN.
    """
    # Convert center-width-height format to top-left and bottom-right format
    bboxes1 = convert_to_corners(bb1)
    bboxes2 = convert_to_corners(bb2)

    # Corners of the intersection rectangle
    x_left = torch.max(bboxes1[:, 0], bboxes2[:, 0])
    y_top = torch.max(bboxes1[:, 1], bboxes2[:, 1])
    x_right = torch.min(bboxes1[:, 2], bboxes2[:, 2])
    y_bottom = torch.min(bboxes1[:, 3], bboxes2[:, 3])

    # Clamp at zero so disjoint boxes contribute no (negative) overlap
    intersection_width = torch.clamp(x_right - x_left, min=0)
    intersection_height = torch.clamp(y_bottom - y_top, min=0)
    intersection_area = intersection_width * intersection_height

    # Area of each bounding box
    bb1_area = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
    bb2_area = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])

    union_area = bb1_area + bb2_area - intersection_area

    # Guard the division: a degenerate pair has union == 0, which would give
    # 0 / 0 = NaN and poison any loss or NMS that consumes the result.
    has_area = union_area > 0
    safe_union = torch.where(has_area, union_area, torch.ones_like(union_area))
    ratio = intersection_area / safe_union
    iou = torch.where(has_area, ratio, torch.zeros_like(ratio))

    return iou


In [8]:
import torch
import torchvision

from torch.utils.data import Dataset
import torch.nn as nn
from torchvision.transforms import v2
from torchvision import tv_tensors

from config import *


class WIDERFaceDataseti(Dataset):
    """Thin wrapper around ``torchvision.datasets.WIDERFace``.

    Each item is returned as ``(image, bboxes)`` where the boxes are
    ``tv_tensors.BoundingBoxes`` in ``CXCYWH`` format. A ``labels`` tensor of
    ones is built and passed through the transforms but is not returned.
    """

    def __init__(self, split, transforms=None):
        """Create (and download if needed) the requested split.

        Parameters:
        - split: forwarded to ``torchvision.datasets.WIDERFace``.
        - transforms (callable, optional): applied to the sample dict
          ``{"image", "labels", "bboxes"}``; must return a mapping that still
          exposes ``"image"`` and ``"bboxes"``.
        """
        super().__init__()
        self.transforms = transforms
        self.dataset = torchvision.datasets.WIDERFace(
            root="./data/",
            split=split,
            download=True,
        )

    def __len__(self):
        """Number of images in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, bboxes)`` for the sample at ``idx``."""
        raw_image, annotation = self.dataset[idx]
        image = tv_tensors.Image(raw_image)

        # annotation["bbox"] columns are read as (x, y, w, h); the centre is
        # obtained by shifting (x, y) by half the box size.
        raw_boxes = annotation["bbox"]
        labels = torch.ones(len(raw_boxes))

        box_w, box_h = raw_boxes[:, 2], raw_boxes[:, 3]
        centre_x = raw_boxes[:, 0] + 0.5 * box_w
        centre_y = raw_boxes[:, 1] + 0.5 * box_h

        boxes = tv_tensors.BoundingBoxes(
            torch.stack((centre_x, centre_y, box_w, box_h), dim=1),
            format="CXCYWH",
            canvas_size=image.shape[-2:],
        )

        sample = {"image": image, "labels": labels, "bboxes": boxes}
        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample['image'], sample['bboxes']


class FinalTranform(torch.nn.Module):
    """Build YOLO-style target tensors from a transformed sample.

    For every scale index ``i`` a tensor of shape
    ``[S[i], S[i], N, 1 + 4 + C]`` is produced. The last axis holds:

    - ``0``   : objectness (1 = assigned anchor, -1 = ignored anchor, 0 = empty)
    - ``1:3`` : box centre offset inside its grid cell
    - ``3:5`` : ``log(box_size / anchor_size + 1e-6)`` in grid units
    - ``5:``  : one-hot class vector
    """

    def __init__(self):
        super().__init__()

    def forward(self, sample):
        """Return ``(image, targets)``; ``targets`` has one tensor per scale.

        Parameters:
        - sample (dict): must provide ``"image"``, ``"bboxes"`` (rows read as
          cx, cy, w, h in pixels) and ``"labels"`` (one class index per box).
        """
        image = sample["image"]
        bboxes = sample["bboxes"]
        labels = sample["labels"]

        epsilon = 1e-6  # keeps log() finite for zero-sized boxes
        targets = []

        # One target tensor per prediction scale.
        for i in range(len(S)):
            # NOTE(review): this list is shared by every box at this scale, so
            # after N boxes all anchors are excluded and match_anchor_box can
            # only fall back to its argmax over zeros - confirm that per-image
            # (rather than per-cell) exclusion is intended.
            to_exclude = []

            target = torch.zeros(S[i], S[i], N, 1 + 4 + C)

            for bbox, label in zip(bboxes, labels):
                cx, cy = bbox[0] / SCALE[i], bbox[1] / SCALE[i]  # grid units
                col, row = int(cx), int(cy)

                # A centre on/over the image border maps to a cell that does
                # not exist; skip the box instead of raising IndexError (or
                # silently wrapping around for negative indices).
                if not (0 <= col < S[i] and 0 <= row < S[i]):
                    continue

                bx, by = cx - col, cy - row
                box_width, box_height = bbox[2] / SCALE[i], bbox[3] / SCALE[i]

                assigned_anchor_box, ignore_indices = match_anchor_box(
                    box_width, box_height, i, to_exclude
                )

                if assigned_anchor_box is None:
                    continue

                anchor_box = ANCHOR_BOXES[i][assigned_anchor_box]

                # The first target index is the x cell, the second the y cell.
                cell = target[col, row, assigned_anchor_box]
                cell[0] = 1
                cell[1] = bx
                cell[2] = by
                cell[3] = torch.log(box_width / anchor_box[0] + epsilon)
                cell[4] = torch.log(box_height / anchor_box[1] + epsilon)
                cell[5 + int(label)] = 1

                to_exclude.append(assigned_anchor_box)

                # `ignore_indices` may arrive as a 0-d tensor (exactly one
                # match), which cannot be iterated directly; previously that
                # TypeError was swallowed by a bare `except` and the anchor was
                # never marked. Flatten first so every case is handled.
                for ignored in torch.as_tensor(ignore_indices).reshape(-1).tolist():
                    target[col, row, int(ignored), 0] = -1

            targets.append(target)

        return image, targets


def match_anchor_box(
    bbox_w,
    bbox_h,
    i,
    to_exclude=None,
):
    """
    Matches the bounding box to the closest anchor box of one scale.

    Box and anchors are compared by size only (as if they shared a corner).

    Parameters:
    - bbox_w (float): The width of the bounding box.
    - bbox_h (float): The height of the bounding box.
    - i (int): Index into ANCHOR_BOXES selecting the anchor set to use.
    - to_exclude (list, optional): Anchor indices to skip; their IoU is
      treated as 0. Defaults to no exclusions.

    Returns:
    - tuple: (best, ignore_indices) where `best` is the index of the anchor
      with the highest IoU and `ignore_indices` is a 1-D LongTensor (possibly
      empty) of other anchors whose IoU exceeds 0.5.
    """
    ignore = 0.5
    excluded = () if to_exclude is None else to_exclude
    anchor_boxes = ANCHOR_BOXES[i]

    iou = []
    for anchor_idx, box in enumerate(anchor_boxes):
        if anchor_idx in excluded:
            iou.append(0.0)
            continue
        intersection_width = min(box[0], bbox_w)
        intersection_height = min(box[1], bbox_h)
        intersection = intersection_width * intersection_height
        union = bbox_w * bbox_h + box[0] * box[1] - intersection
        iou.append(float(intersection / union))

    iou = torch.tensor(iou)
    # NOTE(review): when every anchor is excluded all IoUs are 0 and argmax
    # still returns an (excluded) index - confirm whether None was intended.
    best = torch.argmax(iou, dim=0).item()
    # I want to not assign anchor if the IOU is below this.

    # if iou[best] < 0.1:
    #     best = None

    # Ignore anchors if they have high IOU but are not the best match.
    # reshape(-1) (not squeeze()) keeps the result 1-D even for a single hit,
    # so callers can always iterate over it.
    ignore_indices = torch.nonzero((iou > ignore) & (iou != iou[best])).reshape(-1)

    return best, ignore_indices


def inverse_target(ground_truths, S=S, SCALE=SCALE, anchor_boxes=ANCHOR_BOXES):
    """
    Converts per-scale target tensors back to pixel-space boxes.

    The inputs are left untouched: decoding happens on a private copy.

    Parameters:
    - ground_truths (sequence of torch.Tensor): one target tensor per scale,
      laid out as [..., S[i], S[i], N, 5 + C] (an optional leading batch axis
      is supported).
    - S (list of int, optional): grid size of each scale.
    - SCALE (list of int, optional): pixel stride of each scale.
    - anchor_boxes (sequence of torch.Tensor, optional): anchors per scale,
      each broadcastable against the [..., N, 2] size channels.

    Returns:
    - tuple: (all_bboxes, all_labels), each a list with one tensor per scale.
      Boxes are (cx, cy, w, h) in pixels for every anchor whose objectness is
      exactly 1.
    """
    all_bboxes = []
    # Just for verifying all the targets are properly built: if they can be
    # reversed then good.
    all_labels = []

    for i, ground_truth in enumerate(ground_truths):  # one tensor per scale
        # copy=True: `.to()` returns the same tensor when it already lives on
        # `device`, and the in-place decoding below would then corrupt the
        # caller's targets.
        decoded = ground_truth.to(device=device, copy=True)
        cell_index = torch.arange(S[i], device=device)

        # Add the cell index back to the in-cell offsets. The first grid axis
        # carries x and the second carries y; the views broadcast over
        # [..., S, S, N].
        decoded[..., 1] += cell_index.view(-1, 1, 1)
        decoded[..., 2] += cell_index.view(-1, 1)
        decoded[..., 1:3] *= SCALE[i]

        # Undo the log-ratio size encoding, then convert grid units to pixels.
        decoded[..., 3:5] = (
            torch.exp(decoded[..., 3:5]) * anchor_boxes[i].to(device) * SCALE[i]
        )

        assigned = decoded[..., 0] == 1
        bbox = decoded[assigned][..., 1:5]
        # NOTE(review): this returns the value of class channel 5 (the first
        # one-hot slot), not the arg-max class index - confirm that is wanted.
        labels = decoded[assigned][..., 5]

        all_bboxes.append(bbox)
        all_labels.append(labels)

    return all_bboxes, all_labels


In [14]:
# Training-time augmentation pipeline applied to the sample dict produced by
# WIDERFaceDataseti. Steps run in the listed order; the geometric steps come
# first and the image is brought to 416x416 by RandomResizedCrop before the
# dtype conversion and normalisation.
# NOTE(review): FinalTranform is not part of this pipeline, so the dataset
# yields raw boxes rather than grid targets here - confirm that is intended
# for the anchor analysis below.
transformations = v2.Compose(
    [
        v2.RandomPhotometricDistort(p=0.3),
        v2.RandomHorizontalFlip(p=0.5),
        # Zoom-out padding is filled with mid-grey (128, 128, 128).
        v2.RandomZoomOut(
            p=0.2, side_range=(1.0, 1.3), fill={tv_tensors.Image: (128, 128, 128)}
        ),
        #     v2.RandomIoUCrop(min_scale = 0.9, max_scale = 1, max_aspect_ratio=1.25, min_aspect_ratio=0.75),
        # #     v2.Resize((416,416), antialias=True),
        v2.RandomPerspective(distortion_scale=0.2, p=0.1),
        v2.RandomRotation(degrees=20),
        v2.RandomResizedCrop(size=(416, 416), scale=(0.9, 1), antialias=True),
        v2.ToDtype(torch.float32, scale=True),  # Normalize expects float input
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        # NOTE(review): with sanitising disabled, boxes that become degenerate
        # after the geometric steps are presumably kept - confirm.
        # v2.SanitizeBoundingBoxes(),
    ]
)

In [15]:
# Training split with the augmentation pipeline attached (downloads on first use).
train_data = WIDERFaceDataseti(
    split="train",
    transforms=transformations,
)

2427it [00:00, 18080951.70it/s]



RuntimeError: The MD5 checksum of the download file ./data/widerface/WIDER_train.zip does not match the one on record.Please delete the file and try again. If the issue persists, please report this to torchvision at https://github.com/pytorch/vision/issues.

In [None]:
# Gather every ground-truth box of the training split into one flat list;
# each dataset item is (image, bboxes) and only the boxes are kept.
all_boxes = []
for sample in train_data:
    all_boxes.extend(sample[1])

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Draw every training box and collect the boxes as plain [cx, cy, w, h] float
# lists (the form the clustering cells below consume).
fig, ax = plt.subplots()

train_boxes = []
for entry in all_boxes:
    # `all_boxes` holds individual (4,) boxes; reshaping to (-1, 4) makes the
    # loop work for those as well as for (n, 4) batches. Iterating a single
    # box directly would yield 0-d scalars that cannot be unpacked.
    for box in torch.as_tensor(entry).reshape(-1, 4):
        cx, cy, w, h = box.tolist()
        # Rectangle is anchored at its corner, hence the half-size shift.
        rect = Rectangle((cx - w / 2, cy - h / 2), w, h, edgecolor='red', facecolor='none', alpha=0.2)
        ax.add_patch(rect)
        train_boxes.append([cx, cy, w, h])

ax.set_xlim(0, 416)
ax.set_ylim(0, 416)
ax.invert_yaxis()  # image coordinates: origin at the top-left
ax.set_title('Training boxes')

plt.show()


In [None]:
def calculate_iou(box1, box2, image_size=None):
    """IoU of two boxes given as (center_x, center_y, width, height).

    Parameters:
    - box1, box2: 4-element sequences (cx, cy, w, h) in the same units.
    - image_size: unused; kept (now optional) for backward compatibility with
      existing call sites.

    Returns:
    - float: intersection over union, or 0.0 when the boxes do not overlap or
      when the union has no area (e.g. two zero-sized boxes), which would
      otherwise divide by zero.
    """
    cx1, cy1, w1, h1 = box1
    cx2, cy2, w2, h2 = box2

    # Top-left / bottom-right corners of both boxes
    xtl1, ytl1 = cx1 - w1 / 2, cy1 - h1 / 2
    xbr1, ybr1 = cx1 + w1 / 2, cy1 + h1 / 2
    xtl2, ytl2 = cx2 - w2 / 2, cy2 - h2 / 2
    xbr2, ybr2 = cx2 + w2 / 2, cy2 + h2 / 2

    # Extent of the overlap along each axis (negative when disjoint)
    width_inter = min(xbr1, xbr2) - max(xtl1, xtl2)
    height_inter = min(ybr1, ybr2) - max(ytl1, ytl2)
    if width_inter < 0 or height_inter < 0:
        return 0.0

    area_inter = width_inter * height_inter
    box_1_area = (xbr1 - xtl1) * (ybr1 - ytl1)
    box_2_area = (xbr2 - xtl2) * (ybr2 - ytl2)
    area_union = box_1_area + box_2_area - area_inter

    if area_union <= 0:
        return 0.0

    return area_inter / area_union

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the collected boxes for scoring the clusterings below.
train_set, val_set = train_test_split(
    train_boxes,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
for split_name, split_boxes in (("train_set", train_set), ("val_set", val_set)):
    print(split_name, len(split_boxes))

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow analysis: for each k, fit K-means on the training boxes and score the
# centroids by the mean best-IoU they reach on the held-out boxes.
# NOTE(review): clustering runs on all four columns (cx, cy, w, h), so box
# position influences the centroids - confirm this is wanted for anchors.
k_values = range(1, 15)
iou_avg = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    kmeans.fit(train_set)
    centroids = kmeans.cluster_centers_

    # Best IoU any centroid achieves for each validation box.
    iou_values = [
        max(calculate_iou(centroid, val, (416, 416)) for centroid in centroids)
        for val in val_set
    ]
    iou_avg.append(np.mean(iou_values))

    # The previous version then overwrote `centroids` with validation-set
    # means. That never affected `iou_avg`, but it mutated the fitted model's
    # `cluster_centers_` in place and mixed validation data into the
    # centroids, so it has been removed.

plt.plot(k_values, iou_avg, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Mean IOU')
plt.title('Mean validation IoU vs. number of clusters')
plt.xticks(k_values)
plt.grid(True)
plt.show()

In [None]:
# Final clustering with 9 clusters, fitted on the training boxes only.
kmeans = KMeans(
    n_clusters=9,
    n_init='auto',
    random_state=42,
)
kmeans.fit(train_set)

In [None]:
# Show the centres of the 9-cluster model fitted above. `centroids` still
# refers to the last model of the elbow loop at this point, so read the
# centres from `kmeans` directly.
print(kmeans.cluster_centers_)

In [None]:
# Visualise the cluster centres of the fitted model as boxes.
centroids = kmeans.cluster_centers_

# A single figure: the duplicated plt.subplots() call left an empty extra one.
fig, ax = plt.subplots()

# Each centroid row is (cx, cy, w, h); Rectangle wants the corner.
for cx, cy, w, h in centroids:
    rect = Rectangle((cx - w / 2, cy - h / 2), w, h, edgecolor='red', facecolor='none', alpha=0.2)
    ax.add_patch(rect)

ax.set_xlim(0, 416)
ax.set_ylim(0, 416)
ax.invert_yaxis()  # same image-coordinate orientation as the training-box plot
ax.set_title('K-means centroids')

plt.show()

In [None]:
# Hand-picked centroid rows. Despite the name, columns 2 and 3 are used below
# as width and height in pixels of the 416x416 input.
# NOTE(review): only three rows are listed although nine clusters were fitted -
# confirm the selection.
xyxy = [[204.9835972,  221.60649912, 294.09578263, 312.73110721],
 [292.72444861, 218.55079433, 166.33437402, 175.1397013 ],
 [144.7808461,  178.77215698, 150.48080566, 164.62496094]]

# Anchor (width, height) pairs expressed in grid units of each output scale.
# `anchors_wh24` keeps its historical name but holds the 26x26-grid values.
anchors_wh13 = []
anchors_wh24 = []
anchors_wh52 = []

for xy in xyxy:
    # Normalise by the image side, then rescale to each grid resolution.
    width = xy[2]/416
    height = xy[3]/416
    anchors_wh13.append([width * 13, height * 13])
    anchors_wh24.append([width * 26, height * 26])
    # Was `* 8` (the stride of this scale) - the grid size is 52.
    anchors_wh52.append([width * 52, height * 52])

In [None]:
# Anchor sizes in 13x13-grid units, shown as a tensor.
print(torch.tensor(anchors_wh13))

In [None]:
# The 13x13-grid anchor sizes divided by 2.
# NOTE(review): presumably candidate anchors for a finer scale - confirm.
print(torch.tensor(anchors_wh13)/2)

In [None]:
# The 13x13-grid anchor sizes divided by 4.
# NOTE(review): presumably candidate anchors for the finest scale - confirm.
print(torch.tensor(anchors_wh13)/4)

In [None]:
# Anchor sizes scaled by 26 (the variable name says 24, the values are for the 26x26 grid).
print(anchors_wh24)

In [None]:
# Anchor sizes stored for the 52x52 output scale.
print(anchors_wh52)