In [None]:
!pip install albumentations # For TPU

import cv2 as cv
import albumentations as A
import os
import sys
import datetime
import io

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import (
    Conv2D,
    MaxPool2D,
    Dense,
    Flatten,
    Input,
    BatchNormalization,
    Layer,
    InputLayer,
    Dropout,
    Resizing,
    Rescaling,
    RandomFlip,
    RandomRotation,
    GlobalAveragePooling2D,
    Add,
    MultiHeadAttention,
    Embedding,
    LayerNormalization,
    LeakyReLU,
)
from tensorflow.keras.losses import (
    BinaryCrossentropy,
    CategoricalCrossentropy,
    SparseCategoricalCrossentropy,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import (
    CategoricalAccuracy,
    TopKCategoricalAccuracy,
)
from tensorflow.keras.callbacks import (
    Callback,
    CSVLogger,
    EarlyStopping,
    LearningRateScheduler,
    ModelCheckpoint,
    ReduceLROnPlateau,
)
from tensorflow.keras.regularizers import L2, L1
from tensorboard.plugins.hparams import api as hp

import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve
import shutil



In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; ROOT_DIR below lives inside it.
drive.mount('/content/drive')
# Root folder of the Pascal VOC 2012 data. The trailing slash matters: every
# path derived from it is built by plain string concatenation.
ROOT_DIR = "/content/drive/MyDrive/tfds_data/pascal_voc_2012/"
os.makedirs(ROOT_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install -q kaggle
# !mkdir ~/.kaggle/
# !cp kaggle.json ~/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle datasets download -d huanghanchina/pascal-voc-2012

In [None]:
# !unzip "/content/pascal-voc-2012.zip" -d "/content/drive/MyDrive/tfds_data/pascal_voc_2012/"
# !unzip "/content/pascal-voc-2012.zip" -d {ROOT_DIR}

# Preparing Data

In [None]:
# Image / annotation folders under ROOT_DIR (all end with a slash because file
# names are appended by string concatenation).
TRAIN_IMAGES = ROOT_DIR + "VOC2012/JPEGImages/"
TRAIN_MAPS = ROOT_DIR + "VOC2012/Annotations/"
VAL_IMAGES = ROOT_DIR + "VOC2012/ValJPEGImages/"
VAL_MAPS = ROOT_DIR + "VOC2012/ValAnnotations/"

# The Val* folders are created here; the (commented-out) cell below moves the
# files named in `val_list` into them.
os.makedirs(VAL_IMAGES, exist_ok=True)
os.makedirs(VAL_MAPS, exist_ok=True)

# Image file names held out for validation (64 entries).
val_list=['2007_000027.jpg','2007_000032.jpg','2007_000033.jpg','2007_000039.jpg','2007_000042.jpg','2007_000061.jpg',
          '2007_000063.jpg','2007_000068.jpg','2007_000121.jpg','2007_000123.jpg','2007_000129.jpg','2007_000170.jpg',
          '2007_000175.jpg','2007_000187.jpg','2007_000241.jpg','2007_000243.jpg','2007_000250.jpg','2007_000256.jpg',
          '2007_000272.jpg','2007_000323.jpg','2007_000332.jpg','2007_000333.jpg','2007_000346.jpg','2007_000363.jpg',
          '2007_000364.jpg','2007_000392.jpg','2007_000423.jpg','2007_000452.jpg','2007_000464.jpg','2007_000480.jpg',
          '2007_000491.jpg','2007_000504.jpg','2007_000515.jpg','2007_000528.jpg','2007_000529.jpg','2007_000549.jpg',
          '2007_000559.jpg','2007_000572.jpg','2007_000584.jpg','2007_000629.jpg','2007_000636.jpg','2007_000645.jpg',
          '2007_000648.jpg','2007_000661.jpg','2007_000663.jpg','2007_000664.jpg','2007_000676.jpg','2007_000713.jpg',
          '2007_000720.jpg','2007_000727.jpg','2007_000733.jpg','2007_000738.jpg','2007_000762.jpg','2007_000768.jpg',
          '2007_000783.jpg','2007_000793.jpg','2007_000799.jpg','2007_000804.jpg','2007_000807.jpg','2007_000822.jpg',
          '2007_001299.jpg','2007_001311.jpg','2007_001321.jpg','2007_001340.jpg']

In [None]:
# for name in val_list:
#   shutil.move(TRAIN_IMAGES + name, VAL_IMAGES + name)
# for name in val_list:
#   # Removing jpg and appending xml as the extension.
#   shutil.move(TRAIN_MAPS + name[:-3] + "xml", VAL_MAPS + name[:-3] + "xml")

In [None]:
# Object categories; a class id is the position of the name in this list.
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
           "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

# Number of bounding boxes predicted per grid cell
B=2
N_CLASSES = len(classes)
# Height and width (in pixels) every image is resized to
H, W = 224, 224
# Number of grid cells per image side (each cell spans 32 pixels)
SPLIT_SIZE = H//32
# NOTE(review): N_EPOCHS is not referenced by any visible cell — confirm it is
# passed to model.fit() when training is enabled.
N_EPOCHS = 135
LR = 5e-4
BATCH_SIZE = 32

import xml.etree.ElementTree as ET

def preprocess_xml(filename):
  """Parse one VOC-style XML annotation into normalized, YOLO-style boxes.

  Args:
    filename: path of the XML annotation (anything `ET.parse` accepts).

  Returns:
    A float32 tensor with one row per kept <object>:
      [0] x_center / img_width
      [1] y_center / img_height
      [2] width / img_width
      [3] height / img_height
      [4] class number (index into `classes`)
    The center is normalized with respect to the whole image; it is converted
    to cell-relative coordinates later, when the label grid is built.
    A tensor (not a list) is returned because this function is also called
    through tf.numpy_function.

  Raises:
    KeyError: if an object's <name> is not listed in `classes`.
  """
  root = ET.parse(filename).getroot()
  sizeE = root.find("size")
  img_height = float(sizeE.find("height").text)
  img_width = float(sizeE.find("width").text)
  class_dict = {name: idx for idx, name in enumerate(classes)}
  bounding_boxes = []

  for objectE in root.findall("object"):
    # Use the object's own (direct-child) <bndbox>. A recursive search returns
    # the first <bndbox> in document order, which may belong to a nested
    # sub-element instead of the object itself.
    bbxE = objectE.find("bndbox")
    if bbxE is None:
      # No box for this object: skip it rather than reuse the coordinates of
      # the previous object.
      continue

    xmin = float(bbxE.find("xmin").text)
    ymin = float(bbxE.find("ymin").text)
    xmax = float(bbxE.find("xmax").text)
    ymax = float(bbxE.find("ymax").text)

    class_name = objectE.find("name").text
    bounding_boxes.append([
        (xmin+xmax)/(2*img_width),
        (ymin+ymax)/(2*img_height),
        (xmax-xmin)/img_width,
        (ymax-ymin)/img_height,
        class_dict[class_name],
    ])

  return tf.convert_to_tensor(bounding_boxes, dtype=tf.float32) # For tf.numpy_function

In [None]:
# preprocess_xml(TRAIN_MAPS + "2007_000032.xml") # This image was moved.

In [None]:
# 31:41:00
def generate_output_v1(bounding_boxes, length=None):
  """Build the label grid for one image as a tf.Variable.

  Args:
    bounding_boxes: rows of [x_center, y_center, width, height, class number],
      all normalized with respect to the whole image.
    length: number of rows to use; defaults to len(bounding_boxes).

  Returns:
    tf.Variable of shape (SPLIT_SIZE, SPLIT_SIZE, N_CLASSES+5), indexed as
    [x cell, y cell, channel]. Channels of a cell that owns an object:
      [0]  objectness/score (1.)
      [1]  x_center relative to the cell
      [2]  y_center relative to the cell
      [3]  box width relative to the image width
      [4]  box height relative to the image height
      [5:] one-hot class vector
    Cells without an object stay all-zero. When several boxes fall into the
    same cell only the first one is kept.
  """
  if length is None:
    length = len(bounding_boxes)

  output_label = tf.Variable(lambda: tf.zeros([SPLIT_SIZE, SPLIT_SIZE, N_CLASSES+5], dtype=tf.float32))

  # Only the cells that contain a box center need to be written; every other
  # cell keeps its zero initialization.
  for b in range(length):
    box = bounding_boxes[b]
    # Multiplying by SPLIT_SIZE turns image-relative coordinates into
    # "number of cells" units.
    grid_x = box[0]*SPLIT_SIZE
    grid_y = box[1]*SPLIT_SIZE
    # Cell indices. Clamp to the last cell so that a center lying exactly on
    # the right/bottom border (normalized value 1.0) does not index out of range.
    i = min(int(grid_x), SPLIT_SIZE-1)
    j = min(int(grid_y), SPLIT_SIZE-1)

    # Only write if the cell has not been claimed by an earlier box.
    if(output_label[i, j, 0] == 0):
      # Offsets are measured from the (clamped) cell origin; for every
      # in-range center this equals the fractional part of grid_x / grid_y.
      output_label[i, j, 0:5].assign([1., grid_x-i, grid_y-j, box[2], box[3]])
      # Put 1. in the class position
      output_label[i, j, 5+int(box[4])].assign(1.)

  return output_label

In [None]:
# Smoke test on one validation annotation.
bbxs = preprocess_xml(VAL_MAPS+"2007_000733.xml")
# tf.config.run_functions_eagerly(False)
# NOTE: the shape computed on the next line is not displayed; only the last
# expression of the cell (`bbxs`) is echoed.
generate_output_v1(bbxs, len(bbxs)).shape
bbxs

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[ 0.35666665,  0.46896553,  0.5       ,  0.8229885 , 14.        ],
       [ 0.6122222 ,  0.73103446,  0.76666665,  0.537931  , 13.        ]],
      dtype=float32)>

In [None]:
# 31:41:00
# https://github.com/Neuralearn/deep-learning-with-tensorflow-2/blob/main/deep%20learning%20for%20computer%20vision/5-YOLO%20Object%20Detection%20from%20Scratch%20by%20Neuralearn.ai-.ipynb
def generate_output_v2(bounding_boxes):
  """NumPy version of the label-grid builder (used through tf.numpy_function).

  Args:
    bounding_boxes: un-batched array of shape (n_boxes, 5) whose rows are
      [x_center, y_center, width, height, class number], normalized with
      respect to the whole image. An empty input yields an all-zero grid.

  Returns:
    float32 tensor of shape (SPLIT_SIZE, SPLIT_SIZE, N_CLASSES+5), indexed as
    [x cell, y cell, channel] with channels
    [score, x offset in cell, y offset in cell, width, height, one-hot classes].
    When several boxes fall into the same cell only the first one is kept.
  """
  output_label = np.zeros([SPLIT_SIZE, SPLIT_SIZE, N_CLASSES+5], dtype=np.float32)

  # One row per box (the input is a single image's boxes, not a batch).
  for box in bounding_boxes:
    grid_x = box[0]*SPLIT_SIZE
    grid_y = box[1]*SPLIT_SIZE
    # Clamp to the last cell so a center exactly on the right/bottom border
    # (normalized value 1.0) does not raise an IndexError.
    i = min(int(grid_x), SPLIT_SIZE-1)
    j = min(int(grid_y), SPLIT_SIZE-1)

    if(output_label[i, j, 0] == 0):
      # Offsets are measured from the (clamped) cell origin; for every
      # in-range center this equals the fractional part of grid_x / grid_y.
      output_label[i, j, 0:5] = [1., grid_x-i, grid_y-j, box[2], box[3]]
      output_label[i, j, 5+int(box[4])] = 1.

  return tf.convert_to_tensor(output_label)

In [None]:
# NOTE: A.RandomCrop() and A.RandomScale() may result in a division by zero if
# the bbx is too close to the borders.
# Geometric augmentations applied jointly to the image and its boxes.
# Boxes are passed in "yolo" format: rows of
# [x_center, y_center, width, height, class] (see aug_albument).
transforms = A.Compose([
    A.Resize(H, W),
    # A.RandomCrop(
    #     width=np.random.randint(int(0.9*W),W),
    #     height=np.random.randint(int(0.9*H), H),
    #     p=0.5,
    #     always_apply=False,
    # ),
    # Always applied (p=1.0). Presumably chosen over RandomCrop because it
    # keeps the boxes inside the crop — confirm against the Albumentations docs.
    A.BBoxSafeRandomCrop(erosion_rate=0.2, p=1.0),
    # A.RandomScale(scale_limit=0.1, interpolation=cv.INTER_LANCZOS4, p=0.5,),
    A.HorizontalFlip(p=0.5),
    # The crop changes the image size, so resize back to the network input size.
    A.Resize(H, W),
], bbox_params=A.BboxParams(format="yolo"))

def aug_albument(img, bboxes):
  """Apply the `transforms` pipeline to one image and its boxes.

  Args:
    img: image array.
    bboxes: boxes as [[x_center, y_center, width, height, class], ...].

  Returns:
    [image, bboxes] as float32 tensors.
  """
  # Albumentations only accepts keyword arguments; positional data raises
  # KeyError: 'You have to pass data to augmentations as named arguments, ...'
  result = transforms(image=img, bboxes=bboxes)
  aug_img = tf.convert_to_tensor(result["image"], dtype=tf.float32)
  aug_boxes = tf.convert_to_tensor(result["bboxes"], dtype=tf.float32)
  return [aug_img, aug_boxes]

In [None]:
# Every image has an annotation with the same stem: drop the 3-letter
# extension and append "xml".
train_names = os.listdir(TRAIN_IMAGES)
train_im_paths = [TRAIN_IMAGES + name for name in train_names]
train_xml_paths = [TRAIN_MAPS + name[:-3] + "xml" for name in train_names]

val_names = os.listdir(VAL_IMAGES)
val_im_paths = [VAL_IMAGES + name for name in val_names]
val_xml_paths = [VAL_MAPS + name[:-3] + "xml" for name in val_names]

print(len(train_im_paths), len(train_xml_paths))
print(len(val_im_paths), len(val_xml_paths))

def get_imbboxes(im_path, xml_path):
  """Load one (image, boxes) training example from its two file paths.

  Args:
    im_path: path of the JPEG image.
    xml_path: path of the matching XML annotation.

  Returns:
    (img, bbxs): `img` is a float32 tensor of shape (H, W, 3) with the pixel
    values produced by the resize (not rescaled); `bbxs` is the float32 output
    of preprocess_xml.
  """
  img = tf.io.read_file(im_path)
  # Force 3 channels: without it a grayscale JPEG decodes to a single channel
  # and violates the (H, W, 3) shape the rest of the pipeline and the model expect.
  img = tf.io.decode_jpeg(img, channels=3)
  img = tf.image.resize(img, size=(H, W))
  img = tf.cast(img, dtype=tf.float32)

  # preprocess_xml is not made of TensorFlow operations only, so it has to be
  # wrapped in tf.numpy_function.
  # NOTE: preprocess_xml MUST return a tensor.
  bbxs = tf.numpy_function(func=preprocess_xml, inp=[xml_path], Tout=tf.float32)

  return img, bbxs

def augment_data(img, bboxes):
  """Augment one training example.

  Geometric transforms (which move the boxes) run first through
  Albumentations; photometric transforms (which leave the boxes untouched)
  are then applied with tf.image.

  Returns:
    (img, bboxes) after augmentation, both float32.
  """
  # Albumentations is used here because these transformations also change the bbox values.
  img, bboxes = tf.numpy_function(func=aug_albument, inp=[img, bboxes], Tout=(tf.float32, tf.float32))
  # The ones below don't require changes in the bboxes.
  # NOTE: max_delta=50. assumes pixel values in the 0-255 range — TODO confirm
  # this matches what the model expects.
  img = tf.image.random_brightness(img, max_delta=50.)
  img = tf.image.random_hue(img, max_delta=0.5)
  img = tf.image.random_saturation(img, lower=0.5, upper=1.5)
  img = tf.image.random_contrast(img, lower=0.5, upper=1.5)

  # tf.ensure_shape(img, (H, W, 3))
  # tf.ensure_shape(bboxes, [None, 5])

  return img, bboxes

def preproces_bbxes(img, bboxes):
  """Replace the list of boxes of one example with its label grid.

  The grid is built by generate_output_v2, which is NumPy code and therefore
  has to be wrapped in tf.numpy_function.

  Returns:
    (img, labels) where `labels` is the float32 grid.
  """
  grid_labels = tf.numpy_function(
      func=generate_output_v2,
      inp=[bboxes],
      Tout=tf.float32,
  )
  return img, grid_labels

17061 17061
64 64


In [None]:
# Datasets of (image path, annotation path) string pairs; loading and label
# generation happen in the map stages below.
train_dataset = tf.data.Dataset.from_tensor_slices((train_im_paths, train_xml_paths))
val_dataset = tf.data.Dataset.from_tensor_slices((val_im_paths, val_xml_paths))

def ensure_shape(x, y):
  """Pin the static shapes of a batch of images `x` and label grids `y`.

  Presumably needed because the tf.numpy_function stages leave the shapes
  undefined — verify if this stage is ever removed.
  """
  # Ensures the dataset elements have a defined shape.
  return (
      tf.ensure_shape(x, (None, H, W, 3)),
      tf.ensure_shape(y, (None, SPLIT_SIZE, SPLIT_SIZE, N_CLASSES+5))
  )

# NOTE(review): there is no .shuffle() stage, so training samples are visited
# in os.listdir order every epoch — confirm this is intended.
train_dataset = (
    train_dataset
    .map(get_imbboxes)
    .map(augment_data)
    .map(preproces_bbxes) # Must come AFTER augment_data: augmentation moves the boxes.
    .batch(BATCH_SIZE)
    .map(ensure_shape)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    val_dataset
    .map(get_imbboxes)
    # .map(augment_data) # Validation data is intentionally not augmented.
    .map(preproces_bbxes)
    .batch(BATCH_SIZE)
    .map(ensure_shape)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# for i, j in train_dataset.take(1):
#   # cv.imwrite("out1.png", i.numpy())
#   print(i.shape, j.shape)

In [None]:
# for i, j in train_dataset.take(1):
#   # cv.imwrite("out2.png", i.numpy())
#   print(i.shape, j.shape)

In [None]:
def calculate_iou(boxes1, boxes2):
  """Element-wise intersection over union of two sets of boxes.

  Both arguments hold boxes along the last axis as
  (x_center, y_center, width, height); leading axes may be batched.

  Returns:
    IoU values clipped to [0, 1], with the last axis removed.
  """
  def _corners(boxes):
    # (x_center, y_center, width, height) -> (xmin, ymin, xmax, ymax)
    half_w = boxes[..., 2] / 2.0
    half_h = boxes[..., 3] / 2.0
    return tf.stack(
        [boxes[..., 0] - half_w,
         boxes[..., 1] - half_h,
         boxes[..., 0] + half_w,
         boxes[..., 1] + half_h],
        axis=-1,
    )

  corners_1 = _corners(boxes1)
  corners_2 = _corners(boxes2)

  # Overlap rectangle: the larger of the two mins, the smaller of the two maxes.
  top_left = tf.maximum(corners_1[..., :2], corners_2[..., :2])
  bottom_right = tf.minimum(corners_1[..., 2:], corners_2[..., 2:])

  # Negative extents mean the boxes do not overlap on that axis.
  overlap = tf.maximum(0.0, bottom_right - top_left)
  overlap_area = overlap[..., 0] * overlap[..., 1]

  area_1 = boxes1[..., 2] * boxes1[..., 3]
  area_2 = boxes2[..., 2] * boxes2[..., 3]

  # The floor avoids a division by zero for degenerate boxes.
  union_area = tf.maximum(area_1 + area_2 - overlap_area, 1e-10)
  return tf.clip_by_value(overlap_area / union_area, 0.0, 1.0)

def difference(x, y):
  """Sum of squared differences between prediction `x` and target `y`.

  Squaring each element before summing prevents errors of opposite sign from
  cancelling out (a plain sum of `y - x` can be ~0 for a completely wrong
  prediction). The result is a non-negative scalar.
  """
  return tf.reduce_sum(tf.square(y-x))

@tf.function
def yolo_loss(y_true, y_pred, should_print=False):
  """YOLO-style detection loss over a grid of cells.

  Args:
    y_true: labels of shape (batch, 7, 7, 25); per cell
      [score, x, y, width, height, one-hot classes].
    y_pred: predictions; per cell
      [score_1, x_1, y_1, w_1, h_1, score_2, x_2, y_2, w_2, h_2, classes].
    should_print: unused; kept so existing callers keep working.

  Returns:
    Scalar float32 tensor:
      obj_loss + 0.5 * no_object_loss + 5.0 * box_loss + class_loss
    where every term is `abs(difference(prediction, target))`.

  In cells that contain an object, only the predicted box with the higher IoU
  against the target box contributes to the object and box terms.
  """
  # Only the scores (1 where the cell owns an object, 0 elsewhere).
  target = y_true[..., 0]

  # Rows of [batch, x cell, y cell] for cells with / without an object.
  obj_idx = tf.where(target == 1)
  no_obj_idx = tf.where(target == 0)
  n_obj = tf.shape(obj_idx)[0]

  # ======== Object Loss (cells that DO contain an object) ========
  y_pred_extract = tf.gather_nd(y_pred, obj_idx)
  y_target_extract = tf.gather_nd(y_true, obj_idx)

  # Pixel origin of every object cell: 32 = 224 / 7 is the cell size in a
  # 224-pixel image. EX: rescaler = [[0, 32, 128], [0, 96, 64]]
  rescaler = obj_idx*32
  # Drop the batch column and pad with zeros for width and height.
  # EX: upscaler_1 = [[32, 128, 0, 0], [96, 64, 0, 0]]
  upscaler_1 = tf.concat([rescaler[:, 1:], tf.zeros([n_obj, 2], dtype=tf.int64)], axis=-1)
  cell_origin = tf.cast(upscaler_1, dtype=tf.float32)

  # (x, y) are relative to the cell -> scale by the cell size (32);
  # (width, height) are relative to the image -> scale by the image size (224).
  box_scale = tf.repeat([[32., 32., 224., 224.,]], repeats=[n_obj], axis=0)

  # Boxes in whole-image pixel coordinates: cell origin + offset inside the cell.
  target_origin = cell_origin + box_scale * tf.cast(y_target_extract[..., 1:5], dtype=tf.float32)
  pred_1_origin = cell_origin + box_scale * tf.cast(y_pred_extract[..., 1:5], dtype=tf.float32)
  pred_2_origin = cell_origin + box_scale * tf.cast(y_pred_extract[..., 6:10], dtype=tf.float32)

  # Index of the predicted box with the higher IoU against the target:
  # >> 0 if it's the first box (also on ties).
  # >> 1 if it's the second box.
  # `mask` is used below as a column index, so it must be 1 exactly when the
  # SECOND box is the better one.
  iou_1 = calculate_iou(target_origin, pred_1_origin)
  iou_2 = calculate_iou(target_origin, pred_2_origin)
  mask = tf.cast(tf.math.greater(iou_2, iou_1), dtype=tf.int32)
  # Rows of [object row, chosen box] for the gathers below.
  best_idx = tf.stack([tf.range(n_obj), mask], axis=-1)

  # (n_obj, 2): score of the first and second box.
  y_pred_joined = tf.stack([y_pred_extract[..., 0], y_pred_extract[..., 5]], axis=-1)
  obj_pred = tf.gather_nd(y_pred_joined, best_idx)

  obj_loss = tf.math.abs(difference(tf.cast(obj_pred, dtype=tf.float32), tf.ones([n_obj], dtype=tf.float32)))

  # ======== No Object Loss (cells that DO NOT contain an object) ========
  no_obj_pred = tf.gather_nd(y_pred[..., 0:B*5], no_obj_idx)
  # tf.shape (not len) because the number of rows is only known at run time.
  no_obj_target = tf.zeros([tf.shape(no_obj_pred)[0]], dtype=tf.float32)

  # Both scores of an empty cell should be 0.
  no_object_loss_1 = tf.math.abs(difference(tf.cast(no_obj_pred[..., 0], dtype=tf.float32), no_obj_target))
  no_object_loss_2 = tf.math.abs(difference(tf.cast(no_obj_pred[..., 5], dtype=tf.float32), no_obj_target))

  no_object_loss = no_object_loss_1 + no_object_loss_2

  # ======== Object Class Loss ========
  class_pred = tf.gather_nd(y_pred[..., 10:], obj_idx)
  class_extract = tf.gather_nd(y_true[..., 5:], obj_idx)

  class_loss = tf.math.abs(difference(tf.cast(class_pred, dtype=tf.float32), tf.cast(class_extract, dtype=tf.float32)))

  # ======== Object Bounding Box Loss ========
  # x_center and y_center of the box with the higher IoU.
  center_joined = tf.stack([y_pred_extract[..., 1:3], y_pred_extract[..., 6:8]], axis=1)
  center_pred = tf.gather_nd(center_joined, best_idx)
  center_target = tf.gather_nd(y_true[..., 1:3], obj_idx)

  center_loss = tf.math.abs(difference(tf.cast(center_pred, dtype=tf.float32), tf.cast(center_target, dtype=tf.float32)))

  # Width and height of the box with the higher IoU, compared through their
  # square roots (abs() keeps the square root defined for negative values).
  size_joined = tf.stack([y_pred_extract[..., 3:5], y_pred_extract[..., 8:10]], axis=1)
  size_pred = tf.gather_nd(size_joined, best_idx)
  size_target = tf.gather_nd(y_true[..., 3:5], obj_idx)

  size_loss = tf.math.abs(difference(tf.cast(tf.math.sqrt(tf.math.abs(size_pred)), dtype=tf.float32), tf.cast(tf.math.sqrt(tf.math.abs(size_target)), dtype=tf.float32)))

  box_loss = center_loss + size_loss

  # ======== Final Loss ========
  lambda_coord = 5.0
  lambda_no_obj = 0.5

  # Every term is already non-negative (each one is an abs()).
  loss = obj_loss + (lambda_no_obj * no_object_loss) + (lambda_coord * box_loss) + class_loss
  loss = tf.reduce_mean(loss) # Just for tensorflow to understand the shape of `loss`.
  return loss


In [None]:
"""
# x_center, y_center, width, height (all normalized)
For [0.210784,0.616422,0.127451,0.232843,2], the cell indexes are calculated as follows:
>> x = int(0.210784 * SPLIT_SIZE) = 1
>> y = int(0.616422 * SPLIT_SIZE) = 4
For [0.509804,0.411765,0.107843,0.245098,3], the cell indexes are calculated as:
>> x = int(0.509804 * SPLIT_SIZE) = 3
>> y = int(0.411765 * SPLIT_SIZE) = 2
"""
# Hand-made sanity check of yolo_loss: two ground-truth boxes (cells [3][2]
# and [1][4]) against a random prediction grid whose two object cells are
# overwritten with hand-picked values.
# y_true=generate_output_v2(np.array([[0.210784,0.616422,0.127451,0.232843,2]]))
# y_true=generate_output_v2(np.array([[0.509804,0.411765,0.107843,0.245098,3]]))
y_true=generate_output_v2(np.array([[0.509804,0.411765,0.107843,0.245098,3], [0.210784,0.616422,0.127451,0.232843,2]]))
# y_true=generate_output_v2(np.array([]))
# Add the batch axis.
y_true=np.expand_dims(y_true,axis=0)
# NOTE: no seed is set in the visible cells, so the printed loss changes from
# run to run.
y_pred=np.random.normal(size = (1,7,7,N_CLASSES+5*B))

# print(y_pred.shape)

#                    5 info for bbx,            5 info for bbx,           20 classes
y_pred[0][1][4] = [0.9,0.2,0.6,0.1,0.95,      1.0,0.47,0.31,0.12,0.23,   0.9,  0.8, 0.2, 0.6, 0.1, 0.5, 0.9,0.35, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,]
# y_pred[0][1][4] = [0.9,0.47,0.31,0.12,0.23,     1.0,0.2,0.6,0.1,0.95,    0.9,  0.8, 0.2, 0.6, 0.1, 0.5, 0.9,0.35, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,]
y_pred[0][3][2] = [0.3,0.01,0.08,0.11,0.54,   0.98,0.56,0.88,0.1,0.24,  0.09,0.018,0.22,0.16,0.01,0.05,0.99, 0.3, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,]
# y_pred[0][3][2] = [0.3,0.01,0.08,0.11,0.54,   0.98,0.56,0.88,0.1,0.24,  0.09,0.018,0.22,0.16,0.01,0.05,0.99, 0.3, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,]

yolo_loss(y_true, y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=19.728378>

In [None]:
NUM_FILTERS=512
# Per-cell output size: B boxes of 5 values each (score first, then
# x, y, width, height) followed by the N_CLASSES class scores.
# Ex: when B=2 and N_CLASSES=20 this is 30.
OUTPUT_DIM=N_CLASSES + (5*B)

# Pretrained backbone without its classification head.
# base_model = tf.keras.applications.resnet50.ResNet50(
base_model = tf.keras.applications.efficientnet.EfficientNetB1(
    weights="imagenet",
    include_top=False,
    input_shape=(H, W, 3),
)

# Detection head: four 3x3 convolutions, one hidden dense layer and a dense
# output reshaped to the (SPLIT_SIZE, SPLIT_SIZE, OUTPUT_DIM) grid.
model = tf.keras.Sequential([
    base_model,

    # `he_normal` is a weight initialization method designed for layers using rectified
    # linear unit (ReLU) activation functions (or similar variants like LeakyReLU) which
    # helps prevent vanishing and exploding gradients, ensuring that the weights remain
    # at an appropriate scale for effective learning, especially in deep networks.
    Conv2D(NUM_FILTERS, kernel_size=(3,3), padding="same", kernel_initializer="he_normal"),
    BatchNormalization(),
    # UserWarning: Argument `alpha` is deprecated. Use `negative_slope` instead.
    # NOTE: But if you're using an older version of tensorflow, it may be necessary.
    # LeakyReLU(negative_slope=0.1),
    LeakyReLU(alpha=0.1),

    Conv2D(NUM_FILTERS, kernel_size=(3,3) , padding="same", kernel_initializer="he_normal"),
    BatchNormalization(),
    # LeakyReLU(negative_slope=0.1),
    LeakyReLU(alpha=0.1),

    Conv2D(NUM_FILTERS, kernel_size=(3,3) , padding="same", kernel_initializer="he_normal"),
    BatchNormalization(),
    # LeakyReLU(negative_slope=0.1),
    LeakyReLU(alpha=0.1),

    # NOTE(review): unlike the three blocks above, this convolution has no
    # BatchNormalization — confirm this is intentional.
    Conv2D(NUM_FILTERS, kernel_size=(3,3) , padding="same", kernel_initializer="he_normal"),
    # LeakyReLU(negative_slope=0.1),
    LeakyReLU(alpha=0.1),

    # GlobalAveragePooling2D() # We don't want only the mean/representative value for each channel.
    Flatten(),

    Dense(NUM_FILTERS, kernel_initializer="he_normal"),
    BatchNormalization(),
    # LeakyReLU(negative_slope=0.1),
    LeakyReLU(alpha=0.1),
    Dropout(0.5),

    # 7 * 7 * 30 values; the sigmoid keeps every output in (0, 1).
    Dense(SPLIT_SIZE * SPLIT_SIZE * OUTPUT_DIM, activation="sigmoid"),
    tf.keras.layers.Reshape((SPLIT_SIZE, SPLIT_SIZE, OUTPUT_DIM)),
])

model.summary()



In [None]:
# Keep only the weights of the epoch with the lowest validation loss, stored
# under ROOT_DIR.
checkpoint = ModelCheckpoint(
    filepath=ROOT_DIR+"model.weights.h5",
    save_best_only=True,
    save_weights_only=True,
    monitor="val_loss",
    mode="min",
    save_freq="epoch",
)

def scheduler(epoch, lr):
  """Step learning-rate schedule keyed on the epoch index.

  Args:
    epoch: index of the epoch about to start.
    lr: current learning rate; ignored, the rate depends on `epoch` only.

  Returns:
    1e-3 before epoch 40, 5e-4 for epochs 40-79 and 1e-4 from epoch 80 on.
  """
  if epoch >= 80:
    return 1e-4
  if epoch >= 40:
    return 5e-4
  return 1e-3

lr_callback = LearningRateScheduler(scheduler)

# NOTE(review): `scheduler` ignores the current rate and returns 1e-3 from the
# first epoch, so when lr_callback is passed to fit() the LR value given to
# Adam here is presumably overridden — confirm which one is intended.
model.compile(
    optimizer=Adam(learning_rate=LR),
    loss=yolo_loss,
)

In [None]:
# model.fit(train_dataset, validation_data=val_dataset, verbose=1, callbacks=[checkpoint, lr_callback])

In [None]:
# Load one validation image and resize it to the network input size
# (the path equals VAL_IMAGES + "2007_000515.jpg").
image=tf.io.decode_jpeg(tf.io.read_file("/content/drive/MyDrive/tfds_data/pascal_voc_2012/VOC2012/ValJPEGImages/2007_000515.jpg"))
image=tf.image.resize(image, [H,W])
# model.predict(tf.expand_dims(image, axis = 0))

In [None]:
def model_test(path, filename):
  """Run `model` on one image and save a copy with the detections drawn on it.

  Prints the image path, the scores and boxes that pass the confidence
  threshold and the indices kept by non-max suppression, then writes
  '/content/<filename without its last 4 characters>_det.jpg' (384x384).

  Args:
    path: directory prefix of the image; concatenated as-is with `filename`.
    filename: image file name.

  Returns:
    None. When no box passes the threshold, "NO object found !!!" is printed
    and no file is written.

  Raises:
    FileNotFoundError: if OpenCV cannot read the image. Other errors are no
      longer swallowed, so real failures are not reported as "no object".
  """
  THRESH=.60

  test_path=path+filename
  print(test_path)

  img = cv.imread(test_path)
  if img is None:
    raise FileNotFoundError("Could not read image: " + test_path)
  # The boxes decoded below live in the (W, H) network-input space, so the
  # canvas they are drawn on must have that size as well.
  img = cv.resize(img, (W, H))

  # channels=3 so grayscale files still match the model's (H, W, 3) input.
  image=tf.io.decode_jpeg(tf.io.read_file(test_path), channels=3)
  image=tf.image.resize(image, [H,W])

  # output has one vector per cell:
  # [score_1, x_1, y_1, w_1, h_1, score_2, x_2, y_2, w_2, h_2, classes...]
  output=model.predict(np.expand_dims(image, axis = 0))

  # Cells where the first OR the second score reaches THRESH, each listed once,
  # as rows of [batch, x cell, y cell].
  keep = (output[...,0]>=THRESH) | (output[...,5]>=THRESH)
  object_positions = tf.where(keep).numpy()
  # Most likely class of every cell ([..., 10:] skips the two boxes).
  class_ids = tf.argmax(output[...,10:], axis=-1).numpy()

  # Pixel size of one cell (32 for a 224-pixel image split in 7).
  cell_w, cell_h = W / SPLIT_SIZE, H / SPLIT_SIZE

  final_boxes=[]
  final_scores=[]

  for batch_idx, cell_x, cell_y in object_positions:
    cell_output = output[batch_idx, cell_x, cell_y]
    for j in range(2):
      score = float(cell_output[j*5])
      if score > THRESH:
        # j == 0 -> values 1..4 (first box); j == 1 -> values 6..9 (second box):
        # x_center, y_center (relative to the cell), width, height (relative to the image).
        box_x, box_y, box_w, box_h = (float(v) for v in cell_output[(j*5)+1:(j*5)+5])

        # Cell index + offset inside the cell, scaled by the cell size, gives
        # the center in pixels.
        x_center = (cell_x + box_x) * cell_w
        y_center = (cell_y + box_y) * cell_h

        # Un-normalize width and height.
        x_width, y_height = abs(W*box_w), abs(H*box_h)

        # Corners, clipped to the image.
        x_min = max(int(x_center-(x_width/2)), 0)
        y_min = max(int(y_center-(y_height/2)), 0)
        x_max = min(int(x_center+(x_width/2)), W)
        y_max = min(int(y_center+(y_height/2)), H)

        final_boxes.append(
            [x_min, y_min, x_max, y_max,
             str(classes[class_ids[batch_idx, cell_x, cell_y]])],
        )
        final_scores.append(score)

  print("finalscores", final_scores)
  print('finalboxes', final_boxes)

  if not final_boxes:
    print("NO object found !!!")
    return

  # Coordinates only: the class names must not be mixed into the numeric array.
  nms_boxes = np.array([box[:4] for box in final_boxes], dtype=np.float32)

  # non_max_suppression removes boxes that predict the same object:
  # >> iou_threshold: two boxes overlapping more than this are treated as the
  #    same object and only the higher-scoring one is kept.
  # >> score_threshold: -inf, i.e. no extra filtering by score here.
  # Boxes are given as [x_min, y_min, x_max, y_max]; the overlap is the same as
  # with x and y swapped consistently.
  nms_output=tf.image.non_max_suppression(
      nms_boxes, np.array(final_scores, dtype=np.float32), max_output_size=100,
      iou_threshold=0.2, score_threshold=float('-inf'),
  )
  print(nms_output)

  # nms_output holds the indices of the boxes that survived the suppression.
  for kept in nms_output.numpy():
    x_min, y_min, x_max, y_max, label = final_boxes[kept]
    cv.rectangle(img, (x_min, y_min), (x_max, y_max), (0,0,255), 1)
    cv.putText(
        img,
        label,
        (x_min, y_min+15),
        cv.FONT_HERSHEY_COMPLEX_SMALL,
        1,
        (0,225,0),
        1,)

  cv.imwrite('/content/'+ filename[:-4]+'_det'+'.jpg', cv.resize(img,(384,384)))

In [None]:
# Run the detector on one validation image; the annotated copy is written under /content/.
model_test(VAL_IMAGES, "2007_000027.jpg")

/content/drive/MyDrive/tfds_data/pascal_voc_2012/VOC2012/ValJPEGImages/2007_000027.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
finalscores [<tf.Tensor: shape=(), dtype=float32, numpy=0.75394267>, <tf.Tensor: shape=(), dtype=float32, numpy=0.7138698>, <tf.Tensor: shape=(), dtype=float32, numpy=0.6832388>, <tf.Tensor: shape=(), dtype=float32, numpy=0.71943706>, <tf.Tensor: shape=(), dtype=float32, numpy=0.7093084>, <tf.Tensor: shape=(), dtype=float32, numpy=0.6538927>, <tf.Tensor: shape=(), dtype=float32, numpy=0.74860483>, <tf.Tensor: shape=(), dtype=float32, numpy=0.6531306>, <tf.Tensor: shape=(), dtype=float32, numpy=0.7462428>, <tf.Tensor: shape=(), dtype=float32, numpy=0.67122984>, <tf.Tensor: shape=(), dtype=float32, numpy=0.62466246>, <tf.Tensor: shape=(), dtype=float32, numpy=0.6106474>, <tf.Tensor: shape=(), dtype=float32, numpy=0.8205828>, <tf.Tensor: shape=(), dtype=float32, numpy=0.66371423>, <tf.Tensor: shape=(), dtype=float32, numpy=0.611155