In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import keras.backend as K
import sys, os
from tensorflow.python.keras.regularizers import l2
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Extend the module search path with the parent directory.
# os.path.dirname(".") evaluates to the empty string, so the entry appended to
# sys.path is the relative path "..", which is resolved against the current
# working directory rather than against this notebook's location.
# NOTE(review): presumably this is what makes `dataHandler` (imported below)
# importable — confirm where that module lives.
here = os.path.dirname(".")
sys.path.append(os.path.join(here, '..'))

from dataHandler import *

2024-07-20 19:31:56.513859: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  1


2024-07-20 19:31:58.351745: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-20 19:31:58.370925: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-20 19:31:58.371103: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [2]:
class YOLOv1_LastLayer_Reshape(tf.keras.layers.Layer):
    """
    Defines a custom layer that reshapes the flat output of the last dense layer into a
    YOLOv1 compatible tensor of shape (batch, Sx, Sy, C + 5 * B) and squashes every part
    into its valid range.
    Note, No build function is needed because the layer owns no weights.
    """
    def __init__(self, targetShape, numClasses = 1, numBoxes = 2, **kwargs):
        """
        Initializes the layer.

        Args:
            targetShape: tuple: Target shape of the grid. Only the first two entries
                (Sx, Sy) are read by the layer.
            numClasses: int: Number of classes (C). Defaults to 1.
            numBoxes: int: Number of predicted bounding boxes per grid cell (B). Defaults to 2.
            **kwargs: Standard keras layer arguments (name, dtype, trainable, ...). They
                must be accepted here, otherwise the layer can not be rebuilt from the
                dictionary returned by get_config.
        """
        super().__init__(**kwargs)
        self.targetShape = tuple(targetShape)
        self.numClasses = int(numClasses)
        self.numBoxes = int(numBoxes)

    def get_config(self):
        """
        Helps in serializing the layer data. The keys match the constructor's argument
        names so that the default `cls(**config)` reconstruction works.
        """
        config = super().get_config()
        config.update({
            "targetShape": self.targetShape,
            "numClasses": self.numClasses,
            "numBoxes": self.numBoxes,
        })
        return config

    @classmethod
    def from_config(cls, config):
        """
        Rebuilds the layer from a config dictionary. Configs written by the earlier version
        of this layer stored the shape under "target_shape"; that key is still accepted.
        """
        config = dict(config)
        legacyShape = config.pop("target_shape", None)
        if legacyShape is not None and "targetShape" not in config:
            config["targetShape"] = legacyShape
        return cls(**config)

    def call(self, layerInput):
        """
        Forward computations. We take the first Sx * Sy * C indexes of each input
        vector to resemble the class probabilities of each grid cell. The next
        Sx * Sy * B indexes resemble the confidence scores of each grid cell and the
        rest resemble the bounding box parameters <boxCenterX, boxCenterY, width, height>

        Args:
            layerInput: tensor: The output from a dense (fully connected) layer. Its second
                axis is expected to hold Sx * Sy * (C + 5 * B) values.

        Returns:
            Tensor of shape (batch, Sx, Sy, C + 5 * B): class probabilities, confidence
            scores and bounding box parameters concatenated on the last axis.
        """

        Sx, Sy = self.targetShape[0], self.targetShape[1] # Number of parts that each axis is divided to
        C = self.numClasses # Number of classes
        B = self.numBoxes # Number of predicted bounding boxes per grid cell

        # Boundaries of the three segments inside the flat input vector
        classEnd = Sx * Sy * C
        confEnd = Sx * Sy * (C + B)

        # The explicit batch size keeps the reshape strict: an input of unexpected width fails
        # instead of being silently folded into the batch axis.
        batchSize = tf.shape(layerInput)[0]

        # Class probabilities. Softmax runs over the class axis; with a single class it is
        # identically 1.
        classProbs = tf.reshape(layerInput[:, :classEnd], (batchSize, Sx, Sy, C))
        classProbs = tf.nn.softmax(classProbs, axis = -1)

        # Confidence scores should be between 0 and 1
        confScores = tf.reshape(layerInput[:, classEnd:confEnd], (batchSize, Sx, Sy, B))
        confScores = tf.sigmoid(confScores)

        # All of the bounding box parameters are relative (Between 0 and 1)
        bBox = tf.reshape(layerInput[:, confEnd:], (batchSize, Sx, Sy, B * 4))
        bBox = tf.sigmoid(bBox)

        return tf.concat([classProbs, confScores, bBox], axis = -1)

# # # Define a simple model using the custom reshaper layer to test it
# # input = tf.keras.layers.Input(shape=(539,))
# # x = YOLOv1_LastLayer_Reshape((7,7,11))(input)
# # model = tf.keras.Model(inputs = input, outputs = x, name = "dummy")
# # model.compile(optimizer='adam',  loss='mse', metrics=['accuracy'])

# # xTest = np.random.randint(10, size = (1,539))
# # pred = model.predict(xTest)
# # print(pred.shape)

In [3]:
def iouUtils(boxParams, gridRatio = tf.constant(7, tf.float32)):
    """
    Converts bounding boxes given as (center, size) into their top-left and bottom-right corners.
    The width and height are assumed to be floats between 0 and 1 expressed relative to the whole
    image, whereas the center's x and y are assumed to be floats between 0 and 1 measured from the
    upper-left point of the grid cell. Multiplying the half sizes by gridRatio brings them to the
    same grid-cell units as the center.

    Args:
        boxParams: tf.Tensor: A tensor whose last axis holds (Box center X, Box center Y, Box width,
            Box height) for all boxes.
        gridRatio: int: The number of evenly distributed grid cells in each image axis. Use 7 for YOLOv1.

    Returns:
        Two tensors: the top-left points of the boxes and the bottom-right points of the boxes.
    """
    centerXY = boxParams[..., :2]

    # Half of the width/height, rescaled from image-relative to grid-cell units
    scaledHalfExtent = tf.divide(boxParams[..., 2:], tf.constant([2.])) * gridRatio

    topLeft = tf.subtract(centerXY, scaledHalfExtent)
    bottomRight = tf.add(centerXY, scaledHalfExtent)
    return topLeft, bottomRight

def calcIOU(predict_topLeft, predict_bottomRight, truth_topLeft, truth_bottomRight):
    """
    Calculates intersection over union for two bounding boxes.

    Args:
        predict_topLeft, predict_bottomRight: tf.Tensor: Top-left and bottom-right coordinates of the predicted box, acquired 
            by iouUtils.
        truth_topLeft, truth_bottomRight: tf.Tensor: Top-left and bottom-right coordinates of the ground truth box, acquired 
            by iouUtils.
    
    Returns:
        Intersection over union of two boxes, reduced over the last (coordinate) axis. The value is 0
        for boxes that do not overlap and for pairs whose union has zero area.
    """

    intersectTopLeft = tf.maximum(predict_topLeft, truth_topLeft)
    intersectBottomRight = tf.minimum(predict_bottomRight, truth_bottomRight)

    # When the boxes are disjoint along an axis the difference below is negative. It is clamped
    # to zero; taking its absolute value instead would report the gap between the boxes as overlap.
    intersectWH = tf.maximum(tf.subtract(intersectBottomRight, intersectTopLeft), 0.)
    intersectArea = tf.reduce_prod(intersectWH, axis = -1)

    # Get area of predicted and ground truth bounding boxes
    predArea = tf.reduce_prod(tf.abs(tf.subtract(predict_topLeft, predict_bottomRight)), axis = -1)
    truthArea = tf.reduce_prod(tf.abs(tf.subtract(truth_topLeft, truth_bottomRight)), axis = -1)

    # Return IOU. divide_no_nan yields 0 instead of NaN when both boxes are degenerate (zero union).
    return tf.math.divide_no_nan(intersectArea, predArea + truthArea - intersectArea)

# #  Testing IOU code
# predict = tf.random.uniform((3,4))
# truth = tf.random.uniform((3,4))
# p1, p2 = iouUtils(predict)
# t1, t2 = iouUtils(truth)
# print(calcIOU(p1, p2, t1, t2))

2024-07-20 19:31:58.400751: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-20 19:31:58.400971: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-20 19:31:58.401092: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [4]:

# Get a single ground truth vector to test with.
# NOTE(review): generateGroundTruth_YOLOv1 is not defined in this notebook; presumably it
# comes from the `from dataHandler import *` star import — confirm. The meaning of the
# (7,7,1,1) argument is not visible here either.
df = generateGroundTruth_YOLOv1("../data/labels/train", "txt", (7,7,1,1))
# Label-based lookup: row "04c8acd4a5be79bc", column "vector" of the returned frame.
yTruth = df.at["04c8acd4a5be79bc", "vector"]

In [5]:
# YOLOv1 Loss
class YOLOv1_Loss(tf.keras.losses.Loss):
    """
    Defines the custom loss function that is used for YOLOv1 network.
    The loss is calculated in 3 parts:
    1. Localization Loss
    2. Confidence Loss
    3. Classification Loss

    The computation itself is delegated to testLoss, which is defined in this notebook.

    Note: In this method's documentation, "ground truth" and "target" are used interchangeably 
    """
    
    def __init__(self, Sx = 7, Sy = 7, B_target = 1, B_pred = 2, C = 1, **kwargs):
        """
        Initializes the loss function. 

        Args:
            Sx: int: Number of grid cells on x axis
            Sy: int: Number of grid cells on y axis 
            B_target: int: Number of bounding boxes in the ground truth data
            B_pred: int: Number of bounding boxes in prediction
            C: int: Number of the classes
            **kwargs: Standard keras loss arguments (name, reduction), forwarded to the base class.
        """
        super().__init__(**kwargs)

        # Define YOLOv1 parameters
        self.Sx = Sx
        self.Sy = Sy
        self.B_target = B_target # Ground truth grid cells only have one bounding box 
        self.B_pred = B_pred
        self.C = C

    def get_config(self):
        """
        Helps in serializing the loss. Keys match the constructor's argument names.
        """
        config = super().get_config()
        config.update({
            "Sx": self.Sx,
            "Sy": self.Sy,
            "B_target": self.B_target,
            "B_pred": self.B_pred,
            "C": self.C,
        })
        return config

    def call(self, yTrue, yPred):
        """
        Calculates the loss for a batch.

        Args:
            yTrue, yPred: tf.Tensor: The ground truth value and the predicted value, respectively

        Returns:
            The value computed by testLoss.

        Raises:
            NotImplementedError: If the configured layout differs from the one testLoss slices
                by (1 class, 1 ground truth box, 2 predicted boxes).
        """
        # testLoss hard-codes the index boundaries of the class/confidence/box segments.
        # Anything else would be sliced wrongly, so fail loudly instead.
        if (self.C, self.B_target, self.B_pred) != (1, 1, 2):
            raise NotImplementedError(
                "YOLOv1_Loss currently supports only C = 1, B_target = 1 and B_pred = 2"
            )

        return testLoss(yTrue, yPred)
        
def testLoss(yTrue, yPred):
    """
    Runs in the event of loss function calculations. Computes the YOLOv1 loss
    (classification + confidence + localization) summed over every grid cell of every
    sample that is passed in.

    The last axis of both tensors is laid out as <class, confidence(s), box parameters>:
    yTrue is sliced as 1 class value, 1 confidence score and one box; yPred is sliced as
    1 class value, 2 confidence scores and two boxes. Every box is
    <boxCenterX, boxCenterY, width, height>.
    
    Args:
        yTrue, yPred: tf.Tensor: The ground truth value and the predicted value, respectively

    Returns:
        The calculated loss as a scalar tensor.
    """
    lambdaNoObj = tf.constant(.5)
    lambdaCoord = tf.constant(5.)
    epsilon = tf.constant(1e-6) # Lower bound of the square root arguments, keeps gradients finite

    # Every constant and mask below is float32, so bring both inputs to that type
    # (ground truth built with numpy is typically float64).
    yTrue = tf.cast(yTrue, tf.float32)
    yPred = tf.cast(yPred, tf.float32)

    # Split the predictions and ground truth vectors to coordinates, confidence and class matrices
    # 1. Ground truth 
    idx1, idx2 = 1, 1 + 1
    targetClass = yTrue[...,:idx1]
    targetConf = yTrue[...,idx1:idx2]
    targetCoords = yTrue[...,idx2:]

    # 2. Prediction
    idx1, idx2 = 1, 1 + 2
    predClass = yPred[...,:idx1]
    predConf = yPred[...,idx1:idx2]
    predCoords = yPred[...,idx2:]

    # Get the best bounding boxes by calculating the IOUs
    # Note: To do this process for the confidence scores as well, we concat each box's confidence
    # score to its bounding box coordinates and analyze them as a whole.
    predBox1 = tf.concat([predConf[...,0:1], predCoords[...,:4]], axis = -1)
    predBox2 = tf.concat([predConf[...,1:2], predCoords[...,4:]], axis = -1)

    # Get the corners of bounding boxes to calculate IOUs
    # Note, iouUtils is not coded to accept confidence scores. So we only pass the coordinates into 
    # it. 
    p1_left, p1_right = iouUtils(predBox1[...,1:]) 
    p2_left, p2_right = iouUtils(predBox2[...,1:])
    t_left, t_right = iouUtils(targetCoords) 

    # Calculate IOUs for first and second predicted bounding box
    p1_IOU = calcIOU(p1_left, p1_right, t_left, t_right)
    p2_IOU = calcIOU(p2_left, p2_right, t_left, t_right)

    # Get the cells that have objects
    maskObj = tf.cast(0 < targetConf, tf.float32)
    maskNoObj = tf.cast(0 == targetConf, tf.float32)
    
    # The two masks are complementary; ties go to the second box
    mask_p1Bigger = tf.expand_dims(tf.cast(p2_IOU < p1_IOU, tf.float32),-1)
    mask_p2Bigger = tf.expand_dims(tf.cast(p1_IOU <= p2_IOU, tf.float32),-1)

    # Getting the responsible bounding box for loss calculation. Output is of shape [...,5]
    # And the first element is the confidence score of that box. The object mask is applied to
    # the individual loss terms below rather than to the box itself, so that no masked zero
    # ever reaches tf.sqrt (its gradient at 0 is infinite and 0 * inf gives NaN).
    respBox = mask_p1Bigger * predBox1 + mask_p2Bigger * predBox2
    respConf = respBox[...,0:1]
    respXY = respBox[...,1:3]
    respWH = respBox[...,3:5]
    targetXY = targetCoords[...,0:2]
    targetWH = targetCoords[...,2:4]

    # Calculating the losses
    # 1. Classification loss
    classificationLoss = tf.reduce_sum(maskObj * tf.square(tf.subtract(targetClass, predClass)))

    # 2. Confidence loss
    # Bear in mind, for the boxes with no objects, we account for the confidence loss as well. 
    # To penalize the network for high confidence scores of the cells containing no objects. The 
    # cells that have no objects, have a confidence score of 0 in the target ground truth matrix.
    # Thus, the loss is calculated as follows: SUM_All_Cells_No_OBJ((C1-0)^2 + (C2-0)^2)
    confidenceLossObj = tf.reduce_sum(maskObj * tf.square(tf.subtract(targetConf, respConf)))
    confidenceLossNoObj = lambdaNoObj * tf.reduce_sum(maskNoObj * tf.reduce_sum(tf.square(predConf), axis = -1, keepdims = True))
    
    # 3. Localization loss
    # Centers are compared directly; widths and heights (columns 3:5 of respBox, 2:4 of
    # targetCoords) are compared through their square roots. Both terms are reduced to scalars
    # so that they carry the same weight as the other loss parts.
    xyLoss = tf.reduce_sum(maskObj * tf.square(tf.subtract(respXY, targetXY)))
    sqrtDiff = tf.subtract(tf.sqrt(tf.maximum(respWH, epsilon)), tf.sqrt(tf.maximum(targetWH, epsilon)))
    whLoss = tf.reduce_sum(maskObj * tf.square(sqrtDiff))
    localizationLoss = lambdaCoord * (xyLoss + whLoss) 

    # Sum all the three types of the errors
    return classificationLoss + confidenceLossNoObj + confidenceLossObj + localizationLoss


# # Define a simple model using the custom reshaper layer to test it
# input = tf.keras.layers.Input(shape=(539,))
# x = YOLOv1_LastLayer_Reshape((7,7,11))(input)
# model = tf.keras.Model(inputs = input, outputs = x, name = "dummy")
# model.compile(optimizer='adam',  loss=testLoss, metrics=['accuracy'])

# xTest = np.random.randint(10, size = (1,539))
# pred = model.predict(xTest)
# # model.evaluate(xTest,np.expand_dims(yTruth, 0),)
# model.fit(xTest, np.expand_dims(yTruth, 0), epochs = 1)
# # metrics = model.evaluate(xTest)
# # print(pred.shape)

In [6]:
# YOLOv1 structure
YOLOv1_inputShape = (448,448,3) # Shape of the input image 
classNo = 1 # Number of classes we are trying to detect
input = tf.keras.layers.Input(shape=YOLOv1_inputShape)
leakyReLu = tf.keras.layers.LeakyReLU(alpha = .1)


def convLayer(x, filters, kernelSize, strides = 1):
    """
    Appends one "same"-padded, leaky ReLU activated and L2 regularized convolution to x.

    Args:
        x: tensor: Output of the previous layer.
        filters: int: Number of output channels.
        kernelSize: int: Side length of the square kernel.
        strides: int: Stride of the convolution. Defaults to 1.

    Returns:
        The output tensor of the new convolution layer.
    """
    return tf.keras.layers.Conv2D(
        filters = filters,
        kernel_size = kernelSize,
        strides = strides,
        padding = "same",
        activation = leakyReLu,
        # Public regularizer API instead of the private tensorflow.python.keras copy
        kernel_regularizer = tf.keras.regularizers.l2(1e-5),
    )(x)


def maxPool(x):
    """
    Appends a 2x2 max pooling layer with stride 2, halving both spatial axes of x.
    """
    return tf.keras.layers.MaxPool2D(pool_size=2, strides=2, padding = "same")(x)


# The backbone, Acts as a feature extractor
# L1 (448 -> 224 -> 112)
x = convLayer(input, 64, 7, strides = 2)
x = maxPool(x)

# L2 (112 -> 56)
x = convLayer(x, 192, 3)
x = maxPool(x)

# L3 (56 -> 28)
x = convLayer(x, 128, 1)
x = convLayer(x, 256, 3)
x = convLayer(x, 256, 1)
x = convLayer(x, 512, 3)
x = maxPool(x)

# L4 (28 -> 14)
for _ in range(4):
    x = convLayer(x, 256, 1)
    x = convLayer(x, 512, 3)
x = convLayer(x, 512, 1)
x = convLayer(x, 1024, 3)
x = maxPool(x)

# L5 (14 -> 7, through the strided convolution)
x = convLayer(x, 512, 1)
x = convLayer(x, 1024, 3)
x = convLayer(x, 512, 1)
x = convLayer(x, 1024, 3)
x = convLayer(x, 1024, 3)
x = convLayer(x, 1024, 3, strides = 2)

# L6
x = convLayer(x, 1024, 3)
x = convLayer(x, 1024, 3)

# Neck
x = tf.keras.layers.Flatten()(x)
# Without a non-linearity the two dense layers would collapse into a single linear map
x = tf.keras.layers.Dense(4096, activation = leakyReLu)(x)
# Dropout layer for avoiding overfitting. It sits between the two dense layers: applied after
# the last one it would randomly zero the network's own predictions during training.
x = tf.keras.layers.Dropout(.5)(x)
# The output layer stays linear because YOLOv1_LastLayer_Reshape applies the softmax/sigmoid
# itself; a sigmoid here would be squashed a second time and confine every value to (0.5, 0.73).
x = tf.keras.layers.Dense(7*7*(5*2+classNo))(x)
x = YOLOv1_LastLayer_Reshape((7,7,5*2+classNo))(x)
model = tf.keras.Model(inputs = input, outputs = x, name = "YOLOv1")

model.compile(loss = testLoss ,optimizer = 'adam')
# # Print the architecture
# for layer in model.layers:
#     print(layer.output_shape)

In [7]:
# Train the compiled YOLOv1 model from a batch generator.
# NOTE(review): batch_size is only used below to derive steps_per_epoch; it is not passed to
# dataGenerator_YOLOv1, whose numeric arguments are the literals 1 and 1. If one of those is
# the generator's batch size, an epoch would cover only about a quarter of the rows —
# confirm against the generator's signature.
batch_size = 4

# NOTE(review): annotationsToDataframe and dataGenerator_YOLOv1 are not defined in this
# notebook; presumably they come from the dataHandler star import — confirm. The f-string
# prefixes have no placeholders, so the paths are plain literals.
dfTrain = annotationsToDataframe(f"../data/labels/train", "txt")
trainingBatchGenerator = dataGenerator_YOLOv1(f"../data/images/train", 1, (448,448), dfTrain, 1, True)
# Number of steps per epoch: row count of dfTrain floor-divided by batch_size
print(int(dfTrain.shape[0] // batch_size))

# NOTE(review): workers=4 pulls from the generator with several workers; assumes
# trainingBatchGenerator is safe to consume that way — TODO confirm.
model.fit(x=trainingBatchGenerator,
          steps_per_epoch = int(dfTrain.shape[0] // batch_size),
          epochs = 135,
          verbose = 1,
          workers= 4,
)

Epoch 1/135


2024-07-20 19:32:03.358855: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2024-07-20 19:32:03.609875: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.55GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-20 19:32:03.839585: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.15GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-07-20 19:32:04.014392: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.15GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if m



KeyboardInterrupt: 