2. For an object detection problem, assume you are designing a YOLO-like model to do the job. Your input image size is 127 x 127 (RGB). We are looking for an 8x8 output grid size. The number of classes is 20, and for each cell in the 8x8 grid we are considering 2 anchors. Design the CNN network; as the designer, feel free to set your network's hyperparameters as you wish.

Load Modules

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import to_categorical

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

import numpy as np

Write function to build CNN with FCs

In [None]:
def build(width=127, height=127, depth=3, classes=20, fcUnits=128):
    """Build a small CNN classifier with two fully connected hidden layers.

    The network stacks three convolutional stages (8, 16 and 32 filters), each
    closed by a 2x2 max-pooling layer, followed by two
    Dense => ReLU => BatchNorm => Dropout blocks and a softmax output layer.

    Args:
        width: Input image width in pixels.
        height: Input image height in pixels.
        depth: Number of input channels (3 for RGB).
        classes: Number of units in the softmax output layer.
        fcUnits: Number of units in each of the two hidden Dense layers.

    Returns:
        An uncompiled ``Sequential`` model whose output has shape
        ``(batch, classes)``.
    """
    # NOTE(review): the model ends in a softmax over `classes`, i.e. one class
    # vector per image. It does not emit a per-grid-cell tensor, which is what
    # extractInfo() reshapes its input into - confirm the intended output head.

    # "channels last" layout: the channel axis is the final one
    model = Sequential()
    inputShape = (height, width, depth)
    chanDim = -1

    # CONV => RELU => BN => POOL
    model.add(Conv2D(8, (5, 5), padding="same", input_shape=inputShape))
    model.add(Activation("relu"))
    model.add(BatchNormalization(axis=chanDim))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # two stages of (CONV => RELU => BN) * 2 => POOL, doubling the filters
    for filters in (16, 32):
        for _ in range(2):
            model.add(Conv2D(filters, (3, 3), padding="same"))
            model.add(Activation("relu"))
            model.add(BatchNormalization(axis=chanDim))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # Flatten once; the feature map stays flat for every layer that follows
    model.add(Flatten())

    # two FC => RELU => BN => DROPOUT blocks; the Dense layer is what makes
    # each block a fully connected layer
    for _ in range(2):
        model.add(Dense(fcUnits))
        model.add(Activation("relu"))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))  # regularization

    # softmax classifier
    model.add(Dense(classes))
    model.add(Activation("softmax"))

    return model

Write code to extract all possible bounding boxes with object score (likelihood the box contains an object) and class probability (likelihood of which class it is)

In [None]:
def extractInfo(modelOutput, anchors, numClass):
    """Decode a raw grid prediction into boxes, objectness and class probabilities.

    ``modelOutput`` is reshaped to
    ``(-1, gridH, gridW, numAnchor, numClass + 5)`` where ``gridH``/``gridW``
    are taken from axes 1 and 2 of the input. Along the last axis the layout
    is ``[x, y, w, h, objectness, class scores...]``.

    Args:
        modelOutput: Raw network output; axes 1 and 2 are used as the grid size.
        anchors: Array-like with a ``.shape`` attribute, reshaped to
            ``(1, 1, 1, numAnchor, 2)``; one (w, h) pair per anchor.
        numClass: Number of class scores per anchor.

    Returns:
        Tuple ``(boxXY, boxWH, objScore, classProb)``:
        box centres in grid units, box sizes (anchor * exp(raw size)),
        sigmoid objectness scores and softmax class probabilities.
    """
    outShape = modelOutput.shape
    gridShape = outShape[1:3]  # (gridH, gridW) of the output feature map
    anchorCount = anchors.shape[0]  # one row per anchor

    pred = tf.reshape(
        modelOutput,
        shape=(-1, outShape[1], outShape[2], anchorCount, numClass + 5),
    )

    # per-cell (x, y) index, broadcastable against the predicted centres
    cellOffset = tf.cast(getOffset(gridShape), pred.dtype)
    anchorDims = tf.cast(
        tf.reshape(anchors, (1, 1, 1, anchorCount, 2)), cellOffset.dtype
    )

    # sigmoid keeps the centre inside its cell; adding the cell index gives
    # the centre position on the grid scale
    boxXY = tf.nn.sigmoid(pred[..., :2]) + cellOffset

    # predicted size is a multiplicative correction of the anchor size
    boxWH = tf.math.exp(pred[..., 2:4]) * anchorDims

    objScore = tf.nn.sigmoid(pred[..., 4:5])  # objectness in (0, 1)
    classProb = tf.nn.softmax(pred[..., 5:])  # class distribution per anchor

    return boxXY, boxWH, objScore, classProb


def getOffset(shape):
    """Return the (x, y) index of every grid cell.

    Args:
        shape: ``(gridH, gridW)`` of the output feature map.

    Returns:
        Integer tensor of shape ``(1, gridH, gridW, 1, 2)`` whose last axis
        holds ``(column index, row index)`` for each cell, so it can be added
        directly to the predicted box centres.
    """
    cols = tf.range(start=0, limit=shape[1])
    rows = tf.range(start=0, limit=shape[0])
    # default 'xy' indexing: both outputs are (gridH, gridW);
    # colGrid varies along the width, rowGrid along the height
    colGrid, rowGrid = tf.meshgrid(cols, rows)
    cellIndex = tf.stack([colGrid, rowGrid], axis=-1)
    return tf.reshape(cellIndex, shape=(1, *shape, 1, 2))

Get the box location and scale it from the grid size to the original image size 

In [None]:
def getBoxLoc(boxXY, boxWH):
    """Convert centre/size boxes to corner form.

    Args:
        boxXY: Box centres, last axis ``(x, y)``.
        boxWH: Box sizes, last axis ``(w, h)``.

    Returns:
        Tensor whose last axis is ``(x1, y1, x2, y2)``: the top-left corner
        (the one closer to the image origin (0, 0)) followed by the
        bottom-right corner.
    """
    halfSize = boxWH / 2
    corners = [boxXY - halfSize, boxXY + halfSize]  # [top-left, bottom-right]
    return tf.concat(corners, axis=-1)

def scaleBox(boxLoc, scale=(32, 32)):
    """Scale corner-form boxes from grid units to image units.

    Args:
        boxLoc: Boxes whose last axis is ``(x1, y1, x2, y2)``, as produced by
            ``getBoxLoc``.
        scale: ``(height, width)`` multipliers, i.e. how many image pixels one
            grid cell spans vertically and horizontally.

    Returns:
        Tensor of the same shape and dtype as ``boxLoc`` with x coordinates
        multiplied by the width factor and y coordinates by the height factor.
    """
    height, width = scale[0], scale[1]
    # Order must follow the (x1, y1, x2, y2) layout: x scales with width,
    # y with height. The two only coincide for a square scale.
    factors = tf.stack([width, height, width, height])
    factors = tf.reshape(factors, [1, 4])
    factors = tf.cast(factors, boxLoc.dtype)
    return boxLoc * factors

Filter out boxes with low object scores (unlikely to have objects in them) 

Filter out boxes that greatly overlap and only return the box that has the highest object score

In [None]:
def filterBox(boxLoc, objScore, classProb, scoreThresh=0.5):
    """Keep only the boxes whose best class score reaches ``scoreThresh``.

    The score of a box for a class is ``objScore * classProb``; each box is
    assigned the class with the highest score.

    Args:
        boxLoc: Box coordinates; leading axes must match the score mask.
        objScore: Objectness score per box (last axis of size 1, so it
            broadcasts over the classes).
        classProb: Class probabilities per box (last axis = classes).
        scoreThresh: Minimum best-class score for a box to be kept.

    Returns:
        Tuple ``(boxes, scores, classes)`` containing only the entries whose
        best-class score is at least ``scoreThresh``.
    """
    perClassScore = objScore * classProb
    bestScore = tf.math.reduce_max(perClassScore, axis=-1)
    bestClass = tf.argmax(perClassScore, axis=-1)

    # True where the box is confident enough to keep
    keep = bestScore >= scoreThresh

    keptBoxes = tf.boolean_mask(boxLoc, keep)
    keptScores = tf.boolean_mask(bestScore, keep)
    keptClasses = tf.boolean_mask(bestClass, keep)
    return keptBoxes, keptScores, keptClasses



def nonMaxSuppress(boxLoc, score, classPredict, maxBox=20, iouThresh=0.5):
    """Apply non-max suppression and return the surviving boxes.

    Args:
        boxLoc: Candidate boxes passed to ``tf.image.non_max_suppression``.
        score: One score per candidate box.
        classPredict: One predicted class per candidate box.
        maxBox: Maximum number of boxes to select.
        iouThresh: IoU threshold passed as ``iou_threshold``.

    Returns:
        Tuple ``(boxLoc, score, classPredict)`` gathered at the selected
        indices.
    """
    selected = tf.image.non_max_suppression(
        boxLoc, score, maxBox, iou_threshold=iouThresh
    )
    keptBoxes, keptScores, keptClasses = (
        tf.gather(values, selected) for values in (boxLoc, score, classPredict)
    )
    return keptBoxes, keptScores, keptClasses

Load data (no data in the assignment)

In [None]:
# No dataset ships with the assignment, so synthesize stand-in data:
# 100 RGB images of 127x127 drawn from a standard normal distribution ...
randomImages = np.random.standard_normal(size=(100, 127, 127, 3))
# ... and one integer label per image, drawn from 1..4 (upper bound exclusive)
randomLabels = np.random.randint(low=1, high=5, size=100)

Set up the CNN and fit the model

In [None]:
# Train for 10 epochs starting from a learning rate of 0.001
epochs = 10
learningRate = 1e-3

# Adam with inverse-time learning-rate decay:
#   lr(step) = learningRate / (1 + decay_rate * step)
# This is the same rule the legacy `decay` argument applied; it is expressed
# as a schedule because `lr`/`decay` are no longer accepted by current Keras.
lrSchedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=learningRate,
    decay_steps=1,
    decay_rate=learningRate / (epochs * 0.5),
)
opt = Adam(learning_rate=lrSchedule)

# Build and compile the model
model = build()
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])

# One-hot encode the labels. The encoding width must equal the number of
# output units of the model (not the number of distinct labels observed),
# otherwise the loss sees mismatched shapes and labels >= width cannot be
# encoded.
numLabels = model.output_shape[-1]
randomLabels = to_categorical(randomLabels, numLabels)

# Fit the model
H = model.fit(x=randomImages,
              y=randomLabels,
              batch_size=10,
              epochs=epochs)

Take the output predictions and feed them into the functions listed above:
1. get all bounding boxes
2. find where each bounding box is on the image
3. filter out boxes with low object score
4. filter out boxes with large overlap
5. plot boxes on image (not seen here since there are no input images)

In [None]:
# extractInfo() reads anchors.shape[0], so the anchors must be an array with
# one (width, height) row per anchor, in grid-cell units.
# Placeholder values - replace with anchors fitted to the real dataset.
anchors = np.array([[1.0, 1.0], [2.0, 2.0]], dtype=np.float32)

# NOTE(review): extractInfo() indexes modelOutput.shape[1] and shape[2] as the
# grid height/width, so it needs a (batch, gridH, gridW, ...) prediction -
# confirm that model.predict() returns that layout.
modelOutput = model.predict(randomImages)

# 1. decode every candidate box with its objectness and class probabilities
bXY, bWH, objScore, classProb = extractInfo(modelOutput, anchors=anchors, numClass=20)
# 2. centre/size -> corner coordinates, then grid units -> image pixels
bLoc = getBoxLoc(bXY, bWH)
bLocScale = scaleBox(bLoc, (127 / 8, 127 / 8))  # image size divided by number of grid cells
# 3. drop boxes with a low score
boxes, scores, classes = filterBox(bLocScale, objScore, classProb, scoreThresh=0.5)
# 4. drop heavily overlapping boxes; NMS must run on the filtered, scaled
#    boxes together with their matching scores and class ids
boxLoc, score, classPredict = nonMaxSuppress(boxes, scores, classes, maxBox=20, iouThresh=0.5)
