# Imports Check

In [None]:
import os
import cv2
import json
import math
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from keras.models import Sequential, Model
from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.merge import concatenate
from keras.applications.mobilenet import MobileNet
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD, Adam, RMSprop

from utils import BoundBox, normalize, bbox_iou, interval_overlap, draw_boxes, decode_netout, sigmoid, softmax
from BatchGenerator import BatchGenerator


%matplotlib inline

# Adjustable Parameters

In [None]:
# Every relative path used below (datasets, weight files) is resolved against
# this directory. It defaults to the original hard-coded location; set the
# YOLO_PROJECT_DIR environment variable to run the notebook on another machine
# without editing this cell.
os.chdir(os.environ.get('YOLO_PROJECT_DIR', 'C:/Users/Owner/Desktop/Folder/'))

# --- Network -----------------------------------------------------------------
architecture = 'MobileNet'   # informational only; not read by the cells shown in this notebook
input_size = 224             # side length of the square network input, in pixels
# Flat list that the loss reshapes into pairs, one pair per anchor box.
anchors = [
    0.57273, 0.677385,
    1.87446, 2.06253,
    3.33843, 5.47434,
    7.88282, 3.52778,
    9.77052, 9.16828,
]
max_box_per_image = 5        # size of the ground-truth box buffer fed to the model
labels_list = ['target']     # class names

# --- Data locations (relative to the working directory) ----------------------
train_image_folder = 'train/image/'
train_annot_folder = 'train/annot/'
valid_image_folder = 'verify/image/'
valid_annot_folder = 'verify/annot/'

# --- Weight files ------------------------------------------------------------
pretrained_weights_file = 'mobilenet_raccoon.h5'   # loaded into the full detector
backend_weights_file = 'mobilenet_backend.h5'      # loaded into the MobileNet backbone

# --- Training ----------------------------------------------------------------
batch_size = 16
nb_epoch = 100

# --- Loss weighting (see custom_loss) ----------------------------------------
object_scale = 5.0
no_object_scale = 1.0
coord_scale = 1.0
class_scale = 1.0
debug_flag = True            # when True, custom_loss prints its components via tf.Print




# Setup Model

In [None]:
def custom_loss(y_true, y_pred):
    """YOLO-style detection loss: position, size, confidence and class terms.

    Both tensors are indexed on their last axis as
    [x, y, w, h, confidence, class scores...]. `y_true[..., 4]` is used as the
    mask of predictors that own a ground-truth box.

    The function is not self-contained. It reads the following notebook
    globals, which must exist before the model is compiled: grid_h, grid_w,
    batch_size, nb_box, anchors, class_wt, true_boxes, warmup_bs, coord_scale,
    object_scale, no_object_scale, class_scale and debug_flag.

    Args:
        y_true: ground-truth tensor supplied by the batch generator.
        y_pred: raw network output.

    Returns:
        A scalar loss tensor. When `debug_flag` is true the tensor is wrapped
        in tf.Print ops that log each loss component and the recall.
    """
    mask_shape = tf.shape(y_true)[:4]

    # Per-cell column / row offsets. cell_y is cell_x with the two grid axes
    # swapped, and the concat below needs both to have the same shape, so this
    # construction only works for square grids (grid_h == grid_w).
    cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(grid_w), [grid_h]), (1, grid_h, grid_w, 1, 1)))
    cell_y = tf.transpose(cell_x, (0,2,1,3,4))

    # Tile over the anchor axis with nb_box (previously a hard-coded 5) so the
    # grid always matches the anchor reshape used below.
    # NOTE(review): the batch axis is tiled with the global batch_size, so a
    # smaller final batch would not broadcast against y_pred - confirm the
    # generator always yields full batches.
    cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [batch_size, 1, 1, nb_box, 1])

    conf_mask  = tf.zeros(mask_shape)

    seen = tf.Variable(0.)
    total_recall = tf.Variable(0.)

    ### adjust x and y
    pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

    ### adjust w and h
    pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(anchors, [1,1,1,nb_box,2])

    ### adjust confidence
    pred_box_conf = tf.sigmoid(y_pred[..., 4])

    ### adjust class probabilities
    pred_box_class = y_pred[..., 5:]

    ### adjust x and y
    true_box_xy = y_true[..., 0:2] # relative position to the containing cell

    ### adjust w and h
    true_box_wh = y_true[..., 2:4] # number of cells across, horizontally and vertically

    ### adjust confidence: IOU between each predictor and its own ground-truth box
    true_wh_half = true_box_wh / 2.
    true_mins    = true_box_xy - true_wh_half
    true_maxes   = true_box_xy + true_wh_half

    pred_wh_half = pred_box_wh / 2.
    pred_mins    = pred_box_xy - pred_wh_half
    pred_maxes   = pred_box_xy + pred_wh_half

    intersect_mins  = tf.maximum(pred_mins,  true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)
    intersect_wh    = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
    pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores  = tf.truediv(intersect_areas, union_areas)

    # The confidence target is the IOU itself, zeroed where there is no object.
    true_box_conf = iou_scores * y_true[..., 4]

    ### adjust class probabilities
    true_box_class = tf.argmax(y_true[..., 5:], -1)

    ### coordinate mask: simply the position of the ground truth boxes (the predictors)
    coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * coord_scale

    ### confidence mask: penalize predictors + penalize boxes with low IOU
    # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
    # `true_boxes` is the extra model input holding the image's ground-truth boxes.
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    true_wh_half = true_wh / 2.
    true_mins    = true_xy - true_wh_half
    true_maxes   = true_xy + true_wh_half

    # Extra axis so every prediction is compared against every buffered true box.
    pred_xy = tf.expand_dims(pred_box_xy, 4)
    pred_wh = tf.expand_dims(pred_box_wh, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins    = pred_xy - pred_wh_half
    pred_maxes   = pred_xy + pred_wh_half

    intersect_mins  = tf.maximum(pred_mins,  true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)
    intersect_wh    = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores  = tf.truediv(intersect_areas, union_areas)

    best_ious = tf.reduce_max(iou_scores, axis=4)
    conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * no_object_scale

    # penalize the confidence of the boxes, which are responsible for corresponding ground truth box
    conf_mask = conf_mask + y_true[..., 4] * object_scale

    ### class mask: simply the position of the ground truth boxes (the predictors)
    class_mask = y_true[..., 4] * tf.gather(class_wt, true_box_class) * class_scale

    # Warm-up training: while fewer than warmup_bs batches have been seen,
    # predictors without an object are pulled towards the cell centre and
    # their anchor size, and the coordinate mask covers every predictor.
    no_boxes_mask = tf.to_float(coord_mask < coord_scale/2.)
    seen = tf.assign_add(seen, 1.)

    true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, warmup_bs),
                          lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                                   true_box_wh + tf.ones_like(true_box_wh) * np.reshape(anchors, [1,1,1,nb_box,2]) * no_boxes_mask,
                                   tf.ones_like(coord_mask)],
                          lambda: [true_box_xy,
                                   true_box_wh,
                                   coord_mask])

    # Finalize the loss: every term is normalised by the number of active
    # mask entries; the 1e-6 guards against division by zero.
    nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
    nb_conf_box  = tf.reduce_sum(tf.to_float(conf_mask  > 0.0))
    nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

    loss_xy    = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy)     * coord_mask) / (nb_coord_box + 1e-6) / 2.
    loss_wh    = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh)     * coord_mask) / (nb_coord_box + 1e-6) / 2.
    loss_conf  = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask)  / (nb_conf_box  + 1e-6) / 2.
    loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
    loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

    loss = loss_xy + loss_wh + loss_conf + loss_class

    if debug_flag:
        nb_true_box = tf.reduce_sum(y_true[..., 4])
        nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))

        current_recall = nb_pred_box/(nb_true_box + 1e-6)
        total_recall = tf.assign_add(total_recall, current_recall)

        # tf.Print is an identity op with a logging side effect; chaining the
        # calls through `loss` makes all of them run when the loss is evaluated.
        loss = tf.Print(loss, [tf.zeros((1))], message='Dummy Line \t', summarize=1000)
        loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
        loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
        loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
        loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
        loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
        loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
        loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000)

    return loss


In [None]:
nb_class = len(labels_list)
# Number of boxes predicted per grid cell. custom_loss reshapes `anchors` to
# [1, 1, 1, nb_box, 2], so this has to be the number of anchor pairs rather
# than the size of the ground-truth buffer (max_box_per_image).
nb_box = len(anchors) // 2
class_wt = np.ones(nb_class, dtype='float32')

# Warm-up is disabled: with warmup_bs == 0 the warm-up branch of custom_loss
# is never taken.
warmup_epochs = 0
warmup_bs  = 0

# Second model input: the ground-truth boxes, which custom_loss reads through
# the global `true_boxes`.
true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4))
input_image = Input(shape=(input_size, input_size, 3))

# MobileNet backbone (no classification head), initialised from backend_weights_file
mobilenet = MobileNet(input_shape=(input_size,input_size,3), include_top=False)
mobilenet.load_weights(backend_weights_file)
x = mobilenet(input_image)
feature_extractor = Model(input_image, x)

# The spatial size of the backbone output defines the detection grid.
grid_shape = feature_extractor.get_output_shape_at(-1)[1:3]
print(grid_shape)
grid_h, grid_w = grid_shape

# object detection layer: one (x, y, w, h, confidence, classes...) vector per box per cell
output = Conv2D(nb_box * (4 + 1 + nb_class),
                (1,1), strides=(1,1),
                padding='same',
                name='conv_23',
                kernel_initializer='lecun_normal')(x)
output = Reshape((grid_h, grid_w, nb_box, 4 + 1 + nb_class))(output)
# Pass-through Lambda: returns `output` unchanged but makes `true_boxes` part
# of the model graph so that it can be fed as an input.
output = Lambda(lambda args: args[0])([output, true_boxes])
model = Model([input_image, true_boxes], output)

optimizer = Adam(lr=0.5e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)
#optimizer = RMSprop(lr=1e-6, rho=0.9, epsilon=1e-08, decay=0.0)

model.compile(loss=custom_loss, optimizer=optimizer)

# print a summary of the whole model
model.summary()

In [None]:
# Re-initialise the detection layer with small random weights.
# The layer is looked up by the name it was given when the model was built
# ('conv_23') instead of by its position in model.layers, so this keeps
# working if layers are added to or removed from the model.
layer = model.get_layer('conv_23') # the last convolutional layer
weights = layer.get_weights()
# Scale the standard-normal samples down by the number of grid cells.
new_kernel = np.random.normal(size=weights[0].shape)/(grid_h*grid_w)
new_bias   = np.random.normal(size=weights[1].shape)/(grid_h*grid_w)
layer.set_weights([new_kernel, new_bias])

# Parse Images and Annotations

In [None]:
def parse_annotation(ann_dir, img_dir, labels_list=None):
    """Build the list of image records used by the batch generators.

    Images and annotation files are paired by position after sorting both
    directory listings by name (os.listdir returns entries in arbitrary
    order, so pairing unsorted listings is unreliable). This assumes an image
    and its annotation sort to the same position, e.g. because they share a
    file stem.

    Each annotation file is plain text: the first line holds the number of
    boxes, followed by one "x_min y_min x_max y_max" line per box, separated
    by any whitespace.

    Args:
        ann_dir: directory containing the annotation files.
        img_dir: directory containing the images.
        labels_list: accepted for interface compatibility; currently unused,
            every object is labelled 'target'.

    Returns:
        A list of dicts with keys 'object' (list of box dicts with 'name',
        'xmin', 'ymin', 'xmax', 'ymax'), 'filename', 'width' and 'height'.
    """
    images = sorted(os.listdir(img_dir))
    annots = sorted(os.listdir(ann_dir))

    if len(images) == len(annots):
        print('Number of Images == Number of Annotations = GOOD!')
    else:
        # Pairing is positional, so a count mismatch very likely means that
        # images are matched with the wrong annotations. Only the first
        # min(len(images), len(annots)) pairs are parsed.
        print('WARNING: %d images but %d annotations - image/annotation '
              'pairing may be wrong' % (len(images), len(annots)))

    all_imgs = []

    for image_name, annot_name in zip(images, annots):
        img = {'object': []}
        img['filename'] = os.path.join(img_dir, image_name)
        # NOTE(review): the size is hard-coded rather than read from the image
        # file - confirm that every image really is 128x128.
        img['width'] = 128
        img['height'] = 128

        with open(os.path.join(ann_dir, annot_name), 'r') as infile:
            lines = infile.readlines()

        # line 1: number of bounding boxes that follow
        n_bbox = int(lines[0])

        for bbox_n in range(n_bbox):
            # split() with no argument tolerates tabs, repeated spaces and the
            # trailing newline.
            x_min, y_min, x_max, y_max = lines[1 + bbox_n].split()

            img['object'].append({
                'name': 'target',
                'xmin': int(x_min),
                'ymin': int(y_min),
                'xmax': int(x_max),
                'ymax': int(y_max),
            })

        all_imgs.append(img)

    return all_imgs

In [None]:
# Parse the training and validation annotations into lists of image records.
train_imgs = parse_annotation(train_annot_folder,
                              train_image_folder,
                              labels_list)

valid_imgs = parse_annotation(valid_annot_folder,
                              valid_image_folder,
                              labels_list)

# Settings handed to BatchGenerator. The values are taken from the variables
# the model was built with, so generator and model cannot drift apart.
# NOTE(review): the meaning of each key is defined by BatchGenerator, which is
# not part of this notebook - confirm 'BOX' is the number of boxes per grid
# cell and 'TRUE_BOX_BUFFER' the size of the ground-truth box buffer.
generator_config = {
    'IMAGE_H'         : input_size,
    'IMAGE_W'         : input_size,
    'GRID_H'          : grid_h,
    'GRID_W'          : grid_w,
    'BOX'             : nb_box,
    'LABELS'          : labels_list,
    'CLASS'           : len(labels_list),
    'ANCHORS'         : anchors,
    'BATCH_SIZE'      : batch_size,
    # Was a hard-coded 5; it must match the `true_boxes` model input, whose
    # buffer dimension is max_box_per_image.
    'TRUE_BOX_BUFFER' : max_box_per_image,
}

train_batch = BatchGenerator(train_imgs,
                             generator_config,
                             norm=normalize)
# jitter is switched off for the validation generator.
valid_batch = BatchGenerator(valid_imgs,
                             generator_config,
                             norm=normalize,
                             jitter=False)

# Train Model

In [None]:
# Initialise the detector from the weights file named by `pretrained_weights_file`
# in the parameters cell. This replaces the model's current weights, including
# the re-initialised detection layer.
model.load_weights(pretrained_weights_file)

In [None]:
# Resume from the best checkpoint of a previous training run, if there is one.
# On a fresh run the file does not exist yet (the training cell creates it),
# so the load is skipped instead of aborting "Run All".
if os.path.exists('best_target_detector.h5'):
    model.load_weights('best_target_detector.h5')
else:
    print('best_target_detector.h5 not found - keeping the current model weights')

In [None]:
###############################
#  Train CNN
###############################

# Keep only the weights of the epoch with the best validation loss.
checkpoint = ModelCheckpoint('best_target_detector.h5',
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             mode='auto',
                             period=1)

# One epoch = one pass over the parsed dataset. The step counts are derived
# from the number of parsed images instead of the previous hard-coded
# 1000 / 250, so they stay correct when the dataset changes.
train_steps = math.ceil(len(train_imgs) / batch_size)
valid_steps = math.ceil(len(valid_imgs) / batch_size)

model.fit_generator(generator        = train_batch,
                    steps_per_epoch  = train_steps,
                    epochs           = nb_epoch,
                    verbose          = 2,
                    validation_data  = valid_batch,
                    validation_steps = valid_steps,
                    callbacks        = [checkpoint],
                    max_queue_size   = 10)

# Detect on image

In [None]:
obj_threshold       = 0.0
nms_threshold       = 0.0

# Other weight files that were tried here: 'mobilenet_raccoon.h5',
# 'best_raccoon_detector.h5'.
model.load_weights('best_target_detector.h5')

image_path = 'verify/image/Detection_942_10_Chip.JPEG'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread signals a missing or unreadable file by returning None
    # rather than raising; fail here with a clear message instead of with an
    # obscure error inside cv2.resize.
    raise FileNotFoundError('Could not read image: ' + image_path)

# Same preprocessing as training: resize to the network input, normalise,
# then reverse the channel order.
image = cv2.resize(image, (input_size, input_size))
norm_image = normalize(image)

input_image = norm_image[:,:,::-1]
input_image = np.expand_dims(input_image, 0)
# The model's second input (ground-truth boxes) is only needed by the loss;
# feed zeros at inference time.
dummy_array = np.zeros((1,1,1,1,max_box_per_image,4))

netout = model.predict([input_image, dummy_array])[0]
boxes = decode_netout(netout,
                      obj_threshold=obj_threshold,
                      nms_threshold=nms_threshold,
                      anchors=anchors,
                      nb_class=len(labels_list))

plt.figure(figsize=(10,10))

image = draw_boxes(image, boxes, labels_list)
print(len(boxes), 'boxes are found')
# Channel order is reversed again for display with matplotlib.
plt.imshow(image[:,:,::-1]); plt.show()
cv2.imwrite('raccoon' + '_detected' + '.jpg', image)

# Detect on video

In [None]:
# Not yet validated end to end. This cell previously referenced undefined
# names (TRUE_BOX_BUFFER, NMS_THRESHOLD, ANCHORS, CLASS) and resized frames to
# 416x416 although the model is built for input_size; it now uses the
# notebook's own variables.

obj_threshold       = 0.3
nms_threshold       = 0.0

# NOTE(review): this loads the pretrained file, whereas the image cell loads
# 'best_target_detector.h5' - confirm which weights are intended here.
model.load_weights("mobilenet_raccoon.h5")

# The model's second input (ground-truth boxes) is only needed by the loss;
# feed zeros at inference time.
dummy_array = np.zeros((1,1,1,1,max_box_per_image,4))

video_inp = '../basic-yolo-keras/images/phnom_penh.mp4'
video_out = '../basic-yolo-keras/images/phnom_penh_bbox.mp4'

video_reader = cv2.VideoCapture(video_inp)

nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))

video_writer = cv2.VideoWriter(video_out,
                               cv2.VideoWriter_fourcc(*'XVID'),
                               50.0,
                               (frame_w, frame_h))

try:
    for i in tqdm(range(nb_frames)):
        ret, image = video_reader.read()
        if not ret:
            # The reported frame count can exceed the number of decodable
            # frames; stop at the first failed read.
            break

        # Same preprocessing as the image cell and the training generators:
        # resize to the network input, normalise, reverse the channel order.
        input_image = cv2.resize(image, (input_size, input_size))
        input_image = normalize(input_image)
        input_image = input_image[:,:,::-1]
        input_image = np.expand_dims(input_image, 0)

        netout = model.predict([input_image, dummy_array])

        boxes = decode_netout(netout[0],
                              obj_threshold=obj_threshold,
                              nms_threshold=nms_threshold,
                              anchors=anchors,
                              nb_class=len(labels_list))
        # Boxes are drawn on the original, full-resolution frame.
        image = draw_boxes(image, boxes, labels=labels_list)

        video_writer.write(np.uint8(image))
finally:
    # Always release the handles so the output file is finalised even when a
    # frame fails.
    video_reader.release()
    video_writer.release()