In [3]:
import sys

from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras import backend as K
from keras.models import load_model
from math import ceil
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf

from models.ssd_mobilenet import ssd_300
from misc.keras_ssd_loss import SSDLoss, FocalLoss, weightedSSDLoss, weightedFocalLoss
from misc.keras_layer_AnchorBoxes import AnchorBoxes
from misc.keras_layer_L2Normalization import L2Normalization
from misc.ssd_box_encode_decode_utils import SSDBoxEncoder, decode_y, decode_y2
from misc.ssd_batch_generator import BatchGenerator
from bs4 import BeautifulSoup

import os


import cv2
import time





Using TensorFlow backend.


In [4]:
# --- Input image geometry and preprocessing ---
# Height of the input images
img_height = 300
# Width of the input images
img_width = 300
# Number of color channels of the input images
img_channels = 3
# The per-channel mean of the images in the dataset
subtract_mean = [123, 117, 104]
# The color channel order in the original SSD is BGR
swap_channels = True

# Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
# NOTE(review): the dataset cells in this notebook list only two class names
# ('drinking', 'eating'); confirm that 20 is really the intended value.
n_classes = 20

# --- Anchor box configuration ---
# The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets
scales_voc = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
# The anchor box scaling factors used in the original SSD300 for the MS COCO datasets
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
scales = scales_voc

# The anchor box aspect ratios used in the original SSD300; the order matters
aspect_ratios = [
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5],
]
two_boxes_for_ar1 = True
# The space between two adjacent anchor box center points for each predictor layer.
steps = [8, 16, 32, 64, 100, 300]
# The offsets of the first anchor box center points from the top and left borders
# of the image, as a fraction of the step size, for each predictor layer.
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
# Whether or not to limit the anchor boxes to lie entirely within the image boundaries
limit_boxes = False
# The variances by which the encoded target coordinates are scaled as in the original implementation
variances = [0.1, 0.1, 0.2, 0.2]
# Format of the box coordinates used as model targets:
# 'centroids', 'corners', or 'minmax' (see documentation)
coords = 'centroids'
normalize_coords = True

# 1: Build the Keras model

# Clear previous models from memory.
K.clear_session()



 

In [None]:
def lr_schedule(epoch):
    """Return the learning rate for a given epoch index.

    # Arguments
        epoch: The epoch number handed in by the scheduler callback.
    # Returns
        0.001 while `epoch` is at most 200, 0.0001 afterwards.
    """
    return 0.001 if epoch <= 200 else 0.0001


def train():
    """Build the SSD300 MobileNet model and train it on the local dataset.

    The function
      1. builds the network in 'training' mode from the notebook-level
         configuration and loads 'mobilenet_1_0_224_tf.h5' with
         `by_name=True, skip_mismatch=True`,
      2. compiles it with Adam and the SSD loss,
      3. parses the training and validation annotations,
      4. creates the batch generators, and
      5. runs `fit_generator` with checkpoint, TensorBoard and
         learning-rate-schedule callbacks.

    # Returns
        The history object returned by `model.fit_generator`.
    """
    # 1: Build and compile the model.
    model = ssd_300(mode='training',
                    image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=None,
                    swap_channels=swap_channels)

    model.load_weights('mobilenet_1_0_224_tf.h5', by_name=True, skip_mismatch=True)

    # Spatial size (height, width) of every predictor layer; the box encoder needs these.
    predictor_layer_names = ('conv11_mbox_conf',
                             'conv13_mbox_conf',
                             'conv14_2_mbox_conf',
                             'conv15_2_mbox_conf',
                             'conv16_2_mbox_conf',
                             'conv17_2_mbox_conf')
    predictor_sizes = [model.get_layer(layer_name).output_shape[1:3]
                       for layer_name in predictor_layer_names]

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=5e-04)
    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)
    model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

    # 2: Parse the image and label lists for the training and validation datasets.
    #    This can take a while.
    train_dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])
    val_dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    # TODO: Set the paths to the datasets here.
    train_images_dir = 'data/training/'
    val_images_dir = 'data/val/'

    # The directories that contain the annotations.
    train_annotations_dir = 'annotrain/'
    val_annotations_dir = 'annoval/'

    # The paths to the image sets.
    train_image_set_filename = 'data1.txt'
    val_image_set_filename = 'data2.txt'

    # The XML parser needs to know what object class names to look for and in
    # which order to map them to integers.
    # NOTE(review): only two class names are listed while the notebook-level
    # n_classes is 20; confirm that this mismatch is intended.
    classes = ['drinking', 'eating']

    train_dataset.parse_xml(images_dirs=[train_images_dir],
                            annotations_dirs=[train_annotations_dir],
                            image_set_filenames=[train_image_set_filename],
                            classes=classes,
                            include_classes='all',
                            exclude_truncated=False,
                            exclude_difficult=False,
                            ret=False)

    val_dataset.parse_xml(images_dirs=[val_images_dir],
                          annotations_dirs=[val_annotations_dir],
                          image_set_filenames=[val_image_set_filename],
                          classes=classes,
                          include_classes='all',
                          exclude_truncated=False,
                          exclude_difficult=False,
                          ret=False)

    # 3: Instantiate an encoder that can encode ground truth labels into the
    #    format needed by the SSD loss function.
    ssd_box_encoder = SSDBoxEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    min_scale=None,
                                    max_scale=None,
                                    scales=scales,
                                    aspect_ratios_global=None,
                                    aspect_ratios_per_layer=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    limit_boxes=limit_boxes,
                                    variances=variances,
                                    pos_iou_threshold=0.5,
                                    neg_iou_threshold=0.2,
                                    coords=coords,
                                    normalize_coords=normalize_coords)

    # 4: Create the batch generators. Both generators use exactly the same
    #    settings, so the options are written down once.
    batch_size = 32
    generator_options = dict(
        batch_size=batch_size,
        shuffle=True,
        train=True,
        ssd_box_encoder=ssd_box_encoder,
        convert_to_3_channels=True,
        equalize=False,
        brightness=(0.5, 2, 0.5),
        flip=0.5,
        translate=False,
        scale=False,
        # This one is important because the Pascal VOC images vary in size
        max_crop_and_resize=(img_height, img_width, 1, 3),
        # This one is important because the Pascal VOC images vary in size
        random_pad_and_resize=(img_height, img_width, 1, 3, 0.5),
        random_crop=False,
        crop=False,
        resize=False,
        gray=False,
        # While the anchor boxes are not being clipped, the ground truth boxes should be
        limit_boxes=True,
        include_thresh=0.4)

    train_generator = train_dataset.generate(**generator_options)
    val_generator = val_dataset.generate(**generator_options)

    # Number of samples in each dataset, used to compute the epoch lengths below.
    n_train_samples = train_dataset.get_n_samples()
    n_val_samples = val_dataset.get_n_samples()

    # 5: Callbacks.
    learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)

    checkpoint_path = "ssd300_epoch-{epoch:02d}.h5"
    checkpoint = ModelCheckpoint(checkpoint_path)

    # NOTE(review): "/logs" is an absolute path at the filesystem root; confirm
    # that a relative "logs" directory was not intended.
    log_path = "/logs"
    tensorboard = TensorBoard(log_dir=log_path,
                              histogram_freq=0, write_graph=True, write_images=False)

    callbacks = [checkpoint, tensorboard, learning_rate_scheduler]

    # 6: Train.
    # TODO: Set the number of epochs to train for.
    epochs = 3
    initial_epoch = 1
    history = model.fit_generator(generator=train_generator,
                                  # Round up AFTER dividing so that the step count is an
                                  # integer and the last partial batch is still covered.
                                  steps_per_epoch=ceil(n_train_samples / batch_size),
                                  verbose=1,
                                  initial_epoch=initial_epoch,
                                  epochs=epochs,
                                  validation_data=val_generator,
                                  # Validate on the whole validation set, not a single batch.
                                  validation_steps=ceil(n_val_samples / batch_size),
                                  callbacks=callbacks)

    return history

 



 

  


In [None]:






# Start the training run only when this code is executed as the main program.
if __name__ == "__main__":
    train()

In [5]:
# Rebuild the network in 'training' mode from the notebook-level configuration
# and load a saved checkpoint into it.
# NOTE(review): train() in this notebook builds the model with
# divide_by_stddev=None; confirm that the checkpoint loaded here was trained
# with the 127.5 scaling used below.
model = ssd_300(
    "training",
    # input and output dimensions
    image_size=(img_height, img_width, img_channels),
    n_classes=n_classes,
    # input preprocessing
    subtract_mean=subtract_mean,
    divide_by_stddev=127.5,
    swap_channels=swap_channels,
    # anchor boxes
    scales=scales,
    aspect_ratios_per_layer=aspect_ratios,
    two_boxes_for_ar1=two_boxes_for_ar1,
    steps=steps,
    offsets=offsets,
    limit_boxes=limit_boxes,
    variances=variances,
    coords=coords,
    normalize_coords=normalize_coords,
    # regularization
    l2_regularization=0.0005,
)
model.load_weights("ssd300_epoch-222.h5")


Instructions for updating:
Colocations handled automatically by placer.
conv1 shape:  (?, 150, 150, 64)
conv3 shape:  (?, 75, 75, 128)
conv5 shape:  (?, 38, 38, 256)
conv11 shape:  (?, 19, 19, 512)
conv13 shape:  (?, 10, 10, 1024)
conv14 shape (?, 5, 5, 512)
conv15 shape (?, 3, 3, 256)
conv16 shape (?, 2, 2, 256)
conv17 shape (?, 1, 1, 128)
in training mode


In [7]:
# Run the `model` loaded in the previous cell on every image in the validation
# folder, draw the decoded boxes on the original image and show it in an
# OpenCV window (press any key to advance to the next image).
dir_path = 'data/val/'
for file in sorted(os.listdir(dir_path)):
    filename = os.path.join(dir_path, file)

    print(filename)

    img1 = cv2.imread(filename)
    if img1 is None:
        # cv2.imread returns None for files it cannot read or decode
        # (e.g. non-image files in the folder); skip them instead of crashing.
        print("skipping unreadable file", filename)
        continue

    # OpenCV loads images as BGR; the network input is prepared from an RGB copy.
    img = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)

    # cv2.resize expects the target size as (width, height).
    image1 = cv2.resize(img, (img_width, img_height))
    image1 = np.array(image1, dtype=np.float32)

    # Add the batch axis: (height, width, channels) -> (1, height, width, channels).
    input_images = image1[np.newaxis, :, :, :]

    start_time = time.time()
    y_pred = model.predict(input_images)
    print("time taken by ssd", time.time() - start_time)

    y_pred_decoded = decode_y(y_pred,
                              confidence_thresh=0.20,
                              iou_threshold=0.45,
                              top_k=100,
                              input_coords='centroids',
                              normalize_coords=True,
                              img_height=img_height,
                              img_width=img_width)

    # The last four entries of each decoded box are xmin, ymin, xmax, ymax in
    # network-input pixels; scale them back to the original image size.
    orig_height, orig_width = img1.shape[0], img1.shape[1]
    for box in y_pred_decoded[0]:
        xmin = int(box[-4] * orig_width / img_width)
        ymin = int(box[-3] * orig_height / img_height)
        xmax = int(box[-2] * orig_width / img_width)
        ymax = int(box[-1] * orig_height / img_height)

        img1 = cv2.rectangle(img1, (xmin, ymin), (xmax, ymax), (0, 255, 255), 2)

    cv2.imshow("image1", img1)
    cv2.waitKey(0)
cv2.destroyAllWindows()

data/val/000001.jpg
time taken by ssd 3.75723934173584
data/val/000019 (2).jpg
time taken by ssd 0.012933492660522461
data/val/000021 (3).jpg
time taken by ssd 0.012965917587280273
data/val/000021.jpg
time taken by ssd 0.011924028396606445
data/val/000058.jpg
time taken by ssd 0.013922929763793945
data/val/000084 (2).jpg
time taken by ssd 0.012931108474731445
data/val/000085 (2).jpg
time taken by ssd 0.013947725296020508
data/val/000108.jpg
time taken by ssd 0.012935400009155273
data/val/000208.jpg
time taken by ssd 0.012917518615722656
data/val/1.jpg
time taken by ssd 0.012965679168701172


In [None]:

def compute_overlap(a, b):
    """
    Pairwise intersection-over-union between two sets of boxes.

    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    Only the first four columns of each input are read, as
    (x1, y1, x2, y2).

    Parameters
    ----------
    a: (N, 4) ndarray of float
    b: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    # Columns of `a` get a trailing axis so that they broadcast against the
    # columns of `b` into an (N, K) grid.
    a_x1, a_y1, a_x2, a_y2 = (a[:, col][:, np.newaxis] for col in range(4))
    b_x1, b_y1, b_x2, b_y2 = (b[:, col] for col in range(4))

    b_area = (b_x2 - b_x1) * (b_y2 - b_y1)
    a_area = (a_x2 - a_x1) * (a_y2 - a_y1)

    # Width and height of the intersection, clamped at zero for disjoint boxes.
    inter_w = np.maximum(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1), 0)
    inter_h = np.maximum(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1), 0)
    intersection = inter_w * inter_h

    # Union area, floored at machine epsilon to avoid dividing by zero.
    union = np.maximum(a_area + b_area - intersection, np.finfo(float).eps)

    return intersection / union


def compute_ap(recall, precision):
    """ Average precision as the area under the precision-recall envelope.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # Pad both curves with sentinel values at the start and at the end.
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # Precision envelope: every entry becomes the maximum of itself and all
    # entries to its right (a running maximum taken from the end).
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # Positions where the recall value changes.
    steps = np.flatnonzero(mrec[1:] != mrec[:-1])

    # Sum of (recall increment) * (envelope precision at that recall).
    return np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])









def _build_eval_model():
    """Build the SSD300 network in 'training' mode and load the evaluation checkpoint."""
    model = ssd_300("training",
                    image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_per_layer=aspect_ratios,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    limit_boxes=limit_boxes,
                    variances=variances,
                    coords=coords,
                    normalize_coords=normalize_coords,
                    subtract_mean=subtract_mean,
                    divide_by_stddev=127.5,
                    swap_channels=swap_channels)
    model.load_weights("ssd300_epoch-222.h5")
    return model


def _detect_boxes(model, image_path):
    """Run the detector on one image file.

    # Arguments
        model:      The Keras model to call `predict` on.
        image_path: Path of the image to read with OpenCV.
    # Returns
        A tuple `(boxes, class_ids)`: `boxes` is an (n, 5) float32 array of
        [xmin, ymin, xmax, ymax, score] in original-image pixels and
        `class_ids` is an (n,) integer array.
    # Raises
        IOError: if OpenCV cannot read the image.
    """
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise IOError("could not read image: {}".format(image_path))
    orig_height, orig_width = image_bgr.shape[0], image_bgr.shape[1]

    # Prepare the input the same way as the preview cell of this notebook:
    # RGB channel order, float32, resized to the network input size.
    # cv2.resize expects the target size as (width, height).
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(image_rgb, (img_width, img_height))
    input_images = np.array(resized, dtype=np.float32)[np.newaxis, :, :, :]

    start_time = time.time()
    y_pred = model.predict(input_images)
    print("Time Taken by ssd", time.time() - start_time)

    y_pred_decoded = decode_y(y_pred,
                              confidence_thresh=0.15,
                              iou_threshold=0.75,
                              top_k=100,
                              input_coords='centroids',
                              normalize_coords=True,
                              img_height=img_height,
                              img_width=img_width)

    boxes = []
    class_ids = []
    for box in y_pred_decoded[0]:
        # Each decoded box starts with class id and score and ends with
        # xmin, ymin, xmax, ymax in network-input pixels.
        xmin = int(box[-4] * orig_width / img_width)
        ymin = int(box[-3] * orig_height / img_height)
        xmax = int(box[-2] * orig_width / img_width)
        ymax = int(box[-1] * orig_height / img_height)
        boxes.append([xmin, ymin, xmax, ymax, box[1]])
        class_ids.append(int(box[0]))

    # reshape keeps the (n, 5) layout even when nothing was detected.
    return (np.array(boxes, dtype=np.float32).reshape(-1, 5),
            np.array(class_ids, dtype=int))


def _class_average_precision(all_detections, all_annotations, label, iou_threshold):
    """Average precision of one class over all images.

    # Arguments
        all_detections:  Per image, per label: (n, 5) array [x1, y1, x2, y2, score].
        all_annotations: Per image, per label: (m, 4) array [x1, y1, x2, y2].
        label:           Index of the class to evaluate.
        iou_threshold:   Minimum IoU for a detection to match a ground truth box.
    # Returns
        The average precision, or 0 when the class has no ground truth boxes.
    """
    false_positives = []
    true_positives = []
    scores = []
    num_annotations = 0

    for image_detections, image_annotations in zip(all_detections, all_annotations):
        annotations = image_annotations[label]
        detections = image_detections[label]
        num_annotations += annotations.shape[0]
        matched = set()

        # Match greedily from the most to the least confident detection so a
        # ground truth box is claimed by the best-scoring candidate.
        for d in detections[np.argsort(-detections[:, 4])]:
            scores.append(d[4])

            if annotations.shape[0] == 0:
                # Nothing to match against: every detection is a false positive.
                false_positives.append(1)
                true_positives.append(0)
                continue

            overlaps = compute_overlap(np.expand_dims(d[:4], axis=0), annotations)
            assigned = int(np.argmax(overlaps, axis=1)[0])
            max_overlap = overlaps[0, assigned]

            if max_overlap >= iou_threshold and assigned not in matched:
                false_positives.append(0)
                true_positives.append(1)
                matched.add(assigned)
            else:
                false_positives.append(1)
                true_positives.append(0)

    if num_annotations == 0:
        return 0

    # Rank all detections of the class by descending score, then accumulate.
    order = np.argsort(-np.array(scores, dtype=np.float64))
    false_positives = np.cumsum(np.array(false_positives, dtype=np.float64)[order])
    true_positives = np.cumsum(np.array(true_positives, dtype=np.float64)[order])

    recall = true_positives / num_annotations
    precision = true_positives / np.maximum(true_positives + false_positives,
                                            np.finfo(np.float64).eps)
    return compute_ap(recall, precision)


def main():
    """Evaluate the saved checkpoint on the validation set and print the mAP.

    Builds the model, parses the validation annotations, runs the detector on
    every validation image, computes the average precision per evaluated
    label and prints the per-label values and their mean.

    # Returns
        A tuple `(average_precisions, mean_ap)` with the per-label dict and
        the mean average precision that were printed.
    """
    model = _build_eval_model()
    dataset = BatchGenerator(box_output_format=['class_id', 'xmin', 'ymin', 'xmax', 'ymax'])

    val_images_dir = 'data/val/'
    # The directory that contains the annotations.
    val_annotations_dir = 'annoval/'
    # The path to the image set.
    val_image_set_filename = 'data2.txt'

    # The XML parser needs to know what object class names to look for and in
    # which order to map them to integers.
    classes = ['drinking', 'eating']

    # Minimum IoU between a detection and a ground truth box for a match.
    iou_threshold = 0.25

    filenames, labels, _image_ids = dataset.parse_xml(images_dirs=[val_images_dir],
                                                      image_set_filenames=[val_image_set_filename],
                                                      annotations_dirs=[val_annotations_dir],
                                                      classes=classes,
                                                      include_classes='all',
                                                      exclude_truncated=False,
                                                      exclude_difficult=False,
                                                      ret=True)
    size = len(filenames)

    # With the two-entry `classes` list above this evaluates label 1 only.
    # NOTE(review): confirm how parse_xml maps class names to integer ids
    # (0-based, or 1-based with background at 0) and whether every class is
    # meant to be evaluated.
    eval_labels = range(1, len(classes))

    all_detections = [[None for _ in range(len(classes))] for _ in range(size)]
    all_annotations = [[None for _ in range(len(classes))] for _ in range(size)]

    for i in range(size):
        pred_boxes, pred_labels = _detect_boxes(model, filenames[i])
        true_label = np.array(labels[i])

        for label in eval_labels:
            all_detections[i][label] = pred_boxes[pred_labels == label, :]

            if len(true_label) > 0:
                # Column 0 is the class id, columns 1..4 are the box coordinates.
                all_annotations[i][label] = true_label[true_label[:, 0] == label, 1:5].astype(np.float32)
            else:
                # Zero rows, so an unlabelled image adds nothing to the annotation count.
                all_annotations[i][label] = np.zeros((0, 4), dtype=np.float32)

    average_precisions = {}
    for label in eval_labels:
        average_precisions[label] = _class_average_precision(all_detections,
                                                             all_annotations,
                                                             label,
                                                             iou_threshold)

    total = sum(float(value) for value in average_precisions.values())
    # Guard against an empty label range (e.g. a single-entry class list).
    map1 = total / len(eval_labels) if len(eval_labels) > 0 else 0.0

    print(average_precisions)
    print('MAP is :', map1)
    return average_precisions, map1
    


# Run the evaluation only when this code is executed as the main program.
if __name__ == "__main__":
    main()