In [22]:
import cv2
import numpy as np

In [23]:
def find_objects(model_outputs):
    bounding_box_locations = []
    class_ids = []
    confidence_values = []

    for output in model_outputs:
        for prediction in output:
            ## find class id with the highest level of probability/confidence 
            class_probabilities = prediction[5:]
            class_id = np.argmax(class_probabilities)
            confidence = class_probabilities[class_id]

            if confidence > THRESHOLD:
                w, h = int(prediction[2] * YOLO_IMAGE_SIZE), int(prediction[3] * YOLO_IMAGE_SIZE)
                ## rescale output to blob dimensions! 
                # the center of the bounding box (we should transform these values)
                x, y = int(prediction[0] * YOLO_IMAGE_SIZE - w / 2), int(prediction[1] * YOLO_IMAGE_SIZE - h / 2)
                ## now x,y are the coordinates of top left corner of the bounding box 
                bounding_box_locations.append([x, y, w, h])
                class_ids.append(class_id)
                confidence_values.append(float(confidence))

    box_indexes_to_keep = cv2.dnn.NMSBoxes(bounding_box_locations, confidence_values, THRESHOLD, SUPPRESSION_THRESHOLD)

    return box_indexes_to_keep, bounding_box_locations, class_ids, confidence_values


def show_detected_images(img, bounding_box_ids, all_bounding_boxes, class_ids, confidence_values, width_ratio,
                         height_ratio):
    for index in bounding_box_ids:
        bounding_box = all_bounding_boxes[index[0]]
        x, y, w, h = int(bounding_box[0]), int(bounding_box[1]), int(bounding_box[2]), int(bounding_box[3])
        # we have to transform the locations adn coordinates because the resized image
        x = int(x*width_ratio)
        y = int(y * height_ratio)
        w = int(w * width_ratio)
        h = int(h * height_ratio)

        # OpenCV deals with BGR blue green red (255,0,0) then it is the blue color
        # we are not going to detect every objects just PERSON and CAR
        if class_ids[index[0]] == 2:
            cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
            class_with_confidence = 'CAR' + str(int(confidence_values[index[0]] * 100)) + '%'
            cv2.putText(img, class_with_confidence, (x, y-10), cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (255, 0, 0), 1)

        if class_ids[index[0]] == 0:
            cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
            class_with_confidence = 'PERSON' + str(int(confidence_values[index[0]] * 100)) + '%'
            cv2.putText(img, class_with_confidence, (x, y-10), cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (255, 0, 0), 1)




In [24]:
# we are not going to bother with objects less than 30% probability
THRESHOLD = 0.3
# the lower the value: the fewer bounding boxes will remain
SUPPRESSION_THRESHOLD = 0.3
YOLO_IMAGE_SIZE = 320


In [25]:
image = cv2.imread('camus.jpg')
original_width, original_height = image.shape[1], image.shape[0]

# there are 80 (90) possible output classes
# 0: person - 2: car - 5: bus
classes = ['car', 'person', 'bus']

neural_network = cv2.dnn.readNetFromDarknet('yolov3.cfg', 'yolov3.weights')
# define whether we run the algorithm with CPU or with GPU
# WE ARE GOING TO USE CPU !!!
neural_network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
neural_network.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

# the image into a BLOB [0-1] RGB - BGR
blob = cv2.dnn.blobFromImage(image, 1 / 255, (YOLO_IMAGE_SIZE, YOLO_IMAGE_SIZE), True, crop=False)
neural_network.setInput(blob)

## The entire network architecture (list of names)
layer_names = neural_network.getLayerNames()
# call getunconnectedoutlayers to find out the indices of the yolo output layer names

# YOLO network has 3 output layer - note: these indexes are starting with 1
output_names = [layer_names[index[0] - 1] for index in neural_network.getUnconnectedOutLayers()]

# Generate output prediction from all three yolo output layers 
## by specifying their names  

outputs = neural_network.forward(output_names)

## dimensions of output list # ((300, 85), (1200, 85), (4800, 85)) 
## have 300 bounding boxes from first layer, 1200 boxes from second layer, 
## and 4800 bounding boxes from third layer 
## for each bounding box we have a prediction vector of dimension = 85 
## (x,y,w,h, confidence, 80 class probabilities)

# Massage the output and preprocess them to be ready for the show detected image function


predicted_objects, bbox_locations, class_label_ids, conf_values = find_objects(outputs)

## suppress non-max and find the objects with their corresponding bounding boxes 

show_detected_images(image, predicted_objects, bbox_locations, class_label_ids, conf_values,
                     original_width / YOLO_IMAGE_SIZE, original_height / YOLO_IMAGE_SIZE)

cv2.imshow('YOLO Algorithm', image)
cv2.waitKey()

-1

In [26]:
## the entire architecture
layer_names 

['conv_0',
 'bn_0',
 'leaky_1',
 'conv_1',
 'bn_1',
 'leaky_2',
 'conv_2',
 'bn_2',
 'leaky_3',
 'conv_3',
 'bn_3',
 'leaky_4',
 'shortcut_4',
 'conv_5',
 'bn_5',
 'leaky_6',
 'conv_6',
 'bn_6',
 'leaky_7',
 'conv_7',
 'bn_7',
 'leaky_8',
 'shortcut_8',
 'conv_9',
 'bn_9',
 'leaky_10',
 'conv_10',
 'bn_10',
 'leaky_11',
 'shortcut_11',
 'conv_12',
 'bn_12',
 'leaky_13',
 'conv_13',
 'bn_13',
 'leaky_14',
 'conv_14',
 'bn_14',
 'leaky_15',
 'shortcut_15',
 'conv_16',
 'bn_16',
 'leaky_17',
 'conv_17',
 'bn_17',
 'leaky_18',
 'shortcut_18',
 'conv_19',
 'bn_19',
 'leaky_20',
 'conv_20',
 'bn_20',
 'leaky_21',
 'shortcut_21',
 'conv_22',
 'bn_22',
 'leaky_23',
 'conv_23',
 'bn_23',
 'leaky_24',
 'shortcut_24',
 'conv_25',
 'bn_25',
 'leaky_26',
 'conv_26',
 'bn_26',
 'leaky_27',
 'shortcut_27',
 'conv_28',
 'bn_28',
 'leaky_29',
 'conv_29',
 'bn_29',
 'leaky_30',
 'shortcut_30',
 'conv_31',
 'bn_31',
 'leaky_32',
 'conv_32',
 'bn_32',
 'leaky_33',
 'shortcut_33',
 'conv_34',
 'bn_34',
 'l

In [27]:
neural_network.getUnconnectedOutLayers()


array([[200],
       [227],
       [254]], dtype=int32)

In [28]:
# yolo layer names
output_names


['yolo_82', 'yolo_94', 'yolo_106']

In [29]:
outputs[0].shape , outputs[1].shape , outputs[2].shape

((300, 85), (1200, 85), (4800, 85))

In [30]:
predicted_objects, ## only fourth box contain an object! 

(array([[3]], dtype=int32),)

In [31]:
 bbox_locations  ## [x,y,w,h] = [3, 22, 173, 291],

[[0, 21, 176, 270],
 [8, 23, 183, 264],
 [-13, -1, 240, 309],
 [3, 22, 173, 291],
 [-11, 12, 204, 312],
 [5, 28, 187, 284],
 [-4, 21, 219, 298]]