In [5]:
# Imports
import numpy as np
import os

import tensorflow as tf

import cv2
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Playing the Video from file
captured = cv2.VideoCapture('resources/Pedestrian_Detect_2_1_1.mp4')

In [7]:
# Function to load the tensorflow model
def load_model(model_dir):
    model_loc = os.path.join(model_dir, "saved_model")
    model = tf.saved_model.load(str(model_loc))
    model = model.signatures['serving_default']
    
    return model

In [8]:
# Loading the model
model_dir_name = 'faster_rcnn_inception_v2_coco_2018_01_28'
detection_model = load_model(model_dir_name)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [9]:
count = 0
total_time = 0
print(detection_model.inputs)

[<tf.Tensor 'image_tensor:0' shape=(None, None, None, 3) dtype=uint8>]


In [10]:
detection_model.output_dtypes

{'detection_boxes': tf.float32,
 'detection_classes': tf.float32,
 'detection_scores': tf.float32,
 'num_detections': tf.float32}

In [11]:
# Dimensions of the outputted objects
detection_model.output_shapes

{'detection_boxes': TensorShape([None, 100, 4]),
 'detection_classes': TensorShape([None, 100]),
 'detection_scores': TensorShape([None, 100]),
 'num_detections': TensorShape([None])}

In [12]:
def run_inference(model, image):
    image = np.asarray(image)
    # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
    input_tensor = tf.convert_to_tensor(image)
    # The model expects a batch of images, so add an axis with `tf.newaxis`.
    input_tensor = input_tensor[tf.newaxis,...]

    # Run inference
    output_dict = model(input_tensor)

    # All outputs are batches tensors.
    # Convert to numpy arrays, and take index [0] to remove the batch dimension.
    # We're only interested in the first num_detections.
    num_detections = int(output_dict.pop('num_detections'))
    output_dict = {key:value[0, :num_detections].numpy() 
                 for key,value in output_dict.items()}
    output_dict['num_detections'] = num_detections

    # detection_classes should be ints.
    output_dict['detection_classes'] = output_dict['detection_classes'].astype(np.int64)
    
    return output_dict

In [9]:
# Grab the shape of the input 
width = int(captured.get(3))
height = int(captured.get(4))
HEIGHT = 600
WIDTH = 1024
print("Started video inference....")
total_inf_start = time.time()
while(captured.isOpened()):
    # Capture frame-by-frame
    ret, frame = captured.read()
    if not ret:
        break
    image = cv2.resize(frame, (HEIGHT, WIDTH))
    # Actual inference.    
    # Start asynchronous inference for specified request.
    frame_inf_start = time.perf_counter()
    output_dict = run_inference(detection_model, image)
    frame_det_time = time.perf_counter() - frame_inf_start
    inf_time_message = "Inference time: {:.3f}ms"\
                               .format(frame_det_time*1000)
    count += 1

# When everything done, release the video capture and video write objects
captured.release()


# Closes all the frames
cv2.destroyAllWindows()    
print("Total Frame processed:",count)
total_det_time = time.time() - total_inf_start
print("Average Inference time per frame: {:.3f}ms".format((total_det_time*1000)/count))
total_inf_time_message = "Inference time: {:.3f}ms"\
                               .format(total_det_time * 1000)
print(total_inf_time_message)

Started video inference....
Total Frame processed: 1394
Average Inference time per frame: 754.066ms
Inference time: 1051167.981ms
