# **INFERENCE OVER VIDEO**

Authors: [Alejandro Sanchez Ferrer](mailto:alejandro_sanchez_ferrer@hotmail.com), [Antonio Javier Gallego](mailto:jgallego@dlsi.ua.es), Jorge Calvo Zaragoza & Jose Javier Valero Mas

## Initialization

### Import dependencies

In [1]:
import os, sys
import sys
import random
import math
import numpy as np
import scipy.misc
import matplotlib
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image, ImageDraw
from tensorflow.python.framework.versions import VERSION as __version__
import tensorflow as tf
import imgaug

### Add relevant paths

Add root directory for Mask-RCNN

In [2]:
# Use the current working directory as the project root. Nothing here
# changes directory, so the notebook must be started from that folder.
ROOT_DIR = os.getcwd()
assert os.path.exists(ROOT_DIR), 'ROOT_DIR does not exist'
# Make the project root importable. The membership check keeps sys.path
# free of duplicate entries when this cell is executed more than once.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

Add models folder

In [4]:
# Folder with the Mask R-CNN model files: it is passed to MaskRCNN as
# `model_dir` and is where the weights file (MODEL_PATH) is looked up.
MODEL_DIR = os.path.join(ROOT_DIR, "models")
assert os.path.exists(MODEL_DIR), 'MODEL_DIR does not exist'

### Add MASK-RCNN functions

Add specific functions of Mask-RCNN

In [5]:
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log

In [6]:
def get_ax(rows=1, cols=1, size=8):
    """Create the Matplotlib axes used by the notebook's visualizations.

    Having a single factory keeps figure sizing in one place: every grid
    cell is `size` x `size` inches, so the figure measures
    (size * cols) x (size * rows) inches.

    Args:
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        size: Side length, in inches, of each subplot cell.

    Returns:
        The axes object(s) produced by `plt.subplots(rows, cols, ...)`.
    """
    figure_size = (size * cols, size * rows)
    axes = plt.subplots(rows, cols, figsize=figure_size)[1]
    return axes

### Define MASK-RCNN Configuration

In [7]:
class CleanSeaConfig(Config):
    """
    Configuration for training on the CleanSea dataset.
    """

    # Name of the configuration
    NAME = "debris"

    # Images processed per GPU in each batch (the printed configuration
    # shows BATCH_SIZE 1 for this value with GPU_COUNT 1).
    # Raise it only if the GPU has memory to spare.
    IMAGES_PER_GPU = 1

    # Limits for the small and the large image side; together they
    # determine the image shape (512 x 512 here).
    IMAGE_MIN_DIM = 512
    IMAGE_MAX_DIM = 512

    # Number of classes + the background
    NUM_CLASSES = 1 + 19  # CleanSea has 19 classes

    # Skip detections with < 50% confidence
    DETECTION_MIN_CONFIDENCE = 0.5

In [8]:
# Instantiate the CleanSea configuration and print its values.
# NOTE: `config` is rebound to an InferenceConfig instance further down.
config= CleanSeaConfig()
config.display()


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     1
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.5
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 1
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  512
IMAGE_META_SIZE                32
IMAGE_MIN_DIM                  512
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [512 512   3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE             

### Dataset Configuration
Add data & masks loading methods

In [9]:
class CleanSeaDataset(utils.Dataset):
    """Dataset backed by a COCO-style `annotations.json` file.

    `load_data` registers classes and images through the `add_class` /
    `add_image` methods of the base class; `load_mask` rasterizes the
    polygon annotations of one registered image into boolean masks.
    """

    # Source tag under which every class and image is registered.
    SOURCE_NAME = "coco_like"

    def load_data(self, dataset_dir, subset):
        """Register the classes and images of one subset.

        Args:
            dataset_dir: Directory that contains one folder per subset.
            subset: "train_coco" or "test_coco". The folder of that name
                must contain `annotations.json`; image paths are built
                relative to the same folder.

        Images whose JSON entry lacks `file_name`, `width` or `height`
        are skipped with a warning. Images without any annotation are
        registered with an empty annotation list.
        """
        # Train or validation dataset?
        assert subset in ["train_coco", "test_coco"]
        dataset_dir = os.path.join(dataset_dir, subset)
        print(dataset_dir)

        # Load the JSON file; the context manager closes it even when
        # parsing fails.
        annotation_json = os.path.join(dataset_dir, "annotations.json")
        with open(annotation_json) as json_file:
            coco_json = json.load(json_file)
        print("\nAnotaciones Cargadas\n")

        # Register the class names through utils.Dataset.add_class
        source_name = self.SOURCE_NAME
        for category in coco_json['categories']:
            class_id = category['id']
            class_name = category['name']
            if class_id < 1:
                print('Error: Class id for "{}" reserved for the background'.format(class_name))
            else:
                self.add_class(source_name, class_id, class_name)
        print("Nombres Añadidos \n")

        # Group the annotations by the image they belong to
        annotations = {}
        for annotation in coco_json['annotations']:
            annotations.setdefault(annotation['image_id'], []).append(annotation)
        print("Anotaciones Almacenadas\n")

        # Register every image once
        seen_images = {}
        for image in coco_json['images']:
            image_id = image['id']
            if image_id in seen_images:
                print("Warning: Skipping duplicate image id: {}".format(image_id))
                continue
            seen_images[image_id] = image

            try:
                image_file_name = image['file_name']
                image_width = image['width']
                image_height = image['height']
            except KeyError as key:
                print("Warning: Skipping image (id: {}) with missing key: {}".format(image_id, key))
                # Really skip it: falling through would use unbound names
                # or the values left over from the previous image.
                continue

            image_path = os.path.join(dataset_dir, image_file_name)
            # An image may legitimately have no annotations at all.
            image_annotations = annotations.get(image_id, [])

            # Register the image through utils.Dataset.add_image
            self.add_image(
                source=source_name,
                image_id=image_id,
                path=image_path,
                width=image_width,
                height=image_height,
                annotations=image_annotations
            )
        print("Imagenes añadidas al Dataset\n")

    def load_mask(self, image_id):
        """Build the instance masks of one registered image.

        Args:
            image_id: Index into `self.image_info`.

        Returns:
            masks: Boolean array of shape (height, width, instances),
                one channel per instance. The last axis has length 0
                when the image has no polygons.
            class_ids: 1-D int32 array with the class id of each mask.
        """
        image_info = self.image_info[image_id]
        annotations = image_info['annotations']
        width, height = image_info['width'], image_info['height']
        instance_masks = []
        class_ids = []

        for annotation in annotations:
            class_id = annotation['category_id']
            mask = Image.new('1', (width, height))
            mask_draw = ImageDraw.ImageDraw(mask, '1')
            # NOTE(review): a mask is appended after EACH polygon and the
            # canvas is shared, so an annotation made of k polygons yields
            # k cumulative instances rather than one. Kept unchanged
            # because it alters the instance count seen by training and
            # evaluation - confirm whether that is intended.
            for segmentation in annotation['segmentation']:
                mask_draw.polygon(segmentation, fill=1)
                bool_array = np.array(mask) > 0
                instance_masks.append(bool_array)
                class_ids.append(class_id)

        if not instance_masks:
            # np.dstack raises on an empty list; return empty arrays
            # with the documented shapes instead.
            return (np.zeros((height, width, 0), dtype=bool),
                    np.zeros((0,), dtype=np.int32))

        mask = np.dstack(instance_masks)
        class_ids = np.array(class_ids, dtype=np.int32)
        return mask, class_ids

    def image_reference(self, image_id):
        """Return the path of the image.

        Images registered by this class (source `SOURCE_NAME`) resolve to
        their file path; anything else is delegated to the base class.
        """
        info = self.image_info[image_id]
        if info["source"] == self.SOURCE_NAME:
            return info["path"]
        return super().image_reference(image_id)


### Add video inference utils functions

In [41]:
# Fixed palette of 12 colour triples (not referenced by the functions below)
COLOR_CODE = [(255,0,0),(0,255,0),(0,0,255),(255,155,0),(255,155,155),(155,255,0),(155,255,155),(155,0,255),(155,155,255),(255,255,0),(0,255,255),(255,0,255)]

def random_colors(N):
  """Return N reproducible pseudo-random colours.

  Each colour is a 3-tuple of floats in [0, 255). The sequence is always
  the same because it is drawn from a generator seeded with 1.

  A private RandomState is used instead of `np.random.seed(1)` so that
  calling this function does not reset NumPy's global random state; the
  colours produced are identical to the seeded global generator's.
  """
  rng = np.random.RandomState(1)
  return [tuple(255 * rng.rand(3)) for _ in range(N)]

#apply mask to image
def apply_mask(image, mask, color, alpha=0.5):
  """Blend `color` into `image` wherever `mask` equals 1.

  The image is modified in place, channel by channel, and also returned.
  Masked pixels become `pixel * (1 - alpha) + alpha * component`; all
  other pixels keep their value. One channel is processed per entry of
  `color`.
  """
  selected = mask == 1
  for channel, component in enumerate(color):
    plane = image[:, :, channel]
    blended = plane * (1 - alpha) + alpha * component
    image[:, :, channel] = np.where(selected, blended, plane)
  return image

#take the image and apply the mask, box, and Label
def display_instances(image, boxes, masks, ids, names, scores, colors):
  """Draw mask, bounding box and caption of every detection on `image`.

  Args:
    image: Frame to draw on; it is modified in place by `apply_mask`.
    boxes: Array with one row per instance, unpacked as (y1, x1, y2, x2).
    masks: Array indexed as masks[:, :, i] for instance i.
    ids: Class id per instance, used as index into `names`.
    names: List of class names.
    scores: Confidence per instance, or None to omit it from captions.
    colors: One colour per entry of `names`.

  Returns:
    The annotated image.
  """
  n_instances = boxes.shape[0]
  if not n_instances:
    print("NO INSTANCES TO DISPLAY")
  else:
    assert boxes.shape[0] == masks.shape[-1] == ids.shape[0]
  for i in range(n_instances):
    # An all-zero box carries no detection
    if not np.any(boxes[i]):
      continue

    # Plain Python ints: some OpenCV builds reject NumPy scalars as points
    y1, x1, y2, x2 = (int(v) for v in boxes[i])
    label = names[ids[i]]
    score = scores[i] if scores is not None else None
    # Compare against None so that a score of exactly 0.0 is still shown
    caption = "{} {:.2f}".format(label, score) if score is not None else label
    mask = masks[:, :, i]
    color = colors[names.index(label)]

    image = apply_mask(image, mask, color)
    image = cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
    image = cv2.putText(image, caption, (x1, y1), cv2.FONT_HERSHEY_COMPLEX, 0.7, color, 2)
  return image

## Video Paths

In [81]:
# Input video to analyse
VIDEO_FILE = "../cleansea_dataset/Videos/video_analisis/debrisVideo_PRL2.mp4"
assert os.path.exists(VIDEO_FILE), "Chosen file does not exist"
# Folder that receives the annotated frames and the final video.
# Create it when missing: the former `assert exists, os.makedirs(...)`
# created the folder and then still raised AssertionError on first run.
VIDEO_SAVE_DIR = os.path.join(ROOT_DIR, 'detect')
os.makedirs(VIDEO_SAVE_DIR, exist_ok=True)
# Weights file loaded into the model for inference
MODEL_PATH = os.path.join(MODEL_DIR, 'Mask_RCNN_Epoch-1000_Aug-severe_Size-100_Train-real_Fill-synth_Limit-False.h5')
assert os.path.exists(MODEL_PATH), "Chosen MODEL does not exist"

## Inference

### Inference configuration

In [82]:
class InferenceConfig(CleanSeaConfig):
  """CleanSea configuration adjusted for inference on batches of frames."""
  GPU_COUNT = 1
  IMAGES_PER_GPU = 3

# Number of frames sent to the model per detect() call. Derived from the
# configuration instead of being typed a second time, so that both values
# cannot drift apart (the printed BATCH_SIZE is GPU_COUNT * IMAGES_PER_GPU).
batch_size = InferenceConfig.GPU_COUNT * InferenceConfig.IMAGES_PER_GPU

In [83]:
# Rebind `config` to the inference configuration (replacing the
# CleanSeaConfig instance created earlier) and print its values.
config = InferenceConfig()
config.display()


Configurations:
BACKBONE                       resnet101
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     3
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
COMPUTE_BACKBONE_SHAPE         None
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.5
DETECTION_NMS_THRESHOLD        0.3
FPN_CLASSIF_FC_LAYERS_SIZE     1024
GPU_COUNT                      1
GRADIENT_CLIP_NORM             5.0
IMAGES_PER_GPU                 3
IMAGE_CHANNEL_COUNT            3
IMAGE_MAX_DIM                  512
IMAGE_META_SIZE                32
IMAGE_MIN_DIM                  512
IMAGE_MIN_SCALE                0
IMAGE_RESIZE_MODE              square
IMAGE_SHAPE                    [512 512   3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE                  0.001
LOSS_WEIGHTS                   {'rpn_class_loss': 1.0, 'rpn_bbox_loss': 1.0, 'mrcnn_class_loss': 1.0, 'mrcnn_bbox_loss': 1.0, 'mrcnn_mask_loss': 1.0}
MASK_POOL_SIZE                 14
MASK_SHAPE             

### Class Names

In [84]:
# Class names indexed by class id; index 0 is the background ('BG'),
# followed by the 19 debris classes.
class_names = [
    'BG',
    'Can',
    'Squared_Can',
    'Wood',
    'Bottle',
    'Plastic_Bag',
    'Glove',
    'Fishing_Net',
    'Tire',
    'Packaging_Bag',
    'WashingMachine',
    'Metal_Chain',
    'Rope',
    'Towel',
    'Plastic_Debris',
    'Metal_Debris',
    'Pipe',
    'Shoe',
    'Car_Bumper',
    'Basket',
]

### Load Model for inference

In [85]:
# Create model object in inference mode.
model = modellib.MaskRCNN(mode='inference', model_dir=MODEL_DIR, config=config)
# Load the weights stored at MODEL_PATH (the CleanSea weights file chosen
# in the "Video Paths" cell), matching layers by name.
model.load_weights(MODEL_PATH, by_name=True)

### Video creation function


In [86]:
def make_video(outvid, images=None, fps=30, size=None,
               is_color=True, format="FMP4"):
    """
    Create a video from a list of images.

    @param      outvid      output video
    @param      images      list of image file paths to use in the video
    @param      fps         frame per second
    @param      size        (width, height) of each frame
    @param      is_color    color
    @param      format      see http://www.fourcc.org/codecs.php
    @return                 the (already released) cv2.VideoWriter, see http://opencv-python-tutroals.readthedocs.org/en/latest/py_tutorials/py_gui/py_video_display/py_video_display.html

    @raises ValueError          `images` is None or empty, or a file
                                cannot be decoded as an image
    @raises FileNotFoundError   a listed image file does not exist

    The function relies on http://opencv-python-tutroals.readthedocs.org/en/latest/.
    By default, the video will have the size of the first image.
    It will resize every image to this size before adding them to the video.
    """
    # Fail early with a clear message instead of a TypeError (None) or an
    # AttributeError on `vid.release()` (empty list).
    images = list(images) if images is not None else []
    if not images:
        raise ValueError("make_video needs at least one image path")

    from cv2 import VideoWriter, VideoWriter_fourcc, imread, resize
    fourcc = VideoWriter_fourcc(*format)
    vid = None
    try:
        for image in images:
            if not os.path.exists(image):
                raise FileNotFoundError(image)
            img = imread(image)
            if img is None:
                # imread signals an unreadable file by returning None
                raise ValueError("Could not decode image: {}".format(image))
            if vid is None:
                if size is None:
                    size = img.shape[1], img.shape[0]
                vid = VideoWriter(outvid, fourcc, float(fps), size, is_color)
            # Resize when EITHER dimension differs (the previous `and`
            # let frames through that mismatched in only one dimension).
            if size[0] != img.shape[1] or size[1] != img.shape[0]:
                img = resize(img, size)
            vid.write(img)
    finally:
        # Finalize the file even when an image fails half-way through
        if vid is not None:
            vid.release()
    return vid

### Video processing

#### Check CUDA Drivers

In [87]:
# Show driver and GPU status by running the nvidia-smi command-line tool
# (IPython shell escape).
!nvidia-smi

Sun Nov 27 00:44:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 526.98       Driver Version: 526.98       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   59C    P0    30W /  N/A |   7557MiB /  8192MiB |     21%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#### Initialize GPU

In [88]:
import tensorflow as tf
# List the GPUs visible to TensorFlow
physical_devices = tf.config.list_physical_devices('GPU')
# NOTE(review): this variable is set after the devices were already
# queried above (and after the model was created in an earlier cell), so
# it presumably has no effect on this process - confirm, and move it
# before the TensorFlow import if GPU selection is really needed.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
print(physical_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [89]:
# Report whether the installed TensorFlow build has CUDA support
tf.test.is_built_with_cuda()

True

#### Process

In [90]:
video = cv2.VideoCapture(VIDEO_FILE)
if not video.isOpened():
  raise IOError('Could not open video file: {0}'.format(VIDEO_FILE))

# Find OpenCV version. Only the major number is needed; unpacking exactly
# three fields would fail for version strings with more or fewer parts.
major_ver = int(cv2.__version__.split('.')[0])
if major_ver < 3 :
  fps = video.get(cv2.cv.CV_CAP_PROP_FPS)
  print('Frames per second using video.get(cv2.cv.CV_CAP_PROP_FPS): {0}'.format(fps))
else :
  fps = video.get(cv2.CAP_PROP_FPS)
  print("Frames per second using video.get(cv2.CAP_PROP_FPS) : {0}".format(fps))

# Make sure the output folder exists; a real failure is raised instead of
# being printed and ignored.
os.makedirs(VIDEO_SAVE_DIR, exist_ok=True)

frames = []
frame_count = 0
colors = random_colors(len(class_names))
try:
  while True:
    ret, frame = video.read()
    if ret:
      # Collect frames until a full batch is available
      frame_count += 1
      frames.append(frame)
      print('Frames :{0}'.format(frame_count))

    # Run detection on a full batch, or on the left-over frames once the
    # video has ended (previously those trailing frames were dropped).
    if frames and (len(frames) == batch_size or not ret):
      # detect() is always given exactly `batch_size` images; a short
      # final batch is padded by repeating its last frame and the
      # padding results are discarded by zip() below.
      padded = frames + [frames[-1]] * (batch_size - len(frames))
      # NOTE(review): OpenCV delivers frames in BGR channel order; if the
      # weights were trained on RGB images a conversion may be needed
      # here - confirm against the training pipeline.
      results = model.detect(padded, verbose=0)
      print('Predicted')
      # Zero-based index of the first frame in this batch
      first_index = frame_count - len(frames)
      for i, (batch_frame, r) in enumerate(zip(frames, results)):
        annotated = display_instances(batch_frame, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'],colors)
        name = '{0}.jpg'.format(first_index + i)
        name = os.path.join(VIDEO_SAVE_DIR, name)
        cv2.imwrite(name, annotated)
        print('writing to file:{0}'.format(name))
      # Clear the frames list to start the next batch
      frames = []

    if not ret:
      break
finally:
  # Release the capture even if detection fails half-way through
  video.release()

Frames per second using video.get(cv2.CAP_PROP_FPS) : 30.0
Frames :1
Frames :2
Frames :3
Predicted
(52.135323681536946, 223.9199462796911, 6.983836265471171)
(35.79866934178461, 50.51587971664409, 204.18986501226186)
(170.96921509549256, 106.41272460361738, 142.46590625366667)
(77.09480602111914, 37.42275215836383, 23.54634166604344)
(52.135323681536946, 223.9199462796911, 6.983836265471171)
writing to file:g:\Mi unidad\Cleansea\v2-IBPRIA\Cleansea\Mask_RCNN-Cleansea_DevOPS\detect\0.jpg
(170.96921509549256, 106.41272460361738, 142.46590625366667)
(35.79866934178461, 50.51587971664409, 204.18986501226186)
(52.135323681536946, 223.9199462796911, 6.983836265471171)
(77.09480602111914, 37.42275215836383, 23.54634166604344)
(52.135323681536946, 223.9199462796911, 6.983836265471171)
writing to file:g:\Mi unidad\Cleansea\v2-IBPRIA\Cleansea\Mask_RCNN-Cleansea_DevOPS\detect\1.jpg
(170.96921509549256, 106.41272460361738, 142.46590625366667)
(35.79866934178461, 50.51587971664409, 204.1898650122618

### Generation of video

In [91]:
import glob
import os

# Collect only the rendered frames, i.e. files named "<integer>.jpg".
# Matching '*.*' also picked up a previously generated detection.mp4,
# whose name cannot be parsed as a number, so re-running the cell failed.
images = [
    path for path in glob.glob(os.path.join(VIDEO_SAVE_DIR, '*.jpg'))
    if os.path.splitext(os.path.basename(path))[0].isdigit()
]
# Sort the images by integer index ("10.jpg" must come after "9.jpg")
images = sorted(images, key=lambda path: int(os.path.splitext(os.path.basename(path))[0]))

outvid = os.path.join(VIDEO_SAVE_DIR, "detection.mp4")
# NOTE(review): the frame rate is fixed at 30; the source video's rate is
# read into `fps` in the processing cell - consider passing that instead.
make_video(outvid, images, fps=30)


<VideoWriter 000002666A187B30>

## Clean detection folder

In [92]:
detection_folder = os.listdir(VIDEO_SAVE_DIR)

# Delete the per-frame JPEGs only; any other file in the folder is left
# untouched.
jpg_names = [entry for entry in detection_folder if entry.endswith(".jpg")]
for jpg_name in jpg_names:
    os.remove(os.path.join(VIDEO_SAVE_DIR, jpg_name))