# AutoObjectRemoval
## AI based object removal system for videos


### 1. Choose a video to be processed
This video will be rescaled to 960 x 512 pixels and cut up into frames.
Note that this will consume a lot of storage space for long videos.

In [None]:
from pathlib import Path
import os


video_path = Path("./res/videos/skate.mp4")
out_dir_name = f"./outputs/{video_path.stem}_%s"
i = 0
while os.path.exists(out_dir_name % i):
    i += 1
output_dir = Path(out_dir_name % i).resolve()
!mkdir -p {output_dir}
intermediate = Path(output_dir / "intermediates")

### 1.1 Video Preprocessing

In [None]:
amount_of_frames = 150
!mkdir -p {intermediate}/frames
!mkdir -p {intermediate}/masks
!ffmpeg -i {video_path} -frames:v {amount_of_frames} -vf scale="960:512" {intermediate}/frames/%03d.png

### 2. Generating masks object removal
For this step the Detectron2 instance segmentation model trained on MS-COCO dataset is used.
Any of the 80 objects in that dataset can be used for extraction.
You can find a list of these class names [here](https://github.com/amikelive/coco-labels/blob/master/coco-labels-2014_2017.txt).

#### 2.1 Load Detectron2 model

In [None]:
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import cv2
from tqdm import tqdm
from PIL import Image

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg

In [None]:
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
predictor = DefaultPredictor(cfg)
class_names = predictor.metadata.thing_classes

#### 2.2 Load frames and apply model

In [None]:
# Load frames
frames = intermediate / "frames/"
imgs = [cv2.imread(str(path)) for path in frames.glob("*.png")]

In [None]:
# Inference on each of the frames
outputs = [predictor(img) for img in imgs]

#### 2.3 Extract masks for chosen object to be removed

In [None]:

# Function to turn mask of shape (width, height) to shape (width, height, 3)
def to_rgb(im):
    w, h = im.shape
    ret = np.empty((w, h, 3))
    ret[:, :, 2] =  ret[:, :, 1] =  ret[:, :, 0] =  im
    return ret

Here `expand_mask(mask)` is a utility function that is used to expand the generated mask by some padding to make
sure that the object intended for removal is fully masked.

In [None]:
def expand_mask(mask, padding=15):
    mask_cpy = mask.copy()
    for x in range(mask.shape[0]):
        for y in range(mask.shape[1]):
            if mask[x,y] == 1:
                if padding < x < mask.shape[0]-padding and padding < y < mask.shape[1]-padding:
                    if 0 in [mask[x+xoff, y+yoff] for xoff in range(-1,2) for yoff in range(-1, 2)]:
                        for px in range(-padding, padding):
                            for py in range(-padding, padding):
                                mask_cpy[x+px,y+py] = 1
    return mask_cpy

#### 2.4 Combining and saving masks from inference results
Choose the labels of each object class which shall be removed and this loop
will combine the masks of all detected objects in `labels_of_objects_to_remove`
for each frame.

In [None]:
labels_of_objects_to_remove = ["person", "skateboard"]
max_objects_to_detect = 2
indices =  [class_names.index(label) for label in labels_of_objects_to_remove]
for i, (img, pred) in tqdm(enumerate(zip(imgs, outputs))):
    shape = img.shape
    created_mask = False
    mask = np.zeros(shape)
    if pred is not None:
        instance = pred["instances"]
        for x in range(len(instance.pred_classes))[:max_objects_to_detect]:
            if instance.pred_classes[x] in indices:
                mask_arr = instance.pred_masks[x].cpu().numpy()
                mask += to_rgb(expand_mask(mask_arr))
    mask_img = Image.fromarray((mask * 255).astype(np.uint8))
    mask_img.save(intermediate / "masks" / f"{i}.png")
                
print('Finished creating masks.')

### 3. Object removal with FGVC
Make sure that you have set up the environments `FGVC` and `raft` according to setup.readme.
For 3.1 ensure that `raft` is activated. When prompted with `Please switch to torch 0.4.0`, deactivate `FGVC` and activate `raft`.

In [None]:
frames = frames.resolve()
masks = (intermediate/"masks").resolve()

!bash -i run_video_completion.sh "raft" {frames} {masks} {output_dir}

In [None]:
!bash -i run_video_completion.sh "FGVC" {frames} {masks} {output_dir}

In [None]:
from ipywidgets import Video
Video.fromfile(f"{output_dir}/frame_seamless_comp_0/video_extrapolation.mp4")

from ipywidgets import Video
Video.fromfile(f"{output_dir}/frame_seamless_comp_0/video_extrapolation.mp4")

In [1]:
from pathlib import Path
import os

video_path = Path("./res/videos/skate.mp4")
out_dir_name = f"./outputs/{video_path.stem}_%s"
i = 0
while os.path.exists(out_dir_name % i):
    i += 1
output_dir = Path(out_dir_name % i).resolve()
!mkdir -p {output_dir}
intermediate = Path(output_dir / "intermediates")

### 1.1 Video Preprocessing

In [2]:
amount_of_frames = 150
!mkdir -p {intermediate}/frames
!mkdir -p {intermediate}/masks
# Here 25fps (youtube) is assumed. For best results adjust to correct fps
!ffmpeg -i {video_path} -frames:v {amount_of_frames} -vf scale="960:512" {intermediate}/frames/%03d.png

ffmpeg version 4.0 Copyright (c) 2000-2018 the FFmpeg developers
  built with gcc 7.2.0 (crosstool-NG fa8859cb)
  configuration: --prefix=/home/bjorn/anaconda3/envs/raft --cc=/opt/conda/conda-bld/ffmpeg_1531088893642/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-shared --enable-static --enable-zlib --enable-pic --enable-gpl --enable-version3 --disable-nonfree --enable-hardcoded-tables --enable-avresample --enable-libfreetype --disable-openssl --disable-gnutls --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --disable-libx264
  libavutil      56. 14.100 / 56. 14.100
  libavcodec     58. 18.100 / 58. 18.100
  libavformat    58. 12.100 / 58. 12.100
  libavdevice    58.  3.100 / 58.  3.100
  libavfilter     7. 16.100 /  7. 16.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  1.100 /  5.  1.100
  libswresample   3.  1.100 /  3.  1.100
  libpostproc    55.  1.100 / 55.  1.100
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'res/videos/skate

### 2. Extracting bounding boxes for object which shall be removed
For this step the YOLOv5 object detection model trained on MS-COCO dataset is used. Any of the 80 objects in that dataset can be used for extraction.
You can find a list of these class names [here](https://github.com/amikelive/coco-labels/blob/master/coco-labels-2014_2017.txt).

#### 2.1 Load YOLO model

In [3]:
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import json, cv2, random

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

In [4]:
cfg = get_cfg()
# add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
# Find a model from detectron2's model zoo. You can use the https://dl.fbaipublicfiles... url as well
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
predictor = DefaultPredictor(cfg)


#### 2.2 Load frames and apply model

In [5]:
# Load frames
frames = intermediate / "frames/"
imgs = [cv2.imread(str(path)) for path in frames.glob("*.png")]

In [6]:
# Inference on each of the frames
outputs = [predictor(img) for img in imgs]

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1595629427478/work/torch/csrc/utils/python_arg_parser.cpp:766.)
  return x.nonzero().unbind(1)


In [7]:
#print(outputs[0]["instances"].pred_masks.shape)
class_names = predictor.metadata.thing_classes

#### 2.3 Extract masks for chosen object to be removed

In [8]:
from tqdm import tqdm
from PIL import Image

# Function to turn mask of shape (width, height) to shape (width, height, 3)
def to_rgb(im):
    w, h = im.shape
    ret = np.empty((w, h, 3))
    ret[:, :, 2] =  ret[:, :, 1] =  ret[:, :, 0] =  im
    return ret

In [9]:
def expand_mask(mask):
    mask_cpy = mask.copy()
    padding = 25
    for x in range(mask.shape[0]):
        for y in range(mask.shape[1]):
            if mask[x,y] == 1:
                if x > padding and y > padding and x < mask.shape[0]-padding and y < mask.shape[1]-padding:
                    if 0 in [mask[x+xoff, y+yoff] for xoff in range(-1,2) for yoff in range(-1, 2)]:
                        for px in range(-padding, padding):
                            for py in range(-padding, padding):
                                mask_cpy[x+px,y+py] = 1
    return mask_cpy

In [10]:

labels_of_objects_to_remove = ["person", "skateboard"]
max_objects_to_detect = 2
indices =  [class_names.index(label) for label in labels_of_objects_to_remove]
for i, (img, pred) in tqdm(enumerate(zip(imgs, outputs))):
    shape = img.shape
    created_mask = False
    mask = np.zeros(shape)
    if pred is not None:
        instance = pred["instances"]
        for x in range(len(instance.pred_classes))[:max_objects_to_detect]:
            if instance.pred_classes[x] in indices:
                mask_arr = instance.pred_masks[x].cpu().numpy()
                mask += to_rgb(expand_mask(mask_arr))
    mask_img = Image.fromarray((mask * 255).astype(np.uint8))
    mask_img.save(intermediate / "masks" / f"{i}.png")
                
print('Finished creating masks.')

150it [03:04,  1.23s/it]

Finished creating masks.





### 3. Object removal with FGVC
Make sure that you have set up the environments `FGVC` and `raft` according to setup.readme.
For 3.1 ensure that `raft` is activated. When prompted with `Please switch to torch 0.4.0`, deactivate `FGVC` and activate `raft`.

In [11]:
frames = frames.resolve()
masks = (intermediate/"masks").resolve()

!bash -i run_video_completion.sh "raft" {frames} {masks} {output_dir}

Pytorch 1.6.0 for flow prediction
Completing backward flow 148 <---> 149 
Finish Calculating flow.
Completing forward flow  0 <--->  1
Completing forward flow  1 <--->  2
Completing forward flow  2 <--->  3
Completing forward flow  3 <--->  4
Completing forward flow  4 <--->  5
Completing forward flow  5 <--->  6
Completing forward flow  6 <--->  7
Completing forward flow  7 <--->  8
Completing forward flow  8 <--->  9
Completing forward flow  9 <---> 10
Completing forward flow 10 <---> 11
Completing forward flow 11 <---> 12
Completing forward flow 12 <---> 13
Completing forward flow 13 <---> 14
Completing forward flow 14 <---> 15
Completing forward flow 15 <---> 16
Completing forward flow 16 <---> 17
Completing forward flow 17 <---> 18
Completing forward flow 18 <---> 19
Completing forward flow 19 <---> 20
Completing forward flow 20 <---> 21
Completing forward flow 21 <---> 22
Completing forward flow 22 <---> 23
Completing forward flow 23 <---> 24
Completing forward flow 24 <---> 25
C

In [12]:
!bash -i run_video_completion.sh "FGVC" {frames} {masks} {output_dir}

Pytorch 1.6.0 for flow prediction
Loading /mnt/c/Users/Björn/Documents/MiscCode/cutter/outputs/skate_0/flow/backward_flo/00148.flo 
Finish Calculating flow.
Load Deepfill Model from ../weight/imagenet_deepfill.pth
Forward Pass......
(512, 960, 2, 149)
Frame   1:        0 +        0 =        0
(512, 960, 2, 149)
Frame   2:    68148 +        0 =    68148
(512, 960, 2, 149)
Frame   3:    60979 +        0 =    60979
(512, 960, 2, 149)
Frame   4:    70430 +      216 =    70646
(512, 960, 2, 149)
Frame   5:    70716 +      398 =    71114
(512, 960, 2, 149)
Frame   6:    64621 +      555 =    65176
(512, 960, 2, 149)
Frame   7:    67260 +      533 =    67793
(512, 960, 2, 149)
Frame   8:    71564 +      135 =    71699
(512, 960, 2, 149)
Frame   9:    73824 +      207 =    74031
(512, 960, 2, 149)
Frame  10:    78793 +      331 =    79124
(512, 960, 2, 149)
Frame  11:    78717 +      382 =    79099
(512, 960, 2, 149)
Frame  12:    77876 +      430 =    78306
(512, 960, 2, 149)
Frame  13:    59

In [None]:
from ipywidgets import Video
Video.fromfile(f"{output_dir}/frame_seamless_comp_0/video_extrapolation.mp4")