**Purpose:**
This notebook should consist of our methods of detecting faces and adding effects on top on them in a video. 
It is written in stages on Google Colab.

In [None]:
!pip install face_recognition
!pip install facenet_pytorch
!pip install mmcv

In [2]:
%pylab inline 
import face_recognition
import cv2
import matplotlib.patches as patches
from IPython.display import clear_output
from matplotlib.pyplot import imshow
import matplotlib.pylab as plt
from PIL import Image, ImageDraw
import imageio
from itertools import product
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
import mmcv
import numpy as np
import pandas as pd
import os
# import cupy as cp
from time import time


Populating the interactive namespace from numpy and matplotlib


In [3]:
run_on_colab = True
if run_on_colab:
    from google.colab import drive
  
    drive.mount('/content/drive', force_remount=True)
    path = "/content/drive/My Drive/AdvancedPython2021"
else:
    path = os.getcwd()
os.chdir(path)
from array_mp4_conversion import array_to_mp4, mp4_to_array
file = "girl.gif"
data_path = os.path.join(path, "data", file)


Mounted at /content/drive


In [18]:
import torch
import os
from itertools import product
import numpy as np
import face_recognition
import cv2
import matplotlib.patches as patches
from IPython.display import clear_output
from matplotlib.pyplot import imshow
import matplotlib.pylab as plt
from PIL import Image, ImageDraw
import imageio
from itertools import product
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
import mmcv
import numpy as np
import pandas as pd
import os
from time import time
from array_mp4_conversion import array_to_mp4, mp4_to_array
from numba import jit, prange, cuda
from collections import defaultdict
from scipy.signal import convolve2d
# cimport numpy as np
# ctypedef np.uint8_t D_TYPE

path = os.getcwd()

class video_transformer_base:
    '''
    This is the base of video_transformer, containing basic information about the video.
    '''
    def __init__(self,
                path, 
                save_path, 
                file_name, 
                device='cpu',
                display=False):
        
        self.video_path = os.path.join(path, "data", file_name)
        self.video_array, self.fps = mp4_to_array(self.video_path)
        self.display = display
        self.save_path = save_path
        self.file_name = file_name
        self.num_frames = 0
        if device == 'cpu':
            self.device = 'cpu'
        else:
            self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    def main_transformation(self, 
                            face_detection_model, 
                            filter_effect):
        '''
        For each frame, do:
        1. detect the face;
        2. apply the filter;
        3. save the processed frame.
        '''                    
        video_capture = cv2.VideoCapture(self.video_path)
        frame_count = 0
        output_frames = []
        while video_capture.isOpened():    
            # Grab a single frame of video
            ret, frame = video_capture.read()

            # Bail out when the video file ends
            if not ret:
                video_capture.release()
                break
            
            frame_count += 1

            # detect faces
            if face_detection_model != "mtcnn":
                face_locations = self.face_detection(frame, 
                                                     model=face_detection_model)
            else:
                face_locations = self.face_detection_mtcnn(frame)

            # add effect
            after_effect_frame = filter_effect(frame, face_locations)

            if self.display and frame_count % 2 == 0:
               # If faces were found, we will mark it on frame with blue dots
                for face_location in face_locations:
                    top, right, bottom, left = face_location
                    cv2.rectangle(after_effect_frame,(left, top), (right, bottom), (0, 0, 255), 2)
                plt.imshow(after_effect_frame)
                plt.show()
                clear_output(wait=True)

            output_frames.append(after_effect_frame)
        self.num_frames = frame_count
        self.des_arr = np.array(output_frames)

    def face_detection(self, frame):
        '''
        Face detection with package face_recognition.
        Models includes: svm, knn, cnn.
        Currently fixed as model='svm' because model ='cnn' is slow.
        '''
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        rgb_frame = frame[:, :, ::-1]
        face_locations = face_recognition.face_locations(rgb_frame, model='svm')
        # print(f"{len(face_locations)} face(s) detected.")
        
        return face_locations
    def face_detection_mtcnn(self, frame):
        '''
        Face detection with package facenet_pytorch.
        MTCNN implemented in Pytorch, so also support CUDA.
        '''       

        mtcnn = MTCNN(keep_all=True, device=self.device)
        boxes, _ = mtcnn.detect(frame)
        
        if boxes is None:
            boxes = []
            
        boxes = np.array([[box[1], box[2], box[3], box[0]] for box in boxes]).astype(np.int)
        # print(f"{len(boxes)} face(s) detected.")
        return boxes
    def oil_effect(self, frame):
        '''
        Please refer to unused_oil_effect.py for implementation.
        '''
        
        pass
    
    def negative_effect(self, frame, locations):
        '''
        Apply negative filter effect to target locations.
        '''
        des_img = np.copy(frame)
        try:
            for location in locations:
                t_, r_, b_, l_ = location.astype(int)

                des_img[t_:b_,l_:r_] = 255 - frame[t_:b_,l_:r_]
        except:
            pass
        
        return des_img

    def mean_blur(self, frame, locations, radius=5):
        '''
        Apply simple mosaic effect to specified regions. 
        '''
        k = 1 / (radius*2+1)**2
        des_img = np.copy(frame)
        height, width, _ = des_img.shape

        for location in locations:
            top, right, bottom, left = location
            t_ = max(top+radius,0)
            b_ = min(bottom-radius, height)
            l_ = max(left+radius,0)
            r_ = min(right-radius, width)
            if t_ >= b_ or l_ >= r_:
                continue

            for i, j in product(range(t_, b_), range(l_, r_)):
                kernel = frame[i-radius:i+radius+1, j-radius:j+radius+1, :]
                sumed = np.sum(kernel, axis = (0,1)) * k
                des_img[i, j] = sumed.astype(np.uint8)

        
        return des_img    
    
    def write_to_video(self, output_filename):
        '''
        Write out the video with filter to mp4.
        '''
        array_to_mp4(output_filename, self.des_arr, self.fps)

class video_transformer_parallel(video_transformer_base):
    '''
    This version views the video as an array for easier parallelization.
    '''
    def __init__(self, path, save_path, file_name, device='cpu',display=False):
        video_transformer_base.__init__(self, path, save_path, file_name, 
                                        device, display)
        
        self.locations = None
        self.des_arr = None

        torch.from_numpy(self.video_array).to(self.device)    
   
    def filter_on_video(self, filter_func, face_detection_model = 'mtcnn', radius = 10):
        '''
        Apply filter on the video.
        '''
        self.des_arr = self.video_array.copy()
        frame_size = self.video_array.shape[0]
        self.locations = self.get_face_locations(face_detection_model)
        radius_list = [radius] * frame_size

        list(map(filter_func, self.video_array, self.des_arr, self.locations, radius_list))
        
    def filter_on_video_cuda(self, img_blur_cuda, radius=10, face_detection_model = 'mtcnn'):
        '''
        Filter on entire video using CUDA for blurring. It specifies block size and controls bounds.
        '''
        blocksize = (32,32)
        k = 1 / (2*radius+1)**2
        self.des_arr = self.video_array.copy()
        frame_size = self.video_array.shape[0]
        self.locations = self.get_face_locations(face_detection_model)
        np.save("frame_test.out", self.des_arr[20])
        np.save("locations_tests.out", self.locations[20])
        for i in range(frame_size):
            for location in self.locations[i]:
                top, right, bottom, left = location
                face = np.ascontiguousarray(self.des_arr[i, top:bottom+1, left:right+1, :])
                gridsize = (face.shape[0]//blocksize[0]+1, face.shape[1]//blocksize[1]+1)
                blur_face = np.empty_like(face)
                img_blur_cuda[gridsize, blocksize](face, blur_face, k, radius)
                self.des_arr[i, top:bottom+1, left:right+1, :] = blur_face
    
    def filter_on_video_cv2(self, cv2_blur, radius=10, face_detection_model = 'mtcnn'):
        '''
        Filter on entire video using CV2.blur.
        '''
        k = 1 / (2*radius+1)**2
        self.des_arr = self.video_array.copy()
        frame_size = self.video_array.shape[0]
        self.locations = self.get_face_locations(face_detection_model)
        
        for i in range(frame_size):
            for location in self.locations[i]:
                top, right, bottom, left = location
                face = self.des_arr[i, top:bottom+1, left:right+1, :]
                blur_face = np.empty_like(face)
                cv2_blur(src=face, dst=blur_face, ksize=(radius,radius))
                self.des_arr[i, top:bottom+1, left:right+1, :] = blur_face   

    def filter_on_video_face_only(self, filter_func, radius=10, face_detection_model = 'mtcnn'):
        '''
        Filter on the entire video, but the filter function only take in the face regions 
        instead of the entire frame to reduce data size.
        '''
        k = 1 / (2*radius+1)**2
        self.des_arr = self.video_array.copy()
        frame_size = self.video_array.shape[0]
        self.locations = self.get_face_locations(face_detection_model)

        for i in range(frame_size):
            for location in self.locations[i]:
                top, right, bottom, left = location
                face = self.des_arr[i, top:bottom+1, left:right+1, :]
                blur_face = np.empty_like(face)
                blur_face = filter_func(face=face, blur_face=blur_face, radius=radius)
                self.des_arr[i, top:bottom+1, left:right+1, :] = blur_face   
    
    def mean_blur_convolution(self, image, des_img, locations, radius):
        '''
        Utilized Scipy convolution to perform blurring effect on faces in one frame.
        '''
        if len(locations) == 0:
              print("No faces")
              return
          # print(len(locations))
        n_neighbor = radius*2+1
        height, width, _ = des_img.shape
        kernel = np.ones((n_neighbor, n_neighbor))
        for (top, right, bottom, left) in locations:
            t_ = max(top+radius,0)
            b_ = min(bottom-radius, height)
            l_ = max(left+radius,0)
            r_ = min(right-radius, width)

            if t_ >= b_ or l_ >= r_:
                continue
            sample_area = image[t_-radius:b_+radius+1, l_-radius:r_+radius+1,:].astype(np.uint8)
            
            # np.save("face.out", sample_area)
            red = convolve2d(sample_area[:,:,0], kernel, 'same')
            green = convolve2d(sample_area[:,:,1], kernel, 'same')
            blue = convolve2d(sample_area[:,:,2], kernel, 'same')

            convol_area = np.stack([red, green, blue], axis=2)

            des_img[t_-radius:b_+radius+1, l_-radius:r_+radius+1,:] = (convol_area / (n_neighbor**2)).astype(np.uint8)
            # cv2.rectangle(des_img, (left, top), (right, bottom), (0, 0, 255), 2)            
                    
    def mean_blur_convolution_face_only(self, face, blur_face, radius):
        '''
        Apply blurring with scipy.convolve2d() to faces only.
        '''
        n_neighbor = radius*2+1
        kernel = np.ones((n_neighbor, n_neighbor))
        red = convolve2d(face[:,:,0], kernel, 'same')
        green = convolve2d(face[:,:,1], kernel, 'same')
        blue = convolve2d(face[:,:,2], kernel, 'same')
        convol_area = (np.stack([red, green, blue], axis=2)  / (n_neighbor**2)).astype(np.uint8)

        return convol_area
                    
    def get_face_locations(self, face_detection_model):
        '''
        Get a list of face_locations on entire video.
        '''
        des_arr = torch.from_numpy(self.video_array.copy()).to(self.device)
        
        if face_detection_model != 'mtcnn':
            locations = list(map(self.face_detection, des_arr))
        else:
            locations = list(map(self.face_detection_mtcnn, des_arr))    

        return locations



In [None]:
from multiprocessing import Process, Queue, Lock
class video_transformer_multiprocessing(video_transformer_parallel):
    '''
    This module aims to use multiprocessing to speed up the blurring.
    '''
    def __init__(self, path, save_path, file_name,N, device='cpu',display=False):
        video_transformer_parallel.__init__(self, path, save_path, file_name, 
                                        device, display)
        self.N = N
    def mean_blur_one_location(self, locations, i):
        '''
        Apply the blurring function to only one face.
        '''
        for location in locations:
            top, right, bottom, left = location
            face = self.des_arr[i,top:bottom+1, left:right+1, :]
            blur_face = np.empty_like(face)
            blur_face = self.mean_blur_convolution_face_only(face=face, blur_face=blur_face, radius=self.radius)
            self.des_arr[i, top:bottom+1, left:right+1, :] = blur_face
        return self.des_arr
    
    def mean_blur_one_frame(self, frame, locations):
        '''
        Apply the blurring function to all faces within one frame.
        '''
        for location in locations:
            # print(location)
            top, right, bottom, left = location
            face = frame[top:bottom+1, left:right+1, :]
            blur_face = np.empty_like(face)
            blur_face = self.mean_blur_convolution_face_only(face=face, blur_face=blur_face, radius=self.radius)
            frame[top:bottom+1, left:right+1, :] = blur_face
        return frame
    def mean_blur_some_frame(self, frames, list_locations, queue, idx):
        '''
        Apply the blurring effect to all faces within multiple frames.
        '''
        frames_update = []
        for i in range(len(frames)):
          frame = frames[i]
          locations_=list_locations[i]
          frame_update = self.mean_blur_one_frame(frame, locations_)
          frames_update.append(frame_update)
        queue.put((idx,frames_update))
        # return frames_update

    def filter_on_video_mult(self, filter_func, radius=10, face_detection_model = 'mtcnn'):
        '''
        Divide the entire video into several parts, and apply blurring in parallel.
        '''
        self.des_arr = self.video_array.copy()
        frame_size = self.video_array.shape[0]
        self.locations = self.get_face_locations(face_detection_model)
        self.radius = radius

        frames_portions = list(np.array_split(self.des_arr, self.N))
        locations_portions  = list(np.array_split(self.locations, self.N))
        q = Queue()
        jobs = []
        rets = []

        lock = Lock()


        for i in range(self.N):
            p = Process(target=self.mean_blur_some_frame, args=(frames_portions[i],locations_portions[i], q, i))
            p.Daemon = True
            jobs.append(p)
            p.start()
            
        for p in jobs:
            ret = q.get() # will block
            rets.append(ret)
        for p in jobs:
            p.join()
            
        # sort them by the index to restore order
        rets.sort(key=lambda x:x[0])

        rets = np.array([ret[1] for ret in rets])
        
        self.des_arr = np.concatenate(rets).astype(np.uint8)
        print(self.des_arr.shape)
            



# We listed some test cases that are described in the report below.

In [7]:
# CPU Base: use CPU for both face detection (Pytorch) and naive blurring
ts = time()
case_1 = video_transformer_base(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           display=False)
case_1.main_transformation("mtcnn", case_1.mean_blur)
case_1.write_to_video("cpu_base_peds_short.mp4")
print(f"Time used {time() - ts}s.")

Time used 56.00291705131531s.


In [9]:
# GPU Base: use GPU for face detection (Pytorch), but CPU for naive blurring
ts = time()
case_2 = video_transformer_base(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           display=False,
                           device='gpu')
case_2.main_transformation("mtcnn", case_2.mean_blur)
case_2.write_to_video("gpu_base_peds_short.mp4")
print(f"Time used {time() - ts}s.")

Time used 24.69857692718506s.


In [30]:
# GPU + Convolution: use GPU for face detection (Pytorch), and CPU for blurring effects using Scipy convolution
ts = time()
case_3 = video_transformer_parallel(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           display=False,
                           device='gpu')
case_3.filter_on_video_face_only(case_3.mean_blur_convolution_face_only)
case_3.write_to_video("peds_short_gpu_parallel_conv.mp4")
print(f"Time used {time() - ts}s.")

Time used 16.560893297195435s.


In [14]:
@cuda.jit
def img_blur_cuda(img, des_img, k, radius):
    '''
    numba cuda version of blurring algorithm
    '''
    i, j = cuda.grid(2)

    rows, columns, channel = img.shape
    if i >= rows or j >= columns:
        return

    ra = rows - radius
    ca = columns - radius
    if i < radius or j < radius or i >= ra or j >= ca:
        des_img[i, j, 0] = img[i, j, 0]
        des_img[i, j, 1] = img[i, j, 1]
        des_img[i, j, 2] = img[i, j, 2]
        return

    r = 0
    g = 0
    b = 0
    for x in range(-radius, radius + 1):
        for y in range(-radius, radius + 1):
            i_x = i + x
            j_y = j + y
            r += img[i_x, j_y, 0] * k
            g += img[i_x, j_y, 1] * k
            b += img[i_x, j_y, 2] * k
    des_img[i, j, 0] = r
    des_img[i, j, 1] = g
    des_img[i, j, 2] = b

In [31]:
# GPU + CUDA: use GPU for face detection (Pytorch), and CUDA for blurring effects
start = time()
case_4 = video_transformer_parallel(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           display=False,
                           device='gpu')
case_4.filter_on_video_cuda(img_blur_cuda)
case_4.write_to_video("peds_short_cuda_gpu.mp4")
print(f"Time used {time() - start}s")

Time used 11.36157751083374s


In [20]:
%load_ext Cython

In [21]:
%%cython
import cv2
from scipy.signal import convolve2d
import numpy as np
cimport numpy as np
ctypedef np.npy_intp SIZE_t

def mean_blur_convolution_cython(np.ndarray[np.uint8_t, ndim=3] image, 
                                 np.ndarray[np.uint8_t, ndim=3] des_img, 
                                 np.ndarray locations, 
                                 double radius):
    '''
    Utilized Scipy convolution to perform blurring effect.
    '''
    if len(locations) == 0:
          # print("No faces")
          return
    cdef int n_neighbor = np.int32((radius)*2+1)
    cdef int height = des_img.shape[0]
    cdef int width = des_img.shape[1]
    cdef np.ndarray[int, ndim = 2]kernel = np.ones((n_neighbor, n_neighbor),dtype=np.int32)
    cdef int t_, b_, l_, r_, bound_top, bound_bottom, bound_left, bound_right

    for (top, right, bottom, left) in locations:
        t_ = np.int32(max(top+radius,0))
        b_ = np.int32(min(bottom-radius, height))
        l_ = np.int32(max(left+radius,0))
        r_ = np.int32(min(right-radius, width))

        bound_top = np.int32(t_-radius)
        bound_bottom = np.int32(b_+radius+1)
        bound_left = np.int32(l_-radius)
        bound_right = np.int32(r_+radius+1)

        if t_ >= b_ or l_ >= r_:
            continue
        
        sample_area = image[bound_top:bound_bottom, 
                            bound_left:bound_right,:]
        
        # np.save("face.out", sample_area)
        red = convolve2d(sample_area[:,:,np.int32(0)], kernel, 'same')
        green = convolve2d(sample_area[:,:,np.int32(1)], kernel, 'same')
        blue = convolve2d(sample_area[:,:,np.int32(2)], kernel, 'same')

        convol_area = np.stack([red, green, blue], axis=2)

        des_img[bound_top:bound_bottom, 
               bound_left:bound_right,:] = (convol_area / (n_neighbor**2)).astype(np.uint8)
        cv2.rectangle(des_img, (left, top), (right, bottom), (0, 0, 255), 2)  

In [24]:
# GPU + Cythonized Convolution: use GPU for face detection (Pytorch), and use Cython to compile the Scipy Convolution solution
ts = time()
case_5 = video_transformer_parallel(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           display=False,
                           device='gpu')
case_5.filter_on_video(mean_blur_convolution_cython)
case_5.write_to_video("peds_short_gpu_parallel_cython.mp4")
print(f"Time used {time() - ts}s.")

Time used 16.544543504714966s.


In [28]:
# GPU + Multiprocessing Convolution: use GPU for face detection (Pytorch), and use multiprocessing to blur the images
start = time()
case_6 = video_transformer_multiprocessing(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           N=4,
                           display=False,
                           device='gpu')
case_6.filter_on_video_mult(case_6.mean_blur_convolution_face_only)
case_6.write_to_video("peds_short_gpu_conv_face_only_mult.mp4")
print(f"Time used {time() - start}s")

  return array(a, dtype, copy=False, order=order)


(52, 1080, 1920, 3)
Time used 14.603191375732422s


In [25]:
# GPU + CV2 (Benchmark): use GPU for face detection (Pytorch), and use cv2.blur() for blurring 
start = time()
case_cv2 = video_transformer_parallel(path = path,
                           save_path = os.path.join(path, "data", "frames"),
                           file_name = 'peds_short.mp4',
                           display=False,
                           device='gpu')
case_cv2.filter_on_video_cv2(cv2.blur)
case_cv2.write_to_video("peds_short_gpu_cv2.mp4")
print(f"Time used {time() - start}s")

Time used 11.090066194534302s
