# Working with facenet-pytorch and decord

As of version 2.2, the MTCNN module of facenet-pytorch can work directly with images represented as numpy arrays. This change achieves higher performance when reading video frames with either `cv2.VideoCapture` or `decord.VideoReader` as it avoids conversion to PIL format. A number of additional enhancements have been added to improve detection efficiency.

**This notebook demonstrates how to detect every face in every frame in every video of the dataset at full resolution in approximately 3 hours.**

---

**UPDATE (2020-03-04):** Video reading has been switched from cv2 to decord for improved performance.

---

In [3]:
from decord import VideoReader, gpu


## Imports

In [4]:
import sys, os

from facenet_pytorch import MTCNN
import torch
from decord import VideoReader, gpu, cpu
import glob
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from joblib import Parallel, delayed


device = 'cuda' if torch.cuda.is_available() else 'cpu'

## The FastMTCNN Class

The following class implements a strided version of MTCNN. See [here](https://www.kaggle.com/timesler/fast-mtcnn-detector-55-fps-at-full-resolution) for the original implementation.

In [5]:
class FastMTCNN(object):
    """Fast MTCNN implementation."""
    
    def __init__(self, stride, *args, **kwargs):
        """Constructor for FastMTCNN class.
        
        Arguments:
            stride (int): The detection stride. Faces will be detected every `stride` frames
                and remembered for `stride-1` frames.
        
        Keyword arguments:
            resize (float): Fractional frame scaling. [default: {1}]
            *args: Arguments to pass to the MTCNN constructor. See help(MTCNN).
            **kwargs: Keyword arguments to pass to the MTCNN constructor. See help(MTCNN).
        """
        self.stride = stride
        self.mtcnn = MTCNN(*args, **kwargs)
        
    def __call__(self, frames):
        """Detect faces in frames using strided MTCNN."""
                      
        boxes, probs = self.mtcnn.detect(frames[::self.stride])

        faces = []
        probs_out = []
        frame_index = []
        for i, frame in enumerate(frames):
            box_ind = int(i / self.stride)
            if boxes[box_ind] is None:
                continue
            for box, prob in zip(boxes[box_ind], probs[box_ind]):
                box = [int(b) for b in box]
                faces.append(frame[box[1]:box[3], box[0]:box[2]].copy())
                probs_out.append(prob)
                frame_index.append(i)
                
        
        return faces, probs, frame_index

In [5]:
from joblib import Parallel, delayed
filenames = glob.glob('/Volumes/MY PASSPORT/manipulated_sequences/DeepFakeDetection/c40/videos/*.mp4')
def build_data_set(folder):
    for face in frames: 
            frames = []
            img = cv2.imread(os.path.join(inter_path,image_))
            if img is not None:
                frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
                filename = int(image_.split('.')[0])
                folder_name = int(folder)
                #save_paths = [f'image_{i}.jpg' for i in range(len(frames))]
                #mtcnn(frames, save_path='{}.jpg'.format(output_path))
                save_path= os.path.join(output_path,'{:04d}{:04d}_cropped.jpg'.format(filename,folder_name))
                mtcnn(frames, save_path=save_path)

mtcnn = MTCNN(margin=20, keep_all=True, post_process=False)
output_path = ("C:/Users/Dio Gado/Asmaa/Data/manipulated_sequences/Deepfakes/c40/images/Cropped_faces/")
pbar = tqdm(filenames)
Parallel(n_jobs=6)(delayed(f)(folder) for folder in pbar)
filename = int(image_.split('.')[0])
folder_name = int(folder)
                #save_paths = [f'image_{i}.jpg' for i in range(len(frames))]
                #mtcnn(frames, save_path='{}.jpg'.format(output_path))
save_path= os.path.join(output_path,'{:04d}{:04d}_cropped.jpg'.format(filename,folder_name))

HBox(children=(FloatProgress(value=0.0, max=3068.0), HTML(value='')))

NameError: name 'f' is not defined

In [None]:
build_data_set(index,faces)


## Define face detector

The following face detector can detect all faces in a video in approximately 2.8 seconds, allowing all videos in the public test set to be processed in 2.8 * 4000 = 11200 seconds = 3.1 hours.

In [6]:
fast_mtcnn = FastMTCNN(
    stride=10,
    margin=20,
    factor=0.6,
    keep_all=True,
    device=device,
    thresholds=[0.6, 0.7, 0.98])

In [26]:
MTCNN(save_paths)

MTCNN(
  (pnet): PNet(
    (conv1): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1))
    (prelu1): PReLU(num_parameters=10)
    (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    (conv2): Conv2d(10, 16, kernel_size=(3, 3), stride=(1, 1))
    (prelu2): PReLU(num_parameters=16)
    (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
    (prelu3): PReLU(num_parameters=32)
    (conv4_1): Conv2d(32, 2, kernel_size=(1, 1), stride=(1, 1))
    (softmax4_1): Softmax(dim=1)
    (conv4_2): Conv2d(32, 4, kernel_size=(1, 1), stride=(1, 1))
  )
  (rnet): RNet(
    (conv1): Conv2d(3, 28, kernel_size=(3, 3), stride=(1, 1))
    (prelu1): PReLU(num_parameters=28)
    (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (conv2): Conv2d(28, 48, kernel_size=(3, 3), stride=(1, 1))
    (prelu2): PReLU(num_parameters=48)
    (pool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (conv3): Conv2d(48, 64,

## Process all videos

In [7]:
%%time
fast_mtcnn = FastMTCNN(
    stride=10,
    margin=20,
    factor=0.6,
    keep_all=True,
    device=device,
    thresholds=[0.6, 0.7, 0.98])
def mean_detection_prob(prob):
    cnt_p = 0
    sum_p = 0
    for p in prob:
        for pp in p:
            if pp is not None:
                cnt_p += 1
                sum_p += pp
    return sum_p / cnt_p


def get_frames(filename, batch_size=10):
    v_cap = VideoReader(filename, ctx=cpu())
    v_len = len(v_cap)

    frames = []
    for i in range(0, v_len, batch_size):
        batch = v_cap.get_batch(range(i, min(i + batch_size, v_len - 1))).asnumpy()
        frames.extend(batch.copy())
    
    frames = np.array(frames)
    
    del v_cap, v_len, batch
    
    return frames


filenames = glob.glob('/Volumes/MY PASSPORT/manipulated_sequences/DeepFakeDetection/c40/videos/*.mp4')
output_path = "/Volumes/MY PASSPORT/manipulated_sequences/DeepFakeDetection/c40/Images/"

num_faces = 0
probs = []
indexes = []
pbar = tqdm(filenames)
for filename in pbar:
    frames = get_frames(filename)
    save_paths = [f'single_image_{i}.jpg' for i in range(len(frames))]

    faces, prob, index = fast_mtcnn(frames) 
    num_faces += len(faces)
    pbar.set_description(f'Faces found: {num_faces}')

    del frames

HBox(children=(FloatProgress(value=0.0, max=3068.0), HTML(value='')))

KeyboardInterrupt: 

In [32]:
%%time
fast_mtcnn = FastMTCNN(
    stride=10,
    margin=20,
    factor=0.6,
    keep_all=True,
    device=device,
    thresholds=[0.6, 0.7, 0.98])
def mean_detection_prob(prob):
    cnt_p = 0
    sum_p = 0
    for p in prob:
        for pp in p:
            if pp is not None:
                cnt_p += 1
                sum_p += pp
    return sum_p / cnt_p


def get_frames(filename, batch_size=5):
    v_cap = VideoReader(filename, ctx=cpu())
    v_len = len(v_cap)

    frames = []
    for i in range(0, v_len, batch_size):
        batch = v_cap.get_batch(range(i, min(i + batch_size, v_len - 1))).asnumpy()
        frames.extend(batch.copy())
    
    frames = np.array(frames)
    
    del v_cap, v_len, batch
    
    return frames


#filenames = glob.glob('/Volumes/MY PASSPORT/manipulated_sequences/DeepFakeDetection/c40/videos/*.mp4')
filenames = glob.glob('/Volumes/MY PASSPORT/manipulated_sequences/Deepfakes/c23/videos/test/*.mp4')
#output_path = "/Volumes/MY PASSPORT/manipulated_sequences/DeepFakeDetection/c40/Images/"
output_path = '/Volumes/MY PASSPORT/manipulated_sequences/Deepfakes/c23/videos/test/'

num_faces = 0
probs = []
indexes = []
pbar = tqdm(filenames)
mtcnn = MTCNN(margin=10, keep_all=True, post_process=False)

for filename in pbar:
    frames = get_frames(filename)
    #save_path= os.path.join(output_path,'{}_cropped.jpg'.format(filename))
    #mtcnn(frames,save_paths )
    for i in range(len(frames)):
        print(i)
        save_paths = [os.path.join(output_path,('single_image_{}.jpg'.format(i)))]
        mtcnn(frames, save_path=save_paths)
    del frames


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

0
1


KeyboardInterrupt: 

['/Volumes/MY PASSPORT/manipulated_sequences/Deepfakes/c40/videos/test/000_003.mp4']

In [29]:
from joblib import Parallel, delayed
filenames = glob.glob('/Volumes/MY PASSPORT/manipulated_sequences/Deepfakes/c40/videos/test/*.mp4')

def f(folder):
    if folder != '.DS_Store':
        frames = []
        v_cap = VideoReader(filenames, ctx=cpu())
        v_len = len(v_cap)
        frames = []
        for i in range(0, v_len, 10):
            batch = v_cap.get_batch(range(i, min(i + 10, v_len - 1))).asnumpy()
            frames.extend(batch.copy())
    
            frames = np.array(frames)
    
            del v_cap, v_len, batch
            
    # Add to batch
        frames = []
        save_path= os.path.join(output_path,'{:04d}{:04d}_cropped.jpg'.format(filename,folder_name))
        mtcnn(frames, save_path=save_path)
        save_path= os.path.join(output_path,'{:04d}{:04d}_cropped.jpg'.format(filename,folder_name))
mtcnn = MTCNN(margin=20, keep_all=True, post_process=False)
output_path = "/Volumes/MY PASSPORT/manipulated_sequences/Deepfakes/c40/images/Cropped_faces/"

pbar = tqdm(filenames)
print(len(filenames))
Parallel(n_jobs=2)(delayed(f)(folder) for folder in pbar)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

1



TypeError: Don't know how to handle type <class 'list'>