In [1]:
import os
import math
import random
from PIL import Image

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as T

In [4]:
# File-name suffixes (including the leading dot) that count as images.
IMG_EXTENSIONS = [".jpg", ".JPG", ".jpeg", ".JPEG", ".png", ".PNG",
                  ".ppm", ".PPM", ".bmp", ".BMP"]


def is_image_file(filename):
    """Return True if ``filename`` ends with one of ``IMG_EXTENSIONS``.

    The comparison ignores case, so mixed-case suffixes such as ``.Jpg`` or
    ``.Png`` are accepted in addition to the all-lower and all-upper
    spellings listed in ``IMG_EXTENSIONS``.

    Args:
        filename: File name or path as a string.

    Returns:
        bool: True when the name carries a recognised image extension.
    """
    # str.endswith accepts a tuple, so a single call checks every suffix.
    suffixes = tuple(ext.lower() for ext in IMG_EXTENSIONS)
    return filename.lower().endswith(suffixes)

In [8]:
class BaseAugmentation:
    """Baseline preprocessing pipeline: resize, convert to tensor, normalize."""

    def __init__(self, resize, mean, std, **args):
        """Assemble the transform pipeline.

        Args:
            resize: Target size handed to ``T.Resize``.
            mean: Mean handed to ``T.Normalize``.
            std: Standard deviation handed to ``T.Normalize``.
            **args: Extra keyword arguments; accepted and ignored.
        """
        steps = [
            T.Resize(resize, Image.BILINEAR),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std),
        ]
        self.transform = T.Compose(steps)

    def __call__(self, image):
        """Run ``image`` through the pipeline and return the result."""
        pipeline = self.transform
        return pipeline(image)

In [11]:
class AddGaussianNoise(object):
    """Transform that adds Gaussian noise to a tensor.

    Features that the built-in transforms do not offer can be written like
    this, by implementing ``__init__``, ``__call__`` and ``__repr__``
    yourself.
    """

    def __init__(self, mean=0., std=1.):
        """Store the noise parameters.

        Args:
            mean: Offset added to every element.
            std: Scale applied to the standard-normal sample.
        """
        self.std = std
        self.mean = mean

    def __call__(self, tensor):
        """Return ``tensor`` plus a standard-normal sample scaled by ``std`` and shifted by ``mean``.

        Args:
            tensor: ``torch.Tensor`` of any shape.

        Returns:
            A new tensor with the same shape as ``tensor``; the input itself
            is not modified.
        """
        # Sample on the input's device so the addition also works when the
        # tensor does not live on the default (CPU) device.
        noise = torch.randn(tensor.size(), device=tensor.device)
        return tensor + noise * self.std + self.mean

    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

In [12]:
class CustomAugmentation:
    """Augmenting pipeline: center crop, resize, color jitter, to-tensor, normalize, noise."""

    def __init__(self, resize, mean, std, **args):
        """Assemble the transform pipeline.

        Args:
            resize: Target size handed to ``T.Resize``.
            mean: Mean handed to ``T.Normalize``.
            std: Standard deviation handed to ``T.Normalize``.
            **args: Extra keyword arguments; accepted and ignored.
        """
        steps = [
            T.CenterCrop((320, 256)),
            T.Resize(resize, Image.BILINEAR),
            T.ColorJitter(0.1, 0.1, 0.1, 0.1),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std),
            # Noise is added last, with AddGaussianNoise's default parameters.
            AddGaussianNoise(),
        ]
        self.transform = T.Compose(steps)

    def __call__(self, image):
        """Run ``image`` through the pipeline and return the result."""
        pipeline = self.transform
        return pipeline(image)

In [31]:
# NOTE(review): these imports sit in a later cell rather than the first one;
# `T` is the same alias that the first cell already imports.
from facenet_pytorch import MTCNN, InceptionResnetV1
import torchvision.transforms as T

# Requires the third-party package: pip install facenet-pytorch
# Tip: running `?MTCNN` in IPython displays the class documentation.

# Detector used by the cells below. NOTE(review): post_process=False presumably
# disables normalization of returned face crops and keep_all=True presumably
# keeps every detected face instead of only one — confirm in the
# facenet-pytorch documentation.
mtcnn  = MTCNN(post_process=False, keep_all = True)
# Second model, switched to eval mode; it is not used in the cells shown here.
resnet = InceptionResnetV1(pretrained='vggface2').eval()

# Build the path of a single sample image:
# <path_to_data>/<image_path>/<file_names[0]><extensions[0]>
path_to_data = "input/data/train/images/"
image_path = "000001_female_Asian_45"
file_names = ["normal", "mask1", "mask2", "mask3", "mask4", "mask5", "incorrect_mask"]
extensions = [".jpg"]
path = os.path.join(path_to_data, image_path, file_names[0] + extensions[0])

img = Image.open(path)

# With landmarks=True, detect() returns three values, unpacked here as
# boxes, probs and points.
boxes, probs, points = mtcnn.detect(img, landmarks=True)

print(boxes, probs, points)

[[120.788864 152.77504  246.0723   329.3873  ]] [0.99389887] [[[153.74947 224.46916]
  [214.21593 221.98465]
  [184.88826 260.15286]
  [160.76108 291.51437]
  [214.89313 288.51117]]]


In [16]:
# Print the detector's module tree (its sub-networks and their layers).
print(mtcnn)

MTCNN(
  (pnet): PNet(
    (conv1): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1))
    (prelu1): PReLU(num_parameters=10)
    (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    (conv2): Conv2d(10, 16, kernel_size=(3, 3), stride=(1, 1))
    (prelu2): PReLU(num_parameters=16)
    (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
    (prelu3): PReLU(num_parameters=32)
    (conv4_1): Conv2d(32, 2, kernel_size=(1, 1), stride=(1, 1))
    (softmax4_1): Softmax(dim=1)
    (conv4_2): Conv2d(32, 4, kernel_size=(1, 1), stride=(1, 1))
  )
  (rnet): RNet(
    (conv1): Conv2d(3, 28, kernel_size=(3, 3), stride=(1, 1))
    (prelu1): PReLU(num_parameters=28)
    (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (conv2): Conv2d(28, 48, kernel_size=(3, 3), stride=(1, 1))
    (prelu2): PReLU(num_parameters=48)
    (pool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (conv3): Conv2d(48, 64,

## Proposal Network (p-net)

![img-pnet](https://miro.medium.com/max/1400/1*6xkYymO5qetLLjUt0MYJXg.jpeg)

* This first stage is a fully convolutional network (FCN). The difference between a CNN and an FCN is that a fully convolutional network does not use a dense layer as part of the architecture.

* This Proposal Network is used to obtain candidate windows and their bounding box regression vectors. [Reference](https://medium.com/@iselagradilla94/multi-task-cascaded-convolutional-networks-mtcnn-for-face-detection-and-facial-landmark-alignment-7c21e8007923)

* Bounding box regression is a popular technique to predict the localization of boxes when the goal is detecting an object of some pre-defined class, in this case faces. 

* The final output of this stage is all candidate windows after refinement to downsize the volume of candidates.

## Refine Network (r-net)

![img-rnet](https://miro.medium.com/max/1400/1*PoMst7LfCfRSADzSFHXIJg.jpeg)

* All candidates from the P-Net are fed into the Refine Network. Notice that this network is a CNN, not an FCN like the one before, since there is a dense layer at the last stage of the network architecture.

* The R-Net further reduces the number of candidates, performs calibration with bounding box regression and employs non-maximum suppression (NMS) to merge overlapping candidates.

* The R-Net outputs whether the input is a face or not, a 4-element vector which is the bounding box for the face, and a 10-element vector for facial landmark localization.

## Output Network (o-net)

![img-onet](https://miro.medium.com/max/1400/1*GEHEFApb0VF9poTIh1Bmng.jpeg)

* This stage is similar to the R-Net, but this Output Network aims to describe the face in more detail and output the five facial landmarks’ positions for eyes, nose and mouth.

* There are five landmarks: left eye, right eye, nose, left mouth corner and right mouth corner.

## Regarding Outputs

* face classification (2개)

  * y^det = GT에서 얼굴이 있는지 여부(있을때 1, 없을때 0)

  * p = 얼굴이 있을 확률

* bbox regression (4개)

  * 예측한 bbox의 왼쪽상단 x,y좌표

  * 예측한 bbox의 너비와 높이

* face landmark localization (10개)

  * 왼쪽 눈의 x,y 좌표

  * 오른쪽 눈의 x,y 좌표

  * 코의 x,y 좌표

  * 입의 왼쪽 끝 부분의 x,y 좌표

  * 입의 오른쪽 끝 부분의 x,y 좌표

In [23]:
# List every parameter of the detector next to its size, one per line.
for name, layer in mtcnn.named_parameters():
    print(f"{name}  \t  {layer.size()}")

pnet.conv1.weight  	  torch.Size([10, 3, 3, 3])
pnet.conv1.bias  	  torch.Size([10])
pnet.prelu1.weight  	  torch.Size([10])
pnet.conv2.weight  	  torch.Size([16, 10, 3, 3])
pnet.conv2.bias  	  torch.Size([16])
pnet.prelu2.weight  	  torch.Size([16])
pnet.conv3.weight  	  torch.Size([32, 16, 3, 3])
pnet.conv3.bias  	  torch.Size([32])
pnet.prelu3.weight  	  torch.Size([32])
pnet.conv4_1.weight  	  torch.Size([2, 32, 1, 1])
pnet.conv4_1.bias  	  torch.Size([2])
pnet.conv4_2.weight  	  torch.Size([4, 32, 1, 1])
pnet.conv4_2.bias  	  torch.Size([4])
rnet.conv1.weight  	  torch.Size([28, 3, 3, 3])
rnet.conv1.bias  	  torch.Size([28])
rnet.prelu1.weight  	  torch.Size([28])
rnet.conv2.weight  	  torch.Size([48, 28, 3, 3])
rnet.conv2.bias  	  torch.Size([48])
rnet.prelu2.weight  	  torch.Size([48])
rnet.conv3.weight  	  torch.Size([64, 48, 2, 2])
rnet.conv3.bias  	  torch.Size([64])
rnet.prelu3.weight  	  torch.Size([64])
rnet.dense4.weight  	  torch.Size([128, 576])
rnet.dense4.bias  	  tor

In [38]:
class PrintOutput:
    """Forward hook that prints and records every output of the hooked module."""

    def __init__(self):
        # Outputs seen so far, in the order the hook was called.
        self.outputs = list()

    def __call__(self, module, module_in, module_out):
        """Hook entry point: remember ``module_out`` and print it with its size.

        Args:
            module: The module the hook fired on (unused).
            module_in: The module's input (unused).
            module_out: The module's output; must provide ``.size()``.

        Returns:
            None.
        """
        self.outputs.append(module_out)
        print(module_out)
        out_size = module_out.size()
        print(out_size)

    def clear(self):
        """Forget the recorded outputs by binding a fresh empty list."""
        self.outputs = list()

print_output = PrintOutput()

# Re-running this cell must not stack a second set of hooks on the same
# layers (each output would then be printed and recorded several times), so
# hooks left over from a previous run are removed first.
for _stale_handle in globals().get("hook_handles", []):
    _stale_handle.remove()

# Hook the three final dense layers of the O-Net. The handles are kept so the
# hooks can be detached later with `for h in hook_handles: h.remove()`.
hook_handles = [
    head.register_forward_hook(print_output)
    for head in (mtcnn.onet.dense6_3, mtcnn.onet.dense6_2, mtcnn.onet.dense6_1)
]

<torch.utils.hooks.RemovableHandle at 0x7fad7c9b6c10>

In [39]:
# Run detection again: the forward hooks registered on mtcnn.onet.dense6_1/2/3
# fire during this call and print each hooked layer's output and size.
boxes, probs, points = mtcnn.detect(img, landmarks=True)

tensor([[-2.3097,  2.3100],
        [-1.9099,  1.9099],
        [-2.5467,  2.5465],
        [-1.6195,  1.6188],
        [-2.0021,  2.0008]])
torch.Size([5, 2])
tensor([[ 0.1471,  0.0575, -0.1350,  0.0606],
        [ 0.1081, -0.0103, -0.1224,  0.0411],
        [ 0.1388, -0.0591, -0.1229, -0.0207],
        [ 0.1100, -0.0169, -0.1379,  0.0118],
        [ 0.1372,  0.0144, -0.1612, -0.0298]])
torch.Size([5, 4])
tensor([[0.3236, 0.6814, 0.5125, 0.3736, 0.6825, 0.4608, 0.4430, 0.6687, 0.8488,
         0.8313],
        [0.3051, 0.6690, 0.4886, 0.3549, 0.6695, 0.4170, 0.3984, 0.6276, 0.8139,
         0.7975],
        [0.3373, 0.6909, 0.5194, 0.3783, 0.6948, 0.3659, 0.3514, 0.5745, 0.7579,
         0.7403],
        [0.3024, 0.6523, 0.4771, 0.3427, 0.6616, 0.3903, 0.3789, 0.6029, 0.7830,
         0.7709],
        [0.3021, 0.6275, 0.4648, 0.3418, 0.6390, 0.3891, 0.3761, 0.5836, 0.7576,
         0.7422]])
torch.Size([5, 10])


In [40]:
# Show the three values returned by detect(), each starting on its own line.
print(boxes, probs, points, sep = "\n")

[[120.788864 152.77504  246.0723   329.3873  ]]
[0.99389887]
[[[153.74947 224.46916]
  [214.21593 221.98465]
  [184.88826 260.15286]
  [160.76108 291.51437]
  [214.89313 288.51117]]]
