# Facial Recognition Workbook <a class="tocSkip">

It's time to get your hands dirty and create your own facial recognition model using your own dataset!

Form a group of two or three people, then use this workbook to generate interesting insights such as:

1. Based on the eigenfaces, which facial features can you look at to effectively distinguish the faces of the members of your group?
2. Which of your parents do you resemble the most?
3. Which of your newfound friends/future classmates do you resemble the most?

## Data Preparation

Step 0: Uncomment and run the cell below to create the data folders and download the cascade model configuration.

In [None]:
# !mkdir data
# !mkdir data/raw
# !wget https://raw.githubusercontent.com/aim-msds/bsdsba-trial-lectures/main/face-recognition/haarcascade_frontalface_default.xml

### Image Uploading

Step 1: Upload to the `data/raw` folder the pictures that you will use as your dataset to compute the principal components and eigenfaces.

### Face detection and image cropping

Step 2: Execute the pipeline below to automatically crop face images from the uploaded raw images.

In [1]:
import os
from glob import glob

import cv2
import numpy as np

In [None]:
# --- Input and output locations ---
# Folder holding the uploaded raw pictures
RAW_DIRPATH = './data/raw'
# Folder that receives the cropped face images; created here if missing
OUTPUT_DIRPATH = './data/processed'
os.makedirs(OUTPUT_DIRPATH, exist_ok=True)
# Cascade definition file loaded by the face detector
CASCADE_MODEL_FILEPATH = './haarcascade_frontalface_default.xml'

# --- Face detector settings (forwarded to `detectMultiScale`) ---
scaleFactor, minNeighbors = 1.2, 5
minSize = (30, 30)

# Size that every cropped face is resized to before being saved
outputSize = (128, 128)

In [None]:
# Collect every entry directly under the raw directory, in sorted order
image_filepaths = glob(f'{RAW_DIRPATH}/*')
image_filepaths.sort()

In [None]:
# Initialize our face detector model
cascade_model = cv2.CascadeClassifier(CASCADE_MODEL_FILEPATH)

# `CascadeClassifier` does not raise when the file is missing or invalid; it
# simply stays empty and only fails later inside `detectMultiScale`. Check
# now so that the error points at the actual cause.
if cascade_model.empty():
    raise OSError(
        f'Could not load the cascade model from {CASCADE_MODEL_FILEPATH!r}')

# Iterate over the entries found in the RAW_DIRPATH folder
for i, image_filepath in enumerate(image_filepaths):
    # `cv2.imread` returns None (it does not raise) when the path cannot be
    # decoded as an image, e.g. a sub-folder or a hidden/system file
    image = cv2.imread(image_filepath)
    if image is None:
        print(f'Skipping unreadable file: {image_filepath}')
        continue

    # Convert to gray scale before running the detector
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Get the detected faces using the model with specified settings
    faces = cascade_model.detectMultiScale(
        gray, scaleFactor=scaleFactor, minNeighbors=minNeighbors,
        minSize=minSize, flags=cv2.CASCADE_SCALE_IMAGE)

    # Iterate over each detected face, crop it, resize it, then save it in
    # the output directory as `<image number>-<face number>.jpg`. The image
    # number is the position in `image_filepaths`, so skipped files leave a
    # gap instead of shifting the numbering.
    for j, (x, y, width, height) in enumerate(faces):
        cropped_image = gray[y:y + height, x:x + width]
        cropped_resized = cv2.resize(cropped_image, outputSize)
        output_filepath = os.path.join(
            OUTPUT_DIRPATH, f'{i + 1:02}-{j + 1:02}.jpg')
        cv2.imwrite(output_filepath, cropped_resized)

Step 3: Check the `data/processed` folder and ensure that it is clean and contains only cropped face images.

### Creating the data matrix

Step 4: Create the data matrix by reading each cropped face in the `data/processed` folder.

In [None]:
# Folder containing the cropped face images
PROCESSED_DIRPATH = './data/processed'

# Every entry directly under the processed folder, in sorted order
processed_image_filepaths = glob(f'{PROCESSED_DIRPATH}/*')
processed_image_filepaths.sort()

In [None]:
# Build the data matrix: one row per image, each row holding the flattened
# gray scale pixel values divided by 255
data = []

for image_filepath in processed_image_filepaths:
    cur_image = cv2.imread(image_filepath, cv2.IMREAD_GRAYSCALE)

    # `cv2.imread` returns None (it does not raise) for files it cannot
    # decode; skip those instead of failing on `None / 255.`
    if cur_image is None:
        print(f'Skipping unreadable file: {image_filepath}')
        continue

    data.append(cur_image.flatten() / 255.)

## Principal Component Analysis

Step 5: Compute the principal components of the dataset using `PCA`.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Keep as many principal components as there are images in the data matrix
pca = PCA(n_components=len(data), random_state=1337)

# Fit the model and project the data onto the principal components
reduced_data = pca.fit_transform(data)

# Fraction of the total variance explained by each component
explained_variance = pca.explained_variance_ratio_
# Principal axes of the fitted model, one row per component (the eigenfaces)
principal_components = pca.components_

Step 6: Plot the eigenfaces, then interpret the result.

In [2]:
import matplotlib.pyplot as plt

def plot_eigenfaces(
    principal_components, explained_variance, num_components_to_plot=10,
    figsize=(16, 6)):
    """Plot the eigenfaces annotated with their corresponding explained
    variance

    The eigenfaces are laid out on a grid that is five columns wide, with
    as many rows as needed. Unused grid cells are left blank.

    Parameters
    ----------
    principal_components : 2D array
        Principal components, one per row. Each row is reshaped into a
        square image, so its length is expected to be a perfect square.
    explained_variance : sequence of float
        Fraction of variance explained by each component, in the same
        order as `principal_components`
    num_components_to_plot : int
        Maximum number of eigenfaces to plot. Capped at the number of
        available components.
    figsize : tuple
        Figure size forwarded to `plt.subplots`

    Raises
    ------
    ValueError
        If there is no component to plot
    """
    num_columns = 5

    # Never ask for more components than were actually provided
    num_components_to_plot = min(
        num_components_to_plot, len(principal_components),
        len(explained_variance))
    if num_components_to_plot < 1:
        raise ValueError("There are no principal components to plot")

    # Round up so that a count that is not a multiple of `num_columns`
    # still gets enough rows
    num_rows = -(-num_components_to_plot // num_columns)

    # squeeze=False keeps `axes` a 2D array whatever the grid shape
    fig, axes = plt.subplots(
        num_rows, num_columns, figsize=figsize, squeeze=False)
    flat_axes = axes.flatten()

    for i in range(num_components_to_plot):
        ax = flat_axes[i]
        # Plot the magnitude of each pixel's weight in the component
        cur_component = abs(principal_components[i, :])
        cur_explained_variance = round(explained_variance[i] * 100, 2)
        img_dim = int(np.sqrt(len(cur_component)))
        cur_component = cur_component.reshape((img_dim, img_dim))
        ax.imshow(cur_component)
        ax.set_title(f"PCA {i + 1} ({cur_explained_variance}%)", fontsize=12, weight='bold')

    # Hide the axis decorations everywhere, including unused cells
    for ax in flat_axes:
        ax.axis('off')

In [None]:
# Plot the leading eigenfaces using the default grid settings
plot_eigenfaces(
    principal_components=principal_components,
    explained_variance=explained_variance)

## Facial Recognition Model Building

In [None]:
from scipy.spatial.distance import cosine

class FacialRecognitionModel:
    """A machine learning model that predicts which face a given test
    input is based on reference images

    Each reference image is projected with the trained PCA model. A test
    image is projected the same way and assigned the label of the
    reference image whose projection is closest to it.
    """
    def __init__(self, ref_images, pca):
        """Initialize the facial recognition model

        Parameters
        ----------
        ref_images : dict
            Dictionary containing the label and image filepath of the
            reference images
        pca : trained sklearn PCA model
            Trained sklearn PCA model using the relevant several images

        Raises
        ------
        OSError
            If a reference image cannot be read
        """
        # NOTE(review): the images presumably need the same dimensions as
        # the images used to fit `pca` -- confirm against the data
        # preparation step.

        # Initialize attributes of this object
        self.ref_images = ref_images
        self.pca = pca
        self.images = {}
        self.labels = []
        self.pcas = {}

        # Generate and compute for the items needed to deploy the model
        for label, image_filepath in ref_images.items():
            # Read first so that a bad path leaves no half-registered label
            image = self._read_image(image_filepath)

            self.labels.append(label)
            self.images[label] = image
            self.pcas[label] = self._project(image)

    @staticmethod
    def _read_image(image_filepath):
        """Read an image as gray scale, failing loudly if it is unreadable"""
        # `str` lets callers pass a `Path`; `cv2.imread` returns None instead
        # of raising when the file is missing or cannot be decoded
        image = cv2.imread(str(image_filepath), cv2.IMREAD_GRAYSCALE)
        if image is None:
            raise OSError(f"Could not read image: {image_filepath}")
        return image

    def _project(self, image):
        """Return the PCA vector of a gray scale image array"""
        # Same flatten-and-divide-by-255 preprocessing for reference and
        # test images, using the PCA model held by this instance
        return self.pca.transform(image.flatten()[np.newaxis, :] / 255.)[0]

    def predict(self, test_image_filepath, metric=cosine):
        """Predict the label of the test image given its filepath.

        Generates the test image along with the prediction annotated
        with the distance of the test image with the reference images

        Parameters
        ----------
        test_image_filepath : str or Path
            Filepath of the test image
        metric : function
            Function that takes in two vectors then returns the distance
            between two vectors

        Returns
        -------
        prediction : hashable
            Label of the reference image with the smallest distance to the
            test image

        Raises
        ------
        ValueError
            If the model has no reference image
        OSError
            If the test image cannot be read
        """
        if not self.labels:
            raise ValueError("The model has no reference images")

        # Read the test image and compute for its PCA using this model's
        # own PCA rather than a global variable
        test_image = self._read_image(test_image_filepath)
        test_pca = self._project(test_image)

        # Plot the test image
        fig, ax = plt.subplots(figsize=(5, 5))
        ax.imshow(test_image, cmap='gray')
        ax.set_title("Test image", fontsize=14, weight='bold')
        ax.axis('off')

        # Plot the reference images along with its distance with the
        # test image. squeeze=False keeps `axes` an array even when there
        # is a single reference image.
        fig, axes = plt.subplots(
            1, len(self.images), figsize=(16, 5), squeeze=False)

        distances = {}
        for ax, label in zip(axes.flatten(), self.labels):
            distance = metric(self.pcas[label], test_pca)
            distances[label] = distance

            ax.imshow(self.images[label], cmap='gray')
            ax.axis('off')
            ax.set_title(f"{label} (dist: {distance:0.4f})", fontsize=14, weight='bold')

        # Closest reference wins; ties go to the first label registered
        prediction = min(self.labels, key=distances.get)
        fig.suptitle(f"Prediction: {prediction}", fontsize=16, weight='bold')

        return prediction

Step 7: Create a facial recognition model by specifying the image filepaths of your reference images.

In [None]:
# Map each label to the filepath of its reference image; replace the
# placeholders below with your own labels and filepaths
ref_images = {
    'Label_1': '<filepath of reference image 1>',
    'Label_2': '<filepath of reference image 2>',
}

# Build the model from the reference images and the trained PCA
model = FacialRecognitionModel(ref_images=ref_images, pca=pca)

Step 8: Make a prediction on a given test image, then evaluate the result.

In [None]:
# Filepath of the image to identify; replace the placeholder with your own
test_image_filepath = '<filepath of test image>'

# Plot the test image, the reference images and the predicted label
model.predict(test_image_filepath=test_image_filepath)