In [1]:
import os
from glob import glob
from skimage import data, io, feature, color, exposure
import numpy as np
import cv2

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
import seaborn as sns
%matplotlib inline

# Paper: Face Recognition Based on HOG and Fast PCA Algorithm

    1- Viola Jones
    2- Resize = 64x128
    3- HOG:
        3780 HOG features
    4- PCA
    5- Normalização: median normalization method (Eq 10)
    

# declare functions

In [2]:
lfw2 = os.path.join('..', 'Data', 'lfw2')
def image_path(person, id_, lfw_folder = lfw2):
    return glob(os.path.join(lfw_folder, person, '*' + id_ + '.jpg'))[0]

In [3]:
'''
Parameters: 
    - img_matrix: (ndarray)
    - title: (string)
Output:
    - image plot
'''
def plt_img(img_matrix, title='Image', normalize=False):
    if normalize:
        plt.imshow(img_matrix, vmin=np.min(img_matrix), vmax=np.max(img_matrix), cmap='gray')
    else:
        io.imshow(img_matrix)
    plt.title(title)
    plt.show()

In [4]:
def plt_two_imgs(img_a, img_b, cmap='gray', normalize=False):
    f = plt.figure(figsize=(12, 8))
    f.add_subplot(1,2, 1)
    if normalize:
        plt.imshow(img_a, vmin=np.min(img_matrix), vmax=np.max(img_matrix), cmap=cmap)
    else:
        plt.imshow(img_a, cmap=cmap)
    f.add_subplot(1,2, 2)
    if normalize:
        plt.imshow(img_b, vmin=np.min(img_matrix), vmax=np.max(img_matrix), cmap=cmap)
    else:
        plt.imshow(img_b, cmap=cmap)
    
    plt.show(block=True)
    
    

In [5]:
'''
Parameters: 
- Path: The image should be in the working directory or a full path of image
should be given;
- color: Second argument is a flag which specifies the way image should be read.
    cv2.IMREAD_COLOR : Loads a color image. Any transparency of image
    will be neglected;
    cv2.IMREAD_GRAYSCALE : Loads image in grayscale mode;
    cv2.IMREAD_UNCHANGED : Loads image as such including alpha channel;
Note Instead of these three flags, you can simply pass integers 1, 0 or -1
respectively.
Output:
- img_array: (ndarray)
'''
def open_img(path, color=0):
    return cv2.imread(path, color)

In [6]:
'''
Parameters:
- path_img: A string representing the file name. The filename must include image format like .jpg, .png, etc.

- img: It is the image that is to be saved (ndarray).

Return Value: It returns true if image is saved successfully.
'''

def save_img(path_img, img):
    cv2.imwrite(path_img, img) 

In [7]:
def getDF(path):
    with open(path) as f:
        file_list = f.readlines()
    n = int(file_list[0].strip())
    df_inicial = pd.read_csv(path, sep='\t', skiprows=1, nrows=n, names=['pair_name_1', 'pair_id_1', 'pair_id_2'])
    df_inicial['pair_name_2'] = None
    df_secondary = pd.read_csv(path, sep='\t', skiprows=n+1, names=['pair_name_1', 'pair_id_1', 'pair_name_2', 'pair_id_2'])
    df = pd.concat([df_inicial, df_secondary])
    df = df.reset_index(drop=True)
    print(df.shape)
    return df

In [8]:
def plt_img_batch(df, show=False, limit=np.inf):
    for index, row in df.iterrows():
        plt_img(open_img(row['path_pair_id_1'], color=0), title=os.path.split(row['path_pair_id_1'])[-1].split('.')[0])
        plt_img(open_img(row['path_pair_id_2'], color=0), title=os.path.split(row['path_pair_id_2'])[-1].split('.')[0])
        if limit == index + 1:
            break

# example

In [9]:
# data_folder = os.path.abspath('..\\data\\')
data_folder = os.path.join('..', 'Data')
train_path = Path(data_folder, 'pairsDevTrain.txt')
test_path = Path(data_folder, 'pairsDevTest.txt')

In [10]:
df_train = getDF(train_path)
df_test = getDF(test_path)

(2200, 4)
(1000, 4)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [12]:
df_train.head(2)

Unnamed: 0,pair_id_1,pair_id_2,pair_name_1,pair_name_2
0,1,2,Aaron_Peirsol,
1,3,4,Aaron_Peirsol,


In [13]:
df_test.head(2)

Unnamed: 0,pair_id_1,pair_id_2,pair_name_1,pair_name_2
0,13,14,Abdullah_Gul,
1,13,16,Abdullah_Gul,


In [None]:
df_train['path_pair_id_1'] = df_train.apply(lambda x: image_path(person=x['pair_name_1'], id_= str(x['pair_id_1'])), axis=1)
df_train['path_pair_id_2'] = df_train.apply(lambda x: image_path(person=x['pair_name_1'], id_= str(x['pair_id_2'])) if x['pair_name_2']==None 
                                            else image_path(person=x['pair_name_2'], id_= str(x['pair_id_2'])), axis=1)

df_test['path_pair_id_1'] = df_test.apply(lambda x: image_path(person=x['pair_name_1'], id_= str(x['pair_id_1'])), axis=1)
df_test['path_pair_id_2'] = df_test.apply(lambda x: image_path(person=x['pair_name_1'], id_= str(x['pair_id_2'])) if x['pair_name_2']==None 
                                          else image_path(person=x['pair_name_2'], id_= str(x['pair_id_2'])), axis=1)

In [None]:
df_train.head()

In [None]:
df_test.head()

# Viola Jones

In [None]:
def plot_rectangle(detected_faces, image, title='Image', cmap_type='gray', kwargs={'lw': 20.}):
    # Create figure and axes
    fig,ax = plt.subplots(1)
    # Display the image
    ax.imshow(image, cmap=cmap_type)
    plt.title(title)
    for (column, row, width, height) in detected_faces:
        rect = Rectangle(
                (column, row),
                width = width,
                height = height,
                fill=False,
                edgecolor='r',
                
                )
        # Add the patch to the Axes
        ax.add_patch(rect)
#     plt.axis('off')
    plt.show()

In [None]:
def crop_image(original_image, column, row, width, height):
    # the goal is crop the biggest area
    return original_image[row:row+height, column:column + width]


### Load the classifier and create a cascade object for face detection

In [None]:
cascade_path = os.path.join('..', 'haarcascades', 'haarcascade_frontalface_alt.xml')
face_cascade = cv2.CascadeClassifier(cascade_path)

## Problem: what image to use?
### Response: Use the biggest area

In [None]:
def crop_biggest_area(original_image, detected_faces):
    
    # the goal is crop the biggest area
    if len(detected_faces) == 0: # viola jones didnt recognize any face
        return original_image, (None, None, original_image.shape[0], original_image.shape[1])
    else:
        # detected_faces returns: column, row, width, height
        # So, assuming all width == height
        # get np.argmax of height
        id_max_max_width = np.argmax(detected_faces[:, -1])
        column, row, width, height = detected_faces[id_max_max_width]
        return crop_image(original_image, column, row, width, height), (column, row, width, height)

## Problem: images with differents shapes
### Response: Use the resize methods. So, cropped all images, then used resize methods to get a standard shape

### Update df_train and df_test

In [None]:
df_train['path_pair_id_1_cropped'] = df_train['path_pair_id_1'].apply(lambda x: x.replace('lfw2', 'lfw2_cropped'))
_ = df_train['path_pair_id_1_cropped'].apply(lambda x: None if os.path.isdir(os.path.split(x)[0]) else os.mkdir(os.path.split(x)[0]))

df_train['path_pair_id_2_cropped'] = df_train['path_pair_id_2'].apply(lambda x: x.replace('lfw2', 'lfw2_cropped'))
_ = df_train['path_pair_id_2_cropped'].apply(lambda x: None if os.path.isdir(os.path.split(x)[0]) else os.mkdir(os.path.split(x)[0]))

In [None]:
df_test['path_pair_id_1_cropped'] = df_test['path_pair_id_1'].apply(lambda x: x.replace('lfw2', 'lfw2_cropped'))
_ = df_test['path_pair_id_1_cropped'].apply(lambda x: None if os.path.isdir(os.path.split(x)[0]) else os.mkdir(os.path.split(x)[0]))

df_test['path_pair_id_2_cropped'] = df_test['path_pair_id_2'].apply(lambda x: x.replace('lfw2', 'lfw2_cropped'))
_ = df_test['path_pair_id_2_cropped'].apply(lambda x: None if os.path.isdir(os.path.split(x)[0]) else os.mkdir(os.path.split(x)[0]))

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
df_train.shape

# Paper: Face Recognition Based on HOG and Fast PCA Algorithm

    1- Viola Jones
    2- Resize = 64x128
    3- HOG:
        3780 HOG features
    4- PCA
    5- Normalização: median normalization method (Eq 10)
    

# Experiments

    1- Viola Jones
    2- Resize = 64x128
    3- HOG:
        3.1- Apply Hog in all images: all_images_data_train (path_pair_id_1_cropped and path_pair_id_1_cropped) concatenat by rows without duplicates -> result n x 3780
        3.2- Apply Hog for each pair: data_train[path_pair_id_1_cropped] and data_train[path_pair_id_2_cropped] -> result tow matrices: 2200 x 3780 
        3.3- Apply Hog for each pair: data_test[path_pair_id_1_cropped] and data_test[path_pair_id_2_cropped] -> result tow matrices: 1000 x 3780 
    4- PCA
        - 4.1: Fit pca in data step 3.1
        - 4.2:Transform data_train[path_pair_id_1_cropped]
        - 4.3:Transform data_train[path_pair_id_2_cropped]
        - 4.4:Transform data_test[path_pair_id_1_cropped]
        - 4.5:Transform data_test[path_pair_id_2_cropped]
    5- Append:
        - Append by columns 4.2 and 4.3
        - Append by columns 4.4 and 4.5
    
    6- Normalização: median normalization method (Eq 10)
    

### Steps 1 and 2

In [None]:
def preprocessing(path_image, path_to_save, dim=(100, 100)):
    original_image = open_img(path_image, color=0)
    grayscale_image = original_image.copy()
    detected_faces = face_cascade.detectMultiScale(grayscale_image)# step 1
    cropped_image, (column, row, width, height) = crop_biggest_area(original_image, detected_faces)
    resized = cv2.resize(cropped_image, dim, interpolation = cv2.INTER_AREA) #step 2
    save_img(path_img=path_to_save, img=resized)
    return (column, row, width, height)

### get dimensions VJ and apply pre-processing

In [None]:
df_train['VJ_pair_id_1'] = df_train.apply(lambda x: preprocessing(path_image=x['path_pair_id_1'], path_to_save=x['path_pair_id_1_cropped'], dim=(64,128)), axis=1)
df_train['VJ_pair_id_2'] = df_train.apply(lambda x: preprocessing(path_image=x['path_pair_id_2'], path_to_save=x['path_pair_id_2_cropped'], dim=(64,128)), axis=1)

df_test['VJ_pair_id_1'] = df_test.apply(lambda x: preprocessing(path_image=x['path_pair_id_1'], path_to_save=x['path_pair_id_1_cropped'], dim=(64,128)), axis=1)
df_test['VJ_pair_id_2'] = df_test.apply(lambda x: preprocessing(path_image=x['path_pair_id_2'], path_to_save=x['path_pair_id_2_cropped'], dim=(64,128)), axis=1)

In [None]:
df_train.head()

In [None]:
test_img = open_img(df_train.loc[0, 'path_pair_id_1_cropped'])
print(test_img.shape)
plt_img(test_img)

### Step 3: apply HOG in all images - without duplicates

### Step: 3.1 - Apply Hog in all images: all_images_data_train (path_pair_id_1_cropped and path_pair_id_1_cropped) concatenat by rows without duplicates -> result 3443 x 3780

In [None]:
df_train_1 = df_train.path_pair_id_1_cropped
df_train_2 = df_train.path_pair_id_2_cropped
df_train_all_images_unique = pd.concat([df_train_1, df_train_2]).unique()

In [None]:
df_train_all_images_unique.shape

In [None]:
df_train_all_images_unique_HOG = np.array([feature.hog(open_img(img), orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), 
                                  block_norm='L2', visualize=False, transform_sqrt=False, feature_vector=True, 
                                  multichannel=False)
                      for img in df_train_all_images_unique])

In [None]:
df_train_all_images_unique_HOG.shape

### Step: 3.2- Apply Hog for each pair: data_train[path_pair_id_1_cropped] and data_train[path_pair_id_2_cropped] -> result tow matrices: 2200 x 3780 

In [None]:
df_train.head(2)

In [None]:
X_train_1_HOG = np.array([feature.hog(open_img(img_1), orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), 
                                  block_norm='L2', visualize=False, transform_sqrt=False, feature_vector=True, 
                                  multichannel=False)
                      for img_1 in df_train.loc[:,'path_pair_id_1_cropped'].values])

print(X_train_1_HOG.shape)

In [None]:
X_train_2_HOG = np.array([feature.hog(open_img(img_2), orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), 
                                  block_norm='L2', visualize=False, transform_sqrt=False, feature_vector=True, 
                                  multichannel=False)
                      for img_2 in df_train.loc[:,'path_pair_id_2_cropped'].values])
print(X_train_2_HOG.shape)

In [None]:
y_train = np.array([[1 if par==None else 0 for par in df_train.loc[:,'pair_name_2'].values]]).T

In [None]:
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

### Step: 3.3- Apply Hog for each pair: data_test[path_pair_id_1_cropped] and data_test[path_pair_id_2_cropped] -> result tow matrices: 1000 x 3780 

In [None]:
df_test.head(2)

In [None]:
X_test_1_HOG = np.array([feature.hog(open_img(img_1), orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), 
                                  block_norm='L2', visualize=False, transform_sqrt=False, feature_vector=True, 
                                  multichannel=False)
                      for img_1 in df_test.loc[:,'path_pair_id_1_cropped'].values])

print(X_test_1_HOG.shape)

In [None]:
X_test_2_HOG= np.array([feature.hog(open_img(img_2), orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), 
                                  block_norm='L2', visualize=False, transform_sqrt=False, feature_vector=True, 
                                  multichannel=False)
                      for img_2 in df_test.loc[:,'path_pair_id_2_cropped'].values])

print(X_test_2_HOG.shape)

In [None]:
y_test = np.array([[1 if par==None else 0 for par in df_test.loc[:,'pair_name_2'].values]]).T

In [None]:
unique, counts = np.unique(y_test, return_counts=True)
dict(zip(unique, counts))

## Step 4: PCA

### Step 4.1: Fit pca in data step 3.1

In [None]:
pca = PCA().fit(df_train_all_images_unique_HOG)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
np.sum(pca.explained_variance_ratio_[:250])

In [None]:
np.sum(pca.explained_variance_ratio_[:300])

In [None]:
pca = PCA(n_components=300).fit(df_train_all_images_unique_HOG)

In [None]:
pca.get_params

#### Step: 4.2:Transform data_train[path_pair_id_1_cropped]
#### Step: 4.3:Transform data_train[path_pair_id_2_cropped]
#### Step: 4.4:Transform data_test[path_pair_id_1_cropped]
#### Step: 4.5:Transform data_test[path_pair_id_2_cropped]

In [None]:
X_train_1_HOG_PCA = pca.transform(X_train_1_HOG)
X_train_1_HOG_PCA.shape

In [None]:
X_train_2_HOG_PCA = pca.transform(X_train_2_HOG)
X_train_2_HOG_PCA.shape

In [None]:
X_test_1_HOG_PCA = pca.transform(X_test_1_HOG)
X_test_1_HOG_PCA.shape

In [None]:
X_test_2_HOG_PCA = pca.transform(X_test_2_HOG)
X_test_2_HOG_PCA.shape

### 5- Append:
    - Append by columns 4.2 and 4.3
    - Append by columns 4.4 and 4.5

In [None]:
X_train_HOG_PCA = np.append(X_train_1_HOG_PCA, X_train_2_HOG_PCA, axis=1)
print(X_train_HOG_PCA.shape)

In [None]:
X_test_HOG_PCA = np.append(X_test_1_HOG_PCA, X_test_2_HOG_PCA, axis=1)
print(X_test_HOG_PCA.shape)

### shuffle data

In [None]:
arr = np.arange(y_train.shape[0])
np.random.shuffle(arr)
X = X_train_PCA[arr]
y_d = y_train[arr]

In [None]:
print(X.shape)
print(y_d.shape)

## Grid SVM

In [None]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


In [None]:
param_grid = [
  {'C': [1, 5], 'degree': np.arange(1, 5, 2), 'kernel': ['poly']}
 ]
grid = GridSearchCV(SVC(), scoring='accuracy', n_jobs=-1, param_grid=param_grid, verbose=10)
grid.fit(X=X_train_HOG_PCA, y=y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
clf_best = grid.best_estimator_

In [None]:
clf_best

In [None]:
clf_best.fit(X=X_train_HOG_PCA, y=y_train)

In [None]:
y_pred_test = clf_best.predict(X_test_HOG_PCA)

In [None]:
accuracy_score(y_true=y_test, y_pred=y_pred_test)

In [None]:
y_pred_test

# MLP

In [None]:
parameters = {'solver': ['sgd'], 'max_iter': [1000, 2000, 5000], 
              'alpha': 10.0 ** -np.arange(2, 5), 'hidden_layer_sizes':np.arange(10, 100, 5), 'activation':['logistic']}
clf = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1, verbose=10, scoring='accuracy')
clf.fit(X=X_train_HOG_PCA, y=y_train)

In [None]:
clf.best_score_

In [None]:
clf.best_params_

In [None]:
best_clf_ = clf.best_estimator_
best_clf_

In [None]:
best_clf_.fit(X=X_train_HOG_PCA, y=y_train)

In [None]:
y_pred_test_ = best_clf_.predict(X_test_HOG_PCA)

In [None]:
accuracy_score(y_true=y_test, y_pred=y_pred_test_)

In [None]:
y_pred_test_