First of all, we import the modules that are required for preprocessing the data (images, in this case).

In [None]:
import numpy as np
import cv2, warnings, matplotlib
import matplotlib.pyplot as plt
# Suppress any warning whose message matches the regex '.*do not*'.
warnings.filterwarnings('ignore', '.*do not*')
%matplotlib inline

### Preparing Data

It is assumed that a person is recognized or identified by reading or observing his/her face. When we consider the face to recognize, it is obvious that we basically read the two eyes on the face.

In this project, Python modules are used to detect face, then eyes. If the image does not have a face that can be viewed clearly or if the face is obstructed or the two eyes are not clearly visible, we ignore or discard that image.

Here is an experiment in which we load an example image as a NumPy array, in preparation for a machine learning classification model.

In [None]:
# Read the sample image from disk into a NumPy array.
img = cv2.imread('./test_images/federer1.jpg')

By default, the cv2 module loads a color image as a NumPy array in BGR (Blue-Green-Red) channel order. The array is therefore three-dimensional: the y (row) and x (column) pixel coordinates plus the BGR color channels. If it were a gray image, the array would be two-dimensional. The data here is three-dimensional.

In [None]:
# Display the dimensions of the loaded image array.
img.shape 

In [None]:
# Show the image as loaded; the channels are still in cv2's BGR order here,
# while matplotlib interprets them as RGB, so the colors appear swapped.
plt.imshow(img)

Since the image has more of the unwanted area, we crop it out so as to keep only the required portion by using numpy array properties.

In [None]:
# Keep only the region of interest: rows 0-599 and columns 850-1399.
crop_rows, crop_cols = slice(0, 600), slice(850, 1400)
img = img[crop_rows, crop_cols]
plt.imshow(img)

Let's have the RGB version of the same image by using the module that we imported.

In [None]:
# Convert the BGR array to RGB so matplotlib renders the true colors.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img_rgb)

As a matter of fact, the image can also be converted to RGB by reversing the channel axis of the NumPy array, as shown below.

In [None]:
# BGR converted to RGB by reversing the last (channel) axis of the array.
plt.imshow(img[:,:,::-1])

Let's also have the gray image data, it is two dimensional.

In [None]:
# Single-channel grayscale version of the cropped image; the cascade
# detectors below are run on this array.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
plt.imshow(gray, cmap='gray')

In [None]:
# Display the dimensions of the grayscale array (no channel axis).
gray.shape

Detect faces and eyes using the OpenCV library and the Haar cascade technique. It gives us the rectangular regions where faces and eyes lie, returned as a list of lists if it sees multiple faces. For each face, there is a list of four elements representing the x / y coordinates, width and height of the detected rectangle. Then, the technique to detect eyes will work on each face.

In [None]:
# Load the pre-trained Haar cascade detectors for frontal faces and for eyes.
face_cascade = cv2.CascadeClassifier(
    './opencv/haarcascades/haarcascade_frontalface_default.xml'
)
eye_cascade = cv2.CascadeClassifier(
    './opencv/haarcascades/haarcascade_eye.xml'
)

# Detect faces on the grayscale image; each hit is an (x, y, w, h) rectangle.
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
faces

By definition, the four elements in the above NumPy array represent the x-position, y-position, width and height of the detected face rectangle.

In [None]:
# Unpack the first detected face rectangle and echo its four values.
(x,y,w,h) = faces[0]
f'{x= }, {y= }, {w= }, {h= }'

We now draw a rectangle covering only the face by using cv2 module.

In [None]:
# Outline the face with a 2-pixel rectangle from (x, y) to (x+w, y+h).
# cv2.rectangle draws directly onto `img`, so `img` itself is modified too.
face_img = cv2.rectangle(img, (x,y),(x+w,y+h),(255,0,0),2)
plt.imshow(face_img)

After detecting face, we then detect two eyes present on the face image.

In [None]:
cv2.destroyAllWindows()

# Rectangle colors for the face outline and the eye outlines.
face_box_color, eye_box_color = (255, 0, 0), (0, 255, 0)

for x, y, w, h in faces:
    # Outline the face, then cut the same window out of both the gray and
    # the color image so the eye detector only looks inside the face.
    face_rows, face_cols = slice(y, y + h), slice(x, x + w)
    face_img = cv2.rectangle(img, (x, y), (x + w, y + h), face_box_color, 1)
    roi_gray = gray[face_rows, face_cols]
    roi_color = face_img[face_rows, face_cols]

    # Eye coordinates are relative to the face window, so they are drawn
    # onto the face crop (a view into face_img) rather than the full image.
    eyes = eye_cascade.detectMultiScale(roi_gray)
    for ex, ey, ew, eh in eyes:
        top_left, bottom_right = (ex, ey), (ex + ew, ey + eh)
        cv2.rectangle(roi_color, top_left, bottom_right, eye_box_color, 1)

plt.figure()
plt.imshow(face_img, cmap='gray')
plt.show()

In [None]:
# Show the face region (with the eye rectangles drawn on it) from the last
# iteration of the detection loop.
plt.imshow(roi_color, cmap='gray')

In [None]:
# Copy the face region into its own array and display its dimensions.
cropped_img = np.array(roi_color)
cropped_img.shape

The cropped image is now converted into wavelets by using the wavelet transform module. This technique is used because the wavelet transform has several useful characteristics: it can provide horizontal, vertical, and diagonal multi-directional information. The information is distributed in each individual pixel without interfering with each other, and it can achieve lossless reconstruction in the process of image reconstruction.

In [None]:
#wavelet transform
import pywt, cv2

def w2d(img, mode='haar', level=1):
    """Return a wavelet-detail rendering of ``img`` as a uint8 grayscale array.

    The image is converted to grayscale, scaled by 1/255, decomposed with
    ``pywt.wavedec2``, has its approximation coefficients zeroed out, and is
    reconstructed with ``pywt.waverec2`` so that only the detail coefficients
    contribute to the result.

    Args:
        img: 3-channel color image array. It is converted with
            ``cv2.COLOR_RGB2GRAY``, i.e. it is treated as RGB-ordered.
        mode: wavelet name handed to PyWavelets.
        level: number of decomposition levels.

    Returns:
        The reconstructed image multiplied by 255 and cast to ``np.uint8``.
    """
    # Grayscale, then float32 scaled by 1/255.
    gray_unit = np.float32(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)) / 255

    # First element is the approximation; the rest are the detail tuples.
    approx, *details = pywt.wavedec2(gray_unit, mode, level=level)

    # Zero the approximation so only the detail information survives.
    detail_only = [approx * 0, *details]

    # Rebuild the image and bring it back to an 8-bit array.
    restored = pywt.waverec2(detail_only, mode) * 255
    return np.uint8(restored)

In [None]:
# Wavelet-transform the cropped face ('db1' wavelet, 5 levels) and show it.
im_har = w2d(cropped_img, 'db1', 5)
plt.imshow(im_har, cmap='gray')

Function to obtain cropped images if two eyes are clearly visible.

In [None]:
def get_cropped_image_if_2_eyes(image_path):
    """Return the color crop of the first detected face showing two eyes.

    Args:
        image_path: path of the image file to inspect.

    Returns:
        The face region of the color image for the first detected face in
        which the eye detector finds at least two eyes, or ``None`` when the
        file cannot be read as an image, no face is detected, or no detected
        face has two detectable eyes.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable / non-image file by returning
        # None; treat it as an unusable image instead of crashing below.
        return None

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    for (x, y, w, h) in faces:
        roi_gray = gray[y:y+h, x:x+w]
        roi_color = img[y:y+h, x:x+w]
        # Eyes are searched only inside the face rectangle.
        eyes = eye_cascade.detectMultiScale(roi_gray)
        if len(eyes) >= 2:
            return roi_color
    return None

In [None]:
# Reload the sample image, cropped to the same window used earlier, and show it.
original_image = cv2.imread('./test_images/federer1.jpg')[0:600, 850:1400]
plt.imshow(original_image)

An unusable image (the face is obstructed): for such an image the function returns None.

In [None]:
# Load and show a second test image as the example of an unusable picture.
orig_image_obstructed = cv2.imread('./test_images/federer2.jpg')
plt.imshow(orig_image_obstructed)

In [None]:
# Run the cropping helper on an image expected to have no face with two
# detectable eyes.
# NOTE(review): the previous cell displays 'federer2.jpg' but this call reads
# 'messi2.jpg' - confirm which file is intended.
cropped_image_no_2_eyes = get_cropped_image_if_2_eyes('./test_images/messi2.jpg')
cropped_image_no_2_eyes #returns None

#### Managing variables to save cropped images of useful original images.

In [None]:
# Root folder of the raw dataset and the sub-folder that receives the crops.
path_to_data = './dataset/'
path_to_cr_data = path_to_data + 'cropped'

In [None]:
import os

# Collect one source folder per celebrity from the dataset root.
# - Only real directories are kept: a stray file would otherwise be treated
#   as a celebrity folder and break the later os.scandir() on it.
# - Hidden entries and the generated 'cropped' output folder are skipped.
# - The result is sorted because os.scandir() yields entries in arbitrary
#   order, which would make the derived class ids differ between runs.
with os.scandir(path_to_data) as entries:
    img_dirs = sorted(
        entry.path
        for entry in entries
        if entry.is_dir()
        and not entry.name.startswith('.')
        and entry.name != 'cropped'
    )

img_dirs

In [None]:
import shutil
# Start from an empty output folder: delete any previous 'cropped' tree
# (including all files in it), then create it again.
if os.path.exists(path_to_cr_data):
    shutil.rmtree(path_to_cr_data)
os.mkdir(path_to_cr_data)

Collection of cropped images as preparation for model building.

In [None]:
# Folders that received at least one cropped image, in creation order.
cropped_image_dirs = []
# celebrity name -> list of cropped image paths written for that celebrity.
celebrity_file_names_dict = {}

for img_dir in img_dirs:
    # Running number used in the cropped file names; it only advances when an
    # image is actually saved.
    count = 1
    celebrity_name = img_dir.split('/')[-1]
    celebrity_file_names_dict[celebrity_name] = []

    for entry in os.scandir(img_dir):
        roi_color = get_cropped_image_if_2_eyes(entry.path)

        # None means no face with two detectable eyes: discard the image.
        if roi_color is None:
            continue

        cropped_folder = '/'.join([path_to_cr_data, celebrity_name])

        # Create the per-celebrity output folder lazily, on the first usable
        # image, so celebrities without any usable image get no folder.
        if not os.path.exists(cropped_folder):
            os.makedirs(cropped_folder)
            cropped_image_dirs.append(cropped_folder)
            print(f'Generating cropped images in folder: {cropped_folder}')

        cropped_file_name = ''.join([celebrity_name, f'{count}', '.png'])
        cropped_file_path = '/'.join([cropped_folder, cropped_file_name])
        cv2.imwrite(cropped_file_path, roi_color)
        celebrity_file_names_dict[celebrity_name].append(cropped_file_path)
        count += 1

In [None]:
# Assign each cropped folder a numeric class id (its position in
# cropped_image_dirs) and refresh the per-celebrity file lists from what is
# actually on disk in that folder.
class_dict = {}

for class_id, img_dir in enumerate(cropped_image_dirs):
    celebrity_name = img_dir.split('/')[-1]
    class_dict[celebrity_name] = class_id
    celebrity_file_names_dict[celebrity_name] = [
        entry.path for entry in os.scandir(img_dir)
    ]

print(celebrity_file_names_dict['serena_williams'][:3])
class_dict

In [None]:
# Build the training set: every sample stacks the 32x32 color image
# (32*32*3 values) on top of its 32x32 wavelet image (32*32 values).
X, y = [], []
for celebrity_name, training_files in celebrity_file_names_dict.items():
    for training_image in training_files:
        img = cv2.imread(training_image)
        raw_column = cv2.resize(img, (32, 32)).reshape(32 * 32 * 3, 1)
        wavelet_column = cv2.resize(w2d(img, 'db1', 5), (32, 32)).reshape(32 * 32, 1)
        X.append(np.vstack([raw_column, wavelet_column]))
        # Looked up per image on purpose: celebrities without any cropped
        # image have an empty file list and no entry in class_dict.
        y.append(class_dict[celebrity_name])

In [None]:
# Flatten every sample to one row of 4096 values (32*32*3 raw + 32*32
# wavelet) and convert to float for the estimators.
X = np.array(X).reshape(len(X), 4096).astype(float)
X.shape
# Data preparation is done at this point.

In [None]:
#initiative svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [None]:
# Hold out a test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Baseline model: standardize the features, then an RBF-kernel SVM with C=10.
pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC(kernel='rbf', C=10))])
pipe.fit(X_train, y_train)
# Score on the held-out split, rounded to two decimals.
np.round(pipe.score(X_test, y_test), 2)

In [None]:
# Per-class report for the baseline pipeline on the held-out split.
print(classification_report(y_test, pipe.predict(X_test)))

In [None]:
#GridSearch
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate models and the hyper-parameter grid to search for each one.
# Grid keys are prefixed with the step name that make_pipeline() derives from
# the estimator class (lower-cased class name), e.g. 'svc__C'.
model_params = {
# Support vector machine: regularization strength and kernel type.
    'svm': {
        'model': svm.SVC(gamma='auto',probability=True),
        'params': {
            'svc__C': [1,10,100,1000],
            'svc__kernel': ['rbf','linear']            
        }
    },
# Random forest: number of trees.
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'randomforestclassifier__n_estimators': [1,5,10]
        }
    },
# Logistic regression: inverse regularization strength.
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'logisticregression__C': [1,5,10]
        }
    }
}

In [None]:
import pandas as pd

# One summary row per algorithm, plus the refitted best pipeline of each.
scores, best_estimators = [], {}

for algo, mp in model_params.items():
    # Scale the features, then grid-search the model with 5-fold CV.
    pipe = make_pipeline(StandardScaler(), mp['model'])
    clf = GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)

    best_estimators[algo] = clf.best_estimator_
    scores.append(
        dict(model=algo, best_score=clf.best_score_, best_params=clf.best_params_)
    )

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

In [None]:
# Show the best pipeline found for each algorithm.
best_estimators

In [None]:
# Held-out score of the best SVM pipeline.
best_estimators['svm'].score(X_test, y_test)

In [None]:
# Held-out score of the best random-forest pipeline.
best_estimators['random_forest'].score(X_test, y_test)

In [None]:
# Held-out score of the best logistic-regression pipeline.
best_estimators['logistic_regression'].score(X_test,y_test)

In [None]:
# Model kept for export. The choice is hard-coded here.
# NOTE(review): confirm it still matches the scores above after retraining.
best_clf = best_estimators['logistic_regression']
best_clf

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the chosen model on the held-out split
# (true labels first, predictions second).
cm = confusion_matrix(y_test, best_clf.predict(X_test))
cm

In [None]:
!pip install joblib
import joblib 
# Serialize the chosen pipeline to 'saved_model.pkl' so it can be loaded
# later without retraining.
joblib.dump(best_clf, 'saved_model.pkl') 

In [None]:
import json

# Persist the celebrity-name -> class-id mapping next to the saved model.
with open("class_dictionary.json","w") as f:
    json.dump(class_dict, f)
'Code Finished...'