In [1]:
import face_recognition
from face_recognition import face_locations
import os
import pandas as pd
import h5py
from tqdm import tqdm
from recursive_dict import *
import scipy.io
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split


In [2]:
# set data paths
# The dataset root defaults to the original local folder but can be redirected
# by setting the LFWA_DATA_DIR environment variable before starting the kernel,
# so the notebook does not have to be edited to run on another machine.
data_dir = os.environ.get(
    'LFWA_DATA_DIR',
    '/Users/arthur/Desktop/2022 EM/Machine Learning/Projet/DB + Model/LWFA+',
)
# Folder holding the image files.
img_dir = os.path.join(data_dir, 'lfw/merge_folder')
# Auxiliary .mat files shipped with the dataset.
indices_path = os.path.join(data_dir, 'indices_train_test.mat')
attr_path = os.path.join(data_dir, 'lfw_att_73.mat')

In [3]:
# Load the label matrix, the image names and the attribute names from the
# .mat files stored under data_dir (paths are derived from data_dir instead of
# repeating the absolute folder in every line).
label_mat = os.path.join(data_dir, 'label.mat')
label = scipy.io.loadmat(label_mat)['label']

name_mat = os.path.join(data_dir, 'name.mat')
name = scipy.io.loadmat(name_mat)['name']
# Each stored entry is split on the backslash and the second component is kept
# (presumably 'folder\file.jpg' -> 'file.jpg'; verify against name.mat).
name = [s[0].split('\\')[1] for s in name.tolist()[0]]

attr_name_mat = os.path.join(data_dir, 'attrname.mat')
attr_name = scipy.io.loadmat(attr_name_mat)['AttrName']
# Convert every attribute name entry to a plain Python string.
attr_name = [str(s[0]) for s in attr_name.tolist()[0]]

In [4]:
# Label table: one row per image name, one column per attribute name.
df_label = pd.DataFrame(data=label, index=name, columns=attr_name)

In [5]:
# Display the label table (rows: image file names, columns: attributes).
df_label

Unnamed: 0,Male,Asian,White,Black,Baby,Child,Youth,Middle Aged,Senior,Black Hair,...,Pale Skin,5 o Clock Shadow,Strong Nose-Mouth Lines,Wearing Lipstick,Flushed Face,High Cheekbones,Brown Eyes,Wearing Earrings,Wearing Necktie,Wearing Necklace
Aaron_Eckhart_0001.jpg,1,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
Aaron_Guiel_0001.jpg,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
Aaron_Patterson_0001.jpg,1,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,1,0,1,0
Aaron_Peirsol_0001.jpg,1,0,1,0,0,0,1,1,0,0,...,1,0,1,0,0,1,1,0,1,0
Aaron_Peirsol_0002.jpg,1,0,1,0,0,0,1,1,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zorica_Radovic_0001.jpg,0,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
Zulfiqar_Ahmed_0001.jpg,1,0,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,1,0
Zumrati_Juma_0001.jpg,0,0,1,1,0,0,0,0,1,0,...,0,0,1,0,0,1,1,0,0,0
Zurab_Tsereteli_0001.jpg,1,0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,1,1,0,1,0


In [6]:
# Inspect the row index of the label table (the image file names).
df_label.index

Index(['Aaron_Eckhart_0001.jpg', 'Aaron_Guiel_0001.jpg',
       'Aaron_Patterson_0001.jpg', 'Aaron_Peirsol_0001.jpg',
       'Aaron_Peirsol_0002.jpg', 'Aaron_Peirsol_0003.jpg',
       'Aaron_Peirsol_0004.jpg', 'Aaron_Pena_0001.jpg',
       'Aaron_Sorkin_0001.jpg', 'Aaron_Sorkin_0002.jpg',
       ...
       'Zoe_Ball_0001.jpg', 'Zoran_Djindjic_0001.jpg',
       'Zoran_Djindjic_0002.jpg', 'Zoran_Djindjic_0003.jpg',
       'Zoran_Djindjic_0004.jpg', 'Zorica_Radovic_0001.jpg',
       'Zulfiqar_Ahmed_0001.jpg', 'Zumrati_Juma_0001.jpg',
       'Zurab_Tsereteli_0001.jpg', 'Zydrunas_Ilgauskas_0001.jpg'],
      dtype='object', length=13143)

In [None]:
# extract face features using face_recognition.face_encodings
# take about 10 minutes on my pc
feature_vecs = []
fnames = []
for fname in tqdm(df_label.index):
    img_path = os.path.join(img_dir, fname)
    # face detection
    X_img = face_recognition.load_image_file(img_path)
    X_faces_loc = face_locations(X_img)
    # if the number of faces detected in a image is not 1, ignore the image
    # (zero faces gives nothing to encode, several faces would be ambiguous)
    if len(X_faces_loc) != 1:
        continue
    # extract 128 dimensional face features for the single detected face
    faces_encoding = face_recognition.face_encodings(X_img, known_face_locations=X_faces_loc)[0]
    feature_vecs.append(faces_encoding)
    fnames.append(fname)

# Build the feature table and keep only the labels of images that produced a
# feature vector. Both frames are sorted by file name so that their rows line
# up. sort_index() returns new frames instead of sorting in place: the label
# frame is a filtered slice, and mutating such a slice in place is what pandas
# may flag with SettingWithCopyWarning.
df_feat = pd.DataFrame(feature_vecs, index=fnames).sort_index()
df_label = df_label[df_label.index.isin(df_feat.index)].sort_index()

# Cache both tables on disk so the slow extraction does not have to be repeated.
df_feat.to_csv('feature.csv')
df_label.to_csv('label.csv')


 11%|████▎                                 | 1483/13143 [00:54<07:11, 27.00it/s]

In [None]:
# Sanity check: show the encoding of the first image that was kept.
feature_vecs[0]

In [None]:
# Debug check: number of faces detected in the last image processed by the
# extraction loop (X_faces_loc is overwritten on every iteration).
len(X_faces_loc)

In [None]:
# load features and labels
df_feat = pd.read_csv('feature.csv', index_col=0)
df_label = pd.read_csv('label.csv', index_col=0)

In [None]:
# split training/test name
# The boolean masks below are computed from df_feat and applied to df_label as
# well, so both tables must list the same images in the same order.
assert df_feat.index.equals(df_label.index), 'feature and label rows are not aligned'

# The index holds image file names such as 'Aaron_Peirsol_0002.jpg' (possibly
# prefixed by a folder). The person is everything before the trailing
# '_<number>.jpg'; splitting on '/' alone would return the whole file name and
# let images of the same person end up in both the training and the test set.
person_of_image = [os.path.basename(path).rsplit('_', 1)[0] for path in df_feat.index]
# Sort the identities: the iteration order of a set of strings changes between
# interpreter runs, which would make the split differ despite random_state.
unique_names = sorted(set(person_of_image))
name_train, name_test = train_test_split(unique_names, test_size = 0.1, random_state = 0)
name_train, name_test = set(name_train), set(name_test)
# split training/test images: every image follows the set its person was assigned to
idx_train = [person in name_train for person in person_of_image]
idx_test = [person in name_test for person in person_of_image]
X_train, Y_train = df_feat[idx_train], df_label[idx_train]
X_test, Y_test = df_feat[idx_test], df_label[idx_test]

In [None]:
# Shape of the training feature table (rows: training images).
X_train.shape

In [None]:
# Display the feature table of the held-out test images.
X_test

In [None]:
# Train one MLP on all attribute columns at once (Y_train holds one 0/1 column
# per attribute). random_state is fixed so that weight initialisation and
# batch shuffling, and therefore the scores, the saved model and the ROC
# curves, are reproducible across runs.
model = MLPClassifier(
    solver='adam',
    hidden_layer_sizes=(128, 128),
    max_iter=5000,
    verbose=True,
    tol=1e-4,
    activation='relu',
    random_state=0,
)
model.fit(X_train, Y_train)
# Hard predictions and the scores returned by predict_proba for the test images.
prediction = model.predict(X_test)
prediction_score = model.predict_proba(X_test)

# Wrap both arrays in frames that reuse the attribute columns and the image
# index of the test labels, so they can be compared with Y_test by name.
df_prediction = pd.DataFrame(prediction, columns=Y_test.columns, index=Y_test.index)
df_prediction_score = pd.DataFrame(prediction_score, columns=Y_test.columns, index=Y_test.index)

In [None]:
# Print the raw array returned by model.predict_proba for the test images.
print(prediction_score)

In [None]:
# Display the predictions for the test images, one column per attribute.
df_prediction

In [None]:
# Raise the pandas row display limit to 999, then show the score table.
# Note: the option is global and stays in effect for all later cells.
pd.options.display.max_rows = 999
df_prediction_score

In [None]:
# Write the score table of the test images to score.csv in the working directory.
df_prediction_score.to_csv('score.csv')

In [None]:
# Persist the trained classifier to model.pkl with pickle.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
import pickle

save_path = 'model.pkl'
with open(save_path, mode='wb') as f:
    # Serialise first, then write the resulting bytes in one go.
    f.write(pickle.dumps(model))

In [None]:
def plot_roc(attr, target, score):
    """Draw the ROC curve of a single attribute and return the figure.

    Parameters
    ----------
    attr : str
        Attribute name; its title-cased form is used for the legend and title.
    target :
        Ground-truth values passed to the scikit-learn metrics.
    score :
        Predicted scores; values >= 0.5 are counted as positive for the
        accuracy shown in the title.

    Returns
    -------
    The matplotlib figure that was created and shown.
    """
    fig, ax = plt.subplots()

    # Summary metrics reported in the title.
    area_under_curve = roc_auc_score(target, score)
    hard_prediction = (score >= 0.5).astype(int)
    accuracy = accuracy_score(target, hard_prediction)

    # Curve coordinates; the thresholds returned alongside are not needed.
    false_pos_rate, true_pos_rate, _ = roc_curve(target, score)

    pretty_name = attr.title()
    ax.plot(false_pos_rate, true_pos_rate, lw=2, label=pretty_name)
    ax.legend(loc=4, fontsize=15)
    heading = 'ROC Curve for {attr} (Accuracy = {acc:.3f}, AUC = {auc:.3f})'.format(
        attr=pretty_name, acc=accuracy, auc=area_under_curve
    )
    ax.set_title(heading, fontsize=15)
    ax.set_xlabel('False Positive Rate', fontsize=15)
    ax.set_ylabel('True Positive Rate', fontsize=15)
    plt.show()
    return fig

In [None]:
# plot ROC curves
# Attributes to evaluate; each one gets its own figure.
cols = [
    'Male', 'Asian', 'Black', 'White',
    'Black Hair', 'Blond Hair', 'Brown Hair',
    'Curly Hair', 'Wavy Hair', 'Straight Hair',
    'Oval Face', 'Square Face', 'Round Face',
    'Indian',
]
for attr in cols:
    # Ground truth of the test images against the predicted score of the same attribute.
    target, score = Y_test[attr], df_prediction_score[attr]
    fig = plot_roc(attr, target, score)