# Import libraries

In [1]:
import os
import random
import shutil
import pandas as pd
import csv
from itertools import zip_longest
import math

In [2]:
# Import Dependencies for face detection and MTCNN Model
from mtcnn.mtcnn import MTCNN
from numpy import asarray
from PIL import Image

In [3]:
# Import standard dependencies
import cv2
import os
import random
import numpy as np
from matplotlib import pyplot as plt

In [4]:
# Import tensorflow dependencies - Functional API
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Conv2D, Dense, MaxPooling2D, Input, Flatten
import tensorflow as tf

In [5]:
# Import tensorflow dependencies - Functional API
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Conv2D, Dense, MaxPooling2D, Input, Flatten, ZeroPadding2D, Convolution2D, Dropout, Activation
import tensorflow as tf

In [6]:
#Import Preprocessing packages
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from numpy import asarray

## Make a dataset for batches and labels

#### Description: This section creates a good split of the data in the Labeled Faces in the Wild (LFW) dataset. We make sure that there are at most CHUNK_SIZE images per person in each batch of BATCH_SIZE images. The main goal is a fair split of the data, so that the models can train and test without overfitting.

In [None]:
# Build the label table: every sub-folder of the cropped-faces folder gets the next
# integer id, and that id maps to the list of file paths found inside the sub-folder.
faces_root = 'lfw_cropped_faces'
labels = {}
for label_idx, directory in enumerate(os.listdir(faces_root)):
    # One path per file of this person, in the order os.listdir reports them
    labels[label_idx] = [
        os.path.join(faces_root, directory, file)
        for file in os.listdir(os.path.join(faces_root, directory))
    ]

# Number of labels assigned so far (the next free label id)
label_idx = len(labels)

In [None]:
CHUNK_SIZE = 30

# Flat, parallel lists for everyone with fewer than CHUNK_SIZE images
files = []
all_labels = []
# label -> image paths for everyone with CHUNK_SIZE images or more
over_thirty = {}

#Going through all the labels
for key, values in labels.items():
    if len(values) >= CHUNK_SIZE:
        # Too many images to add directly; set this person aside
        over_thirty[key] = values
        continue

    # Small enough: add every image, repeating the label once per image
    files.extend(values)
    all_labels.extend([key] * len(values))

In [None]:
# Randomly place the images of the people in over_thirty into the base lists, retrying
# with a fresh random placement until check_good_dataset accepts the resulting labels.
# NOTE(review): BATCH_SIZE and check_good_dataset are not defined in this cell; both must
# already exist in the notebook session when it runs. There is no upper bound on the
# number of attempts.

# Pristine copies of the base lists, so every attempt restarts from the same state
og_labels = all_labels.copy()
og_files = files.copy()

it = 0
while True:
    # Start this attempt from the untouched base lists
    files = og_files.copy()
    all_labels = og_labels.copy()

    #Going through all the people that have CHUNK_SIZE images or more
    for key, values in over_thirty.items():
        #Calculating how many batches of BATCH_SIZE the current list of files spans
        intervals = math.ceil(len(files) / BATCH_SIZE)

        #Picking a random order of batches to place the chunks of files in.
        #max(..., 1) keeps one slot (offset 0) available when the list is still empty,
        #which previously raised an IndexError on pic_idx[idx].
        pic_idx = list(range(max(intervals, 1)))
        random.shuffle(pic_idx)
        idx = 0

        #Split all the images of this person into chunks of at most CHUNK_SIZE
        c_values = [values[i:i + CHUNK_SIZE] for i in range(0, len(values), CHUNK_SIZE)]

        #Inserting the values one by one
        for val in c_values:
            # Every image of the chunk goes in at the start offset of the chosen batch,
            # so the chunk ends up in reverse order in front of what was there before
            insert_at = pic_idx[idx] * BATCH_SIZE
            for v in val:
                files.insert(insert_at, v)
                all_labels.insert(insert_at, key)

            # Move on to the next batch in the shuffled order, wrapping around
            idx = idx + 1
            if idx >= len(pic_idx):
                idx = 0

    if check_good_dataset(all_labels):
        print('Good data set!')
        print(it)
        break
    it = it + 1
print(len(files))
        
        

In [None]:
#Function to check if the dataset is done well
def check_good_dataset(all_labels, batch_size=None, min_score=10):
    """Return True when every batch of labels reaches the minimum score.

    ``all_labels`` is cut into consecutive slices of ``batch_size`` items (the
    last slice may be shorter). The score of a slice is the sum, over its
    items, of how often that item occurs in the slice, so a label that appears
    k times contributes k * k. The score is printed for every slice checked.

    Args:
        all_labels: list of labels, in dataset order.
        batch_size: slice length; when None the notebook-level ``BATCH_SIZE``
            constant is used.
        min_score: a slice scoring below this value makes the dataset fail.

    Returns:
        False as soon as one slice scores below ``min_score``, otherwise True
        (including for an empty list, which has no slices).
    """
    if batch_size is None:
        # Keep the original behaviour of reading the notebook-level constant
        batch_size = BATCH_SIZE

    for start in range(0, len(all_labels), batch_size):
        g = all_labels[start:start + batch_size]
        score = sum(g.count(element) for element in g)
        print(score)
        if score < min_score:
            return False

    return True

In [None]:
#Write image file paths and labels into a csv file
import csv
from itertools import zip_longest

# One row per image: (path, label); a shorter column would be padded with ''
rows = zip_longest(files, all_labels, fillvalue='')
with open('fair_data_spare.csv', 'w', encoding="ISO-8859-1", newline='') as myfile:
    writer = csv.writer(myfile)
    writer.writerow(('files', 'labels'))
    writer.writerows(rows)
# The with-statement has already closed the file at this point

## Preparing Cross Validation data - Iterations

#### Description: LFW data preparation for all five cross-validation iterations.

In [8]:
#Load the prepared data set
df = pd.read_csv('fair_data.csv')

# Pull the two columns out as plain Python lists
prepared_files = df['files'].to_list()
prepared_labels = df['labels'].to_list()

In [None]:
#Separating data for Cross Validation iteration 1
# The first 80% of the samples form the train/validation part, the rest is the test part
split_at = int(len(prepared_files) * 0.8)
tr_v_files, test_files = prepared_files[:split_at], prepared_files[split_at:]
tr_v_labels, test_lables = prepared_labels[:split_at], prepared_labels[split_at:]

columns = ('tr_v_files', 'test_files', 'tr_v_labels', 'test_lables')
# zip_longest pads the shorter columns with '' so all four fit into one table
export_data = zip_longest(tr_v_files, test_files, tr_v_labels, test_lables, fillvalue='')
with open('cross_validation_data/data_cv_1.csv', 'w', encoding="ISO-8859-1", newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(columns)
    wr.writerows(export_data)

In [None]:
#Separating data for Cross Validation iteration 2
# This cell used to repeat the iteration-1 split and overwrite data_cv_1.csv. It now holds
# out a different slice and writes its own file.
# NOTE(review): the held-out slice is the second fifth of the samples, chosen to line up
# with fold 2 of an unshuffled 5-fold split - confirm this is the intended fold.
n_samples = len(prepared_files)
test_start = int(n_samples * 0.2)
test_end = int(n_samples * 0.4)

test_files = prepared_files[test_start:test_end]
test_lables = prepared_labels[test_start:test_end]
# Everything outside the held-out slice is the train/validation part
tr_v_files = prepared_files[:test_start] + prepared_files[test_end:]
tr_v_labels = prepared_labels[:test_start] + prepared_labels[test_end:]

d = [tr_v_files, test_files, tr_v_labels, test_lables]
# zip_longest pads the shorter test columns with '' so all four fit into one table
export_data = zip_longest(*d, fillvalue='')
with open('cross_validation_data/data_cv_2.csv', 'w', encoding="ISO-8859-1", newline='') as myfile:
    wr = csv.writer(myfile)
    # 'test_lables' keeps its spelling: it is the column name the files are written with
    wr.writerow(('tr_v_files', 'test_files', 'tr_v_labels', 'test_lables'))
    wr.writerows(export_data)

In [24]:
#Splitting and saving the data into files
from sklearn.model_selection import KFold
import numpy as np

labels = prepared_labels
train = np.array(prepared_files)
kf = KFold(n_splits=5)

columns = ('tr_v_files', 'test_files', 'tr_v_labels', 'test_lables')

# One CSV per fold, numbered from 1
for count, (train_index, test_index) in enumerate(kf.split(train), start=1):
    print(train_index, test_index)

    # Look the selected positions up in the prepared lists
    tr_v_files = [prepared_files[t] for t in train_index]
    tr_v_labels = [prepared_labels[t] for t in train_index]
    test_files = [prepared_files[tw] for tw in test_index]
    test_lables = [prepared_labels[tw] for tw in test_index]

    file_name = f'cross_validation_data/data_cv_{count}.csv'

    # zip_longest pads the shorter columns with '' so all four fit into one table
    export_data = zip_longest(tr_v_files, test_files, tr_v_labels, test_lables, fillvalue='')
    with open(file_name, 'w', encoding="ISO-8859-1", newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(columns)
        wr.writerows(export_data)
        
    

[ 2647  2648  2649 ... 13230 13231 13232] [   0    1    2 ... 2644 2645 2646]
[    0     1     2 ... 13230 13231 13232] [2647 2648 2649 ... 5291 5292 5293]
[    0     1     2 ... 13230 13231 13232] [5294 5295 5296 ... 7938 7939 7940]
[    0     1     2 ... 13230 13231 13232] [ 7941  7942  7943 ... 10584 10585 10586]
[    0     1     2 ... 10584 10585 10586] [10587 10588 10589 ... 13230 13231 13232]


## Preparing binary inputs for Siamese Network for each cross validation training

#### Description: Preparing the data as pairs of input images, because our Siamese networks take two input images.

In [40]:
# For every cross-validation file, build one positive and one negative image pair per
# training sample and save the pairs to a CSV.
for num in range(1,6):
    #Preparation for each iteration

    #Load the training columns of this iteration
    file_name = 'cross_validation_data/data_cv_'+str(num)+'.csv'
    df = pd.read_csv(file_name)
    tr_files = df.tr_v_files.to_list()
    tr_labels = df.tr_v_labels.to_list()

    # Group the sample positions by label once, instead of rescanning every label for
    # every sample. Positions stay in ascending order, as a full rescan would give.
    idx_by_label = {}
    for index, element in enumerate(tr_labels):
        idx_by_label.setdefault(element, []).append(index)

    #Empty arrays
    anchor = []
    other_image = []
    binary = []
    anchor_label = []
    other_image_label = []

    last_idx = len(tr_labels) - 1

    # Making a positive and negative for each image (repetitions will probably happen)
    for i, temp in enumerate(tr_labels):
        all_idx = idx_by_label[temp]

        # A label with a single image has no positive partner; skip it entirely
        if len(all_idx) < 2:
            continue

        #Drawing a random positive partner other than the anchor itself
        ran = random.randint(0, len(all_idx)-1)
        while all_idx[ran] == i:
            ran = random.randint(0, len(all_idx)-1)

        #Adding positive match
        anchor.append(tr_files[i])
        other_image.append(tr_files[all_idx[ran]])
        binary.append(1)
        anchor_label.append(tr_labels[i])
        other_image_label.append(tr_labels[all_idx[ran]])

        # If every sample carries this label there is no negative to draw, and the
        # rejection loop below would never terminate
        if len(all_idx) == len(tr_labels):
            continue

        #Drawing a random negative partner: any sample with a different label
        ran_n = random.randint(0, last_idx)
        while tr_labels[ran_n] == temp:
            ran_n = random.randint(0, last_idx)

        #Adding negative match
        anchor.append(tr_files[i])
        other_image.append(tr_files[ran_n])
        binary.append(0)
        anchor_label.append(tr_labels[i])
        other_image_label.append(tr_labels[ran_n])

    d = [anchor, other_image, binary,anchor_label,other_image_label]
    export_data = zip_longest(*d, fillvalue = '')

    file_name_save = 'cross_validation_data/siamese_training_data_cv_'+str(num)+'.csv'
    with open(file_name_save, 'w', encoding="ISO-8859-1", newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(('anchor','other_image','binary','anchor_label','other_image_label'))
        wr.writerows(export_data)
    

## Preparing binary inputs for Siamese Network for each cross validation test

#### Description: Preparing the data as pairs of input images, because our Siamese networks take two input images.

In [25]:
# For every cross-validation file, build one positive and one negative image pair per
# test sample and save the pairs to a CSV.
for num in range(1,6):
    #Preparation for each iteration

    #Load the test columns of this iteration (the tr_ names are kept, but hold test data)
    file_name = 'cross_validation_data/data_cv_'+str(num)+'.csv'
    df = pd.read_csv(file_name)
    tr_files = df.test_files.to_list()
    tr_labels = df.test_lables.to_list()

    #Removing nan values (cells that are empty in the CSV are read back as NaN)
    tr_files = [x for x in tr_files if str(x) != 'nan']
    tr_labels = [x for x in tr_labels if not math.isnan(x)]

    # Group the sample positions by label once, instead of rescanning every label for
    # every sample. Positions stay in ascending order, as a full rescan would give.
    idx_by_label = {}
    for index, element in enumerate(tr_labels):
        idx_by_label.setdefault(element, []).append(index)

    #Empty arrays
    anchor = []
    other_image = []
    binary = []
    anchor_label = []
    other_image_label = []

    last_idx = len(tr_labels) - 1

    # Making a positive and negative for each image (repetitions will probably happen)
    for i, temp in enumerate(tr_labels):
        all_idx = idx_by_label[temp]

        # A label with a single image has no positive partner; skip it entirely
        if len(all_idx) < 2:
            continue

        #Drawing a random positive partner other than the anchor itself
        ran = random.randint(0, len(all_idx)-1)
        while all_idx[ran] == i:
            ran = random.randint(0, len(all_idx)-1)

        #Adding positive match
        anchor.append(tr_files[i])
        other_image.append(tr_files[all_idx[ran]])
        binary.append(1)
        anchor_label.append(tr_labels[i])
        other_image_label.append(tr_labels[all_idx[ran]])

        # If every sample carries this label there is no negative to draw, and the
        # rejection loop below would never terminate
        if len(all_idx) == len(tr_labels):
            continue

        #Drawing a random negative partner: any sample with a different label
        ran_n = random.randint(0, last_idx)
        while tr_labels[ran_n] == temp:
            ran_n = random.randint(0, last_idx)

        #Adding negative match
        anchor.append(tr_files[i])
        other_image.append(tr_files[ran_n])
        binary.append(0)
        anchor_label.append(tr_labels[i])
        other_image_label.append(tr_labels[ran_n])

    d = [anchor, other_image, binary,anchor_label,other_image_label]
    export_data = zip_longest(*d, fillvalue = '')

    file_name_save = 'cross_validation_data/siamese_testing_data_cv_'+str(num)+'.csv'
    with open(file_name_save, 'w', encoding="ISO-8859-1", newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(('anchor','other_image','binary','anchor_label','other_image_label'))
        wr.writerows(export_data)
    

## Preparing binary inputs for demo data

#### Description: Preparing the data as pairs of input images, because our Siamese networks take two input images. This section is specifically for the demonstration data.

In [9]:
# Give every folder of the demo data an integer label, numbered in listing order
label_dict = {
    directory: position
    for position, directory in enumerate(os.listdir('all_demo_data_cropped'))
}
# Number of labels handed out
count = len(label_dict)

In [10]:
#Empty arrays
anchor, other_image, binary = [], [], []
anchor_label, other_image_label = [], []
count = 0

demo_root = 'all_demo_data_cropped'
# The angled shots that get compared against every ID photo, in output order
angle_images = ('top.jpg', 'bottom.jpg', 'left.jpg', 'right.jpg')

#Making dataset for ID and all other angles other than straight faces
for directory in os.listdir(demo_root):
    id_image = os.path.join(demo_root, directory, 'ID.jpg')

    for directory2 in os.listdir(demo_root):
        # 1 when both images come from the same folder, otherwise 0
        same = 1 if directory == directory2 else 0

        # One row per angled shot of the second folder
        for angle_image in angle_images:
            anchor.append(id_image)
            other_image.append(os.path.join(demo_root, directory2, angle_image))
            binary.append(same)
            anchor_label.append(label_dict[directory])
            other_image_label.append(label_dict[directory2])

rows = zip_longest(anchor, other_image, binary, anchor_label, other_image_label, fillvalue='')

file_name_save = 'cross_validation_data/siamese_demonstration_not_straight_faces_data.csv'
with open(file_name_save, 'w', encoding="ISO-8859-1", newline='') as myfile:
    writer = csv.writer(myfile)
    writer.writerow(('anchor','other_image','binary','anchor_label','other_image_label'))
    writer.writerows(rows)

In [11]:
#Empty arrays
anchor, other_image, binary = [], [], []
anchor_label, other_image_label = [], []
count = 0

demo_root = 'all_demo_data_cropped'

#Making dataset for ID and straight faces
for directory in os.listdir(demo_root):
    id_image = os.path.join(demo_root, directory, 'ID.jpg')

    for directory2 in os.listdir(demo_root):
        # 1 when both images come from the same folder, otherwise 0
        same = 1 if directory == directory2 else 0

        anchor.append(id_image)
        other_image.append(os.path.join(demo_root, directory2, 'straight.jpg'))
        binary.append(same)
        anchor_label.append(label_dict[directory])
        other_image_label.append(label_dict[directory2])

rows = zip_longest(anchor, other_image, binary, anchor_label, other_image_label, fillvalue='')

file_name_save = 'cross_validation_data/siamese_demonstration_all_straight_faces_data.csv'
with open(file_name_save, 'w', encoding="ISO-8859-1", newline='') as myfile:
    writer = csv.writer(myfile)
    writer.writerow(('anchor','other_image','binary','anchor_label','other_image_label'))
    writer.writerows(rows)