In [1]:
# Runs before any keras/TensorFlow import in the cells below.
# NOTE(review): presumably this suppresses TensorFlow's log/warning output --
# confirm against the silence_tensorflow package documentation.
from silence_tensorflow import silence_tensorflow
silence_tensorflow()

In [2]:
from numpy import load
import numpy as  np
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

## Step 1: Localizing & Extracting Faces

In [3]:
# Print the installed TensorFlow version so the environment used for this run
# is recorded alongside the results.
import tensorflow as tf
print(tf.__version__)

2.3.0


In [4]:
from os import listdir
from os.path import isdir
from PIL import Image
from matplotlib import pyplot
from numpy import savez_compressed
from numpy import asarray
from mtcnn.mtcnn import MTCNN

In [5]:
# Shared MTCNN detector, created on first use. Building a detector is costly,
# so it is reused for every image instead of being rebuilt per call.
_FACE_DETECTOR = None


def _get_face_detector():
    """Return the shared MTCNN detector, creating it on the first call."""
    global _FACE_DETECTOR
    if _FACE_DETECTOR is None:
        _FACE_DETECTOR = MTCNN()
    return _FACE_DETECTOR


#extract a single face from a given photograph
def extract_face(filename, required_size = (160, 160)):
    """Detect the first face in an image file and return it as a resized array.

    Parameters
    ----------
    filename : str or path-like
        Image file to read.
    required_size : tuple of int, default (160, 160)
        Size passed to ``Image.resize`` for the cropped face.

    Returns
    -------
    numpy.ndarray
        Pixels of the first detected face, converted to RGB and resized to
        ``required_size``.

    Raises
    ------
    IndexError
        If the detector reports no face in the image. The message names the
        offending file.
    """
    # Close the file handle as soon as the pixels have been copied out.
    with Image.open(filename) as image:
        pixels = asarray(image.convert('RGB'))

    results = _get_face_detector().detect_faces(pixels)
    if not results:
        raise IndexError('no face detected in %s' % filename)

    # Only the first detection is used.
    x, y, width, height = results[0]['box']

    # The detector can report a box whose origin lies outside the image
    # (negative coordinates). Clip the origin to the image edge and keep the
    # far edge where the detector put it; taking abs() of the origin would
    # instead slide the whole crop window away from the detected face.
    x1, y1 = max(x, 0), max(y, 0)
    x2, y2 = x + width, y + height

    face = pixels[y1:y2, x1:x2]

    # resize pixels to the model size
    image = Image.fromarray(face)
    image = image.resize(required_size)
    return asarray(image)

#load images and extract faces for all images in a directory
def load_faces(directory):
    """Extract one face from every file in ``directory``.

    Parameters
    ----------
    directory : str
        Folder containing image files. A trailing path separator is optional.

    Returns
    -------
    tuple of (list, list)
        ``faces`` holds the value returned by ``extract_face`` for each file;
        ``filenames_list`` holds the matching file paths, in the same order.
    """
    # Local import: only ``isdir`` is imported from os.path at file level.
    from os.path import join

    faces = list()
    filenames_list = list() #NJ: A list to store all the images file names

    # listdir() returns entries in arbitrary order; sorting keeps the order
    # of the results stable between runs and machines.
    for filename in sorted(listdir(directory)):
        # join() copes with ``directory`` given with or without a trailing
        # separator, unlike plain string concatenation.
        path = join(directory, filename)
        # Nested folders are not images; skip them rather than failing
        # inside extract_face.
        if isdir(path):
            continue
        faces.append(extract_face(path))
        filenames_list.append(path)

    return faces, filenames_list

#load & extract faces for a dataset that contains one subdir for each class that in turn contains images
def load_dataset(directory):
    """Load faces from a dataset laid out as one sub-folder per class.

    Parameters
    ----------
    directory : str
        Dataset root. Each sub-folder is treated as one class and its name is
        used as the label. A trailing path separator is optional.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, path_list)``: the extracted faces, the class label (sub-folder
        name) of each face, and the file path each face was read from.
    """
    # Local import: only ``isdir`` is imported from os.path at file level.
    from os.path import join

    X, y, path_list = list(), list(), list()

    # Sort so that class order (and therefore row order of the returned
    # arrays) does not depend on the file system's listing order.
    for subdir in sorted(listdir(directory)):
        # join() tolerates a root given without a trailing separator; with
        # plain concatenation every class folder would be silently skipped in
        # that case. The '/' suffix is kept because load_faces receives this
        # path as its directory argument.
        path = join(directory, subdir) + '/'
        # skip any files that might be in the dir
        if not isdir(path):
            continue
        # load all faces in the subdirectory
        faces, filenames_list = load_faces(path)

        # one label per extracted face
        labels = [subdir] * len(faces)

        # summarize progress
        print ('>loaded %d example for class %s' % (len(faces), subdir))
        # store
        X.extend(faces)
        y.extend(labels)
        path_list.extend(filenames_list) # to get the actual image names

    return asarray(X), asarray(y), asarray(path_list)

## Face Extraction from the Dataset

In [6]:
# load train dataset
# The path ends with '/' because load_dataset builds sub-paths by string
# concatenation.
dataset_path = './dataset/'

# load_dataset returns (faces, class labels, image paths).
# NOTE: despite its name, `filename` holds the class label of each face (the
# sub-folder name); the actual file paths are in `images_path`.
faces, filename, images_path = load_dataset(dataset_path)

>loaded 14 example for class ben_afflek
>loaded 16 example for class elton_john
>loaded 21 example for class jerry_seinfeld
>loaded 19 example for class madonna
>loaded 21 example for class mindy_kaling


In [7]:
# Cache the extracted faces so detection does not have to be repeated.
# The arrays are passed positionally, so they are stored under the keys
# 'arr_0' (faces), 'arr_1' (class labels) and 'arr_2' (image paths).
savez_compressed('./faces_dataset.npz', faces, filename, images_path)

In [8]:
# load the face dataset
# The archive is opened in a `with` block so its file handle is closed as soon
# as the three arrays have been read into memory. After this cell
# `faces_dataset` refers to the closed archive; use the unpacked arrays.
# Keys follow the positional save order: 'arr_0' = faces, 'arr_1' = class
# labels (held in `filename`), 'arr_2' = image paths.
with load('./faces_dataset.npz') as faces_dataset:
    faces = faces_dataset['arr_0']
    filename = faces_dataset['arr_1']
    images_path = faces_dataset['arr_2']

print('Loaded Train dataset: ', faces.shape, filename.shape, images_path.shape)

Loaded Train dataset:  (91, 160, 160, 3) (91,) (91,)


## Feature Extraction for the Faces Extracted from the Dataset

In [9]:
from numpy import load
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet

# FaceNet wrapper from keras_facenet; the next cell calls its embeddings()
# method on the extracted faces.
# NOTE(review): construction presumably loads pretrained weights -- confirm
# against the keras_facenet documentation.
embedder = FaceNet()

In [10]:
# Compute one embedding per extracted face; `faces` is the array loaded from
# './faces_dataset.npz' above.
features_dataset = embedder.embeddings(faces)

In [11]:
# save arrays to one file in compressed format
# Positional arguments are stored as 'arr_0' (embeddings), 'arr_1' (class
# labels, held in `filename`) and 'arr_2' (image paths).
savez_compressed('./faces_embeddings.npz', features_dataset, filename, images_path)

## Finding Duplicate Images

In [12]:
#load dataset
# `data` is kept open on purpose: it is passed to get_duplicate_images_df
# below, which reads 'arr_0', 'arr_1' and 'arr_2' from it.
data = load('./faces_embeddings.npz')
# arr_0 = embeddings, arr_1 = class labels (named `filename` here), arr_2 = image paths
features_dataset, filename, images_path = data['arr_0'], data['arr_1'], data['arr_2'] #, data['arr_3']

In [13]:
def get_duplicate_images_df(feature, matching_cutoff_threshold = 0.65):
    """List every pair of images whose embeddings are more similar than a cutoff.

    Parameters
    ----------
    feature : mapping
        Must provide ``'arr_0'`` (one embedding per image), ``'arr_1'`` (one
        label per image) and ``'arr_2'`` (one path per image), e.g. the object
        returned by ``numpy.load`` for the embeddings archive.
    matching_cutoff_threshold : float, default 0.65
        A pair is reported only when its cosine similarity is strictly greater
        than this value.

    Returns
    -------
    pandas.DataFrame
        One row per matching pair ``(i, j)`` with ``i < j``, ordered by ``i``
        and then ``j``. Columns: ``image_id`` (index ``i``), ``Image_label``
        and ``Image_path`` (label and path of image ``i``),
        ``Duplicate_image`` and ``Duplicate_path`` (label and path of image
        ``j``), and ``similarity_score``.

    Notes
    -----
    The full N x N similarity matrix is held in memory.
    """
    columns = ['image_id','Image_label','Image_path','Duplicate_image','Duplicate_path','similarity_score']

    # Read each array exactly once. When `feature` is an .npz archive every
    # key lookup loads the array from the file again, so doing the lookups
    # inside the pair loop repeated that work for every image.
    embeddings = np.asarray(feature['arr_0'])
    labels = np.asarray(feature['arr_1'])
    paths = np.asarray(feature['arr_2'])

    # Use only as many entries as all three arrays share.
    n_images = min(len(embeddings), len(labels), len(paths))
    if n_images < 2:
        # No pair can be formed.
        return pd.DataFrame([], columns=columns)

    # One call gives the similarity of every image to every other image;
    # each embedding is flattened to a single row first.
    flat_embeddings = embeddings[:n_images].reshape(n_images, -1)
    similarity = cosine_similarity(flat_embeddings)

    # Keep only the strict upper triangle (j > i) so each pair appears once
    # and an image is never matched with itself. nonzero() walks the matrix
    # row by row, which gives the i-then-j ordering.
    is_match = np.triu(similarity > matching_cutoff_threshold, k=1)
    first, second = np.nonzero(is_match)

    # Build the frame in one go rather than appending row by row.
    return pd.DataFrame(
        {
            'image_id': first,
            'Image_label': labels[first],
            'Image_path': paths[first],
            'Duplicate_image': labels[second],
            'Duplicate_path': paths[second],
            'similarity_score': similarity[first, second],
        },
        columns=columns,
    )

In [14]:
# Find all image pairs above the default similarity cutoff (0.65) using the
# embeddings archive opened above, then display the resulting table.
df_dup = get_duplicate_images_df(data)
df_dup

Unnamed: 0,image_id,Image_label,Image_path,Duplicate_image,Duplicate_path,similarity_score
0,0,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (2).jpg,0.690369
1,1,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (2).jpg,0.686020
2,1,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (6).jpg,0.738119
3,1,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (9).jpg,0.684661
4,2,ben_afflek,./dataset/ben_afflek/train_ben_afflek (11).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (12).jpg,0.799701
...,...,...,...,...,...,...
317,86,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (5).jpg,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (7).jpg,0.841557
318,86,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (5).jpg,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (8).jpg,0.730636
319,87,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (6).jpg,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (7).jpg,0.881841
320,87,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (6).jpg,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (8).jpg,0.680587


In [15]:
# Optional export of the results, left disabled. The results table in this
# notebook is named `df_dup` (there is no `df_duplicate` variable):
#df_dup.to_csv('Duplicate_images.csv')