In [1]:
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import pandas as pd

In [2]:
# Raise pandas' display limits (rows, columns, line width) for the tables shown below.
pd.options.display.max_rows = 2200
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
from numpy import load
import numpy as  np
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

## Step 1: Localizing & Extracting Faces

In [4]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment of this run is visible in the output.
print(tf.__version__)

2.5.0


In [5]:
from os import listdir
from os.path import isdir
from os.path import join
from PIL import Image
from matplotlib import pyplot
from numpy import savez_compressed
from numpy import asarray
from mtcnn.mtcnn import MTCNN

In [6]:
# extract a single face from a given photograph
def extract_face(filename, required_size = (160, 160), detector = None):
    """Detect the first face in an image file and return it as a resized RGB array.

    Parameters
    ----------
    filename : path of the image file to open.
    required_size : (width, height) passed to ``Image.resize`` for the cropped face.
    detector : optional, already constructed MTCNN instance. When omitted a new
        detector with default weights is created for this call; passing one in
        lets a caller reuse a single detector across many images.

    Returns
    -------
    Array of the cropped face, resized to ``required_size``.

    Raises
    ------
    IndexError
        If the detector finds no face in the image.
    """
    # Load the pixels inside a context manager so the file handle is released
    # as soon as the RGB copy has been made.
    with Image.open(filename) as image:
        pixels = asarray(image.convert('RGB'))

    if detector is None:
        # create the detector, using default weights
        detector = MTCNN()

    # detect faces in the image
    results = detector.detect_faces(pixels)
    if not results:
        raise IndexError('no face detected in %s' % (filename,))

    # bounding box of the first detected face
    x1, y1, width, height = results[0]['box']
    # Compute the far corner from the raw box first, then clamp the near corner
    # to the image. The detector may report negative coordinates; taking abs()
    # of them would shift the crop window instead of clipping it.
    x2, y2 = x1 + width, y1 + height
    x1, y1 = max(x1, 0), max(y1, 0)

    # extract the face
    face = pixels[y1:y2, x1:x2]

    # resize pixels to the model size
    resized = Image.fromarray(face).resize(required_size)
    return asarray(resized)

# load images and extract faces for all images in a directory
def load_faces(directory):
    """Extract one face from every entry of ``directory``.

    Parameters
    ----------
    directory : folder whose entries are all passed to ``extract_face``.
        A trailing path separator is optional.

    Returns
    -------
    (faces, filenames_list) : two parallel lists holding the face array and
        the path of each image, in sorted file-name order.

    Any exception raised by ``extract_face`` propagates to the caller.
    """
    faces = []
    filenames_list = []  # NJ: a list to store all the image file names

    # listdir() returns entries in arbitrary order; sorting keeps the positions
    # (and therefore any ids derived from them) stable between runs.
    for filename in sorted(listdir(directory)):
        # join() inserts the separator only when `directory` lacks one, so the
        # resulting path is unchanged for callers that pass a trailing slash.
        path = join(directory, filename)
        faces.append(extract_face(path))
        filenames_list.append(path)

    return faces, filenames_list

# load & extract faces for a dataset that contains one subdir for each class that in turn contains images
def load_dataset(directory):
    """Load the faces of every class sub-directory found under ``directory``.

    Parameters
    ----------
    directory : root folder; each sub-directory is treated as one class and its
        name is used as the label. A trailing path separator is optional.

    Returns
    -------
    (X, y, path_list) : three arrays built with ``asarray`` holding the face
        arrays, the class label of each face and the path of each source image.
    """
    X, y, path_list = [], [], []

    # Sorted so that class order does not depend on the arbitrary order of listdir().
    for subdir in sorted(listdir(directory)):
        # The trailing '' makes join() end the path with a separator, which
        # load_faces may rely on when it builds file paths.
        path = join(directory, subdir, '')
        # skip any files that might be in the dir
        if not isdir(path):
            continue

        # load all faces in the subdirectory
        faces, filenames_list = load_faces(path)

        # one label per extracted face
        labels = [subdir] * len(faces)

        # summarize progress
        print ('>loaded %d example for class %s' % (len(faces), subdir))

        # store
        X.extend(faces)
        y.extend(labels)
        path_list.extend(filenames_list) # to get the actual image names

    return asarray(X), asarray(y), asarray(path_list)

## Face Extraction from the Dataset

In [7]:
# load train dataset
# Root folder that holds one sub-directory per class.
dataset_train_path = './dataset/train/'

# load_dataset returns a tuple: (face arrays, class labels, image paths).
faces_dataset = load_dataset(dataset_train_path)

>loaded 14 example for class ben_afflek
>loaded 16 example for class elton_john
>loaded 21 example for class jerry_seinfeld
>loaded 19 example for class madonna
>loaded 21 example for class mindy_kaling


In [8]:
# Cache faces, labels and paths on disk; the arrays are passed positionally and
# are read back below under the keys 'arr_0', 'arr_1' and 'arr_2'.
savez_compressed('./faces-train_dataset.npz', faces_dataset[0], faces_dataset[1], faces_dataset[2])

In [9]:
# load the face dataset
# The object returned by load() for an .npz file keeps the archive open; the
# context manager closes it once the three arrays have been read into memory.
with load('./faces-train_dataset.npz') as data:
    trainX, trainy, train_images_path = data['arr_0'], data['arr_1'], data['arr_2']

print('Loaded Train dataset: ', trainX.shape, trainy.shape, train_images_path.shape)

Loaded Train dataset:  (91, 160, 160, 3) (91,) (91,)


## Feature Extraction for the Faces Extracted from the Dataset

In [10]:
from numpy import load
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet

# FaceNet wrapper from keras_facenet; its embeddings() method is applied to the face arrays below.
embedder = FaceNet()

In [11]:
# Embed the extracted faces (presumably one embedding row per face in trainX - confirm against keras_facenet).
features_dataset = embedder.embeddings(trainX)

In [12]:
# save arrays to one file in compressed format
# Positional order is embeddings, labels, paths; they are read back below as 'arr_0', 'arr_1', 'arr_2'.
savez_compressed('./faces-train_embeddings.npz', features_dataset, trainy, train_images_path)

In [13]:
# load dataset
# `data` is kept (not closed) because it is passed to get_duplicate_images_df further down.
data = load('./faces-train_embeddings.npz')
trainX_features, trainy, train_images_path = data['arr_0'], data['arr_1'], data['arr_2']

In [14]:
def get_duplicate_images_df(feature, matching_cutoff_threshold = 0.65):
    """List every pair of images whose embeddings are more similar than a cutoff.

    Parameters
    ----------
    feature : mapping indexed with the keys 'arr_0' (embeddings), 'arr_1'
        (labels) and 'arr_2' (image paths), e.g. the archive returned by
        ``numpy.load`` or a plain dict of arrays.
    matching_cutoff_threshold : a pair is reported only when its cosine
        similarity is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame with one row per reported pair (i, j), i < j, ordered by i
    and then j. ``image_id`` is i; ``Image_label`` / ``Image_path`` describe
    image i and ``Duplicate_image`` / ``Duplicate_path`` hold the label and
    path of image j.
    """
    columns = ['image_id','Image_label','Image_path','Duplicate_image','Duplicate_path','similarity_score']

    # Look each array up exactly once: when `feature` is an .npz archive every
    # lookup loads the array again, so this must not happen inside a loop.
    embeddings = np.asarray(feature['arr_0'])
    labels = np.asarray(feature['arr_1'])
    paths = np.asarray(feature['arr_2'])

    # Use the common length of the three arrays, like zip() would.
    n_images = min(len(embeddings), len(labels), len(paths))
    if n_images < 2:
        # No pair can be formed.
        return pd.DataFrame([], columns=columns)

    # One flat row per image, then a single call for the whole similarity
    # matrix instead of one call per pair. The matrix has n_images x n_images
    # entries, so memory grows quadratically with the number of images.
    embeddings = embeddings[:n_images].reshape(n_images, -1)
    similarity = cosine_similarity(embeddings)

    # Index pairs strictly above the diagonal: each unordered pair once, never
    # an image with itself, in row-major order (first index, then second).
    first, second = np.triu_indices(n_images, k=1)
    scores = similarity[first, second]
    keep = scores > matching_cutoff_threshold
    first, second, scores = first[keep], second[keep], scores[keep]

    # Build the frame in one step rather than appending row by row.
    return pd.DataFrame({
        'image_id': first,
        'Image_label': labels[first],
        'Image_path': paths[first],
        'Duplicate_image': labels[second],
        'Duplicate_path': paths[second],
        'similarity_score': scores,
    }, columns=columns)

In [15]:
# Pairs above the function's default similarity cutoff (0.65), computed from the saved embeddings archive.
df_dup = get_duplicate_images_df(data)

In [16]:
# Display the detected pairs.
df_dup

Unnamed: 0,image_id,Image_label,Image_path,Duplicate_image,Duplicate_path,similarity_score
0,0,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.690369
1,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.68602
2,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (6...,0.738119
3,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (9...,0.684661
4,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,0.799701
5,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,0.687443
6,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.686731
7,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (8...,0.690363
8,3,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (4...,0.654843
9,3,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (6...,0.728084


In [17]:
# get the similarity between two image features i.e. image embedding vector 
# cosine_similarity(trainX_features[0].reshape(1, -1),trainX_features[90].reshape(1, -1))

In [18]:
# Same frame as displayed earlier; df_dup has not been modified in between.
df_dup

Unnamed: 0,image_id,Image_label,Image_path,Duplicate_image,Duplicate_path,similarity_score
0,0,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.690369
1,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.68602
2,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (6...,0.738119
3,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (9...,0.684661
4,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,0.799701
5,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,0.687443
6,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.686731
7,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (8...,0.690363
8,3,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (4...,0.654843
9,3,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (6...,0.728084


In [19]:
# Number of reported pairs per first-image id (each pair is counted only under its lower index).
df_dup['image_id'].value_counts()

30    18
70    14
32    14
35    13
33    13
36    12
31    12
73    12
39    10
77    10
40    10
38    10
34    10
79     9
51     8
80     8
81     7
78     6
44     6
43     6
41     6
82     5
83     5
45     5
42     4
46     4
19     4
37     4
2      4
6      4
47     3
57     3
86     3
1      3
16     3
22     3
74     3
72     3
85     3
71     2
64     2
62     2
5      2
17     2
18     2
48     2
4      2
8      2
20     2
10     2
3      2
23     2
87     2
75     1
0      1
66     1
65     1
54     1
53     1
49     1
25     1
24     1
21     1
12     1
11     1
9      1
88     1
Name: image_id, dtype: int64

# Single function

In [20]:
def getDuplicateImages(images_folder, matching_cutoff_threshold = 0.65):
    """Run the whole pipeline on a folder: extract faces, embed them, report similar pairs.

    Parameters
    ----------
    images_folder : dataset root passed to ``load_dataset`` (one sub-directory per class).
    matching_cutoff_threshold : a pair is reported only when its cosine
        similarity is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame with the columns image_id, Image_label, Image_path,
    Duplicate_image, Duplicate_path and Similarity_score, one row per pair.
    """
    columns = ['image_id','Image_label','Image_path','Duplicate_image','Duplicate_path','Similarity_score']

    face, name, images_path = load_dataset(images_folder)
    if len(face) < 2:
        # No pair can be formed, so skip loading the embedding model.
        return pd.DataFrame([], columns=columns)

    embedder = FaceNet()
    feature_vector = embedder.embeddings(face)

    # Reuse the pairwise comparison defined earlier in this notebook instead of
    # repeating it; it only needs the three arrays under its 'arr_N' keys.
    pairs = get_duplicate_images_df(
        {'arr_0': feature_vector, 'arr_1': name, 'arr_2': images_path},
        matching_cutoff_threshold,
    )
    # This function reports the score column with a capital 'S'.
    return pairs.rename(columns={'similarity_score': 'Similarity_score'})

In [21]:
# Root folder that holds one sub-directory per class.
dataset_train_path = './dataset/train/'

# Example with a stricter cutoff: getDuplicateImages(dataset_train_path, 0.8)
# The call below uses the default cutoff of 0.65.
df_duplicate = getDuplicateImages(dataset_train_path)

>loaded 14 example for class ben_afflek
>loaded 16 example for class elton_john
>loaded 21 example for class jerry_seinfeld
>loaded 19 example for class madonna
>loaded 21 example for class mindy_kaling


In [22]:
# Display the pairs found by the single-function pipeline.
df_duplicate

Unnamed: 0,image_id,Image_label,Image_path,Duplicate_image,Duplicate_path,Similarity_score
0,0,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.690369
1,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.68602
2,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (6...,0.738119
3,1,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (9...,0.684661
4,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,0.799701
5,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,0.687443
6,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (2...,0.686731
7,2,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (8...,0.690363
8,3,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (4...,0.654843
9,3,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (1...,ben_afflek,./dataset/train/ben_afflek/train_ben_afflek (6...,0.728084


In [23]:
# Export the pairs; with to_csv's defaults the row index is written as the first, unnamed column.
df_duplicate.to_csv('Duplicate_image.csv')