In [1]:
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import pandas as pd

In [2]:
# Raise pandas' display limits so long/wide result tables are rendered in full.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 2200,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [3]:
from numpy import load
import numpy as  np
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

## Localizing & Extracting Faces

In [4]:
import tensorflow as tf
# Show which TensorFlow version this kernel is running.
print(tf.__version__)

2.5.0


In [5]:
from os import listdir
from os.path import isdir
from PIL import Image
from matplotlib import pyplot
from numpy import savez_compressed
from numpy import asarray
from mtcnn.mtcnn import MTCNN

In [6]:
#extract a single face from a given photograph
def extract_face(filename, required_size = (160, 160), detector = None):
    """Detect the first face in an image file and return it as a resized RGB array.

    Parameters
    ----------
    filename : path (or file object) accepted by ``PIL.Image.open``.
    required_size : size tuple handed to ``Image.resize``; defaults to (160, 160).
    detector : optional object exposing ``detect_faces(pixels)``. When omitted,
        a single ``MTCNN()`` instance is created on first use and reused by
        later calls instead of being rebuilt for every image.

    Returns
    -------
    numpy array containing the resized RGB crop of the first detected face.

    Raises
    ------
    IndexError
        If the detector reports no face. The original code raised a bare
        IndexError from ``results[0]``; the exception type is kept so existing
        handlers still work, but the message now names the offending file.
    """
    # load the image and force three RGB channels; the context manager
    # releases the file handle as soon as the pixels have been copied out
    with Image.open(filename) as image:
        pixels = asarray(image.convert('RGB'))

    if detector is None:
        # avoid re-creating the detector for every image: keep one instance
        # cached on the function object and reuse it
        detector = getattr(extract_face, '_detector', None)
        if detector is None:
            detector = extract_face._detector = MTCNN()

    # detect faces in the image
    results = detector.detect_faces(pixels)
    if not results:
        raise IndexError('no face detected in %r' % (filename,))

    # bounding box of the first reported face
    x1, y1, width, height = results[0]['box']
    x2, y2 = x1 + width, y1 + height
    # The detector can report a box that starts outside the image (negative
    # origin). Clamp the origin to the image edge; abs() would instead move
    # the whole crop away from the detected face.
    x1, y1 = max(x1, 0), max(y1, 0)

    # crop the face and resize it to the size the embedding model expects
    face = pixels[y1:y2, x1:x2]
    resized = Image.fromarray(face).resize(required_size)
    return asarray(resized)

#load images and extract faces for all images in a directory
def load_faces(directory):
    """Extract one face from every entry of ``directory``.

    Parameters
    ----------
    directory : path of the folder to scan, with or without a trailing
        separator.

    Returns
    -------
    (faces, filenames_list) : two parallel lists holding, for each entry, the
        value returned by ``extract_face`` and the path that was passed to it.
    """
    # Plain concatenation only works when the directory ends with a separator;
    # add one when it is missing so 'dir' and 'dir/' behave the same.
    base = directory if directory.endswith(('/', '\\')) else directory + '/'

    faces = []
    filenames_list = []  # paths of the images, parallel to `faces`

    # listdir() returns entries in arbitrary order; sorting keeps the row
    # indices of the resulting dataset reproducible between runs.
    for filename in sorted(listdir(directory)):
        path = base + filename
        faces.append(extract_face(path))
        filenames_list.append(path)

    return faces, filenames_list

#load & extract faces for a dataset that contains one subdir for each class that in turn contains images
def load_dataset(directory):
    """Load faces from a dataset laid out as one sub-directory per class.

    Parameters
    ----------
    directory : root folder of the dataset, with or without a trailing
        separator. Plain files directly inside it are ignored.

    Returns
    -------
    (X, y, path_list) : three numpy arrays built with ``asarray`` —
        the faces returned by ``load_faces``, the class label (sub-directory
        name) of each face, and the path of each source image.
    """
    # Tolerate a missing trailing separator instead of silently building
    # paths such as './datasetben_afflek/'.
    base = directory if directory.endswith(('/', '\\')) else directory + '/'

    X, y, path_list = [], [], []

    # listdir() order is arbitrary; sort so class order is reproducible
    for subdir in sorted(listdir(directory)):
        path = base + subdir + '/'
        # skip any files that might be in the dir
        if not isdir(path):
            continue

        # load all faces in the subdirectory
        faces, filenames_list = load_faces(path)

        # summarize progress
        print ('>loaded %d example for class %s' % (len(faces), subdir))

        X.extend(faces)
        y.extend([subdir] * len(faces))  # one label per extracted face
        path_list.extend(filenames_list)  # to get the actual image names

    return asarray(X), asarray(y), asarray(path_list)

## Faces Extraction from dataset

In [7]:
# load train dataset
dataset_path = './dataset/'

# load_dataset returns (face arrays, class labels, image paths). Note that the
# variable called `filename` therefore receives the class label (sub-directory
# name) of each face, not a file name; the file paths are in `images_path`.
faces, filename, images_path = load_dataset(dataset_path)

>loaded 11 example for class ben_afflek
>loaded 7 example for class elton_john
>loaded 20 example for class jerry_seinfeld
>loaded 9 example for class madonna
>loaded 19 example for class mindy_kaling


In [8]:
# Cache the extracted faces on disk. The arrays are passed positionally, so
# they are read back below under the keys arr_0, arr_1 and arr_2.
savez_compressed('./faces_dataset.npz', faces, filename, images_path)

In [9]:
# load the face dataset
faces_dataset = load('./faces_dataset.npz')

# arr_0 / arr_1 / arr_2 are, in order, the face crops, the class labels
# (kept in the variable `filename`) and the image paths saved above.
faces, filename, images_path = faces_dataset['arr_0'], faces_dataset['arr_1'], faces_dataset['arr_2'] #, data['arr_3']
# Print the shapes as a quick check that the three arrays line up.
print('Loaded Train dataset: ', faces.shape, filename.shape, images_path.shape)

Loaded Train dataset:  (66, 160, 160, 3) (66,) (66,)


## Feature Extraction for the Faces extracted from dataset

In [10]:
from numpy import load
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet

# Create the keras_facenet embedder used below to turn face crops into embeddings.
embedder = FaceNet()

In [11]:
# Compute the embeddings for the extracted faces; `features_dataset` is later
# indexed by row position, in the same order as `faces`.
features_dataset = embedder.embeddings(faces)

In [12]:
# save arrays to one file in compressed format
# (positional arrays: embeddings -> arr_0, class labels -> arr_1, image paths -> arr_2)
savez_compressed('./faces_embeddings.npz', features_dataset, filename, images_path)

## Finding Duplicate Images

In [13]:
# Load the saved embeddings. `data` (the opened archive) is what gets passed to
# get_duplicate_images_df1 later; the three arrays are also unpacked here:
# embeddings, class labels (in `filename`) and image paths.
data = load('./faces_embeddings.npz')
features_dataset, filename, images_path = data['arr_0'], data['arr_1'], data['arr_2'] #, data['arr_3']

In [17]:
# One row per embedding:
#   Feature_vec - row index into `features_dataset`
#   Filename    - class label of the image (the variable `filename` holds labels)
#   Image_path  - path of the source image
# The index is sized from `features_dataset`, which is loaded in the cell just
# above, rather than from `faces`, which only exists if the face-extraction
# cells were run in the same kernel session.
df_feature = pd.DataFrame({'Feature_vec': list(range(features_dataset.shape[0])),
                           'Filename': filename,
                           'Image_path': images_path})

# Number of images available for each label.
d = (
    df_feature.groupby('Filename')
    .agg(original_count=('Feature_vec', 'count'))
    .reset_index()
)
d.columns = ['Image_label', 'original_count']

# Keep only the first image of every label; these rows act as the reference
# images that the duplicate search compares everything else against.
df_feature = df_feature.groupby('Filename').head(1).reset_index(drop=True)

In [18]:
# def get_duplicate_images_df(feature, matching_cutoff_threshold = 0.65):
#     img_dup = pd.DataFrame([],columns=['image_id','Image_label','Image_path','Duplicate_image','Duplicate_path','similarity_score'])
#     for id1,feature1 in enumerate(zip(feature['arr_0'], feature['arr_1'], feature['arr_2'])):
#         for id2,feature2 in enumerate(zip(feature['arr_0'], feature['arr_1'], feature['arr_2'])):
            
#             if id2>id1 and feature1[1]==feature2[1]:
#                 image_1,label_1,path_1 = feature1[0],feature1[1],feature1[2]
#                 image_2,label_2,path_2 = feature2[0],feature2[1],feature2[2]

#                 similarity_score = cosine_similarity(image_1.reshape(1, -1),
#                                                      image_2.reshape(1, -1))
#                 if similarity_score > matching_cutoff_threshold:
#                     img_dup.loc[len(img_dup)]= [id1,label_1,path_1,label_2,path_2,similarity_score[0][0]]
#     return img_dup

In [19]:
def get_duplicate_images_df1(feature, matching_cutoff_threshold = 0.65, reference_df = None):
    """Find images whose embedding is close to a reference image of the same label.

    Parameters
    ----------
    feature : mapping (e.g. the object returned by ``numpy.load`` on the
        embeddings archive) with keys 'arr_0' (embeddings, one row per image),
        'arr_1' (labels) and 'arr_2' (image paths).
    matching_cutoff_threshold : an image is reported when its cosine similarity
        to the reference is strictly greater than this value.
    reference_df : DataFrame with columns 'Feature_vec' (row index into the
        embeddings), 'Filename' (label) and 'Image_path'. Each row is one
        reference image. When omitted, the notebook-level ``df_feature`` is
        used, which is what the original implementation always did.

    Returns
    -------
    DataFrame with columns image_id, Image_label, Image_path, Duplicate_image,
    Duplicate_path, similarity_score; one row per (reference, match) pair.
    ``image_id`` is the row index of the matched image in the embeddings.
    """
    if reference_df is None:
        # backward compatible default: the frame built earlier in the notebook
        reference_df = df_feature

    # Pull each array out of the archive exactly once; indexing a loaded .npz
    # inside the loop would reload the arrays for every reference row.
    embeddings = np.asarray(feature['arr_0'])
    labels = np.asarray(feature['arr_1'])
    paths = np.asarray(feature['arr_2'])

    # Flatten every embedding to a vector and L2-normalise it, so that cosine
    # similarity becomes a plain dot product. Zero vectors keep a divisor of 1,
    # which gives them similarity 0 instead of a division by zero.
    flat = embeddings if embeddings.ndim == 2 else embeddings.reshape(len(embeddings), -1)
    norms = np.linalg.norm(flat, axis=1, keepdims=True)
    norms[norms == 0] = 1
    unit = flat / norms

    columns = ['image_id', 'Image_label', 'Image_path',
               'Duplicate_image', 'Duplicate_path', 'similarity_score']
    records = []
    for _, row in reference_df.iterrows():
        label_1 = row['Filename']
        path_1 = row['Image_path']

        # similarity of the reference image to every image at once
        scores = unit @ unit[int(row['Feature_vec'])]

        # same label, different file, similar enough
        is_match = (labels == label_1) & (paths != path_1) & (scores > matching_cutoff_threshold)
        for id2 in np.flatnonzero(is_match):
            records.append([int(id2), label_1, path_1, labels[id2], paths[id2], scores[id2]])

    # build the frame once instead of appending row by row
    return pd.DataFrame(records, columns=columns)

In [20]:
# Run the duplicate search on the loaded embeddings archive with a similarity
# cutoff of 0.4 (lower than the function's 0.65 default), then display the result.
df_dup = get_duplicate_images_df1(data,0.4)
df_dup

Unnamed: 0,image_id,Image_label,Image_path,Duplicate_image,Duplicate_path,similarity_score
0,1,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (11).jpg,0.603689
1,2,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (12).jpg,0.62617
2,3,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (14).jpg,0.502282
3,4,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (2).jpg,0.68602
4,6,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (4).jpg,0.638228
5,7,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (6).jpg,0.738119
6,8,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (7).jpg,0.589024
7,9,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (8).jpg,0.599612
8,10,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (9).jpg,0.684661
9,12,elton_john,./dataset/elton_john/train_elton_john (11).jpg,elton_john,./dataset/elton_john/train_elton_john (13).jpg,0.663875


In [21]:
# Show only the paths of the images flagged as duplicates.
df_dup[['Duplicate_path']]

Unnamed: 0,Duplicate_path
0,./dataset/ben_afflek/train_ben_afflek (11).jpg
1,./dataset/ben_afflek/train_ben_afflek (12).jpg
2,./dataset/ben_afflek/train_ben_afflek (14).jpg
3,./dataset/ben_afflek/train_ben_afflek (2).jpg
4,./dataset/ben_afflek/train_ben_afflek (4).jpg
5,./dataset/ben_afflek/train_ben_afflek (6).jpg
6,./dataset/ben_afflek/train_ben_afflek (7).jpg
7,./dataset/ben_afflek/train_ben_afflek (8).jpg
8,./dataset/ben_afflek/train_ben_afflek (9).jpg
9,./dataset/elton_john/train_elton_john (13).jpg


In [22]:
# Optional export of the duplicate table (the frame is named df_dup in this notebook):
# df_dup.to_csv('Duplicate_images.csv')

In [23]:
# Per label, count the rows of df_dup, i.e. how many images were flagged as
# duplicates; the label comes back as a regular column.
result = (
    df_dup.groupby('Image_label')['Duplicate_path']
    .count()
    .rename('no_of_duplicate')
    .reset_index()
)

In [24]:
# Put the duplicate count next to the total image count of each label.
# NOTE(review): pd.merge defaults to an inner join, so a label with no flagged
# duplicates (absent from `result`) is dropped from this summary — confirm
# that this is intended.
result = pd.merge(result, d, on='Image_label')
result

Unnamed: 0,Image_label,no_of_duplicate,original_count
0,ben_afflek,9,11
1,elton_john,6,7
2,jerry_seinfeld,19,20
3,madonna,8,9
4,mindy_kaling,17,19
