In [12]:
# Presumably suppresses TensorFlow's console logging (going by the package
# name); it is called here before TensorFlow itself is imported further down.
from silence_tensorflow import silence_tensorflow
silence_tensorflow()

In [35]:
# Widen pandas' display limits so the duplicate-image tables print in full.
# pandas is imported in this cell because, read top to bottom, it comes before
# the notebook's main import cell; without the import a fresh
# "Restart Kernel -> Run All" stops here with NameError on `pd`.
import pandas as pd

pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [13]:
from numpy import load
import numpy as  np
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

## Localizing & Extracting Faces

In [14]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment used for this run
# is on record next to the results.
print(tf.__version__)

2.5.0


In [15]:
from os import listdir
from os.path import isdir
from PIL import Image
from matplotlib import pyplot
from numpy import savez_compressed
from numpy import asarray
from mtcnn.mtcnn import MTCNN

In [16]:
#extract a single face from a given photograph
def extract_face(filename, required_size = (160, 160), detector = None):
    """Detect the first face in an image file and return it as a resized array.

    Parameters
    ----------
    filename : str
        Path of the image to open with PIL.
    required_size : tuple
        Size passed to ``Image.resize`` for the cropped face.
    detector : object, optional
        Any object exposing ``detect_faces(pixels)`` (e.g. an ``MTCNN``
        instance). When omitted, one ``MTCNN`` is created on first use and
        reused by later calls instead of being rebuilt for every image.

    Returns
    -------
    numpy.ndarray
        Pixel array of the first detected face, cropped from the RGB image
        and resized to ``required_size``.

    Raises
    ------
    IndexError
        If the detector finds no face. The exception type is unchanged from
        the original ``results[0]`` failure, but the message now names the
        offending file.
    """
    # load image from file and convert to RGB; the `with` block closes the
    # underlying file handle as soon as the pixels have been copied out
    with Image.open(filename) as image:
        pixels = asarray(image.convert('RGB'))

    # create the detector once, using default weights, and cache it on the
    # function so repeated calls do not rebuild the networks
    if detector is None:
        detector = getattr(extract_face, '_shared_detector', None)
        if detector is None:
            detector = MTCNN()
            extract_face._shared_detector = detector

    # detect faces in the image
    results = detector.detect_faces(pixels)
    if not results:
        raise IndexError('no face detected in %s' % filename)

    # extract the bounding box from the first face
    x1, y1, width, height = results[0]['box']
    # the far corner is computed from the detector's own origin ...
    x2, y2 = x1 + width, y1 + height
    # ... and only then is a negative origin clamped to the image edge.
    # (abs() mirrored the origin into the image and shifted the whole crop.)
    x1, y1 = max(x1, 0), max(y1, 0)

    # extract the face
    face = pixels[y1:y2, x1:x2]
    # resize pixels to the model size
    image = Image.fromarray(face)
    image = image.resize(required_size)
    return asarray(image)

#load images and extract faces for all images in a directory
def load_faces(directory):
    """Extract one face from every file directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Folder containing the images. A trailing '/' (or '\\') is optional;
        paths built for a directory that already ends in a separator are
        identical to what plain concatenation produced before.

    Returns
    -------
    (list, list)
        The face arrays returned by ``extract_face`` and, in the same order,
        the path of the file each face came from.
    """
    # plain `directory + filename` silently produced wrong paths when the
    # caller forgot the trailing separator
    prefix = directory if directory.endswith(('/', '\\')) else directory + '/'

    faces = list()
    filenames_list = list() #NJ: A list to store all the images file names

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of faces (and everything saved from them) stable between runs
    for filename in sorted(listdir(directory)):
        path = prefix + filename
        # nested folders are not images; skip them instead of crashing
        if isdir(path):
            continue
        faces.append(extract_face(path))
        filenames_list.append(path)

    return faces, filenames_list

#load & extract faces for a dataset that contains one subdir for each class that in turn contains images
def load_dataset(directory):
    """Load every face of a dataset laid out as one sub-directory per class.

    Parameters
    ----------
    directory : str
        Dataset root. A trailing '/' (or '\\') is optional.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, numpy.ndarray)
        Face arrays, the class label (sub-directory name) of each face, and
        the path of the image each face was taken from, all in the same order.
    """
    # tolerate a root given without its trailing separator
    root = directory if directory.endswith(('/', '\\')) else directory + '/'

    X, y, path_list = list(), list(), list()

    # enumerate folders, one per class; sorted because os.listdir's order is
    # arbitrary and the saved arrays should line up the same way on every run
    for subdir in sorted(listdir(directory)):
        # keep the trailing '/' so the path can be handed straight to load_faces
        path = root + subdir + '/'
        # skip any files that might be in the dir
        if not isdir(path):
            continue
        # load all faces in the subdirectory
        faces, filenames_list = load_faces(path)

        # one label per loaded face
        labels = [subdir] * len(faces)

        # summarize progress
        print ('>loaded %d example for class %s' % (len(faces), subdir))
        # store
        X.extend(faces)
        y.extend(labels)
        path_list.extend(filenames_list) # to get the actual image names

    return asarray(X), asarray(y), asarray(path_list)

## Extracting Faces from the Dataset

In [17]:
# load train dataset
dataset_path = './dataset/'

# load_dataset returns (face arrays, class labels, image paths). Note that the
# variable named `filename` therefore holds each face's class label (the
# sub-directory name), not a file name; the file paths are in `images_path`.
faces, filename, images_path = load_dataset(dataset_path)

>loaded 14 example for class ben_afflek
>loaded 16 example for class elton_john
>loaded 21 example for class jerry_seinfeld
>loaded 19 example for class madonna
>loaded 21 example for class mindy_kaling


In [18]:
# Cache the extracted faces, labels and paths on disk. The arrays are passed
# positionally, so they are read back under the keys 'arr_0', 'arr_1', 'arr_2'.
savez_compressed('./faces_dataset.npz', faces, filename, images_path)

In [19]:
# load the face dataset
# numpy.load on an .npz returns an NpzFile that keeps the archive open and
# reads each array on access; the `with` block closes the file once the three
# arrays have been read into memory.
with load('./faces_dataset.npz') as faces_dataset:
    faces = faces_dataset['arr_0']        # face pixel arrays
    filename = faces_dataset['arr_1']     # class label of each face
    images_path = faces_dataset['arr_2']  # source image path of each face
print('Loaded Train dataset: ', faces.shape, filename.shape, images_path.shape)

Loaded Train dataset:  (91, 160, 160, 3) (91,) (91,)


## Feature Extraction for the Faces Extracted from the Dataset

In [20]:
from numpy import load
from numpy import expand_dims
from numpy import asarray
from numpy import savez_compressed
from keras.models import load_model
from keras_facenet import FaceNet

# Build the keras-facenet model wrapper; its `embeddings` method is applied to
# the extracted faces in the next cell.
embedder = FaceNet()

In [21]:
# Run the extracted faces through the embedder; presumably this yields one
# embedding vector per face, in the same order as `faces` (TODO confirm).
features_dataset = embedder.embeddings(faces)

In [22]:
# save arrays to one file in compressed format
# (positional arguments are stored as 'arr_0' = embeddings, 'arr_1' = labels,
#  'arr_2' = image paths)
savez_compressed('./faces_embeddings.npz', features_dataset, filename, images_path)

## Finding Duplicate Images

In [23]:
#load dataset
# `data` (the opened archive) is kept as a variable because it is later passed
# whole to get_duplicate_images_df1, which indexes it by 'arr_0'/'arr_1'/'arr_2'.
data = load('./faces_embeddings.npz')
features_dataset, filename, images_path = data['arr_0'], data['arr_1'], data['arr_2']

In [44]:
def get_duplicate_images_df(feature, matching_cutoff_threshold = 0.65):
    """List every pair of same-label images whose embeddings are similar.

    Parameters
    ----------
    feature : mapping
        Anything indexable by 'arr_0' (embeddings, one per image), 'arr_1'
        (labels) and 'arr_2' (image paths) - e.g. the object returned by
        ``numpy.load`` on the embeddings archive, or a plain dict.
    matching_cutoff_threshold : float
        A pair is reported when its cosine similarity is strictly greater
        than this value.

    Returns
    -------
    pandas.DataFrame
        One row per reported pair (i, j) with i < j, ordered by i then j.
        'image_id' is i; the 'Image_*' columns describe image i and the
        'Duplicate_*' columns describe image j.
    """
    columns = ['image_id','Image_label','Image_path','Duplicate_image','Duplicate_path','similarity_score']

    # Read each array exactly once. The original indexed `feature` inside the
    # nested loops, which for an .npz archive re-reads the array from disk on
    # every access.
    vectors = np.asarray(feature['arr_0'], dtype=float)
    labels = np.asarray(feature['arr_1'])
    paths = np.asarray(feature['arr_2'])

    if len(vectors) == 0:
        return pd.DataFrame([], columns=columns)

    # Flatten each embedding to one row, then L2-normalise so a single matrix
    # product yields all pairwise cosine similarities. Zero vectors keep a
    # similarity of 0 rather than dividing by zero.
    vectors = vectors.reshape(len(vectors), -1)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = vectors / norms
    similarity = unit @ unit.T

    # Upper triangle (j > i) of the same-label matrix: each unordered pair is
    # considered once and an image is never compared with itself.
    same_label_pairs = np.triu(labels[:, None] == labels[None, :], k=1)
    first, second = np.nonzero(same_label_pairs & (similarity > matching_cutoff_threshold))

    # Build the frame in one go instead of growing it row by row with .loc.
    return pd.DataFrame({
        'image_id': first,
        'Image_label': labels[first],
        'Image_path': paths[first],
        'Duplicate_image': labels[second],
        'Duplicate_path': paths[second],
        'similarity_score': similarity[first, second],
    }, columns=columns)

In [82]:
def get_duplicate_images_df1(feature, matching_cutoff_threshold = 0.65, reference_df = None):
    """Compare reference images against the other images carrying the same label.

    Parameters
    ----------
    feature : mapping
        Anything indexable by 'arr_0' (embeddings, one per image), 'arr_1'
        (labels) and 'arr_2' (image paths).
    matching_cutoff_threshold : float
        A candidate is reported when its cosine similarity to the reference
        is strictly greater than this value.
    reference_df : pandas.DataFrame, optional
        Reference images, one per row, with columns 'Feature_vec' (row index
        into the embeddings), 'Filename' (label) and 'Image_path'. When
        omitted, the notebook-level ``df_feature`` is used, which is what the
        function always read before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per reported candidate. 'image_id' is the candidate's index in
        the embeddings; the 'Image_*' columns describe the reference and the
        'Duplicate_*' columns describe the candidate.
    """
    columns = ['image_id','Image_label','Image_path','Duplicate_image','Duplicate_path','similarity_score']

    if reference_df is None:
        # backward-compatible fallback to the global frame built elsewhere in
        # the notebook; pass `reference_df` explicitly to avoid the hidden state
        reference_df = df_feature

    # Read each array once instead of re-indexing `feature` for every
    # reference row (an .npz archive re-reads the array on each access).
    vectors = np.asarray(feature['arr_0'], dtype=float)
    labels = np.asarray(feature['arr_1'])
    paths = np.asarray(feature['arr_2'])

    records = []
    if len(vectors):
        # L2-normalise once; a dot product with a normalised row is then the
        # cosine similarity. Zero vectors score 0 instead of dividing by zero.
        vectors = vectors.reshape(len(vectors), -1)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        unit = vectors / norms

        for ref_index, ref_label, ref_path in zip(reference_df['Feature_vec'],
                                                  reference_df['Filename'],
                                                  reference_df['Image_path']):
            scores = unit @ unit[ref_index]
            # same label, a different file, and similar enough
            keep = (labels == ref_label) & (paths != ref_path) & (scores > matching_cutoff_threshold)
            for dup_index in np.nonzero(keep)[0]:
                records.append([int(dup_index), ref_label, ref_path,
                                labels[dup_index], paths[dup_index], scores[dup_index]])

    # build the frame once rather than appending with .loc inside the loop
    return pd.DataFrame(records, columns=columns)

In [86]:
# Compare each row of df_feature against the other images with the same label,
# using a 0.4 cut-off instead of the function's default of 0.65.
# NOTE(review): get_duplicate_images_df1 reads the global `df_feature`, which is
# only built in a cell further down, so this cell fails on a fresh top-to-bottom run.
df_dup = get_duplicate_images_df1(data,0.4)
df_dup

Unnamed: 0,image_id,Image_label,Image_path,Duplicate_image,Duplicate_path,similarity_score
0,1,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (10).jpg,0.645257
1,2,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (11).jpg,0.497924
2,3,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (12).jpg,0.481078
3,4,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (13).jpg,0.475466
4,5,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (14).jpg,0.535787
5,6,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (2).jpg,0.690369
6,8,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (4).jpg,0.619748
7,9,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (5).jpg,0.460599
8,10,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (6).jpg,0.636806
9,11,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg,ben_afflek,./dataset/ben_afflek/train_ben_afflek (7).jpg,0.465942


In [26]:
#df_duplicate.to_csv('Duplicate_images.csv')

In [116]:
# Number of reported duplicate rows per label, with the label turned back into
# an ordinary column.
result = (
    df_dup
    .groupby('Image_label')
    .agg(no_of_duplicate=('Duplicate_path', 'count'))
    .reset_index()
)

In [122]:
result

Unnamed: 0,Image_label,no_of_duplicate
0,ben_afflek,12
1,elton_john,11
2,jerry_seinfeld,20
3,madonna,15
4,mindy_kaling,19


In [126]:
# Attach each label's original image count (frame `d`) to its duplicate count.
# NOTE(review): `d` is built and renamed in cells further down, and this cell
# overwrites `result` with a merge of itself, so it depends on execution order
# and is not safe to run twice (a second run would merge `original_count` again).
result = pd.merge(result, d, on='Image_label')

In [127]:
result

Unnamed: 0,Image_label,no_of_duplicate,original_count
0,ben_afflek,12,14
1,elton_john,11,16
2,jerry_seinfeld,20,21
3,madonna,15,19
4,mindy_kaling,19,21


In [51]:
# Same per-label duplicate count that is stored in `result`, displayed here
# with the label left as the index.
df_dup.groupby('Image_label').agg(
    no_of_duplicate = ('Duplicate_path','count') )

In [118]:
# One row per embedding: its row index in features_dataset, its class label
# (held in the variable `filename`) and the path of its source image.
# The index range follows the data instead of the hard-coded 91, so the cell
# keeps working when the dataset size changes.
df_feature = pd.DataFrame({'Feature_vec':list(range(len(filename))),
              'Filename' :filename,
              'Image_path':images_path})

# Images per label. This must be computed before df_feature is reduced below,
# otherwise every count would be 1.
d = (
    df_feature
    .groupby('Filename')
    .agg(original_count=('Feature_vec', 'count'))
    .reset_index()
)

# Keep only the first image of each label; these rows are the reference images
# that get_duplicate_images_df1 iterates over.
df_feature = df_feature.groupby('Filename').head(1).reset_index(drop=True)

In [123]:
# Rename d's label column from 'Filename' to 'Image_label' so it matches the
# key used when `d` is merged into `result`.
d.columns = ['Image_label','original_count']

In [124]:
d

Unnamed: 0,Image_label,original_count
0,ben_afflek,14
1,elton_john,16
2,jerry_seinfeld,21
3,madonna,19
4,mindy_kaling,21


In [119]:
df_feature

Unnamed: 0,Feature_vec,Filename,Image_path
0,0,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg
1,14,elton_john,./dataset/elton_john/train_elton_john (1).jpg
2,30,jerry_seinfeld,./dataset/jerry_seinfeld/train_jerry_seinfeld ...
3,51,madonna,./dataset/madonna/train_madonna (1).jpg
4,70,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (1).jpg


In [100]:
# Rows per label in df_feature.
# NOTE(review): the recorded output is 1 for every label, which is consistent
# with df_feature having already been cut down to one row per label; the
# identical expression in the next cell shows the full counts, so what this
# displays depends on the order the cells were run in.
df_feature.groupby('Filename').agg(
original_count = ('Feature_vec','count'),
)

Unnamed: 0_level_0,original_count
Filename,Unnamed: 1_level_1
ben_afflek,1
elton_john,1
jerry_seinfeld,1
madonna,1
mindy_kaling,1


In [93]:
# Rows per label in df_feature.
# NOTE(review): the recorded output shows the full per-label counts (14-21),
# i.e. this looks like it ran while df_feature still held one row per image;
# it duplicates the aggregation stored in `d`.
df_feature.groupby('Filename').agg(
original_count = ('Feature_vec', 'count')
)

Unnamed: 0_level_0,original_count
Filename,Unnamed: 1_level_1
ben_afflek,14
elton_john,16
jerry_seinfeld,21
madonna,19
mindy_kaling,21


In [None]:
# NOTE(review): scratch cell that was never executed (`In [None]`). `df1` and
# the 'area'/'team' columns do not appear anywhere else in the visible
# notebook - this looks like a leftover iterrows template; confirm it can be deleted.
for _, row in df1.iterrows():
        area = row['area']
        team = row['team']

In [79]:
# This cell held an unfinished `for _, row in df_feature.iterrows()` header
# (no colon, no body), which is a SyntaxError and stops "Run All". The output
# recorded for the cell is the df_feature table, so the cell is restored to
# the expression that displays it.
df_feature



Unnamed: 0,Feature_vec,Filename,Image_path
0,0,ben_afflek,./dataset/ben_afflek/train_ben_afflek (1).jpg
1,14,elton_john,./dataset/elton_john/train_elton_john (1).jpg
2,30,jerry_seinfeld,./dataset/jerry_seinfeld/train_jerry_seinfeld ...
3,51,madonna,./dataset/madonna/train_madonna (1).jpg
4,70,mindy_kaling,./dataset/mindy_kaling/train_mindy_kaling (1).jpg


In [77]:
# NOTE(review): `Feature_vec` is not defined as a variable in any visible cell
# (it only appears as a df_feature column name), so this raises NameError on a
# fresh kernel. The recorded output is a single vector, so it was presumably an
# integer row index at the time - confirm, or replace with an explicit index.
features_dataset[Feature_vec]

array([ 1.21032055e-02, -3.43378410e-02,  1.62295881e-03, -1.38984304e-02,
       -1.27211399e-02,  3.46759632e-02, -1.08666141e-02, -4.34253737e-02,
        2.88301650e-02,  1.79396234e-02, -7.63890287e-03,  5.46426028e-02,
       -2.38577742e-02, -3.79724726e-02,  3.01359240e-02,  2.95096822e-02,
        2.62108278e-02, -2.16755942e-02,  1.82985980e-02, -1.69909149e-02,
       -5.61700650e-02, -2.61751693e-02, -2.58328877e-02, -4.16180864e-02,
       -1.94112882e-02, -2.73680333e-02, -8.15922245e-02, -3.12132835e-02,
       -2.80838124e-02,  3.98257300e-02, -6.79586604e-02,  2.79946569e-02,
        2.57289391e-02,  2.09670588e-02, -6.60082791e-03, -1.51867177e-02,
        1.99671425e-02, -1.41039854e-02, -1.71469245e-02,  5.98128587e-02,
        8.73909984e-03, -3.74009088e-02, -9.88477468e-02, -3.93543206e-02,
       -4.58037257e-02,  4.77461368e-02,  4.05100174e-02, -5.80933876e-02,
        7.65876547e-02,  1.69080403e-03,  2.17709802e-02, -1.79598760e-02,
       -4.84184325e-02,  