This notebook investigates different image similarity measures for finding near-duplicate images. Detecting these images can help improve the training of the aesthetics model, because (near-)duplicate images that may carry contradictory star ratings can be filtered out.

In [None]:
import itertools
import exifread
from PIL import Image
import pandas as pd
import glob
import keras
import cv2
import numpy as np
from measure_img_similarity import earth_movers_distance, structural_sim, pixel_sim

# Widen the column display so the long image file paths shown below are not cut off.
# NOTE(review): presumably -1 means "no truncation" here; newer pandas releases
# are said to deprecate -1 in favour of None - confirm against the pandas version in use.
pd.set_option('display.max_colwidth', -1)

First we need to assess all possible image combinations from a given file directory

In [None]:
# Example directory; it is passed to calculate_SSIM_for_directory further below.
directory = "/home/keras/dev/pictures_per_usecase/Ralf_Wieting_Moments/2016-07-10 Binz/moment19/"

Then we measure the image similarity using different methods (EMD, structural similarity, etc.).  
We also measure the execution time both for a single combination and for all images.  
To decrease the execution time, we can change the resolution in measure_img_similarity.py. In addition, we only compare combinations whose images were taken at most 30 seconds apart.


In [None]:
import time
from datetime import datetime
from os import walk, path
import math

def get_timestamp_difference(image_a, image_b):
    """Return the absolute difference between the EXIF timestamps of two images.

    Parameters
    ---------
    image_a, image_b: Paths of the two image files. The "Image DateTime" EXIF
        tag of each file is read with exifread.

    Returns
    ---------
    The absolute time difference in milliseconds (float), or math.inf if one
    of the images has no "Image DateTime" tag or the tag cannot be parsed.
    """
    with open(image_a, 'rb') as f:
        tags_image_1 = exifread.process_file(f, details=False)
    with open(image_b, 'rb') as f:
        tags_image_2 = exifread.process_file(f, details=False)

    if "Image DateTime" not in tags_image_1 or "Image DateTime" not in tags_image_2:
        print("No timestamp in one of the images")
        return math.inf

    # Everything after a "." (e.g. fractional seconds) is dropped before parsing.
    datetime_1 = str(tags_image_1["Image DateTime"]).split(".")[0]
    datetime_2 = str(tags_image_2["Image DateTime"]).split(".")[0]

    try:
        dt_obj_1 = datetime.strptime(datetime_1, '%Y:%m:%d %H:%M:%S')
        dt_obj_2 = datetime.strptime(datetime_2, '%Y:%m:%d %H:%M:%S')
    except ValueError:
        # A blank or malformed timestamp is as useless as a missing one, so it
        # is reported the same way instead of aborting the whole comparison run.
        print("No timestamp in one of the images")
        return math.inf

    # Subtract the naive datetimes directly. Going through .timestamp() would
    # interpret them in the machine's local timezone, so two pictures taken
    # around a DST change could appear an hour further apart (or closer).
    return abs((dt_obj_2 - dt_obj_1).total_seconds()) * 1000

def calculate_SSIM_for_directory(directory_name, allowed_seconds_between_images=30):
    """Calculate the similarity measures for all image combinations within a directory.

    Only the files directly inside directory_name are used. A combination is
    considered when get_timestamp_difference reports at most
    allowed_seconds_between_images seconds (30000 milliseconds by default)
    between its two images.

    Returns a list with one dict per considered combination (keys
    "image_path_1", "image_path_2", "struct_sim" and "pixel_sim"), or None if
    the directory contains fewer than two files.
    """
    # Take only the first entry produced by walk(), i.e. the top-level files.
    top_level_entry = next(walk(directory_name), (directory_name, [], []))
    image_filenames = [path.join(directory_name, name) for name in top_level_entry[2]]

    if len(image_filenames) <= 1:
        return None

    max_gap_in_milliseconds = allowed_seconds_between_images * 1000
    list_of_dicts = []

    # The timings below are measured but currently not reported anywhere.
    started_all = time.time()
    for first_path, second_path in itertools.combinations(image_filenames, 2):
        started_single = time.time()
        gap_in_milliseconds = get_timestamp_difference(first_path, second_path)
        if not gap_in_milliseconds <= max_gap_in_milliseconds:
            continue

        struct_sim = structural_sim(path_a=first_path, path_b=second_path)
        pixel_similarity = pixel_sim(path_a=first_path, path_b=second_path)
        time_single_total = time.time() - started_single

        list_of_dicts.append({"image_path_1" : first_path,
                              "image_path_2" : second_path,
                              "struct_sim" : struct_sim,
                              "pixel_sim" : pixel_similarity})

    time_total_passed = time.time() - started_all
    return list_of_dicts

In [None]:
# Build the similarity table for the example directory: one row per compared image pair.
dataframe = pd.DataFrame(calculate_SSIM_for_directory(directory, allowed_seconds_between_images=30))

Now we sort the combinations and display the 15 with the highest similarity (for EMD, the lowest values would indicate the most similar images).

In [None]:
# Most structurally similar pairs first.
sorted_dataframe = dataframe.sort_values(["struct_sim"], ascending=False)
# Show the first 15 rows of the sorted table.
display(sorted_dataframe[:15])

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image


def show_images(images, cols = 1, titles = None):
    """Display a list of images in a single figure with matplotlib.

    Parameters
    ---------
    images: List of image file paths. The files are opened with PIL (or with
        OpenCV for the image that gets highlighted).

    cols (Default = 1): First grid dimension handed to fig.add_subplot; the
        second one is ceil(n_images / cols).
        NOTE(review): add_subplot takes (rows, columns, index), so despite its
        name this value presumably acts as the number of rows - with the
        default of 1 all images end up side by side. Confirm before relying on it.

    titles: List of titles corresponding to each image. Must have the same
        length as images. String titles are treated as file paths and shortened
        to their file name. If the first title is not a string, the titles are
        treated as scores and the image with the highest score (of at least 0)
        is drawn with a green border. If omitted, generic titles are used.
    """
    import os

    assert (titles is None) or (len(images) == len(titles))
    n_images = len(images)
    if n_images == 0:
        # Nothing to draw; without this guard titles[0] below raises IndexError.
        return

    generic_title = titles is None
    if generic_title:
        titles = ['Image (%d)' % i for i in range(1, n_images + 1)]

    # If scores are provided as titles, remember the image with the best score.
    best_image = None
    if not isinstance(titles[0], str):
        top_score = 0
        for image, title in zip(images, titles):
            if title >= top_score:
                top_score = title
                best_image = image

    # add_subplot expects integer grid dimensions, np.ceil returns a float.
    second_grid_dimension = int(np.ceil(n_images / float(cols)))

    fig = plt.figure()
    for n, (image, title) in enumerate(zip(images, titles)):
        a = fig.add_subplot(cols, second_grid_dimension, n + 1)

        if best_image is not None and image == best_image:
            # Mark the best image with a green border.
            img = cv2.imread(image)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            # border color
            color = [58, 255, 81]

            # border widths
            top, bottom, left, right = [25] * 4

            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
        else:
            # The context manager closes the file once the pixels are copied out.
            with Image.open(image) as pil_image:
                img = np.asarray(pil_image)

        if img.ndim == 2:
            plt.gray()
        plt.imshow(img)
        if not generic_title and isinstance(title, str):
            # Show only the file name, whatever the depth of the path.
            title = os.path.basename(title)
        a.set_title(title)

    fig.set_size_inches(np.array(fig.get_size_inches()) * n_images)
    plt.show()
    plt.close()

We need to define a threshold to assess when two images are (very) similar

In [None]:
def reduce_duplicate_dataframe(sorted_dataframe, threshold = 0.15):
    """Keep only the image pairs whose structural similarity reaches threshold.

    Every kept pair is also displayed with show_images as a side effect.

    Returns a new DataFrame with the columns "image_path_1", "image_path_2",
    "struct_sim" and "pixel_sim", holding the kept pairs in the order in which
    they appear in sorted_dataframe, indexed from 0.
    """
    column_names = ["image_path_1", "image_path_2", "struct_sim", "pixel_sim"]
    duplicates_dataframe = pd.DataFrame(columns=column_names)

    for _, pair in sorted_dataframe.iterrows():
        if not pair["struct_sim"] >= threshold:
            continue
        show_images([pair["image_path_1"], pair["image_path_2"]])
        # The next free integer label equals the number of rows stored so far.
        next_label = len(duplicates_dataframe)
        duplicates_dataframe.loc[next_label] = [pair[name] for name in column_names]

    return duplicates_dataframe

In [None]:
# Keep the pairs with a structural similarity of at least 0.15; the call also plots every kept pair.
duplicates_dataframe = reduce_duplicate_dataframe(sorted_dataframe, threshold=0.15)
display(duplicates_dataframe)

We can see that duplicates are found effectively but some pictures are presented multiple times.  
We need to create "duplicate clusters" that represent batches of similar images rather than just have tuples of images.



In [None]:
class DuplicateCluster:
    """A cluster of near-duplicate images built around one index image."""

    def __init__(self, index_image_path):
        # Path of the image the cluster is anchored on.
        self.index_image_path = index_image_path
        # Paths of the images regarded as similar to the index image.
        self.similar_images = list()

    def appendImagePath(self, imagePath):
        """Add imagePath to the similar images of this cluster."""
        self.similar_images.append(imagePath)

In [None]:
def check_if_picture_in_other_cluster(list_of_clusters, image_filepath):
    """Return True if image_filepath is the index image or one of the similar
    images of any cluster in list_of_clusters, otherwise False."""
    return any(
        image_filepath == cluster.index_image_path
        or image_filepath in cluster.similar_images
        for cluster in list_of_clusters
    )

def create_image_clusters(duplicates_dataframe):
    """Group the image pairs of duplicates_dataframe into DuplicateCluster objects.

    The rows are processed in order. For every pair (image_path_1, image_path_2):
    if one of the two images is the index image of an existing cluster, the
    other image is added to that cluster unless it already belongs to some
    cluster. If image_path_1 does not belong to any cluster afterwards, a new
    cluster is created around it (including image_path_2, unless that image
    already belongs to a cluster).

    Returns the list of clusters; an image is never placed in two clusters.
    """
    cluster_list = []
    for _, pair in duplicates_dataframe.iterrows():
        first_path = pair["image_path_1"]
        second_path = pair["image_path_2"]

        # Try to attach the pair's partner to a cluster indexed by one of the two images.
        for cluster in cluster_list:
            if cluster.index_image_path == first_path:
                partner = second_path
            elif cluster.index_image_path == second_path:
                partner = first_path
            else:
                continue
            if not check_if_picture_in_other_cluster(cluster_list, partner):
                cluster.appendImagePath(partner)

        # The first image is still unassigned: open a new cluster for it.
        if not check_if_picture_in_other_cluster(cluster_list, first_path):
            new_cluster = DuplicateCluster(first_path)
            if not check_if_picture_in_other_cluster(cluster_list, second_path):
                new_cluster.appendImagePath(second_path)
            cluster_list.append(new_cluster)

    return cluster_list

In [None]:
# Build the clusters and show each one as a figure, titled with the image paths.
cluster_list = create_image_clusters(duplicates_dataframe)
# i counts the clusters but is not read anywhere in this cell.
i = 1
for e in cluster_list:
    #print(e.__dict__)
    # Index image first, followed by its similar images.
    image_list = [e.index_image_path, *(e.similar_images)]
    show_images(image_list, titles=image_list)
    i += 1

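After making sure that no image is allocated to multiple clusters, we now need to deal with the clusters that contain only a single image: each of these images is moved to the best-matching other cluster (if one is similar enough) and the single-image cluster is removed.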

In [None]:
def add_image_to_existing_cluster(cluster_list, filepath, min_ssim=0.2):
    """Add filepath to the cluster whose index image it resembles most.

    The structural similarity between filepath and the index image of every
    cluster is computed, skipping clusters whose index image is filepath
    itself. The image is appended to the best-matching cluster only if that
    similarity is at least min_ssim; otherwise nothing is changed.

    Parameters
    ---------
    cluster_list: List of cluster objects to choose from.
    filepath: Path of the image to allocate.
    min_ssim (Default = 0.2): Minimum structural similarity required for the
        image to be added to a cluster.
    """
    reference_cluster = None
    ssim = 0.0
    for cluster in cluster_list:
        # Skip the cluster that this image belongs to
        if cluster.index_image_path == filepath:
            continue
        # Calculate structural similarity for each cluster-index-image to find the one that matches the best
        temp_ssim = structural_sim(cluster.index_image_path, filepath)
        if temp_ssim >= ssim:
            ssim = temp_ssim
            reference_cluster = cluster
    # reference_cluster stays None when there was no candidate cluster at all.
    if reference_cluster is not None and ssim >= min_ssim:
        reference_cluster.appendImagePath(filepath)

def reallocate_single_images(cluster_list, min_ssim=0.2):
    """Dissolve all clusters that consist of an index image only.

    The index image of every such cluster is offered to the other clusters
    (see add_image_to_existing_cluster) and the single-image cluster is then
    removed, whether or not a matching cluster was found.

    cluster_list is modified in place and also returned.
    """
    # Iterate over a snapshot: removing entries from the list that is being
    # iterated would skip the cluster following each removed one.
    for cluster in list(cluster_list):
        # Find images with no corresponding similar images
        if len(cluster.similar_images) == 0:
            # Add them to another cluster and delete the now redundant cluster-object
            add_image_to_existing_cluster(cluster_list, cluster.index_image_path, min_ssim)
            cluster_list.remove(cluster)
    return cluster_list

def print_image_clusters(cluster_list):
    """Show every cluster as one figure, titling each image with its model score.

    For each cluster the index image and its similar images are scored with
    get_model_score and then handed to show_images together with the scores.
    """
    for cluster in cluster_list:
        image_list = [cluster.index_image_path] + list(cluster.similar_images)
        score_list = [get_model_score(image_path) for image_path in image_list]
        print("\n")
        show_images(image_list, titles=score_list)

In [None]:
# NOTE: print_image_clusters calls get_model_score, which (like cewe_model) is only
# defined in the "Calculate aesthetic scores" cells further down - run those first.
print_image_clusters(reallocate_single_images(cluster_list))

### Calculate aesthetic scores

In [None]:
from keras.models import load_model
from keras.preprocessing.image import load_img, img_to_array
from keras_applications.mobilenet import relu6, preprocess_input

# Load the trained scoring model; relu6 is passed as a custom object so that
# Keras can resolve it while deserialising the .h5 file.
cewe_model = load_model("/home/keras/dev/trained_models/cewe_binary_scoring/cewe_binary_scoring_ava_pretrained.h5", 
                        custom_objects={"relu6" : relu6})


In [None]:
def get_model_score(image_filename):
    '''Predict the aesthetics score of an image with the deep learning model.

    The score is used to decide which image to "keep" from a duplicate cluster.
    The image is loaded at 224x224, turned into a batch of one, preprocessed
    and passed to cewe_model; the first value of the first prediction is returned.
    '''
    pixels = img_to_array(load_img(image_filename, target_size=(224, 224)))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    prediction = cewe_model.predict(batch)
    return prediction[0][0]

### Test - Deep Learning approach

In [None]:
from skimage import io, transform
import numpy as np
import itertools
import matplotlib.pyplot as plt
import double_channel_network.inception_double_channel_color as inception_double_channel_color

import os

# image_filenames is only a local variable inside calculate_SSIM_for_directory,
# so it has to be built here: the files directly inside `directory`, sorted so
# that the image indices used in the cells below are reproducible.
top_level_files = next(os.walk(directory), (directory, [], []))[2]
image_filenames = sorted(os.path.join(directory, name) for name in top_level_files)

# Load every image and scale it to 96x96, the size the network input is reshaped to below.
images = []
for f in image_filenames:
    img = io.imread(f)
    img = transform.resize(img, (96, 96))
    images.append(img)

In [None]:
# Turn the list of resized images into a single array.
# NOTE(review): assumes every image has the same number of channels - confirm.
images = np.array(images)

In [None]:
# Build the double-channel network and load its weights from the .h5 file.
model = inception_double_channel_color.create_model()
model.load_weights('double_channel_network/double_channel_color_inception_95.h5')

In [None]:
import time
# Single test: time one prediction for the image pair (5, 2).
time_before = time.time()
# The two images are stacked along the last axis and reshaped to a batch of one 96x96x6 input.
result = model.predict(np.dstack((images[5], images[2])).reshape((1,96,96,6)))
time_after = time.time()

In [None]:
# Duration of the single prediction in seconds.
print(time_after-time_before)

In [None]:
# First output value of the single prediction.
# NOTE(review): presumably the "duplicate" score - confirm against double_channel_network.
print(result[0][0])

In [None]:
# calculated_predictions and single_entry are only used by the commented-out code below.
calculated_predictions = []
single_entry = {}

threshold = 0.999999999
i = 0
# Run the network on every pair of images and plot the pairs it is (almost) certain about.
for combi in itertools.combinations(images, 2):
    prediction = model.predict(np.dstack((combi[0], combi[1])).reshape((1,96,96,6)))
    # Only pairs whose first output reaches the threshold while the second stays below 1 - threshold are shown.
    # NOTE(review): presumably output 0 is the "duplicate" class - confirm against the network definition.
    if prediction[0][0] >= threshold and prediction[0][1] < 1-threshold:
        plt.imshow(combi[0])
        plt.show()
        plt.imshow(combi[1])
        plt.show()
        
#         single_entry = {"image_1" : i, "image_2" : i+1, "prediction" : prediction[0][0]} 
#         calculated_predictions.append(single_entry.copy())
#         i += 1

#print(len(calculated_predictions))

**Next step:**  
Now that we have a good solution for detecting near-duplicate images and sorted them into clusters we can use the information to improve the training of our deep learning aesthetics model.  
For each duplicate cluster one image is taken with the best star rating. This image is then used for training (in combination with all the other non duplicate images).  

**Repeat for all "moments":**  
* 1.: Calculate near-duplicate cluster for given images
* 2.: Create a list where each cluster is on the same level (as opposed to a single index-image and several similar images in a list)
* 3.: Get the best image for each cluster with the list created in 2.
* 4.: Determine the non-duplicate images by subtracting the images in the list created in 2. from all images in the moment.
* 5.: Get a dictionary with the filepath and the star-rating for the non-duplicate images
* 6.: Combine the result of step 3. with the images from step 5. to get a final dictionary for each moment. Save this dataframe to a .csv file

The last step is to iterate over every .csv file created in 6 and combine them into a single dataframe that can be used for training.

In [None]:
# 5.
def get_rating_for_non_duplicates(non_duplicates):
    """Collect the star rating of every non-duplicate image.

    Parameters
    ---------
    non_duplicates: Iterable of image file paths.

    Returns
    ---------
    A DataFrame with one row per image that has an "Image Rating" EXIF tag
    (columns "filepath" and "star_rating"). Images without a rating are
    reported on stdout and left out; if no image has a rating the DataFrame
    is empty.
    """
    list_of_dicts = []

    for image in non_duplicates:
        # The context manager closes the handle again; one open handle per
        # image would otherwise pile up over a whole directory.
        with open(image, 'rb') as file:
            tags = exifread.process_file(file)

        if "Image Rating" not in tags:
            print("No rating for image!")
            continue

        list_of_dicts.append({"filepath" : image, "star_rating" : tags["Image Rating"]})

    non_duplicate_dataframe = pd.DataFrame(list_of_dicts)
    return non_duplicate_dataframe

# 3.
def get_best_images_from_clusters(final_cluster_list):
    """Pick the image with the best star rating from every cluster.

    Parameters
    ---------
    final_cluster_list: List of clusters, each providing index_image_path and
        similar_images.

    Returns
    ---------
    A list with one dict per cluster: {"filepath": path of the best image,
    "star_rating": its rating as int}. Images without an "Image Rating" EXIF
    tag are ignored; if no image of a cluster has a rating above 0, the index
    image is returned with a rating of 0. On ties the earlier image wins
    (the index image comes first).
    """
    best_images = []
    for c in final_cluster_list:
        # The best image has to be tracked across the whole cluster, so the
        # running maximum is initialised once per cluster, not once per image.
        best_image_of_cluster = c.index_image_path
        best_rating = 0

        for image in [c.index_image_path, *(c.similar_images)]:
            with open(image, 'rb') as file:
                tags = exifread.process_file(file)
            if "Image Rating" not in tags:
                continue

            # Convert the tag to int once so ratings are compared as numbers.
            rating = int(str(tags["Image Rating"]))
            if rating > best_rating:
                best_rating = rating
                best_image_of_cluster = image

        best_images.append({"filepath" : best_image_of_cluster, "star_rating" : best_rating})
    return best_images

# 2.
def create_same_level_list(final_cluster_list):
    """Flatten the clusters into one list of image paths.

    For every cluster its index image comes first, directly followed by its
    similar images; the clusters keep their order.
    """
    flat_image_paths = []
    for cluster in final_cluster_list:
        flat_image_paths += [cluster.index_image_path, *cluster.similar_images]
    return flat_image_paths

# 1.
def calculate_clusters_for_all_moments(directory):
    """Calculate the near-duplicate clusters for the images of one directory.

    Despite its name the function handles a single directory per call. The
    similarity pairs are computed, sorted by structural similarity, reduced to
    the pairs with a similarity of at least 0.15 (which also displays them, see
    reduce_duplicate_dataframe), clustered, and finally the single-image
    clusters are reallocated.

    Returns the final cluster list, or None if no image pair was compared.
    """
    list_of_dicts = calculate_SSIM_for_directory(directory)
    if list_of_dicts is None or len(list_of_dicts) == 0:
        return None

    pairs_by_similarity = pd.DataFrame(list_of_dicts).sort_values(["struct_sim"], ascending=False)
    likely_duplicates = reduce_duplicate_dataframe(sorted_dataframe=pairs_by_similarity, threshold=0.15)
    return reallocate_single_images(create_image_clusters(likely_duplicates))

In [None]:
# Generate test output: run the complete clustering pipeline on a single moment.

list_of_dicts = calculate_SSIM_for_directory("/home/keras/dev/pictures_per_usecase/Ralf_Wieting_Moments/2018-04-01 Ostern/moment1", allowed_seconds_between_images=30)

dataframe = pd.DataFrame(list_of_dicts)
sorted_dataframe = dataframe.sort_values(["struct_sim"], ascending=False)

# Note the lower threshold (0.05) compared to the 0.15 used elsewhere in this notebook.
reduced_dataframe = reduce_duplicate_dataframe(sorted_dataframe=sorted_dataframe, threshold=0.05)
cluster_list = create_image_clusters(reduced_dataframe)
final_cluster_list = reallocate_single_images(cluster_list)

print_image_clusters(final_cluster_list)

In [None]:
parent_directory = "/home/keras/dev/pictures_per_usecase/Ralf_Wieting_Moments/*/*"

# For every moment we calculate the duplicate clusters, get the best image out of each cluster and
# define the non-duplicate images. A combination of the latter two is used to create a dataframe
# which is saved to a .csv file.

i = 0
sub_directory_list = glob.glob(parent_directory)
for sub_directory in sub_directory_list:
    print(i)
    final_cluster_list = calculate_clusters_for_all_moments(sub_directory)
    if final_cluster_list is not None:
        image_cluster_as_list = create_same_level_list(final_cluster_list)
        best_images = get_best_images_from_clusters(final_cluster_list)
        best_images_dataframe = pd.DataFrame(best_images)
    else:
        image_cluster_as_list = []
        # No clusters for this moment: start from an empty frame. Otherwise the
        # best images of the previous moment would be written into this
        # moment's file (or the name would be undefined for the first moment).
        best_images_dataframe = pd.DataFrame()

    # Collect the files directly inside the moment directory.
    f = []
    for (dirpath, dirnames, filenames) in walk(sub_directory):
        f.extend(filenames)
        break

    f = [path.join(sub_directory, x) for x in f]

    all_filenames = f
    # Everything that is not part of a duplicate cluster counts as non-duplicate.
    non_duplicates = list(set(all_filenames) - set(image_cluster_as_list))
    non_duplicate_dataframe = get_rating_for_non_duplicates(non_duplicates)

    frames = [best_images_dataframe, non_duplicate_dataframe]
    result = pd.concat(frames)
    result.reset_index(inplace=True, drop=True)
    result.to_csv("duplicate_free_images_" + str(i) + ".csv", decimal=".", sep=";", index=False)

    i += 1

The **last step** is to create one Dataframe that contains every image filepath and its star-rating.

In [None]:
# Only pick up the per-moment files ("duplicate_free_images_<number>.csv").
# A plain "*.csv" would also match the combined output file written in the last
# cell, so re-running the notebook would read the combined data back in.
csv_files = sorted(glob.glob("duplicate_free_images_[0-9]*.csv"))

In [None]:
# Read every per-moment file and combine them with a single concat;
# DataFrame.append copies all rows on every call and no longer exists in
# recent pandas releases.
per_moment_frames = []
for file in csv_files:
    try:
        df = pd.read_csv(file, sep=";", decimal=".")
    except pd.errors.EmptyDataError:
        # Moments without any rated image produce an empty file; skip them.
        continue
    per_moment_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
if per_moment_frames:
    overall_data = pd.concat(per_moment_frames, ignore_index=True)
else:
    overall_data = pd.DataFrame()

display(overall_data)

In [None]:
# Write the combined table to one file, using the same separator and decimal settings as the per-moment files.
overall_data.to_csv("duplicate_free_images_with_rating.csv", sep=";", decimal=".", index=False)