# Image Data Association

In [1]:
from math import sqrt, pi, ceil
from PIL import Image

import numpy as np
import os

In [2]:
# Folders holding the two image supersets, relative to the working directory.
# Each path ends with "/" so it can be prefixed directly onto a file name.
superset1_dir, superset2_dir = "images/superset_1/", "images/superset_2/"

In [3]:
# create color histogram descriptor
def color_histogram(directory, bin_count=8):
    """Build a per-channel colour histogram for every file in ``directory``.

    Each image is described by ``bin_count * 3`` counters: the first
    ``bin_count`` entries count red values, the next ``bin_count`` green
    values and the last ``bin_count`` blue values. A channel value ``v``
    lands in bin ``int(v / (256 / bin_count))``.

    Args:
        directory: Folder to scan. Every entry is opened as an image, so a
            non-image file in the folder makes ``Image.open`` raise.
        bin_count: Number of bins per colour channel.

    Returns:
        A list with one ``[index, counts]`` pair per directory entry, where
        ``index`` is the entry's position in ``os.listdir(directory)`` and
        ``counts`` is a list of ``bin_count * 3`` Python ints.
    """
    # List the directory once so the index <-> file mapping cannot drift
    # between sizing the result and filling it.
    fnames = os.listdir(directory)
    bin_width = 256 / bin_count
    hist = []

    for idx, fname in enumerate(fnames):
        # The context manager releases the file handle promptly, and
        # convert("RGB") sends greyscale, palette and RGBA images through the
        # same three-channel code path instead of failing on shape unpacking.
        with Image.open(os.path.join(directory, fname)) as img:
            img_array = np.asarray(img.convert("RGB"))

        # Bin every pixel at once; astype truncates exactly like int() did in
        # the per-pixel version.
        bins = (img_array.reshape(-1, 3) / bin_width).astype(np.int64)

        counts = []
        for channel in range(3):
            # minlength keeps empty trailing bins so every channel contributes
            # exactly bin_count counters.
            channel_counts = np.bincount(bins[:, channel], minlength=bin_count)
            counts.extend(channel_counts.tolist())

        hist.append([idx, counts])

    print(f"Successfuly generated histograms from \"{directory}\" directory.")
    return hist

In [4]:
# Describe every image in both supersets with 4 bins per colour channel
# (12 counters per image).
superset1_hist = color_histogram(directory=superset1_dir, bin_count=4)
superset2_hist = color_histogram(directory=superset2_dir, bin_count=4)

Successfuly generated histograms from "images/superset_1/" directory.
Successfuly generated histograms from "images/superset_2/" directory.


In [5]:
# calculate euclidean distance between two vectors
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    Iteration is driven by ``x1``: ``x2`` must be at least as long, and any
    extra trailing elements of ``x2`` are ignored.
    """
    squared_sum = 0.0
    for position, left in enumerate(x1):
        squared_sum += (left - x2[position]) ** 2
    return sqrt(squared_sum)

In [6]:
# gaussian value
def gaussian(distance, bandwidth):
    """Evaluate a zero-mean Gaussian density at ``distance``.

    ``bandwidth`` plays the role of the standard deviation. The result is a
    NumPy float because the exponential is computed with ``np.exp``.
    """
    normaliser = 1 / (bandwidth * sqrt(2 * pi))
    scaled = distance / bandwidth
    return normaliser * np.exp(-0.5 * scaled ** 2)

In [7]:
# mean shift algorithm using gaussian kernel
# reference
# https://stats.stackexchange.com/questions/61743/understanding-the-mean-shift-algorithm-with-gaussian-kernel
def mean_shift(points, center, kernel_bw):
    """Return the Gaussian-weighted mean of ``points`` around ``center``.

    Args:
        points: Non-empty sequence of items whose element ``[1]`` is the
            feature vector (here: ``[index, histogram]`` pairs).
        center: Current window center, a vector of the same length as the
            feature vectors.
        kernel_bw: Bandwidth passed to ``gaussian``.

    Returns:
        A list with one entry per feature dimension, each the weighted
        average of that dimension over ``points`` rounded with ``round``.
    """
    # A point's kernel weight depends only on its distance to the center, not
    # on the dimension being averaged, so compute the weights (and their sum)
    # once instead of once per dimension.
    weights = [
        gaussian(euclidean_distance(point[1], center), kernel_bw)
        for point in points
    ]

    denominator = float(0)
    for weight in weights:
        denominator += weight

    # create empty list of mean
    mean = [0 for _ in range(len(points[0][1]))]

    for i in range(len(mean)):
        numerator = float(0)
        for weight, point in zip(weights, points):
            numerator += (weight * point[1][i])

        mean[i] = round(numerator/denominator)

    return mean

In [8]:
# generate clusters from the histograms, kernel bandwidth and max distance
def generate_clusters(hists, max_distance, kernel_bw):
    """Group histograms by the mode that mean shift converges to.

    Starting from each histogram, the window center is repeatedly replaced by
    the Gaussian-weighted mean of all histograms closer than ``max_distance``
    until the (rounded) mean equals the current center. Histograms that end
    on the same mean share a cluster.

    Note: convergence is an exact equality test and there is no iteration
    cap; the loop runs until the mean stops changing.

    Args:
        hists: List of ``[index, histogram]`` pairs.
        max_distance: Radius of the window; only histograms strictly closer
            than this to the center contribute to the mean.
        kernel_bw: Gaussian kernel bandwidth forwarded to ``mean_shift``.

    Returns:
        A list of clusters, each a list of the ``index`` values of its
        members, in order of first appearance.
    """
    # create empty lists to store the clusters and mean value
    final_clusters = []
    mean_of_clusters = []

    # loop through each histograms
    for hist in hists:
        center = hist[1]

        while True:
            points = [
                pt for pt in hists
                if euclidean_distance(center, pt[1]) < max_distance
            ]

            if points:
                mean = mean_shift(points, center, kernel_bw)
            else:
                # Nothing inside the window: there is no shift to apply, so
                # the current center is already the mode. Without this branch
                # `mean` would be unbound (first histogram) or left over from
                # a previous histogram.
                mean = list(center)

            if mean == center:
                break
            center = mean

        # File the histogram under the mode it converged to.
        if mean in mean_of_clusters:
            idx = mean_of_clusters.index(mean)
            final_clusters[idx].append(hist[0])
        else:
            mean_of_clusters.append(mean)
            final_clusters.append([hist[0]])

    return final_clusters

In [9]:
# create image clusters from mean shift
def create_image_clusters(directory, final_clusters, current_superset):
    """Save one montage image per cluster under ``output/problem1/``.

    The images of a cluster are laid out left-to-right, top-to-bottom on a
    black square grid with ``ceil(sqrt(len(cluster)))`` tiles per side. The
    tile size is taken from the first entry of ``os.listdir(directory)``.
    Each montage is written as
    ``cluster_<n>_<current_superset>.png`` with ``n`` counting from 1.

    Args:
        directory: Folder containing the source images.
        final_clusters: List of clusters; each cluster is a list of indices
            into ``os.listdir(directory)``.
        current_superset: Label used in the output file name and in the
            progress message.
    """
    if not final_clusters:
        return

    output_dir = "output/problem1/"
    # Saving fails if the target folder is missing, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    # The listing and the reference tile size do not depend on the cluster,
    # so read them once.
    fnames = os.listdir(directory)
    with Image.open(os.path.join(directory, fnames[0])) as first_img:
        # PIL reports size as (width, height) == (columns, rows).
        tile_cols, tile_rows = first_img.size

    # count number of clusters, starting at 1 for the file names
    for count, cluster in enumerate(final_clusters, start=1):
        # number of tiles per side of the square grid
        scale = ceil(sqrt(len(cluster)))

        # black canvas large enough for scale x scale tiles
        out_img = np.zeros((tile_rows * scale, tile_cols * scale, 3), dtype=np.uint8)

        for position, file_idx in enumerate(cluster):
            # Fill the grid row by row: a new row starts every `scale` tiles.
            grid_row, grid_col = divmod(position, scale)
            top = grid_row * tile_rows
            left = grid_col * tile_cols

            # convert("RGB") guarantees three channels to match the canvas.
            with Image.open(os.path.join(directory, fnames[file_idx])) as img:
                tile = np.asarray(img.convert("RGB"))

            rows, cols, _ = tile.shape
            out_img[top:top + rows, left:left + cols] = tile

        # Save clustered image output
        out_img_fname = "cluster_" + str(count) + "_" + current_superset + ".png"
        Image.fromarray(out_img).save(os.path.join(output_dir, out_img_fname))

        # Progress is reported once per saved cluster image.
        print(f"Successfuly generated clusters from \"{current_superset}\" directory.")

In [10]:
if __name__ == "__main__":
    # Mean-shift tuning, both measured in histogram-distance units:
    # the Gaussian kernel bandwidth and the neighbourhood radius.
    kernel_bw = 27500.0
    max_distance = 49500.0

    # Superset 1: cluster the histograms, then write one montage per cluster.
    current_superset = "ss1"
    superset1_clusters = generate_clusters(
        hists=superset1_hist, max_distance=max_distance, kernel_bw=kernel_bw
    )
    create_image_clusters(
        directory=superset1_dir,
        final_clusters=superset1_clusters,
        current_superset=current_superset,
    )

    # Superset 2: same pipeline with the same parameters.
    current_superset = "ss2"
    superset2_clusters = generate_clusters(
        hists=superset2_hist, max_distance=max_distance, kernel_bw=kernel_bw
    )
    create_image_clusters(
        directory=superset2_dir,
        final_clusters=superset2_clusters,
        current_superset=current_superset,
    )

Successfuly generated clusters from "ss1" directory.
Successfuly generated clusters from "ss1" directory.
Successfuly generated clusters from "ss1" directory.
Successfuly generated clusters from "ss2" directory.
Successfuly generated clusters from "ss2" directory.
