# In [1]:
from sklearn.preprocessing import MinMaxScaler
import random
import math

def euclidean_dist(x1,y1,r1,g1,b1,x2,y2,r2,g2,b2) :
    """Return the Euclidean distance between two 5-component points.

    The first point is (x1, y1, r1, g1, b1) and the second is
    (x2, y2, r2, g2, b2); the result is the square root of the sum of
    the squared per-component differences.
    """
    # Per-component differences between the two points.
    d_x = x1 - x2
    d_y = y1 - y2
    d_r = r1 - r2
    d_g = g1 - g2
    d_b = b1 - b2

    # Sum of squares, accumulated in the same component order x, y, r, g, b.
    squared_total = d_x**2 + d_y**2 + d_r**2 + d_g**2 + d_b**2
    return math.sqrt(squared_total)

# This function pre-processes the image data (an array of RGB pixel
# values indexed as [row][col][channel]) into one datapoint per pixel:

# datapoint = [row_position, col_position, R_value, G_value, B_value]

# The datapoints are scaled column-by-column with MinMaxScaler and
# returned as a list of lists, in row-major pixel order.
def pre_processing(np_img):
    """Turn an RGB image into a list of scaled 5-value datapoints.

    Parameters
    ----------
    np_img : array-like indexed as [row][col][channel]
        Image whose first three channels are read as R, G and B.

    Returns
    -------
    list of list of float
        One ``[row, col, R, G, B]`` entry per pixel, in row-major order,
        after ``MinMaxScaler().fit_transform`` has been applied to the
        whole table.
    """
    np_img = np.asarray(np_img)
    n_rows, n_cols = np_img.shape[:2]

    # Row and column index of every pixel, laid out like the image itself.
    row_idx, col_idx = np.indices((n_rows, n_cols))

    # One (R, G, B) triple per pixel. Channel 2 is blue -- the blue value
    # must not be taken from the green channel (index 1).
    rgb = np_img[:, :, :3].reshape(-1, 3)

    # Assemble the [row, col, R, G, B] table; ravel()/reshape() both walk
    # the image in row-major order, so the columns stay aligned.
    data = np.column_stack((row_idx.ravel(), col_idx.ravel(), rgb))

    # Use the MinMaxScaler to scale the data.
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data)

    # Convert back to a plain list for the caller.
    return scaled_data.tolist()
            
# Helper for kmeans: index of the centroid closest to a datapoint.
def _closest_centroid_index(datapoint, centroids):
    """Return the index of the centroid nearest to ``datapoint``.

    Squared Euclidean distance over all components is used; the square
    root is skipped because it does not change which centroid is nearest.
    When several centroids are equally near, the lowest index wins.
    """
    best_index = None
    best_sq_dist = None
    for index, centroid in enumerate(centroids):
        sq_dist = 0.0
        # Every component is compared with the SAME component of the
        # centroid (row/row, col/col, R/R, G/G, B/B).
        for point_value, centroid_value in zip(datapoint, centroid):
            sq_dist += (point_value - centroid_value) ** 2
        if best_sq_dist is None or sq_dist < best_sq_dist:
            best_sq_dist = sq_dist
            best_index = index
    return best_index


# We feed in a list of the datapoints of the image as our data, as well
# as the number of clusters we want.
def kmeans(n_clusters, np_data):
    """Cluster ``np_data`` with k-means and return per-datapoint labels.

    Parameters
    ----------
    n_clusters : int
        Number of centroids to start with. Must be at least 1.
    np_data : sequence of equal-length numeric sequences
        The datapoints to cluster, e.g. ``[row, col, R, G, B]`` lists.

    Returns
    -------
    list of int
        ``result[i]`` is the index of the cluster datapoint ``i`` belongs
        to. Centroids that end up with no datapoints are discarded, so
        the labels may use fewer than ``n_clusters`` distinct values.
        An empty ``np_data`` yields an empty list.

    Raises
    ------
    ValueError
        If ``n_clusters`` is smaller than 1.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    n_points = len(np_data)
    if n_points == 0:
        return []

    #----------------------------------------------
    # STEP 1: RANDOM INITIALIZATION OF CENTROIDS...
    #----------------------------------------------

    # Each centroid starts as a copy of a randomly picked datapoint. The
    # same datapoint may be picked more than once; such duplicates end up
    # with no datapoints and are discarded in step 3.
    centroids = [
        list(np_data[random.randint(0, n_points - 1)])
        for _ in range(n_clusters)
    ]
    n_features = len(centroids[0])

    # Centroid positions before the most recent move (none yet).
    prev_centroids = None
    cluster_assignments = []

    # Repeat until convergence, i.e. until the centroids stop moving.
    while centroids != prev_centroids:

        #---------------------------------------------------
        # STEP 2: ASSIGN DATAPOINTS TO CLUSTERS/CENTROIDS...
        #---------------------------------------------------

        # Per-cluster running sums of every component and per-cluster
        # datapoint counts, both indexed like `centroids`.
        sums_per_cluster = [[0.0] * n_features for _ in centroids]
        n_datapoints_per_cluster = [0] * len(centroids)
        cluster_assignments = []

        for datapoint in np_data:
            nearest = _closest_centroid_index(datapoint, centroids)
            cluster_assignments.append(nearest)
            n_datapoints_per_cluster[nearest] += 1
            cluster_sums = sums_per_cluster[nearest]
            for feature, value in enumerate(datapoint):
                cluster_sums[feature] += value

        #--------------------------
        # STEP 3: MOVE CENTROIDS...
        #--------------------------

        prev_centroids = centroids

        # Each surviving centroid moves to the mean of its datapoints.
        # Clusters with no datapoints are dropped here; sums and counts
        # are filtered together so they can never get out of step. A
        # dropped centroid changes the list, which forces another pass
        # that re-labels every datapoint against the shorter list.
        centroids = [
            [total / count for total in cluster_sums]
            for cluster_sums, count in zip(sums_per_cluster,
                                           n_datapoints_per_cluster)
            if count
        ]

    return cluster_assignments
            
from PIL import Image
import numpy as np

# Colour used to paint each cluster index in the output image
# (row i is the [R, G, B] colour of cluster i).
CLUSTER_COLOURS = np.array([
    [255, 0, 0],
    [0, 255, 0],
    [0, 0, 255],
    [255, 255, 0],
    [0, 255, 255],
    [255, 0, 255],
    [255, 255, 255],
    [127, 127, 127],
    [127, 0, 0],
    [0, 127, 0],
    [0, 0, 127],
    [127, 127, 0],
    [0, 127, 127],
], dtype='uint8')

# Open image and convert it to a numpy array. Forcing RGB guarantees
# three channels per pixel, which is what pre_processing reads.
np_img = np.array(Image.open('face.jpeg').convert('RGB'))
print(np_img)

# Pre-processing.
data = pre_processing(np_img)

# Calling KMeans.
cluster_assignments = kmeans(11, data)

# Reshape the flat list of labels back into the image's row/column grid.
cluster_assignments = np.asarray(cluster_assignments)
cluster_assignments = np.reshape(cluster_assignments, (-1, len(np_img[0])))

# Look up every pixel's colour in one indexing step. The modulo wraps
# labels beyond the end of the table instead of leaving pixels unpainted.
cluster_img = CLUSTER_COLOURS[cluster_assignments % len(CLUSTER_COLOURS)]
print(type(cluster_img))
print(cluster_img)

# The (height, width, 3) uint8 array is interpreted as an RGB image.
cluster_img = Image.fromarray(cluster_img.astype('uint8'))
cluster_img.save("results.jpeg")

# Output of the cell above when 'face.jpeg' is not in the working directory:
# FileNotFoundError: [Errno 2] No such file or directory: 'face.jpeg'