In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def get_x(folder_name, number_images, threshold=140, image_size=784):
    """Load numbered JPEG images from a folder and binarize their pixels.

    Reads ``<folder_name>/1.jpg`` through ``<folder_name>/<number_images>.jpg``
    with ``plt.imread``, flattens each image to one row of ``image_size``
    values and thresholds every pixel.

    Args:
        folder_name: Directory that contains the numbered ``.jpg`` files.
        number_images: Number of images to read, starting from ``1.jpg``.
        threshold: Pixels ``>= threshold`` become 1, all others become 0.
        image_size: Number of values per image (the default 784 is 28 * 28).

    Returns:
        Float array of shape ``(number_images, image_size)`` holding only
        0.0 and 1.0.

    Raises:
        ValueError: If an image does not hold exactly ``image_size`` values
            (raised by ``reshape``). Errors from ``plt.imread`` propagate
            unchanged.
    """
    # Collect the rows in a list and build the array once: calling np.append
    # inside the loop copies the whole accumulated array for every image.
    rows = [
        plt.imread('{}/{}.jpg'.format(folder_name, i)).reshape(image_size)
        for i in range(1, number_images + 1)  # +1 since range() is exclusive
    ]
    # reshape(-1, image_size) keeps the (0, image_size) shape when no image is read.
    pixels = np.array(rows, dtype=float).reshape(-1, image_size)
    # Convert to binary in one vectorized step.
    return (pixels >= threshold).astype(float)

In [None]:
def get_t(labels_path):
    """Read integer labels from a text file that stores one label per line.

    Blank (whitespace-only) lines are skipped, so an extra trailing newline
    in the file no longer raises ``ValueError`` from ``int('')``.

    Args:
        labels_path: Path of the text file to read.

    Returns:
        1-D integer numpy array with one entry per non-blank line, in file
        order. An empty file yields an empty integer array.

    Raises:
        ValueError: If a non-blank line cannot be parsed by ``int``.
    """
    with open(labels_path) as f:
        training_labels = [
            int(line) for line in f.read().splitlines() if line.strip()
        ]
    # Explicit dtype keeps the result integer-typed even when the list is empty.
    return np.array(training_labels, dtype=int)

In [None]:
# The dataset is read in a cell of its own so that the cells below can be
# re-run without loading the image files and the label file again.
x_input = get_x(folder_name='Images', number_images=2400)
labels = get_t(labels_path='Images/Training Labels.txt')

In [None]:
def initialize_center_indices(images, k_number):
    """Pick ``k_number`` row indices of ``images`` to use as initial centers.

    The first index is drawn with ``np.random.randint``. Every following
    index is the not-yet-chosen row with the largest squared Euclidean
    distance to the previously chosen center.

    Only the ``images`` argument is read (not a notebook-level variable), and
    only the slots that were already filled count as "chosen", so row 0 is a
    valid candidate like any other row.

    Args:
        images: 2-D array-like with one sample per row.
        k_number: Number of center indices to return.

    Returns:
        Integer numpy array of length ``k_number`` with row indices into
        ``images``. When no unchosen row lies at a positive distance from the
        previous center, the previous center index is repeated.

    Raises:
        ValueError: From ``np.random.randint`` when ``images`` has no rows
            and ``k_number`` is at least 1.
    """
    # Work in float so differences of unsigned pixel values cannot wrap around.
    data = np.asarray(images, dtype=float)
    number_of_images = data.shape[0]
    centers_indices = np.zeros(k_number, dtype=int)
    if k_number == 0:
        return centers_indices
    centers_indices[0] = np.random.randint(0, number_of_images)

    for k in range(1, k_number):
        previous_index = centers_indices[k - 1]
        offsets = data - data[previous_index]
        # Squared distance of every row to the previous center, in one pass.
        distances = (offsets * offsets).sum(axis=1)
        # Exclude only the centers picked so far; the zero-filled unfilled
        # slots of centers_indices must not block row 0.
        distances[centers_indices[:k]] = -1.0
        # argmax returns the first maximum, matching a strict ">" scan.
        candidate = int(np.argmax(distances))
        if distances[candidate] > 0:
            centers_indices[k] = candidate
        else:
            centers_indices[k] = previous_index
    return centers_indices

In [None]:
def get_k_means_clusters(x_input, k_number=10):
    """Run k-means on the rows of ``x_input`` until the centroids stop changing.

    Initial centroids are the rows selected by ``initialize_center_indices``.
    Each pass assigns every row to its nearest centroid (squared Euclidean
    distance, ties go to the lowest cluster index) and then replaces each
    centroid by the mean of its assigned rows.

    A cluster that receives no rows keeps its previous centroid. Taking the
    mean of zero rows would give NaN, and because NaN never compares equal
    the convergence test below could then never succeed.

    Args:
        x_input: 2-D array with one sample per row.
        k_number: Number of clusters (defaults to 10).

    Returns:
        Tuple ``(iterations, centroids)``: the number of passes executed and
        a float array with one centroid per row.
    """
    points = np.asarray(x_input, dtype=float)
    centroids = points[initialize_center_indices(points, k_number)]
    previous_centroids = np.zeros_like(centroids)

    # Keep looping till 2 consecutive runs yield the same centroids
    iterations = 0
    while not (centroids == previous_centroids).all():
        iterations += 1
        previous_centroids = centroids

        # One column of squared distances per centroid; looping over the
        # centroids keeps the temporary arrays at the size of the input.
        distances = np.stack(
            [((points - c) ** 2).sum(axis=1) for c in centroids], axis=1
        )
        # Entry i is the index of the cluster that row i belongs to.
        images_clusters = np.argmin(distances, axis=1)

        # Update centroids: mean of each non-empty cluster; empty clusters
        # keep the value copied from the previous pass.
        centroids = previous_centroids.copy()
        for k in range(k_number):
            members = points[images_clusters == k]
            if len(members):
                centroids[k] = members.mean(axis=0)

    return iterations, centroids

In [None]:
'''
TODO: 
Somehow we need to do the following for all 2400 images:
1- diff from the 10 centroids
2- get argmin(diffs)
3- compare indices from (2) with true labels
4- plot a histogram of counts clustered
'''
# Cluster the loaded images; `i` receives the number of passes that were
# needed and `centroids` the final cluster centers used by the plot below.
i, centroids = get_k_means_clusters(x_input=x_input)
print('Took {} runs to converge'.format(i))

In [None]:
# Draw the 10 centroids as 28x28 images on a 3-row by 4-column grid.
fig = plt.figure(figsize=(8, 8))

for i in range(10):
    fig.add_subplot(3, 4, i + 1)  # subplot positions are 1-based
    # Each centroid is a flat vector of 784 values; view it as 28 rows of 28.
    plt.imshow(centroids[i].reshape(28, 28))
plt.show()