In [1]:
from tensorflow.examples.tutorials.mnist import input_data
from scipy.spatial import distance
import numpy as np
from datetime import datetime
import math

  return f(*args, **kwds)


In [2]:
mnist = input_data.read_data_sets("MNIST_data/")

training_images = mnist.train.images
training_labels = mnist.train.labels
test_images = mnist.test.images
test_labels = mnist.test.labels
images = np.concatenate((training_images, test_images), axis = 0)
labels = np.concatenate((training_labels, test_labels), axis = 0)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
def get_distance(a,b):
    return distance.euclidean(a,b)

In [4]:
def make_clusters(images, centroids, k):
    number_of_images = images.shape[0]
    clusters = [[] for i in range(0, k)]

    for i in range(0, number_of_images - 1):
        distances = []
        for j in range(0, k):
            distance = get_distance(images[i], centroids[j])
            distances.append((distance, j))

        distances = sorted(distances)
        cluster_number = distances[0][1]
        clusters[cluster_number].append(i)


    return clusters

In [5]:
def recalculate_centroids(centroids, clusters):
    for i in range (0, len(clusters)):
        image_list = []
        for image_index in clusters[i]:
            image_list.append(images[image_index])
        np_array = np.array(image_list)
        centroids[i] = np_array.mean(axis = 0)

    return centroids

In [6]:
def remake_cluster(clusters, centroids):
    total_images_reclustered = 0
    for i in range(0, len(clusters)):
        cluster = clusters[i]
        number_of_images_reclustered = 0
        for image_index in cluster:
            distances_to_each_centroid = {}
            for j in range(0 , len(centroids)):
                distance = get_distance(images[image_index], centroids[j])
                distances_to_each_centroid[j] = distance

            min_distance = distances_to_each_centroid[0]
            min_distance_index = 0
            for cluster_no in distances_to_each_centroid.keys():
                if distances_to_each_centroid[cluster_no] < min_distance:
                    min_distance = distances_to_each_centroid[cluster_no]
                    min_distance_index = cluster_no
            if min_distance_index != i:
                clusters[i].remove(image_index)
                clusters[min_distance_index].append(image_index)
                change_occured = True
                number_of_images_reclustered += 1
        total_images_reclustered += number_of_images_reclustered
    return clusters, total_images_reclustered

In [7]:
def calculate_purity_of_clusters(clusters):
    total_purity = 0
    for i in range(0, len(clusters)):
        majority = majority_count_of_cluster(clusters[i])
        total_purity += majority
    total_purity = total_purity / len(images)
    return total_purity
    

In [8]:
def majority_count_of_cluster(cluster):
    label_occurance = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in label_occurance:
            label_occurance[label] += 1
        else:
            label_occurance[label] = 1
    majority_label = ""
    max_occurance = 0
    for label in label_occurance.keys():
        if label_occurance[label] > max_occurance:
            max_occurance = label_occurance[label]
            majority_label = label
    
    majority_count = 0
    for image_index in cluster:
        if labels[image_index] == majority_label:
            majority_count += 1
    
    return majority_count 


In [9]:
def calculate_gini_index(clusters):
    gini_total = 0
    for cluster in clusters:
        cluster_gini = 1
        image_index_count = create_image_index_count_dict(cluster)
        for label in image_index_count.keys():
            cluster_gini = cluster_gini - math.pow((image_index_count[label] / len(cluster)) , 2)
        gini_total += cluster_gini * len(cluster)
    
    return gini_total / len(images)

In [10]:
def create_image_index_count_dict(cluster):
    image_index_count = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in image_index_count:
            image_index_count[label] += 1
        else:
            image_index_count[label] = 1
    return image_index_count

In [11]:
def create_image_index_count_dict(cluster):
    image_index_count = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in image_index_count:
            image_index_count[label] += 1
        else:
            image_index_count[label] = 1
    return image_index_count

In [12]:
def KMeans(k):

    number_of_images = images.shape[0]

    startTime = datetime.now()
    centroid_indexs = np.linspace(0, number_of_images-1, k)
    centroid_indexs = list(map(int, centroid_indexs))
    
    centroids = []
    for i in range(0, len(centroid_indexs)):
        centroids.append(images[centroid_indexs[i]])
    # initial clusters
    clusters = make_clusters(images, centroids, k)

    iterate = True
    while(iterate):
        centroids = recalculate_centroids(centroids, clusters)
        clusters, total_images_reclustered = remake_cluster(clusters, centroids)
        purity_of_clusters = calculate_purity_of_clusters(clusters)
        gini_index = calculate_gini_index(clusters)
        print("Purity of cluster: " + str(purity_of_clusters))
        print("Gini index: " + str(gini_index))
        print("Total images reclustered: " + str(total_images_reclustered))
        print()
        
        if total_images_reclustered == 0:
            iterate = False
            
    return clusters

In [17]:
print("Starting KMeans.....\n")
clusters = KMeans(10)
print("Done")

Starting KMeans.....

Purity of cluster: 0.42049230769230767
Gini index: 0.7048758125773793
Total images reclustered: 15154

Purity of cluster: 0.44464615384615386
Gini index: 0.6793639049781723
Total images reclustered: 10404

Purity of cluster: 0.45863076923076923
Gini index: 0.6598276065128407
Total images reclustered: 7305

Purity of cluster: 0.4813846153846154
Gini index: 0.6411267050501133
Total images reclustered: 5572

Purity of cluster: 0.5026461538461539
Gini index: 0.6238797507945099
Total images reclustered: 4557

Purity of cluster: 0.5236615384615385
Gini index: 0.6062330305247313
Total images reclustered: 3724

Purity of cluster: 0.5414307692307693
Gini index: 0.5888666592183857
Total images reclustered: 3156

Purity of cluster: 0.5553076923076923
Gini index: 0.5735991172851713
Total images reclustered: 2593

Purity of cluster: 0.5659846153846154
Gini index: 0.5613075013726744
Total images reclustered: 2120

Purity of cluster: 0.5738
Gini index: 0.553291043032778
Total im

Purity of cluster: 0.5841538461538461
Gini index: 0.5476092652006648
Total images reclustered: 30

Purity of cluster: 0.5841076923076923
Gini index: 0.5476428281710426
Total images reclustered: 22

Purity of cluster: 0.5840923076923077
Gini index: 0.5475787970515861
Total images reclustered: 23

Purity of cluster: 0.5840461538461539
Gini index: 0.547567079284058
Total images reclustered: 19

Purity of cluster: 0.584076923076923
Gini index: 0.5475268630307947
Total images reclustered: 16

Purity of cluster: 0.5840923076923077
Gini index: 0.5475248723208603
Total images reclustered: 17

Purity of cluster: 0.584076923076923
Gini index: 0.547534735636107
Total images reclustered: 23

Purity of cluster: 0.5840461538461539
Gini index: 0.5475918221997137
Total images reclustered: 15

Purity of cluster: 0.5839692307692308
Gini index: 0.5476522615460556
Total images reclustered: 11

Purity of cluster: 0.5839384615384615
Gini index: 0.5476642186732004
Total images reclustered: 10

Purity of clus

In [21]:
print("Starting KMeans.....\n")
clusters = KMeans(5)
print("Done")

Starting KMeans.....

Purity of cluster: 0.30146153846153845
Gini index: 0.8029003395629009
Total images reclustered: 11649

Purity of cluster: 0.3258923076923077
Gini index: 0.7851957525224695
Total images reclustered: 7407

Purity of cluster: 0.33966153846153846
Gini index: 0.7717841486280311
Total images reclustered: 4960

Purity of cluster: 0.34747692307692307
Gini index: 0.7617970877701108
Total images reclustered: 3459

Purity of cluster: 0.35655384615384617
Gini index: 0.7549193582389453
Total images reclustered: 2274

Purity of cluster: 0.3629538461538461
Gini index: 0.750440082522088
Total images reclustered: 1540

Purity of cluster: 0.3670307692307692
Gini index: 0.7476347382286057
Total images reclustered: 1048

Purity of cluster: 0.3698
Gini index: 0.7459196333830733
Total images reclustered: 759

Purity of cluster: 0.3712923076923077
Gini index: 0.7450210858486201
Total images reclustered: 522

Purity of cluster: 0.3723076923076923
Gini index: 0.7444392866755585
Total imag

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0.4007538461538461
Gini index: 0.7383621293507497
Total images reclustered: 0

Purity of cluster: 0

In [13]:
print("Starting KMeans.....\n")
clusters = KMeans(20)
print("Done")

Starting KMeans.....

Purity of cluster: 0.5029846153846154
Gini index: 0.6082286024309501
Total images reclustered: 15023

Purity of cluster: 0.5574615384615385
Gini index: 0.5570924760268444
Total images reclustered: 9848

Purity of cluster: 0.5971076923076923
Gini index: 0.5228584785185385
Total images reclustered: 6795

Purity of cluster: 0.6207846153846154
Gini index: 0.50240008504275
Total images reclustered: 5001

Purity of cluster: 0.6395076923076923
Gini index: 0.4864267182885696
Total images reclustered: 4100

Purity of cluster: 0.6533230769230769
Gini index: 0.4739666641369416
Total images reclustered: 3483

Purity of cluster: 0.6631538461538462
Gini index: 0.4644925930649321
Total images reclustered: 2911

Purity of cluster: 0.671
Gini index: 0.45672528513186383
Total images reclustered: 2578

Purity of cluster: 0.6784
Gini index: 0.4495790025635899
Total images reclustered: 2184

Purity of cluster: 0.6851538461538461
Gini index: 0.44246375163596224
Total images reclustered