In [1]:
from scipy.spatial import distance
import numpy as np
import mnist_reader
import math

In [2]:
training_images, training_labels = mnist_reader.load_mnist('data/fashion', kind='train')
test_images, test_labels = mnist_reader.load_mnist('data/fashion', kind='t10k')
images = np.concatenate((training_images, test_images), axis = 0)
labels = np.concatenate((training_labels, test_labels), axis = 0)

In [3]:
def get_distance(a,b):
    return distance.euclidean(a,b)

In [4]:
def make_clusters(images, centroids, k):
    number_of_images = images.shape[0]
    clusters = [[] for i in range(0, k)]

    for i in range(0, number_of_images - 1):
        distances = []
        for j in range(0, k):
            distance = get_distance(images[i], centroids[j])
            distances.append((distance, j))

        distances = sorted(distances)
        cluster_number = distances[0][1]
        clusters[cluster_number].append(i)


    return clusters

In [5]:
def recalculate_centroids(centroids, clusters):
    for i in range (0, len(clusters)):
        image_list = []
        for image_index in clusters[i]:
            image_list.append(images[image_index])
        np_array = np.array(image_list)
        centroids[i] = np_array.mean(axis = 0)

    return centroids

In [6]:
def remake_cluster(clusters, centroids):
    
    total_images_reclustered = 0
    for i in range(0, len(clusters)):
        cluster = clusters[i]
        number_of_images_reclustered = 0
        for image_index in cluster:
            distances_to_each_centroid = {}
            for j in range(0 , len(centroids)):
                distance = get_distance(images[image_index], centroids[j])
                distances_to_each_centroid[j] = distance

            min_distance = distances_to_each_centroid[0]
            min_distance_index = 0
            for cluster_no in distances_to_each_centroid.keys():
                if distances_to_each_centroid[cluster_no] < min_distance:
                    min_distance = distances_to_each_centroid[cluster_no]
                    min_distance_index = cluster_no
            if min_distance_index != i:
                clusters[i].remove(image_index)
                clusters[min_distance_index].append(image_index)
                change_occured = True
                number_of_images_reclustered += 1
        total_images_reclustered += number_of_images_reclustered
    return clusters, total_images_reclustered

In [7]:
def calculate_purity_of_clusters(clusters, k):
    total_purity = 0
    for i in range(0, len(clusters)):
        purity = purity_of_cluster(clusters[i])
#         print("Purity: " + str(purity))
        total_purity += purity
    total_purity = total_purity / len(images)
    return total_purity

In [8]:
def purity_of_cluster(cluster):
    label_occurance = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in label_occurance:
            label_occurance[label] += 1
        else:
            label_occurance[label] = 1
    majority_label = ""
    max_occurance = 0
    for label in label_occurance.keys():
        if label_occurance[label] > max_occurance:
            max_occurance = label_occurance[label]
            majority_label = label
    
    majority_count = 0
    for image_index in cluster:
        if labels[image_index] == majority_label:
            majority_count += 1
    
    return majority_count 

In [9]:
def calculate_gini_index(clusters):
    gini_total = 0
    for cluster in clusters:
        cluster_gini = 1
        image_index_count = create_image_index_count_dict(cluster)
        for label in image_index_count.keys():
            cluster_gini = cluster_gini - math.pow((image_index_count[label] / len(cluster)) , 2)
        gini_total += cluster_gini * len(cluster)
    
    return gini_total / len(images)
        

In [10]:
def create_image_index_count_dict(cluster):
    image_index_count = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in image_index_count:
            image_index_count[label] += 1
        else:
            image_index_count[label] = 1
    return image_index_count

In [15]:

def KMeans(k):

    number_of_images = images.shape[0]

    centroid_indexs = np.linspace(0, number_of_images-1, k)
    centroid_indexs = list(map(int, centroid_indexs))

    centroids = []
    for i in range(0, len(centroid_indexs)):
        centroids.append(images[centroid_indexs[i]])
    # initial clusters
    clusters = make_clusters(images, centroids, k)

    iterate = True
    while(iterate):
        centroids = recalculate_centroids(centroids, clusters)
        clusters, total_images_reclustered = remake_cluster(clusters, centroids)
        purity_of_clusters = calculate_purity_of_clusters(clusters, k)
        gini_index = calculate_gini_index(clusters)
        print("Purity of cluster: " + str(purity_of_clusters))
        print("Gini index: " + str(gini_index))
        print("Total images reclustered: " + str(total_images_reclustered))
        print()
        
        if total_images_reclustered == 0:
            iterate = False
            
    return clusters



In [18]:
print("Starting KMeans.........\n")
clusters = KMeans(10)
print("Done")

Starting KMeans.........

Purity of cluster: 0.35754285714285716
Gini index: 0.7476871806430206
Total images reclustered: 19337

Purity of cluster: 0.40114285714285713
Gini index: 0.7065730706472088
Total images reclustered: 11236

Purity of cluster: 0.43
Gini index: 0.6754671058825736
Total images reclustered: 7119

Purity of cluster: 0.45094285714285715
Gini index: 0.6495364553702704
Total images reclustered: 5040

Purity of cluster: 0.4689714285714286
Gini index: 0.6293112019552598
Total images reclustered: 4000

Purity of cluster: 0.48684285714285713
Gini index: 0.6119093125266154
Total images reclustered: 3572

Purity of cluster: 0.5029142857142858
Gini index: 0.5962686092966007
Total images reclustered: 3290

Purity of cluster: 0.5169285714285714
Gini index: 0.5827694192658567
Total images reclustered: 3147

Purity of cluster: 0.5281857142857143
Gini index: 0.5735937099050163
Total images reclustered: 2954

Purity of cluster: 0.5354285714285715
Gini index: 0.5678352411740929
Tota

Purity of cluster: 0.5836428571428571
Gini index: 0.5431756765928036
Total images reclustered: 45

Purity of cluster: 0.5836714285714286
Gini index: 0.5431763402711506
Total images reclustered: 45

Purity of cluster: 0.5838
Gini index: 0.5430680493343226
Total images reclustered: 38

Purity of cluster: 0.584
Gini index: 0.5429046723289654
Total images reclustered: 39

Purity of cluster: 0.5841571428571428
Gini index: 0.5428750178024627
Total images reclustered: 49

Purity of cluster: 0.5842714285714286
Gini index: 0.5428164800190733
Total images reclustered: 51

Purity of cluster: 0.5844428571428572
Gini index: 0.5426798549365279
Total images reclustered: 44

Purity of cluster: 0.5846571428571429
Gini index: 0.5425003217071769
Total images reclustered: 34

Purity of cluster: 0.5847714285714286
Gini index: 0.5423991330235655
Total images reclustered: 30

Purity of cluster: 0.5848714285714286
Gini index: 0.5423542330619389
Total images reclustered: 21

Purity of cluster: 0.58491428571428

In [19]:
print("Starting KMeans.........\n")
clusters = KMeans(5)
print("Done")

Starting KMeans.........

Purity of cluster: 0.25524285714285716
Gini index: 0.812659995983776
Total images reclustered: 19493

Purity of cluster: 0.2891142857142857
Gini index: 0.7893106705108118
Total images reclustered: 12603

Purity of cluster: 0.31538571428571427
Gini index: 0.7778977188847753
Total images reclustered: 8444

Purity of cluster: 0.3296857142857143
Gini index: 0.7703998085916869
Total images reclustered: 5800

Purity of cluster: 0.34127142857142856
Gini index: 0.7634925447944679
Total images reclustered: 4197

Purity of cluster: 0.3502142857142857
Gini index: 0.7573297592663034
Total images reclustered: 3386

Purity of cluster: 0.3575714285714286
Gini index: 0.7510406072315362
Total images reclustered: 2871

Purity of cluster: 0.3626
Gini index: 0.74506776577804
Total images reclustered: 2565

Purity of cluster: 0.37011428571428573
Gini index: 0.738966659857378
Total images reclustered: 2281

Purity of cluster: 0.3775428571428571
Gini index: 0.7334895060367163
Total 

In [20]:
print("Starting KMeans.........\n")
clusters = KMeans(20)
print("Done")

Starting KMeans.........

Purity of cluster: 0.5351142857142858
Gini index: 0.6034197890197003
Total images reclustered: 20264

Purity of cluster: 0.6016
Gini index: 0.5315158407222228
Total images reclustered: 13070

Purity of cluster: 0.6455714285714286
Gini index: 0.4818663394844875
Total images reclustered: 9137

Purity of cluster: 0.6791
Gini index: 0.4453557937701466
Total images reclustered: 6552

Purity of cluster: 0.6976571428571429
Gini index: 0.4229026952012545
Total images reclustered: 4804

Purity of cluster: 0.7044857142857143
Gini index: 0.41264606397049697
Total images reclustered: 3703

Purity of cluster: 0.7053285714285714
Gini index: 0.40845959897414374
Total images reclustered: 3129

Purity of cluster: 0.7038571428571428
Gini index: 0.4061683309138223
Total images reclustered: 2671

Purity of cluster: 0.7016857142857142
Gini index: 0.40358850325543205
Total images reclustered: 2221

Purity of cluster: 0.7028571428571428
Gini index: 0.4020115205670585
Total images re

Purity of cluster: 0.7307
Gini index: 0.38730124126511684
Total images reclustered: 9

Purity of cluster: 0.7306857142857143
Gini index: 0.3873249768804989
Total images reclustered: 9

Purity of cluster: 0.7306857142857143
Gini index: 0.38730959522028763
Total images reclustered: 9

Purity of cluster: 0.7307285714285714
Gini index: 0.38724227612869466
Total images reclustered: 9

Purity of cluster: 0.7307142857142858
Gini index: 0.3872514709611897
Total images reclustered: 8

Purity of cluster: 0.7307285714285714
Gini index: 0.3872398725338756
Total images reclustered: 2

Purity of cluster: 0.7307285714285714
Gini index: 0.3872435091979794
Total images reclustered: 4

Purity of cluster: 0.7307285714285714
Gini index: 0.3872312753351585
Total images reclustered: 3

Purity of cluster: 0.7307142857142858
Gini index: 0.38723784067635436
Total images reclustered: 1

Purity of cluster: 0.7307142857142858
Gini index: 0.38723784067635436
Total images reclustered: 0

Done
