In [1]:
from scipy.spatial import distance
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.decomposition import IncrementalPCA

In [2]:
# preprocess data 

file = open("mnist_noisy_SAMPLE5000_K20_F31.txt", 'r')
images = []
labels = []
for line in file:
    labels.append(np.int32(line[0]))
    line = line[1:]
    image = []
    for i in line.split():
        image.append(np.float32(i))
    images.append(image)
file.close()
images = np.array(images)
labels = np.array(labels)

In [3]:
pca = IncrementalPCA(n_components=30)
images = pca.fit_transform(images)

In [25]:
plt.imshow(images[0].reshape(28,28))

ValueError: cannot reshape array of size 30 into shape (28,28)

In [24]:
def get_distance(a,b):
    return distance.cosine(a,b)

In [6]:
def make_clusters(images, centroids, k):
    number_of_images = images.shape[0]
    clusters = [[] for i in range(0, k)]

    for i in range(0, number_of_images - 1):
        distances = []
        for j in range(0, k):
            distance = get_distance(images[i], centroids[j])
            distances.append((distance, j))

        distances = sorted(distances)
        cluster_number = distances[0][1]
        clusters[cluster_number].append(i)


    return clusters

In [7]:
def recalculate_centroids(centroids, clusters):
    for i in range (0, len(clusters)):
        image_list = []
        for image_index in clusters[i]:
            image_list.append(images[image_index])
        np_array = np.array(image_list)
        centroids[i] = np_array.mean(axis = 0)

    return centroids

In [8]:
def remake_cluster(clusters, centroids):
    
    total_images_reclustered = 0
    for i in range(0, len(clusters)):
        cluster = clusters[i]
        number_of_images_reclustered = 0
        for image_index in cluster:
            distances_to_each_centroid = {}
            for j in range(0 , len(centroids)):
                distance = get_distance(images[image_index], centroids[j])
                distances_to_each_centroid[j] = distance

            min_distance = distances_to_each_centroid[0]
            min_distance_index = 0
            for cluster_no in distances_to_each_centroid.keys():
                if distances_to_each_centroid[cluster_no] < min_distance:
                    min_distance = distances_to_each_centroid[cluster_no]
                    min_distance_index = cluster_no
            if min_distance_index != i:
                clusters[i].remove(image_index)
                clusters[min_distance_index].append(image_index)
                change_occured = True
                number_of_images_reclustered += 1
        total_images_reclustered += number_of_images_reclustered
    return clusters, total_images_reclustered

In [9]:
def calculate_purity_of_clusters(clusters, k):
    total_purity = 0
    for i in range(0, len(clusters)):
        purity = purity_of_cluster(clusters[i])
#         print("Purity: " + str(purity))
        total_purity += purity
    total_purity = total_purity / len(images)
    return total_purity

In [10]:
def purity_of_cluster(cluster):
    label_occurance = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in label_occurance:
            label_occurance[label] += 1
        else:
            label_occurance[label] = 1
    majority_label = ""
    max_occurance = 0
    for label in label_occurance.keys():
        if label_occurance[label] > max_occurance:
            max_occurance = label_occurance[label]
            majority_label = label
    
    majority_count = 0
    for image_index in cluster:
        if labels[image_index] == majority_label:
            majority_count += 1
    
    return majority_count 

In [19]:
def calculate_purity_of_labels(clusters):
    total = 0
    for label in set(labels):
        labels_count = get_label_distribution_count(label, clusters)
        total += labels_count[np.argmax(labels_count)] / len(clusters[np.argmax(labels_count)])
    return total / 10

In [12]:
def get_label_distribution_count(test_label, clusters):
    label_cluster_counts = []
    for cluster in clusters:
        count = 0
        for image_index in cluster:
            if labels[image_index] == test_label:
                count += 1
        label_cluster_counts.append(count)
    return label_cluster_counts


In [13]:
def calculate_gini_index(clusters):
    gini_total = 0
    for cluster in clusters:
        cluster_gini = 1
        image_index_count = create_image_index_count_dict(cluster)
        for label in image_index_count.keys():
            cluster_gini = cluster_gini - math.pow((image_index_count[label] / len(cluster)) , 2)
        gini_total += cluster_gini * len(cluster)
    
    return gini_total / len(images)
        

In [14]:
def create_image_index_count_dict(cluster):
    image_index_count = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in image_index_count:
            image_index_count[label] += 1
        else:
            image_index_count[label] = 1
    return image_index_count

In [27]:
def KMeans(k):
    temp_total = 0
    print("Working with images with shape: " + str(images.shape))
    number_of_images = images.shape[0]

    centroid_indexs = np.linspace(0, number_of_images-1, k)
    centroid_indexs = list(map(int, centroid_indexs))

    centroids = []
    for i in range(0, len(centroid_indexs)):
        centroids.append(images[centroid_indexs[i]])
    # initial clusters
    clusters = make_clusters(images, centroids, k)

    iterate = True
    while(iterate):
        centroids = recalculate_centroids(centroids, clusters)
        clusters, total_images_reclustered = remake_cluster(clusters, centroids)
        purity_of_clusters = calculate_purity_of_clusters(clusters, k)
        gini_index = calculate_gini_index(clusters)
        print("Purity of cluster (labels, cluster): " + str(purity_of_clusters))
        print("Gini index: " + str(gini_index))
        print("Total images reclustered: " + str(total_images_reclustered))
        print()
        
        if total_images_reclustered == 0:
            iterate = False
        if total_images_reclustered <= 6:
            temp_total += 1
        if temp_total == 10:
            iterate = False
    return clusters

In [28]:
print("Starting KMeans.........\n")
clusters = KMeans(20)
print("\n Purity of labels: " + str(calculate_purity_of_labels(clusters)))
print("Done")

Starting KMeans.........

Working with images with shape: (5000, 30)
Purity of cluster (labels, cluster): 0.2628
Gini index: 0.831269638269363
Total images reclustered: 781

Purity of cluster (labels, cluster): 0.2648
Gini index: 0.8291104081637335
Total images reclustered: 485

Purity of cluster (labels, cluster): 0.2652
Gini index: 0.827630953459862
Total images reclustered: 340

Purity of cluster (labels, cluster): 0.2694
Gini index: 0.8247963403222539
Total images reclustered: 242

Purity of cluster (labels, cluster): 0.269
Gini index: 0.8229942842523513
Total images reclustered: 225

Purity of cluster (labels, cluster): 0.2714
Gini index: 0.8195449825830208
Total images reclustered: 182

Purity of cluster (labels, cluster): 0.2736
Gini index: 0.8147885245942924
Total images reclustered: 173

Purity of cluster (labels, cluster): 0.2764
Gini index: 0.8093660344060901
Total images reclustered: 141

Purity of cluster (labels, cluster): 0.2818
Gini index: 0.8028589700261534
Total image

In [29]:
a = calculate_purity_of_clusters(clusters, 20)

In [30]:
b = calculate_purity_of_labels(clusters)

In [31]:
a

0.295

In [32]:
b

0.30949703043031607

In [33]:
2 * a * b / (a+b)

0.3020746815313532

In [116]:
purity_of_each_cluster = []
for cluster in clusters:
    purity_of_each_cluster.append(purity_of_cluster(cluster) / len(cluster))

In [117]:
purity_of_each_cluster

[0.7590361445783133,
 0.27014218009478674,
 0.15339233038348082,
 0.24152542372881355,
 0.17391304347826086,
 0.4326241134751773,
 0.1904,
 0.21768707482993196,
 0.7105263157894737,
 0.12658227848101267,
 0.3956043956043956,
 0.68,
 0.2129032258064516,
 0.17866004962779156,
 0.3333333333333333,
 0.14814814814814814,
 0.21481481481481482,
 0.8905472636815921,
 0.16173120728929385,
 0.47342995169082125]

In [57]:
def majority_count_label(cluster):
    label_occurance = {}
    for image_index in cluster:
        label = labels[image_index]
        if label in label_occurance:
            label_occurance[label] += 1
        else:
            label_occurance[label] = 1
    majority_label = ""
    max_occurance = 0
    for label in label_occurance.keys():
        if label_occurance[label] > max_occurance:
            max_occurance = label_occurance[label]
            majority_label = label
    
    return majority_label 

In [61]:
majority_count_label(clusters[9])

1

In [74]:
majority_count_label(clusters[4])

8

In [75]:
majority_count_label(clusters[16])

8

In [65]:
for image_index in clusters[9]:
    print(labels[image_index], end = " ")

4 6 4 5 2 3 9 5 8 7 4 0 5 7 7 5 0 4 0 1 6 8 2 0 4 5 0 6 3 9 9 7 5 6 4 5 0 0 5 6 6 7 4 0 8 2 3 5 9 9 6 9 6 3 1 4 6 5 1 8 3 9 9 4 6 3 3 1 7 3 3 9 0 1 1 3 7 3 7 4 2 5 2 4 2 9 7 9 3 1 6 1 2 5 3 6 6 4 7 9 9 0 6 6 4 3 3 2 3 2 7 7 2 7 4 8 3 1 7 6 3 8 6 2 2 1 7 9 8 2 2 1 0 8 6 1 1 9 2 9 1 9 5 3 1 1 7 1 7 8 2 2 8 1 6 9 1 1 

In [72]:
for image_index in clusters[4]:
    print(labels[image_index], end = " ")

8 2 5 5 5 1 3 1 1 2 8 9 7 1 1 7 1 2 1 1 9 4 8 2 2 2 4 1 9 3 9 4 4 8 3 7 1 1 5 1 8 0 3 1 1 1 4 1 8 7 1 2 1 9 3 1 9 1 1 5 8 6 5 8 9 8 4 2 8 8 4 8 9 6 9 5 8 5 8 8 2 5 4 2 4 6 3 8 9 7 9 2 2 9 3 3 9 9 9 8 4 5 3 3 3 8 3 7 9 0 4 3 4 3 3 8 7 2 3 3 5 7 3 3 9 3 0 7 1 3 4 4 9 5 9 4 3 8 8 6 6 9 8 5 3 4 6 2 8 8 2 6 8 4 8 6 4 8 3 2 3 7 6 8 3 8 2 2 4 4 4 8 2 8 9 3 7 3 7 4 9 4 4 8 

In [73]:
for image_index in clusters[16]:
    print(labels[image_index], end = " ")

2 5 1 6 5 1 6 9 2 1 2 2 6 1 2 1 5 2 5 2 4 2 2 3 2 8 5 2 9 8 8 8 8 8 5 8 6 4 7 3 2 2 2 2 8 8 9 8 8 5 8 8 8 3 2 4 8 3 8 3 8 1 8 7 3 8 8 6 2 8 9 3 1 1 4 3 5 5 4 0 5 0 9 8 7 1 3 8 7 1 5 4 5 8 8 0 4 8 7 8 6 2 7 9 9 7 3 5 2 7 4 8 1 6 5 7 8 2 1 9 1 9 1 4 5 9 5 4 1 1 9 2 1 2 1 

In [66]:
len(clusters[9])

158

In [67]:
len(clusters[4])

184

In [70]:
len(clusters[16])

135

In [86]:
temp_cluster1 = [] # 1
temp_cluster2 = [] # 8
temp_cluster3 = [] # rest
for image_index in clusters[9]:
    if labels[image_index] == 1:
        temp_cluster1.append(image_index)
    elif labels[image_index] == 8:
        temp_cluster2.append(image_index)
    else:
        temp_cluster3.append(image_index)
for image_index in clusters[4]:
    if labels[image_index] == 1:
        temp_cluster1.append(image_index)
    elif labels[image_index] == 8:
        temp_cluster2.append(image_index)
    else:
        temp_cluster3.append(image_index)
for image_index in clusters[16]:
    if labels[image_index] == 1:
        temp_cluster1.append(image_index)
    elif labels[image_index] == 8:
        temp_cluster2.append(image_index)
    else:
        temp_cluster3.append(image_index)

In [87]:
for image_index in temp_cluster1:
    print(labels[image_index], end = " ")

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [88]:
for image_index in temp_cluster2:
    print(labels[image_index], end = " ")

8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 

In [89]:
for image_index in temp_cluster3:
    print(labels[image_index], end = " ")

4 6 4 5 2 3 9 5 7 4 0 5 7 7 5 0 4 0 6 2 0 4 5 0 6 3 9 9 7 5 6 4 5 0 0 5 6 6 7 4 0 2 3 5 9 9 6 9 6 3 4 6 5 3 9 9 4 6 3 3 7 3 3 9 0 3 7 3 7 4 2 5 2 4 2 9 7 9 3 6 2 5 3 6 6 4 7 9 9 0 6 6 4 3 3 2 3 2 7 7 2 7 4 3 7 6 3 6 2 2 7 9 2 2 0 6 9 2 9 9 5 3 7 7 2 2 6 9 2 5 5 5 3 2 9 7 7 2 9 4 2 2 2 4 9 3 9 4 4 3 7 5 0 3 4 7 2 9 3 9 5 6 5 9 4 2 4 9 6 9 5 5 2 5 4 2 4 6 3 9 7 9 2 2 9 3 3 9 9 9 4 5 3 3 3 3 7 9 0 4 3 4 3 3 7 2 3 3 5 7 3 3 9 3 0 7 3 4 4 9 5 9 4 3 6 6 9 5 3 4 6 2 2 6 4 6 4 3 2 3 7 6 3 2 2 4 4 4 2 9 3 7 3 7 4 9 4 4 2 5 6 5 6 9 2 2 2 6 2 5 2 5 2 4 2 2 3 2 5 2 9 5 6 4 7 3 2 2 2 2 9 5 3 2 4 3 3 7 3 6 2 9 3 4 3 5 5 4 0 5 0 9 7 3 7 5 4 5 0 4 7 6 2 7 9 9 7 3 5 2 7 4 6 5 7 2 9 9 4 5 9 5 4 9 2 2 

In [95]:
test_clusters = []
for i in range(0, len(clusters)):
    if i == 4: 
        test_clusters.append(temp_cluster1)
    elif i == 9:
        test_clusters.append(temp_cluster2)
    elif i == 16:
        test_clusters.append(temp_cluster3)
    else:
        test_clusters.append(clusters[i])

In [118]:
purity_of_each_cluster = []
for cluster in test_clusters:
    purity_of_each_cluster.append(purity_of_cluster(cluster) / len(cluster))

In [120]:
total_purity = 0
for purity in purity_of_each_cluster:
    total_purity += purity

In [126]:
avg_purity = total_purity / 20

In [127]:
avg_purity 

0.4308660607932636

In [99]:
b = calculate_purity_of_labels(test_clusters)

In [98]:
b

0.3166

In [128]:
a = 0.4308660607932636

In [133]:
2 * a * b / (a+b)

0.3649990334072882