In [1]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pprint import pprint
from random import randrange
from scipy.sparse import csr_matrix,vstack
from collections import Counter
import numpy as np
import scipy
import os

In [2]:
# creates absolute path
def abspath(path, *paths):
    """Join ``path`` and any further ``paths`` onto the parent of the working directory.

    The result is ``<cwd>/<os.pardir>/<path>/<paths...>`` assembled with
    ``os.path.join``; the parent-directory segment is left in place (the
    path is not normalised).
    """
    return os.path.join(os.getcwd(), os.pardir, path, *paths)

In [3]:
# K = no of cluster
# d = distance measurement - 'c' for cosine, 'e' for euclidean
def hard_kmeans(K, d, data, labels, max_iterations=200, seed=None):
    """Cluster ``data`` with hard k-means and score the result against ``labels``.

    The routine alternates an assignment step (each sample joins its nearest
    centroid) with an update step (each centroid becomes the mean of its
    members) until the centroids stop changing or ``max_iterations`` is hit.
    It prints K, the stopping iteration, and the purity / Gini index.

    Parameters
    ----------
    K : int
        Number of clusters.
    d : str
        Distance measurement: 'c' selects cosine distance, any other value
        selects euclidean distance.
    data : numpy.ndarray or scipy.sparse.csr_matrix
        Samples, one per row.
    labels : sequence
        Label of each row of ``data``; used only to compute the scores.
    max_iterations : int, optional
        Upper bound on the number of assignment/update rounds (default 200).
    seed : optional
        When given, the initial centroids are drawn from a private
        ``random.Random(seed)`` so the run is repeatable.  When ``None`` the
        module-level ``random`` generator is used.

    Returns
    -------
    tuple
        ``(purity, gini_index)``.  Purity is the fraction of samples carrying
        the majority label of their cluster.  The Gini index is the unweighted
        mean of ``1 - sum(p_label ** 2)`` over the non-empty clusters.

    Raises
    ------
    ValueError
        If ``K`` or ``max_iterations`` is below 1, ``data`` has no rows, or
        ``labels`` does not have one entry per row of ``data``.
    """
    # Function-scope import: the notebook only imports `randrange` from this module.
    import random

    N = data.shape[0]
    if K < 1:
        raise ValueError('K must be at least 1, got %r' % (K,))
    if N < 1:
        raise ValueError('data must contain at least one sample')
    if len(labels) != N:
        raise ValueError('labels has %d entries but data has %d samples' % (len(labels), N))
    if max_iterations < 1:
        raise ValueError('max_iterations must be at least 1, got %r' % (max_iterations,))

    is_sparse = isinstance(data, csr_matrix)
    measure = cosine_distances if d == 'c' else euclidean_distances
    rng = random if seed is None else random.Random(seed)

    # Assume K centroids (by random).  Every row, including the last one, is
    # eligible, and rows are drawn without replacement whenever possible so two
    # centroids do not start on the same sample.
    if K <= N:
        seed_rows = rng.sample(range(N), K)
    else:
        seed_rows = [rng.randrange(N) for _ in range(K)]
    picked = data[seed_rows]
    # Centroids are kept as one dense (K, n_features) float array so that row k
    # always refers to cluster k and means are never truncated to an int dtype.
    centroids = (picked.toarray() if is_sparse else np.asarray(picked)).astype(float)

    print('K=%d' % (K))

    for iterations in range(1, max_iterations + 1):
        # E step - compute memberships given centroids.  Sparse data is
        # compared against a csr copy of the centroids.
        reference = csr_matrix(centroids) if is_sparse else centroids
        assignments = np.argmin(measure(data, reference), axis=1)

        # M step - compute centroids given memberships.  A cluster that
        # received no samples keeps its previous centroid.
        updated = centroids.copy()
        for k in np.unique(assignments):
            members = np.flatnonzero(assignments == k)
            # For csr input .mean() yields a 1 x n_features matrix; flatten it.
            updated[k] = np.asarray(data[members].mean(axis=0)).ravel()

        # Termination conditions - on convergence, else after a fixed number of iterations
        converged = np.array_equal(updated, centroids)
        centroids = updated
        if converged:
            print('Iteration', iterations, ': CONVERGED!')
            break
    else:
        print('Iteration', iterations, ': max reached')

    # Tally the labels of each non-empty cluster: {cluster index: Counter(labels)}
    cluster_labels = {}
    for i, k in enumerate(assignments.tolist()):
        cluster_labels.setdefault(k, Counter())[labels[i]] += 1

    # Calculate purity
    purity = sum(max(counts.values()) for counts in cluster_labels.values()) / N

    # Calculate gini index, averaged over the clusters that actually hold samples
    gini_total = 0
    for counts in cluster_labels.values():
        size = sum(counts.values())
        gini_total += 1 - sum((v / size) ** 2 for v in counts.values())
    gini_index = gini_total / len(cluster_labels)

    # Final result
    print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4), '\n')
    return purity, gini_index

In [4]:
# Fetch data
fashion_train_path = abspath('datasets', 'fashion-mnist_train.csv')
fashion_test_path = abspath('datasets', 'fashion-mnist_test.csv')

# Hand the paths straight to np.loadtxt so NumPy opens and closes the files
# itself and no file handle is left dangling.  skiprows=1 skips the first
# line of each CSV.
fashion_train_dataset = np.loadtxt(fashion_train_path, delimiter=',', skiprows=1)
fashion_test_dataset = np.loadtxt(fashion_test_path, delimiter=',', skiprows=1)

print(fashion_train_dataset.shape)
print(fashion_test_dataset.shape)

(60000, 785)
(10000, 785)


In [5]:
# Data and labels: column 0 holds the label, every remaining column is a feature.
# Indexing with an integer array (rather than a slice) yields an independent copy.
fashion_train_labels = fashion_train_dataset[:, 0]
fashion_train_data = fashion_train_dataset[:, np.arange(1, fashion_train_dataset.shape[1])]
fashion_test_labels = fashion_test_dataset[:, 0]
fashion_test_data = fashion_test_dataset[:, np.arange(1, fashion_test_dataset.shape[1])]

# One shape per line: train data, train labels, test data, test labels
print(
    fashion_train_data.shape,
    fashion_train_labels.shape,
    fashion_test_data.shape,
    fashion_test_labels.shape,
    sep='\n',
)

(60000, 784)
(60000,)
(10000, 784)
(10000,)


In [6]:
k_list = [5, 10, 20]
rule = '-------------------------------------------------'

# Sweep every K once per distance measurement ('e' = euclidean, 'c' = cosine)
# on the un-normalized training split.
print('Without Normalizing - Train Data')
for heading, measure in [('Using Euclidean distances ....', 'e'),
                         ('Using Cosine distances ....', 'c')]:
    print(rule)
    print(heading)
    print(rule)
    for k in k_list:
        hard_kmeans(k, d=measure, data=fashion_train_data, labels=fashion_train_labels)

Without Normalizing - Train Data
-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------
K=5
Iteration 46 : CONVERGED!
Purity - 0.4107 Gini Index - 0.6934 

K=10
Iteration 94 : CONVERGED!
Purity - 0.5795 Gini Index - 0.5343 

K=20
Iteration 192 : CONVERGED!
Purity - 0.6602 Gini Index - 0.4166 

-------------------------------------------------
Using Cosine distances ....
-------------------------------------------------
K=5
Iteration 22 : CONVERGED!
Purity - 0.374 Gini Index - 0.6445 

K=10
Iteration 84 : CONVERGED!
Purity - 0.5858 Gini Index - 0.4653 

K=20
Iteration 81 : CONVERGED!
Purity - 0.6481 Gini Index - 0.4266 



In [7]:
k_list = [5, 10, 20]
rule = '-------------------------------------------------'

# Sweep every K once per distance measurement ('e' = euclidean, 'c' = cosine)
# on the un-normalized test split.
print('Without Normalizing - Test Data')
for heading, measure in [('Using Euclidean distances ....', 'e'),
                         ('Using Cosine distances ....', 'c')]:
    print(rule)
    print(heading)
    print(rule)
    for k in k_list:
        hard_kmeans(k, d=measure, data=fashion_test_data, labels=fashion_test_labels)

Without Normalizing - Test Data
-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------
K=5
Iteration 29 : CONVERGED!
Purity - 0.4125 Gini Index - 0.6911 

K=10
Iteration 68 : CONVERGED!
Purity - 0.573 Gini Index - 0.5155 

K=20
Iteration 24 : CONVERGED!
Purity - 0.6657 Gini Index - 0.4115 

-------------------------------------------------
Using Cosine distances ....
-------------------------------------------------
K=5
Iteration 35 : CONVERGED!
Purity - 0.4551 Gini Index - 0.647 

K=10
Iteration 58 : CONVERGED!
Purity - 0.6155 Gini Index - 0.4177 

K=20
Iteration 64 : CONVERGED!
Purity - 0.6531 Gini Index - 0.396 



# With normalizing

In [8]:
# Normalize data: divide every value by 255
# (presumably the maximum 8-bit pixel intensity - confirm against the dataset)
norm_fashion_train_data = fashion_train_data / 255
norm_fashion_test_data = fashion_test_data / 255

In [9]:
k_list = [5, 10, 20]
rule = '-------------------------------------------------'

# Sweep every K once per distance measurement ('e' = euclidean, 'c' = cosine)
# on the normalized training split.
print('With Normalizing - Train Data')
for heading, measure in [('Using Euclidean distances ....', 'e'),
                         ('Using Cosine distances ....', 'c')]:
    print(rule)
    print(heading)
    print(rule)
    for k in k_list:
        hard_kmeans(k, d=measure, data=norm_fashion_train_data, labels=fashion_train_labels)

With Normalizing - Train Data
-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------
K=5
Iteration 35 : CONVERGED!
Purity - 0.3818 Gini Index - 0.6964 

K=10
Iteration 74 : CONVERGED!
Purity - 0.6029 Gini Index - 0.46 

K=20
Iteration 121 : CONVERGED!
Purity - 0.6512 Gini Index - 0.4114 

-------------------------------------------------
Using Cosine distances ....
-------------------------------------------------
K=5
Iteration 18 : CONVERGED!
Purity - 0.4583 Gini Index - 0.645 

K=10
Iteration 44 : CONVERGED!
Purity - 0.5988 Gini Index - 0.4764 

K=20
Iteration 73 : CONVERGED!
Purity - 0.6414 Gini Index - 0.3914 



In [10]:
k_list = [5, 10, 20]
rule = '-------------------------------------------------'

# Sweep every K once per distance measurement ('e' = euclidean, 'c' = cosine)
# on the normalized test split.
print('With Normalizing - Test Data')
for heading, measure in [('Using Euclidean distances ....', 'e'),
                         ('Using Cosine distances ....', 'c')]:
    print(rule)
    print(heading)
    print(rule)
    for k in k_list:
        hard_kmeans(k, d=measure, data=norm_fashion_test_data, labels=fashion_test_labels)

With Normalizing - Test Data
-------------------------------------------------
Using Euclidean distances ....
-------------------------------------------------
K=5
Iteration 35 : CONVERGED!
Purity - 0.377 Gini Index - 0.7053 

K=10
Iteration 33 : CONVERGED!
Purity - 0.6078 Gini Index - 0.4567 

K=20
Iteration 39 : CONVERGED!
Purity - 0.6568 Gini Index - 0.4364 

-------------------------------------------------
Using Cosine distances ....
-------------------------------------------------
K=5
Iteration 20 : CONVERGED!
Purity - 0.4561 Gini Index - 0.5484 

K=10
Iteration 58 : CONVERGED!
Purity - 0.6159 Gini Index - 0.4171 

K=20
Iteration 43 : CONVERGED!
Purity - 0.6717 Gini Index - 0.4012 

