In [1]:
from sklearn.datasets import fetch_mldata
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.cluster import AgglomerativeClustering
from pprint import pprint
from collections import Counter
from random import randrange
import numpy as np

In [7]:
def evaluation_metrics(pred_labels, true_labels=None):
    """Print clustering quality metrics and a summary of the predicted clusters.

    When ``true_labels`` is supplied, the purity and the unweighted mean of
    the per-cluster Gini indices are printed first. The number of distinct
    predicted labels and their counts are always printed.

    Parameters
    ----------
    pred_labels : sequence
        Predicted cluster label of each sample. Samples labelled ``'Noise'``
        are excluded from the per-cluster statistics, but they still count
        towards the purity denominator (the total number of samples).
    true_labels : sequence, optional
        Ground-truth label of each sample, indexed in step with
        ``pred_labels``. If ``None``, purity and Gini index are skipped.

    Returns
    -------
    None
        All results are written to standard output.
    """
    if true_labels is not None:
        n_samples = len(pred_labels)

        # Tally the true labels that fall into each predicted cluster
        cluster_counts = {}
        for i, cluster in enumerate(pred_labels):
            cluster_counts.setdefault(cluster, Counter())[true_labels[i]] += 1

        # Noise points do not form a cluster
        cluster_counts.pop('Noise', None)
        n_clusters = len(cluster_counts)

        # Purity: fraction of samples that carry their cluster's majority label.
        # Guard the empty-input case instead of dividing by zero.
        majority_total = sum(max(counts.values()) for counts in cluster_counts.values())
        purity = majority_total / n_samples if n_samples else 0.0

        # Gini index: 1 - sum(p_i^2) per cluster, averaged over clusters
        gini_total = 0
        for counts in cluster_counts.values():
            cluster_size = sum(counts.values())  # loop-invariant, computed once
            gini_total += 1 - sum((v / cluster_size) ** 2 for v in counts.values())

        gini_index = gini_total / n_clusters if n_clusters else 0.0

        # Final result
        print('Purity -', round(purity, 4), 'Gini Index -', round(gini_index, 4))

    label_counts = Counter(pred_labels)
    print('No. of clusters -', len(label_counts))
    print(label_counts, '\n')

In [3]:
# N - size of sample
def get_samples(data, N, labels=None):
    """Draw ``N`` rows from ``data`` uniformly at random, with replacement.

    Parameters
    ----------
    data : 2-D array
        Source rows; ``data.shape`` must be ``(n_rows, n_features)``.
    N : int
        Number of rows to draw.
    labels : sequence, optional
        Per-row labels indexed in step with ``data``. When given, the labels
        of the drawn rows are returned alongside the rows.

    Returns
    -------
    numpy.ndarray or tuple
        A float array of shape ``(N, n_features)`` when ``labels`` is
        ``None``; otherwise ``(sampled_data, sampled_labels)`` where
        ``sampled_labels`` is a list of length ``N``.

    Raises
    ------
    ValueError
        If ``data`` has no rows and ``N`` is positive (raised by ``randrange``).
    """
    n_rows = data.shape[0]

    # randrange's upper bound is exclusive, so randrange(n_rows) covers every
    # valid index 0 .. n_rows - 1 (the last row included).
    picks = [randrange(n_rows) for _ in range(N)]

    sampled_data = np.zeros((N, data.shape[1]))
    for i, j in enumerate(picks):
        sampled_data[i] = data[j]

    if labels is None:
        return sampled_data

    sampled_labels = [labels[j] for j in picks]
    return (sampled_data, sampled_labels)

In [4]:
# Fetch the 'mnist original' dataset
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# 0.22 - confirm the pinned scikit-learn version, or plan a migration to
# fetch_openml (which returns labels in a different type/order).
mnist_dataset = fetch_mldata('mnist original')

# Split the bunch into the feature matrix and the label vector
mnist_data, mnist_labels = mnist_dataset.data, mnist_dataset.target

# Show both shapes, one per line
print(mnist_data.shape, mnist_labels.shape, sep='\n')

(70000, 784)
(70000,)
(70000, 784)


In [8]:
# Without normalizing: cluster a random sample of the raw data into 10 groups
# and score the result against the sampled ground-truth labels.
sampled_data, sampled_labels = get_samples(mnist_data, N=20000, labels=mnist_labels)
model = AgglomerativeClustering(n_clusters=10)
pred_labels = model.fit_predict(sampled_data)
evaluation_metrics(pred_labels, true_labels=sampled_labels)

Purity - 0.6758 Gini Index - 0.3159
No. of clusters - 10
Counter({0: 3479, 1: 3178, 6: 2684, 5: 2520, 2: 1925, 3: 1721, 4: 1295, 7: 1230, 9: 1062, 8: 906}) 



In [None]:
# Normalize data: divide every entry by 255
# (presumably the maximum pixel intensity - confirm against the dataset)
norm_mnist_data = np.true_divide(mnist_data, 255)
print(norm_mnist_data.shape)

In [9]:
# With normalizing: same experiment as the un-normalized run, on the scaled data
sampled_data, sampled_labels = get_samples(norm_mnist_data, 20000, mnist_labels)
model = AgglomerativeClustering(n_clusters=10)
# fit_predict returns the cluster labels directly, so pred_labels holds labels
# (not the fitted estimator), matching the un-normalized cell.
pred_labels = model.fit_predict(sampled_data)
evaluation_metrics(pred_labels, sampled_labels)

Purity - 0.6877 Gini Index - 0.299
No. of clusters - 10
Counter({1: 3183, 5: 3111, 4: 2767, 2: 2343, 3: 2133, 0: 1964, 6: 1570, 7: 1096, 8: 953, 9: 880}) 

