In [42]:
import pandas as pd
import numpy as np
from keras.datasets import mnist

In [51]:
# Fetch MNIST through the Keras helper and unpack the train / test splits.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Report the shapes of the training images and their labels.
for caption, array in (('Training Data:', x_train), ('Training Labels:', y_train)):
    print(caption, array.shape)

Training Data: (60000, 28, 28)
Training Labels: (60000,)


In [44]:
# Flatten every image into one row vector and rescale the pixel values by 1/255.
n_images = x_train.shape[0]
X = x_train.reshape(n_images, -1).astype(float) / 255.
Y = y_train

# Show the shape of the full design matrix and of a single flattened sample.
for shape in (X.shape, X[0].shape):
    print(shape)

(60000, 784)
(784,)


In [45]:
class KMeansScratch:
    """Minimal k-means clustering (Lloyd's algorithm) on a 2-D NumPy array.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids.
    max_iter : int, default 100
        Maximum number of assign/update rounds performed by ``fit``.
    random_state : int or None, default None
        Seed for the initial centroid draw. When ``None`` the global
        ``np.random`` state is used, so ``np.random.seed`` still controls it.

    Attributes
    ----------
    centroids : ndarray of shape (n_clusters, n_features)
        Cluster centres; set by ``fit``.
    labels_ : ndarray of shape (n_samples,)
        Cluster index of every sample passed to ``fit``; set by ``fit``.
    """

    def __init__(self, n_clusters, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X):
        """Cluster ``X`` and return the cluster index of every row.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``n_clusters`` is not in ``[1, n_samples]``.
        """
        # Work in floating point: with integer input the centroids would
        # otherwise inherit an integer dtype and every mean would be truncated.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_samples = X.shape[0]
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError("n_clusters must be between 1 and the number of samples")

        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        # Initial centroids are distinct rows of X (sampling without replacement).
        self.centroids = X[rng.choice(n_samples, self.n_clusters, replace=False)]

        for _ in range(self.max_iter):
            labels = self.assign_clusters(X)
            new_centroids = self.update_centroids(X, labels)
            if np.allclose(self.centroids, new_centroids):
                break
            self.centroids = new_centroids
        else:
            # Iteration budget exhausted (or max_iter == 0): the centroids moved
            # after the last assignment, so refresh the labels against them.
            labels = self.assign_clusters(X)

        self.labels_ = labels
        return labels

    def assign_clusters(self, X):
        """Return, for every row of ``X``, the index of its nearest centroid."""
        X = np.atleast_2d(np.asarray(X, dtype=float))
        # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, and ||x||^2 does not depend on
        # the centroid, so the argmin only needs ||c||^2 - 2 x.c. This keeps the
        # working set at (n_samples, n_clusters) instead of materialising a
        # (n_clusters, n_samples, n_features) difference tensor.
        centroid_sq_norms = (self.centroids ** 2).sum(axis=1)
        scores = centroid_sq_norms - 2.0 * (X @ self.centroids.T)
        return np.argmin(scores, axis=1)

    def update_centroids(self, X, labels):
        """Return new centroids: the mean of the samples assigned to each cluster.

        A cluster that received no samples keeps its previous centroid instead
        of becoming NaN (the mean of an empty selection).
        """
        X = np.asarray(X, dtype=float)
        new_centroids = np.array(self.centroids, dtype=float, copy=True)
        for i in range(self.n_clusters):
            members = X[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)
        return new_centroids

    def predict(self, X):
        """Assign each row of ``X`` to the nearest fitted centroid (requires ``fit``)."""
        return self.assign_clusters(X)
    
    
    First, we randomly pick k data points to serve as the initial means (cluster centroids).
    We then assign each item to its closest mean and update each mean's coordinates
    to the average of the items currently assigned to that cluster.
    We repeat these two steps until the centroids stop changing or the maximum number of iterations is reached; the final assignments are our clusters.


In [46]:
# The number of clusters equals the number of distinct digit classes in the test labels.
distinct_digits = np.unique(y_test)
n_digits = distinct_digits.size
print(n_digits)

10


In [47]:
# Seed NumPy's global RNG so the random centroid initialisation (and therefore
# every cluster id shown below) is reproducible across Restart & Run All.
np.random.seed(42)
Kmeans = KMeansScratch(n_clusters=n_digits)
pred = Kmeans.fit(X)

In [48]:
# `clusters` was never assigned anywhere in this notebook (NameError on a fresh
# kernel); the per-sample cluster assignments returned by fit live in `pred`.
clusters = pred
print("Labels:", clusters)

Labels: [6 4 0 ... 6 8 9]


In [49]:
def infer_cluster_labels(kmeans, actual_labels):
    """Associate each cluster with the ground-truth label that dominates it.

    Parameters
    ----------
    kmeans : object
        Fitted clusterer exposing ``n_clusters`` and ``labels_`` (one cluster
        index per sample).
    actual_labels : array-like of non-negative ints
        Ground-truth label of every sample, aligned with ``kmeans.labels_``.

    Returns
    -------
    dict
        ``{label: [cluster indices whose most frequent label is label]}``.
        Clusters that contain no samples are omitted, since they have no
        majority label.
    """
    assignments = np.asarray(kmeans.labels_)
    truth = np.asarray(actual_labels)
    inferred_labels = {}

    for i in range(kmeans.n_clusters):
        # ravel() keeps bincount happy for any member count, including one.
        members = np.ravel(truth[assignments == i])
        if members.size == 0:
            # argmax of an empty histogram would raise ValueError.
            continue
        majority = int(np.argmax(np.bincount(members)))
        inferred_labels.setdefault(majority, []).append(i)
    return inferred_labels

def infer_data_labels(X_labels, cluster_labels):
    """Translate per-sample cluster indices into predicted labels.

    Parameters
    ----------
    X_labels : 1-D array-like of ints
        Cluster index of every sample.
    cluster_labels : dict
        Mapping ``{label: [cluster indices]}`` as produced by
        ``infer_cluster_labels``.

    Returns
    -------
    ndarray of uint8, shape (len(X_labels),)
        Label of the cluster each sample belongs to; samples whose cluster
        appears in no entry of ``cluster_labels`` stay at 0.
    """
    assignments = np.asarray(X_labels)
    predicted_labels = np.zeros(len(assignments), dtype=np.uint8)

    # One vectorised membership test per label instead of a Python loop over
    # every (sample, label) pair.
    for key, value in cluster_labels.items():
        predicted_labels[np.isin(assignments, value)] = key

    return predicted_labels

In [50]:
# The fitted model is bound to `Kmeans` (there is no `kmeans` in this notebook),
# and the per-sample cluster ids returned by fit are in `pred`.
Kmeans.labels_ = pred  # infer_cluster_labels reads the assignments from `.labels_`
cluster_labels = infer_cluster_labels(Kmeans, Y)
X_clusters = pred  # cluster id of every training sample
predicted_labels = infer_data_labels(X_clusters, cluster_labels)
print(predicted_labels[:20])
print(Y[:20])

[0 4 3 1 7 2 1 0 1 1 7 1 7 8 1 1 2 0 8 1]
[5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9]
