In [None]:
# Code taken from https://realpython.com/k-means-clustering-python/#how-to-perform-k-means-clustering-in-python
# Code taken from https://www.geeksforgeeks.org/k-means-clustering-on-the-handwritten-digits-data-using-scikit-learn-in-python/
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import \
    silhouette_score, adjusted_rand_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics
import numpy as np
import pandas as pd

# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = 'digit-recognizer/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Split the frame into the target column and everything else.
is_label_column = df.columns == 'label'
true_labels = df.loc[:, 'label']
features = df.loc[:, ~is_label_column]

In [None]:
# Standardise every feature column before clustering: learn the per-column
# statistics first, then apply them to the same data.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

In [None]:
# K-means settings: 10 clusters (presumably one per digit class), k-means++
# seeding, 50 restarts of up to 500 iterations each, and a fixed seed so the
# run is repeatable.
kmeans_params = {
    "init": "k-means++",
    "n_clusters": 10,
    "n_init": 50,
    "max_iter": 500,
    "random_state": 42,
}
kmeans = KMeans(**kmeans_params)
kmeans.fit(scaled_features)

In [None]:
# Cluster index assigned to every sample by the fitted model.
predicted_labels = kmeans.labels_

# Diagnostics of the fitted model itself.
print(f"Inertia: {kmeans.inertia_}")
print(f"Cluster Centres: {kmeans.cluster_centers_}")
print(f"# Iterations: {kmeans.n_iter_}")

# Silhouette coefficient (uses only the features and the cluster assignment).
# It ranges from -1 to 1: values near 1 mean well-separated clusters, values
# near 0 mean overlapping clusters, negative values suggest misassigned samples.
silhouette = silhouette_score(scaled_features, predicted_labels)
print(f"Silhouette Score: {silhouette}")

# Adjusted Rand index (compares the clustering with the true labels).
# It is close to 0 for random assignments (and can be slightly negative), and
# equals 1 when both labelings describe identical groupings.
ari = adjusted_rand_score(true_labels, predicted_labels)
print(f"Adjusted Rand Score: {ari}")

In [None]:
# Code taken from https://medium.com/@joel_34096/k-means-clustering-for-image-classification-a648f28bdc47
def retrieve_info(cluster_labels,y_train):
    """Map each cluster id to the most frequent true label among its members.

    Args:
        cluster_labels: 1-D array-like of integer cluster ids, one per sample.
        y_train: 1-D array-like of non-negative integer true labels, aligned
            position-by-position with ``cluster_labels``.

    Returns:
        dict mapping every cluster id that occurs in ``cluster_labels`` to the
        majority true label of that cluster. Ties resolve to the smallest
        label, because ``argmax`` returns the first maximum.
    """
    # Work on plain arrays so the boolean mask is applied positionally,
    # regardless of any pandas index carried by the inputs.
    cluster_labels = np.asarray(cluster_labels)
    y_train = np.asarray(y_train)

    reference_labels = {}
    # Iterate over the ids actually present in the argument rather than over a
    # global model's labels: this keeps the function self-contained and never
    # visits an empty cluster (where bincount(...).argmax() would fail).
    for cluster in np.unique(cluster_labels):
        members = y_train[cluster_labels == cluster]
        reference_labels[int(cluster)] = np.bincount(members).argmax()
    return reference_labels

In [None]:
# Majority true label for every cluster id.
reference_labels = retrieve_info(predicted_labels, true_labels)
# Translate each sample's cluster id into that cluster's majority label.
# Building the array directly from the mapping keeps an integer dtype; no
# random float placeholder array is needed.
number_labels = np.array([reference_labels[cluster] for cluster in predicted_labels])
print(reference_labels)

In [None]:
# Count how often each true label was given each cluster-derived label,
# then draw the counts as a confusion-matrix plot.
confusion_matrix = metrics.confusion_matrix(y_true=true_labels, y_pred=number_labels)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
cm_display.plot()
plt.show()

In [None]:
# Side-by-side preview of the first 20 true and cluster-derived labels.
first_true = true_labels[:20].tolist()
first_predicted = number_labels[:20].astype(int).tolist()
print(f"First 20 true labels: \t\t {first_true}")
print(f"First 20 predicted labels: \t {first_predicted}")

In [None]:
# Macro averaging is the unweighted mean of the per-class scores, so it does
# not take label imbalance into account.
# Micro-averaged precision/recall/F1 equal the accuracy score for single-label
# multiclass data; weighted recall does too, but weighted precision and F1
# generally differ from accuracy.
# The ':.2%' format scales by 100 and fixes two decimals, avoiding float
# artefacts such as 56.779999999999994 that round(x, 4) * 100 can produce.
print(f"Accuracy: {accuracy_score(true_labels, number_labels):.2%}")
print(f"Precision: {precision_score(true_labels, number_labels, average = 'macro'):.2%}")
print(f"Recall: {recall_score(true_labels, number_labels, average = 'macro'):.2%}")
print(f"F1 Score: {f1_score(true_labels, number_labels, average = 'macro'):.2%}")

## Creating the Clustering Visualization

In [None]:
# Project the scaled features onto their first two principal components so
# the clusters can be drawn in a plane. The seed only matters when PCA picks
# its randomized SVD solver; fixing it keeps the projection repeatable.
pca = PCA(2, random_state=0)
reduced_data = pca.fit_transform(scaled_features)

# Cluster in the 2-D projection. A single fit_predict call yields both the
# labels and the centroids from the same fit (no redundant second fit).
kmeans_cluster = KMeans(init="k-means++", n_clusters=10, n_init=10, random_state=0)
label = kmeans_cluster.fit_predict(reduced_data)
centroids = kmeans_cluster.cluster_centers_
unique_labels = np.unique(label)

# Plotting the clusters: one scatter series per cluster, centroids as black crosses.
plt.figure(figsize=(8, 8))
for i in unique_labels:
    in_cluster = label == i
    plt.scatter(reduced_data[in_cluster, 0], reduced_data[in_cluster, 1], label=i)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='k', zorder=10)
plt.legend()
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.title("Clustering by Principal Component Analysis")
plt.show()