# Clustering
<br> __Cluster using K-Means.__
<br> A small set of hand-picked sample points is used to try out K-Means.

In [None]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.cluster import KMeans


In [None]:
# Six hand-picked 2-D sample points, kept as parallel lists of x and y coordinates.
x = [1, 5, 1.5, 8, 1, 9]
y = [2, 8, 1.8, 8, 0.6, 11]

# Scatter-plot the raw points before any clustering is applied.
plt.scatter(x=x, y=y)
plt.show()

In [None]:
# Pair up the coordinates: one [x, y] row per sample point.
X = [list(point) for point in zip(x, y)]

print(X)

In [None]:
# K-Means with 2 clusters, initialise the class.
# n_init is passed explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default, and so this cell agrees with the
# digits cells further down, which also use n_init=10.
kmeans = KMeans(n_clusters=2, n_init=10)

# Perform clustering on the data.
kmeans.fit(X)

# Cluster centres (one row per cluster) and the cluster index assigned to each point.
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(f"Centroids: {centroids}")
print(f"Labels: {labels}")

In [None]:
# Let's visualize the clusters: every point is drawn with the marker style of
# the cluster it was assigned to (colors is indexed by the cluster label).
colors = ['g.', 'r.', 'c.', 'y.']
for point, label in zip(X, labels):
    print(f"coordinate:{point}, label:{label}")
    plt.plot(point[0], point[1], colors[label], markersize=10)

# Visualize the centroids as large crosses; zorder=10 draws them above the points.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=150, linewidths=5, zorder=10)

# Render the figure explicitly, as the first plotting cell does.
plt.show()

## Clustering on digits data.
<br> 1. Load digits data
<br> 2. Number of clusters is 10. Targets are between 0 and 9. 
<br> 3. Verify the clustering by comparing the cluster assignments with the digit data targets.

In [None]:
# Write code here as per the above steps.

from sklearn import datasets, metrics
from sklearn.preprocessing import scale

# Fetch the digits dataset as a (features, targets) pair.
X_digits, y_digits = datasets.load_digits(return_X_y=True)

# Standardise the features with sklearn.preprocessing.scale before clustering.
data = scale(X_digits)

# The true digit targets act as the reference labels for the clustering metrics.
labels = y_digits
n_digits = np.unique(labels).size
n_samples = data.shape[0]
n_features = data.shape[1]

print(f"n_digits:{n_digits}, n_samples:{n_samples}, n_features:{n_features}")

In [None]:
# Fit K-Means with one cluster per digit class, using random initialisation
# (n_init=10).
kmeans = KMeans(n_clusters=n_digits, init='random', n_init=10).fit(data)

# Show the shape of the centroid matrix and the cluster index given to each sample.
print(f"Centroids: {kmeans.cluster_centers_.shape}")
print(f"Labels: {kmeans.labels_}, shape:{kmeans.labels_.shape}")

# Score the cluster assignments against the true digit targets.
print(f"Adjusted rand index: {metrics.adjusted_rand_score(labels, kmeans.labels_)}")
print(f"Adjusted mutual info: {metrics.adjusted_mutual_info_score(labels, kmeans.labels_)}")

In [None]:
# Fit K-Means with one cluster per digit class, using k-means++ initialisation
# (n_init=10).
kmeans = KMeans(n_clusters=n_digits, init='k-means++', n_init=10).fit(data)

# Show the shape of the centroid matrix and the cluster index given to each sample.
print(f"Centroids: {kmeans.cluster_centers_.shape}")
print(f"Labels: {kmeans.labels_}, shape:{kmeans.labels_.shape}")

# Score the cluster assignments against the true digit targets.
print(f"Adjusted rand index: {metrics.adjusted_rand_score(labels, kmeans.labels_)}")
print(f"Adjusted mutual info: {metrics.adjusted_mutual_info_score(labels, kmeans.labels_)}")

# Reduce Dimensionality
# PCA (Principal Component Analysis)

In [None]:
from sklearn.decomposition import PCA

# Reduce the scaled digits data to 2 components with PCA.
reduced_data = PCA(n_components=2).fit_transform(data)

# Cluster the reduced data with the same K-Means settings as the k-means++ cell.
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Report both scores, as the earlier digits cells do, so this 2-component run
# can be compared with them and with runs that use more components.
print(f"Adjusted rand index: {metrics.adjusted_rand_score(labels, kmeans.labels_)}")
print(f"Adjusted mutual info: {metrics.adjusted_mutual_info_score(labels, kmeans.labels_)}")

In [None]:
# Write code to increase the number of PCA components and observe how ARI and AMI change.

