# Clustering with K-Means

In [None]:
import pandas as pd

First we make our fictional dataset.

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Synthetic 2-D dataset: 300 points drawn around 4 centers (seeded, so repeatable).
x, y = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)

In [None]:
# Peek at the first ten generated points.
x[:10]

In [None]:
# Integer labels returned by make_blobs alongside the points.
y

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Colour palette indexed by cluster label (ten entries, so at most ten clusters).
ourcolors = [
    'red',
    'blue',
    'black',
    'green',
    'yellow',
    'magenta',
    'orange',
    'brown',
    'grey',
    'aqua',
]

In [None]:
# Scatter the points, coloured by the label make_blobs returned for each one.
plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[label] for label in y])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# The same points without colours: this is all the clustering algorithm gets to see.
plt.scatter(x[:, 0], x[:, 1])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
from sklearn.cluster import KMeans

We create an object for our model by calling `KMeans` with the number of clusters we want to look for.

In [None]:
# Four clusters, ten random initialisations. random_state is fixed so the
# cluster numbering (and therefore the plot colours) is the same on every run.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)

We then call the `fit` method, and pass in the data in which we want to search for clusters.

In [None]:
# Fit the model to the points; no labels are passed.
kmeans.fit(x)

In [None]:
# Indexing with a list ([0]) keeps the result 2-D: one row rather than a flat vector.
x[[0]]

In [None]:
# Ask the fitted model which cluster the first point belongs to.
kmeans.predict(x[[0]])

In [None]:
# Colour each point by the cluster label KMeans assigned to it.
plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[label] for label in kmeans.labels_])
# Axis labels, matching the other scatter plots in this notebook.
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Cluster index assigned to each training point by the fit.
kmeans.labels_

In [None]:
# Coordinates of the fitted cluster centers.
kmeans.cluster_centers_

In [None]:
import ipywidgets

In [None]:
def plotblobs(n):
    """Fit KMeans with ``n`` clusters on ``x`` and scatter the points coloured by cluster.

    Parameters
    ----------
    n : int
        Number of clusters to fit. Each cluster label indexes into
        ``ourcolors``, so ``n`` must not exceed ``len(ourcolors)``.
    """
    # Fixed random_state so a given slider value always produces the same colouring.
    model = KMeans(n_clusters=n, n_init=10, random_state=0)
    model.fit(x)
    plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[label] for label in model.labels_])
    plt.xlabel('x0')
    plt.ylabel('x1')

# Slider over n = 1..10 clusters.
ipywidgets.interact(plotblobs,n=(1,10));

# Basic approach

In [None]:
import numpy as np

In [None]:
# Seed NumPy's global RNG so the random starting centers drawn below are repeatable.
np.random.seed(0)
# Number of clusters for the hand-rolled k-means.
nclusters = 4

In [None]:
# Draw one random integer starting position per cluster.
# The x0 and x1 draws stay interleaved so the RNG sequence is consumed in the same order.
centers_x0, centers_x1 = [], []
for center_idx in range(nclusters):
    centers_x0.append(np.random.randint(-3, 4))   # x0 in [-3, 3]
    centers_x1.append(np.random.randint(-1, 10))  # x1 in [-1, 9]

In [None]:
# x0 coordinates of the starting centers.
centers_x0

In [None]:
# Raw points with the starting centers overlaid as yellow dots.
plt.scatter(x[:, 0], x[:, 1])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
def d(a,b):
    """Return the Euclidean length of the vector ``(a, b)``.

    ``a`` and ``b`` are the coordinate differences along each axis; scalars
    and NumPy arrays (element-wise) are both accepted.

    ``np.hypot`` is used instead of ``sqrt(a**2 + b**2)`` so that very large
    or very small inputs do not overflow or underflow in the intermediate squares.
    """
    return np.hypot(a, b)

In [None]:
# Index of the center closest to the first point.
np.argmin([
    d(x[0, 0] - c0, x[0, 1] - c1)
    for c0, c1 in zip(centers_x0, centers_x1)
])

In [None]:
# (number of points, number of features)
x.shape

In [None]:
# For every point, record the index of its nearest center.
cluster = [
    np.argmin([d(p0 - c0, p1 - c1) for c0, c1 in zip(centers_x0, centers_x1)])
    for p0, p1 in x
]

In [None]:
# Points coloured by their initial nearest-center assignment; centers shown as yellow dots.
plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[label] for label in cluster])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Collect coordinates and current cluster assignment in one table.
df = pd.DataFrame({
    'x0': x[:, 0],
    'x1': x[:, 1],
    'cluster': cluster,
})

In [None]:
# Display the table of points and their assigned cluster.
df

In [None]:
# Update step: move each center to the mean of the points currently assigned to it.
for i in range(nclusters):
    members = df['cluster'] == i
    # Skip clusters with no members: the mean of an empty selection is NaN, and
    # np.argmin would then select that NaN distance for every point in the next
    # assignment step, collapsing all points into this cluster.
    if members.any():
        centers_x0[i] = df.loc[members, 'x0'].mean()
        centers_x1[i] = df.loc[members, 'x1'].mean()

In [None]:
# Current assignments with the freshly updated centers overlaid (yellow dots).
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Assignment step: give every row the index of its nearest center.
# The comprehension iterates with `k` so it does not reuse the row index name `i`.
for i, row in df.iterrows():
    df.loc[i, 'cluster'] = np.argmin([
        d(row['x0'] - centers_x0[k], row['x1'] - centers_x1[k])
        for k in range(nclusters)
    ])

In [None]:
# Points recoloured after re-assignment; centers (yellow dots) unchanged since the last update.
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Update step: move each center to the mean of the points currently assigned to it.
for i in range(nclusters):
    members = df['cluster'] == i
    # Skip clusters with no members: the mean of an empty selection is NaN, and
    # np.argmin would then select that NaN distance for every point in the next
    # assignment step, collapsing all points into this cluster.
    if members.any():
        centers_x0[i] = df.loc[members, 'x0'].mean()
        centers_x1[i] = df.loc[members, 'x1'].mean()

In [None]:
# Current assignments with the freshly updated centers overlaid (yellow dots).
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Assignment step: give every row the index of its nearest center.
# The comprehension iterates with `k` so it does not reuse the row index name `i`.
for i, row in df.iterrows():
    df.loc[i, 'cluster'] = np.argmin([
        d(row['x0'] - centers_x0[k], row['x1'] - centers_x1[k])
        for k in range(nclusters)
    ])

In [None]:
# Points recoloured after re-assignment; centers (yellow dots) unchanged since the last update.
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Update step: move each center to the mean of the points currently assigned to it.
for i in range(nclusters):
    members = df['cluster'] == i
    # Skip clusters with no members: the mean of an empty selection is NaN, and
    # np.argmin would then select that NaN distance for every point in the next
    # assignment step, collapsing all points into this cluster.
    if members.any():
        centers_x0[i] = df.loc[members, 'x0'].mean()
        centers_x1[i] = df.loc[members, 'x1'].mean()

In [None]:
# Current assignments with the freshly updated centers overlaid (yellow dots).
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Assignment step: give every row the index of its nearest center.
# The comprehension iterates with `k` so it does not reuse the row index name `i`.
for i, row in df.iterrows():
    df.loc[i, 'cluster'] = np.argmin([
        d(row['x0'] - centers_x0[k], row['x1'] - centers_x1[k])
        for k in range(nclusters)
    ])

In [None]:
# Points recoloured after re-assignment; centers (yellow dots) unchanged since the last update.
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Update step: move each center to the mean of the points currently assigned to it.
for i in range(nclusters):
    members = df['cluster'] == i
    # Skip clusters with no members: the mean of an empty selection is NaN, and
    # np.argmin would then select that NaN distance for every point in the next
    # assignment step, collapsing all points into this cluster.
    if members.any():
        centers_x0[i] = df.loc[members, 'x0'].mean()
        centers_x1[i] = df.loc[members, 'x1'].mean()

In [None]:
# Current assignments with the freshly updated centers overlaid (yellow dots).
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Assignment step: give every row the index of its nearest center.
# The comprehension iterates with `k` so it does not reuse the row index name `i`.
for i, row in df.iterrows():
    df.loc[i, 'cluster'] = np.argmin([
        d(row['x0'] - centers_x0[k], row['x1'] - centers_x1[k])
        for k in range(nclusters)
    ])

In [None]:
# Points recoloured after re-assignment; centers (yellow dots) unchanged since the last update.
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Update step: move each center to the mean of the points currently assigned to it.
for i in range(nclusters):
    members = df['cluster'] == i
    # Skip clusters with no members: the mean of an empty selection is NaN, and
    # np.argmin would then select that NaN distance for every point in the next
    # assignment step, collapsing all points into this cluster.
    if members.any():
        centers_x0[i] = df.loc[members, 'x0'].mean()
        centers_x1[i] = df.loc[members, 'x1'].mean()

In [None]:
# Current assignments with the freshly updated centers overlaid (yellow dots).
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Assignment step: give every row the index of its nearest center.
# The comprehension iterates with `k` so it does not reuse the row index name `i`.
for i, row in df.iterrows():
    df.loc[i, 'cluster'] = np.argmin([
        d(row['x0'] - centers_x0[k], row['x1'] - centers_x1[k])
        for k in range(nclusters)
    ])

In [None]:
# Points recoloured after re-assignment; centers (yellow dots) unchanged since the last update.
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Update step: move each center to the mean of the points currently assigned to it.
for i in range(nclusters):
    members = df['cluster'] == i
    # Skip clusters with no members: the mean of an empty selection is NaN, and
    # np.argmin would then select that NaN distance for every point in the next
    # assignment step, collapsing all points into this cluster.
    if members.any():
        centers_x0[i] = df.loc[members, 'x0'].mean()
        centers_x1[i] = df.loc[members, 'x1'].mean()

In [None]:
# Current assignments with the freshly updated centers overlaid (yellow dots).
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Assignment step: give every row the index of its nearest center.
# The comprehension iterates with `k` so it does not reuse the row index name `i`.
for i, row in df.iterrows():
    df.loc[i, 'cluster'] = np.argmin([
        d(row['x0'] - centers_x0[k], row['x1'] - centers_x1[k])
        for k in range(nclusters)
    ])

In [None]:
# Points recoloured after re-assignment; centers (yellow dots) unchanged since the last update.
plt.scatter(df['x0'], df['x1'], color=[ourcolors[label] for label in df['cluster']])
plt.plot(centers_x0, centers_x1, 'yo', markersize=10)
plt.xlabel('x0')
plt.ylabel('x1')

# Ascertaining clusters

In [None]:
# Regenerate the same seeded 4-blob dataset for this section.
x, y = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)

In [None]:
# Uncoloured view of the points to be clustered.
plt.scatter(x[:, 0], x[:, 1])
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# random_state is fixed so the labels, colours and inertia shown below are repeatable.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)

In [None]:
# Fit the model to the points; no labels are passed.
kmeans.fit(x)

In [None]:
# Colour each point by the cluster label KMeans assigned to it.
plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[label] for label in kmeans.labels_])
# Axis labels, matching the other scatter plots in this notebook.
plt.xlabel('x0')
plt.ylabel('x1')

In [None]:
# Cluster index assigned to each training point by the fit.
kmeans.labels_

In [None]:
# Coordinates of the fitted cluster centers.
kmeans.cluster_centers_

In [None]:
# NOTE: this re-defines the `plotblobs` helper already defined earlier in the
# notebook; the later definition is the one in effect from here on.
def plotblobs(n):
    """Fit KMeans with ``n`` clusters on ``x`` and scatter the points coloured by cluster.

    Parameters
    ----------
    n : int
        Number of clusters to fit. Each cluster label indexes into
        ``ourcolors``, so ``n`` must not exceed ``len(ourcolors)``.
    """
    # Fixed random_state so a given slider value always produces the same colouring.
    model = KMeans(n_clusters=n, n_init=10, random_state=0)
    model.fit(x)
    plt.scatter(x[:, 0], x[:, 1], color=[ourcolors[label] for label in model.labels_])
    plt.xlabel('x0')
    plt.ylabel('x1')

# Slider over n = 1..10 clusters.
ipywidgets.interact(plotblobs,n=(1,10));

There is no direct way to measure the accuracy of a clustering.  This is unsupervised learning, so there are no true labels (test values) against which we can compute the usual metrics.  Instead, we rely on measures of how compact and well separated the clusters are.

Inertia is one metric that is used to evaluate clustering.  Inertia is the sum of the squared distances between each training instance and the centroid of the cluster to which it is assigned.

In [None]:
# Inertia of the most recent fit.
kmeans.inertia_

For k-means clustering, the `score` method returns the negative of the inertia.  The sign is flipped because scikit-learn's convention is that a higher score is better, whereas a lower inertia is better.

In [None]:
# Score of the fitted model on the training points (compare with inertia_ above).
kmeans.score(x)

In [None]:
nclusters = []
inertia_scores = []
for i in range(1,15):
    nclusters.append(i)
    inertia_scores.append(KMeans(n_clusters=i, n_init=10).fit(x).inertia_)

In [None]:
plt.plot(nclusters, inertia_scores, 'ko')

Another approach is to look at the silhouette score.  For any given point, the silhouette coefficient equals $(b-a)/\max(a,b)$, where $a$ is the average distance to the other points in the same cluster and $b$ is the average distance to the points in the next closest cluster.  A value near +1 means the point is well within its own cluster, a value near 0 means it lies close to the boundary between two clusters, and a value near -1 means the point likely belongs in the next closest cluster.

The silhouette score is the average silhouette coefficient over all points.

In [None]:
from sklearn.metrics import silhouette_score

For just one training, we pass in the points and labels.

In [None]:
# Silhouette score for a single 4-cluster fit.
# random_state is fixed so the reported score is repeatable.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
kmeans.fit(x)
silhouette_score(x, kmeans.labels_)

We can again look at how this varies when identifying different numbers of clusters.

In [None]:
nclusters = []
silhouette_scores = []
# Note: doesn't work for just 1 cluster because then there isn't a next-closest cluster
for i in range(2,15):
    nclusters.append(i)
    silhouette_scores.append(silhouette_score(x, KMeans(n_clusters=i, n_init=10).fit(x).labels_))

In [None]:
plt.plot(nclusters, silhouette_scores, 'ko')

# Using KMeans Clustering for Preprocessing

Acknowledgements to our course text by A. Géron.

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the digits dataset directly as a (features, targets) pair.
X_digits, y_digits = load_digits(return_X_y=True)

In [None]:
# Hold out a test set (default split proportions); seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_digits,
    y_digits,
    random_state=42,
)

We'll use Logistic Regression to do multi-class classification:
* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Baseline: one-vs-rest logistic regression trained directly on the raw features.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version.
log_reg = LogisticRegression(
    multi_class="ovr",
    solver="lbfgs",
    max_iter=5000,
    random_state=42,
)
log_reg.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline classifier on the held-out test set.
log_reg_score = log_reg.score(X_test, y_test)
log_reg_score

In [None]:
# Feature vector of the second sample in the dataset.
X_digits[1]

Now comes a nifty little preprocessing step: identify 50 clusters in this image data, and replace the 8x8 feature space (64 features) with a 50-dimensional feature space in which each image is described by its distances to the 50 cluster centroids.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# KMeans (50 clusters) as a preprocessing step feeding the same logistic regression.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version.
pipeline = Pipeline(steps=[
    (
        "kmeans",
        KMeans(n_clusters=50, random_state=42, n_init=10),
    ),
    (
        "log_reg",
        LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42),
    ),
])
pipeline.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the KMeans + logistic regression pipeline.
pipeline_score = pipeline.score(X_test, y_test)
pipeline_score

We could stop and try to use inertia or silhouette score to identify an ideal number of clusters, but no!  Just use grid search cross-validation to identify the number of clusters (our hyperparameter) that performs the best.

This will take a while and I'm not going to run it now.  I invite you to try on your own later.  For now I'll just show another value that gives good results.

In [None]:
# Same pipeline with 53 clusters instead of 50 (rebinds `pipeline`).
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version.
pipeline = Pipeline(steps=[
    (
        "kmeans",
        KMeans(n_clusters=53, random_state=42, n_init=10),
    ),
    (
        "log_reg",
        LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=5000, random_state=42),
    ),
])
pipeline.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the 53-cluster pipeline (overwrites the previous pipeline_score).
pipeline_score = pipeline.score(X_test, y_test)
pipeline_score