# K-Means Clustering and Choosing K
## (and some other challenges)

 

_Again, this is all available at [ajbc.io/MLintro](http://ajbc.io/MLintro)._

In [None]:
# preliminaries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from plotnine import *
%matplotlib inline

Let's generate some simulated data!
(Based on [this demo](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py).)

In [None]:
# Simulate 1000 two-dimensional observations around 6 blob centers; the fixed
# random_state makes the draw identical on every run.
obs, true_clusters = make_blobs(centers=6, n_samples=1000, random_state=42)

# Tabulate the coordinates alongside the label make_blobs returned for each point.
data = pd.DataFrame(obs, columns=['D1', 'D2']).assign(true_cluster=true_clusters)

print(data)

In [None]:
# Scatter plot of the observations, coloured by their true_cluster label
# (wrapped in factor() so the integer labels get a discrete colour scale).
ggplot(data, aes(x='D1', y='D2', color='factor(true_cluster)')) + geom_point()

In [None]:
# Fit K-means with K=3 and store the cluster index assigned to each observation.
kmeans = KMeans(n_clusters=3, random_state=0)
data['learned_cluster'] = kmeans.fit_predict(obs)

# Scatter plot of the observations, coloured by the learned assignment.
ggplot(data, aes(x='D1', y='D2', color='factor(learned_cluster)')) + geom_point()

In [None]:
# How do we choose "K"?  One approach: the "elbow" method.
# Fit K-means once for each K in 1..10 and log the model's inertia_, i.e. the
# sum of squared distances from every observation to its closest centroid (SSE).
sse_log = list()
for k in range(1, 11):
    print("Fitting K =", k)
    model = KMeans(n_clusters=k, random_state=0).fit(obs)
    # Label corrected from "SSS": the value printed is the sum of squared errors.
    print("\tSSE =", model.inertia_)
    sse_log.append(model.inertia_)

In [None]:
# Tabulate the logged SSE values against K. K is derived from the length of
# sse_log (assumes one entry per K, starting at K=1) instead of repeating the
# fitting loop's bounds, so the two cannot drift apart and trigger a
# length-mismatch error here.
sse_data = pd.DataFrame({"K": range(1, len(sse_log) + 1), "SSE": sse_log})

# Elbow plot: look for the K after which the SSE stops dropping sharply.
(ggplot(sse_data)
 + aes(x='K', y='SSE')
 + geom_point() + geom_line()
 + ylab("sum of squared errors")
)

In [None]:
# Refit K-means, this time with K=4, and overwrite the stored assignments.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(obs)
data['learned_cluster'] = kmeans.labels_

# Scatter plot of the observations, coloured by the K=4 assignment.
ggplot(data, aes(x='D1', y='D2', color='factor(learned_cluster)')) + geom_point()

## Choosing K for other models
- Likelihood
- Bayesian information criterion (BIC)
- Perplexity
- ...

### More resources
- [Gaussian Mixture Model clustering: how to select the number of components (clusters)](https://towardsdatascience.com/gaussian-mixture-model-clusterization-how-to-select-the-number-of-components-clusters-553bef45f6e4)
- [Perplexity To Evaluate Topic Models](http://qpleple.com/perplexity-to-evaluate-topic-models/)

## What other issues might we have?

In [None]:
# Anisotropically distributed data: apply a fixed linear transformation to the
# existing observations, then cluster the transformed points with K=4.
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
obs2 = obs.dot(transformation)

data = pd.DataFrame(obs2, columns=['D1', 'D2']).assign(
    learned_cluster=KMeans(n_clusters=4, random_state=42).fit_predict(obs2)
)

ggplot(data, aes(x='D1', y='D2', color='factor(learned_cluster)')) + geom_point()

In [None]:
# Different variance: three blobs generated with standard deviations 1.0, 2.5
# and 0.5 respectively.
obs, true_clusters = make_blobs(n_samples=1000, centers=3,
                                cluster_std=[1.0, 2.5, 0.5], random_state=170)
data = pd.DataFrame(obs, columns=['D1', 'D2'])
# random_state is pinned so the centroid initialisation, and therefore the
# plotted assignment, is the same on every run (as in the earlier cells).
data['learned_cluster'] = KMeans(n_clusters=3, random_state=0).fit(obs).labels_

(ggplot(data) + aes(x='D1', y='D2', color='factor(learned_cluster)') + geom_point())

In [None]:
# Dimensions have very different ranges: generate four blobs, then multiply the
# second coordinate by 10 so it spans a much wider range than the first.
obs, true_clusters = make_blobs(n_samples=1000, centers=4, random_state=42)
obs[:,1] *= 10
data = pd.DataFrame(obs, columns=['D1', 'D2'])
# random_state is pinned so the centroid initialisation, and therefore the
# plotted assignment, is the same on every run (as in the earlier cells).
data['learned_cluster'] = KMeans(n_clusters=4, random_state=0).fit(obs).labels_

(ggplot(data) + aes(x='D1', y='D2', color='factor(learned_cluster)') + geom_point())

In [None]:
# There may not actually be clusters!
# Draw 1000 points uniformly from [0, 1) x [0, 1). A seeded Generator replaces
# the unseeded global np.random state so the same points come out on every run.
obs = np.random.default_rng(0).random((1000, 2))
data = pd.DataFrame(obs, columns=['D1', 'D2'])
# K-means still returns 3 "clusters" here; random_state is pinned so the
# partition shown in the plot is reproducible.
data['learned_cluster'] = KMeans(n_clusters=3, random_state=0).fit(obs).labels_

(ggplot(data) + aes(x='D1', y='D2', color='factor(learned_cluster)') + geom_point())