# KMeans Clustering

KMeans is an unsupervised machine learning algorithm.  Let's assume the Iris data is not categorized by species, so we are left to identify clusters without any guidance from labels.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import datasets
import ipywidgets
import plotly.express as px
import pandas as pd

In [None]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# X: columns 2 onward of the data matrix (the features we cluster on).
# y: the known targets, used below only as the hue of the reference plot.
X, y = iris.data[:, 2:], iris.target

In [None]:
# Reference plot: the two selected features, coloured by the known targets.
# The trailing semicolon keeps the notebook from echoing the return value.
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=y,
    palette='rainbow',
);

We create an object for our model by calling `KMeans` with the number of clusters we want to look for.

In [None]:
# Configure (but do not yet fit) a KMeans estimator that looks for 3 clusters.
kmeans = KMeans(n_clusters=3)

We then call the `fit` method and pass in the data in which we want to search for clusters.

In [None]:
# Run the clustering on X; the results are stored on the `kmeans` object
# (for example the `labels_` attribute read in the next cell).
kmeans.fit(X)

In [None]:
# Display the cluster label the fit assigned to each row of X.
kmeans.labels_

In [None]:
# Same axes as the reference plot, but coloured by the cluster labels that
# KMeans found instead of the known targets.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=kmeans.labels_, palette='rainbow');

It gets a little tedious to repeat the process multiple times for different numbers of clusters.  We can make the process more interactive with the ipywidgets library.

In [None]:
def kmeans_iris(n_clusters=3):
    """Cluster the global ``X`` into ``n_clusters`` groups and plot the result.

    Columns 0 and 1 of ``X`` are drawn as a scatter plot, with every point
    coloured by the cluster label assigned to it.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = KMeans(n_clusters).fit(X)
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=model.labels_, palette='rainbow')


# Integer slider from 1 to 20; each change re-runs kmeans_iris with the new value.
ipywidgets.interact(kmeans_iris, n_clusters=(1, 20));

The Iris data has four measured features, and so far we have used only two of them.  Let's add a third and go up to 3D.

In [None]:
# Rebind X to columns 1 onward of the data matrix so there are three features
# available, one per plot axis.
X = iris.data[:, 1:]


def kmeans_iris(n_clusters=3):
    """Cluster the global ``X`` into ``n_clusters`` groups and show them in 3D.

    Columns 0, 1 and 2 of ``X`` become the x, y and z axes; the colour of each
    point is its assigned cluster label.
    """
    model = KMeans(n_clusters).fit(X)
    px.scatter_3d(
        x=X[:, 0],
        y=X[:, 1],
        z=X[:, 2],
        color=model.labels_,
    ).show()


# Integer slider from 1 to 20; each change re-runs kmeans_iris with the new value.
ipywidgets.interact(kmeans_iris, n_clusters=(1, 20));

It would actually be a little more convenient to see the axis labels, and more convenient to work with this by passing in a pandas dataframe.

In [None]:
# Display the names of the measured features; they are used as dataframe
# column labels in the next cell.
iris.feature_names

In [None]:
# Build the frame from every row of feature columns 1 onward, labelled with the
# matching feature names.  Slicing the data matrix directly keeps all samples
# (row-slicing the already column-sliced X with X[1:] would drop the first one)
# and does not depend on which earlier cell last rebound the global X.
irisdf = pd.DataFrame(iris.data[:, 1:], columns=iris.feature_names[1:])

In [None]:
def kmeans_iris(n_clusters=3):
    """Cluster ``irisdf`` into ``n_clusters`` groups and show them in 3D.

    Passing the dataframe to plotly lets the axes be chosen by column name,
    so the plot carries readable axis labels.  Points are coloured by their
    assigned cluster label.
    """
    model = KMeans(n_clusters).fit(irisdf)
    px.scatter_3d(
        irisdf,
        x='sepal width (cm)',
        y='petal length (cm)',
        z='petal width (cm)',
        color=model.labels_,
    ).show()


# Integer slider from 1 to 20; each change re-runs kmeans_iris with the new value.
ipywidgets.interact(kmeans_iris, n_clusters=(1, 20));

A fitted KMeans object exposes various attributes for us to inspect, including `cluster_centers_`, the location of each cluster center in the feature space the model was fit on.

In [None]:
# Display the cluster centers: one row per cluster, one column per feature the
# model was fit on.
# NOTE: the models created inside the kmeans_iris functions are local to those
# functions, so when the notebook is run top to bottom this module-level
# `kmeans` is still the one fitted earlier on the two-column X.
kmeans.cluster_centers_

In [None]:
# Column 0 of the centers array: the first coordinate of every cluster center.
kmeans.cluster_centers_[:,0]

We can append these to the dataframe and include them in our plot.

In [None]:
# Display the dataframe as it stands before any cluster centers are added.
irisdf

In [None]:
# Stack the cluster centers underneath the samples, renumbering the index.
# - DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
#   to add rows.
# - The centers must have one coordinate per irisdf column, so they are taken
#   from a model fitted on irisdf itself.  The module-level `kmeans` was fitted
#   on a two-column array, and its centers cannot take irisdf's column labels.
pd.concat(
    [
        irisdf,
        pd.DataFrame(
            KMeans(n_clusters=3).fit(irisdf).cluster_centers_,
            columns=irisdf.columns,
        ),
    ],
    ignore_index=True,
)

In [None]:
def kmeans_iris(n_clusters=3):
    """Cluster ``irisdf`` and plot the samples together with the cluster centers.

    The centers are appended to the samples as extra rows so that a single
    3D scatter plot shows both.  Samples are coloured by cluster label; all
    centers share one additional colour value and are drawn larger.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to look for.
    """
    # n_init=50 asks scikit-learn for 50 initialisations instead of the default.
    kmeans = KMeans(n_clusters, n_init=50)
    kmeans.fit(irisdf)
    labels = kmeans.labels_

    centers = pd.DataFrame(kmeans.cluster_centers_, columns=irisdf.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the center rows below the samples.
    samples_and_centers = pd.concat([irisdf, centers], ignore_index=True)

    # One colour value above every sample label, shared by all centers.
    center_color = max(labels) + 1
    # Samples are sized label + 10; centers get three times the largest sample
    # size so they stand out.
    sample_sizes = labels + 10
    center_size = max(sample_sizes) * 3

    fig = px.scatter_3d(samples_and_centers,
                        x='sepal width (cm)',
                        y='petal length (cm)',
                        z='petal width (cm)',
                        color=list(labels) + [center_color] * n_clusters,
                        size=list(sample_sizes) + [center_size] * n_clusters,
                        color_continuous_scale='jet')
    fig.show()


# Integer slider from 1 to 10; each change re-runs kmeans_iris with the new value.
ipywidgets.interact(kmeans_iris, n_clusters=(1, 10));