# Implementing K-Means Clustering

In [None]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans # for comparison

Make some simulated data:

In [None]:
# Simulate five uniform square blobs laid out along the diagonal
def MakeUnifData(N):
    """Draw N uniform points in each of five shifted unit squares.

    Each blob is np.random.rand(N, 2) moved by the same shift in both
    coordinates; the shifts are 0, 1, -1, 2, -2 (in that order).

    Returns an array with 5*N rows and 2 columns, blobs stacked vertically.
    """
    shifts = (0, 1, -1, 2, -2)
    blobs = [np.random.rand(N, 2) + shift for shift in shifts]
    return np.vstack(blobs)

# Sample 20 points per blob and take a first look at the raw data.
X = MakeUnifData(N=20)
plt.scatter(X[:, 0], X[:, 1])

Let's implement K-means, which has the following steps:

1. Pick some starting points
2. Iterate until the cluster assignments stop changing:

    a. Assign points to nearest cluster centroid  
    b. Calculate centroids of each cluster

In [None]:
def Init(X, k):
    """Pick k distinct rows of X uniformly at random as starting centers.

    Sampling is without replacement, so np.random.choice raises ValueError
    when k exceeds the number of rows in X.
    """
    n_samples = X.shape[0]
    picked_rows = np.random.choice(n_samples, size=k, replace=False)
    return X[picked_rows]


def KmeansPlusPlusInit(X, k):
    """Choose k starting centers from the rows of X with k-means++ seeding.

    The first center is a uniformly random row. Every further center is a
    row drawn with probability proportional to its squared Euclidean
    distance to the nearest center chosen so far, which spreads the
    starting centers out.

    Parameters
    ----------
    X : array of shape (n, m), one sample per row.
    k : number of centers to pick, 1 <= k <= n.

    Returns
    -------
    Array of shape (k, m) whose rows are rows of X (distinct row indices).

    Raises
    ------
    ValueError if k is not between 1 and the number of rows of X.
    """
    X = np.asarray(X)
    n = X.shape[0]
    if not 1 <= k <= n:
        raise ValueError("k must satisfy 1 <= k <= number of samples")

    # Distances are computed in float so integer inputs cannot overflow.
    points = X.astype(float)

    chosen = [np.random.randint(n)]
    # Squared distance of every sample to its nearest chosen center.
    nearest_sq = np.sum((points - points[chosen[0]]) ** 2, axis=1)

    for _ in range(1, k):
        total = nearest_sq.sum()
        if total > 0:
            # Already-chosen rows have distance 0 and so are never redrawn.
            nxt = np.random.choice(n, p=nearest_sq / total)
        else:
            # Every sample coincides with a chosen center (duplicate data):
            # fall back to a uniform draw among the rows not used yet.
            remaining = np.setdiff1d(np.arange(n), chosen)
            nxt = np.random.choice(remaining)
        chosen.append(int(nxt))
        nearest_sq = np.minimum(
            nearest_sq, np.sum((points - points[nxt]) ** 2, axis=1))

    return X[chosen, :]
    
    

    
def AssignClusters(X, centers):
    """Assign every sample to its nearest center (Euclidean distance).

    Parameters
    ----------
    X : array of shape (n, m), one sample per row.
    centers : array of shape (k, m), one center per row.

    Returns
    -------
    Integer array of shape (n,); entry j is the row index of the center
    closest to X[j]. Ties go to the lowest center index.

    Raises
    ------
    ValueError if centers has no rows.
    """
    X = np.asarray(X, dtype=float)
    centers = np.asarray(centers, dtype=float)
    if centers.shape[0] == 0:
        raise ValueError("at least one center is required")

    # Broadcast to (n, k, m) differences, then reduce to (n, k) squared
    # distances; the square root is skipped because it does not change
    # which center is nearest.
    diffs = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
    sq_dists = np.sum(diffs ** 2, axis=2)

    return np.argmin(sq_dists, axis=1)
    
    
    
def CalculateCenters(X, cluster_assign, k):
    """Compute the mean of the samples assigned to each of the k clusters.

    Parameters
    ----------
    X : array of shape (n, m), one sample per row.
    cluster_assign : length-n array; entry j is the cluster index of X[j].
    k : number of clusters.

    Returns
    -------
    Array of shape (k, m); row i is the mean of all samples with
    cluster_assign == i. A cluster with no samples keeps an all-zero row,
    which avoids NaN centers from averaging an empty set.
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(cluster_assign)
    m = X.shape[1]
    centers = np.zeros((k, m))

    for i in range(k):
        members = X[labels == i]
        if members.shape[0] > 0:
            centers[i] = members.mean(axis=0)

    return centers

    
    
def Criteria(X, cluster_assign):
    """Score a clustering by the fraction of total variance it explains.

    The score is 1 - within_ss / total_ss, where total_ss is the sum of
    squared distances of all samples to the overall mean and within_ss is
    the sum of squared distances of each sample to the mean of its own
    cluster. It lies in [0, 1] up to rounding; higher means tighter
    clusters.

    NOTE(review): the original was a placeholder returning 1; this
    particular criterion is a reviewer choice - confirm it is the intended
    one.

    Parameters
    ----------
    X : array of shape (n, m), one sample per row.
    cluster_assign : length-n array of cluster labels, one per sample.

    Returns
    -------
    float score; 1.0 when X is empty or all samples are identical (there
    is no variance left to explain).
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(cluster_assign)
    if X.shape[0] == 0:
        return 1.0

    total_ss = np.sum((X - X.mean(axis=0)) ** 2)
    if total_ss == 0:
        return 1.0

    within_ss = 0.0
    for label in np.unique(labels):
        members = X[labels == label]
        within_ss += np.sum((members - members.mean(axis=0)) ** 2)

    return float(1.0 - within_ss / total_ss)
    
    
def MyKmeans(X, k, plot=True, with_print=True):
    """Run k-means on X by alternating assignment and center updates.

    Starting from centers chosen by Init, each iteration assigns the
    samples to their nearest center (AssignClusters) and recomputes the
    centers (CalculateCenters). Iteration stops as soon as the centers no
    longer change, or after 10 iterations.

    Parameters
    ----------
    X : array with one sample per row; plotting uses columns 0 and 1.
    k : number of clusters.
    plot : if True, draw the initial state and every iteration in a
        column of subplots (points colored by cluster, centers in gold).
    with_print : if True, print the iteration number, whether the centers
        were unchanged, and the Criteria value for each iteration.

    Returns
    -------
    The value of Criteria(X, cluster_assign) for the last assignment, so
    callers can compare runs.
    """
    # initialize
    centers = Init(X, k)
    # Alternative seeding, kept as a toggle:
    #centers = KmeansPlusPlusInit(X, k)

    cluster_assign = AssignClusters(X, centers)

    # iterate:
    max_iter = 10

    if plot:
        # One panel for the initial state plus one per iteration.
        f, ax = plt.subplots(max_iter + 1)
        f.set_figheight(max_iter * 3)
        f.set_figwidth(3)

        ax[0].scatter(X[:, 0], X[:, 1], c=cluster_assign, s=100)
        ax[0].scatter(centers[:, 0], centers[:, 1], c='gold', s=100)

    for ii in range(max_iter):
        # assign points
        cluster_assign = AssignClusters(X, centers)

        prev_centers = centers
        centers = CalculateCenters(X, cluster_assign, k)

        # Exact comparison is intended: identical assignments reproduce
        # identical means. array_equal works for any number of columns.
        converged = np.array_equal(prev_centers, centers)

        if with_print:
            print(ii, ':', converged, Criteria(X, cluster_assign))

        if plot:
            ax[ii+1].scatter(X[:, 0], X[:, 1], c=cluster_assign, s=100)
            ax[ii+1].scatter(centers[:, 0], centers[:, 1], c='gold', s=100)

        if converged:
            break

    if plot:
        # Hide the panels reserved for iterations that never ran.
        for unused in ax[ii + 2:]:
            unused.set_visible(False)

    return Criteria(X, cluster_assign)
        
   

In [None]:
# Cluster the simulated data into five groups (plots and printing are on by default).
MyKmeans(X, k=5)

In [None]:
# Collect the value returned by MyKmeans for every k from 2 to 9 and plot it against k.
k_values = range(2, 10)
scores = []

for kk in k_values:
    scores.append(MyKmeans(X, kk, plot=False, with_print=False))

print(scores)
plt.scatter(k_values, scores)
plt.xlabel('k')
plt.ylabel('scores')

In [None]:
# check for clustering stability: repeat the k=5 run several times and
# compare the value MyKmeans returns for each run
n_repeats = 10
criteria_list = []

for rr in range(n_repeats):
    criteria_list.append(MyKmeans(X, 5, plot=False, with_print=False))

print(criteria_list)

plt.scatter(range(n_repeats), criteria_list)
plt.ylim(0.9, 1.02)