#### The procedure for the k-means algorithm is as follows:

1. **randomly place k centroids, one for each cluster** - the farther apart the centroids are placed, the better.
2. **calculate the distance of each data point or object from the centroids.** Euclidean distance, the most popular metric, is used to measure the distance from each object to each centroid.
3. **assign each data point or object to its closest centroid creating a group.** - each data point has been classified to a group
4. **recalculate the position of the k centroids.** The new centroid position is determined by the mean of all points in the group. 
5. **repeat steps 2-4 until the centroids no longer move** - at that point no data point switches clusters

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Toy data set: ten two-dimensional points, one (x, y) pair per row.
X = np.asarray(
    [
        (1.0, 1.0),
        (1.5, 1.0),
        (1.5, 0.5),
        (1.0, 1.5),
        (2.0, 1.5),
        (1.75, 2.0),
        (3.5, 2.0),
        (3.0, 2.5),
        (2.5, 2.0),
        (2.2, 2.0),
    ],
    dtype=float,
)

In [None]:
# Scatter plot of the raw points, before any cluster has been assigned.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X[:, 0], X[:, 1])
plt.show()

In [None]:
# Settings for the hand-written k-means run.
k = 2                # number of clusters
max_iterations = 10  # upper bound on the number of refinement passes

# A pandas DataFrame is swapped for its underlying NumPy array so the array
# indexing used below works for both input types; a NumPy array is left as is.
if isinstance(X, pd.DataFrame):
    X = X.values

# Step 1: place k centroids, one for each cluster.
# The two starting positions are fixed by hand instead of being drawn at
# random, which keeps the example short.
centroids = np.array([[0, 0], [5, 5]])
print('print centroids')
print(centroids, '\n')

# Step 2: distance from every point to every centroid.
# Row r, column c of M holds the Euclidean distance from point r to centroid c.
M = distance.cdist(X, centroids, metric='euclidean')
print('print distances M')
print(M, '\n')

# Step 3: assign every point to its closest centroid.
# argmin along axis 1 gives, per point, the column index of the smallest
# distance, i.e. the index of the nearest centroid.
P = M.argmin(axis=1)
print('print initial clustering')
print(P)

In [None]:
# Show the points coloured by their current cluster, plus the centroids.
def plot_kmeans_with_centroids():
    """Draw the data points coloured by cluster label and the centroids in red.

    Takes no arguments: it reads the module-level ``X`` (points), ``P``
    (cluster index per point) and ``centroids`` at call time, so each call
    shows whatever state those names hold at that moment.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(X[:, 0], X[:, 1], c=P)
    ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    plt.show()


# First picture: the initial assignment with the hand-picked centroids.
plot_kmeans_with_centroids()

# Refinement loop: alternate between moving the centroids and reassigning the
# points, for at most max_iterations passes or until no point changes cluster.
for _ in range(max_iterations):
    # Step 4: move each centroid to the mean of the points assigned to it.
    # A cluster that currently has no points keeps its previous centroid: the
    # mean of an empty selection is NaN, and a NaN centroid would propagate
    # into the distance matrix and corrupt every later assignment.
    updated_centroids = centroids.astype(float)  # float copy; old values kept
    for i in range(k):
        members = X[P == i]
        if len(members) > 0:
            updated_centroids[i] = members.mean(axis=0)
    centroids = updated_centroids
    print('\n\nCentroids changed positions...')
    plot_kmeans_with_centroids()

    # Recompute the distances and find the closest centroid for every point.
    tmp = np.argmin(distance.cdist(X, centroids, 'euclidean'), axis=1)

    # Converged: the new assignment equals the previous one, so another pass
    # would compute the same centroids again.
    if np.array_equal(P, tmp):
        print('\nNo more points reassingments from cluster to cluster, the end of learning.')
        break

    # Report the assignment before and after this pass, then adopt the new one.
    print('\n\nInitial clusters: ', P)
    P = tmp
    print('Clusters after reassignment : ', P)
    plot_kmeans_with_centroids()

print(f'\nFinal cluster assignment: {P}')
print(f'Final centroid positions: {centroids}')

### Using scikit-learn

In [None]:
from sklearn.cluster import KMeans

# Cluster the same points into two groups with scikit-learn's KMeans so the
# result can be compared with the hand-written loop.
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
print(kmeans.labels_)
# Kept as the last expression so the notebook displays the fitted centres.
kmeans.cluster_centers_