## Import necessary packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
from sklearn.metrics import pairwise_distances

## Implement k-means

Let us implement the k-means algorithm. First, we choose an initial set of centroids. A common practice is to choose randomly from the data points.


In [2]:
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k data points as initial centroids.

    Parameters
    ----------
    data : 2-D dense NumPy array or SciPy sparse matrix
        One data point per row; must support ``data[row_indices, :]``.
    k : int
        Number of centroids to draw.
    seed : int or None, optional
        When given, it is passed to ``np.random.seed`` (which re-seeds
        NumPy's global generator) so the draw is repeatable.

    Returns
    -------
    numpy.ndarray
        Dense array with k rows, each a copy of one row of ``data``.
    '''
    if seed is not None:
        np.random.seed(seed)
    n = data.shape[0]
    # NOTE(review): randint samples WITH replacement, so the same row can be
    # drawn more than once. Kept as-is so seeded runs pick the same indices.
    rand_indices = np.random.randint(0, n, k)

    selected_rows = data[rand_indices, :]
    # Sparse matrices expose toarray(); dense arrays do not, so only convert
    # when the selection is actually sparse.
    if hasattr(selected_rows, 'toarray'):
        centroids = selected_rows.toarray()
    else:
        centroids = np.asarray(selected_rows)

    return centroids

In [3]:

def assign_clusters(data, centroids):
    '''Return, for every row of data, the index of its closest centroid.

    Distances come from pairwise_distances(data, centroids) with its default
    metric; the result has one entry per row of that distance matrix.
    '''
    # Row r, column c holds the distance from data point r to centroid c,
    # so the column index of each row's minimum is that point's cluster.
    return pairwise_distances(data, centroids).argmin(axis=1)

In pseudocode, we iteratively do the following:
```
cluster_assignment = assign_clusters(data, centroids)
centroids = revise_centroids(data, k, cluster_assignment)
```

### Assigning clusters

In [4]:
# Toy example: three 3-dimensional points and two candidate centroids.
data = np.array([
    [1., 2., 0.],
    [0., 0., 0.],
    [2., 2., 0.],
])
centroids = np.array([
    [0.5, 0.5, 0.],
    [0., -0.5, 0.],
])

Let's assign these data points to the closest centroid.

In [5]:
# Label each toy data point with the index of its nearest centroid.
cluster_assignment = assign_clusters(data, centroids)
print(cluster_assignment)

[0 1 0]


In [6]:
def revise_centroids(data, k, cluster_assignment):
    '''Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    data : 2-D dense NumPy array or SciPy sparse matrix
        One data point per row.
    k : int
        Number of clusters; cluster labels are 0 .. k-1.
    cluster_assignment : 1-D array-like of int
        Cluster label for each row of ``data``.

    Returns
    -------
    numpy.ndarray, shape (k, n_features)
        Row i is the mean of the rows of ``data`` labelled i. A cluster with
        no members has no defined mean, so its row is filled with NaN.
    '''
    cluster_assignment = np.asarray(cluster_assignment)
    n_features = data.shape[1]
    new_centroids = np.empty((k, n_features), dtype=float)
    for i in range(k):
        # Select members with a boolean mask built from the cluster_assignment
        # argument; this works for dense arrays and sparse matrices alike.
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] == 0:
            # Avoid a divide-by-zero mean; mark the centroid as undefined.
            new_centroids[i] = np.nan
            continue

        # Average over rows. Sparse matrices return a 1 x n_features
        # np.matrix here, so flatten to a plain 1-D array in every case.
        centroid = member_data_points.mean(axis=0)
        new_centroids[i] = np.asarray(centroid).ravel()

    return new_centroids

**The basic objective of k-means is to minimize the sum of squared Euclidean distances between each data point and the centroid of its cluster.**

In [7]:
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    '''Total within-cluster sum of squared Euclidean distances.

    For each cluster label 0 .. k-1, the rows of data carrying that label are
    compared against centroids[label]; the squared distances are summed over
    all clusters. Clusters with no members contribute nothing.
    '''
    total = 0.0
    for cluster_id in range(k):
        members = data[cluster_assignment == cluster_id, :]

        # Nothing to measure for an empty cluster.
        if members.shape[0] == 0:
            continue

        # The centroid is wrapped in a list so it is passed as a single row.
        dist_to_centroid = pairwise_distances(
            members, [centroids[cluster_id]], metric='euclidean'
        )
        total += np.sum(dist_to_centroid ** 2)

    return total