In [1]:
import numpy as np

In [2]:
# Seed NumPy's global generator so the synthetic data (and any later draws
# from np.random in this notebook) are reproducible across kernel restarts.
np.random.seed(42)

# 200 samples with 3 integer features each, drawn uniformly from [0, 20).
mat = np.random.randint(0, 20, size=(200, 3))

In [21]:
def euclidean(q, data):
    """Return the Euclidean distance from every data point to every query point.

    Parameters
    ----------
    q : array-like, shape (n_queries, n_features)
        Query points (e.g. cluster centroids). A single 1-D point is
        treated as one query.
    data : array-like, shape (n_samples, n_features)
        Points to measure from.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_queries)
        Entry ``[i, j]`` is the distance between ``data[i]`` and ``q[j]``.
    """
    q = np.atleast_2d(np.asarray(q, dtype=float))
    data = np.atleast_2d(np.asarray(data, dtype=float))

    # Broadcast (n_samples, 1, n_features) against (1, n_queries, n_features)
    # so every sample/query pair is handled at once. The distances are computed
    # from the `data` argument, never from a notebook-level global.
    diff = data[:, np.newaxis, :] - q[np.newaxis, :, :]
    return np.sqrt(np.sum(diff ** 2, axis=2))

In [22]:
def _initialize_centroids(data, k):
    
    n = data.shape[0] 
    # Randomly choose index of the data as initial centroids
    random_indices = np.random.randint(0, n, k)
    centroids = data[random_indices,:]
    
    return centroids

In [23]:
def _assign_clusters(data, centroids):
    """Label every row of ``data`` with the index of its nearest centroid.

    Distances come from ``euclidean(centroids, data)``; the column holding
    the smallest value in each row becomes that row's cluster label.
    """
    # euclidean() is expected to give one row per data point and one
    # column per centroid, so the arg-min runs along axis 1.
    return np.argmin(euclidean(centroids, data), axis=1)

In [24]:
def _revise_centroids(data, k, clusters):
    new_centroids = []
    for i in range(k):
        # Compute mean of data points in each cluster.
        data_points_in_cluster = data[clusters==i]
        centroid = data_points_in_cluster.mean(axis=0)
        new_centroids.append(centroid)
        
    new_centroids = np.array(new_centroids)
    
    return new_centroids

In [38]:
def kmeans(data, k, maxiter=300):
    """Cluster ``data`` into ``k`` groups by alternating assign/update steps.

    Starting from randomly chosen centroids, each iteration assigns every
    point to its nearest centroid and then recomputes the centroids. The
    loop stops when no point changes cluster or after ``maxiter`` iterations.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_samples, n_features)
        Points to cluster.
    k : int
        Number of clusters.
    maxiter : int, optional
        Upper bound on the number of iterations (default 300).

    Returns
    -------
    centroids : numpy.ndarray, shape (k, n_features)
        Final cluster centres.
    cluster_assignment : numpy.ndarray, shape (n_samples,)
        Index of the cluster each point was assigned to.

    Notes
    -----
    Prints the initial centroids and, for every iteration after the first,
    how many points changed cluster.
    """
    centroids = _initialize_centroids(data, k)
    prev_cluster_assignment = None
    cluster_assignment = None

    # Function-call form works on both Python 2 and Python 3; the bare
    # `print centroids` statement is a SyntaxError on Python 3.
    print(centroids)

    for _ in range(maxiter):
        # Assign data points to the nearest centroids
        cluster_assignment = _assign_clusters(data, centroids)

        # Revise centroids by averaging all data points in each cluster
        centroids = _revise_centroids(data, k, cluster_assignment)

        if prev_cluster_assignment is not None:
            num_changed = int(np.sum(prev_cluster_assignment != cluster_assignment))

            # If no data point changed cluster, we have converged
            if num_changed == 0:
                break

            print('    {0:5d} elements changed their cluster assignment.'.format(num_changed))

        # TODO: optionally record a heterogeneity metric per iteration.

        # .copy() takes a real snapshot; slicing an ndarray only gives a view
        prev_cluster_assignment = cluster_assignment.copy()

    if cluster_assignment is None:
        # maxiter <= 0: the loop never ran, so label the points against the
        # initial centroids rather than failing on an unbound name.
        cluster_assignment = _assign_clusters(data, centroids)

    return centroids, cluster_assignment

In [39]:
# Cluster the synthetic data into three groups; the cell displays the
# returned (centroids, cluster assignment) pair.
kmeans(data=mat, k=3)

[[ 7 13  2]
 [19 19  0]
 [ 6 10 19]]
       23 elements changed their cluster assignment.
       10 elements changed their cluster assignment.
        2 elements changed their cluster assignment.
        2 elements changed their cluster assignment.
        5 elements changed their cluster assignment.
        4 elements changed their cluster assignment.
        4 elements changed their cluster assignment.
        4 elements changed their cluster assignment.
        3 elements changed their cluster assignment.
        3 elements changed their cluster assignment.
        6 elements changed their cluster assignment.
        4 elements changed their cluster assignment.
        5 elements changed their cluster assignment.
        6 elements changed their cluster assignment.
        5 elements changed their cluster assignment.
        4 elements changed their cluster assignment.
        2 elements changed their cluster assignment.
        2 elements changed their cluster assignment.
        2

(array([[ 14.51515152,   5.25757576,   8.66666667],
        [ 13.96666667,  15.58333333,   8.2       ],
        [  3.18918919,   8.81081081,  10.90540541]]),
 array([1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 1, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 0, 2,
        1, 0, 1, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 0, 0, 1, 0, 2, 0, 0, 1, 2, 2,
        2, 1, 2, 1, 0, 2, 1, 1, 1, 2, 0, 0, 0, 1, 2, 1, 1, 0, 0, 2, 1, 2, 0,
        2, 0, 0, 0, 1, 2, 2, 0, 1, 0, 2, 2, 2, 0, 1, 1, 0, 0, 2, 1, 0, 1, 0,
        2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2,
        0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 1, 1, 2, 0, 0, 2, 1, 1, 1, 2,
        2, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 2, 2, 1, 0, 0, 0, 1, 2, 2, 1, 1, 2,
        2, 1, 2, 2, 2, 1, 0, 0, 1, 1, 1, 2, 2, 1, 0, 0, 0, 2, 0, 2, 2, 1, 1,
        0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 1]))

In [35]:
from sklearn.cluster import KMeans

In [36]:
# Reference fit with scikit-learn on the same data. random_state pins the
# centroid initialisation so this comparison is reproducible between runs.
sk_kmeans = KMeans(n_clusters=3, random_state=0).fit(mat)

In [37]:
# Show the centroids found by scikit-learn for comparison with the
# hand-written kmeans() result. Cluster numbering is arbitrary, so the rows
# may appear in a different order than in the kmeans() output.
sk_kmeans.cluster_centers_

array([[  3.28947368,   8.73684211,  10.65789474],
       [ 13.82758621,  15.63793103,   7.37931034],
       [ 14.84848485,   5.5       ,   9.59090909]])