In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the wholesale-customers CSV and keep only the underlying NumPy matrix
# (column labels are discarded by .values).
X = pd.read_csv('Wholesale customers data.csv').values

In [3]:
# Peek at the first ten rows of the raw (unscaled) feature matrix.
X[:10]

array([[    2,     3, 12669,  9656,  7561,   214,  2674,  1338],
       [    2,     3,  7057,  9810,  9568,  1762,  3293,  1776],
       [    2,     3,  6353,  8808,  7684,  2405,  3516,  7844],
       [    1,     3, 13265,  1196,  4221,  6404,   507,  1788],
       [    2,     3, 22615,  5410,  7198,  3915,  1777,  5185],
       [    2,     3,  9413,  8259,  5126,   666,  1795,  1451],
       [    2,     3, 12126,  3199,  6975,   480,  3140,   545],
       [    2,     3,  7579,  4956,  9426,  1669,  3321,  2566],
       [    1,     3,  5963,  3648,  6192,   425,  1716,   750],
       [    2,     3,  6006, 11093, 18881,  1159,  7425,  2098]],
      dtype=int64)

In [4]:
# Standardize each column to zero mean and unit variance so that no single
# large-valued feature dominates the squared-distance computations used by
# the clustering code below.
# NOTE: X is overwritten here; the unscaled values are only recoverable via
# scaler.inverse_transform(X) or by reloading the CSV.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [5]:
# Same ten rows after standardization, for comparison with the raw preview.
X[:10]

array([[ 1.44865163,  0.59066829,  0.05293319,  0.52356777, -0.04111489,
        -0.58936716, -0.04356873, -0.06633906],
       [ 1.44865163,  0.59066829, -0.39130197,  0.54445767,  0.17031835,
        -0.27013618,  0.08640684,  0.08915105],
       [ 1.44865163,  0.59066829, -0.44702926,  0.40853771, -0.0281571 ,
        -0.13753572,  0.13323164,  2.24329255],
       [-0.69029709,  0.59066829,  0.10011141, -0.62401993, -0.3929769 ,
         0.6871443 , -0.49858822,  0.09341105],
       [ 1.44865163,  0.59066829,  0.84023948, -0.05239645, -0.07935618,
         0.17385884, -0.23191782,  1.29934689],
       [ 1.44865163,  0.59066829, -0.20480553,  0.33406659, -0.29763704,
        -0.49615501, -0.22813824, -0.02622403],
       [ 1.44865163,  0.59066829,  0.00995035, -0.35231565, -0.10284877,
        -0.53451222,  0.05428041, -0.34785425],
       [ 1.44865163,  0.59066829, -0.34998145, -0.11398095,  0.15535895,
        -0.28931479,  0.09228619,  0.36960125],
       [-0.69029709,  0.59066829

In [6]:
def initialize_centroids(data, k_centroids):
    '''Randomly pick k rows of data to serve as the initial centroids.

    Rows are drawn WITHOUT replacement whenever enough rows are available,
    so the same row is never chosen twice. Picking a row twice would start
    two centroids at the same position, leaving one of the two clusters
    empty after the first assignment step.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
    k_centroids : int
        Number of centroids to draw.

    Returns
    -------
    ndarray of shape (k_centroids, n_features)
        A copy of the selected rows.

    Notes
    -----
    If more centroids are requested than there are rows, sampling falls
    back to drawing with replacement (repeats are then unavoidable) rather
    than raising. Distinct row indices do not guarantee distinct centroid
    values if data itself contains duplicate rows.
    '''

    n_samples = data.shape[0]
    # Repeats are only permitted when they cannot be avoided.
    allow_repeats = k_centroids > n_samples
    index = np.random.choice(n_samples, k_centroids, replace=allow_repeats)
    return data[index]

def find_closest_centroid(data, centroids):
    '''Label every row of data with the index of its nearest centroid.

    Nearness is measured by squared Euclidean distance; when several
    centroids are equally close the lowest index wins (argmin semantics).

    Returns a 1-D float array of length data.shape[0] holding the centroid
    indices (stored as floats, so callers cast with int() before indexing).
    '''

    n_samples = data.shape[0]

    def nearest(row):
        # Squared distance from this row to each centroid, one value per centroid.
        squared_distances = np.square(data[row] - centroids).sum(axis=1)
        return squared_distances.argmin()

    return np.fromiter(
        (nearest(row) for row in range(n_samples)),
        dtype=float,
        count=n_samples,
    )

def compute_centroids(data, closest_centroid_index, n_centroids):
    '''Recompute each centroid as the mean of the rows assigned to it.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    closest_centroid_index : array-like of length n_samples
        Centroid index for every row (floats are truncated to int).
    n_centroids : int
        Total number of centroids, including any that received no rows.

    Returns
    -------
    ndarray of shape (n_centroids, n_features)

    Notes
    -----
    A centroid with no assigned rows used to be computed as 0 / 0, giving a
    NaN centroid that then attracted every point on the next assignment
    (argmin picks the NaN entry). Empty clusters are now re-seeded instead:
    they take over, one each, the rows that lie farthest from their own
    centroid. If there are more empty clusters than rows (including the
    case of no rows at all) the leftover centroids stay at the origin.
    '''
    data = np.asarray(data, dtype=float)
    labels = np.asarray(closest_centroid_index).astype(int)

    # Per-cluster coordinate sums; np.add.at accumulates repeated indices
    # row by row, in the same order as an explicit Python loop would.
    new_centroids = np.zeros((n_centroids, data.shape[1]))
    np.add.at(new_centroids, labels, data)
    n_neighbours = np.bincount(labels, minlength=n_centroids)

    # Divide only where there is something to average.
    occupied = n_neighbours > 0
    new_centroids[occupied] /= n_neighbours[occupied, None]

    empty_clusters = np.flatnonzero(~occupied)
    if empty_clusters.size and data.shape[0]:
        # Squared distance of every row to the centroid it currently belongs to.
        residual = np.sum(np.square(data - new_centroids[labels]), axis=1)
        worst_fit_first = np.argsort(residual, kind='stable')[::-1]
        # The donor cluster's mean still includes the row it gives away;
        # that is corrected by the next assignment/update round.
        for cluster, row in zip(empty_clusters, worst_fit_first):
            new_centroids[cluster] = data[row]

    return new_centroids

In [7]:
def k_means(data, k_centroids, iterations):
    '''Run Lloyd's k-means from a random start.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
    k_centroids : int
        Number of clusters.
    iterations : int
        Maximum number of centroid updates to perform.

    Returns
    -------
    (centroids, closest_centroid)
        The final centroids and, for every row of data, the index of the
        closest of those centroids.

    Notes
    -----
    The returned assignment always corresponds to the returned centroids.
    Previously the labels were those computed *before* the last centroid
    update, and iterations=0 raised UnboundLocalError because no assignment
    had been computed at all. The loop also stops early once an update
    leaves the centroids exactly unchanged, since further rounds would
    reproduce the same result.
    '''
    centroids = initialize_centroids(data, k_centroids)
    closest_centroid = find_closest_centroid(data, centroids)

    for _ in range(iterations):
        updated = compute_centroids(data, closest_centroid, k_centroids)
        converged = np.array_equal(updated, centroids)
        centroids = updated
        if converged:
            # Labels were computed against identical centroids: still valid.
            break
        closest_centroid = find_closest_centroid(data, centroids)

    return centroids, closest_centroid

In [8]:
def compute_cost(data, centroids, closest_centroids):
    '''Total within-cluster squared error.

    Adds up, over all rows of data, the squared Euclidean distance between
    the row and the centroid it is assigned to in closest_centroids.
    Returns 0 when data has no rows.
    '''
    def squared_error(row):
        assigned = centroids[int(closest_centroids[row])]
        return np.sum(np.square(data[row] - assigned))

    return sum(squared_error(row) for row in range(data.shape[0]))

In [None]:
# Elbow-method sweep: for every cluster count k from 1 to max_cluster, run
# k-means from several random starts and keep the lowest-cost solution.
np.random.seed(0)  # fixed seed so the sweep is reproducible on re-run

max_cluster = 20   # largest k to try
n_restarts = 10    # random initializations per k
n_iterations = 20  # k-means iterations per run

cost_history = list()      # best cost found for each k
centroid_history = list()  # centroids that achieved that cost
for k in range(1, max_cluster + 1):
    min_cost = None
    local_centroid = None
    for _ in range(n_restarts):
        centroids, closest_cent = k_means(X, k, n_iterations)
        cost = compute_cost(X, centroids, closest_cent)
        # A degenerate run (NaN cost) must never become the running best:
        # every later comparison against NaN would be False.
        if np.isnan(cost):
            continue
        if min_cost is None or cost < min_cost:
            min_cost = cost
            local_centroid = centroids
    # If every restart was degenerate, record NaN so the curve shows a gap.
    cost_history.append(np.nan if min_cost is None else min_cost)
    centroid_history.append(local_centroid)

In [None]:
# Elbow plot: best cost found for each number of clusters, saved to disk.
fig, ax = plt.subplots()
cluster_counts = list(range(1, max_cluster + 1))
ax.plot(cluster_counts, cost_history, marker='o')
ax.set_xticks(cluster_counts)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Within-cluster sum of squared distances')
ax.set_title('K-means elbow curve')
fig.savefig('k_means_wholesale_cust.jpg')