In [1]:
from __future__ import division
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy.random as nrd
%matplotlib inline
%precision 4
plt.style.use('ggplot')

### Dataset

GAUSSMIXTURE, which is synthetic. To generate it, I first sample k=(3,4,5) centers from a 3-dimensional Gaussian distribution with mean 0 and variance 5$I_3$. Then add points from Gaussian distributions of unit variance around centers.

In [2]:
#first only generate k=5

nrd.seed(1234)
k = 5
centers = nrd.multivariate_normal([0,0,0],5*np.identity(3),k)
data = [nrd.multivariate_normal(center, np.identity(3),100) for center in centers]
data = np.vstack(data)

### Important functions

In [3]:
#%%file kmeans.py

#centroid
# def Centroid(Y):
#     """Y is a subset of the dataset"""
#     return np.average(Y,axis=0)

#cost
def Cost(C, Y):
    """C is a subset of the dataset, Y can be a point or a subset"""
    if  len(Y.shape)==1 or Y.shape[0]==1:
        MinIndex = np.argmin(np.sum((Y-C)**2,axis=1))
        return np.sum((Y-C[MinIndex,])**2)
    else:
        return np.sum([Cost(C,Y_i) for Y_i in Y])
    
def weight(C, data):
    """C is the centroid set and data is the data set"""
    Cost_matrix = np.array([np.sum((c-x)**2) for c in C
                                             for x in data]).reshape(len(C),len(data))
    Index_min = list(np.argmin(Cost_matrix,axis=0))
    return np.array([Index_min.count(i) for i in range(len(C))])
    

### k-means||

In [4]:
def kmeanspar(k,l):
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #Step 2
    Phi = Cost(C,data)
    #for loop
    for i in range(int(np.log(Phi))):
        prob = [l*Cost(C,x) for x in data]/Cost(C,data)
        flag = nrd.uniform(size=len(data))
        C = np.concatenate((C,data[prob>=flag,]))
    #step 7
    weights = weight(C,data)
    #step 8: k-means++ to choose weighted points
    c = C[nrd.choice(range(len(C)),1),]
    while len(c) < k:
        p = np.array([Cost(c,x) for x in C])
        Prob = p*weights/np.sum(p*weights)
        x = nrd.choice(range(len(C)),1,p=Prob)
        c = np.concatenate((c,C[x,]))
    return c

### Random

Random means get k initual points randomly, each points in the data has the same probability to be chosen. We can use random.choice to get that.

### K-means++

In [5]:
def kmeansplus(k,data):
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #while loop
    while len(C) < k:
        prob = ([Cost(C,x) for x in data]/Cost(C,data)).reshape(len(data))
        x = nrd.choice(range(len(data)),1,p=prob)
        C = np.concatenate((C,data[x,]))
    return C

### Comparison

In [6]:
pluspoint = kmeansplus(k,data)

array([[ 1.1336, -1.4892, -1.3834],
       [-4.3293,  2.9194,  1.3839],
       [ 3.692 , -4.4052,  3.1092],
       [ 0.7094, -2.1348,  2.1346],
       [ 2.401 , -6.3684, -1.4968]])

In [7]:
kmeanspar(k,2*k)

array([[ 3.692 , -4.4052,  3.1092],
       [-1.1236, -0.4924,  3.5533],
       [ 1.2785, -3.3992, -0.6267],
       [ 0.5748, -2.9621,  3.8977],
       [-5.9876,  1.8227,  2.9829]])

In [8]:
centers

array([[ 1.0542, -2.6631,  3.2036],
       [-0.6991, -1.6113,  1.9838],
       [ 1.9221, -1.4233,  0.0351],
       [-5.0148,  2.5716,  2.2181],
       [ 2.1317, -4.5197, -0.747 ]])

### Profile

I use broadcasting and list comprehension without using for loop (except the largest one in the algorithm), which might be faster. However, broadcasting might cause other issues, because we can't control and know which broadcasting actually being used. For example, if each column represent one point, the codes will result in a wrong answer.

### Optimation Strategies

1. The for loop might could be changed to the while loop，so that we don't need to run too many times. Or according to the paper, 15 rounds when l=0.1k, 5 rounds when l=0.5k, 2k, 10k.
2. it seems that the k-means || algorithm is in fact slower than k-means ++, I need to figure out the reason.

### Makefile

In [None]:
%%file makefile
TARGET = center
OBJECTS = distributions.o
CFLAGS = -g -O3 
LDLIBS = -lm
CC = c99 

all: $(TARGET)
    
clean:
	 rm $(TARGET) $(OBJECTS)

$(TARGET): $(OBJECTS)