In [1]:
from __future__ import division
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as nrd
from sklearn.cluster import KMeans
from kmeans import *
import timeit
%matplotlib inline
%precision 4
plt.style.use('ggplot')

### Dataset

GAUSSMIXTURE, which is synthetic. To generate it, I first sample k=(3,4,5) centers from a 3-dimensional Gaussian distribution with mean 0 and variance 5$I_3$. Then add points from Gaussian distributions of unit variance around centers.

In [24]:
def GAUSSMIXTURE(k,R):
    """R is the variance to generate centers, K is the number of centers
    will sample 10000 points in 10-dimensional space"""
    n = 10000
    centers = nrd.multivariate_normal([0]*10,R*np.identity(10),k)
    data = [nrd.multivariate_normal(center, np.identity(10),int(n/k)) for center in centers]
    data = np.vstack(data)
    return data

In [25]:
data = GAUSSMIXTURE(50,100)

### Important functions

In [124]:
def Cost(C, Y):
    """C is a subset of the dataset, Y can be a point or a subset"""
    if  len(Y.shape)==1 or Y.shape[0]==1:
        #Y is a point
        MinIndex = np.argmin(np.sum((Y-C)**2,axis=1))
        return np.sum((Y-C[MinIndex,])**2)
    else:
        return np.sum([Cost(C,Y_i) for Y_i in Y])

def weight(C, data):
    """C is the centroid set and data is the target data set"""
    if len(C.shape)==1 or C.shape[0]==1:
        #C only have one point
        if len(data.shape)==1 or data.shape[0]==1:
            return np.array([1])
        else:
            return np.array([len(data)])
    else:
        Index_min = [np.argmin(np.sum((x-C)**2,axis=1)) for x in data]
        return np.array([Index_min.count(i) for i in range(len(C))]).astype(float)

def weight_v1(C, data):
    """C is the centroid set and data is the target data set"""
    if len(C.shape)==1 or C.shape[0]==1:
        #C only have one point
        if len(data.shape)==1 or data.shape[0]==1:
            return np.array([1])
        else:
            return np.array([len(data)])
    else:
        Cost_matrix = np.array([np.sum((c-x)**2) for c in C
                                             for x in data]).reshape(len(C),len(data))
        Index_min = list(np.argmin(Cost_matrix,axis=0))
        return np.array([Index_min.count(i) for i in range(len(C))])

### k-means||

In [39]:
def kmeanspar(k,l,r,data):
    """k is the number of centers, l is the expected number of intermediate points
    in each iteration, r is the number of iterations, data is the target data set"""
    #l*r should be larger than k in case k-means|| select too few points
    if l*r < k:
        raise ValueError('r or l must be bigger, ')
    #if k is too large
    if k >= len(data):
        raise ValueError('k is too large')
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #for loop
    for i in range(r):
        prob = [l*Cost(C,x) for x in data]/Cost(C,data)
        flag = nrd.uniform(size=len(data))
        C = np.concatenate((C,data[prob>=flag,]))
    #step 7
    weights = weight(C,data)
    #step 8: k-means++ to choose weighted points
    c = C[nrd.choice(range(len(C)),1),]
    while len(c) < k:
        p = np.array([Cost(c,x) for x in C])
        Prob = p*weights/np.sum(p*weights)
        x = nrd.choice(range(len(C)),1,p=Prob)
        c = np.concatenate((c,C[x,]))
    ##carry out K-means clustering
    km = KMeans(n_clusters=k,init=c,n_init=1)
    km.fit(data)
    return km

In [41]:
km = kmeanspar(50,50/2,5,data)

### Random

In [30]:
def Random(k,data):
    """k is the number of centers, data is target data"""
    if k >= len(data):
        raise ValueError('k is too large')
    ##carry out K-means clustering
    km = KMeans(n_clusters=k,init='random')
    km.fit(data)
    return km

### K-means++

In [28]:
def kmeansplus(k,data):
    if k >= len(data):
        raise ValueError('k is too large')
    km = KMeans(n_clusters=k,init='k-means++')
    km.fit(data)
    return km

### Profile

In [None]:
def clustering_cost(data,initial):
    """K is 
    initial is the initialization way, could be Random, kmeanspar or kmeansplus"""
    

In [57]:
k = 50; R = 10;
l = 0.5*k; r = 5;
data = GAUSSMIXTURE(k,R)

In [31]:
#carry out K-means clustering and time profiling
#try different l and r
%timeit kmeanspar(k,l,r,data)
%timeit kmeansplus(k,data)
%timeit Random(k,data)
    
# km_1 = KMeans(n_clusters=k,init='k-means++')
# km_2 = KMeans(n_clusters=k,init='random')
# km_3 = KMeans(n_clusters=k,init=c1)

# %timeit Random(data)
# %timeit km_2.fit(data)
# %timeit km_3.fit(data)

1 loops, best of 3: 7.07 s per loop
1 loops, best of 3: 658 ms per loop
1 loops, best of 3: 475 ms per loop


In [32]:
stats = %prun -r -q kmeanspar(k,l,r,data)
stats.sort_stats('time').print_stats(10);

          3235683 function calls (3205683 primitive calls) in 8.447 seconds

   Ordered by: internal time
   List reduced from 73 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   747421    2.847    0.000    2.847    0.000 {method 'reduce' of 'numpy.ufunc' objects}
        1    2.441    2.441    6.304    6.304 kmeans.py:18(weight)
   747305    0.989    0.000    4.697    0.000 fromnumeric.py:1623(sum)
61131/31131    0.945    0.000    2.043    0.000 kmeans.py:9(Cost)
   747449    0.537    0.000    0.537    0.000 {isinstance}
   747366    0.326    0.000    3.172    0.000 _methods.py:31(_sum)
    61126    0.108    0.000    0.108    0.000 {method 'argmin' of 'numpy.ndarray' objects}
      200    0.057    0.000    0.057    0.000 {numpy.core.multiarray.array}
        1    0.055    0.055    8.447    8.447 <ipython-input-25-c34800b429f1>:1(kmeanspar)
    61126    0.047    0.000    0.155    0.000 fromnumeric.py:938(argmin)




In [37]:
stats = %prun -r -q kmeansplus(k,data)
stats.sort_stats('time').print_stats(10);

          72581 function calls in 0.695 seconds

   Ordered by: internal time
   List reduced from 66 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      618    0.178    0.000    0.420    0.001 pairwise.py:143(euclidean_distances)
      118    0.154    0.001    0.360    0.003 k_means_.py:400(_labels_inertia_precompute_dense)
      736    0.118    0.000    0.118    0.000 {numpy.core._dotblas.dot}
     5673    0.078    0.000    0.078    0.000 {method 'reduce' of 'numpy.ufunc' objects}
       10    0.048    0.005    0.307    0.031 k_means_.py:41(_k_init)
      118    0.018    0.000    0.018    0.000 _k_means.pyx:244(_centers_dense)
     3091    0.015    0.000    0.078    0.000 validation.py:37(_assert_all_finite)
      490    0.009    0.000    0.009    0.000 {method 'cumsum' of 'numpy.ndarray' objects}
     8159    0.009    0.000    0.009    0.000 {numpy.core.multiarray.array}
     1855    0.007    0.000    0.013    0.000 shape_base

In [35]:
stats = %prun -r -q kmeanspar(k,l,r,data)
stats.sort_stats('time').print_stats(10);

          3050990 function calls (3020990 primitive calls) in 7.966 seconds

   Ordered by: internal time
   List reduced from 72 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   701569    2.734    0.000    2.734    0.000 {method 'reduce' of 'numpy.ufunc' objects}
        1    2.311    2.311    7.966    7.966 <ipython-input-33-58047ee5deba>:15(kmeanspar)
   701423    0.913    0.000    4.432    0.000 fromnumeric.py:1623(sum)
60690/30690    0.913    0.000    1.976    0.000 kmeans.py:9(Cost)
   701627    0.490    0.000    0.490    0.000 {isinstance}
   701514    0.297    0.000    3.031    0.000 _methods.py:31(_sum)
    60685    0.105    0.000    0.105    0.000 {method 'argmin' of 'numpy.ndarray' objects}
      270    0.053    0.000    0.053    0.000 {numpy.core.multiarray.array}
    60685    0.048    0.000    0.152    0.000 fromnumeric.py:938(argmin)
      116    0.042    0.000    0.042    0.000 {method 'count' of 'list' objects}




### Profile

I use broadcasting and list comprehension without using for loop (except the largest one in the algorithm), which might be faster. However, broadcasting might cause other issues, because we can't control and know which broadcasting actually being used. For example, if each column represent one point, the codes will result in a wrong answer.
2. When do the parallelization, found that the weight function use lots of time. Because there is a two for loop inside. Change it to one for loop, 2.7 -> 0.295097827911. Also change the original one
3. there is a trick that we can use, don't use cost(data,C), just normalize them

### Optimation Strategies

1. The for loop might could be changed to the while loop，so that we don't need to run too many times. Or according to the paper, 15 rounds when l=0.1k, 5 rounds when l=0.5k, 2k, 10k.
2. k-means || algorithm is in fact far slower than k-means ++, I need to use Cython to improve the speed.

###Cython

In [27]:
%load_ext cython

In [185]:
%%cython -a 
#%%file kmeanspar.pyx

import numpy as np
cimport numpy as np
cimport cython
from libc.stdlib cimport rand
cdef extern from "stdlib.h":
    int RAND_MAX
    
def randnum():
    return rand()/float(RAND_MAX)


def distance(double[::1] p1,double[::1] p2):
    cdef int i, d
    cdef double w=0
    d = p1.shape[0]
    for i in range(d):
        w += (p1[i]-p2[i])**2
    return w

# def min_distance_C(double p1,double[::1] C):
#     cdef int i, n
#     cdef double dmin
#     n = C.shape[0]
#     dmin = distance(p1,C[0,])
#     for i in range(n):
#         dmin
#     distance()
    
    
def Cost_C(double[:,::1] C,double[:,::1] Y):
    cdef int i, j, n_C, n_Y, d
    cdef double cost = 0
    n_C = C.shape[0]
    cdef double[::1] dists=np.zeros(n_C)
    d = C.shape[1]
    n_Y = Y.shape[0]
    for i in range(n_Y):
        for j in range(n_C):
            dists[j] = distance(Y[i,:],C[j,:])
        cost += np.sum(dists)
    return cost

# #def weight_C():
    
    
def kmeanspar_C(int k,int l,int r,double[:,::1] data):
    cdef int n_Y, i, j, d
    d = data.shape[1]
    n_Y = data.shape[0]
    cdef double[::1] prob=np.zeros(n_Y)
    cdef double[::1] flag=np.zeros(n_Y)
    cdef double sum
    C = data[np.random.choice(range(n_Y),1),:]
    k = k+1
    for i in range(r):
        for j in range(n_Y):
            flag[j] = randnum()
            prob[j] = Cost_C(C,data[j,:])
        prob = prob/sum(prob)*float(l)
        C = np.concatenate((C,data[prob>=flag,:]))
            

    
# def kmeanspar_C(int k,int l,int r,double[:,:] data):
#     #Step 1
#     C = data[np.random.choice(range(len(data)),1),]
#     #for loop
#     for i in range(r):
#         prob = [l*Cost(C,x) for x in data]/Cost(C,data)
#         flag = nrd.uniform(size=len(data))
#         C = np.concatenate((C,data[prob>=flag,]))
#     #step 7
#     Cost_matrix = np.array([np.sum((c-x)**2) for c in C
#                                              for x in data]).reshape(len(C),len(data))
#     Index_min = list(np.argmin(Cost_matrix,axis=0))
#     weights = np.array([Index_min.count(i) for i in range(len(C))])
#     #step 8: k-means++ to choose weighted points
#     c = C[nrd.choice(range(len(C)),1),]
#     while len(c) < k:
#         p = np.array([Cost(c,x) for x in C])
#         Prob = p*weights/np.sum(p*weights)
#         x = nrd.choice(range(len(C)),1,p=Prob)
#         c = np.concatenate((c,C[x,]))
#     ##carry out K-means clustering
#     km = KMeans(n_clusters=k,init=c)
#     km.fit(data)
#     return km   


CompileError: command 'gcc' failed with exit status 1

In [181]:
randnum()

0.1592

In [182]:
%%file setup.py
from distutils.core import setup, Extension
from Cython.Build import cythonize

ext = Extension("KmeansparC",
              sources=["kmeanspar."])

setup(name = "cython_kmeanspar",
      ext_modules = cythonize(ext))

Overwriting setup.py


In [142]:
! python setup.py build_ext -i &> /dev/null

In [143]:
import KmeansparC

ImportError: No module named KmeansparC

###parallelization

In [44]:
from multiprocessing import Pool, cpu_count
from functools import partial

In [117]:
def min_distance(p1, C, axis = 1):
    return np.min(np.sum((p1-C)**2,axis))

def argmin_distance(p1, C, axis = 1):
    return np.argmin(np.sum((p1-C)**2,axis))

def Cost_par(C, Y):
    """C is a subset of the dataset, Y can be a point or a subset"""
    pool = Pool(processes=cpu_count())
    PartialMinDist = partial(min_distance, C=C, axis=1)
    cost = np.sum(pool.map(PartialMinDist, Y))
    pool.close()
    pool.terminate()
    return cost

def weight_par(C, data):
    start = timeit.default_timer()
    pool = Pool(processes=cpu_count())
    PartialArgminDist = partial(argmin_distance, C=C, axis=1)
    Index_min = list(pool.map(PartialArgminDist,data))
    return np.array([Index_min.count(i) for i in range(len(C))])
    
def kmeanspar_par(k,l,r,data):
    """k is the number of centers, l is the expected number of intermediate points
    in each iteration, r is the number of iterations, data is the target data set"""
    #l*r should be larger than k in case k-means|| select too few points
    if l*r < k:
        raise ValueError('r or l must be bigger, ')
    #if k is too large
    if k >= len(data):
        raise ValueError('k is too large')
        
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #for loop
    for i in range(r):
        pool = Pool(processes=cpu_count())
        PartialMinDist = partial(min_distance, C=C, axis=1)
        prob = np.array(pool.map(PartialMinDist, data))
        prob = prob/np.sum(prob)*float(l)
        pool.close()
        pool.terminate()
        flag = nrd.uniform(size=len(data))
        C = np.concatenate((C,data[prob>=flag,]))
    #step 7
    weights = weight_par(C,data)
    #step 8: k-means++ to choose weighted points
    c = C[nrd.choice(range(len(C)),1),]
    while len(c) < k:
        p = np.array([Cost(c,x) for x in C])
        Prob = p*weights/np.sum(p*weights)
        x = nrd.choice(range(len(C)),1,p=Prob)
        c = np.concatenate((c,C[x,]))
    ##carry out K-means clustering
    km = KMeans(n_clusters=k,init=c)
    km.fit(data)
    return km

In [118]:
%timeit kmeanspar_par(k,l,r,data)

1 loops, best of 3: 1.81 s per loop


In [120]:
cpu_count()

2

### Makefile

In [None]:
%%file makefile
TARGET = center
OBJECTS = distributions.o
CFLAGS = -g -O3 
LDLIBS = -lm
CC = c99 

all: $(TARGET)
    
clean:
	 rm $(TARGET) $(OBJECTS)

$(TARGET): $(OBJECTS)

###Application Analysis

Application to simulated/real problem and comparative anlaysis (up to 40 points)
As an example, a 40-point answer could include an extensive comparative analysis against the majority of classes of existing algorithms used to solve a problem with benchmarking and a thoguhtful consideration of the benefits/drawbacks of each