In [None]:
from __future__ import division
import os
import sys
import glob
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as nrd
from sklearn.cluster import KMeans
from kmeans import *
%matplotlib inline
%precision 4
plt.style.use('ggplot')

### Dataset

GAUSSMIXTURE, which is synthetic. To generate it, I first sample k=(3,4,5) centers from a 3-dimensional Gaussian distribution with mean 0 and variance 5$I_3$. Then add points from Gaussian distributions of unit variance around centers.

In [7]:
#first only generate k=5

nrd.seed(1234)
k = 20
centers = nrd.multivariate_normal([0]*10,5*np.identity(10),k)
data = [nrd.multivariate_normal(center, np.identity(10),100) for center in centers]
data = np.vstack(data)

### Important functions

In [3]:
def Cost(C, Y):
    """C is a subset of the dataset, Y can be a point or a subset"""
    if  len(Y.shape)==1 or Y.shape[0]==1:
        #Y is a point
        MinIndex = np.argmin(np.sum((Y-C)**2,axis=1))
        return np.sum((Y-C[MinIndex,])**2)
    else:
        return np.sum([Cost(C,Y_i) for Y_i in Y])

def weight(C, data):
    """C is the centroid set and data is the target data set"""
    if len(C.shape)==1 or C.shape[0]==1:
        #C only have one point
        if len(data.shape)==1 or data.shape[0]==1:
            return np.array([1])
        else:
            return np.array([len(data)])
    else:
        Cost_matrix = np.array([np.sum((c-x)**2) for c in C
                                             for x in data]).reshape(len(C),len(data))
        Index_min = list(np.argmin(Cost_matrix,axis=0))
        return np.array([Index_min.count(i) for i in range(len(C))])

### k-means||

In [4]:
def kmeanspar(k,l,r,data):
    """k is the number of centers, l is the expected number of intermediate points
    in each iteration, r is the number of iterations, data is the target data set"""
    #l*r should be larger than k in case k-means|| select too few points
    if l*r < k:
        raise ValueError('r or l must be bigger, ')
    #if k is too large
    if k >= len(data):
        raise ValueError('k is too large')
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #Step 2
    Phi = Cost(C,data)
    #for loop
    for i in range(r):
        prob = [l*Cost(C,x) for x in data]/Cost(C,data)
        flag = nrd.uniform(size=len(data))
        C = np.concatenate((C,data[prob>=flag,]))
    #step 7
    weights = weight(C,data)
    #step 8: k-means++ to choose weighted points
    c = C[nrd.choice(range(len(C)),1),]
    while len(c) < k:
        p = np.array([Cost(c,x) for x in C])
        Prob = p*weights/np.sum(p*weights)
        x = nrd.choice(range(len(C)),1,p=Prob)
        c = np.concatenate((c,C[x,]))
    return c

### Random

In [8]:
def Random(k,data):
    """k is the number of centers, data is target data"""
    if k >= len(data):
        raise ValueError('k is too large')
    return data[np.random.choice(len(data),k,replace=False),:]

### K-means++

In [5]:
def kmeansplus(k,data):
    if k >= len(data):
        raise ValueError('k is too large')
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #while loop
    while len(C) < k:
        prob = ([Cost(C,x) for x in data]/Cost(C,data)).reshape(len(data))
        x = nrd.choice(range(len(data)),1,p=prob)
        C = np.concatenate((C,data[x,]))
    return C

### Comparison

In [3]:
pluspoint = kmeansplus(k,data)

In [4]:
kmeanspar(k,2*k)

TypeError: kmeanspar() takes exactly 4 arguments (2 given)

In [8]:
centers

array([[ 1.0542, -2.6631,  3.2036],
       [-0.6991, -1.6113,  1.9838],
       [ 1.9221, -1.4233,  0.0351],
       [-5.0148,  2.5716,  2.2181],
       [ 2.1317, -4.5197, -0.747 ]])

### Profile

I use broadcasting and list comprehension without using for loop (except the largest one in the algorithm), which might be faster. However, broadcasting might cause other issues, because we can't control and know which broadcasting actually being used. For example, if each column represent one point, the codes will result in a wrong answer.

### Optimation Strategies

1. The for loop might could be changed to the while loop，so that we don't need to run too many times. Or according to the paper, 15 rounds when l=0.1k, 5 rounds when l=0.5k, 2k, 10k.
2. it seems that the k-means || algorithm is in fact slower than k-means ++, I need to figure out the reason.

### Makefile

In [None]:
%%file makefile
TARGET = center
OBJECTS = distributions.o
CFLAGS = -g -O3 
LDLIBS = -lm
CC = c99 

all: $(TARGET)
    
clean:
	 rm $(TARGET) $(OBJECTS)

$(TARGET): $(OBJECTS)

In [1]:
!pip install mrjob

Downloading/unpacking mrjob
  Downloading mrjob-0.4.4.tar.gz (186kB): 186kB downloaded
  Running setup.py (path:/tmp/pip_build_bitnami/mrjob/setup.py) egg_info for package mrjob
    
    no previously-included directories found matching 'docs'
Downloading/unpacking filechunkio (from mrjob)
  Downloading filechunkio-1.6.tar.gz
  Running setup.py (path:/tmp/pip_build_bitnami/filechunkio/setup.py) egg_info for package filechunkio
    
Downloading/unpacking simplejson>=2.0.9 (from mrjob)
  Downloading simplejson-3.6.5.tar.gz (73kB): 73kB downloaded
  Running setup.py (path:/tmp/pip_build_bitnami/simplejson/setup.py) egg_info for package simplejson
    
Installing collected packages: mrjob, filechunkio, simplejson
  Running setup.py install for mrjob
    changing mode of build/scripts-2.7/mrjob from 664 to 775
    
    no previously-included directories found matching 'docs'
    changing mode of /home/bitnami/anaconda/bin/mrjob to 775
  Running setup.py install for filechunkio
    
  Runnin