# Function File

##Python Function File

In [None]:
%%file dist_sq_func.py

import numpy as np

#distance square function
def dist_sq(a, b, axis=0):
    """Return the squared Euclidean distance between a and b.

    The element-wise difference ``a - b`` is squared and summed along
    ``axis`` (default 0). No square root is taken.
    """
    squared_diff = (a - b) ** 2
    return np.sum(squared_diff, axis=axis)


In [32]:
%%file cost_func.py

import numpy as np
from dist_sq_func import dist_sq

##cost function
def cost(c, data):
    """Return the clustering cost of ``data`` with respect to centers ``c``.

    For every point in ``data`` the squared distance to its nearest row of
    ``c`` is taken; the cost is the sum of those minima.
    """
    nearest = []
    for point in data:
        # squared distance from this point to every center (one per row of c)
        to_centers = dist_sq(c, point, axis=1)
        nearest.append(min(to_centers))
    return np.sum(nearest)


Overwriting cost_func.py


In [33]:
%%file smpl_prb_func.py

import numpy as np
from dist_sq_func import dist_sq
from cost_func import cost

#sample probability function
def smpl_prb(c, data, l):
    """Return the per-point sampling probabilities used by k-means||.

    For each point ``d`` in ``data`` the result is
    ``l * d2(d, c) / phi`` where ``d2`` is the squared distance from ``d`` to
    its nearest row of ``c`` and ``phi`` is the sum of ``d2`` over all points
    (the same value ``cost(c, data)`` returns).

    Parameters
    ----------
    c : 2-D array, one center per row.
    data : iterable of points (rows) compatible with ``c``.
    l : oversampling factor.

    Returns
    -------
    1-D float numpy array with one entry per point in ``data``.
    """
    # Compute each nearest-center distance once and reuse it for both the
    # numerator and the normalizing cost. Forcing float keeps the division
    # a true division even for integer data under Python 2.
    nearest = np.array([min(dist_sq(c, d, axis=1)) for d in data], dtype=float)
    phi_temp = np.sum(nearest)
    return nearest * l / phi_temp


Overwriting smpl_prb_func.py


In [1]:
%%file weight_func_file.py

import numpy as np
from dist_sq_func import dist_sq

#weight function - proportional to the number of data points whose closest center is that specific center
def weight_func(c, data):
    """Return the normalized weight of every candidate center in ``c``.

    The weight of a center is the fraction of points in ``data`` whose
    nearest center (by squared distance) is that center.

    Parameters
    ----------
    c : 2-D array, one center per row.
    data : iterable of points (rows) compatible with ``c``.

    Returns
    -------
    1-D float numpy array of length ``len(c)`` that sums to 1.
    """
    # Index of the closest center in c for each point in data.
    nearest = np.array([np.argmin(dist_sq(c, d, axis=1)) for d in data],
                       dtype=np.intp)
    # Count the points per center in a single pass; minlength keeps an entry
    # (zero) for centers that are nobody's nearest.
    num_closest = np.bincount(nearest, minlength=len(c)).astype(float)
    # Normalize so the weights form a probability vector.
    return num_closest / np.sum(num_closest)


Writing weight_func_file.py


In [5]:
%%file KmeansParallel_func.py

from __future__ import division
import numpy as np
import sys
import sklearn.cluster
from dist_sq_func import dist_sq
from cost_func import cost
from smpl_prb_func import smpl_prb
from weight_func_file import weight_func

#Kmeans||
def KmeansParallel(n_clusters, data, l):
    """Run k-means on ``data`` with centers seeded by k-means|| initialization.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; must be a positive ``int``.
    data : 2-D numpy array
        One point per row (it is indexed with index/boolean arrays).
    l : number
        Oversampling factor; must be positive.

    Returns
    -------
    The fitted ``sklearn.cluster.KMeans`` object (``cluster_centers_``,
    ``labels_``, ``inertia_``).

    Invalid arguments terminate via ``sys.exit`` (``SystemExit``).
    """
    if n_clusters <= 0 or not(isinstance(n_clusters,int)):
        sys.exit("n_cluster is not positive integer")

    if l <= 0:
        sys.exit("l is not positive")

    if len(data) < n_clusters:
        sys.exit("number of data is less than n_clusters")

    num = len(data)

    #Step 1 - uniformly sample one point (kept 2-D: shape (1, n_features))
    c = data[np.random.choice(num, 1)]

    #Step 2 - cost of the initial single-center solution
    phi = cost(c, data)

    #Step 3~6 - oversample candidate centers for about log(phi) rounds.
    # np.log(0) is -inf, which has no meaningful int conversion, so a zero
    # cost simply runs zero rounds.
    n_rounds = int(np.ceil(np.log(phi))) if phi > 0 else 0
    for _ in range(n_rounds):
        picked = smpl_prb(c, data, l) > np.random.uniform(size=num)
        c = np.concatenate((c, data[picked]))

    #Step 7 - weight each candidate by the share of data points closest to it
    weight = weight_func(c, data)

    #Step 8 - recluster the weighted candidates with a kmeans++ style seeding.
    # The indices are drawn over the candidate set, so they must index c
    # (not data): weight has one entry per row of c.
    n_candidates = len(c)
    c_final = c[np.random.choice(n_candidates, size=1, p=weight)]
    for _ in range(n_clusters - 1):
        new_prb = smpl_prb(c_final, c, l) * weight
        # NOTE(review): if every candidate already coincides with a chosen
        # center, new_prb sums to 0 and the normalized p is NaN - confirm the
        # desired behavior when there are fewer distinct candidates than
        # n_clusters.
        chosen = np.random.choice(n_candidates, size=1,
                                  p=new_prb / np.sum(new_prb))
        c_final = np.concatenate((c_final, c[chosen]))

    #Do k-means with the selected initial centers
    km2 = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=1, init=c_final, max_iter=500, tol=0.0001)
    km2.fit(data)

    #return a KMeans type result - including: cluster_centers_, labels_, inertia_
    return km2
    

Overwriting KmeansParallel_func.py


##Multiple Processing Function File

In [1]:
%%file MP_func.py

import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial

##distance square function - we don't need the square root so we can save computation time
# euclidean distance
def dist_sq(a, b, axis=0):
    """Sum of squared element-wise differences of a and b along ``axis``."""
    delta = a - b
    return np.sum(delta ** 2, axis)

##minimum distance square between data and centers
def min_dist_sq(d, c, axis):
    """Return the smallest squared distance between point d and the centers c.

    ``d`` comes first so the function can be mapped over data points with
    ``c`` and ``axis`` bound by keyword.
    """
    to_centers = dist_sq(c, d, axis)
    return np.min(to_centers)

##cost function
# Version 4 - parallel computing with multiple cores
def cost_mc(c, data, axis=1):
    """Return the clustering cost of ``data`` w.r.t. centers ``c``, in parallel.

    The nearest-center squared distance of every point is computed by a
    pool of ``cpu_count()`` worker processes and the results are summed.

    Parameters
    ----------
    c : 2-D array, one center per row.
    data : iterable of points, each mapped to one worker call.
    axis : axis passed to ``min_dist_sq`` (default 1).
    """
    pool = Pool(processes=cpu_count())
    try:
        # bind c and the caller's axis; the pool supplies each data point
        partial_min_dist_sq = partial(min_dist_sq, c=c, axis=axis)
        nearest = pool.map(partial_min_dist_sq, data)
    finally:
        # always release the worker processes, even if map raised
        pool.terminate()
        pool.join()
    return np.sum(nearest)


#sampling probability function
# Version 4 - parallel computing with multiple cores
def smpl_prb_mc(c, data, l, axis):
    """Return the k-means|| sampling probabilities, computed in parallel.

    For each point the result is ``l * d2 / phi`` where ``d2`` is the
    squared distance to the nearest row of ``c`` and ``phi`` is the sum of
    ``d2`` over all points (the value ``cost_mc`` returns).

    Parameters
    ----------
    c : 2-D array, one center per row.
    data : iterable of points, each mapped to one worker call.
    l : oversampling factor.
    axis : axis passed to ``min_dist_sq``.

    Returns
    -------
    1-D float numpy array with one entry per point in ``data``.
    """
    pool = Pool(processes=cpu_count())
    try:
        # One pass over the data yields both the numerators and, summed,
        # the normalizing cost - no second pool is needed.
        partial_min_dist_sq2 = partial(min_dist_sq, c=c, axis=axis)
        nearest = pool.map(partial_min_dist_sq2, data)
    finally:
        # always release the worker processes, even if map raised
        pool.terminate()
        pool.join()
    # float dtype keeps the division a true division for integer data
    # under Python 2 (this module has no __future__ division import)
    nearest = np.array(nearest, dtype=float)
    phi_temp = np.sum(nearest)
    return nearest * l / phi_temp

##weight function 
# proportional to the number of data points whose closest center is that specific center
# Version 2
def weight_func(c, data):
    """Return the normalized weight of every candidate center in ``c``.

    A center's weight is the fraction of points in ``data`` whose nearest
    center (by squared distance) is that center.

    Parameters
    ----------
    c : 2-D array, one center per row.
    data : iterable of points (rows) compatible with ``c``.

    Returns
    -------
    1-D float numpy array of length ``len(c)`` that sums to 1.
    """
    # Index of the closest center in c for each point in data.
    nearest = np.array([np.argmin(dist_sq(c, d, axis=1)) for d in data],
                       dtype=np.intp)
    # Single-pass count per center; minlength keeps zero entries for centers
    # that no point is closest to.
    num_closest = np.bincount(nearest, minlength=len(c)).astype(float)
    # Normalize so the weights form a probability vector.
    return num_closest / np.sum(num_closest)


#Kmeans||
#l is oversampling factor

def KmeansParallel_mc(n_clusters, data, l):
    """Run k-means on ``data`` seeded by k-means||, using multi-core helpers.

    Same procedure as the single-process version, but the cost and the
    sampling probabilities are computed with ``cost_mc`` / ``smpl_prb_mc``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; must be a positive ``int``.
    data : 2-D numpy array
        One point per row (it is indexed with index/boolean arrays).
    l : number
        Oversampling factor; must be positive.

    Returns
    -------
    The fitted ``sklearn.cluster.KMeans`` object (``cluster_centers_``,
    ``labels_``, ``inertia_``).

    Invalid arguments terminate via ``sys.exit`` (``SystemExit``).
    """
    # Imported locally: this module's import block provides neither name.
    import sys
    import sklearn.cluster

    if n_clusters <= 0 or not(isinstance(n_clusters,int)):
        sys.exit("n_cluster is not positive integer")

    if l <= 0:
        sys.exit("l is not positive")

    if len(data) < n_clusters:
        sys.exit("number of data is less than n_clusters")

    num = len(data)

    #Step 1 - uniformly sample one point (kept 2-D: shape (1, n_features))
    c = data[np.random.choice(num, 1)]

    #Step 2 - cost of the initial single-center solution
    phi = cost_mc(c, data, axis=1)

    #Step 3~6 - oversample candidate centers for about log(phi) rounds.
    # np.log(0) is -inf, which has no meaningful int conversion, so a zero
    # cost simply runs zero rounds.
    n_rounds = int(np.ceil(np.log(phi))) if phi > 0 else 0
    for _ in range(n_rounds):
        picked = smpl_prb_mc(c, data, l, axis=1) > np.random.uniform(size=num)
        c = np.concatenate((c, data[picked]))

    #Step 7 - weight each candidate by the share of data points closest to it
    weight = weight_func(c, data)

    #Step 8 - recluster the weighted candidates with a kmeans++ style seeding.
    # The indices are drawn over the candidate set, so they must index c
    # (not data): weight has one entry per row of c.
    n_candidates = len(c)
    c_final = c[np.random.choice(n_candidates, size=1, p=weight)]
    for _ in range(n_clusters - 1):
        new_prb = smpl_prb_mc(c_final, c, l, axis=1) * weight
        # NOTE(review): if every candidate already coincides with a chosen
        # center, new_prb sums to 0 and the normalized p is NaN - confirm the
        # desired behavior when there are fewer distinct candidates than
        # n_clusters.
        chosen = np.random.choice(n_candidates, size=1,
                                  p=new_prb / np.sum(new_prb))
        c_final = np.concatenate((c_final, c[chosen]))

    #Do k-means with the selected initial centers
    km2 = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=1, init=c_final, max_iter=500, tol=0.0001)
    km2.fit(data)

    #return a KMeans type result - including: cluster_centers_, labels_, inertia_
    return km2
    

Overwriting MP_func.py


#Test File

In [12]:
%%file test_cost.py

import numpy as np
from numpy.testing import assert_almost_equal
from cost_func import cost
from MP_func import cost_mc

def test_non_negativity():
    """cost and cost_mc are never negative for one randomly chosen center."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        c = data[np.random.choice(range(10),1),]
        assert cost(c, data) >= 0
        assert cost_mc(c, data) >= 0

def test_full_data_zero():
    """Using every data point as a center gives a cost of exactly zero."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        c = data
        assert cost(c, data) == 0
        assert cost_mc(c, data) == 0

def test_c_more_cost_less():
    """Adding centers to a center set can never increase the cost."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        # four distinct centers, and the two-center subset made of the first two
        c_more = data[np.random.choice(range(10),4,replace=False),]
        c = c_more[:2,]
        assert cost(c_more, data) <= cost(c, data)
        assert cost_mc(c_more, data) <= cost_mc(c, data)


Overwriting test_cost.py


In [36]:
%%file test_dist_sq.py

import numpy as np
from numpy.testing import assert_almost_equal
from dist_sq_func import dist_sq

def test_non_negativity():
    """dist_sq is non-negative for random 3-vectors."""
    for _ in range(10):
        # size=3 draws a 3-vector; a bare positional 3 would be the mean
        # of a single scalar draw
        u = np.random.normal(size=3)
        v = np.random.normal(size=3)
        assert dist_sq(u, v) >= 0

def test_coincidence_when_zero():
    """Two identical (all-zero) vectors are at squared distance zero."""
    origin = np.zeros(3)
    same = np.zeros(3)
    assert dist_sq(origin, same) == 0

def test_coincidence_when_not_zero():
    """A random vector is at non-zero squared distance from the origin."""
    for _ in range(10):
        u = np.random.random(3)
        v = np.zeros(3)
        separation = dist_sq(u, v)
        assert separation != 0

def test_symmetry():
    """dist_sq gives the same value regardless of argument order."""
    for _ in range(10):
        u = np.random.random(3)
        v = np.random.random(3)
        forward = dist_sq(u, v)
        backward = dist_sq(v, u)
        assert forward == backward

def test_triangle():
    """The square roots of dist_sq satisfy the triangle inequality."""
    u = np.random.random(3)
    v = np.random.random(3)
    w = np.random.random(3)
    direct = np.sqrt(dist_sq(u, w))
    via_v = np.sqrt(dist_sq(u, v)) + np.sqrt(dist_sq(v, w))
    assert direct <= via_v

def test_known1():
    """1-D case: points 0 and 3 are at squared distance 9."""
    start = np.array([0])
    end = np.array([3])
    expected = 3**2
    assert_almost_equal(dist_sq(start, end), expected)

def test_known2():
    """3-4-5 triangle: (0, 0) and (3, 4) are at squared distance 25."""
    start = np.array([0,0])
    end = np.array([3, 4])
    expected = 5**2
    assert_almost_equal(dist_sq(start, end), expected)

def test_known3():
    """Negative coordinates: (0, 0) and (-3, -4) are at squared distance 25."""
    start = np.array([0,0])
    end = np.array([-3, -4])
    expected = 5**2
    assert_almost_equal(dist_sq(start, end), expected)

Overwriting test_dist_sq.py


In [3]:
%%file test_smpl_prb.py

import numpy as np
from numpy.testing import assert_almost_equal
from cost_func import cost
from smpl_prb_func import smpl_prb
from MP_func import smpl_prb_mc

def test_non_negativity():
    """Sampling probabilities from both implementations are never negative."""
    l = 3
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        c = data[np.random.choice(range(10),1),]
        # np.all replaces the np.alltrue alias, which newer NumPy removed
        assert np.all(smpl_prb(c,data,l) >= 0)
        assert np.all(smpl_prb_mc(c,data,l,axis=1) >= 0)
        

def test_sum_to_l():
    """The sampling probabilities of both implementations sum to l."""
    for i in range(10):
        l = i + 1
        data = np.random.normal(size=(10,2))
        c = data[np.random.choice(range(10),1),]
        # abs() makes the check two-sided: without it a sum far below l
        # would also pass
        assert abs(np.sum(smpl_prb(c,data,l)) - l) <= 1e-6
        assert abs(np.sum(smpl_prb_mc(c,data,l,axis=1)) - l) <= 1e-6

def test_in_c_zero():
    """A point that is itself a center has sampling probability zero."""
    l = 2
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        choice = np.random.choice(range(10),1)
        c = data[choice,]
        prb = smpl_prb(c,data,l)
        prb_mc = smpl_prb_mc(c,data,l,axis=1)
        # np.all replaces the np.alltrue alias, which newer NumPy removed
        assert np.all(prb[choice,] == 0)
        assert np.all(prb_mc[choice,] == 0)



Overwriting test_smpl_prb.py


In [3]:
%%file test_weight_func.py

import numpy as np
from numpy.testing import assert_almost_equal
from cost_func import cost
from weight_func_file import weight_func

def test_non_negativity():
    """Center weights are never negative."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        c = data[np.random.choice(range(10),1),]
        # np.all replaces the np.alltrue alias, which newer NumPy removed
        assert np.all(weight_func(c,data) >= 0)
        

def test_sum_to_1():
    """Center weights are normalized to sum to one."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        c = data[np.random.choice(range(10),1),]
        # abs() makes the check two-sided: without it a sum far below 1
        # would also pass
        assert abs(np.sum(weight_func(c,data)) - 1) <= 1e-6


Writing test_weight_func.py


In [5]:
%%file test_KmeansParallel.py

import numpy as np
import sys
from numpy.testing import assert_almost_equal
from KmeansParallel_func import KmeansParallel
from MP_func import KmeansParallel_mc

def test_level_label():
    """Both implementations produce exactly k distinct cluster labels."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        k = 3
        labels = KmeansParallel(n_clusters = k, data = data, l = 2*k).labels_
        assert len(set(labels)) == k
        labels_mc = KmeansParallel_mc(n_clusters = k, data = data, l = 2*k).labels_
        assert len(set(labels_mc)) == k

def test_num_label():
    """Both implementations assign one label to every data point."""
    for _ in range(10):
        data = np.random.normal(size=(10,2))
        k = 3
        labels = KmeansParallel(n_clusters = k, data = data, l = 2*k).labels_
        assert len(labels) == len(data)
        labels_mc = KmeansParallel_mc(n_clusters = k, data = data, l = 2*k).labels_
        assert len(labels_mc) == len(data)

        

Overwriting test_KmeansParallel.py


In [6]:
! py.test

platform linux2 -- Python 2.7.9 -- py-1.4.25 -- pytest-2.6.3
collected 18 items 
[0m
test_KmeansParallel.py ..
test_cost.py ...
test_dist_sq.py ........
test_smpl_prb.py ...
test_weight_func.py ..

