# Tests

In [4]:
%%file kmeans_fortest.py

import numpy as np
import numpy.random as nrd
#centroid
# def Centroid(Y):
#     """Y is a subset of the dataset"""
#     return np.average(Y,axis=0)

def Cost(C, Y):
    """C is a subset of the dataset, Y can be a point or a subset"""
    if  len(Y.shape)==1 or Y.shape[0]==1:
        #Y is a point
        MinIndex = np.argmin(np.sum((Y-C)**2,axis=1))
        return np.sum((Y-C[MinIndex,])**2)
    else:
        return np.sum([Cost(C,Y_i) for Y_i in Y])

def weight(C, data):
    """C is the centroid set and data is the target data set"""
    if len(C.shape)==1 or C.shape[0]==1:
        #C only have one point
        if len(data.shape)==1 or data.shape[0]==1:
            return np.array([1])
        else:
            return np.array([len(data)])
    else:
        #the cloest center for each point in data
        Index_min = [np.argmin(np.sum((x-C)**2,axis=1)) for x in data]
        #frequency for each center
        return np.array([Index_min.count(i) for i in range(len(C))]).astype(float)

def kmeanspar(k,l,r,data):
    """k is the number of centers, l is the expected number of intermediate points
    in each iteration, r is the number of iterations, data is the target data set"""
    #l*r should be larger than k in case k-means|| select too few points
    if l*r < k:
        raise ValueError('r or l must be bigger, ')
    #if k is too large
    if k >= len(data):
        raise ValueError('k is too large')
    #Step 1: choose one point randomly
    C = data[nrd.choice(range(len(data)),1),]
    #for loop
    for i in range(r):
        prob = [l*Cost(C,x) for x in data]/Cost(C,data)
        flag = nrd.uniform(size=len(data))
        C = np.concatenate((C,data[prob>=flag,]))
    #step 7
    weights = weight(C,data)
    #step 8: k-means++ to choose weighted points
    c = C[nrd.choice(range(len(C)),1),]
    while len(c) < k:
        p = np.array([Cost(c,x) for x in C])
        Prob = p*weights/np.sum(p*weights)
        x = nrd.choice(range(len(C)),1,p=Prob)
        c = np.concatenate((c,C[x,]))
    #change the return to only test initialization part
    return c

def Random(k,data):
    """k is the number of centers, data is target data"""
    if k >= len(data):
        raise ValueError('k is too large')
    return data[np.random.choice(len(data),k,replace=False),:]

def kmeansplus(k,data):
    if k >= len(data):
        raise ValueError('k is too large')
    #Step 1
    C = data[nrd.choice(range(len(data)),1),]
    #while loop
    while len(C) < k:
        prob = ([Cost(C,x) for x in data]/Cost(C,data)).reshape(len(data))
        x = nrd.choice(range(len(data)),1,p=prob)
        C = np.concatenate((C,data[x,]))
    return C

Writing kmeans_fortest.py


In [5]:
%%file test_cost.py

import numpy as np
import numpy.random as nrd
from numpy.testing import assert_equal
from kmeans_fortest import Cost

# def test_Centroid_dimension():
   
# def test_Centroid_when_same():
    
# def test_Centroid_when_different():
    
# def test_Centroid_known():

def test_Cost_integer1():
    Y=np.array([1,2,3])
    C=np.array([[1,2,3],[3,4,2],[5,2,1]])
    assert Cost(C,Y) == 0
    
def test_Cost_integer1():
    Y=C=np.array([[1,2,3],[3,4,2],[5,2,1]])
    assert Cost(C,Y) == 0
    

def test_Cost_non_negative1():
    for i in range(10):
        Y=nrd.multivariate_normal([0,0,0],5*np.identity(3),4)
        C=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
        assert Cost(C,Y) >= 0
        
def test_Cost_non_negative2():
    for i in range(10):
        Y=nrd.multivariate_normal([0,0,0],5*np.identity(3),1)
        C=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
        assert Cost(C,Y) >= 0
    
def test_Cost_zero1():
    Y=C=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
    assert Cost(C,Y) == 0
    
def test_Cost_zero2():
    C=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
    Y=C[0,]
    assert Cost(C,Y) == 0
    
def test_Cost_known1():
    C1=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
    C2=C1[0:1,]
    Y=nrd.multivariate_normal([0,0,0],5*np.identity(3),4)
    assert Cost(C1,Y) <= Cost(C2,Y)
    

Overwriting test_cost.py


In [6]:
%%file test_weight.py

import numpy as np
import numpy.random as nrd
from numpy.testing import assert_equal
from kmeans_fortest import Cost,weight

def test_weight_one_point():
    C=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
    assert_equal(weight(C[0,],C),np.array([3]))
def test_weight_known1():
    C=nrd.multivariate_normal([0,0,0],5*np.identity(3),1)
    assert_equal(weight(C,C),np.array([1]))

def test_weight_known2():
    C=nrd.multivariate_normal([0,0,0],5*np.identity(3),3)
    assert_equal(weight(C,C),np.array([1,1,1]))

def test_weight_zero1():
    C=nrd.multivariate_normal([0,0,0],5*np.identity(3),5)
    X=C[0:4,]
    assert_equal(weight(C,X),np.array([1,1,1,1,0]))
    
def test_weight_zero1():
    C=nrd.multivariate_normal([0,0,0],5*np.identity(3),5)
    X=C[1:5,]
    assert_equal(weight(C,X),np.array([0,1,1,1,1]))
    
def test_weight_integer1():
    X=np.array([[1,1,1],[2,3,4],[1,1,1]])
    C=X[0:2,]
    assert_equal(weight(C,X),np.array([2,1]))

Overwriting test_weight.py


In [7]:
%%file test_kmeanspar.py

import numpy as np
import numpy.random as nrd
from numpy.testing import assert_equal,assert_raises
from kmeans_fortest import Cost,weight,kmeanspar

k = 5
centers = nrd.multivariate_normal([0,0,0],5*np.identity(3),k)
data = [nrd.multivariate_normal(center, np.identity(3),10) for center in centers]
data = np.vstack(data)

def test_kmeanspar_len1():
    assert_equal(len(kmeanspar(2,2,5,data)),2)

def test_kmeanspar_len2():
    assert_equal(len(kmeanspar(2,1,5,data)),2)

def test_kmeanspar_len3():
    assert_equal(len(kmeanspar(5,3,5,data)),5)

def test_kmeanspar_inside1():
    assert_equal(all([d in data for d in kmeanspar(5,1,10,data)]),True)
    
def test_kmeanspar_inside2():
    assert_equal(all([d in data for d in kmeanspar(10,3,5,data)]),True)
    
def test_kmeanspar_throws_exception():
    assert_raises(ValueError,kmeanspar,500,100,100,data)
    
def test_kmeanspar_throws_exception2():
    assert_raises(ValueError,kmeanspar,5,1,3,data)
    

Overwriting test_kmeanspar.py


In [8]:
%%file test_Random.py

import numpy as np
import numpy.random as nrd
from numpy.testing import assert_equal,assert_raises
from kmeans_fortest import Random

k = 5
centers = nrd.multivariate_normal([0,0,0],5*np.identity(3),k)
data = [nrd.multivariate_normal(center, np.identity(3),10) for center in centers]
data = np.vstack(data)

def test_Random_len():
    assert_equal(len(Random(10,data)),10)

def test_Random_inside():
    assert_equal(all([d in data for d in Random(5,data)]),True)
    
def test_Random_throws_exception():
    assert_raises(ValueError,Random,500,data)

Overwriting test_Random.py


In [9]:
%%file test_kmeansplus.py
import numpy as np
import numpy.random as nrd
from numpy.testing import assert_equal,assert_raises
from kmeans_fortest import Cost,kmeansplus

k = 5
centers = nrd.multivariate_normal([0,0,0],5*np.identity(3),k)
data = [nrd.multivariate_normal(center, np.identity(3),10) for center in centers]
data = np.vstack(data)

def test_kmeansplus_len1():
    assert_equal(len(kmeansplus(10,data)),10)

def test_kmeansplus_len2():
    assert_equal(len(kmeansplus(7,data)),7)

def test_kmeansplus_inside1():
    assert_equal(all([d in data for d in kmeansplus(5,data)]),True)
    
def test_kmeansplus_inside2():
    assert_equal(all([d in data for d in kmeansplus(1,data)]),True)
    
def test_kmeansplus_throws_exception():
    assert_raises(ValueError,kmeansplus,100,data)

Overwriting test_kmeansplus.py


In [10]:
! py.test

platform linux2 -- Python 2.7.9 -- py-1.4.25 -- pytest-2.6.3
collected 36 items 
[0m
test_Random.py ...
test_cost.py ......
test_kmeans.py ..........
test_kmeanspar.py .......
test_kmeansplus.py .....
test_weight.py .....

