In [27]:
import numpy as np
# from sklearn.metrics.pairwise import rbf_kernel
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist
from sklearn.neighbors import NearestNeighbors, KDTree

In [2]:
np.random.seed(3)
data = np.random.rand(100, 2)

In [3]:
# K = rbf_kernel(data, data, 1.8)
# print(f"K: {K.shape}")
# np.dot(K[k, :], K[:, k]) / (K[k, k] + lambda_)

In [14]:
# make some fake data X and y
np.random.seed(0)
train_X = np.random.rand(100, 3)
train_y = np.random.randint(0,10, size=100)
# print(train_y.min(), train_y.max())
svc = SVC()
svc.fit(train_X, train_y)

SVC()

In [15]:
test_X = np.random.rand(1, 3)
svc.predict(test_X)

array([2])

In [4]:
def rbf_kernel(X, Y, sigma):
    N, K = X.shape
    M = Y.shape[0]

    K_xy = np.ones(M)*np.sum(X**2) + np.ones(N)*np.sum(Y**2) - 2*np.dot(X, Y.transpose())
    K_xy = np.exp(-0.5 * K_xy / sigma**2)

    return K_xy

In [7]:
K = rbf_kernel(data, data, 1.8)
print(f"K: {K.shape}")
# print(K[:10])

K: (100, 100)


In [8]:
def halving(K, m, candidate_index=None, lambda_=0.001):
    
    n = K.shape[0]
    print(f'number of data: {n}')

    m = min(n, m)
    print(f'number of samples: {m}')

    if candidate_index is None:
        candidate_index = np.array(range(n))
    
    print(f'candidate_index: {len(candidate_index)}')

    q = len(candidate_index)

    index = np.empty(m, dtype=int)
    print(f'number of index: {index.shape}')

    print('Selecting samples......')
    for i in range(m):
        score = np.zeros(q)
        for j in range(q):
            k = candidate_index[j]
            # print(k)
            score[j] = np.dot(K[k, :], K[:, k]) / (K[k, k] + lambda_)
        
        I = score.argmax()
        # print(I)
        index[i] = candidate_index[I]

        # update K
        K = K - np.dot(K[:, index[i]], K[index[i], :]) / (K[index[i], index[i]] + lambda_)

    print('Done.\n')
    return index

In [9]:
id = halving(K, 4)
id

number of data: 100
number of samples: 4
candidate_index: 100
number of index: (4,)
Selecting samples......
Done.



array([2, 2, 2, 2])

In [10]:
def number_density(data, center, radius):
    print(f'length of data: {len(data)}\nlength of center: {len(center)}')
    f = 0
    for i in range(len(data)):
        ball_dist = np.zeros(len(center))
        dist = np.ones(len(center))
        for j in range(len(center)):
            dist[j] = np.linalg.norm(data[i, :] - center[j, :])
            if dist[j] < radius:
                ball_dist[j] = dist[j]

        # print(np.exp(ball_dist/1.8))
        f += np.sum(np.exp(ball_dist/1.8)**2) / (len(ball_dist) + 1)
    
    return f

In [11]:
kmeans = KMeans(n_clusters=4).fit(data)
center = kmeans.cluster_centers_

In [12]:
f = number_density(data, center, radius=0.25)
f

length of data: 100
length of center: 4


83.2588001745248

In [33]:
def SDAL(data, k):

    kmeans = KMeans(n_clusters=k).fit(data)
    center = kmeans.cluster_centers_

    radius = 0.25
    L, R = data.shape
    
    f = number_density(data, center, radius)
    T = 0
    while T<50:
        for j in range(k):
            ball = []
            dist = np.empty(L)
            for i in range(L):
                dist[i] = np.linalg.norm(data[i] - center[j])
                if dist[i] < radius:
                    ball.append(data[i])
            if len(ball)==0:
                center[j] = center[j]
            else:
                center[j] = np.mean(ball)

        F = number_density(data, center, radius)
        
        if F-f==0 or len(np.argwhere(pdist(center)<2*radius))>0:
            break
        else:
            f = F
        T+=1
        radius*=1.1
    
    tree = KDTree(data)
    _, idx = tree.query(center, k=1)
    # print(idx)
    center = data[idx].squeeze()
            
    return center

In [34]:
center = SDAL(data, 4)
center.shape

length of data: 100
length of center: 4
length of data: 100
length of center: 4
[[55]
 [55]
 [28]
 [21]]


(4, 2)