In [1]:
import numpy as np
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist
from sklearn.neighbors import NearestNeighbors, KDTree

In [47]:
num_data = 100
num_samples = num_data // 2
num_features = 2
num_centers = 4
rand_seed = 0

In [48]:
np.random.seed(rand_seed)
data = np.random.rand(num_data, num_features)

In [49]:
# K = rbf_kernel(data, data, 1.8)
# print(f"K: {K.shape}")
# np.dot(K[k, :], K[:, k]) / (K[k, k] + lambda_)

In [50]:
# make some fake data X and y
np.random.seed(0)
train_X = np.random.rand(100, 3)
train_y = np.random.randint(0,10, size=100)
# print(train_y.min(), train_y.max())
svc = SVC()
svc.fit(train_X, train_y)

SVC()

In [51]:
test_X = np.random.rand(1, 3)
svc.predict(test_X)

array([2])

In [52]:
def rbf_kernel(X, Y, sigma):
    N, K = X.shape
    M = Y.shape[0]

    K_xy = np.ones(M)*np.sum(X**2) + np.ones(N)*np.sum(Y**2) - 2*np.dot(X, Y.transpose())
    K_xy = np.exp(-0.5 * K_xy / sigma**2)

    return K_xy

In [53]:
K = rbf_kernel(data, data, 1.8)
print(f"K: {K.shape}")
print(K[:10])

K: (100, 100)
[[1.71466761e-09 1.66656512e-09 1.65321677e-09 1.74954823e-09
  1.70960115e-09 1.71470853e-09 1.80204387e-09 1.37665086e-09
  1.60898489e-09 1.84456260e-09 1.87865801e-09 1.71403276e-09
  1.56780576e-09 1.68403776e-09 1.59729894e-09 1.65549985e-09
  1.63415999e-09 1.53404248e-09 1.69596142e-09 1.81984291e-09
  1.56166663e-09 1.52173624e-09 1.73215239e-09 1.42259585e-09
  1.52516465e-09 1.61896652e-09 1.61337948e-09 1.43243586e-09
  1.57605878e-09 1.52398585e-09 1.40449758e-09 1.53735612e-09
  1.49642404e-09 1.56656877e-09 1.57072572e-09 1.74584147e-09
  1.79921586e-09 1.52540527e-09 1.43738290e-09 1.44016585e-09
  1.54294650e-09 1.57162268e-09 1.55728608e-09 1.48848065e-09
  1.80593167e-09 1.63178128e-09 1.59812878e-09 1.45913990e-09
  1.48019590e-09 1.53700604e-09 1.58851952e-09 1.86882826e-09
  1.58044015e-09 1.67362164e-09 1.70998202e-09 1.73491710e-09
  1.60404161e-09 1.67156231e-09 1.76111015e-09 1.80508995e-09
  1.68522315e-09 1.80848440e-09 1.63892009e-09 1.4307636

In [54]:
def halving(K, m, candidate_index=None, lambda_=0.001):
    
    n = K.shape[0]
    print(f'number of data: {n}')

    m = min(n, m)
    print(f'number of samples: {m}')

    if candidate_index is None:
        candidate_index = np.array(range(n))
    
    # print(f'candidate_index: {candidate_index}')

    q = len(candidate_index)

    index = np.empty(m, dtype=int)
    # print(f'number of index: {index.shape}')

    print('Selecting samples......')
    for i in range(m):
        score = np.zeros(q)
        for j in range(q):
            k = candidate_index[j]
            # print(k)
            score[j] = np.dot(K[k, :], K[:, k]) / (K[k, k] + lambda_)
        
        I = score.argmax()
        # print(I)
        index[i] = candidate_index[I]

        # update K
        # K = K - np.dot(K[:, index[i]], K[index[i], :]) / (K[index[i], index[i]] + lambda_)
        K = K - K[:, index[i]][:, np.newaxis] @ K[index[i], :][np.newaxis, :] / (K[index[i], index[i]] + lambda_)

    print('Done.\n')
    return index

In [55]:
# K_random = np.random.rand(10, 10)
id = halving(K, num_samples)
id

number of data: 100
number of samples: 50
Selecting samples......
Done.



array([82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
       82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
       82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82])

In [32]:
def number_density(data, center, radius):
    print(f'length of data: {len(data)}\nlength of center: {len(center)}')
    f = 0
    for i in range(len(data)):
        ball_dist = np.zeros(len(center))
        dist = np.ones(len(center))
        for j in range(len(center)):
            dist[j] = np.linalg.norm(data[i, :] - center[j, :])
            if dist[j] < radius:
                ball_dist[j] = dist[j]

        # print(np.exp(ball_dist/1.8))
        f += np.sum(np.exp(ball_dist/1.8)**2) / (len(ball_dist) + 1)
    
    return f

In [33]:
kmeans = KMeans(n_clusters=num_centers).fit(data)
center = kmeans.cluster_centers_

In [34]:
f = number_density(data, center, radius=0.25)
f

length of data: 80
length of center: 4


66.87375596462208

In [35]:
def SDAL(data, k):

    kmeans = KMeans(n_clusters=k).fit(data)
    center = kmeans.cluster_centers_

    radius = 0.25
    L, R = data.shape
    
    f = number_density(data, center, radius)
    T = 0
    while T<50:
        for j in range(k):
            ball = []
            dist = np.empty(L)
            for i in range(L):
                dist[i] = np.linalg.norm(data[i] - center[j])
                if dist[i] < radius:
                    ball.append(data[i])
            if len(ball)==0:
                center[j] = center[j]
            else:
                center[j] = np.mean(ball)

        F = number_density(data, center, radius)
        
        if F-f==0 or len(np.argwhere(pdist(center)<2*radius))>0:
            break
        else:
            f = F
        T+=1
        radius*=1.1
    
    tree = KDTree(data)
    _, idx = tree.query(center, k=1)
    # print(idx)
    center = data[idx].squeeze()
            
    return center

In [36]:
center = SDAL(data, num_centers)
center.shape

length of data: 80
length of center: 4
length of data: 80
length of center: 4


(4, 2)

In [37]:
import scipy.io
mat = scipy.io.loadmat('Syndata.mat')

In [16]:
type(mat['data'])

numpy.ndarray