In [127]:
from finch import finch
import scipy.io
import h5py
import numpy as np


In [128]:
file_path = "/media/fcheng/FINCH-Clustering/data/STL-10/data.mat"


with h5py.File(file_path, 'r') as f:
    data = f['data']
    data = np.array(data).T

In [129]:
from scipy.spatial import distance
from scipy.sparse import coo_matrix


In [130]:
## clustRank

mat = data 
s = mat.shape[0]
initial_rank = []

if initial_rank:
    orig_dist = None
elif s <= 70000:
    # orig_dist = pdist2(mat, mat, 'cosine')
    orig_dist = distance.cdist(mat, mat, metric='cosine')
    orig_dist[np.eye(orig_dist.shape[0], orig_dist.shape[1])>0] = np.inf
    d = np.min(orig_dist, axis=1)
    initial_rank = np.argmin(orig_dist, axis=1)
    min_sim = np.max(d)  # 最小相似性
else:
    print('finding exact neghbours via pdist is not fesable on ram with data size of %d points.\nUsing flann to compute 1st-neighbours at this step ...\n\n '%s)
    initial_rank, d = flann_nn(mat, 8)
    print('step flann done...')
    min_sim = np.max(d)
    orig_dist = None

# A = np.zeros(shape=(s, s))
# idx = np.stack([np.arange(s), initial_rank], axis=0).T
# A[idx[:, 0], idx[:, 1]] = 1
# A = A + np.eye(s, s)
# A = A * A.T
# A[np.eye(s, s)>0] = 0

A = coo_matrix((np.ones(s), (np.arange(s), initial_rank)), shape=(s, s))
A = A + coo_matrix((np.ones(s), (np.arange(s), np.arange(s))), shape=(s, s))
A = A @ A.T
A[coo_matrix((np.ones(s), (np.arange(s), np.arange(s))), shape=(s, s))>0] = 0

In [131]:
print(A.shape)
print(A[:10, :10])
print(orig_dist.shape)
print(data.shape)

(13000, 13000)
  (0, 0)	0.0
  (1, 1)	0.0
  (2, 2)	0.0
  (3, 3)	0.0
  (4, 4)	0.0
  (5, 5)	0.0
  (6, 6)	0.0
  (7, 7)	0.0
  (8, 8)	0.0
  (9, 9)	0.0
(13000, 13000)
(13000, 2048)


In [132]:
np.sum(A*orig_dist == np.nan)
temp = A*orig_dist

In [133]:
np.unique(temp)

array([0.04992016, 0.05044046, 0.05266028, ...,        nan,        nan,
              nan])

In [134]:
print(np.sum(temp == temp[0,0]))
temp[0,1]

0


0.6404325522053438

In [135]:
## get_clust

from scipy.sparse.csgraph import connected_components
from FINCH_Core.graph import mat2edgeGraph

_, labels_cc = connected_components(csgraph=A, directed=False, connection='weak', return_labels=True)

g = mat2edgeGraph(A)
cc = g.connectedComponents()
labels = np.zeros(A.shape[0])
for i, c in enumerate(cc):
    labels[c] = i

In [136]:
print(labels.shape, np.max(labels))
print(labels_cc.shape, np.max(labels_cc))
print(np.sum(labels.astype(np.int) == labels_cc))

(13000,) 2060.0
(13000,) 2060
13000


In [137]:
## get_merge
# from FINCH_Core.coolMean import coolMean

def getC(G, u):
    b, m, n = np.unique(G, return_index=True, return_inverse=True)
    G_ = u[n]
    return G_

def ind2vec(ind, N=None):
    ind = np.asarray(ind)
    if N is None: 
        N = ind.max() + 1
    return (np.arange(N) == ind[:,None]).astype(int)

def coolMean(M_, u):
    u_ = ind2vec(u.T).T
    nf = np.sum(u_, 1)
    idx = np.argsort(u)
    M = np.zeros(shape=(len(idx), M_.shape[1]))
    M = M_[idx, :]

    M = np.cumsum(np.concatenate([np.zeros((1, M.shape[1])), M]), axis=0)

    cnf = np.cumsum(nf)
    nf1 = [0] + (cnf).tolist()
    nf1 = np.array(nf1[:-1])
    s = np.stack([nf1, cnf], axis=1)
    
    M = M[np.array(s)[:, 1], :] - M[np.array(s)[:, 0], :]
    M = M / nf[:, None]

    return M

c = []
u = labels
#  data已经存在


In [138]:
# get_merge
u_ = ind2vec(u).T
num_clust = u_.shape[0]

if len(c):
    c = getC(c, u.T)
else:
    c = u.T

if num_clust <= 5e6:
    mat = coolMean(data, c)
else:
    print("resorting to approx combining method ...")

    # 这里是Fisrt 模式, .m中是last
    _, ic, _ = np.unique(c, return_index=True, return_inverse=True)

    mat = data[ic, :]

In [139]:
print(mat[:10, :10])

[[0.04093142 0.15441199 1.23327896 0.65436242 0.93097696 0.24027635
  0.52146819 0.28421423 0.08741484 0.00656254]
 [0.78801213 0.38923053 0.17731755 0.11491592 0.62110606 3.36836135
  0.06537916 0.36890557 0.10829236 0.04893236]
 [0.30743476 0.11307178 0.35239838 0.30884921 1.03607747 0.79648317
  0.16686233 0.14079956 0.24026253 0.21062335]
 [0.74710247 1.33104843 0.34513118 0.98153604 0.45060413 0.71767141
  0.29718454 0.83767126 3.18700963 0.03520721]
 [0.07266629 0.18395681 0.19053651 0.10359657 3.17704082 0.23914292
  0.3092468  0.49075881 0.00897    0.03298206]
 [0.28023019 0.14312471 0.1274875  0.70268275 0.2183824  2.26531586
  0.58831347 0.49654616 1.38862363 0.04738321]
 [0.20141239 0.13563699 0.31446827 0.10387727 5.89049594 0.08737619
  0.06358969 0.30360839 0.30314005 0.4703034 ]
 [0.10085571 0.17027033 0.27665633 0.10256788 0.71235251 1.13755859
  0.06933954 0.10267043 0.27801215 0.03590233]
 [0.33960968 0.25821274 0.08292947 0.42589353 0.48099163 1.01381814
  0.49558241

In [97]:
print(u_.shape, np.unique(u_))
print(u[209])
np.where(u_[:, 209]==1)
num_clust

(2061, 13000) [0 1]
190.0


2061

In [98]:
print(mat.shape)
print(c.shape)
print(u.shape)

(2061, 2048)
(13000,)
(13000,)


In [99]:
if orig_dist is not None:
    min_sim = np.max(orig_dist[A.toarray() > 0])
exit_clust = np.inf
c_ = c

num_clust = [num_clust]
k = 1

In [46]:
min_sim

0.6136007848862834

In [None]:
#### **************************** ####
## while-LOOP 进入循环
##   ............................   ##

In [47]:
exit_clust > 0

True

In [117]:
## coolMean 
# 返回的矩阵值不对
M = data
u = c
print(data.shape)
print(c.shape)
print(c[1000:1010])


(13000, 2048)
(13000,)
[694. 609. 695. 275.  40. 696.  30.  59. 697. 634.]


In [120]:
print(type(M), M.shape)
print(data.shape)

<class 'numpy.ndarray'> (2061, 2048)
(13000, 2048)


In [119]:
u_ = ind2vec(u.T).T
nf = np.sum(u_, 1)
idx = np.argsort(u)
M = M[idx, :]

M = np.cumsum(np.concatenate([np.zeros((1, M.shape[1])), M]), axis=0)

cnf = np.cumsum(nf)
nf1 = [0] + (cnf).tolist()
nf1 = np.array(nf1[:-1])
s = np.stack([nf1, cnf], axis=1)

M = M[np.array(s)[:, 1], :] - M[np.array(s)[:, 0], :]
M = M / nf[:, None]

In [113]:
print(idx[:100])

[    0  7070  7690  1350  8423     1 12244   410  8096 11878  8882  2818
     2 10840  1752  8812  4088  1212  2115  2112  6542  2269 12568 12704
 12449     3  4893  7549   302     4   846 10119  3659  5263  3384   118
  4669 11554  9779     5  7868  1640 12005     6  4642  8173     7 12574
  6847  2356  3489  2632 10050 10514  6474   359 12910 10402  4829  7084
  6185 10213 10263  3104 10144  9036   792  3021  8118  9063 11397 11276
   409  5847  1406  5647 10746 10262  3993 10898   122  7268 12594     8
  1883  5367  5238  4488 10678 11183     9   843  6119  1340  1496 10862
 10109  2447  4118    10]


In [114]:
print(M[:10, :10])

[[0.04093142 0.15441199 1.23327896 0.65436242 0.93097696 0.24027635
  0.52146819 0.28421423 0.08741484 0.00656254]
 [0.78801213 0.38923053 0.17731755 0.11491592 0.62110606 3.36836135
  0.06537916 0.36890557 0.10829236 0.04893236]
 [0.30743476 0.11307178 0.35239838 0.30884921 1.03607747 0.79648317
  0.16686233 0.14079956 0.24026253 0.21062335]
 [0.74710247 1.33104843 0.34513118 0.98153604 0.45060413 0.71767141
  0.29718454 0.83767126 3.18700963 0.03520721]
 [0.07266629 0.18395681 0.19053651 0.10359657 3.17704082 0.23914292
  0.3092468  0.49075881 0.00897    0.03298206]
 [0.28023019 0.14312471 0.1274875  0.70268275 0.2183824  2.26531586
  0.58831347 0.49654616 1.38862363 0.04738321]
 [0.20141239 0.13563699 0.31446827 0.10387727 5.89049594 0.08737619
  0.06358969 0.30360839 0.30314005 0.4703034 ]
 [0.10085571 0.17027033 0.27665633 0.10256788 0.71235251 1.13755859
  0.06933954 0.10267043 0.27801215 0.03590233]
 [0.33960968 0.25821274 0.08292947 0.42589353 0.48099163 1.01381814
  0.49558241

In [75]:
print(nf1)

[    0     4     6 ... 12994 12996 12998]
