In [29]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
import pandas as pd

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram) given as a 2-D array;
              row (k - N) holds, in columns 0 and 1, the ids of the two
              children merged to form cluster id k
            - N is the number of points given to the clustering process
              (ids below N are leaves, i.e. original points)
            - cur_index is the id of the node whose subtree is traversed
        output:
            - list of leaf ids in the left-to-right order implied by the
              subtree of Z rooted at cur_index

        seriation computes the order implied by a hierarchical tree (dendrogram).

        The traversal uses an explicit stack rather than recursion, so very
        deep (chain-like) trees do not hit Python's recursion limit, and the
        result is built with appends instead of repeated list concatenation.
    '''
    # A leaf is its own ordering; it is returned as given, without conversion.
    if cur_index < N:
        return [cur_index]

    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            order.append(node)
            continue
        row = node - N
        # Push the right child first so the left subtree is emitted first,
        # which reproduces a depth-first, left-before-right traversal.
        pending.append(int(Z[row, 1]))
        pending.append(int(Z[row, 0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix (it is indexed with
              NumPy-style fancy indexing, so a NumPy array is expected)
            - method is the linkage method passed to the clustering,
              e.g. one of ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat, but with rows and
              columns re-ordered according to the seriation, i.e. the
              order implied by the hierarchical tree
            - res_order is the order implied by the hierarchical tree
            - res_linkage is the hierarchical tree (dendrogram)

        compute_serial_matrix transforms a distance matrix into a sorted
        distance matrix according to the order implied by the hierarchical
        tree (dendrogram).
    '''
    n_points = len(dist_mat)

    # Cluster on the condensed (flattened) form of the distance matrix.
    condensed = squareform(dist_mat)
    res_linkage = linkage(condensed, method=method, preserve_input=True)

    # The traversal starts from node id 2*N - 2, which is used as the root.
    root_id = n_points + n_points - 2
    res_order = seriation(res_linkage, n_points, root_id)

    # Fill the strict upper triangle with the permuted distances, then mirror
    # it into the lower triangle; the diagonal is left at zero.
    perm = np.asarray(res_order, dtype=np.intp)
    rows, cols = np.triu_indices(n_points, k=1)
    seriated_dist = np.zeros((n_points, n_points))
    seriated_dist[rows, cols] = dist_mat[perm[rows], perm[cols]]
    seriated_dist[cols, rows] = seriated_dist[rows, cols]

    return seriated_dist, res_order, res_linkage

In [30]:
# iris = datasets.load_iris()
# print(iris.data.shape)

# dist_mat = squareform(pdist(iris.data))

# N = len(iris.data)

# X = iris.data[np.random.permutation(N),:]

# dist_mat = squareform(pdist(X))

# print(np.random.permutation(N))
# print(dist_mat)

# new data

# Load the precomputed distance matrix from JSON into a NumPy array.
# pd.read_json already yields a DataFrame, so it is converted directly.
buckets = np.array(pd.read_json('../data/distance_matrix_arr_euclidean_100.json'))
print(buckets.shape)

# dist_mat is the name the clustering cells below operate on.
dist_mat = buckets

print(dist_mat.shape)
# print(dist_mat[0])

(500, 500)
(500, 500)


In [31]:
# Run the seriation once per linkage method and report the resulting order.
# Note: the result variables are overwritten on every iteration, so after the
# loop they hold the values for the last method only.
methods = ["ward", "single", "average", "complete"]
for method in methods:
    print("Method:\t", method)

    ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(
        dist_mat, method=method
    )
    print(ordered_dist_mat.shape)
    print(res_order)

Method:	 ward
(500, 500)
[71, 242, 279, 127, 292, 426, 176, 291, 57, 59, 230, 293, 196, 482, 421, 184, 445, 299, 232, 316, 413, 255, 371, 360, 112, 429, 328, 423, 64, 473, 141, 146, 253, 342, 152, 330, 186, 220, 13, 78, 172, 111, 219, 4, 209, 100, 306, 327, 415, 18, 389, 203, 167, 442, 248, 460, 53, 348, 191, 398, 91, 305, 74, 378, 90, 206, 343, 28, 63, 46, 498, 359, 22, 350, 280, 324, 142, 307, 304, 265, 358, 365, 486, 257, 73, 244, 97, 116, 469, 400, 278, 386, 457, 494, 30, 376, 7, 45, 10, 315, 122, 262, 134, 47, 135, 2, 221, 446, 313, 224, 407, 410, 461, 79, 62, 479, 297, 338, 185, 259, 420, 436, 34, 147, 294, 380, 115, 263, 87, 192, 8, 98, 273, 333, 33, 81, 9, 187, 282, 474, 249, 258, 121, 340, 329, 214, 345, 290, 17, 246, 487, 381, 197, 388, 403, 471, 117, 399, 418, 95, 363, 86, 470, 223, 287, 199, 464, 483, 42, 437, 60, 495, 314, 344, 157, 201, 239, 472, 288, 300, 84, 124, 213, 438, 431, 228, 390, 416, 49, 491, 226, 205, 207, 309, 439, 285, 160, 310, 65, 128, 430, 298, 367, 402, 