In [22]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
import pandas as pd

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram), indexed as Z[row, col];
              columns 0 and 1 of a row hold the ids of the two merged nodes
            - N is the number of points given to the clustering process
            - cur_index is the id of the node whose subtree is traversed
              (ids below N are leaves; id N + i refers to row i of Z)
        output:
            - list with the leaf ids below cur_index, in the left-to-right
              order implied by the hierarchical tree Z

        seriation computes the order implied by a hierarchical tree (dendrogram).

        The tree is walked with an explicit stack rather than by recursion,
        so deep, chain-like trees over thousands of points do not exceed
        Python's recursion limit.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Leaf: emit it in visiting order.
            order.append(node)
        else:
            row = node - N
            # Push the right child first so that the left subtree is popped,
            # and therefore fully emitted, before the right one.
            pending.append(int(Z[row, 1]))
            pending.append(int(Z[row, 0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix (a numpy array, or
              anything np.asarray can convert to one)
            - method = ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat,
              but with re-ordered rows and columns
              according to the seriation, i.e. the
              order implied by the hierarchical tree
            - res_order is the order implied by
              the hierarchical tree
            - res_linkage is the hierarchical tree (dendrogram)
        raises:
            - ValueError if dist_mat is not a square 2-D matrix

        compute_serial_matrix transforms a distance matrix into
        a sorted distance matrix according to the order implied
        by the hierarchical tree (dendrogram)
    '''
    # Convert up front so the fancy indexing below works for any array-like.
    dist_mat = np.asarray(dist_mat)
    if dist_mat.ndim != 2 or dist_mat.shape[0] != dist_mat.shape[1]:
        raise ValueError(
            "dist_mat must be a square 2-D distance matrix, got shape %s"
            % (dist_mat.shape,)
        )

    N = len(dist_mat)
    flat_dist_mat = squareform(dist_mat)
    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
    # Node id N + i refers to row i of the linkage, so N + (N - 2) is the
    # id of its last row, used here as the root of the traversal.
    res_order = seriation(res_linkage, N, N + N-2)

    # Fill the strict upper triangle from the permuted input, then mirror it;
    # the diagonal stays at the zeros it was initialised with.
    order = np.asarray(res_order, dtype=np.intp)
    seriated_dist = np.zeros((N,N))
    a,b = np.triu_indices(N,k=1)
    seriated_dist[a,b] = dist_mat[order[a], order[b]]
    seriated_dist[b,a] = seriated_dist[a,b]

    return seriated_dist, res_order, res_linkage

In [25]:
# --- disabled alternative input: iris measurements in shuffled row order ---
# iris = datasets.load_iris()
# print(iris.data.shape)
# dist_mat = squareform(pdist(iris.data))
# N = len(iris.data)
# X = iris.data[np.random.permutation(N),:]
# dist_mat = squareform(pdist(X))
# print(np.random.permutation(N))
# print(dist_mat)

# --- active input: distance matrix read from a JSON file ---
dist_mat = buckets = np.array(
    pd.DataFrame(pd.read_json('../data/distance_matrix_arr_euclidean_100.json'))
)

# Both names refer to the same array, so the two printed shapes are identical.
print(buckets.shape)
print(dist_mat.shape)
# print(dist_mat[0])

(500, 500)
(500, 500)


In [26]:
# Run the seriation once per linkage method and print, for each one, the
# shape of the re-ordered matrix followed by the resulting order.
# After the loop, ordered_dist_mat / res_order / res_linkage hold the
# results of the last method in the list.
methods = ["ward", "single", "average", "complete"]
for method in methods:
    print("Method:\t", method)

    ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(dist_mat, method=method)
    print(ordered_dist_mat.shape, res_order, sep="\n")

Method:	 ward
(500, 500)
[49, 240, 200, 438, 144, 230, 12, 231, 32, 256, 350, 48, 114, 317, 301, 388, 407, 45, 67, 241, 164, 403, 36, 244, 229, 308, 80, 372, 160, 345, 76, 242, 292, 479, 3, 203, 272, 249, 279, 315, 392, 61, 47, 227, 341, 376, 290, 430, 464, 492, 273, 238, 280, 121, 496, 185, 197, 347, 378, 313, 461, 467, 37, 133, 146, 337, 495, 158, 42, 431, 116, 205, 178, 433, 328, 408, 498, 107, 181, 334, 27, 31, 257, 28, 251, 207, 351, 259, 401, 353, 191, 266, 173, 367, 127, 268, 64, 188, 371, 453, 265, 491, 209, 228, 463, 165, 365, 68, 77, 108, 211, 221, 212, 423, 220, 175, 179, 386, 123, 380, 245, 58, 14, 362, 63, 252, 8, 102, 166, 306, 387, 21, 333, 395, 443, 41, 87, 275, 323, 5, 182, 15, 426, 72, 483, 397, 422, 486, 55, 342, 69, 218, 452, 459, 271, 236, 478, 325, 399, 286, 307, 38, 51, 162, 143, 409, 82, 382, 66, 295, 105, 277, 451, 98, 262, 170, 329, 494, 148, 174, 319, 30, 145, 11, 74, 385, 189, 448, 402, 466, 171, 20, 167, 119, 361, 432, 109, 248, 360, 65, 1, 441, 193, 446, 3