In [38]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
import pandas as pd

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram) given as a 2-D linkage
              array: row k describes the cluster with id N + k, and its
              first two columns hold the ids of the two merged children
            - N is the number of points given to the clustering process
            - cur_index is the id of the node whose subtree is traversed
              (pass 2*N - 2 for the root of a full tree)
        output:
            - list of leaf ids (< N) in the left-to-right order implied
              by the hierarchical tree Z

        seriation computes the order implied by a hierarchical tree
        (dendrogram).

        The traversal uses an explicit stack instead of recursion: a
        chain-like tree can be close to N levels deep, which would exceed
        Python's recursion limit for moderately large N.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # ids below N are original observations, i.e. leaves
            order.append(node)
        else:
            row = node - N
            # push the right child first so the left child is popped
            # (and therefore emitted) first, keeping left-to-right order
            pending.append(int(Z[row, 1]))
            pending.append(int(Z[row, 0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix (any array-like that
              np.asarray can turn into a 2-D array)
            - method = ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat,
              but with re-ordered rows and columns
              according to the seriation, i.e. the
              order implied by the hierarchical tree
            - res_order is the order implied by
              the hierarchical tree
            - res_linkage is the hierarchical tree (dendrogram)

        compute_serial_matrix transforms a distance matrix into
        a sorted distance matrix according to the order implied
        by the hierarchical tree (dendrogram)
    '''
    # Coerce once so the integer-array indexing below also works for
    # array-likes such as nested lists or a DataFrame.
    dist_mat = np.asarray(dist_mat)
    N = len(dist_mat)
    # linkage works on the condensed (flattened upper-triangle) form
    flat_dist_mat = squareform(dist_mat)
    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
    # the linkage has N-1 merge rows, so the root cluster has id 2N-2
    res_order = seriation(res_linkage, N, N + N-2)
    order = np.asarray(res_order, dtype=np.intp)

    seriated_dist = np.zeros((N,N))
    a,b = np.triu_indices(N,k=1)
    # fill the strict upper triangle with the permuted distances using
    # vectorized indexing, then mirror it; the diagonal stays zero
    seriated_dist[a,b] = dist_mat[order[a], order[b]]
    seriated_dist[b,a] = seriated_dist[a,b]

    return seriated_dist, res_order, res_linkage

In [39]:
# iris = datasets.load_iris()
# print(iris.data.shape)

# dist_mat = squareform(pdist(iris.data))

# N = len(iris.data)

# X = iris.data[np.random.permutation(N),:]

# dist_mat = squareform(pdist(X))

# print(np.random.permutation(N))
# print(dist_mat)

# Load the precomputed pairwise distance matrix from JSON into a plain
# NumPy array (presumably Wasserstein distances, going by the file name --
# TODO confirm against whatever produced the file).
buckets = np.array(pd.read_json('../data/distance_matrix_arr_wasserstein_100.json'))
print(buckets.shape)

# `dist_mat` is the name the seriation cell reads; this is an alias of
# `buckets`, not a copy.
dist_mat = buckets

print(dist_mat.shape)
# print(dist_mat[0])

(500, 500)
(500, 500)


In [40]:
# Run the seriation once per linkage criterion and report the resulting
# matrix shape and leaf order for each.
# NOTE(review): Ward linkage is documented as well-defined for Euclidean
# distances -- confirm it is meaningful for this distance matrix.
methods = ["ward", "single", "average", "complete"]
for method in methods:
    print("Method:\t", method)

    # the last iteration's results remain bound to these names after the loop
    ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(
        dist_mat,
        method,
    )
    print(ordered_dist_mat.shape)
    print(res_order)

Method:	 ward
(500, 500)
[310, 343, 491, 336, 464, 230, 256, 296, 191, 314, 173, 496, 271, 190, 281, 52, 457, 339, 353, 290, 305, 321, 167, 494, 63, 307, 168, 38, 469, 123, 73, 458, 193, 272, 75, 18, 287, 140, 135, 447, 133, 342, 499, 14, 223, 39, 477, 428, 459, 88, 424, 197, 24, 113, 214, 294, 303, 65, 351, 161, 304, 212, 445, 374, 57, 219, 181, 356, 222, 430, 397, 264, 358, 326, 492, 385, 407, 323, 92, 453, 357, 48, 392, 15, 232, 72, 484, 94, 25, 414, 260, 262, 316, 103, 441, 322, 129, 344, 144, 172, 141, 390, 431, 160, 362, 77, 258, 37, 465, 96, 179, 379, 164, 460, 87, 209, 434, 118, 119, 51, 341, 350, 236, 76, 355, 122, 187, 376, 456, 152, 462, 419, 208, 114, 372, 111, 44, 211, 6, 53, 267, 432, 393, 79, 490, 143, 242, 148, 395, 61, 192, 112, 381, 404, 263, 486, 157, 301, 205, 450, 47, 493, 308, 218, 278, 49, 203, 3, 373, 80, 402, 7, 104, 289, 29, 335, 269, 337, 489, 274, 34, 279, 487, 89, 283, 297, 329, 23, 224, 56, 198, 331, 364, 299, 246, 334, 32, 378, 389, 495, 417, 483, 162, 41