In [5]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
import pandas as pd

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram) in linkage-matrix form:
              for a node id k >= N, columns 0 and 1 of row k-N hold the ids
              of its left and right children
            - N is the number of points given to the clustering process
              (ids below N are leaves, i.e. original points)
            - cur_index is the id of the node whose subtree is traversed
              (2*N-2 is the root when Z has N-1 rows)
        output:
            - order implied by the hierarchical tree Z: the list of leaf ids
              below cur_index, left subtree before right subtree

        seriation computes the order implied by a hierarchical tree (dendrogram).

        The traversal uses an explicit stack instead of recursion, so very
        unbalanced (chain-like) trees with thousands of points do not hit
        Python's recursion limit, and the result is built in linear time.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Leaf: an original observation.
            order.append(node)
        else:
            row = node - N
            # Push the right child first so the left child is popped (and
            # therefore emitted) first, matching a left-to-right traversal.
            pending.append(int(Z[row, 1]))
            pending.append(int(Z[row, 0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix (N x N); any array-like
              accepted by np.asarray works (ndarray, nested lists, DataFrame)
            - method = ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat,
              but with re-ordered rows and columns
              according to the seriation, i.e. the
              order implied by the hierarchical tree.
              It is rebuilt from the upper triangle only, so it is
              symmetric with a zero diagonal.
            - res_order is the order implied by
              the hierarchical tree (list of point indices)
            - res_linkage is the hierarchical tree (dendrogram)

        compute_serial_matrix transforms a distance matrix into
        a sorted distance matrix according to the order implied
        by the hierarchical tree (dendrogram)
    '''
    # Normalise the input once so the fancy indexing below also works for
    # lists and DataFrames (no copy is made for an ndarray).
    dist_mat = np.asarray(dist_mat)
    N = len(dist_mat)

    # The clustering routine takes the condensed (flattened upper-triangle) form.
    flat_dist_mat = squareform(dist_mat)
    # preserve_input=True is forwarded to fastcluster; presumably it keeps
    # flat_dist_mat from being overwritten during clustering -- see fastcluster docs.
    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)

    # Node id 2*N-2 is the last merge, i.e. the root of the tree.
    res_order = seriation(res_linkage, N, N + N-2)
    order = np.asarray(res_order, dtype=np.intp)

    seriated_dist = np.zeros((N,N))
    a,b = np.triu_indices(N,k=1)
    # Vectorised lookup: entry (a, b) of the result is the distance between
    # the points that the seriation places at positions a and b.
    seriated_dist[a,b] = dist_mat[order[a], order[b]]
    # Mirror the upper triangle into the lower one.
    seriated_dist[b,a] = seriated_dist[a,b]

    return seriated_dist, res_order, res_linkage

In [20]:
# Load the precomputed pairwise distance matrix ("new data").
# (An earlier version of this cell built dist_mat from a randomly permuted
# iris dataset via squareform(pdist(X)); that experiment has been removed.)
# pd.read_json already returns a DataFrame, so it is converted to an ndarray directly.
buckets = np.array(pd.read_json('../data/distance_matrix_arr.json'))
print(buckets.shape)

dist_mat = buckets

# compute_serial_matrix needs a square (N x N) matrix; fail early if the file is not one.
assert dist_mat.ndim == 2 and dist_mat.shape[0] == dist_mat.shape[1], "distance matrix must be square"

print(dist_mat.shape)

(500, 500)
(500, 500)
0.0 1.2524350009590761
1.117926766738241


In [21]:
# Seriate the distance matrix once per linkage method.
# Every method's result is kept in serial_results (keyed by method name) so
# the methods can be compared afterwards; ordered_dist_mat / res_order /
# res_linkage still hold the result of the last method after the loop.
methods = ["ward","single","average","complete"]
serial_results = {}
for method in methods:
    print("Method:\t",method)
    
    ordered_dist_mat, res_order, res_linkage = compute_serial_matrix(dist_mat,method)
    serial_results[method] = (ordered_dist_mat, res_order, res_linkage)
    print(ordered_dist_mat.shape)
    print(res_order)

Method:	 ward
(500, 500)
[456, 405, 472, 467, 5, 209, 194, 322, 177, 419, 221, 338, 391, 399, 448, 127, 162, 190, 371, 403, 133, 324, 401, 427, 93, 247, 272, 294, 493, 51, 400, 15, 237, 320, 143, 255, 219, 260, 252, 311, 126, 473, 161, 410, 290, 426, 444, 47, 178, 463, 462, 474, 443, 96, 189, 188, 442, 458, 198, 101, 168, 61, 131, 361, 358, 445, 135, 298, 384, 295, 421, 465, 7, 160, 54, 422, 179, 490, 152, 84, 344, 452, 43, 407, 441, 279, 437, 22, 77, 83, 17, 278, 73, 78, 109, 359, 81, 107, 343, 102, 92, 104, 241, 86, 267, 226, 310, 218, 253, 259, 340, 428, 232, 418, 38, 205, 35, 151, 95, 197, 307, 191, 485, 4, 297, 113, 346, 395, 370, 396, 106, 424, 63, 280, 234, 254, 342, 171, 59, 224, 11, 203, 120, 146, 117, 208, 65, 417, 94, 313, 412, 12, 238, 285, 469, 98, 277, 57, 85, 114, 376, 281, 274, 369, 122, 164, 282, 367, 119, 366, 261, 323, 72, 425, 97, 123, 286, 64, 111, 293, 377, 115, 468, 300, 326, 353, 29, 284, 41, 105, 319, 222, 328, 165, 121, 257, 200, 354, 23, 488, 225, 139, 416, 2