In [22]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
import pandas as pd

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram) in linkage-matrix form:
              columns 0 and 1 of row i hold the ids of the two nodes that
              were merged to create node N+i
            - N is the number of points given to the clustering process
            - cur_index is the id of the node whose subtree is traversed
              (ids below N are leaves, i.e. original points)
        output:
            - list with the leaf ids of the subtree rooted at cur_index,
              in left-to-right order

        seriation computes the order implied by a hierarchical tree (dendrogram).

        The traversal is done with an explicit stack instead of recursion so
        that deep, chain-like trees do not exceed Python's recursion limit.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Leaf: an original observation.
            order.append(node)
        else:
            # Internal node: push the right child first so that the left
            # child is popped (and therefore emitted) before it.
            pending.append(int(Z[node-N,1]))
            pending.append(int(Z[node-N,0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix (an ndarray, or any
              array-like that numpy can convert to a 2-D array)
            - method = ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat,
              but with re-ordered rows and columns
              according to the seriation, i.e. the
              order implied by the hierarchical tree
            - res_order is the order implied by
              the hierarchical tree
            - res_linkage is the hierarchical tree (dendrogram)

        compute_serial_matrix transforms a distance matrix into
        a sorted distance matrix according to the order implied
        by the hierarchical tree (dendrogram)
    '''
    # Coerce once so that nested lists / DataFrames support the
    # integer-array indexing used below.
    dist_mat = np.asarray(dist_mat)
    N = len(dist_mat)

    # Condensed (1-D) form of the square matrix, as passed to linkage.
    flat_dist_mat = squareform(dist_mat)
    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)

    # The tree has N-1 merges with ids N .. 2N-2; start from the last one (the root).
    res_order = seriation(res_linkage, N, 2*N - 2)
    order = np.asarray(res_order)

    # Fill the strict upper triangle from the permuted input, then mirror it.
    # The diagonal is left at zero.
    seriated_dist = np.zeros((N,N))
    a,b = np.triu_indices(N,k=1)
    seriated_dist[a,b] = dist_mat[order[a], order[b]]
    seriated_dist[b,a] = seriated_dist[a,b]

    return seriated_dist, res_order, res_linkage

In [27]:
# Alternative input (disabled): pairwise distances over the iris samples,
# optionally after a random permutation of the rows.
# iris = datasets.load_iris()
# print(iris.data.shape)

# dist_mat = squareform(pdist(iris.data))

# N = len(iris.data)

# X = iris.data[np.random.permutation(N),:]

# dist_mat = squareform(pdist(X))

# print(np.random.permutation(N))
# print(dist_mat)

# Active input: matrix read from a JSON file
# (presumably precomputed Manhattan distances, judging by the file name).
buckets = np.array(pd.read_json('../data/distance_matrix_arr_manhattan_100.json'))
dist_mat = buckets  # same array object, under the name used by the next cell

print(buckets.shape)

print(dist_mat.shape)
# print(dist_mat[0])

(500, 500)
(500, 500)


In [28]:
# Seriate dist_mat once per linkage method and print, for each one,
# the shape of the re-ordered matrix and the resulting order.
methods = ["ward","single","average","complete"]
for method in methods:
    print("Method:\t",method)

    serial_result = compute_serial_matrix(dist_mat, method=method)
    ordered_dist_mat, res_order, res_linkage = serial_result

    print(ordered_dist_mat.shape)
    print(res_order)

Method:	 ward
(500, 500)
[448, 189, 385, 24, 466, 410, 421, 183, 216, 148, 174, 91, 302, 258, 319, 381, 106, 115, 90, 213, 126, 190, 110, 120, 377, 26, 136, 332, 324, 404, 171, 10, 458, 252, 330, 14, 58, 245, 138, 362, 101, 139, 395, 424, 8, 102, 63, 239, 443, 306, 387, 307, 325, 399, 21, 333, 11, 336, 285, 293, 489, 440, 445, 6, 481, 199, 177, 470, 219, 282, 289, 159, 316, 379, 471, 303, 474, 60, 318, 195, 23, 287, 232, 493, 156, 206, 97, 129, 168, 150, 260, 284, 320, 283, 322, 243, 247, 254, 415, 375, 134, 456, 436, 125, 488, 373, 39, 124, 352, 281, 465, 460, 499, 113, 400, 56, 74, 476, 218, 286, 43, 69, 425, 327, 355, 269, 412, 57, 237, 428, 118, 444, 29, 184, 317, 250, 477, 179, 359, 4, 192, 261, 62, 154, 77, 108, 27, 31, 186, 83, 253, 331, 19, 416, 321, 413, 147, 152, 103, 131, 163, 427, 204, 52, 278, 384, 391, 271, 455, 459, 490, 202, 439, 277, 451, 82, 235, 105, 66, 295, 30, 145, 109, 248, 162, 382, 224, 329, 494, 112, 5, 182, 482, 390, 93, 217, 454, 225, 22, 135, 104, 212, 366,