In [12]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
import pandas as pd

def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram) given as a linkage
              matrix: for a node index k >= N, columns 0 and 1 of row
              k - N hold the indices of its left and right children
            - N is the number of points given to the clustering process
            - cur_index is the index of the node whose subtree is traversed
              (indices below N are leaves, i.e. original points)
        output:
            - order implied by the hierarchical tree Z, as a list of leaf
              indices (left subtree first, then right subtree)

        seriation computes the order implied by a hierarchical tree (dendrogram)

        The tree is walked with an explicit stack rather than by recursion,
        so very unbalanced (chain-like) trees cannot exhaust Python's
        recursion limit.
    '''
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            # Leaf: emit it in visiting order.
            order.append(node)
        else:
            # Internal node: push the right child first so that the left
            # child is popped (and therefore fully expanded) before it.
            pending.append(int(Z[node-N,1]))
            pending.append(int(Z[node-N,0]))
    return order
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a square distance matrix (it is passed to
              squareform and indexed with integer arrays, so it is
              expected to be a 2-D numpy array)
            - method = ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat,
              but with re-ordered rows and columns
              according to the seriation, i.e. the
              order implied by the hierarchical tree
            - res_order is the order implied by
              the hierarchical tree (list of point indices)
            - res_linkage is the hierarchical tree (dendrogram)
        
        compute_serial_matrix transforms a distance matrix into 
        a sorted distance matrix according to the order implied 
        by the hierarchical tree (dendrogram)
    '''
    N = len(dist_mat)
    # Condensed (1-D) form of the distance matrix, as taken by linkage.
    flat_dist_mat = squareform(dist_mat)
    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
    # Start the traversal at node N + (N-2), i.e. the node stored in row
    # N-2 of the linkage matrix (its last row when it has N-1 rows).
    res_order = seriation(res_linkage, N, N + N-2)

    seriated_dist = np.zeros((N,N))
    a,b = np.triu_indices(N,k=1)
    # Vectorised lookup of the re-ordered upper triangle; this replaces two
    # Python-level list comprehensions over all N*(N-1)/2 index pairs.
    order = np.asarray(res_order, dtype=np.intp)
    seriated_dist[a,b] = dist_mat[order[a], order[b]]
    # Mirror the upper triangle; the diagonal stays at zero.
    seriated_dist[b,a] = seriated_dist[a,b]
    
    return seriated_dist, res_order, res_linkage

In [18]:
# iris = datasets.load_iris()
# print(iris.data.shape)

# dist_mat = squareform(pdist(iris.data))

# N = len(iris.data)

# X = iris.data[np.random.permutation(N),:]

# dist_mat = squareform(pdist(X))

# # print(np.random.permutation(N))
# print(X.shape)

# pd.read_json returns a DataFrame by default, so it is used directly
# instead of being wrapped in another pd.DataFrame(...) call.
buckets = pd.read_json('../data/distance_matrix_arr.json')
print(buckets.shape)

# Pairwise distances between the rows of `buckets`, expanded to square form.
# NOTE(review): the file name suggests the JSON may already contain a
# distance matrix; if so, pdist here computes distances between rows of
# distances rather than reusing the stored values -- confirm this is intended.
dist_mat = squareform(pdist(buckets))

print(dist_mat.shape)

(1000, 1000)
(1000, 1000)


In [19]:
# Seriate the distance matrix once per linkage criterion and print the
# resulting matrix shape and point order.
# Each pass rebinds ordered_dist_mat / res_order / res_linkage, so after the
# loop these names hold the results for the last entry of `methods`.
methods = ["ward", "single", "average", "complete"]
for method in methods:
    print("Method:\t", method)

    (ordered_dist_mat,
     res_order,
     res_linkage) = compute_serial_matrix(dist_mat, method=method)
    print(ordered_dist_mat.shape)
    print(res_order)

Method:	 ward
(1000, 1000)
[251, 201, 734, 936, 351, 888, 75, 970, 426, 725, 264, 418, 681, 478, 194, 829, 136, 258, 759, 762, 781, 305, 751, 295, 669, 337, 661, 477, 587, 647, 678, 364, 631, 514, 979, 182, 960, 711, 920, 826, 41, 799, 285, 924, 986, 690, 493, 228, 956, 326, 366, 674, 341, 741, 32, 141, 208, 281, 705, 270, 938, 592, 47, 591, 405, 890, 937, 233, 420, 930, 975, 948, 229, 534, 518, 105, 663, 583, 802, 886, 907, 662, 508, 495, 708, 768, 772, 72, 164, 61, 348, 770, 821, 605, 992, 917, 745, 769, 845, 4, 834, 720, 773, 450, 724, 706, 782, 568, 74, 269, 955, 260, 864, 71, 487, 566, 673, 259, 379, 963, 891, 898, 442, 763, 377, 355, 491, 622, 113, 953, 79, 872, 0, 620, 253, 400, 614, 482, 695, 219, 262, 261, 381, 468, 168, 726, 927, 81, 565, 380, 503, 203, 912, 634, 983, 336, 730, 790, 974, 347, 687, 173, 481, 69, 236, 123, 603, 536, 919, 618, 807, 500, 35, 524, 392, 441, 656, 320, 868, 526, 204, 135, 509, 635, 961, 324, 453, 129, 428, 997, 342, 978, 996, 883, 307, 276, 497, 154