-
Notifications
You must be signed in to change notification settings - Fork 4
/
cluster_alignment.py
102 lines (87 loc) · 4.23 KB
/
cluster_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from scipy.optimize import linear_sum_assignment
from scipy.sparse import coo_matrix
import numpy as np
# Adapted from https://github.com/mwburke/cluster-stability/blob/master/cluster_stability.py
__all__ = ['align_clusters']
def align_clusters(labels_v1, labels_v2):
"""
Use the Hungarian (Kuhn–Munkres) algorithm to align two sets of cluster
labels for the same dataset as phrased as a combinational optimziation
algorithm that runs in O(n^3) time. Recreates the cluster labels for
both versions of the cluster labels and returns them.
Parameters
----------
labels_v1 : int array : shape=[num_cluster_items]
Labels for the first version of the cluster data
labels_v2 : int array : shape=[num_cluster_items]
Labels for the second version of the cluster data
Returns
-------
new_labels_v2 :
Re-mapped labels such that they use the same labels in v1 and have mapped
for optimal alignment.
"""
"""Here the labels are turned into integers and this mapping is retained for mapping back
into the original labels using vX_unmapper"""
contingency, mapped_labels_v1, mapped_labels_v2 = contingency_matrix(labels_v1, labels_v2)
"""Turn the contingency table into a cost and minimize with scipy"""
contingency = contingency * -1
assignments = linear_sum_assignment(contingency)
v1_unmapper = {i:s for i,s in zip(mapped_labels_v1, labels_v1)}
v2_unmapper = {i:s for i,s in zip(mapped_labels_v2, labels_v2)}
mapper_v2_to_v1 = {v2:v1 for v1, v2 in zip(assignments[0], assignments[1])}
# mapper_v1_to_v2 = {v1:v2 for v1, v2 in zip(assignments[0], assignments[1])}
"""Two steps: use assignment for v2 to get the closest v1 cluster (integer labels),
then unmap back into labels using the v1 unmapper"""
"""out = []
for v in mapped_labels_v2:
v1_lab = mapper_v2_to_v1.get(v, v2_unmapper[v])
newv2 = v1_unmapper[v1_lab]
out.append(newv2)"""
new_labels_v2 = np.array([v1_unmapper.get(mapper_v2_to_v1.get(v, v2_unmapper[v]), v2_unmapper[v]) for v in mapped_labels_v2])
return new_labels_v2
def map_labels(label_map, labels):
return [label_map[label] for label in labels]
def contingency_matrix(clus1_labels, clus2_labels, eps=None):
"""
Taken from this public implementation of the Munkres/Kuhns algorithm
under the Apache 2 license.
Can be found here: https://github.com/bmc/munkres
Build a contengency matrix describing the relationship between labels.
Parameters
----------
clus1_labels : int array, shape = [n_samples]
Ground truth class labels to be used as a reference
clus2_labels : array, shape = [n_samples]
Cluster labels to evaluate
eps: None or float
If a float, that value is added to all values in the contingency
matrix. This helps to stop NaN propogation.
If ``None``, nothing is adjusted.
Returns
-------
contingency: array, shape=[n_classes_true, n_classes_pred]
Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in
true class :math:`i` and in predicted class :math:`j`. If
``eps is None``, the dtype of this array will be integer. If ``eps`` is
given, the dtype will be float.
class_idx: array, shape=[n_samples]
Array of class labels with new mappings from from 0..n_classes_true
clusters_idx: array, shape=[n_samples]
Array of class labels with new mappings from from 0..n_classes_pred
"""
classes, class_idx = np.unique(clus1_labels, return_inverse=True)
clusters, cluster_idx = np.unique(clus2_labels, return_inverse=True)
n_classes = classes.shape[0]
n_clusters = clusters.shape[0]
# Using coo_matrix to accelerate simple histogram calculation,
# i.e. bins are consecutive integers
# Currently, coo_matrix is faster than histogram2d for simple cases
contingency = coo_matrix((np.ones(class_idx.shape[0]),
(class_idx, cluster_idx)),
shape=(n_classes, n_clusters),
dtype=int).toarray()
if eps is not None:
# don't use += as contingency is integer
contingency = contingency + eps
return contingency, class_idx, cluster_idx