-
Notifications
You must be signed in to change notification settings - Fork 2
/
evaluation.py
61 lines (50 loc) · 2.23 KB
/
evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import numpy as np
from sklearn import metrics
from munkres import Munkres
def evaluate(label, pred):
nmi = metrics.normalized_mutual_info_score(label, pred)
ari = metrics.adjusted_rand_score(label, pred)
# f = metrics.fowlkes_mallows_score(label, pred)
pred_adjusted = get_y_preds(label, pred, len(set(label)))
acc = metrics.accuracy_score(pred_adjusted, label)
return nmi, ari, acc
def evaluate_others(label, pred):
ami = metrics.adjusted_mutual_info_score(label, pred)
homo, comp, v_mea = metrics.homogeneity_completeness_v_measure(label, pred)
return ami, homo, comp, v_mea
def calculate_cost_matrix(C, n_clusters):
cost_matrix = np.zeros((n_clusters, n_clusters))
# cost_matrix[i,j] will be the cost of assigning cluster i to label j
for j in range(n_clusters):
s = np.sum(C[:, j]) # number of examples in cluster i
for i in range(n_clusters):
t = C[i, j]
cost_matrix[j, i] = s - t
return cost_matrix
def get_cluster_labels_from_indices(indices):
n_clusters = len(indices)
cluster_labels = np.zeros(n_clusters)
for i in range(n_clusters):
cluster_labels[i] = indices[i][1]
return cluster_labels
def get_y_preds(y_true, cluster_assignments, n_clusters):
"""
Computes the predicted labels, where label assignments now
correspond to the actual labels in y_true (as estimated by Munkres)
cluster_assignments: array of labels, outputted by kmeans
y_true: true labels
n_clusters: number of clusters in the dataset
returns: a tuple containing the accuracy and confusion matrix,
in that order
"""
confusion_matrix = metrics.confusion_matrix(
y_true, cluster_assignments, labels=None
)
# compute accuracy based on optimal 1:1 assignment of clusters to labels
cost_matrix = calculate_cost_matrix(confusion_matrix, n_clusters)
indices = Munkres().compute(cost_matrix)
kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices)
if np.min(cluster_assignments) != 0:
cluster_assignments = cluster_assignments - np.min(cluster_assignments)
y_pred = kmeans_to_true_cluster_labels[cluster_assignments]
return y_pred