-
Notifications
You must be signed in to change notification settings - Fork 89
/
_critical_difference.py
592 lines (517 loc) · 20.6 KB
/
_critical_difference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
"""Function to compute and plot critical difference diagrams."""
__author__ = ["SveaMeyer13", "dguijo", "TonyBagnall"]
import math
import numpy as np
# import pandas as pd
from scipy.stats import distributions, find_repeats, rankdata, wilcoxon
from aeon.benchmarking.utils import get_qalpha
from aeon.utils.validation._dependencies import _check_soft_dependencies
def _check_friedman(ranks):
"""
Check whether Friedman test is significant.
Parameters
----------
ranks : np.array
Rank of estimators on datasets, shape (n_estimators, n_datasets).
Returns
-------
float
p-value of the test.
"""
n_datasets, n_estimators = ranks.shape
if n_estimators < 3:
raise ValueError(
"At least 3 sets of measurements must be given for Friedmann test, "
f"got {n_estimators}."
)
# calculate c to correct chisq for ties:
ties = 0
for i in range(n_datasets):
replist, repnum = find_repeats(ranks[i])
for t in repnum:
ties += t * (t * t - 1)
c = 1 - ties / (n_estimators * (n_estimators * n_estimators - 1) * n_datasets)
ssbn = np.sum(ranks.sum(axis=0) ** 2)
chisq = (
12.0 / (n_estimators * n_datasets * (n_estimators + 1)) * ssbn
- 3 * n_datasets * (n_estimators + 1)
) / c
p_value = distributions.chi2.sf(chisq, n_estimators - 1)
return p_value
def _nemenyi_test(ordered_avg_ranks, n_datasets, alpha):
    """
    Find cliques using the post hoc Nemenyi test.

    Parameters
    ----------
    ordered_avg_ranks : np.array
        Average ranks of estimators, sorted by rank.
    n_datasets : int
        Number of datasets.
    alpha : float
        Alpha level for the Nemenyi test.

    Returns
    -------
    list of lists
        List of cliques. A clique is a group of estimators within which there is no
        significant difference.
    """
    n_estimators = len(ordered_avg_ranks)
    # Critical difference for the Nemenyi test at the requested alpha level.
    qalpha = get_qalpha(alpha)
    cd = qalpha[n_estimators] * np.sqrt(
        n_estimators * (n_estimators + 1) / (6 * n_datasets)
    )
    # Pairwise rank differences via broadcasting: entry (i, j) = rank_j - rank_i.
    diffs = ordered_avg_ranks[np.newaxis, :] - ordered_avg_ranks[:, np.newaxis]
    # Discard negative differences, then keep pairs closer than the critical
    # difference; these form the statistically similar cliques.
    diffs[diffs < 0] = np.inf
    return _build_cliques(diffs < cd)
def _wilcoxon_test(results, lower_better=False):
"""
Perform Wilcoxon test.
Parameters
----------
results: np.array
results of estimators on datasets
lower_better : bool, default = False
Indicates whether smaller is better for the results in scores. For example,
if errors are passed instead of accuracies, set ``lower_better`` to ``True``.
Returns
-------
np.array
p-values of Wilcoxon sign rank test.
"""
n_estimators = results.shape[1]
p_values = np.eye(n_estimators)
for i in range(n_estimators - 1):
for j in range(i + 1, n_estimators):
p_values[i, j] = wilcoxon(
results[:, i],
results[:, j],
zero_method="wilcox",
alternative="less" if lower_better else "greater",
)[1]
return p_values
def _build_cliques(pairwise_matrix):
"""
Build cliques from pairwise comparison matrix.
Parameters
----------
pairwise_matrix : np.array
Pairwise matrix shape (n_estimators, n_estimators) indicating if there is a
significant difference between pairs. Assumed to be ordered by rank of
estimators.
Returns
-------
list of lists
cliques within which there is no significant different between estimators.
"""
for i in range(0, pairwise_matrix.shape[0]):
for j in range(i + 1, pairwise_matrix.shape[1]):
if pairwise_matrix[i, j] == 0:
pairwise_matrix[i, j + 1 :] = 0 # noqa: E203
break
n = np.sum(pairwise_matrix, 1)
possible_cliques = pairwise_matrix[n > 1, :]
for i in range(possible_cliques.shape[0] - 1, 0, -1):
for j in range(i - 1, -1, -1):
if np.all(possible_cliques[j, possible_cliques[i, :]]):
possible_cliques[i, :] = 0
break
n = np.sum(possible_cliques, 1)
cliques = possible_cliques[n > 1, :]
return cliques
def plot_critical_difference(
    scores,
    labels,
    highlight=None,
    lower_better=False,
    test="wilcoxon",
    correction="holm",
    alpha=0.1,
    width=6,
    textspace=1.5,
    reverse=True,
    return_p_values=False,
):
    """
    Plot the average ranks and cliques based on the method described in [1]_.

    This function summarises the relative performance of multiple estimators
    evaluated on multiple datasets. The resulting plot shows the average rank of
    each estimator on a number line. Estimators are grouped by solid lines,
    called cliques. A clique represents a group of estimators within which there is
    no significant difference in performance (see the caveats below). Please note
    that these diagrams are not an end in themselves, but rather form one part of
    the description of performance of estimators.

    The input is a summary performance measure of each estimator on each problem,
    where columns are estimators and rows datasets. This could be any measure such
    as accuracy/error, F1, negative log-likelihood, mean squared error or rand
    score.

    This algorithm first calculates the rank of all estimators on all problems (
    averaging ranks over ties), then sorts estimators on average rank. It then
    forms cliques. The original critical difference diagrams [1]_ use the post hoc
    Nemenyi test [4]_ to find a critical difference. However, as discussed [3]_,
    this post hoc test is sensitive to the estimators included in the test: "For
    instance the difference between A and B could be declared significant if the
    pool comprises algorithms C, D, E and not significant if the pool comprises
    algorithms F, G, H.". Our default option is to base clique finding on pairwise
    Wilcoxon sign rank tests.

    There are two issues when performing multiple pairwise tests to find cliques:
    what adjustment to make for multiple testing, and whether to perform a one
    sided or two sided test. The Bonferroni adjustment is known to be
    conservative. Hence, by default, we base our clique finding from pairwise
    tests on the control tests described in [1]_ and the sequential method
    recommended in [2]_ and proposed in [5]_ that uses a less conservative
    adjustment than Bonferroni. We perform all pairwise tests using a one-sided
    Wilcoxon sign rank test with the Holm correction to alpha, which involves
    reducing alpha by dividing it by number of estimators - 1.

    Suppose we have four estimators, A, B, C and D sorted by average rank.
    Starting from A, we test the null hypothesis that average ranks are equal
    against the alternative hypothesis that the average rank of A is less than
    that of B, with significance level alpha/(n_estimators-1). If we reject the
    null hypothesis then we stop, and A is not in a clique. If we cannot reject
    the null, we test A vs C, continuing until we reject the null or we have
    tested all estimators. Suppose we find that A vs B is significant. We form no
    clique for A.

    We then continue to form a clique using the second best estimator, B, as a
    control. Imagine we find no difference between B and C, nor any difference
    between B and D. We have formed a clique for B: [B, C, D]. On the third
    iteration, imagine we also find no difference between C and D and thus form a
    second clique, [C, D]. We have found two cliques, but [C, D] is contained in
    [B, C, D] and is thus redundant. In this case we would return a single
    clique, [B, C, D].

    Note this is a heuristic approach not without problems: If the best ranked
    estimator A is significantly better than B but not significantly different
    to C, this will not be reflected in the diagram. Because of this, we
    recommend also reporting p-values in a table, and exploring other ways to
    present results such as pairwise plots. Comparing estimators on archive data
    sets can only be indicative of overall performance in general, and such
    comparisons should be seen as exploratory analysis rather than designed
    experiments to test an a priori hypothesis.

    Parts of the code are adapted from here:
    https://github.com/hfawaz/cd-diagram

    Parameters
    ----------
    scores : np.array
        Performance scores for estimators of shape (n_datasets, n_estimators).
    labels : list of estimators
        List with names of the estimators. Order should be the same as scores
    highlight : dict, default = None
        A dict with labels and HTML colours to be used for highlighting. Order
        should be the same as scores.
    lower_better : bool, default = False
        Indicates whether smaller is better for the results in scores. For example,
        if errors are passed instead of accuracies, set ``lower_better`` to ``True``.
    test : string, default = "wilcoxon"
        test method used to form cliques, either "nemenyi" or "wilcoxon"
    correction: string, default = "holm"
        correction method for multiple testing, one of "bonferroni", "holm" or
        "none".
    alpha : float, default = 0.1
        Critical value for statistical tests of difference.
    width : int, default = 6
        Width in inches.
    textspace : int
        Space on figure sides (in inches) for the method names (default: 1.5).
    reverse : bool, default = True
        If set to 'True', the lowest rank is on the right.
    return_p_values : bool, default = False
        Whether to return the pairwise matrix of p-values.

    Returns
    -------
    fig : matplotlib.figure
        Figure created.
    p_values : np.ndarray (optional)
        if return_p_values is True, returns a (n_estimators, n_estimators) matrix of
        unadjusted p values for the pairwise Wilcoxon sign rank test.

    References
    ----------
    .. [1] Demsar J., "Statistical comparisons of classifiers over multiple data sets."
        Journal of Machine Learning Research 7:1-30, 2006.
    .. [2] García S. and Herrera F., "An extension on “statistical comparisons of
        classifiers over multiple data sets” for all pairwise comparisons."
        Journal of Machine Learning Research 9:2677-2694, 2008.
    .. [3] Benavoli A., Corani G. and Mangili F "Should we really use post-hoc tests
        based on mean-ranks?" Journal of Machine Learning Research 17:1-10, 2016.
    .. [4] Nemenyi P., "Distribution-free multiple comparisons".
        PhD thesis, Princeton University, 1963.
    .. [5] Holm S., " A simple sequentially rejective multiple test procedure."
        Scandinavian Journal of Statistics, 6:65-70, 1979.

    Example
    -------
    >>> from aeon.benchmarking import plot_critical_difference
    >>> from aeon.benchmarking.results_loaders import get_estimator_results_as_array
    >>> methods = ["IT", "WEASEL-Dilation", "HIVECOTE2", "FreshPRINCE"]
    >>> results = get_estimator_results_as_array(estimators=methods) # doctest: +SKIP
    >>> plot = plot_critical_difference(results[0], methods, alpha=0.1)\
        # doctest: +SKIP
    >>> plot.show()  # doctest: +SKIP
    >>> plot.savefig("cd.pdf", bbox_inches="tight")  # doctest: +SKIP
    """
    _check_soft_dependencies("matplotlib")
    import matplotlib.pyplot as plt

    n_datasets, n_estimators = scores.shape
    if isinstance(test, str):
        test = test.lower()
    if isinstance(correction, str):
        correction = correction.lower()
    if return_p_values and test == "nemenyi":
        raise ValueError(
            "Cannot return p values for the Nemenyi test, since it does "
            "not calculate p-values."
        )
    # Step 1: rank data: in case of ties average ranks are assigned
    if lower_better:  # low is good -> rank 1
        ranks = rankdata(scores, axis=1)
    else:  # assign opposite ranks
        ranks = rankdata(-1 * scores, axis=1)
    # Step 2: calculate average rank per estimator and sort labels/ranks by it
    ordered_avg_ranks = ranks.mean(axis=0)
    ordered_labels_ranks = np.array(
        [(lab, float(r)) for r, lab in sorted(zip(ordered_avg_ranks, labels))],
        dtype=object,
    )
    ordered_labels = np.array([la for la, _ in ordered_labels_ranks], dtype=str)
    ordered_avg_ranks = np.array([r for _, r in ordered_labels_ranks], dtype=np.float32)
    # Reorder the score columns to match the sorted labels. The indices are
    # concatenated so that ordered_scores stays 2-D (n_datasets, n_estimators);
    # indexing with a list of index arrays would produce a 3-D array.
    indices = [np.where(np.array(labels) == r)[0] for r in ordered_labels]
    ordered_scores = scores[:, np.concatenate(indices)]
    # sort out colours for labels
    if highlight is not None:
        colours = [
            highlight[label] if label in highlight else "#000000"
            for label in ordered_labels
        ]
    else:
        colours = ["#000000"] * len(ordered_labels)
    # Step 3: check whether Friedman test is significant
    p_value_friedman = _check_friedman(ranks)
    # Step 4: If Friedman test is significant find cliques
    p_values = None
    if p_value_friedman < alpha:
        if test == "nemenyi":
            cliques = _nemenyi_test(ordered_avg_ranks, n_datasets, alpha)
        elif test == "wilcoxon":
            if correction == "bonferroni":
                adjusted_alpha = alpha / (n_estimators * (n_estimators - 1) / 2)
            elif correction == "holm":
                adjusted_alpha = alpha / (n_estimators - 1)
            elif correction is None or correction == "none":
                adjusted_alpha = alpha
            else:
                raise ValueError("correction available are None, Bonferroni and Holm.")
            p_values = _wilcoxon_test(ordered_scores, lower_better)
            cliques = _build_cliques(p_values > adjusted_alpha)
        else:
            raise ValueError("tests available are only nemenyi and wilcoxon.")
    # If Friedman test is not significant everything has to be one clique
    else:
        cliques = [[1] * n_estimators]
    # Step 5: create the diagram.
    # check from where to where the axis has to go
    lowv = min(1, int(math.floor(min(ordered_avg_ranks))))
    highv = max(len(ordered_avg_ranks), int(math.ceil(max(ordered_avg_ranks))))
    # set up the figure
    width = float(width)
    textspace = float(textspace)
    cline = 0.6  # space needed above scale
    linesblank = 1  # lines between scale and text
    scalewidth = width - 2 * textspace
    # calculate needed height
    minnotsignificant = max(2 * 0.2, linesblank)
    height = cline + ((n_estimators + 1) / 2) * 0.2 + minnotsignificant + 0.2
    fig = plt.figure(figsize=(width, height))
    fig.set_facecolor("white")
    ax = fig.add_axes([0, 0, 1, 1])  # reverse y axis
    ax.set_axis_off()
    hf = 1.0 / height  # height factor
    wf = 1.0 / width
    # Upper left corner is (0,0).
    ax.plot([0, 1], [0, 1], c="w")
    ax.set_xlim(0.1, 0.9)
    ax.set_ylim(1, 0)

    def _lloc(lst, n):
        """List location in list of list structure."""
        if n < 0:
            return len(lst[0]) + n
        else:
            return n

    def _nth(lst, n):
        n = _lloc(lst, n)
        return [a[n] for a in lst]

    def _hfl(lst):
        return [a * hf for a in lst]

    def _wfl(lst):
        return [a * wf for a in lst]

    def _line(lst, color="k", **kwargs):
        # Draw a polyline given in figure coordinates, scaled by wf/hf.
        ax.plot(_wfl(_nth(lst, 0)), _hfl(_nth(lst, 1)), color=color, **kwargs)

    # draw scale
    _line([(textspace, cline), (width - textspace, cline)], linewidth=2)
    bigtick = 0.3
    smalltick = 0.15
    linewidth = 0.75
    linewidth_sign = 2.5

    def _rankpos(rank):
        # Map an average rank to its x position on the scale, honouring the
        # requested direction.
        if not reverse:
            a = rank - lowv
        else:
            a = highv - rank
        return textspace + scalewidth / (highv - lowv) * a

    # add ticks to scale: big ticks on integers, small ticks on halves
    tick = None
    for a in list(np.arange(lowv, highv, 0.5)) + [highv]:
        tick = smalltick
        if a == int(a):
            tick = bigtick
        _line([(_rankpos(a), cline - tick / 2), (_rankpos(a), cline)], linewidth=2)

    def _text(x, y, s, *args, **kwargs):
        ax.text(wf * x, hf * y, s, *args, **kwargs)

    for a in range(lowv, highv + 1):
        _text(
            _rankpos(a),
            cline - tick / 2 - 0.05,
            str(a),
            ha="center",
            va="bottom",
            size=16,
        )
    # sort out lines and text based on whether order is reversed or not.
    # The first half of the estimators is drawn on one side of the scale,
    # the second half on the other.
    space_between_names = 0.24
    for i in range(math.ceil(len(ordered_avg_ranks) / 2)):
        chei = cline + minnotsignificant + i * space_between_names
        if reverse:
            _line(
                [
                    (_rankpos(ordered_avg_ranks[i]), cline),
                    (_rankpos(ordered_avg_ranks[i]), chei),
                    (textspace + scalewidth + 0.2, chei),
                ],
                linewidth=linewidth,
                color=colours[i],
            )
            _text(  # labels left side.
                textspace + scalewidth + 0.3,
                chei,
                ordered_labels[i],
                ha="left",
                va="center",
                size=16,
                color=colours[i],
            )
            _text(  # ranks left side.
                textspace + scalewidth - 0.3,
                chei - 0.075,
                format(ordered_avg_ranks[i], ".4f"),
                ha="left",
                va="center",
                size=10,
                color=colours[i],
            )
        else:
            _line(
                [
                    (_rankpos(ordered_avg_ranks[i]), cline),
                    (_rankpos(ordered_avg_ranks[i]), chei),
                    (textspace - 0.1, chei),
                ],
                linewidth=linewidth,
                color=colours[i],
            )
            _text(  # labels left side.
                textspace - 0.2,
                chei,
                ordered_labels[i],
                ha="right",
                va="center",
                size=16,
                color=colours[i],
            )
            _text(  # ranks left side.
                textspace + 0.4,
                chei - 0.075,
                format(ordered_avg_ranks[i], ".4f"),
                ha="right",
                va="center",
                size=10,
                color=colours[i],
            )
    for i in range(math.ceil(len(ordered_avg_ranks) / 2), len(ordered_avg_ranks)):
        chei = (
            cline
            + minnotsignificant
            + (len(ordered_avg_ranks) - i - 1) * space_between_names
        )
        if reverse:
            _line(
                [
                    (_rankpos(ordered_avg_ranks[i]), cline),
                    (_rankpos(ordered_avg_ranks[i]), chei),
                    (textspace - 0.1, chei),
                ],
                linewidth=linewidth,
                color=colours[i],
            )
            _text(  # labels right side.
                textspace - 0.2,
                chei,
                ordered_labels[i],
                ha="right",
                va="center",
                size=16,
                color=colours[i],
            )
            _text(  # ranks right side.
                textspace + 0.4,
                chei - 0.075,
                format(ordered_avg_ranks[i], ".4f"),
                ha="right",
                va="center",
                size=10,
                color=colours[i],
            )
        else:
            _line(
                [
                    (_rankpos(ordered_avg_ranks[i]), cline),
                    (_rankpos(ordered_avg_ranks[i]), chei),
                    (textspace + scalewidth + 0.1, chei),
                ],
                linewidth=linewidth,
                color=colours[i],
            )
            _text(  # labels right side.
                textspace + scalewidth + 0.2,
                chei,
                ordered_labels[i],
                ha="left",
                va="center",
                size=16,
                color=colours[i],
            )
            _text(  # ranks right side.
                textspace + scalewidth - 0.4,
                chei - 0.075,
                format(ordered_avg_ranks[i], ".4f"),
                ha="left",
                va="center",
                size=10,
                color=colours[i],
            )
    # draw lines for cliques: one horizontal bar per clique, spanning from the
    # best to the worst estimator in that clique.
    start = cline + 0.2
    side = -0.02 if reverse else 0.02
    height = 0.1
    for clq in cliques:
        positions = np.where(np.array(clq) == 1)[0]
        min_idx = np.array(positions).min()
        max_idx = np.array(positions).max()
        _line(
            [
                (_rankpos(ordered_avg_ranks[min_idx]) - side, start),
                (_rankpos(ordered_avg_ranks[max_idx]) + side, start),
            ],
            linewidth=linewidth_sign,
        )
        start += height
    if return_p_values:
        # p_values is only computed above when the Friedman test is
        # significant; compute on demand here so a valid matrix is always
        # returned (previously this raised NameError in that case).
        if p_values is None:
            p_values = _wilcoxon_test(ordered_scores, lower_better)
        return fig, p_values
    else:
        return fig