From 886ef86f7d914c5f3873ae38033b5a576c33bb3f Mon Sep 17 00:00:00 2001
From: Pauli Virtanen
Date: Sat, 13 Oct 2018 21:51:38 +0200
Subject: [PATCH 1/2] Implement Mann-Whitney U test

---
 asv/statistics.py       | 152 ++++++++++++++++++++++++++++++++++++++++
 test/test_statistics.py |  91 ++++++++++++++++++++++++
 2 files changed, 243 insertions(+)

diff --git a/asv/statistics.py b/asv/statistics.py
index a60af1433..f5b18b2b8 100644
--- a/asv/statistics.py
+++ b/asv/statistics.py
@@ -7,6 +7,7 @@
                         unicode_literals)
 
 import math
+import operator
 
 from .util import inf, nan
 
@@ -210,6 +211,140 @@ def quantile(x, q):
     return m
 
 
+_mann_whitney_u_memo = {}
+
+
+def mann_whitney_u(x, y, method='auto'):
+    """
+    Mann-Whitney U test
+
+    Ties are handled conservatively, returning the least significant
+    tie breaking.
+
+    Parameters
+    ----------
+    x, y : list of float
+        Samples to test
+    method : {'auto', 'exact', 'normal'}
+        Whether to compute p-value exactly or via normal approximation.
+        The option 'auto' switches to approximation for sample size > 20.
+
+    Returns
+    -------
+    u : int
+        U-statistic
+    p : float
+        p-value for two-sided alternative
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    .. [2] Gibbons & Chakraborti, "Nonparametric statistical inference". (2003)
+
+    """
+    memo = _mann_whitney_u_memo
+    if len(memo) > 100000:
+        memo.clear()
+
+    m = len(x)
+    n = len(y)
+
+    if method == 'auto':
+        if max(m, n) > 20:
+            method = 'normal'
+        else:
+            method = 'exact'
+
+    u, ties = mann_whitney_u_u(x, y)
+
+    # Conservative tie breaking
+    if u <= m*n//2 and u + ties >= m*n//2:
+        ties = m*n//2 - u
+
+    ux1 = min(u, m*n - u)
+    ux2 = min(u + ties, m*n - (u + ties))
+
+    if ux1 >= ux2:
+        ux = ux1
+    else:
+        u = u + ties
+        ux = ux2
+
+    # Get p-value
+    if method == 'exact':
+        p1 = mann_whitney_u_cdf(m, n, ux, memo)
+        p2 = 1.0 - mann_whitney_u_cdf(m, n, m*n - ux, memo)
+        p = p1 + p2
+    elif method == 'normal':
+        N = m + n
+        var = m*n*(N + 1) / 12
+        z = (ux - m*n/2) / math.sqrt(var)
+        cdf = 0.5 * math.erfc(-z / math.sqrt(2))
+        p = 2 * cdf
+    else:
+        raise ValueError("Unknown method {!r}".format(method))
+
+    return u, p
+
+
+def mann_whitney_u_u(x, y):
+    u = 0
+    ties = 0
+    for xx in x:
+        for yy in y:
+            if xx > yy:
+                u += 1
+            elif xx == yy:
+                ties += 1
+    return u, ties
+
+
+def mann_whitney_u_cdf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    cdf = 0
+    for uu in range(u + 1):
+        cdf += mann_whitney_u_pmf(m, n, uu, memo)
+    return cdf
+
+
+def mann_whitney_u_pmf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    return mann_whitney_u_r(m, n, u, memo) / binom(m + n, m)
+
+
+def mann_whitney_u_r(m, n, u, memo=None):
+    """
+    Number of orderings in Mann-Whitney U test.
+
+    The PMF of U for samples of sizes (m, n) is given by
+    p(u) = r(m, n, u) / binom(m + n, m).
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+
+    """
+    if u < 0:
+        value = 0
+    elif m == 0 or n == 0:
+        value = 1 if u == 0 else 0
+    else:
+        # Don't bother figuring out table construction, memoization
+        # sorts it out
+        if memo is None:
+            memo = {}
+        key = (m, n, u)
+        value = memo.get(key)
+        if value is not None:
+            return value
+
+        value = (mann_whitney_u_r(m, n - 1, u, memo) +
+                 mann_whitney_u_r(m - 1, n, u - n, memo))
+
+        memo[key] = value
+
+    return value
+
+
 def binom_pmf(n, k, p):
     """Binomial pmf = (n choose k) p**k (1 - p)**(n - k)"""
     if not (0 <= k <= n):
@@ -254,6 +389,23 @@ def lgamma(x):
     return nan
 
 
+def binom(n, k):
+    """
+    Binomial coefficient (n over k)
+    """
+    n = operator.index(n)
+    k = operator.index(k)
+    if not 0 <= k <= n:
+        return 0
+    m = n + 1
+    num = 1
+    den = 1
+    for j in range(1, min(k, n - k) + 1):
+        num *= m - j
+        den *= j
+    return num // den
+
+
 class LaplacePosterior(object):
     """
     Univariate distribution::
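For reference, the exact null distribution implemented above can be cross-checked with a few lines of standalone Python. This is a minimal sketch, not the asv code itself: the helper names are illustrative, and it assumes Python 3.8+ for math.comb. It uses the same r(m, n, u) recurrence as mann_whitney_u_r() and reproduces one entry of the Mann & Whitney (1947) tables used by the tests below.

```python
from functools import lru_cache
from math import comb


@lru_cache(maxsize=None)
def num_orderings(m, n, u):
    # Same recurrence as mann_whitney_u_r() in the patch above.
    if u < 0:
        return 0
    if m == 0 or n == 0:
        return 1 if u == 0 else 0
    return num_orderings(m, n - 1, u) + num_orderings(m - 1, n, u - n)


def exact_cdf(m, n, u):
    # P(U <= u) = sum_{uu <= u} r(m, n, uu) / C(m + n, m)
    return sum(num_orderings(m, n, uu) for uu in range(u + 1)) / comb(m + n, m)


print(exact_cdf(3, 3, 2))  # ~0.200, matching the published table for m = n = 3
```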
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 093cbfaf8..0ab5fb2a7 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -6,6 +6,8 @@
 import warnings
 from itertools import product
 
+import math
+
 import pytest
 
 from asv.util import inf
@@ -329,3 +331,92 @@ def test_laplace_posterior_cdf():
         x = c.cdf(t)
         assert abs(x - num_cdf(t)) < 1e-5
         assert abs(c.ppf(x) - t) < 1e-5
+
+
+def test_mann_whitney_u_cdf():
+    memo = {}
+
+    def check_table(m, tbl):
+        for u, row in enumerate(tbl):
+            for nx, p in enumerate(row):
+                n = nx + 1
+                if p is None:
+                    continue
+
+                p2 = statistics.mann_whitney_u_cdf(m, n, u, memo=memo)
+                assert p2 == pytest.approx(p, abs=1e-3, rel=0), (m, n, u, p2, p)
+
+    # Tables from Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    tbl = [[.250, .100, .050],
+           [.500, .200, .100],
+           [.750, .400, .200],
+           [None, .600, .350],
+           [None, None, .500],
+           [None, None, .650]]
+    check_table(3, tbl)
+
+    tbl = [[.200, .067, .028, .014],
+           [.400, .133, .057, .029],
+           [.600, .267, .114, .057],
+           [None, .400, .200, .100],
+           [None, .600, .314, .171],
+           [None, None, .429, .243],
+           [None, None, .571, .343],
+           [None, None, None, .443],
+           [None, None, None, .557]]
+    check_table(4, tbl)
+
+    tbl = [[.167, .047, .018, .008, .004],
+           [.333, .095, .036, .016, .008],
+           [.500, .190, .071, .032, .016],
+           [.667, .286, .125, .056, .028],
+           [None, .429, .196, .095, .048],
+           [None, .571, .286, .143, .075],
+           [None, None, .393, .206, .111],
+           [None, None, .500, .278, .155],
+           [None, None, .607, .365, .210],
+           [None, None, None, .452, .274],
+           [None, None, None, .548, .345],
+           [None, None, None, None, .421],
+           [None, None, None, None, .500],
+           [None, None, None, None, .579]]
+    check_table(5, tbl)
+
+
+@pytest.mark.skipif(not HAS_SCIPY, reason="Requires scipy")
+def test_mann_whitney_u_scipy():
+    # Scipy only has the large-sample limit...
+
+    def check(x, y):
+        u0, p0 = stats.mannwhitneyu(x, y, alternative='two-sided', use_continuity=False)
+
+        u, p = statistics.mann_whitney_u(x.tolist(), y.tolist(), method='normal')
+        assert u == u0
+        assert p == pytest.approx(p0, rel=1e-9, abs=0)
+
+        u, p = statistics.mann_whitney_u(x.tolist(), y.tolist(), method='exact')
+        assert u == u0
+        assert p == pytest.approx(p0, rel=5e-2, abs=5e-3)
+
+        u, p = statistics.mann_whitney_u(x.tolist(), y.tolist())
+        assert u == u0
+        assert p == pytest.approx(p0, rel=5e-2, abs=5e-3)
+
+    np.random.seed(1)
+    x = np.random.randn(22)
+    y = np.random.randn(23)
+
+    check(x, y)
+    check(x, y + 0.5)
+    check(x, y - 2.5)
+
+
+def test_binom():
+    for n in range(10):
+        for k in range(10):
+            p = statistics.binom(n, k)
+            if 0 <= k <= n:
+                p2 = math.factorial(n) / math.factorial(k) / math.factorial(n - k)
+            else:
+                p2 = 0
+            assert p == p2
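A possible usage sketch of the new function, mirroring test_mann_whitney_u_scipy above: it assumes the patched asv and scipy are importable, and the sample sizes and the 0.5 shift are arbitrary. For small samples the exact p-value is close to, but not identical with, scipy's normal-approximation value.

```python
import numpy as np
from scipy import stats

from asv import statistics

rng = np.random.RandomState(1)
x = rng.randn(15)
y = rng.randn(15) + 0.5

# scipy: normal (large-sample) approximation, same call as in the test above
u0, p0 = stats.mannwhitneyu(x, y, alternative='two-sided', use_continuity=False)

# asv: exact null distribution for these small samples
u, p = statistics.mann_whitney_u(x.tolist(), y.tolist(), method='exact')

print(u0, p0)
print(u, p)
```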
From 63ad49e755c1b5e2ff926440e11bb932770d2db4 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen
Date: Sun, 14 Oct 2018 01:38:28 +0200
Subject: [PATCH 2/2] Use U-test in asv compare when raw data is available

---
 asv/commands/compare.py    | 42 ++++++++++++++++++++------------------
 asv/commands/continuous.py |  3 ++-
 asv/statistics.py          | 32 +++++++++++++++++++++--------
 test/test_statistics.py    |  5 +++--
 4 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/asv/commands/compare.py b/asv/commands/compare.py
index 164c2edd9..3872cc234 100644
--- a/asv/commands/compare.py
+++ b/asv/commands/compare.py
@@ -64,21 +64,22 @@ def _isna(value):
     return value is None or value != value
 
 
-def _is_result_better(a, b, a_stats, b_stats, factor, use_stats=True):
+def _is_result_better(a, b, a_ss, b_ss, factor, use_stats=True):
     """
     Check if result 'a' is better than 'b' by the given factor,
     possibly taking confidence intervals into account.
 
     """
 
-    if use_stats and a_stats and b_stats and (
-            a_stats.get('repeat', 0) != 1 and b_stats.get('repeat', 0) != 1):
+    if use_stats and a_ss and b_ss and a_ss[0] and b_ss[0] and (
+            a_ss[0].get('repeat', 0) != 1 and b_ss[0].get('repeat', 0) != 1):
         # Return False if estimates don't differ.
         #
         # Special-case the situation with only one sample, in which
         # case we do the comparison only based on `factor` as there's
         # not enough data to do statistics.
-        if not statistics.is_different(a_stats, b_stats):
+        if not statistics.is_different(a_ss[1], b_ss[1],
+                                       a_ss[0], b_ss[0]):
             return False
 
     return a < b / factor
@@ -178,8 +179,8 @@ def print_table(cls, conf, hash_1, hash_2, factor, split,
                     commit_names=None):
         results_1 = {}
         results_2 = {}
-        stats_1 = {}
-        stats_2 = {}
+        ss_1 = {}
+        ss_2 = {}
         versions_1 = {}
         versions_2 = {}
         units = {}
@@ -198,9 +199,10 @@ def results_default_iter(commit_hash):
                     params = result.get_result_params(key)
                     result_value = result.get_result_value(key, params)
                     result_stats = result.get_result_stats(key, params)
+                    result_samples = result.get_result_samples(key, params)
                     result_version = result.benchmark_version.get(key)
-                    yield (key, params, result_value, result_stats, result_version,
-                           result.params['machine'], result.env_name)
+                    yield (key, params, result_value, result_stats, result_samples,
+                           result_version, result.params['machine'], result.env_name)
 
         if resultset_1 is None:
             resultset_1 = results_default_iter(hash_1)
@@ -210,22 +212,22 @@ def results_default_iter(commit_hash):
 
         machine_env_names = set()
 
-        for key, params, value, stats, version, machine, env_name in resultset_1:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_1:
             machine_env_name = "{}/{}".format(machine, env_name)
             machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
                 units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
                 results_1[(name, machine_env_name)] = value
-                stats_1[(name, machine_env_name)] = stats
+                ss_1[(name, machine_env_name)] = (stats, samples)
                 versions_1[(name, machine_env_name)] = version
 
-        for key, params, value, stats, version, machine, env_name in resultset_2:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_2:
             machine_env_name = "{}/{}".format(machine, env_name)
             machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
                 units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
                 results_2[(name, machine_env_name)] = value
-                stats_2[(name, machine_env_name)] = stats
+                ss_2[(name, machine_env_name)] = (stats, samples)
                 versions_2[(name, machine_env_name)] = version
 
         if len(results_1) == 0:
@@ -265,13 +267,13 @@ def results_default_iter(commit_hash):
             else:
                 time_2 = float("nan")
 
-            if benchmark in stats_1 and stats_1[benchmark]:
-                err_1 = statistics.get_err(time_1, stats_1[benchmark])
+            if benchmark in ss_1 and ss_1[benchmark][0]:
+                err_1 = statistics.get_err(time_1, ss_1[benchmark][0])
             else:
                 err_1 = None
 
-            if benchmark in stats_2 and stats_2[benchmark]:
-                err_2 = statistics.get_err(time_2, stats_2[benchmark])
+            if benchmark in ss_2 and ss_2[benchmark][0]:
+                err_2 = statistics.get_err(time_2, ss_2[benchmark][0])
             else:
                 err_2 = None
 
@@ -313,13 +315,13 @@ def results_default_iter(commit_hash):
                 color = 'default'
                 mark = ' '
             elif _is_result_better(time_2, time_1,
-                                   stats_2.get(benchmark), stats_1.get(benchmark),
+                                   ss_2.get(benchmark), ss_1.get(benchmark),
                                    factor, use_stats=use_stats):
                 color = 'green'
                 mark = '-'
                 improved = True
             elif _is_result_better(time_1, time_2,
-                                   stats_1.get(benchmark), stats_2.get(benchmark),
+                                   ss_1.get(benchmark), ss_2.get(benchmark),
                                    factor, use_stats=use_stats):
                 color = 'red'
                 mark = '+'
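To illustrate the new calling convention, here is a hypothetical sketch of how the (stats, samples) pairs are assembled and handed to _is_result_better(). It assumes the patched asv is importable; _is_result_better is internal to asv.commands.compare, and the timing numbers are invented.

```python
import numpy as np

from asv import statistics
from asv.commands.compare import _is_result_better

rng = np.random.RandomState(0)
samples_a = (1.0e-6 + 1e-8 * rng.rand(10)).tolist()  # "new" timings (invented)
samples_b = (2.0e-6 + 1e-8 * rng.rand(10)).tolist()  # "old" timings (invented)

result_a, stats_a = statistics.compute_stats(samples_a, 1)
result_b, stats_b = statistics.compute_stats(samples_b, 1)

# Each benchmark now carries a (stats, samples) pair; samples may be None,
# in which case is_different() falls back to the CI-overlap check.
better = _is_result_better(result_a, result_b,
                           (stats_a, samples_a), (stats_b, samples_b),
                           factor=1.1)
print(better)  # True: 'a' is ~2x faster and the U test confirms the difference
```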
diff --git a/asv/commands/continuous.py b/asv/commands/continuous.py
index 8dd70e8f8..5a8a64942 100644
--- a/asv/commands/continuous.py
+++ b/asv/commands/continuous.py
@@ -127,7 +127,8 @@ def results_iter(commit_hash):
 
                 value = result.get_result_value(name, params)
                 stats = result.get_result_stats(name, params)
-                yield name, params, value, stats, version, machine_name, env.name
+                samples = result.get_result_samples(name, params)
+                yield name, params, value, stats, samples, version, machine_name, env.name
 
         commit_names = {parent: repo.get_name_from_hash(parent),
                         head: repo.get_name_from_hash(head)}
diff --git a/asv/statistics.py b/asv/statistics.py
index f5b18b2b8..a0c617d44 100644
--- a/asv/statistics.py
+++ b/asv/statistics.py
@@ -9,7 +9,7 @@
 import math
 import operator
 
-from .util import inf, nan
+from .util import inf, nan, is_na
 
 
 def compute_stats(samples, number):
@@ -88,23 +88,39 @@ def get_err(result, stats):
     return (b - a)/2
 
 
-def is_different(stats_a, stats_b):
+def is_different(samples_a, samples_b, stats_a, stats_b, p_threshold=0.002):
     """Check whether the samples are statistically different.
 
-    This is a pessimistic check --- if it returns True, then the
-    difference is statistically significant. If it returns False,
-    there might still might be difference.
+    If sample data is not provided, or the sample is too small, falls
+    back to a pessimistic CI-based check. If it returns True, then the
+    difference is statistically significant. If it returns False, it
+    might or might not be statistically significant.
 
     Parameters
     ----------
     samples_a, samples_b
         Input samples
-    p : float, optional
-        Threshold p-value
+    stats_a, stats_b
+        Input stats data
 
     """
 
-    # If confidence intervals overlap, reject
+    if samples_a is not None and samples_b is not None:
+        # Raw data present: Mann-Whitney U test, but only if there's
+        # enough data so that the test can return True
+        a = [x for x in samples_a if not is_na(x)]
+        b = [x for x in samples_b if not is_na(x)]
+
+        p_min = 1 / binom(len(a) + len(b), min(len(a), len(b)))
+        if p_min < p_threshold:
+            _, p = mann_whitney_u(a, b)
+            return p < p_threshold
+
+    # If confidence intervals overlap, reject.
+    # Corresponds to a test with ill-specified threshold p-value,
+    # which generally can be significantly smaller than p <= 0.01
+    # depending on the actual data. For normal test (known variance),
+    # 0.00027 <= p <= 0.01.
     ci_a = stats_a['ci_99']
     ci_b = stats_b['ci_99']
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 0ab5fb2a7..7c0e713c6 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -66,12 +66,13 @@ def test_is_different():
     np.random.seed(1)
 
     # Smoke test is_different
-    for true_mean, n, significant in [(0.05, 10, False), (0.05, 100, True), (0.1, 10, True)]:
+    for true_mean, n, significant in [(0.01, 10, False), (0.05, 100, True), (0.1, 10, True)]:
         samples_a = 0 + 0.1 * np.random.rand(n)
         samples_b = true_mean + 0.1 * np.random.rand(n)
         result_a, stats_a = statistics.compute_stats(samples_a, 1)
         result_b, stats_b = statistics.compute_stats(samples_b, 1)
-        assert statistics.is_different(stats_a, stats_b) == significant
+        assert statistics.is_different(None, None, stats_a, stats_b) == significant
+        assert statistics.is_different(samples_a, samples_b, stats_a, stats_b) == significant
 
 
 def _check_ci(estimator, sampler, nsamples=300):
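As a worked example of the p_min gate added to is_different(): the smallest two-sided p-value the exact U test can produce for samples of sizes m and n is 1 / binom(m + n, m), so with the default p_threshold=0.002 the test is only consulted once there is enough data; for equal sample sizes that means at least 6 samples per side, and smaller runs fall back to the CI-overlap check. A minimal sketch of that arithmetic, assuming Python 3.8+ for math.comb:

```python
from math import comb

p_threshold = 0.002  # default in is_different() above

for m in range(3, 9):
    # Smallest two-sided p-value the exact test can give for m vs. m samples
    p_min = 1.0 / comb(2 * m, m)
    print(m, round(p_min, 5), "U test" if p_min < p_threshold else "CI fallback")
```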