From 886ef86f7d914c5f3873ae38033b5a576c33bb3f Mon Sep 17 00:00:00 2001
From: Pauli Virtanen
Date: Sat, 13 Oct 2018 21:51:38 +0200
Subject: [PATCH 1/2] Implement Mann-Whitney U test

---
 asv/statistics.py       | 152 ++++++++++++++++++++++++++++++++++++++++
 test/test_statistics.py |  91 ++++++++++++++++++++++++
 2 files changed, 243 insertions(+)

diff --git a/asv/statistics.py b/asv/statistics.py
index a60af1433..f5b18b2b8 100644
--- a/asv/statistics.py
+++ b/asv/statistics.py
@@ -7,6 +7,7 @@
                         unicode_literals)
 
 import math
+import operator
 
 from .util import inf, nan
 
@@ -210,6 +211,140 @@ def quantile(x, q):
     return m
 
 
+_mann_whitney_u_memo = {}
+
+
+def mann_whitney_u(x, y, method='auto'):
+    """
+    Mann-Whitney U test
+
+    Ties are handled conservatively, returning the least significant
+    tie breaking.
+
+    Parameters
+    ----------
+    x, y : list of float
+        Samples to test
+    method : {'auto', 'exact', 'normal'}
+        Whether to compute p-value exactly or via normal approximation.
+        The option 'auto' switches to approximation for sample size > 20.
+
+    Returns
+    -------
+    u : int
+        U-statistic
+    p : float
+        p-value for two-sided alternative
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    .. [2] Gibbons & Chakraborti, "Nonparametric statistical inference". (2003)
+
+    """
+    memo = _mann_whitney_u_memo
+    if len(memo) > 100000:
+        memo.clear()
+
+    m = len(x)
+    n = len(y)
+
+    if method == 'auto':
+        if max(m, n) > 20:
+            method = 'normal'
+        else:
+            method = 'exact'
+
+    u, ties = mann_whitney_u_u(x, y)
+
+    # Conservative tie breaking
+    if u <= m*n//2 and u + ties >= m*n//2:
+        ties = m*n//2 - u
+
+    ux1 = min(u, m*n - u)
+    ux2 = min(u + ties, m*n - (u + ties))
+
+    if ux1 >= ux2:
+        ux = ux1
+    else:
+        u = u + ties
+        ux = ux2
+
+    # Get p-value
+    if method == 'exact':
+        p1 = mann_whitney_u_cdf(m, n, ux, memo)
+        p2 = 1.0 - mann_whitney_u_cdf(m, n, m*n - ux, memo)
+        p = p1 + p2
+    elif method == 'normal':
+        N = m + n
+        var = m*n*(N + 1) / 12
+        z = (ux - m*n/2) / math.sqrt(var)
+        cdf = 0.5 * math.erfc(-z / math.sqrt(2))
+        p = 2 * cdf
+    else:
+        raise ValueError("Unknown method {!r}".format(method))
+
+    return u, p
+
+
+def mann_whitney_u_u(x, y):
+    u = 0
+    ties = 0
+    for xx in x:
+        for yy in y:
+            if xx > yy:
+                u += 1
+            elif xx == yy:
+                ties += 1
+    return u, ties
+
+
+def mann_whitney_u_cdf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    cdf = 0
+    for uu in range(u + 1):
+        cdf += mann_whitney_u_pmf(m, n, uu, memo)
+    return cdf
+
+
+def mann_whitney_u_pmf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    return mann_whitney_u_r(m, n, u, memo) / binom(m + n, m)
+
+
+def mann_whitney_u_r(m, n, u, memo=None):
+    """
+    Number of orderings in Mann-Whitney U test.
+
+    The PMF of U for samples of sizes (m, n) is given by
+    p(u) = r(m, n, u) / binom(m + n, m).
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+
+    """
+    if u < 0:
+        value = 0
+    elif m == 0 or n == 0:
+        value = 1 if u == 0 else 0
+    else:
+        # Don't bother figuring out table construction, memoization
+        # sorts it out
+        if memo is None:
+            memo = {}
+        key = (m, n, u)
+        value = memo.get(key)
+        if value is not None:
+            return value
+
+        value = (mann_whitney_u_r(m, n - 1, u, memo) +
+                 mann_whitney_u_r(m - 1, n, u - n, memo))
+
+        memo[key] = value
+
+    return value
+
+
 def binom_pmf(n, k, p):
     """Binomial pmf = (n choose k) p**k (1 - p)**(n - k)"""
     if not (0 <= k <= n):
@@ -254,6 +389,23 @@ def lgamma(x):
     return nan
 
 
+def binom(n, k):
+    """
+    Binomial coefficient (n over k)
+    """
+    n = operator.index(n)
+    k = operator.index(k)
+    if not 0 <= k <= n:
+        return 0
+    m = n + 1
+    num = 1
+    den = 1
+    for j in range(1, min(k, n - k) + 1):
+        num *= m - j
+        den *= j
+    return num // den
+
+
 class LaplacePosterior(object):
     """
     Univariate distribution::
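For reference, the exact null distribution implemented above can be cross-checked with a few lines of standalone Python. This is a minimal sketch, not the asv code itself: the helper names are illustrative, and it assumes Python 3.8+ for math.comb. It uses the same r(m, n, u) recurrence as mann_whitney_u_r() and reproduces one entry of the Mann & Whitney (1947) tables used by the tests below.

```python
from functools import lru_cache
from math import comb


@lru_cache(maxsize=None)
def num_orderings(m, n, u):
    # Same recurrence as mann_whitney_u_r() in the patch above.
    if u < 0:
        return 0
    if m == 0 or n == 0:
        return 1 if u == 0 else 0
    return num_orderings(m, n - 1, u) + num_orderings(m - 1, n, u - n)


def exact_cdf(m, n, u):
    # P(U <= u) = sum_{uu <= u} r(m, n, uu) / C(m + n, m)
    return sum(num_orderings(m, n, uu) for uu in range(u + 1)) / comb(m + n, m)


print(exact_cdf(3, 3, 2))  # ~0.200, matching the published table for m = n = 3
```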
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 093cbfaf8..0ab5fb2a7 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -6,6 +6,8 @@
 import warnings
 from itertools import product
 
+import math
+
 import pytest
 
 from asv.util import inf
@@ -329,3 +331,92 @@ def test_laplace_posterior_cdf():
         x = c.cdf(t)
         assert abs(x - num_cdf(t)) < 1e-5
         assert abs(c.ppf(x) - t) < 1e-5
+
+
+def test_mann_whitney_u_cdf():
+    memo = {}
+
+    def check_table(m, tbl):
+        for u, row in enumerate(tbl):
+            for nx, p in enumerate(row):
+                n = nx + 1
+                if p is None:
+                    continue
+
+                p2 = statistics.mann_whitney_u_cdf(m, n, u, memo=memo)
+                assert p2 == pytest.approx(p, abs=1e-3, rel=0), (m, n, u, p2, p)
+
+    # Tables from Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    tbl = [[.250, .100, .050],
+           [.500, .200, .100],
+           [.750, .400, .200],
+           [None, .600, .350],
+           [None, None, .500],
+           [None, None, .650]]
+    check_table(3, tbl)
+
+    tbl = [[.200, .067, .028, .014],
+           [.400, .133, .057, .029],
+           [.600, .267, .114, .057],
+           [None, .400, .200, .100],
+           [None, .600, .314, .171],
+           [None, None, .429, .243],
+           [None, None, .571, .343],
+           [None, None, None, .443],
+           [None, None, None, .557]]
+    check_table(4, tbl)
+
+    tbl = [[.167, .047, .018, .008, .004],
+           [.333, .095, .036, .016, .008],
+           [.500, .190, .071, .032, .016],
+           [.667, .286, .125, .056, .028],
+           [None, .429, .196, .095, .048],
+           [None, .571, .286, .143, .075],
+           [None, None, .393, .206, .111],
+           [None, None, .500, .278, .155],
+           [None, None, .607, .365, .210],
+           [None, None, None, .452, .274],
+           [None, None, None, .548, .345],
+           [None, None, None, None, .421],
+           [None, None, None, None, .500],
+           [None, None, None, None, .579]]
+    check_table(5, tbl)
+
+
+@pytest.mark.skipif(not HAS_SCIPY, reason="Requires scipy")
+def test_mann_whitney_u_scipy():
+    # Scipy only has the large-sample limit...
+
+    def check(x, y):
+        u0, p0 = stats.mannwhitneyu(x, y, alternative='two-sided', use_continuity=False)
+
+        u, p = statistics.mann_whitney_u(x.tolist(), y.tolist(), method='normal')
+        assert u == u0
+        assert p == pytest.approx(p0, rel=1e-9, abs=0)
+
+        u, p = statistics.mann_whitney_u(x.tolist(), y.tolist(), method='exact')
+        assert u == u0
+        assert p == pytest.approx(p0, rel=5e-2, abs=5e-3)
+
+        u, p = statistics.mann_whitney_u(x.tolist(), y.tolist())
+        assert u == u0
+        assert p == pytest.approx(p0, rel=5e-2, abs=5e-3)
+
+    np.random.seed(1)
+    x = np.random.randn(22)
+    y = np.random.randn(23)
+
+    check(x, y)
+    check(x, y + 0.5)
+    check(x, y - 2.5)
+
+
+def test_binom():
+    for n in range(10):
+        for k in range(10):
+            p = statistics.binom(n, k)
+            if 0 <= k <= n:
+                p2 = math.factorial(n) / math.factorial(k) / math.factorial(n - k)
+            else:
+                p2 = 0
+            assert p == p2
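A possible usage sketch of the new function, mirroring test_mann_whitney_u_scipy above: it assumes the patched asv and scipy are importable, and the sample sizes and the 0.5 shift are arbitrary. For small samples the exact p-value is close to, but not identical with, scipy's normal-approximation value.

```python
import numpy as np
from scipy import stats

from asv import statistics

rng = np.random.RandomState(1)
x = rng.randn(15)
y = rng.randn(15) + 0.5

# scipy: normal (large-sample) approximation, same call as in the test above
u0, p0 = stats.mannwhitneyu(x, y, alternative='two-sided', use_continuity=False)

# asv: exact null distribution for these small samples
u, p = statistics.mann_whitney_u(x.tolist(), y.tolist(), method='exact')

print(u0, p0)
print(u, p)
```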
From 63ad49e755c1b5e2ff926440e11bb932770d2db4 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen
Date: Sun, 14 Oct 2018 01:38:28 +0200
Subject: [PATCH 2/2] Use U-test in asv compare when raw data is available

---
 asv/commands/compare.py    | 42 ++++++++++++++++++++------------------
 asv/commands/continuous.py |  3 ++-
 asv/statistics.py          | 32 +++++++++++++++++++++--------
 test/test_statistics.py    |  5 +++--
 4 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/asv/commands/compare.py b/asv/commands/compare.py
index 164c2edd9..3872cc234 100644
--- a/asv/commands/compare.py
+++ b/asv/commands/compare.py
@@ -64,21 +64,22 @@ def _isna(value):
     return value is None or value != value
 
 
-def _is_result_better(a, b, a_stats, b_stats, factor, use_stats=True):
+def _is_result_better(a, b, a_ss, b_ss, factor, use_stats=True):
     """
     Check if result 'a' is better than 'b' by the given factor,
     possibly taking confidence intervals into account.
 
     """
 
-    if use_stats and a_stats and b_stats and (
-            a_stats.get('repeat', 0) != 1 and b_stats.get('repeat', 0) != 1):
+    if use_stats and a_ss and b_ss and a_ss[0] and b_ss[0] and (
+            a_ss[0].get('repeat', 0) != 1 and b_ss[0].get('repeat', 0) != 1):
         # Return False if estimates don't differ.
         #
         # Special-case the situation with only one sample, in which
         # case we do the comparison only based on `factor` as there's
         # not enough data to do statistics.
-        if not statistics.is_different(a_stats, b_stats):
+        if not statistics.is_different(a_ss[1], b_ss[1],
+                                       a_ss[0], b_ss[0]):
             return False
 
     return a < b / factor
@@ -178,8 +179,8 @@ def print_table(cls, conf, hash_1, hash_2, factor, split,
                     commit_names=None):
         results_1 = {}
         results_2 = {}
-        stats_1 = {}
-        stats_2 = {}
+        ss_1 = {}
+        ss_2 = {}
         versions_1 = {}
         versions_2 = {}
         units = {}
@@ -198,9 +199,10 @@ def results_default_iter(commit_hash):
                     params = result.get_result_params(key)
                     result_value = result.get_result_value(key, params)
                     result_stats = result.get_result_stats(key, params)
+                    result_samples = result.get_result_samples(key, params)
                     result_version = result.benchmark_version.get(key)
-                    yield (key, params, result_value, result_stats, result_version,
-                           result.params['machine'], result.env_name)
+                    yield (key, params, result_value, result_stats, result_samples,
+                           result_version, result.params['machine'], result.env_name)
 
         if resultset_1 is None:
             resultset_1 = results_default_iter(hash_1)
@@ -210,22 +212,22 @@ def results_default_iter(commit_hash):
 
         machine_env_names = set()
 
-        for key, params, value, stats, version, machine, env_name in resultset_1:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_1:
             machine_env_name = "{}/{}".format(machine, env_name)
             machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
                 units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
                 results_1[(name, machine_env_name)] = value
-                stats_1[(name, machine_env_name)] = stats
+                ss_1[(name, machine_env_name)] = (stats, samples)
                 versions_1[(name, machine_env_name)] = version
 
-        for key, params, value, stats, version, machine, env_name in resultset_2:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_2:
             machine_env_name = "{}/{}".format(machine, env_name)
             machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
                 units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
                 results_2[(name, machine_env_name)] = value
-                stats_2[(name, machine_env_name)] = stats
+                ss_2[(name, machine_env_name)] = (stats, samples)
                 versions_2[(name, machine_env_name)] = version
 
         if len(results_1) == 0:
@@ -265,13 +267,13 @@ def results_default_iter(commit_hash):
             else:
                 time_2 = float("nan")
 
-            if benchmark in stats_1 and stats_1[benchmark]:
-                err_1 = statistics.get_err(time_1, stats_1[benchmark])
+            if benchmark in ss_1 and ss_1[benchmark][0]:
+                err_1 = statistics.get_err(time_1, ss_1[benchmark][0])
             else:
                 err_1 = None
 
-            if benchmark in stats_2 and stats_2[benchmark]:
-                err_2 = statistics.get_err(time_2, stats_2[benchmark])
+            if benchmark in ss_2 and ss_2[benchmark][0]:
+                err_2 = statistics.get_err(time_2, ss_2[benchmark][0])
             else:
                 err_2 = None
 
@@ -313,13 +315,13 @@ def results_default_iter(commit_hash):
                 color = 'default'
                 mark = ' '
             elif _is_result_better(time_2, time_1,
-                                   stats_2.get(benchmark), stats_1.get(benchmark),
+                                   ss_2.get(benchmark), ss_1.get(benchmark),
                                    factor, use_stats=use_stats):
                 color = 'green'
                 mark = '-'
                 improved = True
             elif _is_result_better(time_1, time_2,
-                                   stats_1.get(benchmark), stats_2.get(benchmark),
+                                   ss_1.get(benchmark), ss_2.get(benchmark),
                                    factor, use_stats=use_stats):
                 color = 'red'
                 mark = '+'
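To illustrate the new calling convention, here is a hypothetical sketch of how the (stats, samples) pairs are assembled and handed to _is_result_better(). It assumes the patched asv is importable; _is_result_better is internal to asv.commands.compare, and the timing numbers are invented.

```python
import numpy as np

from asv import statistics
from asv.commands.compare import _is_result_better

rng = np.random.RandomState(0)
samples_a = (1.0e-6 + 1e-8 * rng.rand(10)).tolist()  # "new" timings (invented)
samples_b = (2.0e-6 + 1e-8 * rng.rand(10)).tolist()  # "old" timings (invented)

result_a, stats_a = statistics.compute_stats(samples_a, 1)
result_b, stats_b = statistics.compute_stats(samples_b, 1)

# Each benchmark now carries a (stats, samples) pair; samples may be None,
# in which case is_different() falls back to the CI-overlap check.
better = _is_result_better(result_a, result_b,
                           (stats_a, samples_a), (stats_b, samples_b),
                           factor=1.1)
print(better)  # True: 'a' is ~2x faster and the U test confirms the difference
```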
diff --git a/asv/commands/continuous.py b/asv/commands/continuous.py
index 8dd70e8f8..5a8a64942 100644
--- a/asv/commands/continuous.py
+++ b/asv/commands/continuous.py
@@ -127,7 +127,8 @@ def results_iter(commit_hash):
 
                 value = result.get_result_value(name, params)
                 stats = result.get_result_stats(name, params)
-                yield name, params, value, stats, version, machine_name, env.name
+                samples = result.get_result_samples(name, params)
+                yield name, params, value, stats, samples, version, machine_name, env.name
 
         commit_names = {parent: repo.get_name_from_hash(parent),
                         head: repo.get_name_from_hash(head)}
diff --git a/asv/statistics.py b/asv/statistics.py
index f5b18b2b8..a0c617d44 100644
--- a/asv/statistics.py
+++ b/asv/statistics.py
@@ -9,7 +9,7 @@
 import math
 import operator
 
-from .util import inf, nan
+from .util import inf, nan, is_na
 
 
 def compute_stats(samples, number):
@@ -88,23 +88,39 @@ def get_err(result, stats):
     return (b - a)/2
 
 
-def is_different(stats_a, stats_b):
+def is_different(samples_a, samples_b, stats_a, stats_b, p_threshold=0.002):
     """Check whether the samples are statistically different.
 
-    This is a pessimistic check --- if it returns True, then the
-    difference is statistically significant. If it returns False,
-    there might still might be difference.
+    If sample data is not provided, or the sample is too small, falls
+    back to a pessimistic CI-based check. If it returns True, then the
+    difference is statistically significant. If it returns False, it
+    might or might not be statistically significant.
 
     Parameters
     ----------
     samples_a, samples_b
         Input samples
-    p : float, optional
-        Threshold p-value
+    stats_a, stats_b
+        Input stats data
 
     """
 
-    # If confidence intervals overlap, reject
+    if samples_a is not None and samples_b is not None:
+        # Raw data present: Mann-Whitney U test, but only if there's
+        # enough data so that the test can return True
+        a = [x for x in samples_a if not is_na(x)]
+        b = [x for x in samples_b if not is_na(x)]
+
+        p_min = 1 / binom(len(a) + len(b), min(len(a), len(b)))
+        if p_min < p_threshold:
+            _, p = mann_whitney_u(a, b)
+            return p < p_threshold
+
+    # If confidence intervals overlap, reject.
+    # Corresponds to a test with ill-specified threshold p-value,
+    # which generally can be significantly smaller than p <= 0.01
+    # depending on the actual data. For normal test (known variance),
+    # 0.00027 <= p <= 0.01.
     ci_a = stats_a['ci_99']
     ci_b = stats_b['ci_99']
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 0ab5fb2a7..7c0e713c6 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -66,12 +66,13 @@ def test_is_different():
     np.random.seed(1)
 
     # Smoke test is_different
-    for true_mean, n, significant in [(0.05, 10, False), (0.05, 100, True), (0.1, 10, True)]:
+    for true_mean, n, significant in [(0.01, 10, False), (0.05, 100, True), (0.1, 10, True)]:
         samples_a = 0 + 0.1 * np.random.rand(n)
         samples_b = true_mean + 0.1 * np.random.rand(n)
         result_a, stats_a = statistics.compute_stats(samples_a, 1)
         result_b, stats_b = statistics.compute_stats(samples_b, 1)
-        assert statistics.is_different(stats_a, stats_b) == significant
+        assert statistics.is_different(None, None, stats_a, stats_b) == significant
+        assert statistics.is_different(samples_a, samples_b, stats_a, stats_b) == significant
 
 
 def _check_ci(estimator, sampler, nsamples=300):
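As a worked example of the p_min gate added to is_different(): the smallest two-sided p-value the exact U test can produce for samples of sizes m and n is 1 / binom(m + n, m), so with the default p_threshold=0.002 the test is only consulted once there is enough data; for equal sample sizes that means at least 6 samples per side, and smaller runs fall back to the CI-overlap check. A minimal sketch of that arithmetic, assuming Python 3.8+ for math.comb:

```python
from math import comb

p_threshold = 0.002  # default in is_different() above

for m in range(3, 9):
    # Smallest two-sided p-value the exact test can give for m vs. m samples
    p_min = 1.0 / comb(2 * m, m)
    print(m, round(p_min, 5), "U test" if p_min < p_threshold else "CI fallback")
```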