Use U-test in asv compare when raw data is available #754

Merged (2 commits) on Oct 14, 2018
42 changes: 22 additions & 20 deletions asv/commands/compare.py
@@ -64,21 +64,22 @@ def _isna(value):
return value is None or value != value


-def _is_result_better(a, b, a_stats, b_stats, factor, use_stats=True):
+def _is_result_better(a, b, a_ss, b_ss, factor, use_stats=True):
"""
Check if result 'a' is better than 'b' by the given factor,
possibly taking confidence intervals into account.

"""

-    if use_stats and a_stats and b_stats and (
-            a_stats.get('repeat', 0) != 1 and b_stats.get('repeat', 0) != 1):
+    if use_stats and a_ss and b_ss and a_ss[0] and b_ss[0] and (
+            a_ss[0].get('repeat', 0) != 1 and b_ss[0].get('repeat', 0) != 1):
# Return False if estimates don't differ.
#
# Special-case the situation with only one sample, in which
# case we do the comparison only based on `factor` as there's
# not enough data to do statistics.
-        if not statistics.is_different(a_stats, b_stats):
+        if not statistics.is_different(a_ss[1], b_ss[1],
+                                       a_ss[0], b_ss[0]):
return False

return a < b / factor
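
A minimal sketch of the new calling convention: each of `a_ss` and `b_ss` is a `(stats, samples)` pair, where `stats` is the result stats dict and `samples` is the list of raw measurements. All names and values below are invented for illustration, not taken from the PR:

    # Hypothetical inputs: stats dicts plus raw timing samples (seconds).
    a_stats = {'repeat': 10, 'ci_99': [0.95e-6, 1.05e-6]}
    a_samples = [1.01e-6, 0.99e-6, 1.02e-6]
    b_stats = {'repeat': 10, 'ci_99': [1.45e-6, 1.55e-6]}
    b_samples = [1.51e-6, 1.49e-6, 1.50e-6]

    # Is 1.0 us better (smaller) than 1.5 us by more than the factor?
    better = _is_result_better(1.0e-6, 1.5e-6,
                               (a_stats, a_samples), (b_stats, b_samples),
                               factor=1.1)
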
@@ -178,8 +179,8 @@ def print_table(cls, conf, hash_1, hash_2, factor, split,
commit_names=None):
results_1 = {}
results_2 = {}
-        stats_1 = {}
-        stats_2 = {}
+        ss_1 = {}
+        ss_2 = {}
versions_1 = {}
versions_2 = {}
units = {}
@@ -198,9 +199,10 @@ def results_default_iter(commit_hash):
params = result.get_result_params(key)
result_value = result.get_result_value(key, params)
result_stats = result.get_result_stats(key, params)
+                result_samples = result.get_result_samples(key, params)
result_version = result.benchmark_version.get(key)
-                yield (key, params, result_value, result_stats, result_version,
-                       result.params['machine'], result.env_name)
+                yield (key, params, result_value, result_stats, result_samples,
+                       result_version, result.params['machine'], result.env_name)

if resultset_1 is None:
resultset_1 = results_default_iter(hash_1)
@@ -210,22 +212,22 @@

machine_env_names = set()

-        for key, params, value, stats, version, machine, env_name in resultset_1:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_1:
machine_env_name = "{}/{}".format(machine, env_name)
machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
results_1[(name, machine_env_name)] = value
-                stats_1[(name, machine_env_name)] = stats
+                ss_1[(name, machine_env_name)] = (stats, samples)
versions_1[(name, machine_env_name)] = version

-        for key, params, value, stats, version, machine, env_name in resultset_2:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_2:
machine_env_name = "{}/{}".format(machine, env_name)
machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
results_2[(name, machine_env_name)] = value
-                stats_2[(name, machine_env_name)] = stats
+                ss_2[(name, machine_env_name)] = (stats, samples)
versions_2[(name, machine_env_name)] = version

if len(results_1) == 0:
@@ -265,13 +267,13 @@ def results_default_iter(commit_hash):
else:
time_2 = float("nan")

-        if benchmark in stats_1 and stats_1[benchmark]:
-            err_1 = statistics.get_err(time_1, stats_1[benchmark])
+        if benchmark in ss_1 and ss_1[benchmark][0]:
+            err_1 = statistics.get_err(time_1, ss_1[benchmark][0])
else:
err_1 = None

-        if benchmark in stats_2 and stats_2[benchmark]:
-            err_2 = statistics.get_err(time_2, stats_2[benchmark])
+        if benchmark in ss_2 and ss_2[benchmark][0]:
+            err_2 = statistics.get_err(time_2, ss_2[benchmark][0])
else:
err_2 = None

@@ -313,13 +315,13 @@ def results_default_iter(commit_hash):
color = 'default'
mark = ' '
elif _is_result_better(time_2, time_1,
-                                   stats_2.get(benchmark), stats_1.get(benchmark),
+                                   ss_2.get(benchmark), ss_1.get(benchmark),
factor, use_stats=use_stats):
color = 'green'
mark = '-'
improved = True
elif _is_result_better(time_1, time_2,
-                                   stats_1.get(benchmark), stats_2.get(benchmark),
+                                   ss_1.get(benchmark), ss_2.get(benchmark),
factor, use_stats=use_stats):
color = 'red'
mark = '+'
3 changes: 2 additions & 1 deletion asv/commands/continuous.py
@@ -127,7 +127,8 @@ def results_iter(commit_hash):

value = result.get_result_value(name, params)
stats = result.get_result_stats(name, params)
-            yield name, params, value, stats, version, machine_name, env.name
+            samples = result.get_result_samples(name, params)
+            yield name, params, value, stats, samples, version, machine_name, env.name

commit_names = {parent: repo.get_name_from_hash(parent),
head: repo.get_name_from_hash(head)}
184 changes: 176 additions & 8 deletions asv/statistics.py
@@ -7,8 +7,9 @@
unicode_literals)

import math
+import operator

-from .util import inf, nan
+from .util import inf, nan, is_na


def compute_stats(samples, number):
@@ -87,23 +88,39 @@ def get_err(result, stats):
return (b - a)/2


-def is_different(stats_a, stats_b):
+def is_different(samples_a, samples_b, stats_a, stats_b, p_threshold=0.002):
"""Check whether the samples are statistically different.

-    This is a pessimistic check --- if it returns True, then the
-    difference is statistically significant. If it returns False,
-    there might still might be difference.
+    If sample data is not provided, or the sample is too small, the
+    check falls back to a pessimistic CI-based comparison. If it
+    returns True, the difference is statistically significant. If it
+    returns False, it might or might not be statistically significant.

+    Parameters
+    ----------
+    samples_a, samples_b
+        Input samples
+    p_threshold : float, optional
+        Threshold p-value
+    stats_a, stats_b
+        Input stats data

"""

-    # If confidence intervals overlap, reject
+    if samples_a is not None and samples_b is not None:
+        # Raw data present: Mann-Whitney U test, but only if there's
+        # enough data so that the test can return True
+        a = [x for x in samples_a if not is_na(x)]
+        b = [x for x in samples_b if not is_na(x)]
+
+        p_min = 1 / binom(len(a) + len(b), min(len(a), len(b)))
+        if p_min < p_threshold:
+            _, p = mann_whitney_u(a, b)
+            return p < p_threshold
+
+    # If confidence intervals overlap, reject.
+    # Corresponds to a test with ill-specified threshold p-value,
+    # which generally can be significantly smaller than p <= 0.01
+    # depending on the actual data. For normal test (known variance),
+    # 0.00027 <= p <= 0.01.
ci_a = stats_a['ci_99']
ci_b = stats_b['ci_99']

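The `p_min` guard above relies on a property of the exact U test: for sample sizes m and n, the smallest attainable p-value is 1/binom(m + n, min(m, n)), reached when one sample lies entirely below the other. If even that extreme ordering cannot beat `p_threshold`, running the test is pointless and the code falls through to the CI check. A small worked illustration, using the `binom()` helper added further down in this diff:

    for m in (5, 6, 10):
        p_min = 1 / binom(2 * m, m)
        print(m, p_min, p_min < 0.002)
    # m=5:  1/252    ~ 4.0e-3 -> False: U test skipped, CI check used
    # m=6:  1/924    ~ 1.1e-3 -> True:  U test is run
    # m=10: 1/184756 ~ 5.4e-6 -> True
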
@@ -210,6 +227,140 @@ def quantile(x, q):
return m


+_mann_whitney_u_memo = {}
+
+def mann_whitney_u(x, y, method='auto'):
+    """
+    Mann-Whitney U test
+
+    Ties are handled conservatively, returning the least significant
+    tie breaking.
+
+    Parameters
+    ----------
+    x, y : list of float
+        Samples to test
+    method : {'auto', 'exact', 'normal'}
+        Whether to compute the p-value exactly or via normal approximation.
+        The option 'auto' switches to approximation for sample size > 20.
+
+    Returns
+    -------
+    u : int
+        U-statistic
+    p : float
+        p-value for two-sided alternative
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    .. [2] Gibbons & Chakraborti, "Nonparametric statistical inference". (2003)
+
+    """
+    memo = _mann_whitney_u_memo
+    if len(memo) > 100000:
+        memo.clear()
+
+    m = len(x)
+    n = len(y)
+
+    if method == 'auto':
+        if max(m, n) > 20:
+            method = 'normal'
+        else:
+            method = 'exact'
+
+    u, ties = mann_whitney_u_u(x, y)
+
+    # Conservative tie breaking
+    if u <= m*n//2 and u + ties >= m*n//2:
+        ties = m*n//2 - u
+
+    ux1 = min(u, m*n - u)
+    ux2 = min(u + ties, m*n - (u + ties))
+
+    if ux1 >= ux2:
+        ux = ux1
+    else:
+        u = u + ties
+        ux = ux2
+
+    # Get p-value
+    if method == 'exact':
+        p1 = mann_whitney_u_cdf(m, n, ux, memo)
+        p2 = 1.0 - mann_whitney_u_cdf(m, n, m*n - ux, memo)
+        p = p1 + p2
+    elif method == 'normal':
+        N = m + n
+        var = m*n*(N + 1) / 12
+        z = (ux - m*n/2) / math.sqrt(var)
+        cdf = 0.5 * math.erfc(-z / math.sqrt(2))
+        p = 2 * cdf
+    else:
+        raise ValueError("Unknown method {!r}".format(method))
+
+    return u, p

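A quick usage sketch of `mann_whitney_u()` with invented samples; with five points per side the exact method applies:

    x = [1.0, 1.1, 0.9, 1.05, 1.02]
    y = [1.5, 1.6, 1.4, 1.55, 1.52]
    u, p = mann_whitney_u(x, y)
    # Every x lies below every y, so u == 0 and p == 1/binom(10, 5)
    # == 1/252 ~ 0.004, the smallest p attainable at these sizes,
    # matching the p_min guard in is_different().
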

+def mann_whitney_u_u(x, y):
+    u = 0
+    ties = 0
+    for xx in x:
+        for yy in y:
+            if xx > yy:
+                u += 1
+            elif xx == yy:
+                ties += 1
+    return u, ties

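A worked check of the brute-force pair counting above (it visits all m*n pairs, so it is quadratic in the sample sizes); values invented:

    u, ties = mann_whitney_u_u([1, 2, 3], [2, 4])
    # pairs: (1,2) (1,4) (2,2) (2,4) (3,2) (3,4)
    # u == 1 (only 3 > 2), ties == 1 (the (2,2) pair)
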

+def mann_whitney_u_cdf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    cdf = 0
+    for uu in range(u + 1):
+        cdf += mann_whitney_u_pmf(m, n, uu, memo)
+    return cdf
+
+
+def mann_whitney_u_pmf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    return mann_whitney_u_r(m, n, u, memo) / binom(m + n, m)
+
+
+def mann_whitney_u_r(m, n, u, memo=None):
+    """
+    Number of orderings in Mann-Whitney U test.
+
+    The PMF of U for samples of sizes (m, n) is given by
+    p(u) = r(m, n, u) / binom(m + n, m).
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    """
+    if u < 0:
+        value = 0
+    elif m == 0 or n == 0:
+        value = 1 if u == 0 else 0
+    else:
+        # Don't bother figuring out table construction, memoization
+        # sorts it out
+        if memo is None:
+            memo = {}
+        key = (m, n, u)
+        value = memo.get(key)
+        if value is not None:
+            return value
+
+        value = (mann_whitney_u_r(m, n - 1, u, memo)
+                 + mann_whitney_u_r(m - 1, n, u - n, memo))
+
+        memo[key] = value
+    return value

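To see the recurrence at work, take m = n = 1: U can be 0 or 1, each realized by exactly one ordering, so each has probability 1/binom(2, 1) = 1/2. A hand-checkable sketch:

    #   r(1, 1, 0) = r(1, 0, 0) + r(0, 1, -1) = 1 + 0 = 1
    #   r(1, 1, 1) = r(1, 0, 1) + r(0, 1, 0)  = 0 + 1 = 1
    assert mann_whitney_u_r(1, 1, 0) == 1
    assert mann_whitney_u_r(1, 1, 1) == 1
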

def binom_pmf(n, k, p):
"""Binomial pmf = (n choose k) p**k (1 - p)**(n - k)"""
if not (0 <= k <= n):
@@ -254,6 +405,23 @@ def lgamma(x):
return nan


+def binom(n, k):
+    """
+    Binomial coefficient (n over k)
+    """
+    n = operator.index(n)
+    k = operator.index(k)
+    if not 0 <= k <= n:
+        return 0
+    m = n + 1
+    num = 1
+    den = 1
+    for j in range(1, min(k, n - k) + 1):
+        num *= m - j
+        den *= j
+    return num // den

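`binom()` uses exact integer arithmetic, so the coefficients used by the exact U test never lose precision. A few hand-checkable values:

    assert binom(10, 5) == 252
    assert binom(12, 6) == 924
    assert binom(5, 7) == 0    # out-of-range k yields 0
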

class LaplacePosterior(object):
"""
Univariate distribution::