Use U-test in asv compare when raw data is available #754

Merged (2 commits) on Oct 14, 2018
42 changes: 22 additions & 20 deletions asv/commands/compare.py
@@ -64,21 +64,22 @@ def _isna(value):
return value is None or value != value


-def _is_result_better(a, b, a_stats, b_stats, factor, use_stats=True):
+def _is_result_better(a, b, a_ss, b_ss, factor, use_stats=True):
"""
Check if result 'a' is better than 'b' by the given factor,
possibly taking confidence intervals into account.

"""

-    if use_stats and a_stats and b_stats and (
-            a_stats.get('repeat', 0) != 1 and b_stats.get('repeat', 0) != 1):
+    if use_stats and a_ss and b_ss and a_ss[0] and b_ss[0] and (
+            a_ss[0].get('repeat', 0) != 1 and b_ss[0].get('repeat', 0) != 1):
# Return False if estimates don't differ.
#
# Special-case the situation with only one sample, in which
# case we do the comparison only based on `factor` as there's
# not enough data to do statistics.
-        if not statistics.is_different(a_stats, b_stats):
+        if not statistics.is_different(a_ss[1], b_ss[1],
+                                       a_ss[0], b_ss[0]):
return False

return a < b / factor
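
A minimal sketch of the new calling convention: each of `a_ss` and `b_ss` is a `(stats, samples)` pair, where `stats` is the result stats dict and `samples` is the list of raw measurements. All names and values below are invented for illustration, not taken from the PR:

    # Hypothetical inputs: stats dicts plus raw timing samples (seconds).
    a_stats = {'repeat': 10, 'ci_99': [0.95e-6, 1.05e-6]}
    a_samples = [1.01e-6, 0.99e-6, 1.02e-6]
    b_stats = {'repeat': 10, 'ci_99': [1.45e-6, 1.55e-6]}
    b_samples = [1.51e-6, 1.49e-6, 1.50e-6]

    # Is 1.0 us better (smaller) than 1.5 us by more than the factor?
    better = _is_result_better(1.0e-6, 1.5e-6,
                               (a_stats, a_samples), (b_stats, b_samples),
                               factor=1.1)
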
@@ -178,8 +179,8 @@ def print_table(cls, conf, hash_1, hash_2, factor, split,
commit_names=None):
results_1 = {}
results_2 = {}
-        stats_1 = {}
-        stats_2 = {}
+        ss_1 = {}
+        ss_2 = {}
versions_1 = {}
versions_2 = {}
units = {}
@@ -198,9 +199,10 @@ def results_default_iter(commit_hash):
params = result.get_result_params(key)
result_value = result.get_result_value(key, params)
result_stats = result.get_result_stats(key, params)
+                result_samples = result.get_result_samples(key, params)
result_version = result.benchmark_version.get(key)
-                yield (key, params, result_value, result_stats, result_version,
-                       result.params['machine'], result.env_name)
+                yield (key, params, result_value, result_stats, result_samples,
+                       result_version, result.params['machine'], result.env_name)

if resultset_1 is None:
resultset_1 = results_default_iter(hash_1)
@@ -210,22 +212,22 @@

machine_env_names = set()

-        for key, params, value, stats, version, machine, env_name in resultset_1:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_1:
machine_env_name = "{}/{}".format(machine, env_name)
machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
results_1[(name, machine_env_name)] = value
-                stats_1[(name, machine_env_name)] = stats
+                ss_1[(name, machine_env_name)] = (stats, samples)
versions_1[(name, machine_env_name)] = version

-        for key, params, value, stats, version, machine, env_name in resultset_2:
+        for key, params, value, stats, samples, version, machine, env_name in resultset_2:
machine_env_name = "{}/{}".format(machine, env_name)
machine_env_names.add(machine_env_name)
-            for name, value, stats in unroll_result(key, params, value, stats):
+            for name, value, stats, samples in unroll_result(key, params, value, stats, samples):
units[(name, machine_env_name)] = benchmarks.get(key, {}).get('unit')
results_2[(name, machine_env_name)] = value
-                stats_2[(name, machine_env_name)] = stats
+                ss_2[(name, machine_env_name)] = (stats, samples)
versions_2[(name, machine_env_name)] = version

if len(results_1) == 0:
@@ -265,13 +267,13 @@ def results_default_iter(commit_hash):
else:
time_2 = float("nan")

-        if benchmark in stats_1 and stats_1[benchmark]:
-            err_1 = statistics.get_err(time_1, stats_1[benchmark])
+        if benchmark in ss_1 and ss_1[benchmark][0]:
+            err_1 = statistics.get_err(time_1, ss_1[benchmark][0])
else:
err_1 = None

-        if benchmark in stats_2 and stats_2[benchmark]:
-            err_2 = statistics.get_err(time_2, stats_2[benchmark])
+        if benchmark in ss_2 and ss_2[benchmark][0]:
+            err_2 = statistics.get_err(time_2, ss_2[benchmark][0])
else:
err_2 = None

@@ -313,13 +315,13 @@ def results_default_iter(commit_hash):
color = 'default'
mark = ' '
elif _is_result_better(time_2, time_1,
-                                   stats_2.get(benchmark), stats_1.get(benchmark),
+                                   ss_2.get(benchmark), ss_1.get(benchmark),
factor, use_stats=use_stats):
color = 'green'
mark = '-'
improved = True
elif _is_result_better(time_1, time_2,
-                                   stats_1.get(benchmark), stats_2.get(benchmark),
+                                   ss_1.get(benchmark), ss_2.get(benchmark),
factor, use_stats=use_stats):
color = 'red'
mark = '+'
3 changes: 2 additions & 1 deletion asv/commands/continuous.py
@@ -127,7 +127,8 @@ def results_iter(commit_hash):

value = result.get_result_value(name, params)
stats = result.get_result_stats(name, params)
-            yield name, params, value, stats, version, machine_name, env.name
+            samples = result.get_result_samples(name, params)
+            yield name, params, value, stats, samples, version, machine_name, env.name

commit_names = {parent: repo.get_name_from_hash(parent),
head: repo.get_name_from_hash(head)}
184 changes: 176 additions & 8 deletions asv/statistics.py
@@ -7,8 +7,9 @@
unicode_literals)

import math
+import operator

-from .util import inf, nan
+from .util import inf, nan, is_na


def compute_stats(samples, number):
@@ -87,23 +88,39 @@ def get_err(result, stats):
return (b - a)/2


-def is_different(stats_a, stats_b):
+def is_different(samples_a, samples_b, stats_a, stats_b, p_threshold=0.002):
"""Check whether the samples are statistically different.

-    This is a pessimistic check --- if it returns True, then the
-    difference is statistically significant. If it returns False,
-    there might still might be difference.
+    If sample data is not provided, or the sample is too small, the
+    check falls back to a pessimistic CI-based comparison. If it
+    returns True, the difference is statistically significant. If it
+    returns False, it might or might not be statistically significant.

+    Parameters
+    ----------
+    samples_a, samples_b
+        Input samples
+    p_threshold : float, optional
+        Threshold p-value
+    stats_a, stats_b
+        Input stats data

"""

-    # If confidence intervals overlap, reject
+    if samples_a is not None and samples_b is not None:
+        # Raw data present: Mann-Whitney U test, but only if there's
+        # enough data so that the test can return True
+        a = [x for x in samples_a if not is_na(x)]
+        b = [x for x in samples_b if not is_na(x)]
+
+        p_min = 1 / binom(len(a) + len(b), min(len(a), len(b)))
+        if p_min < p_threshold:
+            _, p = mann_whitney_u(a, b)
+            return p < p_threshold
+
+    # If confidence intervals overlap, reject.
+    # Corresponds to a test with ill-specified threshold p-value,
+    # which generally can be significantly smaller than p <= 0.01
+    # depending on the actual data. For normal test (known variance),
+    # 0.00027 <= p <= 0.01.
ci_a = stats_a['ci_99']
ci_b = stats_b['ci_99']

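The `p_min` guard above relies on a property of the exact U test: for sample sizes m and n, the smallest attainable p-value is 1/binom(m + n, min(m, n)), reached when one sample lies entirely below the other. If even that extreme ordering cannot beat `p_threshold`, running the test is pointless and the code falls through to the CI check. A small worked illustration, using the `binom()` helper added further down in this diff:

    for m in (5, 6, 10):
        p_min = 1 / binom(2 * m, m)
        print(m, p_min, p_min < 0.002)
    # m=5:  1/252    ~ 4.0e-3 -> False: U test skipped, CI check used
    # m=6:  1/924    ~ 1.1e-3 -> True:  U test is run
    # m=10: 1/184756 ~ 5.4e-6 -> True
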
@@ -210,6 +227,140 @@ def quantile(x, q):
return m


+_mann_whitney_u_memo = {}
+
+def mann_whitney_u(x, y, method='auto'):
+    """
+    Mann-Whitney U test
+
+    Ties are handled conservatively, returning the least significant
+    tie breaking.
+
+    Parameters
+    ----------
+    x, y : list of float
+        Samples to test
+    method : {'auto', 'exact', 'normal'}
+        Whether to compute the p-value exactly or via normal approximation.
+        The option 'auto' switches to approximation for sample size > 20.
+
+    Returns
+    -------
+    u : int
+        U-statistic
+    p : float
+        p-value for two-sided alternative
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    .. [2] Gibbons & Chakraborti, "Nonparametric statistical inference". (2003)
+
+    """
+    memo = _mann_whitney_u_memo
+    if len(memo) > 100000:
+        memo.clear()
+
+    m = len(x)
+    n = len(y)
+
+    if method == 'auto':
+        if max(m, n) > 20:
+            method = 'normal'
+        else:
+            method = 'exact'
+
+    u, ties = mann_whitney_u_u(x, y)
+
+    # Conservative tie breaking
+    if u <= m*n//2 and u + ties >= m*n//2:
+        ties = m*n//2 - u
+
+    ux1 = min(u, m*n - u)
+    ux2 = min(u + ties, m*n - (u + ties))
+
+    if ux1 >= ux2:
+        ux = ux1
+    else:
+        u = u + ties
+        ux = ux2
+
+    # Get p-value
+    if method == 'exact':
+        p1 = mann_whitney_u_cdf(m, n, ux, memo)
+        p2 = 1.0 - mann_whitney_u_cdf(m, n, m*n - ux, memo)
+        p = p1 + p2
+    elif method == 'normal':
+        N = m + n
+        var = m*n*(N + 1) / 12
+        z = (ux - m*n/2) / math.sqrt(var)
+        cdf = 0.5 * math.erfc(-z / math.sqrt(2))
+        p = 2 * cdf
+    else:
+        raise ValueError("Unknown method {!r}".format(method))
+
+    return u, p

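A quick usage sketch of `mann_whitney_u()` with invented samples; with five points per side the exact method applies:

    x = [1.0, 1.1, 0.9, 1.05, 1.02]
    y = [1.5, 1.6, 1.4, 1.55, 1.52]
    u, p = mann_whitney_u(x, y)
    # Every x lies below every y, so u == 0 and p == 1/binom(10, 5)
    # == 1/252 ~ 0.004, the smallest p attainable at these sizes,
    # matching the p_min guard in is_different().
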

+def mann_whitney_u_u(x, y):
+    u = 0
+    ties = 0
+    for xx in x:
+        for yy in y:
+            if xx > yy:
+                u += 1
+            elif xx == yy:
+                ties += 1
+    return u, ties

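A worked check of the brute-force pair counting above (it visits all m*n pairs, so it is quadratic in the sample sizes); values invented:

    u, ties = mann_whitney_u_u([1, 2, 3], [2, 4])
    # pairs: (1,2) (1,4) (2,2) (2,4) (3,2) (3,4)
    # u == 1 (only 3 > 2), ties == 1 (the (2,2) pair)
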

+def mann_whitney_u_cdf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    cdf = 0
+    for uu in range(u + 1):
+        cdf += mann_whitney_u_pmf(m, n, uu, memo)
+    return cdf
+
+
+def mann_whitney_u_pmf(m, n, u, memo=None):
+    if memo is None:
+        memo = {}
+    return mann_whitney_u_r(m, n, u, memo) / binom(m + n, m)
+
+
+def mann_whitney_u_r(m, n, u, memo=None):
+    """
+    Number of orderings in Mann-Whitney U test.
+
+    The PMF of U for samples of sizes (m, n) is given by
+    p(u) = r(m, n, u) / binom(m + n, m).
+
+    References
+    ----------
+    .. [1] Mann & Whitney, Ann. Math. Statist. 18, 50 (1947).
+    """
+    if u < 0:
+        value = 0
+    elif m == 0 or n == 0:
+        value = 1 if u == 0 else 0
+    else:
+        # Don't bother figuring out table construction, memoization
+        # sorts it out
+        if memo is None:
+            memo = {}
+        key = (m, n, u)
+        value = memo.get(key)
+        if value is not None:
+            return value
+
+        value = (mann_whitney_u_r(m, n - 1, u, memo)
+                 + mann_whitney_u_r(m - 1, n, u - n, memo))
+
+        memo[key] = value
+    return value

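To see the recurrence at work, take m = n = 1: U can be 0 or 1, each realized by exactly one ordering, so each has probability 1/binom(2, 1) = 1/2. A hand-checkable sketch:

    #   r(1, 1, 0) = r(1, 0, 0) + r(0, 1, -1) = 1 + 0 = 1
    #   r(1, 1, 1) = r(1, 0, 1) + r(0, 1, 0)  = 0 + 1 = 1
    assert mann_whitney_u_r(1, 1, 0) == 1
    assert mann_whitney_u_r(1, 1, 1) == 1
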

def binom_pmf(n, k, p):
"""Binomial pmf = (n choose k) p**k (1 - p)**(n - k)"""
if not (0 <= k <= n):
@@ -254,6 +405,23 @@ def lgamma(x):
return nan


+def binom(n, k):
+    """
+    Binomial coefficient (n over k)
+    """
+    n = operator.index(n)
+    k = operator.index(k)
+    if not 0 <= k <= n:
+        return 0
+    m = n + 1
+    num = 1
+    den = 1
+    for j in range(1, min(k, n - k) + 1):
+        num *= m - j
+        den *= j
+    return num // den

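`binom()` uses exact integer arithmetic, so the coefficients used by the exact U test never lose precision. A few hand-checkable values:

    assert binom(10, 5) == 252
    assert binom(12, 6) == 924
    assert binom(5, 7) == 0    # out-of-range k yields 0
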

class LaplacePosterior(object):
"""
Univariate distribution::