[benchmark] More Robust Benchmark_Driver #19910

Merged
merged 17 commits into from Oct 29, 2018
4 changes: 2 additions & 2 deletions benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake
@@ -701,11 +701,11 @@ function(swift_benchmark_compile)
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
"-o" "O" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
"--iterations" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
"--independent-samples" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
"-o" "Onone" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
"--iterations" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
"--independent-samples" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
COMMAND "${swift-bin-dir}/Benchmark_Driver" "compare"
"--log-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
65 changes: 44 additions & 21 deletions benchmark/scripts/Benchmark_Driver
@@ -133,27 +133,31 @@ class BenchmarkDriver(object):
benchmarks.intersection(set(self.all_tests)).union(indexed_names)))

def run(self, test, num_samples=None, num_iters=None,
verbose=None, measure_memory=False):
verbose=None, measure_memory=False, quantile=None):
"""Execute benchmark and gather results."""
num_samples = num_samples or 1
num_samples = num_samples or 0
num_iters = num_iters or 0 # automatically determine N to run for 1s

cmd = self._cmd_run(
test, num_samples, num_iters, verbose, measure_memory)
test, num_samples, num_iters, verbose, measure_memory, quantile)
output = self._invoke(cmd)
result = self.parser.results_from_string(output).items()[0][1]
return result

def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory):
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory,
quantile):
cmd = [self.test_harness, test]
if num_samples > 1:
if num_samples > 0:
cmd.append('--num-samples={0}'.format(num_samples))
if num_iters > 0:
cmd.append('--num-iters={0}'.format(num_iters))
if verbose:
cmd.append('--verbose')
if measure_memory:
cmd.append('--memory')
if quantile:
cmd.append('--quantile={0}'.format(quantile))
cmd.append('--delta')
return cmd

def run_independent_samples(self, test):
@@ -166,8 +170,9 @@ class BenchmarkDriver(object):
return a

return reduce(merge_results,
[self.run(test, measure_memory=True)
for _ in range(self.args.iterations)])
[self.run(test, measure_memory=True,
num_iters=1, quantile=20)
for _ in range(self.args.independent_samples)])

def log_results(self, output, log_file=None):
"""Log output to `log_file`.
@@ -182,7 +187,7 @@ class BenchmarkDriver(object):
with open(log_file, 'w') as f:
f.write(output)

RESULT = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}'
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}'

def run_and_log(self, csv_console=True):
"""Run benchmarks and continuously log results to the console.
@@ -201,14 +206,15 @@ class BenchmarkDriver(object):
def console_log(values):
print(format(values))

console_log(['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', # header
'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'])

def result_values(r):
return map(str, [r.test_num, r.name, r.num_samples, r.min, r.max,
int(r.mean), int(r.sd), r.median, r.max_rss])

results = []
return map(str, [r.test_num, r.name, r.num_samples, r.min,
r.samples.q1, r.median, r.samples.q3, r.max,
r.max_rss])

header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)',
'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)']
console_log(header)
results = [header]
for test in self.tests:
result = result_values(self.run_independent_samples(test))
console_log(result)
@@ -290,6 +296,7 @@ class BenchmarkDoctor(object):
self._name_matches_capital_words_convention,
self._name_is_at_most_40_chars_long,
self._no_setup_overhead,
self._reasonable_setup_time,
self._optimized_runtime_in_range,
self._constant_memory_use
]
@@ -341,15 +348,20 @@ class BenchmarkDoctor(object):
for correction in [(setup / i) for i in [1, 2]]
] for result in i_series])

if 2500 < runtime:
log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else
threshold = 1000
if threshold < runtime:
log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else
BenchmarkDoctor.log_runtime.error)
caveat = '' if setup == 0 else ' (excluding the setup overhead)'
log("'%s' execution took at least %d μs%s.", name, runtime, caveat)
factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2))))

def factor(base):  # suitable divisor that's an integer power of base
return int(pow(base, math.ceil(
math.log(runtime / float(threshold), base))))

BenchmarkDoctor.log_runtime.info(
"Decrease the workload of '%s' by a factor of %d, "
"to be less than 2500 μs.", name, factor)
"Decrease the workload of '%s' by a factor of %d (%d), to be "
"less than %d μs.", name, factor(2), factor(10), threshold)

@staticmethod
def _setup_overhead(measurements):
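A quick worked example of the factor() suggestion in the hunk above (the numbers are hypothetical, not from this diff): for a runtime of 3200 μs against the new 1000 μs threshold, the doctor would recommend shrinking the workload by a factor of 4 (the next integer power of 2) or 10 (the next integer power of 10).

    import math

    def factor(base, runtime=3200, threshold=1000):
        # Smallest integer power of `base` that brings runtime under threshold.
        return int(pow(base, math.ceil(
            math.log(runtime / float(threshold), base))))

    print(factor(2))   # 4  -> "Decrease the workload ... by a factor of 4 (10)"
    print(factor(10))  # 10 -> 3200 / 4 = 800 μs, 3200 / 10 = 320 μs, both < 1000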
@@ -372,6 +384,17 @@ class BenchmarkDoctor(object):
'Move initialization of benchmark data to the `setUpFunction` '
'registered in `BenchmarkInfo`.')

@staticmethod
def _reasonable_setup_time(measurements):
setup = min([result.setup
for result in BenchmarkDoctor._select(measurements)])
if 200000 < setup: # 200 ms
BenchmarkDoctor.log_runtime.error(
"'%s' setup took at least %d μs.",
measurements['name'], setup)
BenchmarkDoctor.log_runtime.info(
'The `setUpFunction` should take no more than 200 ms.')

@staticmethod
def _constant_memory_use(measurements):
select = BenchmarkDoctor._select
@@ -603,7 +626,7 @@ def parse_args(args):
help='Run benchmarks and output results to stdout',
parents=[shared_benchmarks_parser])
run_parser.add_argument(
'-i', '--iterations',
'-i', '--independent-samples',
help='number of times to run each test (default: 1)',
type=positive_int, default=1)
run_parser.add_argument(
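For context, a small sketch (not part of this diff; the harness path and test name are placeholders) of the per-sample command that `_cmd_run` now assembles when `run_independent_samples` calls `run(test, measure_memory=True, num_iters=1, quantile=20)`. With `num_samples` defaulting to 0, no `--num-samples` flag is passed, so the harness picks the sample count itself.

    def cmd_for_independent_sample(test_harness, test):
        # Mirrors the flag logic of _cmd_run for the settings used by
        # run_independent_samples: 1 iteration, memory stats, 20-quantiles.
        cmd = [test_harness, test]
        cmd.append('--num-iters=1')   # exactly one iteration per sample
        cmd.append('--memory')        # measure_memory=True reports MAX_RSS
        cmd.append('--quantile=20')   # report ventiles instead of MEAN/SD
        cmd.append('--delta')         # encode quantiles as deltas
        return cmd

    print(cmd_for_independent_sample('./Benchmark_O', 'Ackermann'))
    # ['./Benchmark_O', 'Ackermann', '--num-iters=1', '--memory',
    #  '--quantile=20', '--delta']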
137 changes: 100 additions & 37 deletions benchmark/scripts/compare_perf_tests.py
@@ -51,6 +51,14 @@ def __repr__(self):
return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self)


class Yield(namedtuple('Yield', 'before_sample after')):
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.

`before_sample`: index of measurement taken just after returning from yield
`after`: time elapsed since the previous yield in microseconds (μs)
"""


class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.

@@ -69,7 +77,7 @@ def __init__(self, name, samples=None):
self.add(sample)

def __str__(self):
"""Text summary of benchmark statisctics."""
"""Text summary of benchmark statistics."""
return (
'{0.name!s} n={0.count!r} '
'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} '
@@ -211,31 +219,60 @@ class PerformanceTestResult(object):
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
or Benchmark_Driver).

It depends on the log format emitted by the test driver in the form:
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)

The last column, MAX_RSS, is emitted only for runs instrumented by the
Benchmark_Driver to measure rough memory use during the execution of the
benchmark.
It supports 2 log formats emitted by the test driver. Legacy format with
statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
And new quantiles format with variable number of columns:
#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between MIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
"""

def __init__(self, csv_row):
"""Initialize from a row with 8 or 9 columns with benchmark summary.
def __init__(self, csv_row, quantiles=False, memory=False, delta=False):
"""Initialize from a row of multiple columns with benchmark summary.

The row is an iterable, such as a row provided by the CSV parser.
"""
self.test_num = csv_row[0] # Ordinal number of the test
self.name = csv_row[1] # Name of the performance test
self.num_samples = ( # Number of measurement samples taken
int(csv_row[2]))
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
self.sd = float(csv_row[6]) # Standard Deviation (μs)
self.median = int(csv_row[7]) # Median runtime (μs)
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[8]) if len(csv_row) > 8 else None)
self.samples = None
self.test_num = csv_row[0] # Ordinal number of the test
self.name = csv_row[1] # Name of the performance test
self.num_samples = int(csv_row[2]) # Number of measurements taken

if quantiles: # Variable number of columns representing quantiles
runtimes = csv_row[3:-1] if memory else csv_row[3:]
if delta:
runtimes = [int(x) if x else 0 for x in runtimes]
runtimes = reduce(lambda l, x: l.append(l[-1] + x) or  # running
l if l else [x], runtimes, None) # total
num_values = len(runtimes)
if self.num_samples < num_values: # remove repeated samples
quantile = num_values - 1
qs = [float(i) / float(quantile) for i in range(0, num_values)]
indices = [max(0, int(ceil(self.num_samples * float(q))) - 1)
for q in qs]
runtimes = [runtimes[indices.index(i)]
for i in range(0, self.num_samples)]

self.samples = PerformanceTestSamples(
self.name,
[Sample(None, None, int(runtime)) for runtime in runtimes])
self.samples.exclude_outliers(top_only=True)
sams = self.samples
self.min, self.max, self.median, self.mean, self.sd = \
sams.min, sams.max, sams.median, sams.mean, sams.sd
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[-1]) if memory else None)
else: # Legacy format with statistics for normal distribution.
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
self.sd = float(csv_row[6]) # Standard Deviation (μs)
self.median = int(csv_row[7]) # Median runtime (μs)
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[8]) if len(csv_row) > 8 else None)
self.samples = None
self.yields = None
self.setup = None

def __repr__(self):
"""Short summary for debugging purposes."""
@@ -253,6 +290,7 @@ def merge(self, r):
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver.
"""
# Statistics
if self.samples and r.samples:
map(self.samples.add, r.samples.samples)
sams = self.samples
@@ -266,8 +304,14 @@ def merge(self, r):
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples)
self.num_samples += r.num_samples
self.max_rss = min(self.max_rss, r.max_rss)
self.median, self.sd = 0, 0
self.median, self.sd = None, None

# Metadata
def minimum(a, b): # work around None being less than everything
return (min(filter(lambda x: x is not None, [a, b])) if any([a, b])
else None)
self.max_rss = minimum(self.max_rss, r.max_rss)
self.setup = minimum(self.setup, r.setup)


class ResultComparison(object):
@@ -307,40 +351,48 @@ class LogParser(object):
def __init__(self):
"""Create instance of `LogParser`."""
self.results = []
self.quantiles, self.delta, self.memory = False, False, False
self._reset()

def _reset(self):
"""Reset parser to the default state for reading a new result."""
self.samples, self.num_iters = [], 1
self.max_rss, self.mem_pages = None, None
self.samples, self.yields, self.num_iters = [], [], 1
self.setup, self.max_rss, self.mem_pages = None, None, None
self.voluntary_cs, self.involuntary_cs = None, None

# Parse lines like this
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
results_re = re.compile(r'( *\d+[, \t]*[\w.]+[, \t]*' +
r'[, \t]*'.join([r'[\d.]+'] * 6) +
r'[, \t]*[\d.]*)') # optional MAX_RSS(B)
results_re = re.compile(
r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST
r'[, \t]+'.join([r'\d+'] * 2) + # at least 2...
r'(?:[, \t]+\d*)*)') # ...or more numeric columns

def _append_result(self, result):
columns = result.split(',')
if len(columns) < 8:
columns = result.split()
r = PerformanceTestResult(columns)
if self.max_rss:
r.max_rss = self.max_rss
r.mem_pages = self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = self.involuntary_cs
columns = result.split(',') if ',' in result else result.split()
r = PerformanceTestResult(
columns, quantiles=self.quantiles, memory=self.memory,
delta=self.delta)
r.setup = self.setup
r.max_rss = r.max_rss or self.max_rss
r.mem_pages = self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = self.involuntary_cs
if self.samples:
r.samples = PerformanceTestSamples(r.name, self.samples)
r.samples.exclude_outliers()
self.results.append(r)
r.yields = self.yields or None
self._reset()

def _store_memory_stats(self, max_rss, mem_pages):
self.max_rss = int(max_rss)
self.mem_pages = int(mem_pages)

def _configure_format(self, header):
self.quantiles = 'MEAN' not in header
self.memory = 'MAX_RSS' in header
self.delta = '𝚫' in header

# Regular expression and action to take when it matches the parsed line
state_actions = {
results_re: _append_result,
@@ -355,6 +407,17 @@ def _store_memory_stats(self, max_rss, mem_pages):
self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime)))),

re.compile(r'\s+SetUp (\d+)'):
(lambda self, setup: setattr(self, 'setup', int(setup))),

re.compile(r'\s+Yielding after ~(\d+) μs'):
(lambda self, since_last_yield:
self.yields.append(
Yield(len(self.samples), int(since_last_yield)))),

re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'):
_configure_format,

# Environmental statistics: memory usage and context switches
re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'):
_store_memory_stats,
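Finally, a short usage sketch of the reworked parser (assumptions: the log text and test name below are made up, the script sits next to compare_perf_tests.py, and `results_from_string` returns a dict of results keyed by test name, as used in `BenchmarkDriver.run` above). The header carries no MEAN, MAX_RSS or 𝚫 column, so `_configure_format` flags it as the plain quantile format.

    # -*- coding: utf-8 -*-
    from compare_perf_tests import LogParser

    log = u"""
    #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
    1,Ackermann,3,699,736,1108
    """

    results = LogParser().results_from_string(log)
    r = list(results.items())[0][1]  # a PerformanceTestResult
    # min/median/max come from the quantile columns (top outliers may be
    # excluded); max_rss is None because the header had no MAX_RSS column.
    print('{0} n={1} min={2} median={3} max={4} max_rss={5}'.format(
        r.name, r.num_samples, r.min, r.median, r.max, r.max_rss))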