diff --git a/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake b/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake index 1f84bb829d2ee..2ca6532648dae 100644 --- a/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake +++ b/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake @@ -701,11 +701,11 @@ function(swift_benchmark_compile) COMMAND "${swift-bin-dir}/Benchmark_Driver" "run" "-o" "O" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs" "--swift-repo" "${SWIFT_SOURCE_DIR}" - "--iterations" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}" + "--independent-samples" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}" COMMAND "${swift-bin-dir}/Benchmark_Driver" "run" "-o" "Onone" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs" "--swift-repo" "${SWIFT_SOURCE_DIR}" - "--iterations" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}" + "--independent-samples" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}" COMMAND "${swift-bin-dir}/Benchmark_Driver" "compare" "--log-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs" "--swift-repo" "${SWIFT_SOURCE_DIR}" diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index adf182b4d0c20..3e9cf0fe8225a 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -133,20 +133,21 @@ class BenchmarkDriver(object): benchmarks.intersection(set(self.all_tests)).union(indexed_names))) def run(self, test, num_samples=None, num_iters=None, - verbose=None, measure_memory=False): + verbose=None, measure_memory=False, quantile=None): """Execute benchmark and gather results.""" - num_samples = num_samples or 1 + num_samples = num_samples or 0 num_iters = num_iters or 0 # automatically determine N to run for 1s cmd = self._cmd_run( - test, num_samples, num_iters, verbose, measure_memory) + test, num_samples, num_iters, verbose, measure_memory, quantile) output = self._invoke(cmd) result = self.parser.results_from_string(output).items()[0][1] return result - def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory): + def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory, + quantile): cmd = [self.test_harness, test] - if num_samples > 1: + if num_samples > 0: cmd.append('--num-samples={0}'.format(num_samples)) if num_iters > 0: cmd.append('--num-iters={0}'.format(num_iters)) @@ -154,6 +155,9 @@ class BenchmarkDriver(object): cmd.append('--verbose') if measure_memory: cmd.append('--memory') + if quantile: + cmd.append('--quantile={0}'.format(quantile)) + cmd.append('--delta') return cmd def run_independent_samples(self, test): @@ -166,8 +170,9 @@ class BenchmarkDriver(object): return a return reduce(merge_results, - [self.run(test, measure_memory=True) - for _ in range(self.args.iterations)]) + [self.run(test, measure_memory=True, + num_iters=1, quantile=20) + for _ in range(self.args.independent_samples)]) def log_results(self, output, log_file=None): """Log output to `log_file`. @@ -182,7 +187,7 @@ class BenchmarkDriver(object): with open(log_file, 'w') as f: f.write(output) - RESULT = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}' + RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}' def run_and_log(self, csv_console=True): """Run benchmarks and continuously log results to the console. 
@@ -201,14 +206,15 @@ class BenchmarkDriver(object): def console_log(values): print(format(values)) - console_log(['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', # header - 'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)']) - def result_values(r): - return map(str, [r.test_num, r.name, r.num_samples, r.min, r.max, - int(r.mean), int(r.sd), r.median, r.max_rss]) - - results = [] + return map(str, [r.test_num, r.name, r.num_samples, r.min, + r.samples.q1, r.median, r.samples.q3, r.max, + r.max_rss]) + + header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)', + 'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)'] + console_log(header) + results = [header] for test in self.tests: result = result_values(self.run_independent_samples(test)) console_log(result) @@ -290,6 +296,7 @@ class BenchmarkDoctor(object): self._name_matches_capital_words_convention, self._name_is_at_most_40_chars_long, self._no_setup_overhead, + self._reasonable_setup_time, self._optimized_runtime_in_range, self._constant_memory_use ] @@ -341,15 +348,20 @@ class BenchmarkDoctor(object): for correction in [(setup / i) for i in [1, 2]] ] for result in i_series]) - if 2500 < runtime: - log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else + threshold = 1000 + if threshold < runtime: + log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else BenchmarkDoctor.log_runtime.error) caveat = '' if setup == 0 else ' (excluding the setup overhead)' log("'%s' execution took at least %d μs%s.", name, runtime, caveat) - factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2)))) + + def factor(base): # suitable divisor that's an integer power of base + return int(pow(base, math.ceil( + math.log(runtime / float(threshold), base)))) + BenchmarkDoctor.log_runtime.info( - "Decrease the workload of '%s' by a factor of %d, " - "to be less than 2500 μs.", name, factor) + "Decrease the workload of '%s' by a factor of %d (%d), to be " + "less than %d μs.", name, factor(2), factor(10), threshold) @staticmethod def _setup_overhead(measurements): @@ -372,6 +384,17 @@ class BenchmarkDoctor(object): 'Move initialization of benchmark data to the `setUpFunction` ' 'registered in `BenchmarkInfo`.') + @staticmethod + def _reasonable_setup_time(measurements): + setup = min([result.setup + for result in BenchmarkDoctor._select(measurements)]) + if 200000 < setup: # 200 ms + BenchmarkDoctor.log_runtime.error( + "'%s' setup took at least %d μs.", + measurements['name'], setup) + BenchmarkDoctor.log_runtime.info( + 'The `setUpFunction` should take no more than 200 ms.') + @staticmethod def _constant_memory_use(measurements): select = BenchmarkDoctor._select @@ -603,7 +626,7 @@ def parse_args(args): help='Run benchmarks and output results to stdout', parents=[shared_benchmarks_parser]) run_parser.add_argument( - '-i', '--iterations', + '-i', '--independent-samples', help='number of times to run each test (default: 1)', type=positive_int, default=1) run_parser.add_argument( diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index fe3a84be6c18a..83f2265f2e8c1 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -51,6 +51,14 @@ def __repr__(self): return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) +class Yield(namedtuple('Yield', 'before_sample after')): + u"""Meta-measurement of when the Benchmark_X voluntarily yielded the process.
+ + `before_sample`: index of measurement taken just after returning from yield + `after`: time elapsed since the previous yield in microseconds (μs) + """ + + class PerformanceTestSamples(object): """Collection of runtime samples from the benchmark execution. @@ -69,7 +77,7 @@ def __init__(self, name, samples=None): self.add(sample) def __str__(self): - """Text summary of benchmark statisctics.""" + """Text summary of benchmark statistics.""" return ( '{0.name!s} n={0.count!r} ' 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' @@ -211,31 +219,60 @@ class PerformanceTestResult(object): Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize or Benchmark_Driver). - It depends on the log format emitted by the test driver in the form: - #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - - The last column, MAX_RSS, is emitted only for runs instrumented by the - Benchmark_Driver to measure rough memory use during the execution of the - benchmark. + It supports 2 log formats emitted by the test driver. Legacy format with + statistics for normal distribution (MEAN, SD): + #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) + And the new quantiles format with a variable number of columns: + #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) + #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) + The number of columns between MIN and MAX depends on the test driver's + `--quantile` parameter. In both cases, the last column, MAX_RSS, is optional. """ - def __init__(self, csv_row): - """Initialize from a row with 8 or 9 columns with benchmark summary. + def __init__(self, csv_row, quantiles=False, memory=False, delta=False): + """Initialize from a row of multiple columns with benchmark summary. The row is an iterable, such as a row provided by the CSV parser.
""" - self.test_num = csv_row[0] # Ordinal number of the test - self.name = csv_row[1] # Name of the performance test - self.num_samples = ( # Number of measurement samples taken - int(csv_row[2])) - self.min = int(csv_row[3]) # Minimum runtime (μs) - self.max = int(csv_row[4]) # Maximum runtime (μs) - self.mean = float(csv_row[5]) # Mean (average) runtime (μs) - self.sd = float(csv_row[6]) # Standard Deviation (μs) - self.median = int(csv_row[7]) # Median runtime (μs) - self.max_rss = ( # Maximum Resident Set Size (B) - int(csv_row[8]) if len(csv_row) > 8 else None) - self.samples = None + self.test_num = csv_row[0] # Ordinal number of the test + self.name = csv_row[1] # Name of the performance test + self.num_samples = int(csv_row[2]) # Number of measurements taken + + if quantiles: # Variable number of columns representing quantiles + runtimes = csv_row[3:-1] if memory else csv_row[3:] + if delta: + runtimes = [int(x) if x else 0 for x in runtimes] + runtimes = reduce(lambda l, x: l.append(l[-1] + x) or # runnin + l if l else [x], runtimes, None) # total + num_values = len(runtimes) + if self.num_samples < num_values: # remove repeated samples + quantile = num_values - 1 + qs = [float(i) / float(quantile) for i in range(0, num_values)] + indices = [max(0, int(ceil(self.num_samples * float(q))) - 1) + for q in qs] + runtimes = [runtimes[indices.index(i)] + for i in range(0, self.num_samples)] + + self.samples = PerformanceTestSamples( + self.name, + [Sample(None, None, int(runtime)) for runtime in runtimes]) + self.samples.exclude_outliers(top_only=True) + sams = self.samples + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[-1]) if memory else None) + else: # Legacy format with statistics for normal distribution. + self.min = int(csv_row[3]) # Minimum runtime (μs) + self.max = int(csv_row[4]) # Maximum runtime (μs) + self.mean = float(csv_row[5]) # Mean (average) runtime (μs) + self.sd = float(csv_row[6]) # Standard Deviation (μs) + self.median = int(csv_row[7]) # Median runtime (μs) + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[8]) if len(csv_row) > 8 else None) + self.samples = None + self.yields = None + self.setup = None def __repr__(self): """Short summary for debugging purposes.""" @@ -253,6 +290,7 @@ def merge(self, r): The use case here is comparing test results parsed from concatenated log files from multiple runs of benchmark driver. 
""" + # Statistics if self.samples and r.samples: map(self.samples.add, r.samples.samples) sams = self.samples @@ -266,8 +304,14 @@ def merge(self, r): (self.mean * self.num_samples) + (r.mean * r.num_samples) ) / float(self.num_samples + r.num_samples) self.num_samples += r.num_samples - self.max_rss = min(self.max_rss, r.max_rss) - self.median, self.sd = 0, 0 + self.median, self.sd = None, None + + # Metadata + def minimum(a, b): # work around None being less than everything + return (min(filter(lambda x: x is not None, [a, b])) if any([a, b]) + else None) + self.max_rss = minimum(self.max_rss, r.max_rss) + self.setup = minimum(self.setup, r.setup) class ResultComparison(object): @@ -307,40 +351,48 @@ class LogParser(object): def __init__(self): """Create instance of `LogParser`.""" self.results = [] + self.quantiles, self.delta, self.memory = False, False, False self._reset() def _reset(self): """Reset parser to the default state for reading a new result.""" - self.samples, self.num_iters = [], 1 - self.max_rss, self.mem_pages = None, None + self.samples, self.yields, self.num_iters = [], [], 1 + self.setup, self.max_rss, self.mem_pages = None, None, None self.voluntary_cs, self.involuntary_cs = None, None # Parse lines like this # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) - results_re = re.compile(r'( *\d+[, \t]*[\w.]+[, \t]*' + - r'[, \t]*'.join([r'[\d.]+'] * 6) + - r'[, \t]*[\d.]*)') # optional MAX_RSS(B) + results_re = re.compile( + r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST + r'[, \t]+'.join([r'\d+'] * 2) + # at least 2... + r'(?:[, \t]+\d*)*)') # ...or more numeric columns def _append_result(self, result): - columns = result.split(',') - if len(columns) < 8: - columns = result.split() - r = PerformanceTestResult(columns) - if self.max_rss: - r.max_rss = self.max_rss - r.mem_pages = self.mem_pages - r.voluntary_cs = self.voluntary_cs - r.involuntary_cs = self.involuntary_cs + columns = result.split(',') if ',' in result else result.split() + r = PerformanceTestResult( + columns, quantiles=self.quantiles, memory=self.memory, + delta=self.delta) + r.setup = self.setup + r.max_rss = r.max_rss or self.max_rss + r.mem_pages = self.mem_pages + r.voluntary_cs = self.voluntary_cs + r.involuntary_cs = self.involuntary_cs if self.samples: r.samples = PerformanceTestSamples(r.name, self.samples) r.samples.exclude_outliers() self.results.append(r) + r.yields = self.yields or None self._reset() def _store_memory_stats(self, max_rss, mem_pages): self.max_rss = int(max_rss) self.mem_pages = int(mem_pages) + def _configure_format(self, header): + self.quantiles = 'MEAN' not in header + self.memory = 'MAX_RSS' in header + self.delta = '𝚫' in header + # Regular expression and action to take when it matches the parsed line state_actions = { results_re: _append_result, @@ -355,6 +407,17 @@ def _store_memory_stats(self, max_rss, mem_pages): self.samples.append( Sample(int(i), int(self.num_iters), int(runtime)))), + re.compile(r'\s+SetUp (\d+)'): + (lambda self, setup: setattr(self, 'setup', int(setup))), + + re.compile(r'\s+Yielding after ~(\d+) μs'): + (lambda self, since_last_yield: + self.yields.append( + Yield(len(self.samples), int(since_last_yield)))), + + re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'): + _configure_format, + # Environmental statistics: memory usage and context switches re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): _store_memory_stats, diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 
c4dd29192c036..f3adeb6fa7edd 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -96,15 +96,16 @@ def test_optimization_argument(self): "(choose from 'O', 'Onone', 'Osize')"], err.getvalue()) - def test_iterations(self): - self.assertEquals(parse_args(['run']).iterations, 1) - self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3) + def test_independent_samples(self): + self.assertEquals(parse_args(['run']).independent_samples, 1) + self.assertEquals(parse_args(['run', '-i', '3']).independent_samples, + 3) with captured_output() as (out, err): self.assertRaises(SystemExit, parse_args, ['run', '-i', '-3']) self.assert_contains( - ['error:', - "argument -i/--iterations: invalid positive_int value: '-3'"], + ['error:', "argument -i/--independent-samples: " + + "invalid positive_int value: '-3'"], err.getvalue()) def test_output_dir(self): @@ -278,30 +279,40 @@ def test_measure_memory(self): self.subprocess_mock.assert_called_with( ('/benchmarks/Benchmark_O', 'b', '--memory')) + def test_report_quantiles(self): + """Use delta compression for quantile reports.""" + self.driver.run('b', quantile=4) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta')) + def test_run_benchmark_independent_samples(self): - self.driver.args.iterations = 3 + """Extract up to 20 measurements from an independent run.""" + self.driver.args.independent_samples = 3 r = self.driver.run_independent_samples('b1') self.assertEquals(self.subprocess_mock.calls.count( - ('/benchmarks/Benchmark_O', 'b1', '--memory')), 3) + ('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory', + '--quantile=20', '--delta')), 3) self.assertEquals(r.num_samples, 3) # results are merged def test_run_and_log(self): def mock_run(test): self.assertEquals(test, 'b1') return PerformanceTestResult( - '3,b1,1,123,123,123,0,123,888'.split(',')) + '3,b1,5,101,1,1,1,1,888'.split(','), + quantiles=True, delta=True, memory=True) driver = BenchmarkDriver(tests=['b1'], args=Stub(output_dir=None)) driver.run_independent_samples = mock_run # patching with captured_output() as (out, _): log = driver.run_and_log() - csv_log = '3,b1,1,123,123,123,0,123,888\n' + header = '#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs),' +\ + 'MAX_RSS(B)\n' + csv_log = '3,b1,5,101,102,103,104,105,888\n' self.assertEquals(log, None) self.assertEquals( out.getvalue(), - '#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' + - 'MAX_RSS(B)\n' + + header + csv_log + '\n' + 'Total performance tests executed: 1\n') @@ -309,13 +320,13 @@ def mock_run(test): with captured_output() as (out, _): log = driver.run_and_log(csv_console=False) - self.assertEquals(log, csv_log) + self.assertEquals(log, header + csv_log) self.assertEquals( out.getvalue(), - ' # TEST SAMPLES MIN(μs) MAX(μs)' + - ' MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B)\n' + - ' 3 b1 1 123 123' + - ' 123 0 123 888\n' + + ' # TEST SAMPLES MIN(μs) Q1(μs)' + + ' MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n' + + ' 3 b1 5 101 102' + + ' 103 104 105 888\n' + '\n' + 'Total performance tests executed: 1\n') @@ -410,9 +421,9 @@ def test_no_prefix_for_base_logging(self): self.assertEquals(f.format(lr), 'INFO Hi!') -def _PTR(min=700, mem_pages=1000): +def _PTR(min=700, mem_pages=1000, setup=None): """Create PerformanceTestResult Stub.""" - return Stub(min=min, mem_pages=mem_pages) + return Stub(min=min, mem_pages=mem_pages, setup=setup) def _run(test, num_samples=None, num_iters=None, verbose=None, @@ -537,13 +548,18 
@@ def test_benchmark_name_is_at_most_40_chars_long(self): self.logs['info']) def test_benchmark_runtime_range(self): - """Optimized benchmark should run in less then 2500 μs. - - With runtimes less than 2500 μs there is better than 1:4 chance of - being interrupted in the middle of measurement due to elapsed 10 ms - quantum used by macos scheduler. - - Warn about longer runtime. Runtimes over half a second are an error. + """Optimized benchmark should run in less than 1000 μs. + + Even on a calm machine, a benchmark with a 2500 μs runtime has a 1:4 chance + of being interrupted in the middle of measurement due to the elapsed 10 ms + quantum used by the macOS scheduler. The Linux scheduler's quantum is 6 ms. + Having the driver yield the process before the 10 ms quantum elapses helped + a lot, but benchmarks with runtimes under 1 ms usually exhibit a strong + mode, which is best for accurate performance characterization. + To minimize the number of involuntary context switches that corrupt our + measurements, we should strive to stay in the microbenchmark range. + + Warn about longer runtime. Runtimes over 10 ms are an error. """ def measurements(name, runtime): return {'name': name, @@ -553,7 +569,7 @@ def measurements(name, runtime): with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) doctor.analyze(measurements('Cheetah', 200)) - doctor.analyze(measurements('Hare', 2501)) + doctor.analyze(measurements('Hare', 1001)) doctor.analyze(measurements('Tortoise', 500000)) doctor.analyze({'name': 'OverheadTurtle', 'OverheadTurtle O i1a': _PTR(min=800000), @@ -562,17 +578,17 @@ def measurements(name, runtime): self.assertIn('runtime: ', output) self.assertNotIn('Cheetah', output) - self.assert_contains(["'Hare' execution took at least 2501 μs."], + self.assert_contains(["'Hare' execution took at least 1001 μs."], self.logs['warning']) self.assert_contains( - ["Decrease the workload of 'Hare' by a factor of 2, " - "to be less than 2500 μs."], self.logs['info']) + ["Decrease the workload of 'Hare' by a factor of 2 (10), " + "to be less than 1000 μs."], self.logs['info']) self.assert_contains( ["'Tortoise' execution took at least 500000 μs."], self.logs['error']) self.assert_contains( - ["Decrease the workload of 'Tortoise' by a factor of 256, " - "to be less than 2500 μs."], self.logs['info']) + ["Decrease the workload of 'Tortoise' by a factor of 512 (1000), " + "to be less than 1000 μs."], self.logs['info']) self.assert_contains( ["'OverheadTurtle' execution took at least 600000 μs" " (excluding the setup overhead)."], @@ -605,6 +621,29 @@ def test_benchmark_has_no_significant_setup_overhead(self): ["Move initialization of benchmark data to the `setUpFunction` " "registered in `BenchmarkInfo`."], self.logs['info']) + def test_benchmark_setup_takes_reasonable_time(self): + """Setup < 200 ms (20% extra on top of the typical 1 s measurement).""" + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + 'name': 'NormalSetup', + 'NormalSetup O i1a': _PTR(setup=199999), + 'NormalSetup O i2a': _PTR(setup=200001)}) + doctor.analyze({ + 'name': 'LongSetup', + 'LongSetup O i1a': _PTR(setup=200001), + 'LongSetup O i2a': _PTR(setup=200002)}) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('NormalSetup', output) + self.assert_contains( + ["'LongSetup' setup took at least 200001 μs."], + self.logs['error']) + self.assert_contains( + ["The `setUpFunction` should take no more than 200 ms."], + 
self.logs['info']) + def test_benchmark_has_constant_memory_use(self): """Benchmark's memory footprint must not vary with num-iters.""" with captured_output() as (out, _): diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index bf530e76a03b1..0f4ac29940ae1 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -204,6 +204,110 @@ def test_init(self): r = PerformanceTestResult(log_line.split(',')) self.assertEquals(r.max_rss, 10510336) + def test_init_quantiles(self): + # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) + log = '1,Ackermann,3,54383,54512,54601' + r = PerformanceTestResult(log.split(','), quantiles=True) + self.assertEquals(r.test_num, '1') + self.assertEquals(r.name, 'Ackermann') + self.assertEquals((r.num_samples, r.min, r.median, r.max), + (3, 54383, 54512, 54601)) + self.assertAlmostEquals(r.mean, 54498.67, places=2) + self.assertAlmostEquals(r.sd, 109.61, places=2) + self.assertEquals(r.samples.count, 3) + self.assertEquals(r.samples.num_samples, 3) + self.assertEquals([s.runtime for s in r.samples.all_samples], + [54383, 54512, 54601]) + + # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) + log = '1,Ackermann,3,54529,54760,55807,266240' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=True) + self.assertEquals((r.samples.count, r.max_rss), (3, 266240)) + # #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs) + log = '1,Ackermann,5,54570,54593,54644,57212,58304' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=False) + self.assertEquals((r.num_samples, r.min, r.median, r.max), + (5, 54570, 54644, 58304)) + self.assertEquals((r.samples.q1, r.samples.q3), (54593, 57212)) + self.assertEquals(r.samples.count, 5) + # #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) + log = '1,Ackermann,5,54686,54731,54774,55030,63466,270336' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=True) + self.assertEquals(r.samples.num_samples, 5) + self.assertEquals(r.samples.count, 4) # outlier was excluded + self.assertEquals(r.max_rss, 270336) + + def test_init_delta_quantiles(self): + # #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX + # 2-quantile from 2 samples in repeated min, when delta encoded, + # the difference is 0, which is omitted -- only the separator remains + log = '202,DropWhileArray,2,265,,22' + r = PerformanceTestResult(log.split(','), quantiles=True, delta=True) + self.assertEquals((r.num_samples, r.min, r.median, r.max), + (2, 265, 265, 287)) + self.assertEquals(r.samples.count, 2) + self.assertEquals(r.samples.num_samples, 2) + + def test_init_oversampled_quantiles(self): + """When num_samples is < quantile + 1, some of the measurements are + repeated in the report summary. Samples should contain only true + values, discarding the repeated artifacts from quantile estimation. 
+ + The test string is slightly massaged output of the following R script: + subsample <- function(x, q) { + quantile(1:x, probs=((0:(q-1))/(q-1)), type=1)} + tbl <- function(s) t(sapply(1:s, function(x) { + qs <- subsample(x, s); c(qs[1], diff(qs)) })) + sapply(c(3, 5, 11, 21), tbl) + """ + def validatePTR(deq): # construct from delta encoded quantiles string + deq = deq.split(',') + num_samples = deq.count('1') + r = PerformanceTestResult(['0', 'B', str(num_samples)] + deq, + quantiles=True, delta=True) + self.assertEquals(r.samples.num_samples, num_samples) + self.assertEquals([s.runtime for s in r.samples.all_samples], + range(1, num_samples + 1)) + + delta_encoded_quantiles = """ +1,, +1,,1 +1,,,, +1,,,1, +1,,1,1, +1,,1,1,1 +1,,,,,,,,,, +1,,,,,,1,,,, +1,,,,1,,,1,,, +1,,,1,,,1,,1,, +1,,,1,,1,,1,,1, +1,,1,,1,,1,1,,1, +1,,1,1,,1,1,,1,1, +1,,1,1,1,,1,1,1,1, +1,,1,1,1,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,1 +1,,,,,,,,,,,,,,,,,,,, +1,,,,,,,,,,,1,,,,,,,,, +1,,,,,,,1,,,,,,,1,,,,,, +1,,,,,,1,,,,,1,,,,,1,,,, +1,,,,,1,,,,1,,,,1,,,,1,,, +1,,,,1,,,1,,,,1,,,1,,,1,,, +1,,,1,,,1,,,1,,,1,,,1,,,1,, +1,,,1,,,1,,1,,,1,,1,,,1,,1,, +1,,,1,,1,,1,,1,,,1,,1,,1,,1,, +1,,,1,,1,,1,,1,,1,,1,,1,,1,,1, +1,,1,,1,,1,,1,,1,1,,1,,1,,1,,1, +1,,1,,1,,1,1,,1,,1,1,,1,,1,1,,1, +1,,1,,1,1,,1,1,,1,1,,1,1,,1,1,,1, +1,,1,1,,1,1,,1,1,,1,1,1,,1,1,,1,1, +1,,1,1,,1,1,1,,1,1,1,,1,1,1,,1,1,1, +1,,1,1,1,,1,1,1,1,,1,1,1,1,,1,1,1,1, +1,,1,1,1,1,1,,1,1,1,1,1,1,,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,,1,1,1,1,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1""" + map(validatePTR, delta_encoded_quantiles.split('\n')[1:]) + def test_repr(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' r = PerformanceTestResult(log_line.split(',')) @@ -214,29 +318,32 @@ def test_repr(self): ) def test_merge(self): - tests = """1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336 + tests = """ +1,AngryPhonebook,1,12045,12045,12045,0,12045 1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144 -1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n') +1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:] results = map(PerformanceTestResult, [line.split(',') for line in tests]) + results[2].setup = 9 + results[3].setup = 7 def as_tuple(r): return (r.num_samples, r.min, r.max, round(r.mean, 2), - r.sd, r.median, r.max_rss) + r.sd, r.median, r.max_rss, r.setup) r = results[0] self.assertEquals(as_tuple(r), - (1, 12045, 12045, 12045, 0, 12045, 10510336)) + (1, 12045, 12045, 12045, 0, 12045, None, None)) r.merge(results[1]) - self.assertEquals(as_tuple(r), # drops SD and median - (2, 12045, 12325, 12185, 0, 0, 10510336)) + self.assertEquals(as_tuple(r), # drops SD and median, +max_rss + (2, 12045, 12325, 12185, None, None, 10510336, None)) r.merge(results[2]) - self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS - (3, 11616, 12325, 11995.33, 0, 0, 10502144)) + self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS, +setup + (3, 11616, 12325, 11995.33, None, None, 10502144, 9)) r.merge(results[3]) - self.assertEquals(as_tuple(r), - (4, 11616, 12325, 12064, 0, 0, 10498048)) + self.assertEquals(as_tuple(r), # picks smaller of the setup values + (4, 11616, 12325, 12064, None, None, 10498048, 7)) class TestResultComparison(unittest.TestCase): @@ -336,7 +443,7 @@ def assert_report_contains(self, texts, report): class TestLogParser(unittest.TestCase): def test_parse_results_csv(self): - """Ignores header row, empty lines 
and Totals row.""" + """Ignores unknown lines, extracts data from supported formats.""" log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) 34,BitCount,20,3,4,4,0,4 @@ -369,6 +476,41 @@ def test_parse_results_formatted_text(self): self.assertEquals(r.name, 'Array2D') self.assertEquals(r.max_rss, 20915200) + def test_parse_quantiles(self): + """Gathers samples from reported quantiles. Handles optional memory.""" + r = LogParser.results_from_string( + """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) +1,Ackermann,3,54383,54512,54601""")['Ackermann'] + self.assertEquals([s.runtime for s in r.samples.all_samples], + [54383, 54512, 54601]) + r = LogParser.results_from_string( + """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) +1,Ackermann,3,54529,54760,55807,266240""")['Ackermann'] + self.assertEquals([s.runtime for s in r.samples.all_samples], + [54529, 54760, 55807]) + self.assertEquals(r.max_rss, 266240) + + def test_parse_delta_quantiles(self): + r = LogParser.results_from_string( # 2-quantile aka. median + '#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,')['B'] + self.assertEquals( + (r.num_samples, r.min, r.median, r.max, r.samples.count), + (1, 101, 101, 101, 1)) + r = LogParser.results_from_string( + '#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1')['B'] + self.assertEquals( + (r.num_samples, r.min, r.median, r.max, r.samples.count), + (2, 101, 101, 102, 2)) + r = LogParser.results_from_string( # 20-quantiles aka. ventiles + '#,TEST,SAMPLES,MIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,' + + '𝚫V9,𝚫VA,𝚫VB,𝚫VC,𝚫VD,𝚫VE,𝚫VF,𝚫VG,𝚫VH,𝚫VI,𝚫VJ,𝚫MAX\n' + + '202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464' + )['DropWhileArray'] + self.assertEquals( + (r.num_samples, r.min, r.max, r.samples.count), + # last 3 ventiles were outliers and were excluded from the sample + (200, 214, 215, 18)) + def test_parse_results_verbose(self): """Parse multiple performance test results with 2 sample formats: single line for N = 1; two lines for N > 1. @@ -383,8 +525,11 @@ def test_parse_results_verbose(self): Sample 2,11467 1,AngryPhonebook,3,11467,13898,12392,1315,11812 Running Array2D for 3 samples. + SetUp 14444 Sample 0,369900 + Yielding after ~369918 μs Sample 1,381039 + Yielding after ~381039 μs Sample 2,371043 3,Array2D,3,369900,381039,373994,6127,371043 @@ -400,15 +545,21 @@ def test_parse_results_verbose(self): self.assertEquals(r.num_samples, r.samples.num_samples) self.assertEquals(results[0].samples.all_samples, [(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)]) + self.assertEquals(r.yields, None) r = results[1] self.assertEquals( (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), ('Array2D', 369900, 381039, 373994, 6127, 371043) ) + self.assertEquals(r.setup, 14444) self.assertEquals(r.num_samples, r.samples.num_samples) self.assertEquals(results[1].samples.all_samples, [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + yielded = r.yields[0] + self.assertEquals(yielded.before_sample, 1) + self.assertEquals(yielded.after, 369918) + self.assertEquals(r.yields, [(1, 369918), (2, 381039)]) def test_parse_environment_verbose(self): """Parse stats about environment in verbose mode.""" diff --git a/benchmark/utils/DriverUtils.swift b/benchmark/utils/DriverUtils.swift index 1d31d37881e01..e938ea2290025 100644 --- a/benchmark/utils/DriverUtils.swift +++ b/benchmark/utils/DriverUtils.swift @@ -74,7 +74,7 @@ struct TestConfig { let numIters: Int? /// The number of samples we should take of each test. - let numSamples: Int + let numSamples: Int? /// Quantiles to report in results. 
let quantile: Int? @@ -130,7 +130,9 @@ struct TestConfig { // Configure the command line argument parser let p = ArgumentParser(into: PartialTestConfig()) p.addArgument("--num-samples", \.numSamples, - help: "number of samples to take per benchmark; default: 1", + help: "number of samples to take per benchmark;\n" + + "default: 1 or auto-scaled to measure for\n" + + "`sample-time` if num-iters is also specified\n", parser: { UInt($0) }) p.addArgument("--num-iters", \.numIters, help: "number of iterations averaged in the sample;\n" + @@ -174,7 +176,7 @@ struct TestConfig { delim = c.delim ?? "," sampleTime = c.sampleTime ?? 1.0 numIters = c.numIters.map { Int($0) } - numSamples = Int(c.numSamples ?? 1) + numSamples = c.numSamples.map { Int($0) } quantile = c.quantile.map { Int($0) } delta = c.delta ?? false verbose = c.verbose ?? false @@ -200,7 +202,7 @@ struct TestConfig { let testList = tests.map({ $0.1.name }).joined(separator: ", ") print(""" --- CONFIG --- - NumSamples: \(numSamples) + NumSamples: \(numSamples ?? 0) Verbose: \(verbose) LogMemory: \(logMemory) SampleTime: \(sampleTime) @@ -453,10 +455,9 @@ final class TestRunner { logVerbose("Skipping unsupported benchmark \(test.name)!") return nil } - logVerbose("Running \(test.name) for \(c.numSamples) samples.") + logVerbose("Running \(test.name)") var samples: [Int] = [] - samples.reserveCapacity(c.numSamples) func addSample(_ time: Int) { logVerbose(" Sample \(samples.count),\(time)") @@ -464,7 +465,12 @@ final class TestRunner { } resetMeasurements() - test.setUpFunction?() + if let setUp = test.setUpFunction { + setUp() + stopMeasurement() + logVerbose(" SetUp \(lastSampleTime.microseconds)") + resetMeasurements() + } // Determine number of iterations for testFn to run for desired time. func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) { @@ -477,17 +483,26 @@ final class TestRunner { } } + // Determine the scale of measurements. Re-use the calibration result if + // it is just one measurement. + func calibrateMeasurements() -> Int { + let (numIters, oneIter) = iterationsPerSampleTime() + if numIters == 1 { addSample(oneIter) } + else { resetMeasurements() } // for accurate yielding reports + return numIters + } + let numIters = min( // Cap to prevent overflow on 32-bit systems when scaled Int.max / 10_000, // by the inner loop multiplier inside the `testFn`. - c.numIters ?? { - let (numIters, oneIter) = iterationsPerSampleTime() - if numIters == 1 { addSample(oneIter) } - else { resetMeasurements() } // for accurate yielding reports - return numIters - }()) + c.numIters ?? calibrateMeasurements()) + + let numSamples = c.numSamples ?? min(2000, // Cap the number of samples + c.numIters == nil ? 1 : calibrateMeasurements()) + samples.reserveCapacity(numSamples) + logVerbose(" Collecting \(numSamples) samples.") logVerbose(" Measuring with scale \(numIters).") - for _ in samples.count..
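
For reference, a minimal sketch (not part of the patch) of the arithmetic behind the delta-encoded quantile rows tested above: deltas are summed into a running total to recover absolute runtimes, and values repeated by quantile estimation are dropped when there are fewer samples than quantile columns, mirroring what PerformanceTestResult does with quantiles=True, delta=True. The helper names below are illustrative only.

# A minimal sketch, assuming the delta-encoded quantile format shown in the
# tests above (e.g. '202,DropWhileArray,2,265,,22'); helper names are
# illustrative and not part of the patch.
from math import ceil


def decode_delta_quantiles(columns):
    """Rebuild absolute runtimes from delta-encoded quantile columns.

    An empty column means a delta of 0, i.e. the previous value repeats.
    """
    runtimes, total = [], 0
    for column in columns:
        total += int(column) if column else 0
        runtimes.append(total)
    return runtimes


def collapse_repeated(runtimes, num_samples):
    """Drop values that quantile estimation repeated because there were
    fewer samples than quantile columns (same index math as the patch)."""
    num_values = len(runtimes)
    if num_samples >= num_values:
        return runtimes
    quantile = num_values - 1
    qs = [float(i) / quantile for i in range(num_values)]
    indices = [max(0, int(ceil(num_samples * q)) - 1) for q in qs]
    return [runtimes[indices.index(i)] for i in range(num_samples)]


row = '202,DropWhileArray,2,265,,22'.split(',')
num_samples = int(row[2])                        # 2 samples
runtimes = decode_delta_quantiles(row[3:])       # [265, 265, 287]
print(collapse_repeated(runtimes, num_samples))  # [265, 287]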