diff --git a/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake b/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake index 1f84bb829d2ee..2ca6532648dae 100644 --- a/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake +++ b/benchmark/cmake/modules/AddSwiftBenchmarkSuite.cmake @@ -701,11 +701,11 @@ function(swift_benchmark_compile) COMMAND "${swift-bin-dir}/Benchmark_Driver" "run" "-o" "O" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs" "--swift-repo" "${SWIFT_SOURCE_DIR}" - "--iterations" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}" + "--independent-samples" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}" COMMAND "${swift-bin-dir}/Benchmark_Driver" "run" "-o" "Onone" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs" "--swift-repo" "${SWIFT_SOURCE_DIR}" - "--iterations" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}" + "--independent-samples" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}" COMMAND "${swift-bin-dir}/Benchmark_Driver" "compare" "--log-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs" "--swift-repo" "${SWIFT_SOURCE_DIR}" diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index adf182b4d0c20..3e9cf0fe8225a 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -133,20 +133,21 @@ class BenchmarkDriver(object): benchmarks.intersection(set(self.all_tests)).union(indexed_names))) def run(self, test, num_samples=None, num_iters=None, - verbose=None, measure_memory=False): + verbose=None, measure_memory=False, quantile=None): """Execute benchmark and gather results.""" - num_samples = num_samples or 1 + num_samples = num_samples or 0 num_iters = num_iters or 0 # automatically determine N to run for 1s cmd = self._cmd_run( - test, num_samples, num_iters, verbose, measure_memory) + test, num_samples, num_iters, verbose, measure_memory, quantile) output = self._invoke(cmd) result = self.parser.results_from_string(output).items()[0][1] return result - def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory): + def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory, + quantile): cmd = [self.test_harness, test] - if num_samples > 1: + if num_samples > 0: cmd.append('--num-samples={0}'.format(num_samples)) if num_iters > 0: cmd.append('--num-iters={0}'.format(num_iters)) @@ -154,6 +155,9 @@ class BenchmarkDriver(object): cmd.append('--verbose') if measure_memory: cmd.append('--memory') + if quantile: + cmd.append('--quantile={0}'.format(quantile)) + cmd.append('--delta') return cmd def run_independent_samples(self, test): @@ -166,8 +170,9 @@ class BenchmarkDriver(object): return a return reduce(merge_results, - [self.run(test, measure_memory=True) - for _ in range(self.args.iterations)]) + [self.run(test, measure_memory=True, + num_iters=1, quantile=20) + for _ in range(self.args.independent_samples)]) def log_results(self, output, log_file=None): """Log output to `log_file`. @@ -182,7 +187,7 @@ class BenchmarkDriver(object): with open(log_file, 'w') as f: f.write(output) - RESULT = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}' + RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}' def run_and_log(self, csv_console=True): """Run benchmarks and continuously log results to the console. 
@@ -201,14 +206,15 @@ class BenchmarkDriver(object): def console_log(values): print(format(values)) - console_log(['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', # header - 'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)']) - def result_values(r): - return map(str, [r.test_num, r.name, r.num_samples, r.min, r.max, - int(r.mean), int(r.sd), r.median, r.max_rss]) - - results = [] + return map(str, [r.test_num, r.name, r.num_samples, r.min, + r.samples.q1, r.median, r.samples.q3, r.max, + r.max_rss]) + + header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)', + 'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)'] + console_log(header) + results = [header] for test in self.tests: result = result_values(self.run_independent_samples(test)) console_log(result) @@ -290,6 +296,7 @@ class BenchmarkDoctor(object): self._name_matches_capital_words_convention, self._name_is_at_most_40_chars_long, self._no_setup_overhead, + self._reasonable_setup_time, self._optimized_runtime_in_range, self._constant_memory_use ] @@ -341,15 +348,20 @@ class BenchmarkDoctor(object): for correction in [(setup / i) for i in [1, 2]] ] for result in i_series]) - if 2500 < runtime: - log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else + threshold = 1000 + if threshold < runtime: + log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else BenchmarkDoctor.log_runtime.error) caveat = '' if setup == 0 else ' (excluding the setup overhead)' log("'%s' execution took at least %d μs%s.", name, runtime, caveat) - factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2)))) + + def factor(base): # suitable divisor that's an integer power of base + return int(pow(base, math.ceil( + math.log(runtime / float(threshold), base)))) + BenchmarkDoctor.log_runtime.info( - "Decrease the workload of '%s' by a factor of %d, " - "to be less than 2500 μs.", name, factor) + "Decrease the workload of '%s' by a factor of %d (%d), to be " + "less than %d μs.", name, factor(2), factor(10), threshold) @staticmethod def _setup_overhead(measurements): @@ -372,6 +384,17 @@ class BenchmarkDoctor(object): 'Move initialization of benchmark data to the `setUpFunction` ' 'registered in `BenchmarkInfo`.') + @staticmethod + def _reasonable_setup_time(measurements): + setup = min([result.setup + for result in BenchmarkDoctor._select(measurements)]) + if 200000 < setup: # 200 ms + BenchmarkDoctor.log_runtime.error( + "'%s' setup took at least %d μs.", + measurements['name'], setup) + BenchmarkDoctor.log_runtime.info( + 'The `setUpFunction` should take no more than 200 ms.') + @staticmethod def _constant_memory_use(measurements): select = BenchmarkDoctor._select @@ -603,7 +626,7 @@ def parse_args(args): help='Run benchmarks and output results to stdout', parents=[shared_benchmarks_parser]) run_parser.add_argument( - '-i', '--iterations', + '-i', '--independent-samples', help='number of times to run each test (default: 1)', type=positive_int, default=1) run_parser.add_argument( diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index fe3a84be6c18a..83f2265f2e8c1 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -51,6 +51,14 @@ def __repr__(self): return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) +class Yield(namedtuple('Yield', 'before_sample after')): + u"""Meta-measurement of when the Benchmark_X voluntarily yielded the process.
+ + `before_sample`: index of measurement taken just after returning from yield + `after`: time elapsed since the previous yield in microseconds (μs) + """ + + class PerformanceTestSamples(object): """Collection of runtime samples from the benchmark execution. @@ -69,7 +77,7 @@ def __init__(self, name, samples=None): self.add(sample) def __str__(self): - """Text summary of benchmark statisctics.""" + """Text summary of benchmark statistics.""" return ( '{0.name!s} n={0.count!r} ' 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' @@ -211,31 +219,60 @@ class PerformanceTestResult(object): Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize or Benchmark_Driver). - It depends on the log format emitted by the test driver in the form: - #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) - - The last column, MAX_RSS, is emitted only for runs instrumented by the - Benchmark_Driver to measure rough memory use during the execution of the - benchmark. + It supports 2 log formats emitted by the test driver. Legacy format with + statistics for normal distribution (MEAN, SD): + #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) + And the new quantiles format with a variable number of columns: + #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) + #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) + The number of columns between MIN and MAX depends on the test driver's + `--quantile` parameter. In both cases, the last column, MAX_RSS, is optional. """ - def __init__(self, csv_row): - """Initialize from a row with 8 or 9 columns with benchmark summary. + def __init__(self, csv_row, quantiles=False, memory=False, delta=False): + """Initialize from a row of multiple columns with benchmark summary. The row is an iterable, such as a row provided by the CSV parser.
""" - self.test_num = csv_row[0] # Ordinal number of the test - self.name = csv_row[1] # Name of the performance test - self.num_samples = ( # Number of measurement samples taken - int(csv_row[2])) - self.min = int(csv_row[3]) # Minimum runtime (μs) - self.max = int(csv_row[4]) # Maximum runtime (μs) - self.mean = float(csv_row[5]) # Mean (average) runtime (μs) - self.sd = float(csv_row[6]) # Standard Deviation (μs) - self.median = int(csv_row[7]) # Median runtime (μs) - self.max_rss = ( # Maximum Resident Set Size (B) - int(csv_row[8]) if len(csv_row) > 8 else None) - self.samples = None + self.test_num = csv_row[0] # Ordinal number of the test + self.name = csv_row[1] # Name of the performance test + self.num_samples = int(csv_row[2]) # Number of measurements taken + + if quantiles: # Variable number of columns representing quantiles + runtimes = csv_row[3:-1] if memory else csv_row[3:] + if delta: + runtimes = [int(x) if x else 0 for x in runtimes] + runtimes = reduce(lambda l, x: l.append(l[-1] + x) or # runnin + l if l else [x], runtimes, None) # total + num_values = len(runtimes) + if self.num_samples < num_values: # remove repeated samples + quantile = num_values - 1 + qs = [float(i) / float(quantile) for i in range(0, num_values)] + indices = [max(0, int(ceil(self.num_samples * float(q))) - 1) + for q in qs] + runtimes = [runtimes[indices.index(i)] + for i in range(0, self.num_samples)] + + self.samples = PerformanceTestSamples( + self.name, + [Sample(None, None, int(runtime)) for runtime in runtimes]) + self.samples.exclude_outliers(top_only=True) + sams = self.samples + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[-1]) if memory else None) + else: # Legacy format with statistics for normal distribution. + self.min = int(csv_row[3]) # Minimum runtime (μs) + self.max = int(csv_row[4]) # Maximum runtime (μs) + self.mean = float(csv_row[5]) # Mean (average) runtime (μs) + self.sd = float(csv_row[6]) # Standard Deviation (μs) + self.median = int(csv_row[7]) # Median runtime (μs) + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[8]) if len(csv_row) > 8 else None) + self.samples = None + self.yields = None + self.setup = None def __repr__(self): """Short summary for debugging purposes.""" @@ -253,6 +290,7 @@ def merge(self, r): The use case here is comparing test results parsed from concatenated log files from multiple runs of benchmark driver. 
""" + # Statistics if self.samples and r.samples: map(self.samples.add, r.samples.samples) sams = self.samples @@ -266,8 +304,14 @@ def merge(self, r): (self.mean * self.num_samples) + (r.mean * r.num_samples) ) / float(self.num_samples + r.num_samples) self.num_samples += r.num_samples - self.max_rss = min(self.max_rss, r.max_rss) - self.median, self.sd = 0, 0 + self.median, self.sd = None, None + + # Metadata + def minimum(a, b): # work around None being less than everything + return (min(filter(lambda x: x is not None, [a, b])) if any([a, b]) + else None) + self.max_rss = minimum(self.max_rss, r.max_rss) + self.setup = minimum(self.setup, r.setup) class ResultComparison(object): @@ -307,40 +351,48 @@ class LogParser(object): def __init__(self): """Create instance of `LogParser`.""" self.results = [] + self.quantiles, self.delta, self.memory = False, False, False self._reset() def _reset(self): """Reset parser to the default state for reading a new result.""" - self.samples, self.num_iters = [], 1 - self.max_rss, self.mem_pages = None, None + self.samples, self.yields, self.num_iters = [], [], 1 + self.setup, self.max_rss, self.mem_pages = None, None, None self.voluntary_cs, self.involuntary_cs = None, None # Parse lines like this # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) - results_re = re.compile(r'( *\d+[, \t]*[\w.]+[, \t]*' + - r'[, \t]*'.join([r'[\d.]+'] * 6) + - r'[, \t]*[\d.]*)') # optional MAX_RSS(B) + results_re = re.compile( + r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST + r'[, \t]+'.join([r'\d+'] * 2) + # at least 2... + r'(?:[, \t]+\d*)*)') # ...or more numeric columns def _append_result(self, result): - columns = result.split(',') - if len(columns) < 8: - columns = result.split() - r = PerformanceTestResult(columns) - if self.max_rss: - r.max_rss = self.max_rss - r.mem_pages = self.mem_pages - r.voluntary_cs = self.voluntary_cs - r.involuntary_cs = self.involuntary_cs + columns = result.split(',') if ',' in result else result.split() + r = PerformanceTestResult( + columns, quantiles=self.quantiles, memory=self.memory, + delta=self.delta) + r.setup = self.setup + r.max_rss = r.max_rss or self.max_rss + r.mem_pages = self.mem_pages + r.voluntary_cs = self.voluntary_cs + r.involuntary_cs = self.involuntary_cs if self.samples: r.samples = PerformanceTestSamples(r.name, self.samples) r.samples.exclude_outliers() self.results.append(r) + r.yields = self.yields or None self._reset() def _store_memory_stats(self, max_rss, mem_pages): self.max_rss = int(max_rss) self.mem_pages = int(mem_pages) + def _configure_format(self, header): + self.quantiles = 'MEAN' not in header + self.memory = 'MAX_RSS' in header + self.delta = '𝚫' in header + # Regular expression and action to take when it matches the parsed line state_actions = { results_re: _append_result, @@ -355,6 +407,17 @@ def _store_memory_stats(self, max_rss, mem_pages): self.samples.append( Sample(int(i), int(self.num_iters), int(runtime)))), + re.compile(r'\s+SetUp (\d+)'): + (lambda self, setup: setattr(self, 'setup', int(setup))), + + re.compile(r'\s+Yielding after ~(\d+) μs'): + (lambda self, since_last_yield: + self.yields.append( + Yield(len(self.samples), int(since_last_yield)))), + + re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'): + _configure_format, + # Environmental statistics: memory usage and context switches re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): _store_memory_stats, diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 
c4dd29192c036..f3adeb6fa7edd 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -96,15 +96,16 @@ def test_optimization_argument(self): "(choose from 'O', 'Onone', 'Osize')"], err.getvalue()) - def test_iterations(self): - self.assertEquals(parse_args(['run']).iterations, 1) - self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3) + def test_independent_samples(self): + self.assertEquals(parse_args(['run']).independent_samples, 1) + self.assertEquals(parse_args(['run', '-i', '3']).independent_samples, + 3) with captured_output() as (out, err): self.assertRaises(SystemExit, parse_args, ['run', '-i', '-3']) self.assert_contains( - ['error:', - "argument -i/--iterations: invalid positive_int value: '-3'"], + ['error:', "argument -i/--independent-samples: " + + "invalid positive_int value: '-3'"], err.getvalue()) def test_output_dir(self): @@ -278,30 +279,40 @@ def test_measure_memory(self): self.subprocess_mock.assert_called_with( ('/benchmarks/Benchmark_O', 'b', '--memory')) + def test_report_quantiles(self): + """Use delta compression for quantile reports.""" + self.driver.run('b', quantile=4) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta')) + def test_run_benchmark_independent_samples(self): - self.driver.args.iterations = 3 + """Extract up to 20 measurements from an independent run.""" + self.driver.args.independent_samples = 3 r = self.driver.run_independent_samples('b1') self.assertEquals(self.subprocess_mock.calls.count( - ('/benchmarks/Benchmark_O', 'b1', '--memory')), 3) + ('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory', + '--quantile=20', '--delta')), 3) self.assertEquals(r.num_samples, 3) # results are merged def test_run_and_log(self): def mock_run(test): self.assertEquals(test, 'b1') return PerformanceTestResult( - '3,b1,1,123,123,123,0,123,888'.split(',')) + '3,b1,5,101,1,1,1,1,888'.split(','), + quantiles=True, delta=True, memory=True) driver = BenchmarkDriver(tests=['b1'], args=Stub(output_dir=None)) driver.run_independent_samples = mock_run # patching with captured_output() as (out, _): log = driver.run_and_log() - csv_log = '3,b1,1,123,123,123,0,123,888\n' + header = '#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs),' +\ + 'MAX_RSS(B)\n' + csv_log = '3,b1,5,101,102,103,104,105,888\n' self.assertEquals(log, None) self.assertEquals( out.getvalue(), - '#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' + - 'MAX_RSS(B)\n' + + header + csv_log + '\n' + 'Total performance tests executed: 1\n') @@ -309,13 +320,13 @@ def mock_run(test): with captured_output() as (out, _): log = driver.run_and_log(csv_console=False) - self.assertEquals(log, csv_log) + self.assertEquals(log, header + csv_log) self.assertEquals( out.getvalue(), - ' # TEST SAMPLES MIN(μs) MAX(μs)' + - ' MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B)\n' + - ' 3 b1 1 123 123' + - ' 123 0 123 888\n' + + ' # TEST SAMPLES MIN(μs) Q1(μs)' + + ' MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n' + + ' 3 b1 5 101 102' + + ' 103 104 105 888\n' + '\n' + 'Total performance tests executed: 1\n') @@ -410,9 +421,9 @@ def test_no_prefix_for_base_logging(self): self.assertEquals(f.format(lr), 'INFO Hi!') -def _PTR(min=700, mem_pages=1000): +def _PTR(min=700, mem_pages=1000, setup=None): """Create PerformanceTestResult Stub.""" - return Stub(min=min, mem_pages=mem_pages) + return Stub(min=min, mem_pages=mem_pages, setup=setup) def _run(test, num_samples=None, num_iters=None, verbose=None, @@ -537,13 +548,18 
@@ def test_benchmark_name_is_at_most_40_chars_long(self): self.logs['info']) def test_benchmark_runtime_range(self): - """Optimized benchmark should run in less then 2500 μs. - - With runtimes less than 2500 μs there is better than 1:4 chance of - being interrupted in the middle of measurement due to elapsed 10 ms - quantum used by macos scheduler. - - Warn about longer runtime. Runtimes over half a second are an error. + """Optimized benchmark should run in less than 1000 μs. + + Even on a calm machine, a benchmark with a 2500 μs runtime has a 1:4 chance + of being interrupted in the middle of measurement due to the elapsed 10 ms + quantum used by the macOS scheduler. The Linux scheduler's quantum is 6 ms. + Having the driver yield the process before the 10 ms quantum elapses helped + a lot, but benchmarks with runtimes under 1 ms usually exhibit a strong + mode, which is best for accurate performance characterization. + To minimize the number of involuntary context switches that corrupt our + measurements, we should strive to stay in the microbenchmark range. + + Warn about longer runtime. Runtimes over 10 ms are an error. """ def measurements(name, runtime): return {'name': name, @@ -553,7 +569,7 @@ def measurements(name, runtime): with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) doctor.analyze(measurements('Cheetah', 200)) - doctor.analyze(measurements('Hare', 2501)) + doctor.analyze(measurements('Hare', 1001)) doctor.analyze(measurements('Tortoise', 500000)) doctor.analyze({'name': 'OverheadTurtle', 'OverheadTurtle O i1a': _PTR(min=800000), @@ -562,17 +578,17 @@ def measurements(name, runtime): self.assertIn('runtime: ', output) self.assertNotIn('Cheetah', output) - self.assert_contains(["'Hare' execution took at least 2501 μs."], + self.assert_contains(["'Hare' execution took at least 1001 μs."], self.logs['warning']) self.assert_contains( - ["Decrease the workload of 'Hare' by a factor of 2, " - "to be less than 2500 μs."], self.logs['info']) + ["Decrease the workload of 'Hare' by a factor of 2 (10), " + "to be less than 1000 μs."], self.logs['info']) self.assert_contains( ["'Tortoise' execution took at least 500000 μs."], self.logs['error']) self.assert_contains( - ["Decrease the workload of 'Tortoise' by a factor of 256, " - "to be less than 2500 μs."], self.logs['info']) + ["Decrease the workload of 'Tortoise' by a factor of 512 (1000), " + "to be less than 1000 μs."], self.logs['info']) self.assert_contains( ["'OverheadTurtle' execution took at least 600000 μs" " (excluding the setup overhead)."], @@ -605,6 +621,29 @@ def test_benchmark_has_no_significant_setup_overhead(self): ["Move initialization of benchmark data to the `setUpFunction` " "registered in `BenchmarkInfo`."], self.logs['info']) + def test_benchmark_setup_takes_reasonable_time(self): + """Setup < 200 ms (20% extra on top of the typical 1 s measurement).""" + with captured_output() as (out, _): + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) + doctor.analyze({ + 'name': 'NormalSetup', + 'NormalSetup O i1a': _PTR(setup=199999), + 'NormalSetup O i2a': _PTR(setup=200001)}) + doctor.analyze({ + 'name': 'LongSetup', + 'LongSetup O i1a': _PTR(setup=200001), + 'LongSetup O i2a': _PTR(setup=200002)}) + output = out.getvalue() + + self.assertIn('runtime: ', output) + self.assertNotIn('NormalSetup', output) + self.assert_contains( + ["'LongSetup' setup took at least 200001 μs."], + self.logs['error']) + self.assert_contains( + ["The `setUpFunction` should take no more than 200 ms."], + 
self.logs['info']) + def test_benchmark_has_constant_memory_use(self): """Benchmark's memory footprint must not vary with num-iters.""" with captured_output() as (out, _): diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index bf530e76a03b1..0f4ac29940ae1 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -204,6 +204,110 @@ def test_init(self): r = PerformanceTestResult(log_line.split(',')) self.assertEquals(r.max_rss, 10510336) + def test_init_quantiles(self): + # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) + log = '1,Ackermann,3,54383,54512,54601' + r = PerformanceTestResult(log.split(','), quantiles=True) + self.assertEquals(r.test_num, '1') + self.assertEquals(r.name, 'Ackermann') + self.assertEquals((r.num_samples, r.min, r.median, r.max), + (3, 54383, 54512, 54601)) + self.assertAlmostEquals(r.mean, 54498.67, places=2) + self.assertAlmostEquals(r.sd, 109.61, places=2) + self.assertEquals(r.samples.count, 3) + self.assertEquals(r.samples.num_samples, 3) + self.assertEquals([s.runtime for s in r.samples.all_samples], + [54383, 54512, 54601]) + + # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) + log = '1,Ackermann,3,54529,54760,55807,266240' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=True) + self.assertEquals((r.samples.count, r.max_rss), (3, 266240)) + # #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs) + log = '1,Ackermann,5,54570,54593,54644,57212,58304' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=False) + self.assertEquals((r.num_samples, r.min, r.median, r.max), + (5, 54570, 54644, 58304)) + self.assertEquals((r.samples.q1, r.samples.q3), (54593, 57212)) + self.assertEquals(r.samples.count, 5) + # #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) + log = '1,Ackermann,5,54686,54731,54774,55030,63466,270336' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=True) + self.assertEquals(r.samples.num_samples, 5) + self.assertEquals(r.samples.count, 4) # outlier was excluded + self.assertEquals(r.max_rss, 270336) + + def test_init_delta_quantiles(self): + # #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX + # 2-quantile from 2 samples in repeated min, when delta encoded, + # the difference is 0, which is omitted -- only the separator remains + log = '202,DropWhileArray,2,265,,22' + r = PerformanceTestResult(log.split(','), quantiles=True, delta=True) + self.assertEquals((r.num_samples, r.min, r.median, r.max), + (2, 265, 265, 287)) + self.assertEquals(r.samples.count, 2) + self.assertEquals(r.samples.num_samples, 2) + + def test_init_oversampled_quantiles(self): + """When num_samples is < quantile + 1, some of the measurements are + repeated in the report summary. Samples should contain only true + values, discarding the repeated artifacts from quantile estimation. 
+ + The test string is slightly massaged output of the following R script: + subsample <- function(x, q) { + quantile(1:x, probs=((0:(q-1))/(q-1)), type=1)} + tbl <- function(s) t(sapply(1:s, function(x) { + qs <- subsample(x, s); c(qs[1], diff(qs)) })) + sapply(c(3, 5, 11, 21), tbl) + """ + def validatePTR(deq): # construct from delta encoded quantiles string + deq = deq.split(',') + num_samples = deq.count('1') + r = PerformanceTestResult(['0', 'B', str(num_samples)] + deq, + quantiles=True, delta=True) + self.assertEquals(r.samples.num_samples, num_samples) + self.assertEquals([s.runtime for s in r.samples.all_samples], + range(1, num_samples + 1)) + + delta_encoded_quantiles = """ +1,, +1,,1 +1,,,, +1,,,1, +1,,1,1, +1,,1,1,1 +1,,,,,,,,,, +1,,,,,,1,,,, +1,,,,1,,,1,,, +1,,,1,,,1,,1,, +1,,,1,,1,,1,,1, +1,,1,,1,,1,1,,1, +1,,1,1,,1,1,,1,1, +1,,1,1,1,,1,1,1,1, +1,,1,1,1,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,1 +1,,,,,,,,,,,,,,,,,,,, +1,,,,,,,,,,,1,,,,,,,,, +1,,,,,,,1,,,,,,,1,,,,,, +1,,,,,,1,,,,,1,,,,,1,,,, +1,,,,,1,,,,1,,,,1,,,,1,,, +1,,,,1,,,1,,,,1,,,1,,,1,,, +1,,,1,,,1,,,1,,,1,,,1,,,1,, +1,,,1,,,1,,1,,,1,,1,,,1,,1,, +1,,,1,,1,,1,,1,,,1,,1,,1,,1,, +1,,,1,,1,,1,,1,,1,,1,,1,,1,,1, +1,,1,,1,,1,,1,,1,1,,1,,1,,1,,1, +1,,1,,1,,1,1,,1,,1,1,,1,,1,1,,1, +1,,1,,1,1,,1,1,,1,1,,1,1,,1,1,,1, +1,,1,1,,1,1,,1,1,,1,1,1,,1,1,,1,1, +1,,1,1,,1,1,1,,1,1,1,,1,1,1,,1,1,1, +1,,1,1,1,,1,1,1,1,,1,1,1,1,,1,1,1,1, +1,,1,1,1,1,1,,1,1,1,1,1,1,,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,,1,1,1,1,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1""" + map(validatePTR, delta_encoded_quantiles.split('\n')[1:]) + def test_repr(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' r = PerformanceTestResult(log_line.split(',')) @@ -214,29 +318,32 @@ def test_repr(self): ) def test_merge(self): - tests = """1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336 + tests = """ +1,AngryPhonebook,1,12045,12045,12045,0,12045 1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144 -1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n') +1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:] results = map(PerformanceTestResult, [line.split(',') for line in tests]) + results[2].setup = 9 + results[3].setup = 7 def as_tuple(r): return (r.num_samples, r.min, r.max, round(r.mean, 2), - r.sd, r.median, r.max_rss) + r.sd, r.median, r.max_rss, r.setup) r = results[0] self.assertEquals(as_tuple(r), - (1, 12045, 12045, 12045, 0, 12045, 10510336)) + (1, 12045, 12045, 12045, 0, 12045, None, None)) r.merge(results[1]) - self.assertEquals(as_tuple(r), # drops SD and median - (2, 12045, 12325, 12185, 0, 0, 10510336)) + self.assertEquals(as_tuple(r), # drops SD and median, +max_rss + (2, 12045, 12325, 12185, None, None, 10510336, None)) r.merge(results[2]) - self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS - (3, 11616, 12325, 11995.33, 0, 0, 10502144)) + self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS, +setup + (3, 11616, 12325, 11995.33, None, None, 10502144, 9)) r.merge(results[3]) - self.assertEquals(as_tuple(r), - (4, 11616, 12325, 12064, 0, 0, 10498048)) + self.assertEquals(as_tuple(r), # picks smaller of the setup values + (4, 11616, 12325, 12064, None, None, 10498048, 7)) class TestResultComparison(unittest.TestCase): @@ -336,7 +443,7 @@ def assert_report_contains(self, texts, report): class TestLogParser(unittest.TestCase): def test_parse_results_csv(self): - """Ignores header row, empty lines 
and Totals row.""" + """Ignores unknown lines, extracts data from supported formats.""" log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us) 34,BitCount,20,3,4,4,0,4 @@ -369,6 +476,41 @@ def test_parse_results_formatted_text(self): self.assertEquals(r.name, 'Array2D') self.assertEquals(r.max_rss, 20915200) + def test_parse_quantiles(self): + """Gathers samples from reported quantiles. Handles optional memory.""" + r = LogParser.results_from_string( + """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) +1,Ackermann,3,54383,54512,54601""")['Ackermann'] + self.assertEquals([s.runtime for s in r.samples.all_samples], + [54383, 54512, 54601]) + r = LogParser.results_from_string( + """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) +1,Ackermann,3,54529,54760,55807,266240""")['Ackermann'] + self.assertEquals([s.runtime for s in r.samples.all_samples], + [54529, 54760, 55807]) + self.assertEquals(r.max_rss, 266240) + + def test_parse_delta_quantiles(self): + r = LogParser.results_from_string( # 2-quantile aka. median + '#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,')['B'] + self.assertEquals( + (r.num_samples, r.min, r.median, r.max, r.samples.count), + (1, 101, 101, 101, 1)) + r = LogParser.results_from_string( + '#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1')['B'] + self.assertEquals( + (r.num_samples, r.min, r.median, r.max, r.samples.count), + (2, 101, 101, 102, 2)) + r = LogParser.results_from_string( # 20-quantiles aka. ventiles + '#,TEST,SAMPLES,MIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,' + + '𝚫V9,𝚫VA,𝚫VB,𝚫VC,𝚫VD,𝚫VE,𝚫VF,𝚫VG,𝚫VH,𝚫VI,𝚫VJ,𝚫MAX\n' + + '202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464' + )['DropWhileArray'] + self.assertEquals( + (r.num_samples, r.min, r.max, r.samples.count), + # last 3 ventiles were outliers and were excluded from the sample + (200, 214, 215, 18)) + def test_parse_results_verbose(self): """Parse multiple performance test results with 2 sample formats: single line for N = 1; two lines for N > 1. @@ -383,8 +525,11 @@ def test_parse_results_verbose(self): Sample 2,11467 1,AngryPhonebook,3,11467,13898,12392,1315,11812 Running Array2D for 3 samples. + SetUp 14444 Sample 0,369900 + Yielding after ~369918 μs Sample 1,381039 + Yielding after ~381039 μs Sample 2,371043 3,Array2D,3,369900,381039,373994,6127,371043 @@ -400,15 +545,21 @@ def test_parse_results_verbose(self): self.assertEquals(r.num_samples, r.samples.num_samples) self.assertEquals(results[0].samples.all_samples, [(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)]) + self.assertEquals(r.yields, None) r = results[1] self.assertEquals( (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), ('Array2D', 369900, 381039, 373994, 6127, 371043) ) + self.assertEquals(r.setup, 14444) self.assertEquals(r.num_samples, r.samples.num_samples) self.assertEquals(results[1].samples.all_samples, [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + yielded = r.yields[0] + self.assertEquals(yielded.before_sample, 1) + self.assertEquals(yielded.after, 369918) + self.assertEquals(r.yields, [(1, 369918), (2, 381039)]) def test_parse_environment_verbose(self): """Parse stats about environment in verbose mode.""" diff --git a/benchmark/utils/DriverUtils.swift b/benchmark/utils/DriverUtils.swift index 1d31d37881e01..e938ea2290025 100644 --- a/benchmark/utils/DriverUtils.swift +++ b/benchmark/utils/DriverUtils.swift @@ -74,7 +74,7 @@ struct TestConfig { let numIters: Int? /// The number of samples we should take of each test. - let numSamples: Int + let numSamples: Int? /// Quantiles to report in results. 
let quantile: Int? @@ -130,7 +130,9 @@ struct TestConfig { // Configure the command line argument parser let p = ArgumentParser(into: PartialTestConfig()) p.addArgument("--num-samples", \.numSamples, - help: "number of samples to take per benchmark; default: 1", + help: "number of samples to take per benchmark;\n" + + "default: 1 or auto-scaled to measure for\n" + + "`sample-time` if num-iters is also specified\n", parser: { UInt($0) }) p.addArgument("--num-iters", \.numIters, help: "number of iterations averaged in the sample;\n" + @@ -174,7 +176,7 @@ struct TestConfig { delim = c.delim ?? "," sampleTime = c.sampleTime ?? 1.0 numIters = c.numIters.map { Int($0) } - numSamples = Int(c.numSamples ?? 1) + numSamples = c.numSamples.map { Int($0) } quantile = c.quantile.map { Int($0) } delta = c.delta ?? false verbose = c.verbose ?? false @@ -200,7 +202,7 @@ struct TestConfig { let testList = tests.map({ $0.1.name }).joined(separator: ", ") print(""" --- CONFIG --- - NumSamples: \(numSamples) + NumSamples: \(numSamples ?? 0) Verbose: \(verbose) LogMemory: \(logMemory) SampleTime: \(sampleTime) @@ -453,10 +455,9 @@ final class TestRunner { logVerbose("Skipping unsupported benchmark \(test.name)!") return nil } - logVerbose("Running \(test.name) for \(c.numSamples) samples.") + logVerbose("Running \(test.name)") var samples: [Int] = [] - samples.reserveCapacity(c.numSamples) func addSample(_ time: Int) { logVerbose(" Sample \(samples.count),\(time)") @@ -464,7 +465,12 @@ final class TestRunner { } resetMeasurements() - test.setUpFunction?() + if let setUp = test.setUpFunction { + setUp() + stopMeasurement() + logVerbose(" SetUp \(lastSampleTime.microseconds)") + resetMeasurements() + } // Determine number of iterations for testFn to run for desired time. func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) { @@ -477,17 +483,26 @@ final class TestRunner { } } + // Determine the scale of measurements. Re-use the calibration result if + // it is just one measurement. + func calibrateMeasurements() -> Int { + let (numIters, oneIter) = iterationsPerSampleTime() + if numIters == 1 { addSample(oneIter) } + else { resetMeasurements() } // for accurate yielding reports + return numIters + } + let numIters = min( // Cap to prevent overflow on 32-bit systems when scaled Int.max / 10_000, // by the inner loop multiplier inside the `testFn`. - c.numIters ?? { - let (numIters, oneIter) = iterationsPerSampleTime() - if numIters == 1 { addSample(oneIter) } - else { resetMeasurements() } // for accurate yielding reports - return numIters - }()) + c.numIters ?? calibrateMeasurements()) + + let numSamples = c.numSamples ?? min(2000, // Cap the number of samples + c.numIters == nil ? 1 : calibrateMeasurements()) + samples.reserveCapacity(numSamples) + logVerbose(" Collecting \(numSamples) samples.") logVerbose(" Measuring with scale \(numIters).") - for _ in samples.count..
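
For reference, a minimal sketch (not part of the patch) of the arithmetic behind the delta-encoded quantile rows tested above: deltas are summed into a running total to recover absolute runtimes, and values repeated by quantile estimation are dropped when there are fewer samples than quantile columns, mirroring what PerformanceTestResult does with quantiles=True, delta=True. The helper names below are illustrative only.

# A minimal sketch, assuming the delta-encoded quantile format shown in the
# tests above (e.g. '202,DropWhileArray,2,265,,22'); helper names are
# illustrative and not part of the patch.
from math import ceil


def decode_delta_quantiles(columns):
    """Rebuild absolute runtimes from delta-encoded quantile columns.

    An empty column means a delta of 0, i.e. the previous value repeats.
    """
    runtimes, total = [], 0
    for column in columns:
        total += int(column) if column else 0
        runtimes.append(total)
    return runtimes


def collapse_repeated(runtimes, num_samples):
    """Drop values that quantile estimation repeated because there were
    fewer samples than quantile columns (same index math as the patch)."""
    num_values = len(runtimes)
    if num_samples >= num_values:
        return runtimes
    quantile = num_values - 1
    qs = [float(i) / quantile for i in range(num_values)]
    indices = [max(0, int(ceil(num_samples * q)) - 1) for q in qs]
    return [runtimes[indices.index(i)] for i in range(num_samples)]


row = '202,DropWhileArray,2,265,,22'.split(',')
num_samples = int(row[2])                        # 2 samples
runtimes = decode_delta_quantiles(row[3:])       # [265, 265, 287]
print(collapse_repeated(runtimes, num_samples))  # [265, 287]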