Skip to content

Commit

Permalink
Made multiple improvements to summarize script
Browse files Browse the repository at this point in the history
The following features were added:

   * Ability to summarize more than one file
   * Calculate standard deviation
   * Print summary in table
  • Loading branch information
kyleknap committed Aug 23, 2016
1 parent 1791893 commit f6417aa
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 25 deletions.
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,2 +1,3 @@
-r requirements-test.txt
psutil>=4.1.0,<5.0.0
tabulate==0.7.5
158 changes: 133 additions & 25 deletions scripts/performance/summarize
Expand Up @@ -12,14 +12,45 @@ Run this script with::
And that should output::
Total time: 1.810 seconds
Max memory: 114.0 MiB
Max cpu: 208.3 percent
Average memory: 67.3 MiB
Average cpu: 140.5 percent
+-----------------------+----------+----------------------+
| Metric | Mean | Standard Deviation |
+=======================+==========+======================+
| Total Time (seconds) | 1.200 | 0.0 |
+-----------------------+----------+----------------------+
| Maximum Memory | 42.3 MiB | 0 Bytes |
+-----------------------+----------+----------------------+
| Maximum CPU (percent) | 88.1 | 0.0 |
+-----------------------+----------+----------------------+
| Average Memory | 33.9 MiB | 0 Bytes |
+-----------------------+----------+----------------------+
| Average CPU (percent) | 30.5 | 0.0 |
+-----------------------+----------+----------------------+
The script can also be run with multiple files:
./summarize -f performance.csv performance-2.csv
And will have a similar output:
+-----------------------+----------+----------------------+
| Metric | Mean | Standard Deviation |
+=======================+==========+======================+
| Total Time (seconds) | 1.155 | 0.0449999570847 |
+-----------------------+----------+----------------------+
| Maximum Memory | 42.5 MiB | 110.0 KiB |
+-----------------------+----------+----------------------+
| Maximum CPU (percent) | 94.5 | 6.45 |
+-----------------------+----------+----------------------+
| Average Memory | 35.6 MiB | 1.7 MiB |
+-----------------------+----------+----------------------+
| Average CPU (percent) | 27.5 | 3.03068181818 |
+-----------------------+----------+----------------------+
"""
import argparse
import csv
from math import sqrt

from tabulate import tabulate


def human_readable_size(value):
Expand Down Expand Up @@ -51,8 +82,16 @@ class Summarizer(object):
self._start_time = None
self._end_time = None
self._totals = {
'time': [],
'average_memory': [],
'average_cpu': [],
'max_memory': [],
'max_cpu': [],
}
self._averages = {
'memory': 0.0,
'cpu': 0.0
'cpu': 0.0,

}
self._maximums = {
'memory': 0.0,
Expand All @@ -61,43 +100,90 @@ class Summarizer(object):

@property
def total_time(self):
    """Mean total run time in seconds, averaged across all processed files.

    The stale pre-change return (end_time - start_time) that shadowed this
    implementation has been removed.
    """
    return self._average_across_all_files('time')

@property
def max_cpu(self):
    """Mean of the per-file maximum CPU usage (percent) across all files.

    The stale pre-change return of self._maximums['cpu'] has been removed.
    """
    return self._average_across_all_files('max_cpu')

@property
def max_memory(self):
    """Mean of the per-file maximum memory usage, as a human readable string.

    The stale pre-change return of self._maximums['memory'] has been removed.
    """
    return human_readable_size(
        self._average_across_all_files('max_memory'))

@property
def average_cpu(self):
    """Mean of the per-file average CPU usage (percent) across all files.

    The stale pre-change call to self._average('cpu') has been removed.
    """
    return self._average_across_all_files('average_cpu')

@property
def average_memory(self):
    """Mean of the per-file average memory usage, as a human readable string.

    The stale pre-change call to self._average('memory') has been removed.
    """
    return human_readable_size(
        self._average_across_all_files('average_memory'))

@property
def std_dev_total_time(self):
    """Standard deviation of the total run time across all files."""
    deviation = self._standard_deviation_across_all_files('time')
    return deviation

@property
def std_dev_max_cpu(self):
    """Standard deviation of per-file maximum CPU usage (percent)."""
    deviation = self._standard_deviation_across_all_files('max_cpu')
    return deviation

@property
def std_dev_max_memory(self):
    """Standard deviation of per-file maximum memory, human readable."""
    deviation = self._standard_deviation_across_all_files('max_memory')
    return human_readable_size(deviation)

@property
def std_dev_average_cpu(self):
    """Standard deviation of per-file average CPU usage (percent)."""
    deviation = self._standard_deviation_across_all_files('average_cpu')
    return deviation

@property
def std_dev_average_memory(self):
    """Standard deviation of per-file average memory, human readable."""
    deviation = self._standard_deviation_across_all_files('average_memory')
    return human_readable_size(deviation)

def _average(self, name):
return self._totals[name]/self._num_rows
def _average_across_all_files(self, name):
return sum(self._totals[name])/len(self._totals[name])

def _standard_deviation_across_all_files(self, name):
mean = self._average_across_all_files(name)
differences = [total - mean for total in self._totals[name]]
sq_differences = [difference ** 2 for difference in differences]
return sqrt(sum(sq_differences)/len(self._totals[name]))

def summarize(self):
    """Print the processed data as a grid table: one row per metric,
    with its mean and standard deviation across all processed files.

    The stale pre-change ``print('Total time: ...')`` lines that preceded
    the table code have been removed.
    """
    table = [
        ['Total Time (seconds)', '%.3f' % self.total_time,
         self.std_dev_total_time],
        ['Maximum Memory', self.max_memory, self.std_dev_max_memory],
        ['Maximum CPU (percent)', '%.1f' % self.max_cpu,
         self.std_dev_max_cpu],
        ['Average Memory', self.average_memory,
         self.std_dev_average_memory],
        ['Average CPU (percent)', '%.1f' % self.average_cpu,
         self.std_dev_average_cpu],
    ]
    print(
        tabulate(
            table, headers=['Metric', 'Mean', 'Standard Deviation'],
            tablefmt="grid"
        )
    )

def process(self, args):
    """Process the data from every CSV file named on the command line.

    :param args: parsed argparse namespace carrying ``benchmark_files``,
        a list of CSV file paths.

    The stale pre-change ``with open(args.benchmark_file, 'rb')`` line
    (single-file version) has been removed.
    """
    for benchmark_file in args.benchmark_files:
        self.process_individual_file(benchmark_file)

def process_individual_file(self, benchmark_file):
    """Process one benchmark CSV file and fold its results into the
    cross-file totals.

    :param benchmark_file: path to a CSV file produced by the benchmark
        script.

    The duplicated ``self._end_time = self._get_time(row)`` line (diff
    residue) has been collapsed to a single assignment.
    """
    # NOTE(review): 'rb' + csv.reader is the Python 2 idiom; under
    # Python 3 this needs open(benchmark_file, 'r', newline='') --
    # confirm the target interpreter before changing.
    with open(benchmark_file, 'rb') as f:
        reader = csv.reader(f)
        # Process each row from the CSV file
        for row in reader:
            self.process_data_row(row)
        # The last row seen marks the end of the run.
        # NOTE(review): an empty CSV leaves `row` unbound -> NameError.
        self._end_time = self._get_time(row)
    self._finalize_processed_data_for_file()

def process_data_row(self, row):
# If the row is the first row collect the start time.
Expand All @@ -112,14 +198,31 @@ class Summarizer(object):
index = self.DATA_INDEX_IN_ROW[name]
# Get the data point.
data_point = float(row[index])
self._add_to_total(name, data_point)
self._add_to_average(name, data_point)
self._account_for_maximum(name, data_point)

def _finalize_processed_data_for_file(self):
# Add numbers to the total, which keeps track of data over
# all files provided.
self._totals['time'].append(self._end_time - self._start_time)
self._totals['max_cpu'].append(self._maximums['cpu'])
self._totals['max_memory'].append(self._maximums['memory'])
self._totals['average_cpu'].append(
self._averages['cpu']/self._num_rows)
self._totals['average_memory'].append(
self._averages['memory']/self._num_rows)

# Reset some of the data needed to be tracked for each specific
# file.
self._num_rows = 0
self._maximums = self._maximums.fromkeys(self._maximums, 0.0)
self._averages = self._averages.fromkeys(self._averages, 0.0)

def _get_time(self, row):
return float(row[self.DATA_INDEX_IN_ROW['time']])

def _add_to_total(self, name, data_point):
self._totals[name] += data_point
def _add_to_average(self, name, data_point):
self._averages[name] += data_point

def _account_for_maximum(self, name, data_point):
if data_point > self._maximums[name]:
Expand All @@ -129,8 +232,13 @@ class Summarizer(object):
def main():
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
'-f', '--benchmark-file', required=True,
help='The CSV output file from the benchmark script.')
'-f', '--benchmark-files', required=True, nargs='+',
help=(
'The CSV output file from the benchmark script. If you provide'
'more than one of these files, it will give you the average '
'across all of the files for each metric.'
)
)
args = parser.parse_args()
summarizer = Summarizer()
summarizer.process(args)
Expand Down

0 comments on commit f6417aa

Please sign in to comment.