From 09b8a86de9ce9c5d7e5ad509ae722dc53b4c2a8a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 May 2024 18:49:38 -0600 Subject: [PATCH 1/8] Add script to generate comparison of two benchmark runs --- scripts/generate-comparision.py | 109 ++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 scripts/generate-comparision.py diff --git a/scripts/generate-comparision.py b/scripts/generate-comparision.py new file mode 100644 index 0000000..7221485 --- /dev/null +++ b/scripts/generate-comparision.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +import matplotlib.pyplot as plt +import numpy as np +import sys + +def generate_per_query_chart(baseline, comparison): + results = [] + for query in range(1, 23): + a = np.mean(np.array(baseline[str(query)])) + b = np.mean(np.array(comparison[str(query)])) + if a > b: + speedup = a/b-1 + else: + speedup = -(1/(a/b)-1) + results.append(("q" + str(query), round(speedup*100, 0))) + + results = sorted(results, key=lambda x: -x[1]) + + queries, speedups = zip(*results) + + # Create figure and axis + fig, ax = plt.subplots(figsize=(10, 6)) + + # Create bar chart + bars = ax.bar(queries, speedups, color='skyblue') + + # Add text annotations + for bar, speedup in zip(bars, speedups): + yval = bar.get_height() + if yval >= 0: + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8, + color='blue') + else: + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8, + color='blue') + + # Add title and labels + ax.set_title('Comet Acceleration of TPC-H Queries') + ax.set_ylabel('Speedup (100% speedup = 2x faster)') + ax.set_xlabel('Query') + + # Customize the y-axis to handle both positive and negative values better + ax.axhline(0, color='black', linewidth=0.8) + ax.set_ylim(-400, 600) + + # Show grid for better readability + ax.yaxis.grid(True) + + # Save the plot as an image file + plt.savefig('comet_acceleration_tpch_queries.png', format='png') + + +def generate_summary(baseline, comparison): + baseline_total = 0 + comparison_total = 0 + for query in range(1, 23): + baseline_total += np.mean(np.array(baseline[str(query)])) + comparison_total += np.mean(np.array(comparison[str(query)])) + + # TODO make labels configurable + labels = ['Spark', 'Spark + Comet'] + times = [round(baseline_total,0), round(comparison_total,0)] + + # Create figure and axis + fig, ax = plt.subplots() + + # Create bar chart + bars = ax.bar(labels, times, color='skyblue') + + # Add text annotations + for bar in bars: + yval = bar.get_height() + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom') # va: vertical alignment + + # Add title and labels + #TODO make title configurable + ax.set_title('TPC-H Performance (scale factor 100)') + ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') + + plt.savefig('tpch_performance.png', format='png') + +def main(filename1: str, filename2: str): + with open(filename1) as f1: + baseline = json.load(f1) + with open(filename2) as f2: + comparison = json.load(f2) + generate_summary(baseline, comparison) + generate_per_query_chart(baseline, comparison) + +if __name__ == '__main__': + # TODO argparse + main(sys.argv[1], sys.argv[2]) From 5f37f5bd3672b6357cedb0ee894852072307fd9a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 May 2024 18:53:57 -0600 Subject: [PATCH 2/8] rename script --- scripts/generate-comparision.py | 109 -------------------------------- 1 file changed, 109 deletions(-) delete mode 100644 scripts/generate-comparision.py diff --git a/scripts/generate-comparision.py b/scripts/generate-comparision.py deleted file mode 100644 index 7221485..0000000 --- a/scripts/generate-comparision.py +++ /dev/null @@ -1,109 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import json -import matplotlib.pyplot as plt -import numpy as np -import sys - -def generate_per_query_chart(baseline, comparison): - results = [] - for query in range(1, 23): - a = np.mean(np.array(baseline[str(query)])) - b = np.mean(np.array(comparison[str(query)])) - if a > b: - speedup = a/b-1 - else: - speedup = -(1/(a/b)-1) - results.append(("q" + str(query), round(speedup*100, 0))) - - results = sorted(results, key=lambda x: -x[1]) - - queries, speedups = zip(*results) - - # Create figure and axis - fig, ax = plt.subplots(figsize=(10, 6)) - - # Create bar chart - bars = ax.bar(queries, speedups, color='skyblue') - - # Add text annotations - for bar, speedup in zip(bars, speedups): - yval = bar.get_height() - if yval >= 0: - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8, - color='blue') - else: - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8, - color='blue') - - # Add title and labels - ax.set_title('Comet Acceleration of TPC-H Queries') - ax.set_ylabel('Speedup (100% speedup = 2x faster)') - ax.set_xlabel('Query') - - # Customize the y-axis to handle both positive and negative values better - ax.axhline(0, color='black', linewidth=0.8) - ax.set_ylim(-400, 600) - - # Show grid for better readability - ax.yaxis.grid(True) - - # Save the plot as an image file - plt.savefig('comet_acceleration_tpch_queries.png', format='png') - - -def generate_summary(baseline, comparison): - baseline_total = 0 - comparison_total = 0 - for query in range(1, 23): - baseline_total += np.mean(np.array(baseline[str(query)])) - comparison_total += np.mean(np.array(comparison[str(query)])) - - # TODO make labels configurable - labels = ['Spark', 'Spark + Comet'] - times = [round(baseline_total,0), round(comparison_total,0)] - - # Create figure and axis - fig, ax = plt.subplots() - - # Create bar chart - bars = ax.bar(labels, times, color='skyblue') - - # Add text annotations - for bar in bars: - yval = bar.get_height() - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom') # va: vertical alignment - - # Add title and labels - #TODO make title configurable - ax.set_title('TPC-H Performance (scale factor 100)') - ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') - - plt.savefig('tpch_performance.png', format='png') - -def main(filename1: str, filename2: str): - with open(filename1) as f1: - baseline = json.load(f1) - with open(filename2) as f2: - comparison = json.load(f2) - generate_summary(baseline, comparison) - generate_per_query_chart(baseline, comparison) - -if __name__ == '__main__': - # TODO argparse - main(sys.argv[1], sys.argv[2]) From f8cb406a036cdd69ec9657b9442b632d54ea8378 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 May 2024 18:54:21 -0600 Subject: [PATCH 3/8] rename script --- scripts/generate-comparison.py | 109 +++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 scripts/generate-comparison.py diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py new file mode 100644 index 0000000..7221485 --- /dev/null +++ b/scripts/generate-comparison.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +import matplotlib.pyplot as plt +import numpy as np +import sys + +def generate_per_query_chart(baseline, comparison): + results = [] + for query in range(1, 23): + a = np.mean(np.array(baseline[str(query)])) + b = np.mean(np.array(comparison[str(query)])) + if a > b: + speedup = a/b-1 + else: + speedup = -(1/(a/b)-1) + results.append(("q" + str(query), round(speedup*100, 0))) + + results = sorted(results, key=lambda x: -x[1]) + + queries, speedups = zip(*results) + + # Create figure and axis + fig, ax = plt.subplots(figsize=(10, 6)) + + # Create bar chart + bars = ax.bar(queries, speedups, color='skyblue') + + # Add text annotations + for bar, speedup in zip(bars, speedups): + yval = bar.get_height() + if yval >= 0: + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8, + color='blue') + else: + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8, + color='blue') + + # Add title and labels + ax.set_title('Comet Acceleration of TPC-H Queries') + ax.set_ylabel('Speedup (100% speedup = 2x faster)') + ax.set_xlabel('Query') + + # Customize the y-axis to handle both positive and negative values better + ax.axhline(0, color='black', linewidth=0.8) + ax.set_ylim(-400, 600) + + # Show grid for better readability + ax.yaxis.grid(True) + + # Save the plot as an image file + plt.savefig('comet_acceleration_tpch_queries.png', format='png') + + +def generate_summary(baseline, comparison): + baseline_total = 0 + comparison_total = 0 + for query in range(1, 23): + baseline_total += np.mean(np.array(baseline[str(query)])) + comparison_total += np.mean(np.array(comparison[str(query)])) + + # TODO make labels configurable + labels = ['Spark', 'Spark + Comet'] + times = [round(baseline_total,0), round(comparison_total,0)] + + # Create figure and axis + fig, ax = plt.subplots() + + # Create bar chart + bars = ax.bar(labels, times, color='skyblue') + + # Add text annotations + for bar in bars: + yval = bar.get_height() + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom') # va: vertical alignment + + # Add title and labels + #TODO make title configurable + ax.set_title('TPC-H Performance (scale factor 100)') + ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') + + plt.savefig('tpch_performance.png', format='png') + +def main(filename1: str, filename2: str): + with open(filename1) as f1: + baseline = json.load(f1) + with open(filename2) as f2: + comparison = json.load(f2) + generate_summary(baseline, comparison) + generate_per_query_chart(baseline, comparison) + +if __name__ == '__main__': + # TODO argparse + main(sys.argv[1], sys.argv[2]) From 1bf24cdda04ebf80dff0599e7651df748fd83130 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 29 May 2024 19:00:41 -0600 Subject: [PATCH 4/8] updates --- scripts/generate-comparison.py | 43 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py index 7221485..3447a5c 100644 --- a/scripts/generate-comparison.py +++ b/scripts/generate-comparison.py @@ -20,11 +20,14 @@ import numpy as np import sys +def geomean(data): + return np.prod(data) ** (1 / len(data)) + def generate_per_query_chart(baseline, comparison): results = [] for query in range(1, 23): - a = np.mean(np.array(baseline[str(query)])) - b = np.mean(np.array(comparison[str(query)])) + a = np.median(np.array(baseline[str(query)])) + b = np.median(np.array(comparison[str(query)])) if a > b: speedup = a/b-1 else: @@ -45,11 +48,11 @@ def generate_per_query_chart(baseline, comparison): for bar, speedup in zip(bars, speedups): yval = bar.get_height() if yval >= 0: - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8, - color='blue') + ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+20), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8, + color='blue', rotation=90) else: - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8, - color='blue') + ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8, + color='blue', rotation=90) # Add title and labels ax.set_title('Comet Acceleration of TPC-H Queries') @@ -58,29 +61,36 @@ def generate_per_query_chart(baseline, comparison): # Customize the y-axis to handle both positive and negative values better ax.axhline(0, color='black', linewidth=0.8) - ax.set_ylim(-400, 600) + min_value = (min(speedups) // 100) * 100 + max_value = ((max(speedups) // 100) + 1) * 100 + ax.set_ylim(min_value, max_value) # Show grid for better readability ax.yaxis.grid(True) # Save the plot as an image file - plt.savefig('comet_acceleration_tpch_queries.png', format='png') + plt.savefig('tpch_queries.png', format='png') def generate_summary(baseline, comparison): baseline_total = 0 comparison_total = 0 for query in range(1, 23): - baseline_total += np.mean(np.array(baseline[str(query)])) - comparison_total += np.mean(np.array(comparison[str(query)])) + baseline_total += np.median(np.array(baseline[str(query)])) + comparison_total += np.median(np.array(comparison[str(query)])) + + # Create figure and axis + fig, ax = plt.subplots() + + # Add title and labels + #TODO make title configurable + ax.set_title('TPC-H Performance (scale factor 100)') + ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') # TODO make labels configurable labels = ['Spark', 'Spark + Comet'] times = [round(baseline_total,0), round(comparison_total,0)] - # Create figure and axis - fig, ax = plt.subplots() - # Create bar chart bars = ax.bar(labels, times, color='skyblue') @@ -89,12 +99,7 @@ def generate_summary(baseline, comparison): yval = bar.get_height() ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom') # va: vertical alignment - # Add title and labels - #TODO make title configurable - ax.set_title('TPC-H Performance (scale factor 100)') - ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') - - plt.savefig('tpch_performance.png', format='png') + plt.savefig('tpch_allqueries.png', format='png') def main(filename1: str, filename2: str): with open(filename1) as f1: From bf9bb1746da92601a01023c5ca71ca5434050b0c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 30 May 2024 08:00:09 -0600 Subject: [PATCH 5/8] add another chart style --- scripts/generate-comparison.py | 38 +++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py index 3447a5c..8406a09 100644 --- a/scripts/generate-comparison.py +++ b/scripts/generate-comparison.py @@ -23,7 +23,7 @@ def geomean(data): return np.prod(data) ** (1 / len(data)) -def generate_per_query_chart(baseline, comparison): +def generate_query_speedup_chart(baseline, comparison): results = [] for query in range(1, 23): a = np.median(np.array(baseline[str(query)])) @@ -69,9 +69,40 @@ def generate_per_query_chart(baseline, comparison): ax.yaxis.grid(True) # Save the plot as an image file - plt.savefig('tpch_queries.png', format='png') + plt.savefig('tpch_queries_speedup.png', format='png') +def generate_query_comparison_chart(baseline, comparison): + queries = [] + a = [] + b = [] + for query in range(1, 23): + queries.append("q" + str(query)) + a.append(np.median(np.array(baseline[str(query)]))) + b.append(np.median(np.array(comparison[str(query)]))) + + # Define the width of the bars + bar_width = 0.35 + + # Define the positions of the bars on the x-axis + index = np.arange(len(queries)) + + # Create a bar chart + fig, ax = plt.subplots(figsize=(10, 6)) + bar1 = ax.bar(index, a, bar_width, label='Spark') + bar2 = ax.bar(index + bar_width, b, bar_width, label='Spark + Comet') + + # Add labels, title, and legend + ax.set_xlabel('Queries') + ax.set_ylabel('Run Time') + ax.set_title('TPC-H Queries') + ax.set_xticks(index + bar_width / 2) + ax.set_xticklabels(queries) + ax.legend() + + # Save the plot as an image file + plt.savefig('tpch_queries_compare.png', format='png') + def generate_summary(baseline, comparison): baseline_total = 0 comparison_total = 0 @@ -107,7 +138,8 @@ def main(filename1: str, filename2: str): with open(filename2) as f2: comparison = json.load(f2) generate_summary(baseline, comparison) - generate_per_query_chart(baseline, comparison) + generate_query_comparison_chart(baseline, comparison) + generate_query_speedup_chart(baseline, comparison) if __name__ == '__main__': # TODO argparse From 5a8b7f61421595ef58150914b5bf6b8f1c29bc73 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 30 May 2024 09:49:22 -0600 Subject: [PATCH 6/8] improve script --- scripts/generate-comparison.py | 45 +++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py index 8406a09..c8cf5d3 100644 --- a/scripts/generate-comparison.py +++ b/scripts/generate-comparison.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +import argparse import json import matplotlib.pyplot as plt import numpy as np @@ -23,7 +24,7 @@ def geomean(data): return np.prod(data) ** (1 / len(data)) -def generate_query_speedup_chart(baseline, comparison): +def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str): results = [] for query in range(1, 23): a = np.median(np.array(baseline[str(query)])) @@ -48,21 +49,21 @@ def generate_query_speedup_chart(baseline, comparison): for bar, speedup in zip(bars, speedups): yval = bar.get_height() if yval >= 0: - ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+20), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8, + ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8, color='blue', rotation=90) else: ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8, color='blue', rotation=90) # Add title and labels - ax.set_title('Comet Acceleration of TPC-H Queries') + ax.set_title(label2 + " speedup over " + label1 + " (" + benchmark + ")") ax.set_ylabel('Speedup (100% speedup = 2x faster)') ax.set_xlabel('Query') # Customize the y-axis to handle both positive and negative values better ax.axhline(0, color='black', linewidth=0.8) min_value = (min(speedups) // 100) * 100 - max_value = ((max(speedups) // 100) + 1) * 100 + max_value = ((max(speedups) // 100) + 1) * 100 + 50 ax.set_ylim(min_value, max_value) # Show grid for better readability @@ -72,7 +73,7 @@ def generate_query_speedup_chart(baseline, comparison): plt.savefig('tpch_queries_speedup.png', format='png') -def generate_query_comparison_chart(baseline, comparison): +def generate_query_comparison_chart(baseline, comparison, label1: str, label2: str, benchmark: str): queries = [] a = [] b = [] @@ -89,13 +90,13 @@ def generate_query_comparison_chart(baseline, comparison): # Create a bar chart fig, ax = plt.subplots(figsize=(10, 6)) - bar1 = ax.bar(index, a, bar_width, label='Spark') - bar2 = ax.bar(index + bar_width, b, bar_width, label='Spark + Comet') + bar1 = ax.bar(index, a, bar_width, label=label1) + bar2 = ax.bar(index + bar_width, b, bar_width, label=label2) # Add labels, title, and legend + ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")") ax.set_xlabel('Queries') - ax.set_ylabel('Run Time') - ax.set_title('TPC-H Queries') + ax.set_ylabel('Query Time (seconds)') ax.set_xticks(index + bar_width / 2) ax.set_xticklabels(queries) ax.legend() @@ -103,7 +104,7 @@ def generate_query_comparison_chart(baseline, comparison): # Save the plot as an image file plt.savefig('tpch_queries_compare.png', format='png') -def generate_summary(baseline, comparison): +def generate_summary(baseline, comparison, label1: str, label2: str, benchmark: str): baseline_total = 0 comparison_total = 0 for query in range(1, 23): @@ -114,12 +115,10 @@ def generate_summary(baseline, comparison): fig, ax = plt.subplots() # Add title and labels - #TODO make title configurable - ax.set_title('TPC-H Performance (scale factor 100)') + ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")") ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') - # TODO make labels configurable - labels = ['Spark', 'Spark + Comet'] + labels = [label1, label2] times = [round(baseline_total,0), round(comparison_total,0)] # Create bar chart @@ -132,15 +131,21 @@ def generate_summary(baseline, comparison): plt.savefig('tpch_allqueries.png', format='png') -def main(filename1: str, filename2: str): +def main(filename1: str, filename2: str, label1: str, label2: str, benchmark: str): with open(filename1) as f1: baseline = json.load(f1) with open(filename2) as f2: comparison = json.load(f2) - generate_summary(baseline, comparison) - generate_query_comparison_chart(baseline, comparison) - generate_query_speedup_chart(baseline, comparison) + generate_summary(baseline, comparison, label1, label2, benchmark) + generate_query_comparison_chart(baseline, comparison, label1, label2, benchmark) + generate_query_speedup_chart(baseline, comparison, label1, label2, benchmark) if __name__ == '__main__': - # TODO argparse - main(sys.argv[1], sys.argv[2]) + argparse = argparse.ArgumentParser(description='Generate comparison') + argparse.add_argument('filename1', type=str, help='First file') + argparse.add_argument('filename2', type=str, help='Second file') + argparse.add_argument('label1', type=str, help='First label') + argparse.add_argument('label2', type=str, help='Second label') + argparse.add_argument('benchmark', type=str, help='Benchmark description') + args = argparse.parse_args() + main(args.filename1, args.filename2, args.label1, args.label2, args.benchmark) From fdb498a1c015e0cebc611076ca6fe4c6589148d8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 30 May 2024 11:15:25 -0600 Subject: [PATCH 7/8] support more than 2 files --- scripts/generate-comparison.py | 67 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py index c8cf5d3..b13904e 100644 --- a/scripts/generate-comparison.py +++ b/scripts/generate-comparison.py @@ -73,28 +73,29 @@ def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, plt.savefig('tpch_queries_speedup.png', format='png') -def generate_query_comparison_chart(baseline, comparison, label1: str, label2: str, benchmark: str): +def generate_query_comparison_chart(results, labels, benchmark: str): queries = [] - a = [] - b = [] + benches = [] + for _ in results: + benches.append([]) for query in range(1, 23): queries.append("q" + str(query)) - a.append(np.median(np.array(baseline[str(query)]))) - b.append(np.median(np.array(comparison[str(query)]))) + for i in range(0, len(results)): + benches[i].append(np.median(np.array(results[i][str(query)]))) # Define the width of the bars - bar_width = 0.35 + bar_width = 0.3 # Define the positions of the bars on the x-axis - index = np.arange(len(queries)) + index = np.arange(len(queries)) * 1.5 # Create a bar chart - fig, ax = plt.subplots(figsize=(10, 6)) - bar1 = ax.bar(index, a, bar_width, label=label1) - bar2 = ax.bar(index + bar_width, b, bar_width, label=label2) + fig, ax = plt.subplots(figsize=(15, 6)) + for i in range(0, len(results)): + bar = ax.bar(index + i * bar_width, benches[i], bar_width, label=labels[i]) # Add labels, title, and legend - ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")") + ax.set_title(benchmark) ax.set_xlabel('Queries') ax.set_ylabel('Query Time (seconds)') ax.set_xticks(index + bar_width / 2) @@ -104,22 +105,23 @@ def generate_query_comparison_chart(baseline, comparison, label1: str, label2: s # Save the plot as an image file plt.savefig('tpch_queries_compare.png', format='png') -def generate_summary(baseline, comparison, label1: str, label2: str, benchmark: str): - baseline_total = 0 - comparison_total = 0 +def generate_summary(results, labels, benchmark: str): + timings = [] + for _ in results: + timings.append(0) + for query in range(1, 23): - baseline_total += np.median(np.array(baseline[str(query)])) - comparison_total += np.median(np.array(comparison[str(query)])) + for i in range(0, len(results)): + timings[i] += np.median(np.array(results[i][str(query)])) # Create figure and axis fig, ax = plt.subplots() # Add title and labels - ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")") + ax.set_title(benchmark) ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)') - labels = [label1, label2] - times = [round(baseline_total,0), round(comparison_total,0)] + times = [round(x,0) for x in timings] # Create bar chart bars = ax.bar(labels, times, color='skyblue') @@ -131,21 +133,20 @@ def generate_summary(baseline, comparison, label1: str, label2: str, benchmark: plt.savefig('tpch_allqueries.png', format='png') -def main(filename1: str, filename2: str, label1: str, label2: str, benchmark: str): - with open(filename1) as f1: - baseline = json.load(f1) - with open(filename2) as f2: - comparison = json.load(f2) - generate_summary(baseline, comparison, label1, label2, benchmark) - generate_query_comparison_chart(baseline, comparison, label1, label2, benchmark) - generate_query_speedup_chart(baseline, comparison, label1, label2, benchmark) +def main(files, labels, benchmark: str): + results = [] + for filename in files: + with open(filename) as f: + results.append(json.load(f)) + generate_summary(results, labels, benchmark) + generate_query_comparison_chart(results, labels, benchmark) + if len(files) == 2: + generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark) if __name__ == '__main__': argparse = argparse.ArgumentParser(description='Generate comparison') - argparse.add_argument('filename1', type=str, help='First file') - argparse.add_argument('filename2', type=str, help='Second file') - argparse.add_argument('label1', type=str, help='First label') - argparse.add_argument('label2', type=str, help='Second label') - argparse.add_argument('benchmark', type=str, help='Benchmark description') + argparse.add_argument('filenames', nargs='+', type=str, help='JSON result files') + argparse.add_argument('--labels', nargs='+', type=str, help='Labels') + argparse.add_argument('--benchmark', type=str, help='Benchmark description') args = argparse.parse_args() - main(args.filename1, args.filename2, args.label1, args.label2, args.benchmark) + main(args.filenames, args.labels, args.benchmark) From 45bd311e1b77bf4a6740c1d82ce88c619ad320e9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 30 May 2024 19:10:41 -0600 Subject: [PATCH 8/8] update README --- .gitignore | 4 +++- README.md | 31 ++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 2a2f6e2..f4e2059 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ +venv .idea -*.json \ No newline at end of file +*.json +*.png diff --git a/README.md b/README.md index b772f3e..e405e6c 100644 --- a/README.md +++ b/README.md @@ -48,17 +48,12 @@ prohibited by the TPC. ## Data Generation -See the benchmark-specific instructions for generating the CSV data for [TPC-H](tpch) and [TPC-DS](tpcds). - -## Converting CSV data to Parquet - -Although it is valid to run benchmarks against CSV data, this does not really represent how most of the world is -running OLAP queries, especially when dealing with large datasets. When benchmarking DataFusion, we typically want -to be querying Parquet data, so first we must convert the generated datasets to Parquet. Also, we typically do not -want a single file per table, so we also need to repartition the data. - -We plan on adding Python scripts in this repository to perform this conversion and repartitioning. Until then you may -want to write your own scripts using DataFusion or Spark. Another option is to use [tpc-tools](https://crates.io/crates/tpctools). +See the benchmark-specific instructions for generating the CSV data for [TPC-H](tpch) and [TPC-DS](tpcds) and for +converting that data to Parquet format. Although it is valid to run benchmarks against CSV data, this does not really +represent how most of the world is running OLAP queries, especially when dealing with large datasets. When benchmarking +DataFusion and its subprojects, we typically want to be querying Parquet data. Also, we typically do not +want a single file per table, so we also need to repartition the data. The provided scripts take care of this conversion +and repartitioning. ## Running the Benchmarks with DataFusion @@ -67,10 +62,20 @@ Scripts are available for the following DataFusion projects: - [DataFusion Python](./runners/datafusion-python) - [DataFusion Comet](./runners/datafusion-comet) +These benchmarking scripts produce JSON files containing query timings. + ## Comparing Results -Coming soon. The plan is to add some Python scripts for comparing results from different runs and producing charts -that we can use in blog posts. +The Python script [scripts/generate-comparison.py](scripts/generate-comparison.py) can be used to produce charts +comparing results from different benchmark runs. + +For example: + +```shell +python scripts/generate-comparison.py file1.json file2.json --labels "Spark" "Comet" --benchmark "TPC-H 100GB" +``` + +This will create image files in the current directory in PNG format. ## Legal Notices