From 09b8a86de9ce9c5d7e5ad509ae722dc53b4c2a8a Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 29 May 2024 18:49:38 -0600
Subject: [PATCH 1/8] Add script to generate comparison of two benchmark runs

---
 scripts/generate-comparision.py | 109 ++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 scripts/generate-comparision.py

diff --git a/scripts/generate-comparision.py b/scripts/generate-comparision.py
new file mode 100644
index 0000000..7221485
--- /dev/null
+++ b/scripts/generate-comparision.py
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import sys
+
+def generate_per_query_chart(baseline, comparison):
+    results = []
+    for query in range(1, 23):
+        a = np.mean(np.array(baseline[str(query)]))
+        b = np.mean(np.array(comparison[str(query)]))
+        if a > b:
+            speedup = a/b-1
+        else:
+            speedup = -(1/(a/b)-1)
+        results.append(("q" + str(query), round(speedup*100, 0)))
+
+    results = sorted(results, key=lambda x: -x[1])
+
+    queries, speedups = zip(*results)
+
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(10, 6))
+
+    # Create bar chart
+    bars = ax.bar(queries, speedups, color='skyblue')
+
+    # Add text annotations
+    for bar, speedup in zip(bars, speedups):
+        yval = bar.get_height()
+        if yval >= 0:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8,
+                    color='blue')
+        else:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8,
+                    color='blue')
+
+    # Add title and labels
+    ax.set_title('Comet Acceleration of TPC-H Queries')
+    ax.set_ylabel('Speedup (100% speedup = 2x faster)')
+    ax.set_xlabel('Query')
+
+    # Customize the y-axis to handle both positive and negative values better
+    ax.axhline(0, color='black', linewidth=0.8)
+    ax.set_ylim(-400, 600)
+
+    # Show grid for better readability
+    ax.yaxis.grid(True)
+
+    # Save the plot as an image file
+    plt.savefig('comet_acceleration_tpch_queries.png', format='png')
+
+
+def generate_summary(baseline, comparison):
+    baseline_total = 0
+    comparison_total = 0
+    for query in range(1, 23):
+        baseline_total += np.mean(np.array(baseline[str(query)]))
+        comparison_total += np.mean(np.array(comparison[str(query)]))
+
+    # TODO make labels configurable
+    labels = ['Spark', 'Spark + Comet']
+    times = [round(baseline_total,0), round(comparison_total,0)]
+
+    # Create figure and axis
+    fig, ax = plt.subplots()
+
+    # Create bar chart
+    bars = ax.bar(labels, times, color='skyblue')
+
+    # Add text annotations
+    for bar in bars:
+        yval = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
+
+    # Add title and labels
+    #TODO make title configurable
+    ax.set_title('TPC-H Performance (scale factor 100)')
+    ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
+
+    plt.savefig('tpch_performance.png', format='png')
+
+def main(filename1: str, filename2: str):
+    with open(filename1) as f1:
+        baseline = json.load(f1)
+    with open(filename2) as f2:
+        comparison = json.load(f2)
+    generate_summary(baseline, comparison)
+    generate_per_query_chart(baseline, comparison)
+
+if __name__ == '__main__':
+    # TODO argparse
+    main(sys.argv[1], sys.argv[2])

From 5f37f5bd3672b6357cedb0ee894852072307fd9a Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 29 May 2024 18:53:57 -0600
Subject: [PATCH 2/8] rename script

---
 scripts/generate-comparision.py | 109 --------------------------------
 1 file changed, 109 deletions(-)
 delete mode 100644 scripts/generate-comparision.py

diff --git a/scripts/generate-comparision.py b/scripts/generate-comparision.py
deleted file mode 100644
index 7221485..0000000
--- a/scripts/generate-comparision.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import json
-import matplotlib.pyplot as plt
-import numpy as np
-import sys
-
-def generate_per_query_chart(baseline, comparison):
-    results = []
-    for query in range(1, 23):
-        a = np.mean(np.array(baseline[str(query)]))
-        b = np.mean(np.array(comparison[str(query)]))
-        if a > b:
-            speedup = a/b-1
-        else:
-            speedup = -(1/(a/b)-1)
-        results.append(("q" + str(query), round(speedup*100, 0)))
-
-    results = sorted(results, key=lambda x: -x[1])
-
-    queries, speedups = zip(*results)
-
-    # Create figure and axis
-    fig, ax = plt.subplots(figsize=(10, 6))
-
-    # Create bar chart
-    bars = ax.bar(queries, speedups, color='skyblue')
-
-    # Add text annotations
-    for bar, speedup in zip(bars, speedups):
-        yval = bar.get_height()
-        if yval >= 0:
-            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8,
-                    color='blue')
-        else:
-            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8,
-                    color='blue')
-
-    # Add title and labels
-    ax.set_title('Comet Acceleration of TPC-H Queries')
-    ax.set_ylabel('Speedup (100% speedup = 2x faster)')
-    ax.set_xlabel('Query')
-
-    # Customize the y-axis to handle both positive and negative values better
-    ax.axhline(0, color='black', linewidth=0.8)
-    ax.set_ylim(-400, 600)
-
-    # Show grid for better readability
-    ax.yaxis.grid(True)
-
-    # Save the plot as an image file
-    plt.savefig('comet_acceleration_tpch_queries.png', format='png')
-
-
-def generate_summary(baseline, comparison):
-    baseline_total = 0
-    comparison_total = 0
-    for query in range(1, 23):
-        baseline_total += np.mean(np.array(baseline[str(query)]))
-        comparison_total += np.mean(np.array(comparison[str(query)]))
-
-    # TODO make labels configurable
-    labels = ['Spark', 'Spark + Comet']
-    times = [round(baseline_total,0), round(comparison_total,0)]
-
-    # Create figure and axis
-    fig, ax = plt.subplots()
-
-    # Create bar chart
-    bars = ax.bar(labels, times, color='skyblue')
-
-    # Add text annotations
-    for bar in bars:
-        yval = bar.get_height()
-        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
-
-    # Add title and labels
-    #TODO make title configurable
-    ax.set_title('TPC-H Performance (scale factor 100)')
-    ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
-
-    plt.savefig('tpch_performance.png', format='png')
-
-def main(filename1: str, filename2: str):
-    with open(filename1) as f1:
-        baseline = json.load(f1)
-    with open(filename2) as f2:
-        comparison = json.load(f2)
-    generate_summary(baseline, comparison)
-    generate_per_query_chart(baseline, comparison)
-
-if __name__ == '__main__':
-    # TODO argparse
-    main(sys.argv[1], sys.argv[2])

From f8cb406a036cdd69ec9657b9442b632d54ea8378 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 29 May 2024 18:54:21 -0600
Subject: [PATCH 3/8] rename script

---
 scripts/generate-comparison.py | 109 +++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 scripts/generate-comparison.py

diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py
new file mode 100644
index 0000000..7221485
--- /dev/null
+++ b/scripts/generate-comparison.py
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import sys
+
+def generate_per_query_chart(baseline, comparison):
+    results = []
+    for query in range(1, 23):
+        a = np.mean(np.array(baseline[str(query)]))
+        b = np.mean(np.array(comparison[str(query)]))
+        if a > b:
+            speedup = a/b-1
+        else:
+            speedup = -(1/(a/b)-1)
+        results.append(("q" + str(query), round(speedup*100, 0)))
+
+    results = sorted(results, key=lambda x: -x[1])
+
+    queries, speedups = zip(*results)
+
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(10, 6))
+
+    # Create bar chart
+    bars = ax.bar(queries, speedups, color='skyblue')
+
+    # Add text annotations
+    for bar, speedup in zip(bars, speedups):
+        yval = bar.get_height()
+        if yval >= 0:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8,
+                    color='blue')
+        else:
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8,
+                    color='blue')
+
+    # Add title and labels
+    ax.set_title('Comet Acceleration of TPC-H Queries')
+    ax.set_ylabel('Speedup (100% speedup = 2x faster)')
+    ax.set_xlabel('Query')
+
+    # Customize the y-axis to handle both positive and negative values better
+    ax.axhline(0, color='black', linewidth=0.8)
+    ax.set_ylim(-400, 600)
+
+    # Show grid for better readability
+    ax.yaxis.grid(True)
+
+    # Save the plot as an image file
+    plt.savefig('comet_acceleration_tpch_queries.png', format='png')
+
+
+def generate_summary(baseline, comparison):
+    baseline_total = 0
+    comparison_total = 0
+    for query in range(1, 23):
+        baseline_total += np.mean(np.array(baseline[str(query)]))
+        comparison_total += np.mean(np.array(comparison[str(query)]))
+
+    # TODO make labels configurable
+    labels = ['Spark', 'Spark + Comet']
+    times = [round(baseline_total,0), round(comparison_total,0)]
+
+    # Create figure and axis
+    fig, ax = plt.subplots()
+
+    # Create bar chart
+    bars = ax.bar(labels, times, color='skyblue')
+
+    # Add text annotations
+    for bar in bars:
+        yval = bar.get_height()
+        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
+
+    # Add title and labels
+    #TODO make title configurable
+    ax.set_title('TPC-H Performance (scale factor 100)')
+    ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
+
+    plt.savefig('tpch_performance.png', format='png')
+
+def main(filename1: str, filename2: str):
+    with open(filename1) as f1:
+        baseline = json.load(f1)
+    with open(filename2) as f2:
+        comparison = json.load(f2)
+    generate_summary(baseline, comparison)
+    generate_per_query_chart(baseline, comparison)
+
+if __name__ == '__main__':
+    # TODO argparse
+    main(sys.argv[1], sys.argv[2])

From 1bf24cdda04ebf80dff0599e7651df748fd83130 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Wed, 29 May 2024 19:00:41 -0600
Subject: [PATCH 4/8] updates

---
 scripts/generate-comparison.py | 43 +++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py
index 7221485..3447a5c 100644
--- a/scripts/generate-comparison.py
+++ b/scripts/generate-comparison.py
@@ -20,11 +20,14 @@
 import numpy as np
 import sys
 
+def geomean(data):
+    return np.prod(data) ** (1 / len(data))
+
 def generate_per_query_chart(baseline, comparison):
     results = []
     for query in range(1, 23):
-        a = np.mean(np.array(baseline[str(query)]))
-        b = np.mean(np.array(comparison[str(query)]))
+        a = np.median(np.array(baseline[str(query)]))
+        b = np.median(np.array(comparison[str(query)]))
         if a > b:
             speedup = a/b-1
         else:
@@ -45,11 +48,11 @@ def generate_per_query_chart(baseline, comparison):
     for bar, speedup in zip(bars, speedups):
         yval = bar.get_height()
         if yval >= 0:
-            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='bottom', ha='center', fontsize=8,
-                    color='blue')
+            ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+20), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
+                    color='blue', rotation=90)
         else:
-            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}%', va='top', ha='center', fontsize=8,
-                    color='blue')
+            ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
+                    color='blue', rotation=90)
 
     # Add title and labels
     ax.set_title('Comet Acceleration of TPC-H Queries')
@@ -58,29 +61,36 @@ def generate_per_query_chart(baseline, comparison):
 
     # Customize the y-axis to handle both positive and negative values better
     ax.axhline(0, color='black', linewidth=0.8)
-    ax.set_ylim(-400, 600)
+    min_value = (min(speedups) // 100) * 100
+    max_value = ((max(speedups) // 100) + 1) * 100
+    ax.set_ylim(min_value, max_value)
 
     # Show grid for better readability
     ax.yaxis.grid(True)
 
     # Save the plot as an image file
-    plt.savefig('comet_acceleration_tpch_queries.png', format='png')
+    plt.savefig('tpch_queries.png', format='png')
 
 
 def generate_summary(baseline, comparison):
     baseline_total = 0
     comparison_total = 0
     for query in range(1, 23):
-        baseline_total += np.mean(np.array(baseline[str(query)]))
-        comparison_total += np.mean(np.array(comparison[str(query)]))
+        baseline_total += np.median(np.array(baseline[str(query)]))
+        comparison_total += np.median(np.array(comparison[str(query)]))
+
+    # Create figure and axis
+    fig, ax = plt.subplots()
+
+    # Add title and labels
+    #TODO make title configurable
+    ax.set_title('TPC-H Performance (scale factor 100)')
+    ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
 
     # TODO make labels configurable
     labels = ['Spark', 'Spark + Comet']
     times = [round(baseline_total,0), round(comparison_total,0)]
 
-    # Create figure and axis
-    fig, ax = plt.subplots()
-
     # Create bar chart
     bars = ax.bar(labels, times, color='skyblue')
 
@@ -89,12 +99,7 @@ def generate_summary(baseline, comparison):
         yval = bar.get_height()
         ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
 
-    # Add title and labels
-    #TODO make title configurable
-    ax.set_title('TPC-H Performance (scale factor 100)')
-    ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
-
-    plt.savefig('tpch_performance.png', format='png')
+    plt.savefig('tpch_allqueries.png', format='png')
 
 def main(filename1: str, filename2: str):
     with open(filename1) as f1:

From bf9bb1746da92601a01023c5ca71ca5434050b0c Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 30 May 2024 08:00:09 -0600
Subject: [PATCH 5/8] add another chart style

---
 scripts/generate-comparison.py | 38 +++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py
index 3447a5c..8406a09 100644
--- a/scripts/generate-comparison.py
+++ b/scripts/generate-comparison.py
@@ -23,7 +23,7 @@
 def geomean(data):
     return np.prod(data) ** (1 / len(data))
 
-def generate_per_query_chart(baseline, comparison):
+def generate_query_speedup_chart(baseline, comparison):
     results = []
     for query in range(1, 23):
         a = np.median(np.array(baseline[str(query)]))
@@ -69,9 +69,40 @@ def generate_per_query_chart(baseline, comparison):
     ax.yaxis.grid(True)
 
     # Save the plot as an image file
-    plt.savefig('tpch_queries.png', format='png')
+    plt.savefig('tpch_queries_speedup.png', format='png')
 
 
+def generate_query_comparison_chart(baseline, comparison):
+    queries = []
+    a = []
+    b = []
+    for query in range(1, 23):
+        queries.append("q" + str(query))
+        a.append(np.median(np.array(baseline[str(query)])))
+        b.append(np.median(np.array(comparison[str(query)])))
+
+    # Define the width of the bars
+    bar_width = 0.35
+
+    # Define the positions of the bars on the x-axis
+    index = np.arange(len(queries))
+
+    # Create a bar chart
+    fig, ax = plt.subplots(figsize=(10, 6))
+    bar1 = ax.bar(index, a, bar_width, label='Spark')
+    bar2 = ax.bar(index + bar_width, b, bar_width, label='Spark + Comet')
+
+    # Add labels, title, and legend
+    ax.set_xlabel('Queries')
+    ax.set_ylabel('Run Time')
+    ax.set_title('TPC-H Queries')
+    ax.set_xticks(index + bar_width / 2)
+    ax.set_xticklabels(queries)
+    ax.legend()
+
+    # Save the plot as an image file
+    plt.savefig('tpch_queries_compare.png', format='png')
+
 def generate_summary(baseline, comparison):
     baseline_total = 0
     comparison_total = 0
@@ -107,7 +138,8 @@ def main(filename1: str, filename2: str):
     with open(filename2) as f2:
         comparison = json.load(f2)
     generate_summary(baseline, comparison)
-    generate_per_query_chart(baseline, comparison)
+    generate_query_comparison_chart(baseline, comparison)
+    generate_query_speedup_chart(baseline, comparison)
 
 if __name__ == '__main__':
     # TODO argparse

From 5a8b7f61421595ef58150914b5bf6b8f1c29bc73 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 30 May 2024 09:49:22 -0600
Subject: [PATCH 6/8] improve script

---
 scripts/generate-comparison.py | 45 +++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py
index 8406a09..c8cf5d3 100644
--- a/scripts/generate-comparison.py
+++ b/scripts/generate-comparison.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import argparse
 import json
 import matplotlib.pyplot as plt
 import numpy as np
@@ -23,7 +24,7 @@
 def geomean(data):
     return np.prod(data) ** (1 / len(data))
 
-def generate_query_speedup_chart(baseline, comparison):
+def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str):
     results = []
     for query in range(1, 23):
         a = np.median(np.array(baseline[str(query)]))
@@ -48,21 +49,21 @@ def generate_query_speedup_chart(baseline, comparison):
     for bar, speedup in zip(bars, speedups):
         yval = bar.get_height()
         if yval >= 0:
-            ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+20), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
+            ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
                     color='blue', rotation=90)
         else:
             ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
                     color='blue', rotation=90)
 
     # Add title and labels
-    ax.set_title('Comet Acceleration of TPC-H Queries')
+    ax.set_title(label2 + " speedup over " + label1 + " (" + benchmark + ")")
     ax.set_ylabel('Speedup (100% speedup = 2x faster)')
     ax.set_xlabel('Query')
 
     # Customize the y-axis to handle both positive and negative values better
     ax.axhline(0, color='black', linewidth=0.8)
     min_value = (min(speedups) // 100) * 100
-    max_value = ((max(speedups) // 100) + 1) * 100
+    max_value = ((max(speedups) // 100) + 1) * 100 + 50
     ax.set_ylim(min_value, max_value)
 
     # Show grid for better readability
@@ -72,7 +73,7 @@ def generate_query_speedup_chart(baseline, comparison):
     plt.savefig('tpch_queries_speedup.png', format='png')
 
 
-def generate_query_comparison_chart(baseline, comparison):
+def generate_query_comparison_chart(baseline, comparison, label1: str, label2: str, benchmark: str):
     queries = []
     a = []
     b = []
@@ -89,13 +90,13 @@ def generate_query_comparison_chart(baseline, comparison):
 
     # Create a bar chart
     fig, ax = plt.subplots(figsize=(10, 6))
-    bar1 = ax.bar(index, a, bar_width, label='Spark')
-    bar2 = ax.bar(index + bar_width, b, bar_width, label='Spark + Comet')
+    bar1 = ax.bar(index, a, bar_width, label=label1)
+    bar2 = ax.bar(index + bar_width, b, bar_width, label=label2)
 
     # Add labels, title, and legend
+    ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")")
     ax.set_xlabel('Queries')
-    ax.set_ylabel('Run Time')
-    ax.set_title('TPC-H Queries')
+    ax.set_ylabel('Query Time (seconds)')
     ax.set_xticks(index + bar_width / 2)
     ax.set_xticklabels(queries)
     ax.legend()
@@ -103,7 +104,7 @@ def generate_query_comparison_chart(baseline, comparison):
     # Save the plot as an image file
     plt.savefig('tpch_queries_compare.png', format='png')
 
-def generate_summary(baseline, comparison):
+def generate_summary(baseline, comparison, label1: str, label2: str, benchmark: str):
     baseline_total = 0
     comparison_total = 0
     for query in range(1, 23):
@@ -114,12 +115,10 @@ def generate_summary(baseline, comparison):
     fig, ax = plt.subplots()
 
     # Add title and labels
-    #TODO make title configurable
-    ax.set_title('TPC-H Performance (scale factor 100)')
+    ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")")
     ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
 
-    # TODO make labels configurable
-    labels = ['Spark', 'Spark + Comet']
+    labels = [label1, label2]
     times = [round(baseline_total,0), round(comparison_total,0)]
 
     # Create bar chart
@@ -132,15 +131,21 @@ def generate_summary(baseline, comparison):
 
     plt.savefig('tpch_allqueries.png', format='png')
 
-def main(filename1: str, filename2: str):
+def main(filename1: str, filename2: str, label1: str, label2: str, benchmark: str):
     with open(filename1) as f1:
         baseline = json.load(f1)
     with open(filename2) as f2:
         comparison = json.load(f2)
-    generate_summary(baseline, comparison)
-    generate_query_comparison_chart(baseline, comparison)
-    generate_query_speedup_chart(baseline, comparison)
+    generate_summary(baseline, comparison, label1, label2, benchmark)
+    generate_query_comparison_chart(baseline, comparison, label1, label2, benchmark)
+    generate_query_speedup_chart(baseline, comparison, label1, label2, benchmark)
 
 if __name__ == '__main__':
-    # TODO argparse
-    main(sys.argv[1], sys.argv[2])
+    argparse = argparse.ArgumentParser(description='Generate comparison')
+    argparse.add_argument('filename1', type=str, help='First file')
+    argparse.add_argument('filename2', type=str, help='Second file')
+    argparse.add_argument('label1', type=str, help='First label')
+    argparse.add_argument('label2', type=str, help='Second label')
+    argparse.add_argument('benchmark', type=str, help='Benchmark description')
+    args = argparse.parse_args()
+    main(args.filename1, args.filename2, args.label1, args.label2, args.benchmark)

From fdb498a1c015e0cebc611076ca6fe4c6589148d8 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 30 May 2024 11:15:25 -0600
Subject: [PATCH 7/8] support more than 2 files

---
 scripts/generate-comparison.py | 67 +++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/scripts/generate-comparison.py b/scripts/generate-comparison.py
index c8cf5d3..b13904e 100644
--- a/scripts/generate-comparison.py
+++ b/scripts/generate-comparison.py
@@ -73,28 +73,29 @@ def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str,
     plt.savefig('tpch_queries_speedup.png', format='png')
 
 
-def generate_query_comparison_chart(baseline, comparison, label1: str, label2: str, benchmark: str):
+def generate_query_comparison_chart(results, labels, benchmark: str):
     queries = []
-    a = []
-    b = []
+    benches = []
+    for _ in results:
+        benches.append([])
     for query in range(1, 23):
         queries.append("q" + str(query))
-        a.append(np.median(np.array(baseline[str(query)])))
-        b.append(np.median(np.array(comparison[str(query)])))
+        for i in range(0, len(results)):
+            benches[i].append(np.median(np.array(results[i][str(query)])))
 
     # Define the width of the bars
-    bar_width = 0.35
+    bar_width = 0.3
 
     # Define the positions of the bars on the x-axis
-    index = np.arange(len(queries))
+    index = np.arange(len(queries)) * 1.5
 
     # Create a bar chart
-    fig, ax = plt.subplots(figsize=(10, 6))
-    bar1 = ax.bar(index, a, bar_width, label=label1)
-    bar2 = ax.bar(index + bar_width, b, bar_width, label=label2)
+    fig, ax = plt.subplots(figsize=(15, 6))
+    for i in range(0, len(results)):
+        bar = ax.bar(index + i * bar_width, benches[i], bar_width, label=labels[i])
 
     # Add labels, title, and legend
-    ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")")
+    ax.set_title(benchmark)
     ax.set_xlabel('Queries')
     ax.set_ylabel('Query Time (seconds)')
     ax.set_xticks(index + bar_width / 2)
@@ -104,22 +105,23 @@ def generate_query_comparison_chart(baseline, comparison, label1: str, label2: s
     # Save the plot as an image file
     plt.savefig('tpch_queries_compare.png', format='png')
 
-def generate_summary(baseline, comparison, label1: str, label2: str, benchmark: str):
-    baseline_total = 0
-    comparison_total = 0
+def generate_summary(results, labels, benchmark: str):
+    timings = []
+    for _ in results:
+        timings.append(0)
+
     for query in range(1, 23):
-        baseline_total += np.median(np.array(baseline[str(query)]))
-        comparison_total += np.median(np.array(comparison[str(query)]))
+        for i in range(0, len(results)):
+            timings[i] += np.median(np.array(results[i][str(query)]))
 
     # Create figure and axis
     fig, ax = plt.subplots()
 
     # Add title and labels
-    ax.set_title(label1 + " vs " + label2 + " (" + benchmark + ")")
+    ax.set_title(benchmark)
     ax.set_ylabel('Time in seconds to run all 22 TPC-H queries (lower is better)')
 
-    labels = [label1, label2]
-    times = [round(baseline_total,0), round(comparison_total,0)]
+    times = [round(x,0) for x in timings]
 
     # Create bar chart
     bars = ax.bar(labels, times, color='skyblue')
@@ -131,21 +133,33 @@ def generate_summary(baseline, comparison, label1: str, label2: str, benchmark:
 
     plt.savefig('tpch_allqueries.png', format='png')
 
-def main(filename1: str, filename2: str, label1: str, label2: str, benchmark: str):
-    with open(filename1) as f1:
-        baseline = json.load(f1)
-    with open(filename2) as f2:
-        comparison = json.load(f2)
-    generate_summary(baseline, comparison, label1, label2, benchmark)
-    generate_query_comparison_chart(baseline, comparison, label1, label2, benchmark)
-    generate_query_speedup_chart(baseline, comparison, label1, label2, benchmark)
+def main(files, labels, benchmark: str):
+    """Load each JSON result file and generate the comparison charts.
+
+    files: paths of the JSON result files to load.
+    labels: chart labels, one per result file, in the same order as files.
+    benchmark: benchmark description passed to each chart function.
+
+    Raises ValueError if labels is missing or its length differs from files.
+    """
+    # The chart functions pair labels with results by position, so fail early
+    if labels is None or len(labels) != len(files):
+        raise ValueError('expected exactly one label per result file')
+    results = []
+    for filename in files:
+        with open(filename) as f:
+            results.append(json.load(f))
+    generate_summary(results, labels, benchmark)
+    generate_query_comparison_chart(results, labels, benchmark)
+    # The speedup chart compares one pair of runs: first file vs second file
+    if len(files) == 2:
+        generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark)
 
 if __name__ == '__main__':
-    argparse = argparse.ArgumentParser(description='Generate comparison')
-    argparse.add_argument('filename1', type=str, help='First file')
-    argparse.add_argument('filename2', type=str, help='Second file')
-    argparse.add_argument('label1', type=str, help='First label')
-    argparse.add_argument('label2', type=str, help='Second label')
-    argparse.add_argument('benchmark', type=str, help='Benchmark description')
-    args = argparse.parse_args()
-    main(args.filename1, args.filename2, args.label1, args.label2, args.benchmark)
+    # --labels is required because main() needs one label per result file
+    parser = argparse.ArgumentParser(description='Generate comparison')
+    parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
+    parser.add_argument('--labels', nargs='+', type=str, required=True, help='Labels')
+    parser.add_argument('--benchmark', type=str, help='Benchmark description')
+    args = parser.parse_args()
+    main(args.filenames, args.labels, args.benchmark)

From 45bd311e1b77bf4a6740c1d82ce88c619ad320e9 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Thu, 30 May 2024 19:10:41 -0600
Subject: [PATCH 8/8] update README

---
 .gitignore |  4 +++-
 README.md  | 31 ++++++++++++++++++-------------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2a2f6e2..f4e2059 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
+venv
 .idea
-*.json
\ No newline at end of file
+*.json
+*.png
diff --git a/README.md b/README.md
index b772f3e..e405e6c 100644
--- a/README.md
+++ b/README.md
@@ -48,17 +48,12 @@ prohibited by the TPC.
 
 ## Data Generation
 
-See the benchmark-specific instructions for generating the CSV data for [TPC-H](tpch) and [TPC-DS](tpcds).
-
-## Converting CSV data to Parquet
-
-Although it is valid to run benchmarks against CSV data, this does not really represent how most of the world is
-running OLAP queries, especially when dealing with large datasets. When benchmarking DataFusion, we typically want
-to be querying Parquet data, so first we must convert the generated datasets to Parquet. Also, we typically do not
-want a single file per table, so we also need to repartition the data.
-
-We plan on adding Python scripts in this repository to perform this conversion and repartitioning. Until then you may
-want to write your own scripts using DataFusion or Spark. Another option is to use [tpc-tools](https://crates.io/crates/tpctools).
+See the benchmark-specific instructions for generating the CSV data for [TPC-H](tpch) and [TPC-DS](tpcds) and for 
+converting that data to Parquet format. Although it is valid to run benchmarks against CSV data, this does not really 
+represent how most of the world is running OLAP queries, especially when dealing with large datasets. When benchmarking 
+DataFusion and its subprojects, we typically want to be querying Parquet data. Also, we typically do not
+want a single file per table, so we also need to repartition the data. The provided scripts take care of this conversion 
+and repartitioning.
 
 ## Running the Benchmarks with DataFusion
 
@@ -67,10 +62,20 @@ Scripts are available for the following DataFusion projects:
 - [DataFusion Python](./runners/datafusion-python)
 - [DataFusion Comet](./runners/datafusion-comet)
 
+These benchmarking scripts produce JSON files containing query timings.
+
 ## Comparing Results
 
-Coming soon. The plan is to add some Python scripts for comparing results from different runs and producing charts
-that we can use in blog posts.
+The Python script [scripts/generate-comparison.py](scripts/generate-comparison.py) can be used to produce charts 
+comparing results from different benchmark runs.
+
+For example:
+
+```shell
+python scripts/generate-comparison.py file1.json file2.json --labels "Spark" "Comet" --benchmark "TPC-H 100GB"
+```
+
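+This will write `tpch_allqueries.png` and `tpch_queries_compare.png` (plus `tpch_queries_speedup.png` when exactly two result files are given) to the current directory.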
 
 ## Legal Notices