36 changes: 28 additions & 8 deletions parquet-benchmarks/README.md
@@ -17,22 +17,42 @@
~ under the License.
-->

# Running Parquet Benchmarks

The Parquet benchmarks in this module are run using the
[OpenJDK Java Microbenchmark Harness (JMH)](http://openjdk.java.net/projects/code-tools/jmh/).

First, building the `parquet-benchmarks` module creates an uber-jar including the Parquet
classes and all dependencies, and a main class to launch the JMH tool.

```
mvn --projects parquet-benchmarks -amd -DskipTests -Denforcer.skip=true clean package
```

JMH doesn't have the notion of "benchmark suites", but there are certain benchmarks that
make sense to group together or to run in isolation during development. The
`./parquet-benchmarks/run.sh` script can be used to launch all or some benchmarks:

```
# More information about the run script and the available arguments.
./parquet-benchmarks/run.sh

# More information on the JMH options available.
./parquet-benchmarks/run.sh all -help

# Run every benchmark once (~20 minutes).
./parquet-benchmarks/run.sh all -wi 0 -i 1 -f 1

# A more rigorous run of all benchmarks, saving a report for comparison.
./parquet-benchmarks/run.sh all -wi 5 -i 5 -f 3 -rff /tmp/benchmark1.json

# Run a benchmark "suite" built into the script, with JMH defaults (about 30 minutes).
./parquet-benchmarks/run.sh checksum

# Running one specific benchmark using a regex.
./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.NestedNullWritingBenchmarks

# Manually clean up any state left behind from a previous run.
./parquet-benchmarks/run.sh clean
```

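The regex form shown in the last example works because JMH treats the benchmark argument as a regular expression and searches for it within each fully qualified benchmark name. A small stdlib sketch of that selection idea (the sample benchmark names below are hypothetical; JMH's real matching lives inside its option parser):

```java
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class BenchmarkFilter {
    // Hypothetical sample of fully qualified benchmark names, for illustration.
    static final List<String> BENCHMARKS = List.of(
        "org.apache.parquet.benchmarks.ReadBenchmarks.read1MRows",
        "org.apache.parquet.benchmarks.WriteBenchmarks.write1MRows",
        "org.apache.parquet.benchmarks.PageChecksumReadBenchmarks.read1MRowsGzipWithVerification");

    // Select benchmarks roughly the way a JMH include pattern does:
    // the argument is compiled as a regex and searched for in each name.
    static List<String> select(String regex) {
        Pattern p = Pattern.compile(regex);
        return BENCHMARKS.stream()
                         .filter(name -> p.matcher(name).find())
                         .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        // Matches only the PageChecksum benchmark from the sample list.
        System.out.println(select("org.apache.parquet.benchmarks.PageChecksum.*"));
    }
}
```

This is why passing a fully qualified class name, as in the example above, narrows the run to that one benchmark class.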
96 changes: 88 additions & 8 deletions parquet-benchmarks/run.sh
@@ -20,11 +20,91 @@

SCRIPT_PATH=$( cd "$(dirname "$0")" ; pwd -P )

echo "Starting WRITE benchmarks"
java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*Write* "$@"
echo "Generating test data"
java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator generate
echo "Data generated, starting READ benchmarks"
java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*Read* "$@"
echo "Cleaning up generated data"
java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator cleanup
BENCHMARK=$1; shift
JMH_OPTIONS="$@"

if [ -z "$BENCHMARK" ]; then

# Print usage if run without arguments.
cat << EOF
Runs Parquet JMH-based benchmarks.

Usage:
run.sh <BENCHMARK> [JMH_OPTIONS]

Information on the JMH_OPTIONS can be found by running: run.sh all -help

<BENCHMARK> | Description
----------- | ----------
all | Runs all benchmarks in the module (listed here and others).
build | (No benchmark run, shortcut to rebuild the JMH uber jar).
clean | (No benchmark run, shortcut to clean up any temporary files).
read | Reading files with different compression, page and block sizes.
write | Writing files.
checksum | Reading and writing with and without CRC checksums.
filter      | Filtering using column indexes.

Examples:

# More information about the run script and the available arguments.
./parquet-benchmarks/run.sh

# More information on the JMH options available.
./parquet-benchmarks/run.sh all -help

# Run every benchmark once (~20 minutes).
./parquet-benchmarks/run.sh all -wi 0 -i 1 -f 1

# A more rigourous run of all benchmarks, saving a report for comparison.
./parquet-benchmarks/run.sh all -wi 5 -i 5 -f 3 -rff /tmp/benchmark1.json

# Run a benchmark "suite" built into the script, with JMH defaults (about 30 minutes).
./parquet-benchmarks/run.sh checksum

# Running one specific benchmark using a regex.
./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.NestedNullWritingBenchmarks

EOF

elif [ "$BENCHMARK" == "build" ]; then

# Shortcut utility to rebuild the benchmark module only.
( cd $SCRIPT_PATH && mvn -amd -DskipTests -Denforcer.skip=true clean package )

elif [ "$BENCHMARK" == "clean" ]; then

# Shortcut utility to clean any state left behind from any previous run.
java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator cleanup

else

# Actually run a benchmark in the JMH harness.

# Build the benchmark uberjar if it doesn't already exist.
if [ ! -f ${SCRIPT_PATH}/target/parquet-benchmarks.jar ]; then
${SCRIPT_PATH}/run.sh build
fi

# Pick a regex if specified.
BENCHMARK_REGEX=""
case "$BENCHMARK" in
"read")
BENCHMARK_REGEX="org.apache.parquet.benchmarks.ReadBenchmarks"
;;
"write")
BENCHMARK_REGEX="org.apache.parquet.benchmarks.WriteBenchmarks"
;;
"checksum")
BENCHMARK_REGEX="org.apache.parquet.benchmarks.PageChecksum.*"
;;
"filter")
BENCHMARK_REGEX="org.apache.parquet.benchmarks.FilteringBenchmarks"
;;
esac

echo JMH command: java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar $BENCHMARK_REGEX $JMH_OPTIONS
java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar $BENCHMARK_REGEX $JMH_OPTIONS

# Clean any data files generated by the benchmarks.
${SCRIPT_PATH}/run.sh clean
fi
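The `clean` branch above delegates to `DataGenerator cleanup`, which with this change removes the whole benchmark target directory instead of listing individual files. The real code uses Hadoop's `FileSystem` recursive delete; a minimal stdlib sketch of the same idea, assuming only `java.nio.file`:

```java
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;

public class CleanupSketch {
    // Delete a directory tree, children before parents; a no-op if the
    // root does not exist (mirroring deleteIfExists semantics).
    // The actual benchmarks use Hadoop's FileSystem recursive delete instead.
    static void deleteIfExists(Path root) throws IOException {
        if (!Files.exists(root)) {
            return;
        }
        try (var paths = Files.walk(root)) {
            paths.sorted(Comparator.reverseOrder())   // deepest entries first
                 .forEach(p -> {
                     try {
                         Files.delete(p);
                     } catch (IOException e) {
                         throw new UncheckedIOException(e);
                     }
                 });
        }
    }

    public static void main(String[] args) throws IOException {
        Path dir = Files.createTempDirectory("ParquetBenchmarks");
        Files.createFile(dir.resolve("PARQUET-1M"));
        deleteIfExists(dir);
        System.out.println(Files.exists(dir)); // false: tree fully removed
    }
}
```

Deleting the directory wholesale is what lets the generator classes drop their long per-file delete lists, as the diffs below show.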
@@ -25,6 +25,8 @@ public class BenchmarkFiles {
public static final Configuration configuration = new Configuration();

public static final String TARGET_DIR = "target/tests/ParquetBenchmarks";
  public static final Path targetDir = new Path(TARGET_DIR);

public static final Path file_1M = new Path(TARGET_DIR + "/PARQUET-1M");

//different block and page sizes
@@ -115,14 +115,7 @@ public void generateData(Path outFile, Configuration configuration, ParquetPrope

public void cleanup()
{
deleteIfExists(configuration, targetDir);
}

public static void main(String[] args) {
@@ -40,7 +40,7 @@
import static org.apache.parquet.benchmarks.BenchmarkUtils.exists;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.*;

public class PageChecksumDataGenerator extends DataGenerator {

private final MessageType SCHEMA = MessageTypeParser.parseMessageType(
"message m {" +
@@ -103,25 +103,4 @@ public void generateAll() {
throw new RuntimeException(e);
}
}

}
@@ -51,16 +51,15 @@ public class PageChecksumReadBenchmarks {

private PageChecksumDataGenerator pageChecksumDataGenerator = new PageChecksumDataGenerator();

/**
* This needs to be done exactly once. To avoid needlessly regenerating the files for reading, they aren't cleaned
* as part of the benchmark. If the files exist, a message will be printed and they will not be regenerated.
*/
@Setup(Level.Trial)
public void setup() {
pageChecksumDataGenerator.generateAll();
}

private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
throws IOException {
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
@@ -82,96 +81,114 @@ private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole b

// 100k rows, uncompressed, GZIP, Snappy

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsUncompressedWithVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, true, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsGzipWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsGzipWithVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, true, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read100KRowsSnappyWithVerification(Blackhole blackhole) throws IOException {
readFile(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, true, blackhole);
}

// 1M rows, uncompressed, GZIP, Snappy

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsUncompressedWithVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, true, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsGzipWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_GZIP, ONE_MILLION, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsGzipWithVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_GZIP, ONE_MILLION, true, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read1MRowsSnappyWithVerification(Blackhole blackhole) throws IOException {
readFile(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, true, blackhole);
}

// 10M rows, uncompressed, GZIP, Snappy

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsUncompressedWithVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, true, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsGzipWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsGzipWithVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, true, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, false, blackhole);
}

@Benchmark
@BenchmarkMode(Mode.SingleShotTime)
public void read10MRowsSnappyWithVerification(Blackhole blackhole) throws IOException {
readFile(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, true, blackhole);
}