diff --git a/parquet-benchmarks/README.md b/parquet-benchmarks/README.md index 8da067b09b..63101bd1b5 100644 --- a/parquet-benchmarks/README.md +++ b/parquet-benchmarks/README.md @@ -17,22 +17,42 @@ ~ under the License. --> -##Running Parquet Benchmarks +# Running Parquet Benchmarks -First, build the ``parquet-benchmarks`` module +The Parquet benchmarks in this module are run using the +[OpenJDK Java Microbenchmark Harness](http://openjdk.java.net/projects/code-tools/jmh/). + +First, building the `parquet-benchmarks` module creates an uber-jar including the Parquet +classes and all dependencies, and a main class to launch the JMH tool. ``` mvn --projects parquet-benchmarks -amd -DskipTests -Denforcer.skip=true clean package ``` -Then, you can run all the benchmarks with the following command +JMH doesn't have the notion of "benchmark suites", but there are certain benchmarks that +make sense to group together or to run in isolation during development. The +`./parquet-benchmarks/run.sh` script can be used to launch all or some benchmarks: ``` -./parquet-benchmarks/run.sh -wi 5 -i 5 -f 3 -bm all -``` +# More information about the run script and the available arguments. +./parquet-benchmarks/run.sh + +# More information on the JMH options available. +./parquet-benchmarks/run.sh all -help + +# Run every benchmark once (~20 minutes). +./parquet-benchmarks/run.sh all -wi 0 -i 1 -f 1 -To understand what each command line argument means and for more arguments please see +# A more rigorous run of all benchmarks, saving a report for comparison. +./parquet-benchmarks/run.sh all -wi 5 -i 5 -f 3 -rff /tmp/benchmark1.json +# Run a benchmark "suite" built into the script, with JMH defaults (about 30 minutes) +./parquet-benchmarks/run.sh checksum + +# Running one specific benchmark using a regex. +./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.NestedNullWritingBenchmarks + +# Manually clean up any state left behind from a previous run. 
+./parquet-benchmarks/run.sh clean ``` -java -jar parquet-benchmarks/target/parquet-benchmarks.jar -help -``` \ No newline at end of file + diff --git a/parquet-benchmarks/run.sh b/parquet-benchmarks/run.sh index 8aa1e69ab3..ba407662d2 100755 --- a/parquet-benchmarks/run.sh +++ b/parquet-benchmarks/run.sh @@ -20,11 +20,91 @@ SCRIPT_PATH=$( cd "$(dirname "$0")" ; pwd -P ) -echo "Starting WRITE benchmarks" -java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*Write* "$@" -echo "Generating test data" -java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator generate -echo "Data generated, starting READ benchmarks" -java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*Read* "$@" -echo "Cleaning up generated data" -java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator cleanup +BENCHMARK=$1; shift +JMH_OPTIONS="$@" + +if [ -z "$BENCHMARK" ]; then + + # Print usage if run without arguments. + cat << EOF +Runs Parquet JMH-based benchmarks. + +Usage: + run.sh [JMH_OPTIONS] + +Information on the JMH_OPTIONS can be found by running: run.sh all -help + + | Description +----------- | ---------- +all | Runs all benchmarks in the module (listed here and others). +build | (No benchmark run, shortcut to rebuild the JMH uber jar). +clean | (No benchmark run, shortcut to clean up any temporary files). +read | Reading files with different compression, page and block sizes. +write | Writing files. +checksum | Reading and writing with and without CRC checksums. +filter | Filtering column indexes. + +Examples: + +# More information about the run script and the available arguments. +./parquet-benchmarks/run.sh + +# More information on the JMH options available. +./parquet-benchmarks/run.sh all -help + +# Run every benchmark once (~20 minutes). +./parquet-benchmarks/run.sh all -wi 0 -i 1 -f 1 + +# A more rigorous run of all benchmarks, saving a report for comparison. 
+./parquet-benchmarks/run.sh all -wi 5 -i 5 -f 3 -rff /tmp/benchmark1.json + +# Run a benchmark "suite" built into the script, with JMH defaults (about 30 minutes) +./parquet-benchmarks/run.sh checksum + +# Running one specific benchmark using a regex. +./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.NestedNullWritingBenchmarks + +EOF + +elif [ "$BENCHMARK" == "build" ]; then + + # Shortcut utility to rebuild the benchmark module only. + ( cd $SCRIPT_PATH && mvn -amd -DskipTests -Denforcer.skip=true clean package ) + +elif [ "$BENCHMARK" == "clean" ]; then + + # Shortcut utility to clean any state left behind from any previous run. + java -cp ${SCRIPT_PATH}/target/parquet-benchmarks.jar org.apache.parquet.benchmarks.DataGenerator cleanup + +else + + # Actually run a benchmark in the JMH harness. + + # Build the benchmark uberjar if it doesn't already exist. + if [ ! -f ${SCRIPT_PATH}/target/parquet-benchmarks.jar ]; then + ${SCRIPT_PATH}/run.sh build + fi + + # Pick a regex if specified. + BENCHMARK_REGEX="" + case "$BENCHMARK" in + "read") + BENCHMARK_REGEX="org.apache.parquet.benchmarks.ReadBenchmarks" + ;; + "write") + BENCHMARK_REGEX="org.apache.parquet.benchmarks.WriteBenchmarks" + ;; + "checksum") + BENCHMARK_REGEX="org.apache.parquet.benchmarks.PageChecksum.*" + ;; + "filter") + BENCHMARK_REGEX="org.apache.parquet.benchmarks.FilteringBenchmarks" + ;; + esac + + echo JMH command: java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar $BENCHMARK_REGEX $JMH_OPTIONS + java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar $BENCHMARK_REGEX $JMH_OPTIONS + + # Clean any data files generated by the benchmarks. 
+ ${SCRIPT_PATH}/run.sh clean +fi diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java index f039403bfc..24da8220ca 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java @@ -25,6 +25,8 @@ public class BenchmarkFiles { public static final Configuration configuration = new Configuration(); public static final String TARGET_DIR = "target/tests/ParquetBenchmarks"; + public static final Path targetDir = new Path(TARGET_DIR ); + public static final Path file_1M = new Path(TARGET_DIR + "/PARQUET-1M"); //different block and page sizes diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java index 42d9953e68..3b5db686fa 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java @@ -115,14 +115,7 @@ public void generateData(Path outFile, Configuration configuration, ParquetPrope public void cleanup() { - deleteIfExists(configuration, file_1M); - deleteIfExists(configuration, file_1M_BS256M_PS4M); - deleteIfExists(configuration, file_1M_BS256M_PS8M); - deleteIfExists(configuration, file_1M_BS512M_PS4M); - deleteIfExists(configuration, file_1M_BS512M_PS8M); -// deleteIfExists(configuration, parquetFile_1M_LZO); - deleteIfExists(configuration, file_1M_SNAPPY); - deleteIfExists(configuration, file_1M_GZIP); + deleteIfExists(configuration, targetDir); } public static void main(String[] args) { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java 
b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java index 6c62cc6e6d..49ebdce8e3 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumDataGenerator.java @@ -40,7 +40,7 @@ import static org.apache.parquet.benchmarks.BenchmarkUtils.exists; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.*; -public class PageChecksumDataGenerator { +public class PageChecksumDataGenerator extends DataGenerator { private final MessageType SCHEMA = MessageTypeParser.parseMessageType( "message m {" + @@ -103,25 +103,4 @@ public void generateAll() { throw new RuntimeException(e); } } - - public void cleanup() { - deleteIfExists(configuration, file_100K_NOCHECKSUMS_UNCOMPRESSED); - deleteIfExists(configuration, file_100K_CHECKSUMS_UNCOMPRESSED); - deleteIfExists(configuration, file_100K_NOCHECKSUMS_GZIP); - deleteIfExists(configuration, file_100K_CHECKSUMS_GZIP); - deleteIfExists(configuration, file_100K_NOCHECKSUMS_SNAPPY); - deleteIfExists(configuration, file_100K_CHECKSUMS_SNAPPY); - deleteIfExists(configuration, file_1M_NOCHECKSUMS_UNCOMPRESSED); - deleteIfExists(configuration, file_1M_CHECKSUMS_UNCOMPRESSED); - deleteIfExists(configuration, file_1M_NOCHECKSUMS_GZIP); - deleteIfExists(configuration, file_1M_CHECKSUMS_GZIP); - deleteIfExists(configuration, file_1M_NOCHECKSUMS_SNAPPY); - deleteIfExists(configuration, file_1M_CHECKSUMS_SNAPPY); - deleteIfExists(configuration, file_10M_NOCHECKSUMS_UNCOMPRESSED); - deleteIfExists(configuration, file_10M_CHECKSUMS_UNCOMPRESSED); - deleteIfExists(configuration, file_10M_NOCHECKSUMS_GZIP); - deleteIfExists(configuration, file_10M_CHECKSUMS_GZIP); - deleteIfExists(configuration, file_10M_NOCHECKSUMS_SNAPPY); - deleteIfExists(configuration, file_10M_CHECKSUMS_SNAPPY); - } } diff --git 
a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java index db23eeb672..be2ebe40f7 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumReadBenchmarks.java @@ -51,16 +51,15 @@ public class PageChecksumReadBenchmarks { private PageChecksumDataGenerator pageChecksumDataGenerator = new PageChecksumDataGenerator(); + /** + * This needs to be done exactly once. To avoid needlessly regenerating the files for reading, they aren't cleaned + * as part of the benchmark. If the files exist, a message will be printed and they will not be regenerated. + */ @Setup(Level.Trial) public void setup() { pageChecksumDataGenerator.generateAll(); } - @Setup(Level.Trial) - public void cleanup() { - pageChecksumDataGenerator.cleanup(); - } - private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole) throws IOException { try (ParquetReader reader = ParquetReader.builder(new GroupReadSupport(), file) @@ -82,96 +81,114 @@ private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole b // 100k rows, uncompressed, GZIP, Snappy - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read100KRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read100KRowsUncompressedWithVerification(Blackhole blackhole) throws IOException { readFile(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, true, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void 
read100KRowsGzipWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read100KRowsGzipWithVerification(Blackhole blackhole) throws IOException { readFile(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, true, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read100KRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read100KRowsSnappyWithVerification(Blackhole blackhole) throws IOException { readFile(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, true, blackhole); } // 1M rows, uncompressed, GZIP, Snappy - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsUncompressedWithVerification(Blackhole blackhole) throws IOException { readFile(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, true, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsGzipWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_1M_CHECKSUMS_GZIP, ONE_MILLION, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsGzipWithVerification(Blackhole blackhole) throws IOException { readFile(file_1M_CHECKSUMS_GZIP, ONE_MILLION, true, 
blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsSnappyWithVerification(Blackhole blackhole) throws IOException { readFile(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, true, blackhole); } // 10M rows, uncompressed, GZIP, Snappy - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read10MRowsUncompressedWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read10MRowsUncompressedWithVerification(Blackhole blackhole) throws IOException { readFile(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, true, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read10MRowsGzipWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read10MRowsGzipWithVerification(Blackhole blackhole) throws IOException { readFile(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, true, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read10MRowsSnappyWithoutVerification(Blackhole blackhole) throws IOException { readFile(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, false, blackhole); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + 
@BenchmarkMode(Mode.SingleShotTime) public void read10MRowsSnappyWithVerification(Blackhole blackhole) throws IOException { readFile(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, true, blackhole); } diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java index c743dde01e..e892d53a76 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/PageChecksumWriteBenchmarks.java @@ -57,102 +57,120 @@ public class PageChecksumWriteBenchmarks { private PageChecksumDataGenerator pageChecksumDataGenerator = new PageChecksumDataGenerator(); @Setup(Level.Iteration) - public void cleanup() { + public void setup() { pageChecksumDataGenerator.cleanup(); } // 100k rows, uncompressed, GZIP, Snappy - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write100KRowsUncompressedWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_100K_NOCHECKSUMS_UNCOMPRESSED, 100 * ONE_K, false, UNCOMPRESSED); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write100KRowsUncompressedWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_100K_CHECKSUMS_UNCOMPRESSED, 100 * ONE_K, true, UNCOMPRESSED); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write100KRowsGzipWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_100K_NOCHECKSUMS_GZIP, 100 * ONE_K, false, GZIP); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write100KRowsGzipWithChecksums() throws IOException { 
pageChecksumDataGenerator.generateData(file_100K_CHECKSUMS_GZIP, 100 * ONE_K, true, GZIP); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write100KRowsSnappyWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_100K_NOCHECKSUMS_SNAPPY, 100 * ONE_K, false, SNAPPY); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write100KRowsSnappyWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_100K_CHECKSUMS_SNAPPY, 100 * ONE_K, true, SNAPPY); } // 1M rows, uncompressed, GZIP, Snappy - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsUncompressedWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_1M_NOCHECKSUMS_UNCOMPRESSED, ONE_MILLION, false, UNCOMPRESSED); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsUncompressedWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_1M_CHECKSUMS_UNCOMPRESSED, ONE_MILLION, true, UNCOMPRESSED); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsGzipWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_1M_NOCHECKSUMS_GZIP, ONE_MILLION, false, GZIP); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsGzipWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_1M_CHECKSUMS_GZIP, ONE_MILLION, true, GZIP); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsSnappyWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_1M_NOCHECKSUMS_SNAPPY, ONE_MILLION, 
false, SNAPPY); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsSnappyWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_1M_CHECKSUMS_SNAPPY, ONE_MILLION, true, SNAPPY); } // 10M rows, uncompressed, GZIP, Snappy - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write10MRowsUncompressedWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_10M_NOCHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, false, UNCOMPRESSED); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write10MRowsUncompressedWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_10M_CHECKSUMS_UNCOMPRESSED, 10 * ONE_MILLION, true, UNCOMPRESSED); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write10MRowsGzipWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_10M_NOCHECKSUMS_GZIP, 10 * ONE_MILLION, false, GZIP); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write10MRowsGzipWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_10M_CHECKSUMS_GZIP, 10 * ONE_MILLION, true, GZIP); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write10MRowsSnappyWithoutChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_10M_NOCHECKSUMS_SNAPPY, 10 * ONE_MILLION, false, SNAPPY); } - @Benchmark @BenchmarkMode(Mode.SingleShotTime) + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write10MRowsSnappyWithChecksums() throws IOException { pageChecksumDataGenerator.generateData(file_10M_CHECKSUMS_SNAPPY, 10 * ONE_MILLION, true, SNAPPY); } diff --git 
a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java index dba5544a5e..e74204a69d 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java @@ -20,6 +20,13 @@ import org.apache.hadoop.fs.Path; import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.infra.Blackhole; import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetReader; @@ -29,7 +36,9 @@ import java.io.IOException; +@State(Scope.Benchmark) public class ReadBenchmarks { + private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException { ParquetReader reader = ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build(); @@ -47,7 +56,17 @@ private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOExc reader.close(); } + /** + * This needs to be done exactly once. To avoid needlessly regenerating the files for reading, they aren't cleaned + * as part of the benchmark. If the files exist, a message will be printed and they will not be regenerated. 
+ */ + @Setup(Level.Trial) + public void generateFilesForRead() { + new DataGenerator().generateAll(); + } + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsDefaultBlockAndPageSizeUncompressed(Blackhole blackhole) throws IOException { @@ -55,6 +74,7 @@ public void read1MRowsDefaultBlockAndPageSizeUncompressed(Blackhole blackhole) } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsBS256MPS4MUncompressed(Blackhole blackhole) throws IOException { @@ -62,6 +82,7 @@ public void read1MRowsBS256MPS4MUncompressed(Blackhole blackhole) } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsBS256MPS8MUncompressed(Blackhole blackhole) throws IOException { @@ -69,6 +90,7 @@ public void read1MRowsBS256MPS8MUncompressed(Blackhole blackhole) } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsBS512MPS4MUncompressed(Blackhole blackhole) throws IOException { @@ -76,6 +98,7 @@ public void read1MRowsBS512MPS4MUncompressed(Blackhole blackhole) } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsBS512MPS8MUncompressed(Blackhole blackhole) throws IOException { @@ -91,6 +114,7 @@ public void read1MRowsBS512MPS8MUncompressed(Blackhole blackhole) // } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsDefaultBlockAndPageSizeSNAPPY(Blackhole blackhole) throws IOException { @@ -98,6 +122,7 @@ public void read1MRowsDefaultBlockAndPageSizeSNAPPY(Blackhole blackhole) } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void read1MRowsDefaultBlockAndPageSizeGZIP(Blackhole blackhole) throws IOException { diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java index 5c26a845dc..0a2d2c058b 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java +++ 
b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java @@ -19,7 +19,9 @@ package org.apache.parquet.benchmarks; import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Mode; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; @@ -39,12 +41,13 @@ public class WriteBenchmarks { private DataGenerator dataGenerator = new DataGenerator(); @Setup(Level.Iteration) - public void cleanup() { + public void setup() { //clean existing test data at the beginning of each iteration dataGenerator.cleanup(); } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsDefaultBlockAndPageSizeUncompressed() throws IOException { @@ -59,6 +62,7 @@ public void write1MRowsDefaultBlockAndPageSizeUncompressed() } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsBS256MPS4MUncompressed() throws IOException { @@ -73,6 +77,7 @@ public void write1MRowsBS256MPS4MUncompressed() } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsBS256MPS8MUncompressed() throws IOException { @@ -87,6 +92,7 @@ public void write1MRowsBS256MPS8MUncompressed() } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsBS512MPS4MUncompressed() throws IOException { @@ -101,6 +107,7 @@ public void write1MRowsBS512MPS4MUncompressed() } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsBS512MPS8MUncompressed() throws IOException { @@ -130,6 +137,7 @@ public void write1MRowsBS512MPS8MUncompressed() // } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsDefaultBlockAndPageSizeSNAPPY() throws IOException { @@ -144,6 +152,7 @@ public void write1MRowsDefaultBlockAndPageSizeSNAPPY() } @Benchmark + @BenchmarkMode(Mode.SingleShotTime) public void write1MRowsDefaultBlockAndPageSizeGZIP() throws IOException { diff --git 
a/parquet-benchmarks/run_checksums.sh b/parquet-benchmarks/src/main/resources/log4j.properties old mode 100755 new mode 100644 similarity index 68% rename from parquet-benchmarks/run_checksums.sh rename to parquet-benchmarks/src/main/resources/log4j.properties index e798488157..f4737c8808 --- a/parquet-benchmarks/run_checksums.sh +++ b/parquet-benchmarks/src/main/resources/log4j.properties @@ -1,4 +1,3 @@ -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,12 +16,9 @@ # under the License. # -# !/usr/bin/env bash - -SCRIPT_PATH=$( cd "$(dirname "$0")" ; pwd -P ) +log4j.rootLogger=INFO, stdout -echo "Page level CRC checksum benchmarks" -echo "Running write benchmarks" -java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*PageChecksumWriteBenchmarks -bm ss "$@" -echo "Running read benchmarks" -java -jar ${SCRIPT_PATH}/target/parquet-benchmarks.jar p*PageChecksumReadBenchmarks -bm ss "$@" +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p :: %m [%C]%n