diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml index d5a288b677..1f5aade313 100644 --- a/parquet-benchmarks/pom.xml +++ b/parquet-benchmarks/pom.xml @@ -87,6 +87,12 @@ slf4j-api ${slf4j.version} + + com.aayushatharva.brotli4j + brotli4j + ${brotli4j.version} + runtime + @@ -94,6 +100,18 @@ org.apache.maven.plugins maven-compiler-plugin + + + + org.openjdk.jmh + jmh-generator-annprocess + ${jmh.version} + + + + org.openjdk.jmh.generators.BenchmarkProcessor + + org.apache.maven.plugins @@ -112,6 +130,12 @@ org.openjdk.jmh.Main + + META-INF/BenchmarkList + + + META-INF/CompilerHints + diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java index 43b907befe..bbf4a8d46d 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BenchmarkFiles.java @@ -36,7 +36,7 @@ public class BenchmarkFiles { public static final Path file_1M_BS512M_PS8M = new Path(TARGET_DIR + "/PARQUET-1M-BS512M_PS8M"); // different compression codecs - // public final Path parquetFile_1M_LZO = new Path("target/tests/ParquetBenchmarks/PARQUET-1M-LZO"); + public static final Path file_1M_LZO = new Path(TARGET_DIR + "/PARQUET-1M-LZO"); public static final Path file_1M_SNAPPY = new Path(TARGET_DIR + "/PARQUET-1M-SNAPPY"); public static final Path file_1M_GZIP = new Path(TARGET_DIR + "/PARQUET-1M-GZIP"); diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java new file mode 100644 index 0000000000..690ddc2bbe --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/BlackHoleOutputFile.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; + +/** + * A no-op {@link OutputFile} that discards all written data. + * Useful for isolating CPU/encoding cost from filesystem I/O in write benchmarks. 
+ */
+public final class BlackHoleOutputFile implements OutputFile {
+
+    public static final BlackHoleOutputFile INSTANCE = new BlackHoleOutputFile(); // sole instance; the class holds no per-file state
+
+    private BlackHoleOutputFile() {} // private, so INSTANCE is the only way to obtain one
+
+    @Override
+    public boolean supportsBlockSize() {
+        return false;
+    }
+
+    @Override
+    public long defaultBlockSize() {
+        return -1L; // NOTE(review): presumably a "not applicable" value since supportsBlockSize() is false - confirm against OutputFile
+    }
+
+    @Override
+    public PositionOutputStream createOrOverwrite(long blockSizeHint) {
+        return create(blockSizeHint); // nothing is ever stored, so overwriting is the same as creating
+    }
+
+    @Override
+    public PositionOutputStream create(long blockSizeHint) { // blockSizeHint is not used
+        return new PositionOutputStream() { // a fresh stream with its own position on every call
+            private long pos; // count of bytes "written" so far; the only thing the stream tracks
+
+            @Override
+            public long getPos() throws IOException {
+                return pos;
+            }
+
+            @Override
+            public void write(int b) throws IOException {
+                ++pos; // the byte itself is dropped
+            }
+
+            @Override
+            public void write(byte[] b, int off, int len) throws IOException {
+                pos += len; // b is never read; off and len are not range-checked
+            }
+        };
+    }
+
+    @Override
+    public String getPath() {
+        return "/dev/null"; // a fixed label; this class never opens that path
+    }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java
new file mode 100644
index 0000000000..7aba6db4d7
--- /dev/null
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CompressionBenchmark.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.DirectByteBufferAllocator; +import org.apache.parquet.compression.CompressionCodecFactory; +import org.apache.parquet.hadoop.CodecFactory; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Isolated JMH benchmarks for raw Parquet compression and decompression throughput. + * + *

<p>Measures the performance of {@link CompressionCodecFactory.BytesInputCompressor}
+ * and {@link CompressionCodecFactory.BytesInputDecompressor} for each supported codec,
+ * using the direct-memory {@link CodecFactory} path (same as actual Parquet file I/O).
+ * Input data is generated to approximate realistic Parquet page content (a mix of
+ * sequential, repeated, and random byte patterns).
+ *
+ *

<p>This benchmark isolates the codec hot path from file I/O, encoding, and other
+ * Parquet overhead, making it ideal for measuring compression-specific optimizations.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(1)
+@Warmup(iterations = 2, time = 1)
+@Measurement(iterations = 3, time = 2)
+@State(Scope.Thread)
+public class CompressionBenchmark {
+
+    @Param({"SNAPPY", "ZSTD", "LZ4_RAW", "GZIP", "BROTLI", "LZO"}) // NOTE(review): confirm every listed codec is loadable on the benchmark classpath
+    public String codec; // resolved with CompressionCodecName.valueOf in setup()
+
+    @Param({"65536", "131072", "262144", "1048576"}) // 64 KiB, 128 KiB, 256 KiB, 1 MiB
+    public int pageSize; // byte length of the generated input; also passed to the codec factory
+
+    private byte[] uncompressedData; // input of compress()
+    private byte[] compressedData; // input of decompress(), produced once in setup()
+    private int decompressedSize; // length of uncompressedData, handed to the decompressor
+
+    private CompressionCodecFactory.BytesInputCompressor compressor;
+    private CompressionCodecFactory.BytesInputDecompressor decompressor;
+    private CodecFactory factory;
+
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        uncompressedData = generatePageData(pageSize, 42L); // fixed seed: the same bytes on every run
+        decompressedSize = uncompressedData.length;
+
+        Configuration conf = new Configuration();
+        factory = CodecFactory.createDirectCodecFactory(conf, DirectByteBufferAllocator.getInstance(), pageSize);
+        CompressionCodecName codecName = CompressionCodecName.valueOf(codec);
+
+        compressor = factory.getCompressor(codecName);
+        decompressor = factory.getDecompressor(codecName);
+
+        // Pre-compress for decompression benchmark; copy to a stable byte array
+        // since the compressor may reuse its internal buffer.
+        BytesInput compressed = compressor.compress(BytesInput.from(uncompressedData));
+        compressedData = compressed.toByteArray();
+    }
+
+    @TearDown(Level.Trial)
+    public void tearDown() {
+        factory.release(); // NOTE(review): presumably frees the factory's codecs and buffers - confirm
+    }
+
+    @Benchmark
+    public BytesInput compress() throws IOException {
+        return compressor.compress(BytesInput.from(uncompressedData)); // NOTE(review): not materialized, unlike decompress() - confirm no codec returns a lazy result here
+    }
+
+    @Benchmark
+    public byte[] decompress() throws IOException {
+        // Force materialization of the decompressed data. Without this, codecs using
+        // the stream-based HeapBytesDecompressor (e.g. GZIP) would return a lazy
+        // StreamBytesInput, deferring the actual work. toByteArray() is essentially
+        // free for our optimized implementations (returns the existing byte[]).
+        return decompressor
+                .decompress(BytesInput.from(compressedData), decompressedSize)
+                .toByteArray();
+    }
+
+    /**
+     * Generates byte data that approximates realistic Parquet page content.
+     * Mixes sequential runs, repeated values, low-range random, and full random
+     * to produce a realistic compression ratio (~2-4x for fast codecs).
+     */
+    static byte[] generatePageData(int size, long seed) {
+        Random random = new Random(seed); // deterministic for a given seed
+        byte[] data = new byte[size];
+        int i = 0; // next index of data to fill
+        while (i < size) {
+            int patternType = random.nextInt(4); // 0..3, one per case below
+            int chunkSize = Math.min(random.nextInt(256) + 64, size - i); // 64..319 bytes, clamped to the space left
+            switch (patternType) {
+                case 0: // Sequential bytes (highly compressible)
+                    for (int j = 0; j < chunkSize && i < size; j++) {
+                        data[i++] = (byte) (j & 0xFF); // restarts at 0 for each chunk
+                    }
+                    break;
+                case 1: // Repeated value (highly compressible)
+                    byte val = (byte) random.nextInt(256);
+                    for (int j = 0; j < chunkSize && i < size; j++) {
+                        data[i++] = val;
+                    }
+                    break;
+                case 2: // Small range random (moderately compressible)
+                    for (int j = 0; j < chunkSize && i < size; j++) {
+                        data[i++] = (byte) random.nextInt(16); // values 0..15
+                    }
+                    break;
+                case 3: // Full random (low compressibility)
+                    byte[] randomChunk = new byte[chunkSize];
+                    random.nextBytes(randomChunk);
+                    int toCopy = Math.min(chunkSize, size - i); // same as chunkSize, which is already clamped above
+                    System.arraycopy(randomChunk, 0, data, i, toCopy);
+                    i += toCopy;
+                    break;
+            }
+        }
+        return data;
+    }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java
new file mode 100644
index 0000000000..de94b422cf
--- /dev/null
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ConcurrentReadWriteBenchmark.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.File; +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import 
org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Multi-threaded benchmarks measuring independent read and write throughput under + * concurrency. Uses {@code @Threads(4)} by default (overridable via JMH {@code -t} flag). + * + *

<p>This benchmark does not assert correctness; it measures the cost of each thread
+ * writing a full file to a stateless sink or reading a shared pre-generated file.
+ * The set of rows used by {@link #concurrentWrite(Blackhole)} is built once during
+ * setup and shared (read-only) across all threads, so the timed section measures
+ * the encoder/serializer pipeline rather than per-row data construction.
+ *
+ *

<ul>
+ *   <li>{@link #concurrentWrite(Blackhole)} - each thread independently writes the
+ *       shared pre-generated rows to a {@link BlackHoleOutputFile} (stateless sink)</li>
+ *   <li>{@link #concurrentRead(Blackhole)} - each thread independently reads the same
+ *       pre-generated Parquet file</li>
+ * </ul>
+ *
+ *

<p>{@link Mode#SingleShotTime} is used because each invocation does enough work
+ * (a full file write or read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows)
+ * that JIT amortization across invocations is unnecessary.
+ */
+@BenchmarkMode(Mode.SingleShotTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Fork(1)
+@Warmup(iterations = 2, batchSize = 1)
+@Measurement(iterations = 5, batchSize = 1)
+@Threads(4)
+@State(Scope.Benchmark)
+public class ConcurrentReadWriteBenchmark {
+
+    private File tempFile; // Parquet file written in setup() and read by concurrentRead
+    private Group[] rows; // built once in setup(); the benchmark methods only read it
+
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        rows = TestDataFactory.generateRows(
+                TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED);
+
+        // Generate a shared file for concurrent reads
+        tempFile = File.createTempFile("parquet-concurrent-bench-", ".parquet");
+        tempFile.deleteOnExit(); // fallback cleanup in case tearDown() does not run
+        tempFile.delete(); // remove so the writer can create it
+
+        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath()))
+                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
+                .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
+                .build()) {
+            for (Group row : rows) {
+                writer.write(row);
+            }
+        }
+    }
+
+    @TearDown(Level.Trial)
+    public void tearDown() {
+        if (tempFile != null && tempFile.exists()) {
+            tempFile.delete(); // best effort; the result is not checked
+        }
+    }
+
+    /**
+     * Each thread writes the shared pre-generated rows independently to the
+     * stateless {@link BlackHoleOutputFile} sink.
+     */
+    @Benchmark
+    public void concurrentWrite(Blackhole bh) throws IOException {
+        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE)
+                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
+                .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
+                .build()) { // closing the writer is part of the timed section
+            for (Group row : rows) {
+                writer.write(row);
+            }
+        }
+        bh.consume(rows);
+    }
+
+    /**
+     * Each thread reads the full pre-generated file independently.
+     */
+    @Benchmark
+    public void concurrentRead(Blackhole bh) throws IOException {
+        InputFile inputFile = new LocalInputFile(tempFile.toPath());
+        try (ParquetReader<Group> reader = new ParquetReader.Builder<Group>(inputFile) { // typed: read() must yield Group
+            @Override
+            protected ReadSupport<Group> getReadSupport() {
+                return new GroupReadSupport();
+            }
+        }.build()) {
+            Group group;
+            while ((group = reader.read()) != null) { // read() returns null once the file is exhausted
+                bh.consume(group);
+            }
+        }
+    }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CpuReadBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CpuReadBenchmark.java
new file mode 100644
index 0000000000..9966ec20ec
--- /dev/null
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CpuReadBenchmark.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * CPU-only read benchmarks measuring decoding and decompression throughput through the + * example {@link Group} API, isolated from filesystem I/O. A Parquet file is written + * to an in-memory byte array during setup, then read back from an {@link InMemoryInputFile} + * during the benchmark so that no disk access contaminates the results. + * + *

<p>Parameterized across compression codec and writer version. For end-to-end benchmarks
+ * that include filesystem I/O, see {@link FileReadBenchmark}.
+ *
+ *

<p>{@link Mode#SingleShotTime} is used because each invocation does enough work
+ * (a full read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT
+ * amortization across invocations is unnecessary. Ten measurement iterations
+ * provide stable statistics for SS mode.
+ */
+@BenchmarkMode(Mode.SingleShotTime)
+@Fork(1)
+@Warmup(iterations = 5, batchSize = 1)
+@Measurement(iterations = 10, batchSize = 1)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+public class CpuReadBenchmark {
+
+    @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW", "BROTLI", "LZO"})
+    public String codec; // resolved with CompressionCodecName.valueOf in setup()
+
+    @Param({"PARQUET_1_0", "PARQUET_2_0"})
+    public String writerVersion; // resolved with WriterVersion.valueOf in setup()
+
+    private byte[] fileBytes; // the complete Parquet file, written once in setup()
+
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        Group[] rows = TestDataFactory.generateRows(
+                TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED);
+        InMemoryOutputFile outputFile = new InMemoryOutputFile();
+        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(outputFile)
+                .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
+                .withCompressionCodec(CompressionCodecName.valueOf(codec))
+                .withWriterVersion(WriterVersion.valueOf(writerVersion))
+                .withDictionaryEncoding(true)
+                .build()) {
+            for (Group row : rows) {
+                writer.write(row);
+            }
+        }
+        fileBytes = outputFile.toByteArray(); // taken after the writer is closed
+    }
+
+    @Benchmark
+    public void readFile(Blackhole bh) throws IOException {
+        InMemoryInputFile inputFile = new InMemoryInputFile(fileBytes);
+        try (ParquetReader<Group> reader = new ParquetReader.Builder<Group>(inputFile) { // typed: read() must yield Group
+            @Override
+            protected ReadSupport<Group> getReadSupport() {
+                return new GroupReadSupport();
+            }
+        }.build()) {
+            Group group;
+            while ((group = reader.read()) != null) { // read() returns null once the file is exhausted
+                bh.consume(group);
+            }
+        }
+    }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CpuWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CpuWriteBenchmark.java
new file mode
100644 index 0000000000..1c5988ad19 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/CpuWriteBenchmark.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * CPU-only 
write benchmarks measuring encoding and compression throughput through the + * example {@link Group} API, isolated from filesystem I/O. Row contents are pre-generated + * during setup so compression and writer settings dominate the timed section, while + * writes still flow through the full Parquet writer path. + * + *

<p>Writes are sent to a {@link BlackHoleOutputFile} that discards all bytes, so the
+ * results reflect pure CPU cost (encoding, compression, index generation) without any
+ * filesystem noise. For end-to-end benchmarks that include filesystem I/O, see
+ * {@link FileWriteBenchmark}.
+ *
+ *

<p>Parameterized across compression codec, writer version, dictionary encoding,
+ * row-group block size, and data page size. Block size controls how many rows accumulate
+ * before a row-group flush (triggering encoding, compression, and index generation).
+ * Page size controls the unit of encoding and compression within a column chunk. Use JMH
+ * {@code -p blockSize=...} and {@code -p pageSize=...} to select specific combinations
+ * and avoid the full cross-product when not needed.
+ *
+ *

<p>{@link Mode#SingleShotTime} is used because each invocation does enough work
+ * (a full write of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT
+ * amortization across invocations is unnecessary. Ten measurement iterations
+ * provide stable statistics for SS mode.
+ */
+@BenchmarkMode(Mode.SingleShotTime)
+@Fork(1)
+@Warmup(iterations = 5, batchSize = 1)
+@Measurement(iterations = 10, batchSize = 1)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+public class CpuWriteBenchmark {
+
+    @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW", "BROTLI", "LZO"})
+    public String codec; // resolved with CompressionCodecName.valueOf in writeFile()
+
+    @Param({"PARQUET_1_0", "PARQUET_2_0"})
+    public String writerVersion; // resolved with WriterVersion.valueOf in writeFile()
+
+    @Param({"true", "false"})
+    public String dictionary; // parsed with Boolean.parseBoolean in writeFile()
+
+    // Row-group block size in bytes: 128 MB (default), 256 MB (common production), 512 MB (stress)
+    @Param({"134217728", "268435456", "536870912"})
+    public int blockSize;
+
+    // Data page size in bytes: 1 MB (default), 4 MB (reduced overhead), 8 MB (max throughput)
+    @Param({"1048576", "4194304", "8388608"})
+    public int pageSize;
+
+    private Group[] rows; // built once in setup(); writeFile() only reads it
+
+    @Setup(Level.Trial)
+    public void setup() {
+        rows = TestDataFactory.generateRows(
+                TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED);
+    }
+
+    @Benchmark
+    public void writeFile() throws IOException {
+        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE)
+                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
+                .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
+                .withCompressionCodec(CompressionCodecName.valueOf(codec))
+                .withWriterVersion(WriterVersion.valueOf(writerVersion))
+                .withDictionaryEncoding(Boolean.parseBoolean(dictionary))
+                .withRowGroupSize(blockSize)
+                .withPageSize(pageSize)
+                .build()) { // closing the writer is part of the timed section
+            for (Group row : rows) {
+                writer.write(row);
+            }
+        }
+    }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java
b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java index b3b0df0ace..56aa0c7ddb 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/DataGenerator.java @@ -35,12 +35,14 @@ import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_BS512M_PS4M; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_BS512M_PS8M; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_GZIP; +import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_LZO; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_SNAPPY; import static org.apache.parquet.benchmarks.BenchmarkFiles.targetDir; import static org.apache.parquet.benchmarks.BenchmarkUtils.deleteIfExists; import static org.apache.parquet.benchmarks.BenchmarkUtils.exists; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.GZIP; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZO; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; @@ -111,8 +113,15 @@ public void generateAll() { ONE_MILLION); // generate data for different codecs - // generateData(parquetFile_1M_LZO, configuration, PARQUET_2_0, BLOCK_SIZE_DEFAULT, PAGE_SIZE_DEFAULT, - // FIXED_LEN_BYTEARRAY_SIZE, LZO, ONE_MILLION); + generateData( + file_1M_LZO, + configuration, + PARQUET_2_0, + BLOCK_SIZE_DEFAULT, + PAGE_SIZE_DEFAULT, + FIXED_LEN_BYTEARRAY_SIZE, + LZO, + ONE_MILLION); generateData( file_1M_SNAPPY, configuration, diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java 
b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java new file mode 100644 index 0000000000..4eb54cf954 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileReadBenchmark.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import java.io.File; +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.LocalInputFile; +import org.apache.parquet.io.LocalOutputFile; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * File-level read benchmarks measuring end-to-end Parquet read throughput through the + * example {@link Group} API. A temporary file is generated once during setup from + * pre-generated rows using {@link LocalOutputFile}, then read repeatedly during the + * benchmark. + * + *

<p>Parameterized across compression codec and writer version. The footer parse
+ * (via {@link LocalInputFile} open) is included in the timed section so the result
+ * reflects the full open-and-read cost a typical caller would observe.
+ *
+ *

<p>{@link Mode#SingleShotTime} is used because each invocation does enough work
+ * (a full read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT
+ * amortization across invocations is unnecessary. Ten measurement iterations
+ * provide stable statistics for SS mode.
+ */
+@BenchmarkMode(Mode.SingleShotTime)
+@Fork(1)
+@Warmup(iterations = 5, batchSize = 1)
+@Measurement(iterations = 10, batchSize = 1)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+public class FileReadBenchmark {
+
+    @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW", "BROTLI", "LZO"})
+    public String codec; // resolved with CompressionCodecName.valueOf in setup()
+
+    @Param({"PARQUET_1_0", "PARQUET_2_0"})
+    public String writerVersion; // resolved with WriterVersion.valueOf in setup()
+
+    private File tempFile; // Parquet file written in setup() and read by readFile
+
+    @Setup(Level.Trial)
+    public void setup() throws IOException {
+        tempFile = File.createTempFile("parquet-read-bench-", ".parquet");
+        tempFile.deleteOnExit(); // fallback cleanup in case tearDown() does not run
+        tempFile.delete(); // remove so the writer can create it
+
+        Group[] rows = TestDataFactory.generateRows(
+                TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED);
+        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath()))
+                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
+                .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
+                .withCompressionCodec(CompressionCodecName.valueOf(codec))
+                .withWriterVersion(WriterVersion.valueOf(writerVersion))
+                .withDictionaryEncoding(true)
+                .build()) {
+            for (Group row : rows) {
+                writer.write(row);
+            }
+        }
+    }
+
+    @TearDown(Level.Trial)
+    public void tearDown() {
+        if (tempFile != null && tempFile.exists()) {
+            tempFile.delete(); // best effort; the result is not checked
+        }
+    }
+
+    @Benchmark
+    public void readFile(Blackhole bh) throws IOException {
+        InputFile inputFile = new LocalInputFile(tempFile.toPath());
+        try (ParquetReader<Group> reader = new ParquetReader.Builder<Group>(inputFile) { // typed: read() must yield Group
+            @Override
+            protected ReadSupport<Group> getReadSupport() {
+                return new GroupReadSupport();
+            }
+        }.build()) {
+            Group group;
+            while ((group = reader.read()) != null) { // read() returns null once the file is exhausted
+                bh.consume(group);
+            }
+        }
+    }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java
new file mode 100644
index 0000000000..c160025c98
--- /dev/null
+++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/FileWriteBenchmark.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.parquet.benchmarks; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/** + * File-level write benchmarks measuring end-to-end Parquet write throughput through the + * example {@link Group} API. Row contents are pre-generated during setup so compression + * and writer settings dominate the timed section, while writes still flow through the + * full Parquet writer path. + * + *
<p>
Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU and encoding cost + * from filesystem I/O. Parameterized across compression codec, writer version, and + * dictionary encoding. + * + *
<p>
{@link Mode#SingleShotTime} is used because each invocation does enough work + * (a full write of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT + * amortization across invocations is unnecessary. Ten measurement iterations + * provide stable statistics for SS mode. + */ +@BenchmarkMode(Mode.SingleShotTime) +@Fork(1) +@Warmup(iterations = 5, batchSize = 1) +@Measurement(iterations = 10, batchSize = 1) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +public class FileWriteBenchmark { + + @Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP", "LZ4_RAW", "BROTLI", "LZO"}) + public String codec; + + @Param({"PARQUET_1_0", "PARQUET_2_0"}) + public String writerVersion; + + @Param({"true", "false"}) + public String dictionary; + + private Group[] rows; + + @Setup(Level.Trial) + public void setup() { + rows = TestDataFactory.generateRows( + TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED); + } + + @Benchmark + public void writeFile() throws IOException { + try (ParquetWriter writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE) + .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) + .withType(TestDataFactory.FILE_BENCHMARK_SCHEMA) + .withCompressionCodec(CompressionCodecName.valueOf(codec)) + .withWriterVersion(WriterVersion.valueOf(writerVersion)) + .withDictionaryEncoding(Boolean.parseBoolean(dictionary)) + .build()) { + for (Group row : rows) { + writer.write(row); + } + } + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/InMemoryInputFile.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/InMemoryInputFile.java new file mode 100644 index 0000000000..06b6c3ede4 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/InMemoryInputFile.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.SeekableInputStream; + +/** + * An {@link InputFile} backed by an in-memory byte array. Useful for read benchmarks + * that need to isolate decoding and decompression CPU cost from filesystem I/O. + * Pair with {@link InMemoryOutputFile} to produce the byte array during setup. + */ +public final class InMemoryInputFile implements InputFile { + + private final byte[] data; + + public InMemoryInputFile(byte[] data) { + this.data = data; + } + + @Override + public long getLength() { + return data.length; + } + + @Override + public SeekableInputStream newStream() { + return new SeekableInputStream() { + private int pos = 0; + + @Override + public int read() { + return pos < data.length ? 
(data[pos++] & 0xFF) : -1; + } + + @Override + public int read(byte[] b, int off, int len) { + int remaining = data.length - pos; + if (remaining <= 0) return -1; + int n = Math.min(len, remaining); + System.arraycopy(data, pos, b, off, n); + pos += n; + return n; + } + + @Override + public long getPos() { + return pos; + } + + @Override + public void seek(long newPos) { + pos = (int) newPos; + } + + @Override + public void readFully(byte[] bytes) throws IOException { + readFully(bytes, 0, bytes.length); + } + + @Override + public void readFully(byte[] bytes, int start, int len) throws IOException { + if (pos + len > data.length) { + throw new EOFException("Unexpected end of data"); + } + System.arraycopy(data, pos, bytes, start, len); + pos += len; + } + + @Override + public int read(ByteBuffer buf) { + int len = buf.remaining(); + int remaining = data.length - pos; + if (remaining <= 0) return -1; + int n = Math.min(len, remaining); + buf.put(data, pos, n); + pos += n; + return n; + } + + @Override + public void readFully(ByteBuffer buf) throws IOException { + int len = buf.remaining(); + if (pos + len > data.length) { + throw new EOFException("Unexpected end of data"); + } + buf.put(data, pos, len); + pos += len; + } + + @Override + public void close() {} + }; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/InMemoryOutputFile.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/InMemoryOutputFile.java new file mode 100644 index 0000000000..bb5237ee53 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/InMemoryOutputFile.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.benchmarks; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import org.apache.parquet.io.OutputFile; +import org.apache.parquet.io.PositionOutputStream; + +/** + * An {@link OutputFile} that captures all written data into an in-memory byte array. + * Useful for producing Parquet files that can later be read via {@link InMemoryInputFile} + * without touching the filesystem. + */ +public final class InMemoryOutputFile implements OutputFile { + + private final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + /** + * Returns the captured data as a byte array. Should only be called after the + * writer has been closed. + */ + public byte[] toByteArray() { + return baos.toByteArray(); + } + + /** Returns the current size of the captured data in bytes. 
*/ + public int size() { + return baos.size(); + } + + @Override + public PositionOutputStream create(long blockSizeHint) { + return newStream(); + } + + @Override + public PositionOutputStream createOrOverwrite(long blockSizeHint) { + return newStream(); + } + + private PositionOutputStream newStream() { + return new PositionOutputStream() { + private long pos = 0; + + @Override + public void write(int b) throws IOException { + baos.write(b); + pos++; + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + baos.write(b, off, len); + pos += len; + } + + @Override + public long getPos() { + return pos; + } + }; + } + + @Override + public boolean supportsBlockSize() { + return false; + } + + @Override + public long defaultBlockSize() { + return 0; + } + + @Override + public String getPath() { + return "memory"; + } +} diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java index 2d6e3a52e3..b67e815ca7 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/ReadBenchmarks.java @@ -26,6 +26,7 @@ import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_BS512M_PS4M; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_BS512M_PS8M; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_GZIP; +import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_LZO; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_SNAPPY; import java.io.IOException; @@ -102,13 +103,11 @@ public void read1MRowsBS512MPS8MUncompressed(Blackhole blackhole) throws IOExcep read(file_1M_BS512M_PS8M, ONE_MILLION, blackhole); } - // TODO how to handle lzo jar? 
- // @Benchmark - // public void read1MRowsDefaultBlockAndPageSizeLZO(Blackhole blackhole) - // throws IOException - // { - // read(parquetFile_1M_LZO, ONE_MILLION, blackhole); - // } + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) + public void read1MRowsDefaultBlockAndPageSizeLZO(Blackhole blackhole) throws IOException { + read(file_1M_LZO, ONE_MILLION, blackhole); + } @Benchmark @BenchmarkMode(Mode.SingleShotTime) diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java new file mode 100644 index 0000000000..01e23af358 --- /dev/null +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/TestDataFactory.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.benchmarks; + +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + +import java.util.Random; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; +
+/**
+ * Factory for the schema and the deterministic row data shared by the file-level benchmarks.
+ */
+public final class TestDataFactory {
+
+  /** Default number of rows for file-level benchmarks. */
+  public static final int DEFAULT_ROW_COUNT = 100_000;
+
+  /** Default RNG seed used across benchmarks for deterministic data. */
+  public static final long DEFAULT_SEED = 42L;
+
+  /** A standard multi-type schema used by file-level benchmarks. */
+  public static final MessageType FILE_BENCHMARK_SCHEMA = Types.buildMessage()
+      .required(INT32)
+      .named("int32_field")
+      .required(INT64)
+      .named("int64_field")
+      .required(FLOAT)
+      .named("float_field")
+      .required(DOUBLE)
+      .named("double_field")
+      .required(BOOLEAN)
+      .named("boolean_field")
+      .required(BINARY)
+      .named("binary_field")
+      .named("benchmark_record");
+
+  /** Static utility; not instantiable. */
+  private TestDataFactory() {}
+
+  /**
+   * Creates a {@link SimpleGroupFactory} bound to {@link #FILE_BENCHMARK_SCHEMA}.
+   */
+  public static SimpleGroupFactory newGroupFactory() {
+    return new SimpleGroupFactory(FILE_BENCHMARK_SCHEMA);
+  }
+
+  /**
+   * Builds one benchmark row. The int32, int64, boolean and binary fields are derived
+   * from {@code index}; the float and double fields are drawn from {@code random}
+   * (float first, then double).
+   *
+   * @param factory the group factory
+   * @param index the row index (used for deterministic data)
+   * @param random the random source
+   * @return a populated Group
+   */
+  public static Group generateRow(SimpleGroupFactory factory, int index, Random random) {
+    long scaledIndex = (long) index * 100;
+    // Draw order matters for reproducibility: float before double.
+    float floatValue = random.nextFloat();
+    double doubleValue = random.nextDouble();
+    boolean evenIndex = index % 2 == 0;
+    String label = "value_" + (index % 1000);
+    return factory.newGroup()
+        .append("int32_field", index)
+        .append("int64_field", scaledIndex)
+        .append("float_field", floatValue)
+        .append("double_field", doubleValue)
+        .append("boolean_field", evenIndex)
+        .append("binary_field", label);
+  }
+
+  /**
+   * Generates {@code rowCount} rows from a {@link Random} seeded with {@code seed}, so the
+   * same arguments always yield the same data.
+   */
+  public static Group[] generateRows(SimpleGroupFactory factory, int rowCount, long seed) {
+    Random rng = new Random(seed);
+    Group[] generated = new Group[rowCount];
+    int rowIndex = 0;
+    while (rowIndex < rowCount) {
+      generated[rowIndex] = generateRow(factory, rowIndex, rng);
+      rowIndex++;
+    }
+    return generated;
+  }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java index 41f961de44..e42e9277b6 100644 --- a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java +++ b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/WriteBenchmarks.java @@ -33,9 +33,11 @@ import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_BS512M_PS4M; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_BS512M_PS8M; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_GZIP; +import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_LZO; import static org.apache.parquet.benchmarks.BenchmarkFiles.file_1M_SNAPPY; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.GZIP; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZO; import static
org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED; import static org.openjdk.jmh.annotations.Scope.Thread; @@ -128,20 +130,19 @@ public void write1MRowsBS512MPS8MUncompressed() throws IOException { ONE_MILLION); } - // TODO how to handle lzo jar? - // @Benchmark - // public void write1MRowsDefaultBlockAndPageSizeLZO() - // throws IOException - // { - // dataGenerator.generateData(parquetFile_1M_LZO, - // configuration, - // WriterVersion.PARQUET_2_0, - // BLOCK_SIZE_DEFAULT, - // PAGE_SIZE_DEFAULT, - // FIXED_LEN_BYTEARRAY_SIZE, - // LZO, - // ONE_MILLION); - // } + @Benchmark + @BenchmarkMode(Mode.SingleShotTime) + public void write1MRowsDefaultBlockAndPageSizeLZO() throws IOException { + dataGenerator.generateData( + file_1M_LZO, + configuration, + PARQUET_2_0, + BLOCK_SIZE_DEFAULT, + PAGE_SIZE_DEFAULT, + FIXED_LEN_BYTEARRAY_SIZE, + LZO, + ONE_MILLION); + } @Benchmark @BenchmarkMode(Mode.SingleShotTime) diff --git a/parquet-common/src/main/java/org/apache/parquet/bytes/BytesInput.java b/parquet-common/src/main/java/org/apache/parquet/bytes/BytesInput.java index 0e66140744..722b7e892f 100644 --- a/parquet-common/src/main/java/org/apache/parquet/bytes/BytesInput.java +++ b/parquet-common/src/main/java/org/apache/parquet/bytes/BytesInput.java @@ -643,6 +643,20 @@ public ByteBuffer toByteBuffer() throws IOException { return java.nio.ByteBuffer.wrap(in, offset, length); } + /** + * Zero-copy override: returns the backing array directly when fully used, + * skipping the base-class BAOS allocation + copy on every decompressor call. + * Returning the mutable array is safe — the base class already exposes a + * mutable {@code BAOS.getBuf()}. 
+ */ + @Override + public byte[] toByteArray() { + if (offset == 0 && length == in.length) { + return in; + } + return Arrays.copyOfRange(in, offset, offset + length); + } + @Override public long size() { return length; diff --git a/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java b/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java index 38d4b79219..c264742f68 100644 --- a/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java +++ b/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertSame; import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; @@ -403,4 +404,41 @@ private void validateToByteBufferIsInternal(Supplier factory) { verify(allocatorMock, never()).allocate(anyInt()); verify(callbackMock, never()).accept(any()); } + + // ---- Tests for ByteArrayBytesInput.toByteArray() zero-copy optimization ---- + + @Test + public void testByteArrayBytesInputToByteArrayZeroCopyFullArray() throws IOException { + byte[] data = new byte[100]; + RANDOM.nextBytes(data); + BytesInput bi = BytesInput.from(data, 0, data.length); + + // When offset=0 and length=array.length, toByteArray() should return the same array instance + byte[] result = bi.toByteArray(); + assertSame("Expected zero-copy (same array instance) for full-array BytesInput", data, result); + } + + @Test + public void testByteArrayBytesInputToByteArrayCopiesForSubArray() throws IOException { + byte[] data = new byte[100]; + RANDOM.nextBytes(data); + BytesInput bi = BytesInput.from(data, 10, 50); + + byte[] result = bi.toByteArray(); + assertEquals("Sub-array toByteArray() should have correct length", 50, result.length); + byte[] expected = new byte[50]; + System.arraycopy(data, 10, expected, 0, 
50); + assertArrayEquals("Sub-array toByteArray() content mismatch", expected, result); + } + + @Test + public void testByteArrayBytesInputToByteArrayFromSimpleFactory() throws IOException { + byte[] data = new byte[200]; + RANDOM.nextBytes(data); + // BytesInput.from(byte[]) delegates to from(byte[], 0, length) + BytesInput bi = BytesInput.from(data); + + byte[] result = bi.toByteArray(); + assertSame("Expected zero-copy for BytesInput.from(byte[])", data, result); + } } diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index 0cc17c819d..531afc2231 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -161,6 +161,13 @@ zstd-jni ${zstd-jni.version} + + com.aayushatharva.brotli4j + brotli4j + ${brotli4j.version} + runtime + true + com.google.guava @@ -233,33 +240,4 @@ - - - - non-aarch64 - - - !aarch64 - - - - - jitpack.io - https://jitpack.io - Jitpack.io repository - - - - - - com.github.rdblue - brotli-codec - ${brotli-codec.version} - runtime - true - - - - - diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/CodecFactory.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/CodecFactory.java index 98b49835a6..951700fa94 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/CodecFactory.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/CodecFactory.java @@ -18,14 +18,25 @@ */ package org.apache.parquet.hadoop; +import com.github.luben.zstd.Zstd; +import com.github.luben.zstd.ZstdCompressCtx; +import com.github.luben.zstd.ZstdDecompressCtx; +import io.airlift.compress.lz4.Lz4Compressor; +import io.airlift.compress.lz4.Lz4Decompressor; +import io.airlift.compress.lzo.LzoHadoopStreams; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.lang.reflect.Method; import java.nio.ByteBuffer; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Objects; +import 
java.util.zip.Deflater; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.CodecPool; import org.apache.hadoop.io.compress.CompressionCodec; @@ -35,17 +46,22 @@ import org.apache.hadoop.util.ReflectionUtils; import org.apache.parquet.Preconditions; import org.apache.parquet.bytes.ByteBufferAllocator; +import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.compression.CompressionCodecFactory; import org.apache.parquet.conf.HadoopParquetConfiguration; import org.apache.parquet.conf.ParquetConfiguration; -import org.apache.parquet.hadoop.codec.Lz4RawCodec; import org.apache.parquet.hadoop.codec.ZstandardCodec; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.util.ConfigurationUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xerial.snappy.Snappy; public class CodecFactory implements CompressionCodecFactory { + private static final Logger LOG = LoggerFactory.getLogger(CodecFactory.class); + protected static final Map CODEC_BY_NAME = Collections.synchronizedMap(new HashMap()); @@ -59,6 +75,91 @@ public class CodecFactory implements CompressionCodecFactory { @Deprecated protected final Configuration configuration; + /** + * Reflection-based helper for brotli4j (runtime-only dependency). + * Initialized eagerly at class-load time; all fields are null if + * brotli4j is not on the classpath. + * + *
<p>
Uses {@code Encoder.compress(byte[], Encoder.Parameters)} for compression and + * {@code Decoder.decompress(byte[], int, int)} for decompression — the latter returns + * {@code byte[]} directly and avoids loading {@code DirectDecompress} which references + * {@code io.netty.buffer.ByteBuf} (optional Netty dependency not on our classpath). + */ + static final class Brotli4j { + static final boolean AVAILABLE; + // Encoder.compress(byte[], Object/*Encoder.Parameters*/) -> byte[] + private static final Method COMPRESS; + // Decoder.decompress(byte[], int/*offset*/, int/*length*/) -> byte[] + private static final Method DECOMPRESS; + // Encoder.Parameters class + private static final Class PARAMS_CLASS; + // Encoder.Parameters.setQuality(int) -> Encoder.Parameters + private static final Method SET_QUALITY; + + static { + boolean loaded = false; + Method compress = null, decompress = null, setQuality = null; + Class paramsClass = null; + try { + // Load native library + Class loader = Class.forName("com.aayushatharva.brotli4j.Brotli4jLoader"); + loader.getMethod("ensureAvailability").invoke(null); + + // Encoder.compress(byte[], Encoder.Parameters) -> byte[] + paramsClass = Class.forName("com.aayushatharva.brotli4j.encoder.Encoder$Parameters"); + Class encoder = Class.forName("com.aayushatharva.brotli4j.encoder.Encoder"); + compress = encoder.getMethod("compress", byte[].class, paramsClass); + + // Decoder.decompress(byte[], int, int) -> byte[] + // This avoids loading DirectDecompress which references io.netty.buffer.ByteBuf + Class decoder = Class.forName("com.aayushatharva.brotli4j.decoder.Decoder"); + decompress = decoder.getMethod("decompress", byte[].class, int.class, int.class); + + // Encoder.Parameters.setQuality(int) -> Encoder.Parameters + setQuality = paramsClass.getMethod("setQuality", int.class); + + loaded = true; + } catch (Throwable t) { + // brotli4j not available — BROTLI will fall through to Hadoop codec path + LOG.info("brotli4j not available, 
BROTLI codec will use Hadoop codec path: {}", t.toString()); + } + AVAILABLE = loaded; + COMPRESS = compress; + DECOMPRESS = decompress; + PARAMS_CLASS = paramsClass; + SET_QUALITY = setQuality; + } + + /** Create an {@code Encoder.Parameters} instance with the given quality. */ + static Object newParams(int quality) { + try { + Object params = PARAMS_CLASS.getConstructor().newInstance(); + SET_QUALITY.invoke(params, quality); + return params; + } catch (ReflectiveOperationException e) { + throw new RuntimeException("Failed to create Brotli encoder parameters", e); + } + } + + /** Compress using {@code Encoder.compress(byte[], Encoder.Parameters)}. */ + static byte[] compress(byte[] input, Object params) throws IOException { + try { + return (byte[]) COMPRESS.invoke(null, input, params); + } catch (ReflectiveOperationException e) { + throw new IOException("Brotli compression failed", e); + } + } + + /** Decompress using {@code Decoder.decompress(byte[], offset, length)}. */ + static byte[] decompress(byte[] input) throws IOException { + try { + return (byte[]) DECOMPRESS.invoke(null, input, 0, input.length); + } catch (ReflectiveOperationException e) { + throw new IOException("Brotli decompression failed", e); + } + } + } + static final BytesDecompressor NO_OP_DECOMPRESSOR = new BytesDecompressor() { @Override public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) { @@ -170,18 +271,7 @@ public BytesInput decompress(BytesInput bytes, int decompressedSize) throws IOEx decompressor.reset(); } InputStream is = codec.createInputStream(bytes.toInputStream(), decompressor); - - // Eagerly materialize the decompressed stream for codecs that require all input in a single buffer. - // ZSTD: releases off-heap resources early to avoid fragmentation (see parquet-format#398). 
- // LZ4_RAW: requires one-shot decompression; the lazy StreamBytesInput.writeInto() path reads via - // Channels.newChannel() in ~8KB chunks, causing the decompressor to be called with an undersized - // output buffer (see #3478). - if (codec instanceof ZstandardCodec || codec instanceof Lz4RawCodec) { - decompressed = BytesInput.copy(BytesInput.from(is, decompressedSize)); - is.close(); - } else { - decompressed = BytesInput.from(is, decompressedSize); - } + decompressed = BytesInput.from(is, decompressedSize); return decompressed; } @@ -271,13 +361,61 @@ public BytesDecompressor getDecompressor(CompressionCodecName codecName) { } protected BytesCompressor createCompressor(CompressionCodecName codecName) { - CompressionCodec codec = getCodec(codecName); - return codec == null ? NO_OP_COMPRESSOR : new HeapBytesCompressor(codecName, codec); + switch (codecName) { + case UNCOMPRESSED: + return NO_OP_COMPRESSOR; + case SNAPPY: + return new SnappyBytesCompressor(); + case ZSTD: + return new ZstdBytesCompressor( + conf.getInt( + ZstandardCodec.PARQUET_COMPRESS_ZSTD_LEVEL, + ZstandardCodec.DEFAULT_PARQUET_COMPRESS_ZSTD_LEVEL), + conf.getInt( + ZstandardCodec.PARQUET_COMPRESS_ZSTD_WORKERS, + ZstandardCodec.DEFAULTPARQUET_COMPRESS_ZSTD_WORKERS)); + case LZ4_RAW: + return new Lz4RawBytesCompressor(); + case GZIP: + int gzipLevel = conf.getInt("zlib.compress.level", Deflater.DEFAULT_COMPRESSION); + return new GzipBytesCompressor(gzipLevel, pageSize); + case LZO: + return new LzoBytesCompressor(pageSize); + case BROTLI: + if (Brotli4j.AVAILABLE) { + int brotliQuality = conf.getInt("compression.brotli.quality", 1); + return new BrotliBytesCompressor(brotliQuality); + } + // fall through to Hadoop codec path + default: + CompressionCodec codec = getCodec(codecName); + return codec == null ? 
NO_OP_COMPRESSOR : new HeapBytesCompressor(codecName, codec); + } } protected BytesDecompressor createDecompressor(CompressionCodecName codecName) { - CompressionCodec codec = getCodec(codecName); - return codec == null ? NO_OP_DECOMPRESSOR : new HeapBytesDecompressor(codec); + switch (codecName) { + case UNCOMPRESSED: + return NO_OP_DECOMPRESSOR; + case SNAPPY: + return new SnappyBytesDecompressor(); + case ZSTD: + return new ZstdBytesDecompressor(); + case LZ4_RAW: + return new Lz4RawBytesDecompressor(); + case GZIP: + return new GzipBytesDecompressor(); + case LZO: + return new LzoBytesDecompressor(); + case BROTLI: + if (Brotli4j.AVAILABLE) { + return new BrotliBytesDecompressor(); + } + // fall through to Hadoop codec path + default: + CompressionCodec codec = getCodec(codecName); + return codec == null ? NO_OP_DECOMPRESSOR : new HeapBytesDecompressor(codec); + } } /** @@ -315,15 +453,9 @@ protected CompressionCodec getCodec(CompressionCodecName codecName) { private String cacheKey(CompressionCodecName codecName) { String level = null; switch (codecName) { - case GZIP: - level = conf.get("zlib.compress.level"); - break; case BROTLI: level = conf.get("compression.brotli.quality"); break; - case ZSTD: - level = conf.get("parquet.compression.codec.zstd.level"); - break; default: // compression level is not supported; ignore it } @@ -367,4 +499,482 @@ public abstract void decompress(ByteBuffer input, int compressedSize, ByteBuffer public abstract void release(); } + + // ---- Optimized Snappy compressor/decompressor using direct JNI calls ---- + + /** + * Compresses using Snappy's byte-array JNI API directly, bypassing the Hadoop + * stream abstraction. This avoids intermediate direct ByteBuffer copies and + * reduces the compression to a single native call per page. 
+ */ + static class SnappyBytesCompressor extends BytesCompressor { + private byte[] outputBuffer; + + @Override + public BytesInput compress(BytesInput bytes) throws IOException { + byte[] input = bytes.toByteArray(); + int maxLen = Snappy.maxCompressedLength(input.length); + if (outputBuffer == null || outputBuffer.length < maxLen) { + outputBuffer = new byte[maxLen]; + } + int compressed = Snappy.compress(input, 0, input.length, outputBuffer, 0); + return BytesInput.from(outputBuffer, 0, compressed); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.SNAPPY; + } + + @Override + public void release() { + outputBuffer = null; + } + } + + /** + * Decompresses using Snappy's JNI API directly. The {@link ByteBuffer} overload uses + * {@link Snappy#uncompress(ByteBuffer, ByteBuffer)} which, for direct buffers, passes + * native memory addresses straight to the snappy library with no JNI array pinning or + * intermediate copies. + */ + static class SnappyBytesDecompressor extends BytesDecompressor { + @Override + public BytesInput decompress(BytesInput bytes, int decompressedSize) throws IOException { + byte[] input = bytes.toByteArray(); + byte[] output = new byte[decompressedSize]; + Snappy.uncompress(input, 0, input.length, output, 0); + return BytesInput.from(output); + } + + @Override + public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) + throws IOException { + int origInputLimit = input.limit(); + input.limit(input.position() + compressedSize); + int origOutputLimit = output.limit(); + output.limit(output.position() + decompressedSize); + // Use slices so native API works on independent buffers; advance positions manually. 
+ Snappy.uncompress(input.slice(), output.slice()); + input.position(input.limit()); + input.limit(origInputLimit); + output.position(output.limit()); + output.limit(origOutputLimit); + } + + @Override + public void release() {} + } + + // ---- Optimized ZSTD compressor/decompressor using zstd-jni context API directly ---- + + /** + * Compresses using a reusable {@link ZstdCompressCtx}, bypassing the Hadoop codec + * framework ({@code ZstandardCodec}, {@code CodecPool}, {@code CompressionOutputStream} + * wrapper). The context is created once at construction and reused across calls, + * avoiding per-call JNI context creation, internal buffer allocation, and Java stream + * overhead. This is 1.5-3.4x faster than the streaming approach for typical Parquet + * page sizes (64KB-1MB). Multi-threaded compression via {@code workers > 0} is + * supported through {@link ZstdCompressCtx#setWorkers(int)}. + */ + static class ZstdBytesCompressor extends BytesCompressor { + private final ZstdCompressCtx context; + private byte[] outputBuffer; + + ZstdBytesCompressor(int level, int workers) { + this.context = new ZstdCompressCtx(); + this.context.setLevel(level); + if (workers > 0) { + this.context.setWorkers(workers); + } + } + + @Override + public BytesInput compress(BytesInput bytes) throws IOException { + byte[] input = bytes.toByteArray(); + int maxLen = (int) Zstd.compressBound(input.length); + if (outputBuffer == null || outputBuffer.length < maxLen) { + outputBuffer = new byte[maxLen]; + } + int compressed = context.compressByteArray(outputBuffer, 0, outputBuffer.length, input, 0, input.length); + return BytesInput.from(outputBuffer, 0, compressed); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.ZSTD; + } + + @Override + public void release() { + context.close(); + outputBuffer = null; + } + } + + /** + * Decompresses using a reusable {@link ZstdDecompressCtx}, bypassing the Hadoop + * codec framework. 
The context is created once at construction and reused across + * calls, avoiding per-call JNI context creation, internal buffer allocation, and + * Java stream overhead. The {@link ByteBuffer} overload uses + * {@link Zstd#decompress(ByteBuffer, ByteBuffer)} to pass buffers directly to the + * native library without intermediate copies. + */ + static class ZstdBytesDecompressor extends BytesDecompressor { + private final ZstdDecompressCtx context; + + ZstdBytesDecompressor() { + this.context = new ZstdDecompressCtx(); + } + + @Override + public BytesInput decompress(BytesInput bytes, int decompressedSize) throws IOException { + byte[] input = bytes.toByteArray(); + byte[] output = new byte[decompressedSize]; + int decompressed = context.decompressByteArray(output, 0, decompressedSize, input, 0, input.length); + if (decompressed != decompressedSize) { + throw new IOException("Unexpected decompressed size: " + decompressed + " != " + decompressedSize); + } + return BytesInput.from(output); + } + + @Override + public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) + throws IOException { + int origInputLimit = input.limit(); + input.limit(input.position() + compressedSize); + int origOutputLimit = output.limit(); + output.limit(output.position() + decompressedSize); + // Zstd.decompress uses (dst, src) parameter order, matching the native zstd convention. + // Use slices so native API works on independent buffers; advance positions manually. 
+ Zstd.decompress(output.slice(), input.slice()); + input.position(input.limit()); + input.limit(origInputLimit); + output.position(output.limit()); + output.limit(origOutputLimit); + } + + @Override + public void release() { + context.close(); + } + } + + // ---- Optimized LZ4_RAW compressor/decompressor using airlift LZ4 directly ---- + + /** + * Compresses using airlift's LZ4 compressor directly with heap ByteBuffers, + * bypassing the Hadoop stream abstraction and NonBlockedCompressor's direct + * buffer copies. + */ + static class Lz4RawBytesCompressor extends BytesCompressor { + private final Lz4Compressor compressor = new Lz4Compressor(); + private byte[] outputBuffer; + + @Override + public BytesInput compress(BytesInput bytes) throws IOException { + byte[] input = bytes.toByteArray(); + int maxLen = compressor.maxCompressedLength(input.length); + if (outputBuffer == null || outputBuffer.length < maxLen) { + outputBuffer = new byte[maxLen]; + } + ByteBuffer inputBuf = ByteBuffer.wrap(input); + ByteBuffer outputBuf = ByteBuffer.wrap(outputBuffer); + compressor.compress(inputBuf, outputBuf); + int compressedSize = outputBuf.position(); + return BytesInput.from(outputBuffer, 0, compressedSize); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.LZ4_RAW; + } + + @Override + public void release() { + outputBuffer = null; + } + } + + /** + * Decompresses using airlift's LZ4 decompressor directly with ByteBuffers. + * The {@link ByteBuffer} overload passes buffers straight through to the native + * decompressor, avoiding intermediate byte-array copies. 
+ */ + static class Lz4RawBytesDecompressor extends BytesDecompressor { + private final Lz4Decompressor decompressor = new Lz4Decompressor(); + + @Override + public BytesInput decompress(BytesInput bytes, int decompressedSize) throws IOException { + byte[] input = bytes.toByteArray(); + byte[] output = new byte[decompressedSize]; + ByteBuffer inputBuf = ByteBuffer.wrap(input); + ByteBuffer outputBuf = ByteBuffer.wrap(output); + decompressor.decompress(inputBuf, outputBuf); + return BytesInput.from(output); + } + + @Override + public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) + throws IOException { + int origInputLimit = input.limit(); + input.limit(input.position() + compressedSize); + int origOutputLimit = output.limit(); + output.limit(output.position() + decompressedSize); + // Use slices so native API works on independent buffers; advance positions manually. + decompressor.decompress(input.slice(), output.slice()); + input.position(input.limit()); + input.limit(origInputLimit); + output.position(output.limit()); + output.limit(origOutputLimit); + } + + @Override + public void release() {} + } + + // ---- Optimized GZIP compressor/decompressor using JDK GZIPOutputStream/GZIPInputStream directly ---- + + /** + * Compresses using {@link GZIPOutputStream} directly, bypassing Hadoop's + * GzipCodec and the associated codec pool / stream wrapper overhead. + * + *

Note: this implementation always uses Java's built-in zlib via + * {@link GZIPOutputStream}. It does not use Hadoop native libraries, + * so hardware-accelerated compression via Intel ISA-L will not be used even if + * the native libraries are installed. The overhead reduction from bypassing the + * Hadoop codec framework typically outweighs the ISA-L advantage for the page + * sizes used by Parquet. + */ + static class GzipBytesCompressor extends BytesCompressor { + private final int level; + private final ByteArrayOutputStream baos; + + GzipBytesCompressor(int level, int pageSize) { + this.level = level; + this.baos = new ByteArrayOutputStream(pageSize); + } + + @Override + public BytesInput compress(BytesInput bytes) throws IOException { + baos.reset(); + try (GZIPOutputStream gos = new GZIPOutputStream(baos) { + { + def.setLevel(level); + } + }) { + bytes.writeAllTo(gos); + } + return BytesInput.from(baos); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.GZIP; + } + + @Override + public void release() {} + } + + /** + * Decompresses using {@link GZIPInputStream} directly, bypassing Hadoop's + * GzipCodec and the associated codec pool / stream wrapper overhead. + * CRC32 and size verification is handled by the JDK implementation. + * + *

Note: this implementation always uses Java's built-in zlib via + * {@link GZIPInputStream}. It does not use Hadoop native libraries, + * so hardware-accelerated decompression via Intel ISA-L will not be used even if + * the native libraries are installed. + */ + static class GzipBytesDecompressor extends BytesDecompressor { + @Override + public BytesInput decompress(BytesInput bytes, int decompressedSize) throws IOException { + try (GZIPInputStream gis = new GZIPInputStream(bytes.toInputStream())) { + byte[] output = new byte[decompressedSize]; + int offset = 0; + while (offset < decompressedSize) { + int read = gis.read(output, offset, decompressedSize - offset); + if (read < 0) { + throw new IOException( + "Unexpected end of GZIP stream at offset " + offset + " of " + decompressedSize); + } + offset += read; + } + return BytesInput.from(output); + } + } + + @Override + public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) + throws IOException { + // Wrap the input ByteBuffer slice in an InputStream to avoid allocating a temp byte array. + // GZIPInputStream is stream-based so we still need a temp output array. 
+ ByteBuffer inputSlice = input.slice(); + inputSlice.limit(compressedSize); + try (GZIPInputStream gis = new GZIPInputStream(ByteBufferInputStream.wrap(inputSlice))) { + byte[] outputBytes = new byte[decompressedSize]; + int offset = 0; + while (offset < decompressedSize) { + int read = gis.read(outputBytes, offset, decompressedSize - offset); + if (read < 0) { + throw new IOException( + "Unexpected end of GZIP stream at offset " + offset + " of " + decompressedSize); + } + offset += read; + } + output.put(outputBytes); + } + input.position(input.position() + compressedSize); + } + + @Override + public void release() {} + } + + // ---- Optimized LZO compressor/decompressor using aircompressor's Hadoop-framed LZO directly ---- + + /** + * Compresses using aircompressor's LZO Hadoop-framed streams directly, + * bypassing the GPL-licensed {@code com.hadoop.compression.lzo.LzoCodec} and + * the associated Hadoop codec pool / stream wrapper overhead. The framing + * format (big-endian length-prefixed blocks) is wire-compatible with Hadoop's + * LzoCodec, so files produced by this compressor are readable by any standard + * Parquet reader. + */ + static class LzoBytesCompressor extends BytesCompressor { + private static final LzoHadoopStreams LZO_STREAMS = new LzoHadoopStreams(); + private final ByteArrayOutputStream baos; + + LzoBytesCompressor(int pageSize) { + this.baos = new ByteArrayOutputStream(pageSize); + } + + @Override + public BytesInput compress(BytesInput bytes) throws IOException { + baos.reset(); + try (OutputStream los = LZO_STREAMS.createOutputStream(baos)) { + bytes.writeAllTo(los); + } + return BytesInput.from(baos); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.LZO; + } + + @Override + public void release() {} + } + + /** + * Decompresses using aircompressor's LZO Hadoop-framed streams directly, + * bypassing the GPL-licensed Hadoop LzoCodec. 
Reads the same big-endian + * length-prefixed block framing that Hadoop's LzoCodec produces. + */ + static class LzoBytesDecompressor extends BytesDecompressor { + private static final LzoHadoopStreams LZO_STREAMS = new LzoHadoopStreams(); + + @Override + public BytesInput decompress(BytesInput bytes, int decompressedSize) throws IOException { + try (InputStream lis = LZO_STREAMS.createInputStream(bytes.toInputStream())) { + byte[] output = new byte[decompressedSize]; + int offset = 0; + while (offset < decompressedSize) { + int read = lis.read(output, offset, decompressedSize - offset); + if (read < 0) { + throw new IOException( + "Unexpected end of LZO stream at offset " + offset + " of " + decompressedSize); + } + offset += read; + } + return BytesInput.from(output); + } + } + + @Override + public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) + throws IOException { + ByteBuffer inputSlice = input.slice(); + inputSlice.limit(compressedSize); + try (InputStream lis = LZO_STREAMS.createInputStream(ByteBufferInputStream.wrap(inputSlice))) { + byte[] outputBytes = new byte[decompressedSize]; + int offset = 0; + while (offset < decompressedSize) { + int read = lis.read(outputBytes, offset, decompressedSize - offset); + if (read < 0) { + throw new IOException( + "Unexpected end of LZO stream at offset " + offset + " of " + decompressedSize); + } + offset += read; + } + output.put(outputBytes); + } + input.position(input.position() + compressedSize); + } + + @Override + public void release() {} + } + + /** + * Brotli compressor using brotli4j ({@code com.aayushatharva.brotli4j}) via reflection. + * Single-call byte-array API — no streaming overhead. Default quality=1 + * matches the old jbrotli default and gives a good speed/ratio trade-off. 
+ */ + static class BrotliBytesCompressor extends BytesCompressor { + private final Object params; + + BrotliBytesCompressor(int quality) { + this.params = Brotli4j.newParams(quality); + } + + @Override + public BytesInput compress(BytesInput bytes) throws IOException { + byte[] input = bytes.toByteArray(); + byte[] compressed = Brotli4j.compress(input, params); + return BytesInput.from(compressed); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.BROTLI; + } + + @Override + public void release() {} + } + + /** + * Brotli decompressor using brotli4j ({@code com.aayushatharva.brotli4j}) via reflection. + * Single-call byte-array API. For the ByteBuffer overload the input slice + * is copied to a heap array, decompressed, and the result put into the + * output buffer — Brotli is slow enough that the copy overhead is negligible. + */ + static class BrotliBytesDecompressor extends BytesDecompressor { + + @Override + public BytesInput decompress(BytesInput bytes, int uncompressedSize) throws IOException { + byte[] compressed = bytes.toByteArray(); + byte[] decompressed = Brotli4j.decompress(compressed); + return BytesInput.from(decompressed); + } + + @Override + public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int decompressedSize) + throws IOException { + ByteBuffer inputSlice = input.slice(); + inputSlice.limit(compressedSize); + byte[] compressedBytes = new byte[compressedSize]; + inputSlice.get(compressedBytes); + + byte[] decompressed = Brotli4j.decompress(compressedBytes); + output.put(decompressed); + input.position(input.position() + compressedSize); + } + + @Override + public void release() {} + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java index b2b5233eeb..e6bc6891e8 100644 --- 
a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/DirectCodecFactory.java @@ -103,8 +103,14 @@ protected BytesCompressor createCompressor(final CompressionCodecName codecName) return new SnappyCompressor(); case ZSTD: return new ZstdCompressor(); - // todo: create class similar to the SnappyCompressor for zlib and exclude it as - // snappy is above since it also generates allocateDirect calls. + case LZ4_RAW: + return new Lz4RawCompressor(); + case BROTLI: + if (Brotli4j.AVAILABLE) { + return new BrotliDirectCompressor(); + } + return super.createCompressor(codecName); + case LZO: default: return super.createCompressor(codecName); } @@ -117,6 +123,17 @@ protected BytesDecompressor createDecompressor(final CompressionCodecName codecN return new SnappyDecompressor(); case ZSTD: return new ZstdDecompressor(); + case LZ4_RAW: + return new Lz4RawDecompressor(); + case BROTLI: + if (Brotli4j.AVAILABLE) { + return new BrotliDirectDecompressor(); + } + // fall through to super (which also checks Brotli4j, then Hadoop codec) + case GZIP: + case LZO: + case UNCOMPRESSED: + return super.createDecompressor(codecName); default: CompressionCodec codec = getCodec(codecName); if (codec == null) { @@ -405,6 +422,26 @@ void closeDecompressor() { } } + /** + * Direct-memory LZ4_RAW decompressor using airlift's LZ4 decompressor with + * direct ByteBuffers, avoiding reflection-based {@link FullDirectDecompressor}. 
+ */ + private class Lz4RawDecompressor extends BaseDecompressor { + private final io.airlift.compress.lz4.Lz4Decompressor decompressor = + new io.airlift.compress.lz4.Lz4Decompressor(); + + @Override + int decompress(ByteBuffer input, ByteBuffer output) { + decompressor.decompress(input, output); + return output.position(); + } + + @Override + void closeDecompressor() { + // no-op + } + } + private class ZstdCompressor extends BaseCompressor { private final ZstdCompressCtx context; @@ -437,6 +474,95 @@ void closeCompressor() { } } + /** + * Direct-memory LZ4_RAW compressor using airlift's LZ4 compressor with + * direct ByteBuffers, avoiding the stream-based heap path. + */ + private class Lz4RawCompressor extends BaseCompressor { + private final io.airlift.compress.lz4.Lz4Compressor compressor = new io.airlift.compress.lz4.Lz4Compressor(); + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.LZ4_RAW; + } + + @Override + int maxCompressedSize(int size) { + return compressor.maxCompressedLength(size); + } + + @Override + int compress(ByteBuffer input, ByteBuffer output) { + compressor.compress(input, output); + return output.position(); + } + + @Override + void closeCompressor() { + // no-op + } + } + + /** + * Direct-memory Brotli decompressor using brotli4j via reflection. + * brotli4j only exposes a byte-array API, so input/output are copied through heap arrays. + * Brotli is slow enough that the copy overhead is negligible. + */ + private class BrotliDirectDecompressor extends BaseDecompressor { + + @Override + int decompress(ByteBuffer input, ByteBuffer output) throws IOException { + byte[] compressedBytes = new byte[input.remaining()]; + input.get(compressedBytes); + byte[] decompressed = Brotli4j.decompress(compressedBytes); + output.put(decompressed); + return decompressed.length; + } + + @Override + void closeDecompressor() { + // no-op + } + } + + /** + * Direct-memory Brotli compressor using brotli4j via reflection. 
+ * Uses quality=1 by default (fast compression, matching the old jbrotli default). + * brotli4j only exposes a byte-array API, so input/output are copied through heap arrays. + */ + private class BrotliDirectCompressor extends BaseCompressor { + private final Object params; + + BrotliDirectCompressor() { + this.params = Brotli4j.newParams(1); + } + + @Override + public CompressionCodecName getCodecName() { + return CompressionCodecName.BROTLI; + } + + @Override + int maxCompressedSize(int size) { + // Brotli worst case: input size + (input size >> 2) + 1K overhead for small inputs + return size + (size >> 2) + 1024; + } + + @Override + int compress(ByteBuffer input, ByteBuffer output) throws IOException { + byte[] inputBytes = new byte[input.remaining()]; + input.get(inputBytes); + byte[] compressed = Brotli4j.compress(inputBytes, params); + output.put(compressed); + return compressed.length; + } + + @Override + void closeCompressor() { + // no-op + } + } + /** * @deprecated Use {@link CodecFactory#NO_OP_COMPRESSOR} instead */ diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java index c78ee09ecc..85fdb6d287 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestDirectCodecFactory.java @@ -18,9 +18,12 @@ package org.apache.parquet.hadoop; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.BROTLI; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.GZIP; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZ4; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZ4_RAW; import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZO; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY; +import static 
org.apache.parquet.hadoop.metadata.CompressionCodecName.ZSTD; import java.io.IOException; import java.nio.ByteBuffer; @@ -28,7 +31,6 @@ import java.util.Random; import java.util.Set; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.ByteBufferReleaser; import org.apache.parquet.bytes.BytesInput; @@ -37,6 +39,7 @@ import org.apache.parquet.bytes.TrackingByteBufferAllocator; import org.apache.parquet.compression.CompressionCodecFactory.BytesInputCompressor; import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; +import org.apache.parquet.hadoop.codec.ZstandardCodec; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.junit.Assert; import org.junit.Test; @@ -81,11 +84,10 @@ private void test(int size, CompressionCodecName codec, boolean useOnHeapCompres final BytesInputDecompressor heapDecompressor = heapCodecFactory.getDecompressor(codec); if (codec == LZ4_RAW) { - // Hadoop codecs support direct decompressors only if the related native libraries are available. - // This is not the case for our CI so let's rely on LZ4_RAW where the implementation is our own. - Assert.assertTrue( + // LZ4_RAW should use a direct decompression path, not the heap-copy IndirectDecompressor. 
+ Assert.assertFalse( String.format("The hadoop codec %s should support direct decompression", codec), - directDecompressor instanceof DirectCodecFactory.FullDirectDecompressor); + directDecompressor instanceof DirectCodecFactory.IndirectDecompressor); } final BytesInput directCompressed; @@ -214,13 +216,7 @@ public void compressionCodecs() { final int[] sizes = {4 * 1024, 1 * 1024 * 1024}; final boolean[] comp = {true, false}; Set codecsToSkip = new HashSet<>(); - codecsToSkip.add(LZO); // not distributed because it is GPL codecsToSkip.add(LZ4); // not distributed in the default version of Hadoop - final String arch = System.getProperty("os.arch"); - if ("aarch64".equals(arch)) { - // PARQUET-1975 brotli-codec does not have natives for ARM64 architectures - codecsToSkip.add(BROTLI); - } for (final int size : sizes) { for (final boolean useOnHeapComp : comp) { @@ -236,53 +232,401 @@ public void compressionCodecs() { } } - static class PublicCodecFactory extends CodecFactory { - // To make getCodec public + @Test + public void compressionLevelGzip() throws IOException { + Configuration config_zlib_1 = new Configuration(); + config_zlib_1.set("zlib.compress.level", "1"); + + Configuration config_zlib_9 = new Configuration(); + config_zlib_9.set("zlib.compress.level", "9"); + + // Generate compressible data so different levels produce different sizes + byte[] data = new byte[64 * 1024]; + new Random(42).nextBytes(data); + + final CodecFactory codecFactory_1 = new CodecFactory(config_zlib_1, pageSize); + final CodecFactory codecFactory_9 = new CodecFactory(config_zlib_9, pageSize); + + BytesInputCompressor compressor_1 = codecFactory_1.getCompressor(CompressionCodecName.GZIP); + BytesInputCompressor compressor_9 = codecFactory_9.getCompressor(CompressionCodecName.GZIP); + + long size_1 = compressor_1.compress(BytesInput.from(data)).size(); + long size_9 = compressor_9.compress(BytesInput.from(data)).size(); + + // Level 9 should produce smaller (or equal) output than 
level 1 + Assert.assertTrue("Expected level 9 (" + size_9 + ") <= level 1 (" + size_1 + ")", size_9 <= size_1); + + codecFactory_1.release(); + codecFactory_9.release(); + } + + @Test + public void compressionLevelZstd() throws IOException { + Configuration config_zstd_1 = new Configuration(); + config_zstd_1.set("parquet.compression.codec.zstd.level", "1"); + + Configuration config_zstd_19 = new Configuration(); + config_zstd_19.set("parquet.compression.codec.zstd.level", "19"); + + // Generate compressible data so different levels produce different sizes + byte[] data = new byte[64 * 1024]; + new Random(42).nextBytes(data); + + final CodecFactory codecFactory_1 = new CodecFactory(config_zstd_1, pageSize); + final CodecFactory codecFactory_19 = new CodecFactory(config_zstd_19, pageSize); + + BytesInputCompressor compressor_1 = codecFactory_1.getCompressor(CompressionCodecName.ZSTD); + BytesInputCompressor compressor_19 = codecFactory_19.getCompressor(CompressionCodecName.ZSTD); + + long size_1 = compressor_1.compress(BytesInput.from(data)).size(); + long size_19 = compressor_19.compress(BytesInput.from(data)).size(); - public PublicCodecFactory(Configuration configuration, int pageSize) { - super(configuration, pageSize); + // Level 19 should produce smaller (or equal) output than level 1 + Assert.assertTrue("Expected level 19 (" + size_19 + ") <= level 1 (" + size_1 + ")", size_19 <= size_1); + + codecFactory_1.release(); + codecFactory_19.release(); + } + + // ---- Tests for empty input (0 bytes) through direct compressor/decompressor path ---- + + @Test + public void emptyInputRoundTrip() throws IOException { + // Codecs that have direct bypass implementations in CodecFactory + CompressionCodecName[] directCodecs = {SNAPPY, ZSTD, LZ4_RAW, GZIP, LZO, BROTLI}; + for (CompressionCodecName codec : directCodecs) { + CodecFactory factory = new CodecFactory(new Configuration(), pageSize); + BytesInputCompressor compressor = factory.getCompressor(codec); + 
BytesInputDecompressor decompressor = factory.getDecompressor(codec); + + BytesInput compressed = compressor.compress(BytesInput.from(new byte[0])); + BytesInput decompressed = decompressor.decompress(compressed, 0); + Assert.assertEquals("Empty input round-trip failed for " + codec, 0, decompressed.toByteArray().length); + + compressor.release(); + decompressor.release(); + factory.release(); } + } + + // ---- Tests for GZIP consecutive compressions with a single compressor instance ---- - public org.apache.hadoop.io.compress.CompressionCodec getCodec(CompressionCodecName name) { - return super.getCodec(name); + @Test + public void gzipConsecutiveCompressionsProduceCorrectResults() throws IOException { + CodecFactory factory = new CodecFactory(new Configuration(), pageSize); + BytesInputCompressor compressor = factory.getCompressor(GZIP); + BytesInputDecompressor decompressor = factory.getDecompressor(GZIP); + + Random r = new Random(99); + for (int i = 0; i < 10; i++) { + byte[] data = new byte[4096 + i * 1024]; + r.nextBytes(data); + + BytesInput compressed = compressor.compress(BytesInput.from(data)); + BytesInput decompressed = decompressor.decompress(compressed, data.length); + Assert.assertArrayEquals( + "GZIP consecutive round-trip failed on iteration " + i, data, decompressed.toByteArray()); } + + compressor.release(); + decompressor.release(); + factory.release(); + } + + // ---- Tests for buffer reuse safety in Snappy/LZ4_RAW compressors ---- + + @Test + public void snappyCompressorBufferReuseSafety() throws IOException { + verifyCompressorOutputCopiedBeforeReuse(SNAPPY); + } + + @Test + public void lz4RawCompressorBufferReuseSafety() throws IOException { + verifyCompressorOutputCopiedBeforeReuse(LZ4_RAW); + } + + /** + * Verifies that the caller can safely copy the compressed output before the next + * compress() call overwrites the internal buffer. This is the documented contract + * for compressors that reuse output buffers. 
+ */ + private void verifyCompressorOutputCopiedBeforeReuse(CompressionCodecName codec) throws IOException { + CodecFactory factory = new CodecFactory(new Configuration(), pageSize); + BytesInputCompressor compressor = factory.getCompressor(codec); + BytesInputDecompressor decompressor = factory.getDecompressor(codec); + + byte[] data1 = new byte[4096]; + byte[] data2 = new byte[4096]; + new Random(1).nextBytes(data1); + new Random(2).nextBytes(data2); + + // Compress first, copy result immediately + BytesInput compressed1 = compressor.compress(BytesInput.from(data1)); + byte[] compressed1Bytes = compressed1.toByteArray(); + + // Compress second (may overwrite internal buffer) + BytesInput compressed2 = compressor.compress(BytesInput.from(data2)); + byte[] compressed2Bytes = compressed2.toByteArray(); + + // Both should decompress correctly from the copied bytes + BytesInput decompressed1 = decompressor.decompress(BytesInput.from(compressed1Bytes), data1.length); + Assert.assertArrayEquals(codec + " buffer reuse: first input corrupted", data1, decompressed1.toByteArray()); + + BytesInput decompressed2 = decompressor.decompress(BytesInput.from(compressed2Bytes), data2.length); + Assert.assertArrayEquals(codec + " buffer reuse: second input corrupted", data2, decompressed2.toByteArray()); + + compressor.release(); + decompressor.release(); + factory.release(); + } + + // ---- Tests for ZSTD bufferPool config propagation through new direct compressor ---- + + @Test + public void zstdBufferPoolEnabledRoundTrip() throws IOException { + Configuration conf = new Configuration(); + conf.setBoolean(ZstandardCodec.PARQUET_COMPRESS_ZSTD_BUFFERPOOL_ENABLED, true); + verifyZstdRoundTrip(conf, "bufferPool=true"); + } + + @Test + public void zstdBufferPoolDisabledRoundTrip() throws IOException { + Configuration conf = new Configuration(); + conf.setBoolean(ZstandardCodec.PARQUET_COMPRESS_ZSTD_BUFFERPOOL_ENABLED, false); + verifyZstdRoundTrip(conf, "bufferPool=false"); + } + + /** 
+   * Verifies ZSTD round-trip with different bufferPool configurations through the
+   * new direct ZstdBytesCompressor/ZstdBytesDecompressor path in CodecFactory.
+   *
+   * @param conf configuration handed to the {@link CodecFactory} under test
+   * @param label suffix appended to the assertion message to identify the configuration
+   * @throws IOException propagated from the compress/decompress calls
+   */
+  private void verifyZstdRoundTrip(Configuration conf, String label) throws IOException {
+    byte[] data = new byte[64 * 1024];
+    new Random(42).nextBytes(data);
+
+    CodecFactory factory = new CodecFactory(conf, pageSize);
+    try {
+      BytesInputCompressor compressor = factory.getCompressor(ZSTD);
+      BytesInputDecompressor decompressor = factory.getDecompressor(ZSTD);
+
+      BytesInput compressed = compressor.compress(BytesInput.from(data));
+      BytesInput decompressed = decompressor.decompress(compressed, data.length);
+      Assert.assertArrayEquals("ZSTD round-trip failed with " + label, data, decompressed.toByteArray());
+
+      compressor.release();
+      decompressor.release();
+    } finally {
+      // Release the factory even when the assertion above fails.
+      factory.release();
+    }
   }

+  // ---- Tests for ZSTD workers config propagation ----
+
+  /** Round-trips ZSTD data through a factory whose workers setting is 2. */
+  @Test
+  public void zstdWorkersConfigRoundTrip() throws IOException {
+    Configuration conf = new Configuration();
+    conf.setInt(ZstandardCodec.PARQUET_COMPRESS_ZSTD_WORKERS, 2);
+    // Reuses verifyZstdRoundTrip instead of repeating its body; the resulting assertion
+    // message is still "ZSTD round-trip failed with workers=2".
+    verifyZstdRoundTrip(conf, "workers=2");
+  }
+
+  // ---- Tests for ZSTD level through the direct CodecFactory path ----
+
+  /** Compresses the same input at ZSTD levels 1 and 19 and expects level 19 to be no larger. */
+  @Test
+  public void zstdLevelConfigThroughDirectPath() throws IOException {
+    Configuration confLow = new Configuration();
+    confLow.setInt(ZstandardCodec.PARQUET_COMPRESS_ZSTD_LEVEL, 1);
+
+    Configuration confHigh = new Configuration();
+    confHigh.setInt(ZstandardCodec.PARQUET_COMPRESS_ZSTD_LEVEL, 19);
+
+    // NOTE(review): pseudo-random bytes are close to incompressible, so both levels may
+    // produce near-identical sizes and the check below may pass without really exercising
+    // the level setting - consider compressible input.
+    byte[] data = new byte[64 * 1024];
+    new Random(42).nextBytes(data);
+
+    CodecFactory factoryLow = new CodecFactory(confLow, pageSize);
+    CodecFactory factoryHigh = new CodecFactory(confHigh, pageSize);
+    try {
+      long sizeLow =
+          factoryLow.getCompressor(ZSTD).compress(BytesInput.from(data)).size();
+      long sizeHigh =
+          factoryHigh.getCompressor(ZSTD).compress(BytesInput.from(data)).size();
+
+      Assert.assertTrue(
+          "Expected ZSTD level 19 (" + sizeHigh + ") <= level 1 (" + sizeLow + ")", sizeHigh <= sizeLow);
+    } finally {
+      // Release both factories even when the size assertion fails.
+      factoryLow.release();
+      factoryHigh.release();
+    }
+  }
+
+  // ---- Tests for GZIP level through the direct CodecFactory path ----
+
+  /**
+   * Compresses the same input at zlib levels 1 and 9, expects level 9 to be no larger,
+   * and round-trips the data at both levels.
+   */
   @Test
-  public void cachingKeysGzip() {
-    Configuration config_zlib_2 = new Configuration();
-    config_zlib_2.set("zlib.compress.level", "2");
+  public void gzipLevelConfigThroughDirectPath() throws IOException {
+    Configuration confLow = new Configuration();
+    confLow.setInt("zlib.compress.level", 1);
+
+    Configuration confHigh = new Configuration();
+    confHigh.setInt("zlib.compress.level", 9);
+
+    // NOTE(review): same caveat as the ZSTD level test - random input may make the size
+    // comparison hold trivially.
+    byte[] data = new byte[64 * 1024];
+    new Random(42).nextBytes(data);
+
+    CodecFactory factoryLow = new CodecFactory(confLow, pageSize);
+    CodecFactory factoryHigh = new CodecFactory(confHigh, pageSize);
+    try {
+      BytesInputCompressor compLow = factoryLow.getCompressor(GZIP);
+      BytesInputCompressor compHigh = factoryHigh.getCompressor(GZIP);
+
+      long sizeLow = compLow.compress(BytesInput.from(data)).size();
+      long sizeHigh = compHigh.compress(BytesInput.from(data)).size();
+
+      Assert.assertTrue(
+          "Expected GZIP level 9 (" + sizeHigh + ") <= level 1 (" + sizeLow + ")", sizeHigh <= sizeLow);
+
+      // Also verify round-trip for both levels
+      BytesInputDecompressor decompLow = factoryLow.getDecompressor(GZIP);
+      BytesInputDecompressor decompHigh = factoryHigh.getDecompressor(GZIP);

-    Configuration config_zlib_5 = new Configuration();
-    config_zlib_5.set("zlib.compress.level", "5");
+      Assert.assertArrayEquals(
+          data,
+          decompLow
+              .decompress(compLow.compress(BytesInput.from(data)), data.length)
+              .toByteArray());
+      Assert.assertArrayEquals(
+          data,
+          decompHigh
+              .decompress(compHigh.compress(BytesInput.from(data)), data.length)
+              .toByteArray());
+    } finally {
+      // Release both factories even when one of the assertions fails.
+      factoryLow.release();
+      factoryHigh.release();
+    }
+  }

-    final CodecFactory codecFactory_2 = new PublicCodecFactory(config_zlib_2, pageSize);
-    final CodecFactory codecFactory_5 = new PublicCodecFactory(config_zlib_5, pageSize);
+  // ---- Tests for BROTLI direct bypass (when native lib available) ----
+
+  /**
+   * Round-trips a patterned buffer, then five more buffers of growing size, through the
+   * BROTLI compressor/decompressor obtained from a direct codec factory.
+   */
+  @Test
+  public void brotliDirectFactoryRoundTrip() throws IOException {
+    // Test through the DirectCodecFactory path where BROTLI bypass lives
+    try (TrackingByteBufferAllocator alloc = TrackingByteBufferAllocator.wrap(new DirectByteBufferAllocator())) {
+      CodecFactory directFactory = CodecFactory.createDirectCodecFactory(new Configuration(), alloc, pageSize);
+      try {
+        BytesInputCompressor compressor = directFactory.getCompressor(BROTLI);
+        BytesInputDecompressor decompressor = directFactory.getDecompressor(BROTLI);
+
+        // Use compressible data (repeated patterns) so compression is verifiable
+        byte[] data = new byte[16 * 1024];
+        for (int i = 0; i < data.length; i++) {
+          data[i] = (byte) (i % 251);
+        }

-    CompressionCodec codec_2_1 = codecFactory_2.getCodec(CompressionCodecName.GZIP);
-    CompressionCodec codec_2_2 = codecFactory_2.getCodec(CompressionCodecName.GZIP);
-    CompressionCodec codec_5_1 = codecFactory_5.getCodec(CompressionCodecName.GZIP);
+        BytesInput compressed = compressor.compress(BytesInput.from(data));
+        BytesInput decompressed = decompressor.decompress(compressed, data.length);
+        Assert.assertArrayEquals("BROTLI direct round-trip failed", data, decompressed.toByteArray());

-    Assert.assertEquals(codec_2_1, codec_2_2);
-    Assert.assertNotEquals(codec_2_1, codec_5_1);
+        // Test multiple consecutive compressions to verify state management
+        for (int i = 0; i < 5; i++) {
+          byte[] moreData = new byte[8 * 1024 + i * 1024];
+          for (int j = 0; j < moreData.length; j++) {
+            moreData[j] = (byte) ((j + i) % 251);
+          }
+          BytesInput moreCompressed = compressor.compress(BytesInput.from(moreData));
+          BytesInput moreDecompressed = decompressor.decompress(moreCompressed, moreData.length);
+          Assert.assertArrayEquals(
+              "BROTLI direct round-trip failed on iteration " + i,
+              moreData,
+              moreDecompressed.toByteArray());
+        }
+
+        compressor.release();
+        decompressor.release();
+      } finally {
+        // Release the factory before the tracking allocator is closed, even on failure.
+        directFactory.release();
+      }
+    }
   }

+  // ---- Test for cross-factory interop with new direct codecs ----
+
+  /**
+   * For each listed codec, compresses with the heap factory and decompresses with the
+   * direct factory, then the reverse, expecting the original bytes both ways.
+   */
   @Test
-  public void cachingKeysZstd() {
-    Configuration config_zstd_2 = new Configuration();
-    config_zstd_2.set("parquet.compression.codec.zstd.level", "2");
+  public void crossFactoryInteropAllDirectCodecs() throws IOException {
+    CompressionCodecName[] codecs = {SNAPPY, ZSTD, LZ4_RAW, GZIP, LZO, BROTLI};
+
+    byte[] data = new byte[32 * 1024];
+    new Random(42).nextBytes(data);
+
+    CodecFactory heapFactory = new CodecFactory(new Configuration(), pageSize);
+    try (TrackingByteBufferAllocator alloc = TrackingByteBufferAllocator.wrap(new DirectByteBufferAllocator())) {
+      CodecFactory directFactory = CodecFactory.createDirectCodecFactory(new Configuration(), alloc, pageSize);
+      try {
+        for (CompressionCodecName codec : codecs) {
+          // heap compress -> direct decompress
+          BytesInput heapCompressed = heapFactory.getCompressor(codec).compress(BytesInput.from(data));
+          BytesInput directDecompressed =
+              directFactory.getDecompressor(codec).decompress(heapCompressed, data.length);
+          Assert.assertArrayEquals(codec + " heap->direct failed", data, directDecompressed.toByteArray());
+
+          // direct compress -> heap decompress
+          BytesInput directCompressed = directFactory.getCompressor(codec).compress(BytesInput.from(data));
+          BytesInput heapDecompressed =
+              heapFactory.getDecompressor(codec).decompress(directCompressed, data.length);
+          Assert.assertArrayEquals(codec + " direct->heap failed", data, heapDecompressed.toByteArray());
+        }
+      } finally {
+        // Release the direct factory before the tracking allocator is closed.
+        directFactory.release();
+      }
+    } finally {
+      // Runs after the allocator is closed, matching the original release order.
+      heapFactory.release();
+    }
+  }

-    Configuration config_zstd_5 = new Configuration();
-    config_zstd_5.set("parquet.compression.codec.zstd.level", "5");
+  /** Runs the shared compressor-output reuse check for ZSTD. */
+  @Test
+  public void zstdCompressorBufferReuseSafety() throws IOException {
+    verifyCompressorOutputCopiedBeforeReuse(ZSTD);
+  }

-    final CodecFactory codecFactory_2 = new PublicCodecFactory(config_zstd_2, pageSize);
-    final CodecFactory codecFactory_5 = new PublicCodecFactory(config_zstd_5, pageSize);
+  /** Round-trips a patterned buffer through BROTLI at quality settings 0, 1, 6 and 11. */
+  @Test
+  public void brotliQualityConfigProducesValidOutput() throws IOException {
+    byte[] data = new byte[16 * 1024];
+    for (int i = 0; i < data.length; i++) {
+      data[i] = (byte) (i % 251);
+    }

-    CompressionCodec codec_2_1 = codecFactory_2.getCodec(CompressionCodecName.ZSTD);
-    CompressionCodec codec_2_2 = codecFactory_2.getCodec(CompressionCodecName.ZSTD);
-    CompressionCodec codec_5_1 = codecFactory_5.getCodec(CompressionCodecName.ZSTD);
+    for (int quality : new int[] {0, 1, 6, 11}) {
+      Configuration conf = new Configuration();
+      conf.setInt("compression.brotli.quality", quality);
+      CodecFactory factory = new CodecFactory(conf, pageSize);
+      try {
+        BytesInputCompressor compressor = factory.getCompressor(BROTLI);
+        BytesInputDecompressor decompressor = factory.getDecompressor(BROTLI);
+
+        BytesInput compressed = compressor.compress(BytesInput.from(data));
+        BytesInput decompressed = decompressor.decompress(compressed, data.length);
+        Assert.assertArrayEquals(
+            "BROTLI quality=" + quality + " round-trip failed", data, decompressed.toByteArray());
+
+        compressor.release();
+        decompressor.release();
+      } finally {
+        // Release this iteration's factory even when the assertion fails.
+        factory.release();
+      }
+    }
+  }

-    Assert.assertEquals(codec_2_1, codec_2_2);
-    Assert.assertNotEquals(codec_2_1, codec_5_1);
+  /** Round-trips a one-byte input through every listed codec using a fresh factory each time. */
+  @Test
+  public void singleByteInputRoundTrip() throws IOException {
+    CompressionCodecName[] codecs = {SNAPPY, ZSTD, LZ4_RAW, GZIP, LZO, BROTLI};
+    byte[] data = new byte[] {42};
+
+    for (CompressionCodecName codec : codecs) {
+      CodecFactory factory = new CodecFactory(new Configuration(), pageSize);
+      try {
+        BytesInputCompressor compressor = factory.getCompressor(codec);
+        BytesInputDecompressor decompressor = factory.getDecompressor(codec);
+
+        BytesInput compressed = compressor.compress(BytesInput.from(data));
+        BytesInput decompressed = decompressor.decompress(compressed, 1);
+        Assert.assertArrayEquals(
+            "Single-byte round-trip failed for " + codec, data, decompressed.toByteArray());
+
+        compressor.release();
+        decompressor.release();
+      } finally {
+        // Release this iteration's factory even when the assertion fails.
+        factory.release();
+      }
+    }
   }
 }
diff --git a/pom.xml b/pom.xml
index 1bd9893d87..20063523e1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -96,7 +96,7 @@
     1.7.33
     1.11.5
     33.6.0-jre
-    0.1.1
+    1.23.0
     5.23.0
     0.27ea1
     3.6.1