apache · iemejia · May 16, 2026
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDecodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleDecodingBenchmark.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.benchmarks;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.bytes.HeapByteBufferAllocator;
+import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader;
+import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Decoding-level micro-benchmarks for the RLE/bit-packing hybrid encoding used
+ * for {@code BOOLEAN} values in Parquet data pages V2.
+ * Encoding benchmarks live in {@link RleEncodingBenchmark}.
+ *
+ * <p>The {@code dataPattern} parameter exercises RLE's best cases (ALL_TRUE,
+ * ALL_FALSE), worst case (ALTERNATING), and realistic distributions (RANDOM,
+ * MOSTLY_TRUE_99, MOSTLY_FALSE_99).
+ *
+ * <p>Each invocation decodes {@value #VALUE_COUNT} values; throughput is
+ * reported per-value via {@link OperationsPerInvocation}.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(1)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+@State(Scope.Thread)
+public class RleDecodingBenchmark {
+
+  static final int VALUE_COUNT = 100_000;
+  private static final int INIT_SLAB_SIZE = 64 * 1024;
+  private static final int PAGE_SIZE = 4 * 1024 * 1024;
+
+  @Param({"ALL_TRUE", "ALL_FALSE", "ALTERNATING", "RANDOM", "MOSTLY_TRUE_99", "MOSTLY_FALSE_99"})
+  public String dataPattern;
+
+  /** RLE-encoded bytes with 4-byte LE length prefix (ValuesReader format). */
+  private byte[] encodedWithLengthPrefix;
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException {
+    boolean[] data = RleEncodingBenchmark.generateData(dataPattern);
+
+    // Encode using the scalar ValuesWriter path
+    RunLengthBitPackingHybridValuesWriter w =
+        new RunLengthBitPackingHybridValuesWriter(1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
+    for (boolean v : data) {
+      w.writeBoolean(v);
+    }
+    encodedWithLengthPrefix = w.getBytes().toByteArray();
+    w.close();
+  }
+
+  // ---- Scalar decode via ValuesReader ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodeBoolean(Blackhole bh) throws IOException {
+    RunLengthBitPackingHybridValuesReader r = new RunLengthBitPackingHybridValuesReader(1);
+    r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix)));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(r.readBoolean());
+    }
+  }
+}
diff --git a/...arks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java b/...arks/src/main/java/org/apache/parquet/benchmarks/RleDictionaryIndexDecodingBenchmark.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.benchmarks;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.concurrent.TimeUnit;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.bytes.HeapByteBufferAllocator;
+import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;
+import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;
+import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Encoding and decoding micro-benchmarks for synthetic dictionary-id pages using
+ * {@link RunLengthBitPackingHybridEncoder} and {@link RunLengthBitPackingHybridDecoder}.
+ * This isolates the RLE/bit-packing hybrid codec paths and is intentionally
+ * separate from full INT32/INT64 value encode/decode path benchmarks.
+ *
+ * <p>The encode benchmark measures the RLE encoder's {@code pack32Values} fast path
+ * and bit-packing throughput. The decode benchmark measures the corresponding
+ * {@code unpack32Values} fast path and RLE run expansion.
+ *
+ * <p>The {@code bitWidth} parameter exercises different packing densities (1-bit to
+ * 16-bit), and the {@code indexPattern} parameter exercises pure RLE (CONSTANT),
+ * mixed (LOW_CARDINALITY), and pure bit-packing (SEQUENTIAL, RANDOM) paths.
+ *
+ * <p>Per-invocation overhead (encoder/decoder construction and {@link ByteBufferInputStream}
+ * wrapping) is amortized over {@value #VALUE_COUNT} reads via
+ * {@link OperationsPerInvocation}.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(1)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+@State(Scope.Thread)
+public class RleDictionaryIndexDecodingBenchmark {
+
+  static final int VALUE_COUNT = 100_000;
+  private static final int INIT_SLAB_SIZE = 64 * 1024;
+  private static final int PAGE_SIZE = 1024 * 1024;
+
+  @Param({"1", "4", "8", "10", "16"})
+  public int bitWidth;
+
+  @Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY", "CONSTANT"})
+  public String indexPattern;
+
+  /** Raw RLE-encoded bytes (no length prefix). */
+  private byte[] encoded;
+
+  private int[] ids;
+
+  /** RLE-encoded bytes with 4-byte LE length prefix (ValuesReader format). */
+  private byte[] encodedWithLengthPrefix;
+
+  @Setup(Level.Trial)
+  public void setup() throws IOException {
+    int maxId = 1 << bitWidth;
+    ids = generateDictionaryIds(maxId);
+    try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(
+        bitWidth, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) {
+      for (int id : ids) {
+        encoder.writeInt(id);
+      }
+      encoded = encoder.toBytes().toByteArray();
+    }
+
+    // Prepend 4-byte LE length for ValuesReader.initFromPage() format
+    encodedWithLengthPrefix = new byte[4 + encoded.length];
+    ByteBuffer.wrap(encodedWithLengthPrefix).order(ByteOrder.LITTLE_ENDIAN).putInt(encoded.length);
+    System.arraycopy(encoded, 0, encodedWithLengthPrefix, 4, encoded.length);
+  }
+
+  private int[] generateDictionaryIds(int maxId) {
+    switch (indexPattern) {
+      case "SEQUENTIAL":
+        int[] sequential = new int[VALUE_COUNT];
+        for (int i = 0; i < VALUE_COUNT; i++) {
+          sequential[i] = i % maxId;
+        }
+        return sequential;
+      case "RANDOM":
+        return TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, maxId, TestDataFactory.DEFAULT_SEED);
+      case "LOW_CARDINALITY":
+        int distinct = Math.min(TestDataFactory.LOW_CARDINALITY_DISTINCT, maxId);
+        return TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, distinct, TestDataFactory.DEFAULT_SEED);
+      case "CONSTANT":
+        int[] constant = new int[VALUE_COUNT];
+        java.util.Arrays.fill(constant, 0);
+        return constant;
+      default:
+        throw new IllegalArgumentException("Unknown index pattern: " + indexPattern);
+    }
+  }
+
+  // ---- Scalar encode via encoder ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public byte[] encodeDictionaryIds() throws IOException {
+    try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(
+        bitWidth, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) {
+      for (int id : ids) {
+        encoder.writeInt(id);
+      }
+      return encoder.toBytes().toByteArray();
+    }
+  }
+
+  // ---- Scalar decode via decoder ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodeDictionaryIds(Blackhole bh) {
+    RunLengthBitPackingHybridDecoder decoder =
+        new RunLengthBitPackingHybridDecoder(bitWidth, ByteBuffer.wrap(encoded));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(decoder.readInt());
+    }
+  }
+
+  // ---- Scalar decode via ValuesReader wrapper ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public void decodeValuesReader(Blackhole bh) throws IOException {
+    RunLengthBitPackingHybridValuesReader reader = new RunLengthBitPackingHybridValuesReader(bitWidth);
+    reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix)));
+    for (int i = 0; i < VALUE_COUNT; i++) {
+      bh.consume(reader.readInteger());
+    }
+  }
+}
diff --git a/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleEncodingBenchmark.java b/parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/RleEncodingBenchmark.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.benchmarks;
+
+import java.io.IOException;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import org.apache.parquet.bytes.HeapByteBufferAllocator;
+import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * Encoding-level micro-benchmarks for the RLE/bit-packing hybrid encoding used
+ * for {@code BOOLEAN} values in Parquet data pages V2.
+ * Decoding benchmarks live in {@link RleDecodingBenchmark}.
+ *
+ * <p>The {@code dataPattern} parameter exercises RLE's best cases (ALL_TRUE,
+ * ALL_FALSE), worst case (ALTERNATING), and realistic distributions (RANDOM,
+ * MOSTLY_TRUE_99, MOSTLY_FALSE_99).
+ *
+ * <p>Each invocation encodes {@value #VALUE_COUNT} values; throughput is
+ * reported per-value via {@link OperationsPerInvocation}.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Fork(1)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+@State(Scope.Thread)
+public class RleEncodingBenchmark {
+
+  static final int VALUE_COUNT = 100_000;
+  private static final int INIT_SLAB_SIZE = 64 * 1024;
+  private static final int PAGE_SIZE = 4 * 1024 * 1024;
+
+  @Param({"ALL_TRUE", "ALL_FALSE", "ALTERNATING", "RANDOM", "MOSTLY_TRUE_99", "MOSTLY_FALSE_99"})
+  public String dataPattern;
+
+  private boolean[] data;
+
+  @Setup(Level.Trial)
+  public void setup() {
+    data = generateData(dataPattern);
+  }
+
+  static boolean[] generateData(String pattern) {
+    boolean[] d = new boolean[VALUE_COUNT];
+    Random rng = new Random(42);
+    switch (pattern) {
+      case "ALL_TRUE":
+        for (int i = 0; i < VALUE_COUNT; i++) d[i] = true;
+        break;
+      case "ALL_FALSE":
+        // already false
+        break;
+      case "ALTERNATING":
+        for (int i = 0; i < VALUE_COUNT; i++) d[i] = (i & 1) == 0;
+        break;
+      case "RANDOM":
+        for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextBoolean();
+        break;
+      case "MOSTLY_TRUE_99":
+        for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) != 0;
+        break;
+      case "MOSTLY_FALSE_99":
+        for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) == 0;
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown pattern: " + pattern);
+    }
+    return d;
+  }
+
+  // ---- Scalar encode via ValuesWriter ----
+
+  @Benchmark
+  @OperationsPerInvocation(VALUE_COUNT)
+  public byte[] encodeBoolean() throws IOException {
+    RunLengthBitPackingHybridValuesWriter w =
+        new RunLengthBitPackingHybridValuesWriter(1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
+    for (boolean v : data) {
+      w.writeBoolean(v);
+    }
+    byte[] bytes = w.getBytes().toByteArray();
+    w.close();
+    return bytes;
+  }
+}