Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.benchmarks;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.concurrent.TimeUnit;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OperationsPerInvocation;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

/**
* Decoding-level micro-benchmarks for the RLE/bit-packing hybrid encoding used
* for {@code BOOLEAN} values in Parquet data pages V2.
* Encoding benchmarks live in {@link RleEncodingBenchmark}.
*
* <p>The {@code dataPattern} parameter exercises RLE's best cases (ALL_TRUE,
* ALL_FALSE), worst case (ALTERNATING), and realistic distributions (RANDOM,
* MOSTLY_TRUE_99, MOSTLY_FALSE_99).
*
* <p>Each invocation decodes {@value #VALUE_COUNT} values; throughput is
* reported per-value via {@link OperationsPerInvocation}.
*/
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Fork(1)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@State(Scope.Thread)
public class RleDecodingBenchmark {

static final int VALUE_COUNT = 100_000;
private static final int INIT_SLAB_SIZE = 64 * 1024;
private static final int PAGE_SIZE = 4 * 1024 * 1024;

@Param({"ALL_TRUE", "ALL_FALSE", "ALTERNATING", "RANDOM", "MOSTLY_TRUE_99", "MOSTLY_FALSE_99"})
public String dataPattern;

/** RLE-encoded bytes with 4-byte LE length prefix (ValuesReader format). */
private byte[] encodedWithLengthPrefix;

@Setup(Level.Trial)
public void setup() throws IOException {
boolean[] data = RleEncodingBenchmark.generateData(dataPattern);

// Encode using the scalar ValuesWriter path
RunLengthBitPackingHybridValuesWriter w =
new RunLengthBitPackingHybridValuesWriter(1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
for (boolean v : data) {
w.writeBoolean(v);
}
encodedWithLengthPrefix = w.getBytes().toByteArray();
w.close();
}

// ---- Scalar decode via ValuesReader ----

@Benchmark
@OperationsPerInvocation(VALUE_COUNT)
public void decodeBoolean(Blackhole bh) throws IOException {
RunLengthBitPackingHybridValuesReader r = new RunLengthBitPackingHybridValuesReader(1);
r.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix)));
for (int i = 0; i < VALUE_COUNT; i++) {
bh.consume(r.readBoolean());
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.benchmarks;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.concurrent.TimeUnit;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OperationsPerInvocation;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

/**
* Encoding and decoding micro-benchmarks for synthetic dictionary-id pages using
* {@link RunLengthBitPackingHybridEncoder} and {@link RunLengthBitPackingHybridDecoder}.
* This isolates the RLE/bit-packing hybrid codec paths and is intentionally
* separate from full INT32/INT64 value encode/decode path benchmarks.
*
* <p>The encode benchmark measures the RLE encoder's {@code pack32Values} fast path
* and bit-packing throughput. The decode benchmark measures the corresponding
* {@code unpack32Values} fast path and RLE run expansion.
*
* <p>The {@code bitWidth} parameter exercises different packing densities (1-bit to
* 16-bit), and the {@code indexPattern} parameter exercises pure RLE (CONSTANT),
* mixed (LOW_CARDINALITY), and pure bit-packing (SEQUENTIAL, RANDOM) paths.
*
* <p>Per-invocation overhead (encoder/decoder construction and {@link ByteBufferInputStream}
* wrapping) is amortized over {@value #VALUE_COUNT} reads via
* {@link OperationsPerInvocation}.
*/
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Fork(1)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@State(Scope.Thread)
public class RleDictionaryIndexDecodingBenchmark {

static final int VALUE_COUNT = 100_000;
private static final int INIT_SLAB_SIZE = 64 * 1024;
private static final int PAGE_SIZE = 1024 * 1024;

@Param({"1", "4", "8", "10", "16"})
public int bitWidth;

@Param({"SEQUENTIAL", "RANDOM", "LOW_CARDINALITY", "CONSTANT"})
public String indexPattern;

/** Raw RLE-encoded bytes (no length prefix). */
private byte[] encoded;

private int[] ids;

/** RLE-encoded bytes with 4-byte LE length prefix (ValuesReader format). */
private byte[] encodedWithLengthPrefix;

@Setup(Level.Trial)
public void setup() throws IOException {
int maxId = 1 << bitWidth;
ids = generateDictionaryIds(maxId);
try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(
bitWidth, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) {
for (int id : ids) {
encoder.writeInt(id);
}
encoded = encoder.toBytes().toByteArray();
}

// Prepend 4-byte LE length for ValuesReader.initFromPage() format
encodedWithLengthPrefix = new byte[4 + encoded.length];
ByteBuffer.wrap(encodedWithLengthPrefix).order(ByteOrder.LITTLE_ENDIAN).putInt(encoded.length);
System.arraycopy(encoded, 0, encodedWithLengthPrefix, 4, encoded.length);
}

private int[] generateDictionaryIds(int maxId) {
switch (indexPattern) {
case "SEQUENTIAL":
int[] sequential = new int[VALUE_COUNT];
for (int i = 0; i < VALUE_COUNT; i++) {
sequential[i] = i % maxId;
}
return sequential;
case "RANDOM":
return TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, maxId, TestDataFactory.DEFAULT_SEED);
case "LOW_CARDINALITY":
int distinct = Math.min(TestDataFactory.LOW_CARDINALITY_DISTINCT, maxId);
return TestDataFactory.generateLowCardinalityInts(VALUE_COUNT, distinct, TestDataFactory.DEFAULT_SEED);
case "CONSTANT":
int[] constant = new int[VALUE_COUNT];
java.util.Arrays.fill(constant, 0);
return constant;
default:
throw new IllegalArgumentException("Unknown index pattern: " + indexPattern);
}
}

// ---- Scalar encode via encoder ----

@Benchmark
@OperationsPerInvocation(VALUE_COUNT)
public byte[] encodeDictionaryIds() throws IOException {
try (RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(
bitWidth, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator())) {
for (int id : ids) {
encoder.writeInt(id);
}
return encoder.toBytes().toByteArray();
}
}

// ---- Scalar decode via decoder ----

@Benchmark
@OperationsPerInvocation(VALUE_COUNT)
public void decodeDictionaryIds(Blackhole bh) {
RunLengthBitPackingHybridDecoder decoder =
new RunLengthBitPackingHybridDecoder(bitWidth, ByteBuffer.wrap(encoded));
for (int i = 0; i < VALUE_COUNT; i++) {
bh.consume(decoder.readInt());
}
}

// ---- Scalar decode via ValuesReader wrapper ----

@Benchmark
@OperationsPerInvocation(VALUE_COUNT)
public void decodeValuesReader(Blackhole bh) throws IOException {
RunLengthBitPackingHybridValuesReader reader = new RunLengthBitPackingHybridValuesReader(bitWidth);
reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedWithLengthPrefix)));
for (int i = 0; i < VALUE_COUNT; i++) {
bh.consume(reader.readInteger());
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.benchmarks;

import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OperationsPerInvocation;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

/**
* Encoding-level micro-benchmarks for the RLE/bit-packing hybrid encoding used
* for {@code BOOLEAN} values in Parquet data pages V2.
* Decoding benchmarks live in {@link RleDecodingBenchmark}.
*
* <p>The {@code dataPattern} parameter exercises RLE's best cases (ALL_TRUE,
* ALL_FALSE), worst case (ALTERNATING), and realistic distributions (RANDOM,
* MOSTLY_TRUE_99, MOSTLY_FALSE_99).
*
* <p>Each invocation encodes {@value #VALUE_COUNT} values; throughput is
* reported per-value via {@link OperationsPerInvocation}.
*/
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@Fork(1)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@State(Scope.Thread)
public class RleEncodingBenchmark {

static final int VALUE_COUNT = 100_000;
private static final int INIT_SLAB_SIZE = 64 * 1024;
private static final int PAGE_SIZE = 4 * 1024 * 1024;

@Param({"ALL_TRUE", "ALL_FALSE", "ALTERNATING", "RANDOM", "MOSTLY_TRUE_99", "MOSTLY_FALSE_99"})
public String dataPattern;

private boolean[] data;

@Setup(Level.Trial)
public void setup() {
data = generateData(dataPattern);
}

static boolean[] generateData(String pattern) {
boolean[] d = new boolean[VALUE_COUNT];
Random rng = new Random(42);
switch (pattern) {
case "ALL_TRUE":
for (int i = 0; i < VALUE_COUNT; i++) d[i] = true;
break;
case "ALL_FALSE":
// already false
break;
case "ALTERNATING":
for (int i = 0; i < VALUE_COUNT; i++) d[i] = (i & 1) == 0;
break;
case "RANDOM":
for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextBoolean();
break;
case "MOSTLY_TRUE_99":
for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) != 0;
break;
case "MOSTLY_FALSE_99":
for (int i = 0; i < VALUE_COUNT; i++) d[i] = rng.nextInt(100) == 0;
break;
default:
throw new IllegalArgumentException("Unknown pattern: " + pattern);
}
return d;
}

// ---- Scalar encode via ValuesWriter ----

@Benchmark
@OperationsPerInvocation(VALUE_COUNT)
public byte[] encodeBoolean() throws IOException {
RunLengthBitPackingHybridValuesWriter w =
new RunLengthBitPackingHybridValuesWriter(1, INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
for (boolean v : data) {
w.writeBoolean(v);
}
byte[] bytes = w.getBytes().toByteArray();
w.close();
return bytes;
}
}
Loading