diff --git a/sql/core/benchmarks/WritableColumnVectorBulkFillBenchmark-results.txt b/sql/core/benchmarks/WritableColumnVectorBulkFillBenchmark-results.txt new file mode 100644 index 0000000000000..1c10a8ef76edf --- /dev/null +++ b/sql/core/benchmarks/WritableColumnVectorBulkFillBenchmark-results.txt @@ -0,0 +1,257 @@ +================================================================================================ +WritableColumnVector bulk fill +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBooleans (boolean) count=1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 407.5 2.5 1.0X +OffHeap 0 0 0 285.6 3.5 0.7X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBooleans (boolean) count=8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 1432.7 0.7 1.0X +OffHeap 0 0 0 1523.2 0.7 1.1X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBooleans (boolean) count=64: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2570.2 0.4 1.0X +OffHeap 0 0 0 2550.2 0.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBooleans (boolean) count=512: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 1399.7 0.7 1.0X +OffHeap 0 0 0 19784.5 0.1 14.1X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBooleans (boolean) count=4096: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 3 3 0 1411.5 0.7 1.0X +OffHeap 0 0 0 39316.7 0.0 27.9X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBooleans (boolean) count=65536: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 24 24 0 2791.6 0.4 1.0X +OffHeap 2 2 0 44669.8 0.0 16.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBytes (byte) count=1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 186.6 5.4 1.0X +OffHeap 0 0 0 190.8 5.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBytes (byte) count=8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 1202.9 0.8 1.0X +OffHeap 0 0 0 1042.1 1.0 0.9X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBytes (byte) count=64: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2448.1 0.4 1.0X +OffHeap 0 0 0 2353.9 0.4 1.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBytes (byte) count=512: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2681.6 0.4 1.0X +OffHeap 0 0 0 19218.1 0.1 7.2X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBytes (byte) count=4096: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 2 2 0 2772.8 0.4 1.0X +OffHeap 0 0 0 39147.5 0.0 14.1X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putBytes (byte) count=65536: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 24 24 0 2788.2 0.4 1.0X +OffHeap 2 2 0 44447.8 0.0 15.9X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putShorts (short) count=1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 131.6 7.6 1.0X +OffHeap 0 0 0 136.3 7.3 1.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putShorts (short) count=8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 812.3 1.2 1.0X +OffHeap 0 0 0 849.4 1.2 1.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putShorts (short) count=64: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2213.0 0.5 1.0X +OffHeap 0 0 0 3059.3 0.3 1.4X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putShorts (short) count=512: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2652.7 0.4 1.0X +OffHeap 0 0 0 5001.9 0.2 1.9X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putShorts (short) count=4096: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 2 2 0 2775.6 0.4 1.0X +OffHeap 1 1 0 5637.0 0.2 2.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putShorts (short) count=65536: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 24 24 0 2796.0 0.4 1.0X +OffHeap 12 12 0 5668.3 0.2 2.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putInts (int) count=1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 134.9 7.4 1.0X +OffHeap 0 0 0 95.6 10.5 0.7X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putInts (int) count=8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 846.7 1.2 1.0X +OffHeap 0 0 0 620.7 1.6 0.7X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putInts (int) count=64: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2213.0 0.5 1.0X +OffHeap 0 0 0 2425.5 0.4 1.1X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putInts (int) count=512: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2610.3 0.4 1.0X +OffHeap 0 0 0 4574.9 0.2 1.8X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putInts (int) count=4096: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 2 2 0 2763.7 0.4 1.0X +OffHeap 1 1 0 5566.3 0.2 2.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putInts (int) count=65536: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 24 24 0 2785.3 0.4 1.0X +OffHeap 12 12 0 5650.4 0.2 2.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putLongs (long) count=1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 133.0 7.5 1.0X +OffHeap 0 0 0 95.6 10.5 0.7X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putLongs (long) count=8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 818.0 1.2 1.0X +OffHeap 0 0 0 620.7 1.6 0.8X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putLongs (long) count=64: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2213.0 0.5 1.0X +OffHeap 0 0 0 1634.3 0.6 0.7X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putLongs (long) count=512: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2596.3 0.4 1.0X +OffHeap 0 0 0 1778.7 0.6 0.7X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putLongs (long) count=4096: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 2 2 0 2738.4 0.4 1.0X +OffHeap 2 2 0 1709.1 0.6 0.6X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putLongs (long) count=65536: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 24 24 0 2787.6 0.4 1.0X +OffHeap 39 39 0 1706.4 0.6 0.6X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putNulls ((no value)) count=1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 122.6 8.2 1.0X +OffHeap 0 0 0 92.5 10.8 0.8X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putNulls ((no value)) count=8: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 739.0 1.4 1.0X +OffHeap 0 0 0 604.1 1.7 0.8X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putNulls ((no value)) count=64: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2186.4 0.5 1.0X +OffHeap 0 0 0 2586.5 0.4 1.2X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putNulls ((no value)) count=512: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +OnHeap 0 0 0 2605.6 0.4 1.0X +OffHeap 0 0 0 4708.2 0.2 1.8X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putNulls ((no value)) count=4096: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 2 2 0 2768.2 0.4 1.0X +OffHeap 1 1 0 5588.1 0.2 2.0X + +OpenJDK 64-Bit Server VM 17.0.19+10-LTS on Linux 6.17.0-1013-azure +AMD EPYC 9V74 80-Core Processor +putNulls ((no value)) count=65536: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +OnHeap 24 24 0 2790.8 0.4 1.0X +OffHeap 12 12 1 5652.1 0.2 2.0X + + diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index ba740602b4c2b..03a645acedf56 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -33,6 +33,14 @@ public final class OffHeapColumnVector extends WritableColumnVector { private static final boolean bigEndianPlatform = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN); + // Below this count, byte-fill methods (putBytes / putBooleans) write bytes in an inline + // loop. At or above this count, they call Platform.setMemory which lowers to a native + // memset. 
The JNI fixed cost of setMemory dominates for very short fills; on the + // benchmarked hardware (Apple M4 Max + OpenJDK 21) the crossover sits between 64 and + // 512, so 128 is a conservative choice that avoids regression at small counts while + // retaining the bulk of the asymptotic gain. + private static final int SET_MEMORY_THRESHOLD = 128; + /** * Allocates columns to store elements of each field of the schema off heap. * Capacity is the initial capacity of the vector and it will grow as necessary. Capacity is @@ -151,9 +159,13 @@ public void putBoolean(int rowId, boolean value) { @Override public void putBooleans(int rowId, int count, boolean value) { - byte v = (byte)((value) ? 1 : 0); - for (int i = 0; i < count; ++i) { - Platform.putByte(null, data + rowId + i, v); + byte v = (byte) (value ? 1 : 0); + if (count < SET_MEMORY_THRESHOLD) { + for (int i = 0; i < count; ++i) { + Platform.putByte(null, data + rowId + i, v); + } + } else { + Platform.setMemory(data + rowId, v, count); } } @@ -193,8 +205,12 @@ public void putByte(int rowId, byte value) { @Override public void putBytes(int rowId, int count, byte value) { - for (int i = 0; i < count; ++i) { - Platform.putByte(null, data + rowId + i, value); + if (count < SET_MEMORY_THRESHOLD) { + for (int i = 0; i < count; ++i) { + Platform.putByte(null, data + rowId + i, value); + } + } else { + Platform.setMemory(data + rowId, value, count); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 3e1f4d7a4f838..d5c550cf13cd3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -146,10 +146,8 @@ public void putBoolean(int rowId, boolean value) { @Override public void putBooleans(int rowId, int count, boolean value) { - byte 
v = (byte)((value) ? 1 : 0); - for (int i = 0; i < count; ++i) { - byteData[i + rowId] = v; - } + byte v = (byte) (value ? 1 : 0); + java.util.Arrays.fill(byteData, rowId, rowId + count, v); } @Override @@ -191,9 +189,7 @@ public void putByte(int rowId, byte value) { @Override public void putBytes(int rowId, int count, byte value) { - for (int i = 0; i < count; ++i) { - byteData[i + rowId] = value; - } + java.util.Arrays.fill(byteData, rowId, rowId + count, value); } @Override @@ -253,9 +249,7 @@ public void putShort(int rowId, short value) { @Override public void putShorts(int rowId, int count, short value) { - for (int i = 0; i < count; ++i) { - shortData[i + rowId] = value; - } + java.util.Arrays.fill(shortData, rowId, rowId + count, value); } @Override @@ -395,9 +389,7 @@ public void putLong(int rowId, long value) { @Override public void putLongs(int rowId, int count, long value) { - for (int i = 0; i < count; ++i) { - longData[i + rowId] = value; - } + java.util.Arrays.fill(longData, rowId, rowId + count, value); } @Override diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/WritableColumnVectorBulkFillBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/WritableColumnVectorBulkFillBenchmark.scala new file mode 100644 index 0000000000000..59528831d65dc --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/WritableColumnVectorBulkFillBenchmark.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.vectorized + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.sql.types._ + +/** + * Low-level benchmark for `WritableColumnVector`'s constant-value bulk-fill APIs: + * `putBooleans(rowId, count, value)`, `putBytes(rowId, count, value)`, + * `putShorts(rowId, count, value)`, `putInts(rowId, count, value)`, + * `putLongs(rowId, count, value)`, `putNulls(rowId, count)`. + * + * The count sweep spans from very short runs (where call overhead dominates) to long + * fills (where intrinsics such as `Arrays.fill` / `Unsafe.setMemory` should pay off): + * + * counts = 1, 8, 64, 512, 4096, 65536 + * + * Each timed case repeats one bulk fill `INNER_ITERS` times on the same column vector + * so the steady-state cost dominates per-iteration timer overhead. A `@volatile` sink + * read at the end of the inner loop blocks JIT from dead-code-eliminating the fills + * (`Arrays.fill` on a non-escaping array is otherwise removable). + * + * To run this benchmark: + * {{{ + * 1. build/sbt "sql/Test/runMain <this class>" + * 2. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" + * Results in "benchmarks/WritableColumnVectorBulkFillBenchmark-results.txt". + * 3. GHA: `Run benchmarks` workflow, class = `*WritableColumnVectorBulkFill*`. + * }}} + */ +object WritableColumnVectorBulkFillBenchmark extends BenchmarkBase { + + // Capacity must be >= max(counts) so a single fill covers the requested range. 
+ private val CAPACITY = 65536 + private val INNER_ITERS = 1024 + private val NUM_ITERS = 5 + + private val COUNTS = Seq(1, 8, 64, 512, 4096, 65536) + + // @volatile sinks prevent JIT from dead-code-eliminating the fills. Read one + // element of the just-filled range into each sink at the end of every inner iter. + @volatile private var byteSink: Byte = 0 + @volatile private var shortSink: Short = 0 + @volatile private var intSink: Int = 0 + @volatile private var longSink: Long = 0L + + private def runFor( + label: String, + typeName: String, + newOnHeap: () => WritableColumnVector, + newOffHeap: () => WritableColumnVector, + doFill: (WritableColumnVector, Int) => Unit, + readSink: (WritableColumnVector, Int) => Unit): Unit = { + // One Benchmark per count so the "Per Row (ns)" column reports per-element cost. + // Within a Benchmark, all cases must share the same num: OnHeap and OffHeap both + // do INNER_ITERS * count elements per case. + COUNTS.foreach { count => + val benchmark = new Benchmark( + s"$label ($typeName) count=$count", + (INNER_ITERS.toLong) * count, NUM_ITERS, output = output) + + val onHeap = newOnHeap() + doFill(onHeap, count) // warm + readSink(onHeap, count - 1) + benchmark.addCase("OnHeap") { _ => + var i = 0 + while (i < INNER_ITERS) { + doFill(onHeap, count) + readSink(onHeap, count - 1) + i += 1 + } + } + + val offHeap = newOffHeap() + doFill(offHeap, count) + readSink(offHeap, count - 1) + benchmark.addCase("OffHeap") { _ => + var i = 0 + while (i < INNER_ITERS) { + doFill(offHeap, count) + readSink(offHeap, count - 1) + i += 1 + } + } + benchmark.run() + } + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("WritableColumnVector bulk fill") { + + // putBooleans(rowId, count, value=true) + runFor("putBooleans", "boolean", + () => new OnHeapColumnVector(CAPACITY, BooleanType), + () => new OffHeapColumnVector(CAPACITY, BooleanType), + (v, n) => v.putBooleans(0, n, true), + (v, idx) => byteSink = (if 
(v.getBoolean(idx)) 1 else 0).toByte) + + // putBytes(rowId, count, value=42) + runFor("putBytes", "byte", + () => new OnHeapColumnVector(CAPACITY, ByteType), + () => new OffHeapColumnVector(CAPACITY, ByteType), + (v, n) => v.putBytes(0, n, 42.toByte), + (v, idx) => byteSink = v.getByte(idx)) + + // putShorts(rowId, count, value=42) + runFor("putShorts", "short", + () => new OnHeapColumnVector(CAPACITY, ShortType), + () => new OffHeapColumnVector(CAPACITY, ShortType), + (v, n) => v.putShorts(0, n, 42.toShort), + (v, idx) => shortSink = v.getShort(idx)) + + // putInts(rowId, count, value=42) -- already optimized in SPARK-57024 for OnHeap; + // included for reference / regression detection. + runFor("putInts", "int", + () => new OnHeapColumnVector(CAPACITY, IntegerType), + () => new OffHeapColumnVector(CAPACITY, IntegerType), + (v, n) => v.putInts(0, n, 42), + (v, idx) => intSink = v.getInt(idx)) + + // putLongs(rowId, count, value=42L) + runFor("putLongs", "long", + () => new OnHeapColumnVector(CAPACITY, LongType), + () => new OffHeapColumnVector(CAPACITY, LongType), + (v, n) => v.putLongs(0, n, 42L), + (v, idx) => longSink = v.getLong(idx)) + + // putNulls(rowId, count) -- already optimized in SPARK-57024; reference baseline. + // isAllNull() is type/flag-driven, not state-driven, so repeated putNulls on the + // same vector still executes the fill (does not early-out). + runFor("putNulls", "(no value)", + () => new OnHeapColumnVector(CAPACITY, IntegerType), + () => new OffHeapColumnVector(CAPACITY, IntegerType), + (v, n) => v.putNulls(0, n), + (v, idx) => intSink = if (v.isNullAt(idx)) 1 else 0) + } + } +}