From f7d6733ec5537828d75b36c272dacfe33364db33 Mon Sep 17 00:00:00 2001
From: InCerryGit
Date: Sun, 26 Apr 2026 11:22:41 +0800
Subject: [PATCH] perf: improve StringArray GetString decoding

Decode strings directly from the offsets and values buffers instead of
routing through GetBytes, avoiding duplicate bounds/null/offset work on a
common read path. The value range is still taken with a bounds-checked
Slice, so inconsistent offsets throw ArgumentOutOfRangeException instead of
reading outside the value buffer.

BenchmarkDotNet (StringArrayGetStringBenchmark, Count=1024):
LegacyGetString 23.50 us / 48.08 KB; GetString 17.79 us / 48.08 KB;
LegacyGetStringFromSlice 22.89 us / 48.00 KB; GetStringFromSlice 17.67 us / 48.00 KB.
---
Review note: GetString now slices Values with bounds checking before pinning.
The benchmark figures above and the StringArray.cs post-image blob id below
predate that change and have not been regenerated.

 src/Apache.Arrow/Arrays/StringArray.cs        |  19 ++-
 .../StringArrayGetStringBenchmark.cs          | 133 ++++++++++++++++++
 test/Apache.Arrow.Tests/StringArrayTests.cs   |  99 +++++++++++++
 3 files changed, 248 insertions(+), 3 deletions(-)
 create mode 100644 test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs

diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs
index 4998fae1..a0961ef4 100644
--- a/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/src/Apache.Arrow/Arrays/StringArray.cs
@@ -124,22 +124,35 @@ public string GetString(int index, Encoding encoding = default)
                 return materializedStrings[index];
             }
 
-            ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);
+            if (index < 0 || index >= Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(index));
+            }
 
-            if (isNull)
+            if (IsNull(index))
             {
                 return null;
             }
 
-            if (bytes.Length == 0)
+            ReadOnlySpan<int> offsets = ValueOffsets;
+            int valueOffset = offsets[index];
+            int valueLength = offsets[index + 1] - valueOffset;
+
+            if (valueLength == 0)
             {
                 return string.Empty;
             }
 
+            // Index, null flag and offsets are resolved once above instead of via
+            // GetBytes. The bounds-checked Slice is kept on purpose: inconsistent
+            // offsets must throw ArgumentOutOfRangeException rather than let the
+            // unsafe block read outside the value buffer.
+            ReadOnlySpan<byte> bytes = Values.Slice(valueOffset, valueLength);
+
             unsafe
             {
                 fixed (byte* data = &MemoryMarshal.GetReference(bytes))
                     return encoding.GetString(data, bytes.Length);
             }
         }
 
diff --git a/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs b/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
new file mode 100644
index 00000000..3c499711
--- /dev/null
+++ b/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Text;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+
+namespace Apache.Arrow.Benchmarks
+{
+    [MemoryDiagnoser]
+    [ShortRunJob]
+    public class StringArrayGetStringBenchmark
+    {
+        private StringArray _array;
+        private StringArray _slice;
+
+        [Params(1_024)]
+        public int Count { get; set; }
+
+        [GlobalSetup]
+        public void GlobalSetup()
+        {
+            var builder = new StringArray.Builder();
+            builder.Append("prefix");
+
+            for (int i = 0; i < Count; i++)
+            {
+                if ((i & 7) == 0)
+                {
+                    builder.AppendNull();
+                }
+                else if ((i & 7) == 1)
+                {
+                    builder.Append(string.Empty);
+                }
+                else
+                {
+                    builder.Append($"value-{i:0000}-payload");
+                }
+            }
+
+            builder.Append("suffix");
+
+            _array = builder.Build();
+            _slice = (StringArray)_array.Slice(1, Count);
+        }
+
+        [GlobalCleanup]
+        public void GlobalCleanup()
+        {
+            _slice.Dispose();
+            _array.Dispose();
+        }
+
+        [Benchmark(Baseline = true)]
+        public int LegacyGetString()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _array.Length; i++)
+            {
+                totalLength += GetStringLegacy(_array, i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        [Benchmark]
+        public int GetString()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _array.Length; i++)
+            {
+                totalLength += _array.GetString(i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        [Benchmark]
+        public int LegacyGetStringFromSlice()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _slice.Length; i++)
+            {
+                totalLength += GetStringLegacy(_slice, i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        [Benchmark]
+        public int GetStringFromSlice()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _slice.Length; i++)
+            {
+                totalLength += _slice.GetString(i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        private static string GetStringLegacy(StringArray array, int index)
+        {
+            ReadOnlySpan<byte> bytes = array.GetBytes(index, out bool isNull);
+
+            if (isNull)
+            {
+                return null;
+            }
+
+            if (bytes.Length == 0)
+            {
+                return string.Empty;
+            }
+
+            return Encoding.UTF8.GetString(bytes);
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs
index d79726f1..95b0caa0 100644
--- a/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -82,6 +82,105 @@ public void ReturnsAppendedValueMaterialize(string firstValue, string secondValu
                 Assert.True(array.IsMaterialized());
                 Assert.Equal(firstValue, retrievedValue);
             }
+
+            [Fact]
+            public void ReturnsAppendedValueForSlice()
+            {
+                // Arrange
+                var array = new StringArray.Builder()
+                    .Append("prefix")
+                    .Append("value")
+                    .AppendNull()
+                    .Append(string.Empty)
+                    .Build();
+
+                var slice = (StringArray)array.Slice(1, 3);
+
+                // Act / Assert
+                Assert.Equal("value", slice.GetString(0));
+                Assert.Null(slice.GetString(1));
+                Assert.Equal(string.Empty, slice.GetString(2));
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueForSliceAfterMaterialize()
+            {
+                // Arrange
+                var array = new StringArray.Builder()
+                    .Append("prefix")
+                    .Append("value")
+                    .AppendNull()
+                    .Append(string.Empty)
+                    .Build();
+
+                var slice = (StringArray)array.Slice(1, 3);
+
+                // Act
+                slice.Materialize();
+
+                // Assert
+                Assert.True(slice.IsMaterialized());
+                Assert.Equal("value", slice.GetString(0));
+                Assert.Null(slice.GetString(1));
+                Assert.Equal(string.Empty, slice.GetString(2));
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueWithCustomEncoding()
+            {
+                // Arrange
+                const string expected = "héllø";
+                var array = new StringArray.Builder()
+                    .Append(expected, Encoding.Unicode)
+                    .Build();
+
+                // Act
+                var retrievedValue = array.GetString(0, Encoding.Unicode);
+
+                // Assert
+                Assert.Equal(expected, retrievedValue);
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueWithCustomEncodingAfterMaterialize()
+            {
+                // Arrange
+                const string expected = "héllø";
+                var array = new StringArray.Builder()
+                    .Append(expected, Encoding.Unicode)
+                    .Build();
+
+                // Act
+                array.Materialize(Encoding.Unicode);
+                var retrievedValue = array.GetString(0, Encoding.Unicode);
+
+                // Assert
+                Assert.True(array.IsMaterialized(Encoding.Unicode));
+                Assert.Equal(expected, retrievedValue);
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueForCustomEncodingSliceAfterMaterialize()
+            {
+                // Arrange
+                var array = new StringArray.Builder()
+                    .Append("prefix", Encoding.Unicode)
+                    .Append("héllø", Encoding.Unicode)
+                    .AppendNull()
+                    .Append(string.Empty, Encoding.Unicode)
+                    .Build();
+
+                var slice = (StringArray)array.Slice(1, 3);
+
+                // Act
+                slice.Materialize(Encoding.Unicode);
+
+                // Assert
+                Assert.True(slice.IsMaterialized(Encoding.Unicode));
+                Assert.Equal("héllø", slice.GetString(0, Encoding.Unicode));
+                Assert.Null(slice.GetString(1, Encoding.Unicode));
+                Assert.Equal(string.Empty, slice.GetString(2, Encoding.Unicode));
+            }
         }
 
         public class Builder