Skip to content

Commit ab0eff4

Browse files
committed
Inline IndexOfSemicolon for > 32 bytes, remove old code
1 parent 2a4eea0 commit ab0eff4

File tree

2 files changed

+26
-65
lines changed

2 files changed

+26
-65
lines changed

1brc/App.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -363,7 +363,7 @@ public static unsafe void ProcessSpan(FixedDictionary<Utf8Span, Summary> result,
363363
while (remaining.Length > 0)
364364
{
365365
nuint idx = remaining.IndexOfSemicolon();
366-
nint value = remaining.ParseIntBranchless(idx, out var idx1);
366+
nint value = remaining.ParseInt(idx, out var idx1);
367367
result.GetValueRefOrAddDefault(new Utf8Span(remaining.Pointer, idx)).Apply(value);
368368
remaining = remaining.SliceUnsafe(idx1);
369369
}

1brc/Utf8Span.cs

Lines changed: 25 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -149,12 +149,13 @@ public override int GetHashCode()
149149
public override string ToString() => new((sbyte*)Pointer, 0, (int)Length, Encoding.UTF8);
150150

151151
[MethodImpl(MethodImplOptions.AggressiveInlining)]
152-
public nint ParseIntBranchless(nuint start, out nuint lfIndex) {
152+
public nint ParseInt(nuint start, out nuint lfIndex)
153+
{
153154
// I took it from artsiomkorzun, but he mentions merykitty, while noahfalk mentions RagnarGrootKoerkamp. The trace is lost
154-
155+
155156
const long DOT_BITS = 0x10101000;
156157
const long MAGIC_MULTIPLIER = (100 * 0x1000000 + 10 * 0x10000 + 1);
157-
158+
158159
long word = *(long*)(Pointer + start + 1);
159160
long inverted = ~word;
160161
int dot = BitOperations.TrailingZeroCount(inverted & DOT_BITS);
@@ -166,83 +167,43 @@ public nint ParseIntBranchless(nuint start, out nuint lfIndex) {
166167
lfIndex = start + (uint)(dot >> 3) + 4u;
167168
return (nint)value;
168169
}
169-
170-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
171-
public nint ParseInt(nuint start, out nuint lfIndex)
172-
{
173-
var ptr = Pointer + start + 1;
174-
int sign;
175-
176-
if (*ptr == (byte)'-')
177-
{
178-
ptr++;
179-
sign = -1;
180-
lfIndex = start + 6;
181-
}
182-
else
183-
{
184-
sign = 1;
185-
lfIndex = start + 5;
186-
}
187-
188-
if (ptr[1] != '.')
189-
{
190-
lfIndex++;
191-
return (nint)(ptr[0] * 100u + ptr[1] * 10u + ptr[3] - '0' * 111u) * sign;
192-
}
193-
194-
return (nint)(ptr[0] * 10u + ptr[2] - ('0' * 11u)) * sign;
195-
}
196170

197-
/// <summary>
198-
/// Spec: Station name: non null UTF-8 string of min length 1 character and max length 100 bytes (i.e. this could be 100 one-byte characters, or 50 two-byte characters, etc.)
199-
/// </summary>
200171
[MethodImpl(MethodImplOptions.AggressiveInlining)]
201172
internal nuint IndexOfSemicolon()
202173
{
203174
const nuint vectorSize = 32;
204-
// nuint start = 0; // it's consistently faster with this useless variable (non constant)
205175

206176
if (Vector256.IsHardwareAccelerated)
207177
{
208178
var sepVec = Vector256.Create((byte)';');
209179

210180
var matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(Pointer), sepVec);
211181
var mask = Vector256.ExtractMostSignificantBits(matches);
212-
nuint tzc = (uint)BitOperations.TrailingZeroCount(mask);
213-
214-
if (mask == 0) // For non-taken branches prefer placing them in a "leaf" instead of mask != 0, somewhere on GH they explain why, it would be nice to find.
215-
tzc = IndexOfSemicolonCont(this);
216-
217-
return tzc;
182+
var idx = (nuint)BitOperations.TrailingZeroCount(mask);
218183

219-
[MethodImpl(MethodImplOptions.NoInlining)]
220-
static nuint IndexOfSemicolonCont(Utf8Span span)
184+
if (mask == 0) // 32-63
221185
{
222-
// A nicer version would be just a recursive call, even not here but above instead of this function.
223-
// It's as fast for the default case and very close for 10K. Yet, this manually unrolled continuation is faster for 10K.
224-
// return vectorSize + span.SliceUnsafe(vectorSize).IndexOfSemicolon();
225-
226-
var sepVec = Vector256.Create((byte)';');
227-
var matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(span.Pointer + vectorSize), sepVec);
228-
var mask = Vector256.ExtractMostSignificantBits(matches);
229-
var tzc = (uint)BitOperations.TrailingZeroCount(mask);
230-
if (mask != 0)
231-
return vectorSize + tzc;
232-
233-
const nuint vectorSize2 = 2 * vectorSize;
234-
matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(span.Pointer + vectorSize2), sepVec);
186+
matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(Pointer + vectorSize), sepVec);
235187
mask = Vector256.ExtractMostSignificantBits(matches);
236-
tzc = (uint)BitOperations.TrailingZeroCount(mask);
237-
if (mask != 0)
238-
return vectorSize2 + tzc;
239-
240-
const nuint vectorSize3 = 3 * vectorSize;
241-
matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(span.Pointer + vectorSize3), sepVec);
242-
mask = Vector256.ExtractMostSignificantBits(matches);
243-
tzc = (uint)BitOperations.TrailingZeroCount(mask);
244-
return vectorSize3 + tzc;
188+
idx = vectorSize + (uint)BitOperations.TrailingZeroCount(mask);
189+
190+
if (mask == 0) // 64-95
191+
{
192+
// const nuint vectorSize2 = 2 * vectorSize;
193+
matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(Pointer + 2 * vectorSize), sepVec);
194+
mask = Vector256.ExtractMostSignificantBits(matches);
195+
idx = 2 * vectorSize + (uint)BitOperations.TrailingZeroCount(mask);
196+
197+
if (mask == 0) // 96-127
198+
{
199+
matches = Vector256.Equals(Unsafe.ReadUnaligned<Vector256<byte>>(Pointer + 3 * vectorSize), sepVec);
200+
mask = Vector256.ExtractMostSignificantBits(matches);
201+
idx = 3 * vectorSize + (uint)BitOperations.TrailingZeroCount(mask);
202+
}
203+
}
245204
}
205+
206+
return idx;
246207
}
247208

248209
return IndexOf(0, (byte)';');

0 commit comments

Comments
 (0)