diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
index fb11022f..b43da99c 100644
--- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
+++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
@@ -3,6 +3,7 @@
     <OutputType>Exe</OutputType>
    <TargetFrameworks>net48;net6.0</TargetFrameworks>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchFlat.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchFlat.cs
new file mode 100644
index 00000000..f0523066
--- /dev/null
+++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchFlat.cs
@@ -0,0 +1,320 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+#if NETCOREAPP3_1_OR_GREATER
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+
+namespace BitFaster.Caching.Benchmarks.Lfu
+{
+    internal class CmSketchFlat<T, I> where I : struct, IsaProbe
+    {
+        // A mixture of seeds from FNV-1a, CityHash, and Murmur3
+        private static readonly ulong[] Seed = { 0xc3a5c85c97cb3127L, 0xb492b66fbe98f273L, 0x9ae16a3b2f90404fL, 0xcbf29ce484222325L };
+        private static readonly long ResetMask = 0x7777777777777777L;
+        private static readonly long OneMask = 0x1111111111111111L;
+
+        private int sampleSize;
+        private int tableMask;
+        private long[] table;
+        private int size;
+
+        private readonly IEqualityComparer<T> comparer;
+
+        /// <summary>
+        /// Initializes a new instance of the CmSketch class with the specified maximum size and equality comparer.
+        /// </summary>
+        /// <param name="maximumSize">The maximum size.</param>
+        /// <param name="comparer">The equality comparer.</param>
+        public CmSketchFlat(long maximumSize, IEqualityComparer<T> comparer)
+        {
+            EnsureCapacity(maximumSize);
+            this.comparer = comparer;
+        }
+
+        /// <summary>
+        /// Gets the reset sample size.
+        /// </summary>
+        public int ResetSampleSize => this.sampleSize;
+
+        /// <summary>
+        /// Gets the size.
+        /// </summary>
+        public int Size => this.size;
+
+        /// <summary>
+        /// Estimate the frequency of the specified value.
+        /// </summary>
+        /// <param name="value">The value.</param>
+        /// <returns>The estimated frequency of the value.</returns>
+        public int EstimateFrequency(T value)
+        {
+#if !NETCOREAPP3_1_OR_GREATER
+            return EstimateFrequencyStd(value);
+#else
+
+            I isa = default;
+
+            if (isa.IsAvx2Supported)
+            {
+                return EstimateFrequencyAvx(value);
+            }
+            else
+            {
+                return EstimateFrequencyStd(value);
+            }
+#endif
+        }
+
+        /// <summary>
+        /// Increment the count of the specified value.
+        /// </summary>
+        /// <param name="value">The value.</param>
+        public void Increment(T value)
+        {
+#if !NETCOREAPP3_1_OR_GREATER
+            IncrementStd(value);
+#else
+
+            I isa = default;
+
+            if (isa.IsAvx2Supported)
+            {
+                IncrementAvx(value);
+            }
+            else
+            {
+                IncrementStd(value);
+            }
+#endif
+        }
+
+        /// <summary>
+        /// Clears the count for all items.
+        /// </summary>
+        public void Clear()
+        {
+            table = new long[table.Length];
+            size = 0;
+        }
+
+        private int EstimateFrequencyStd(T value)
+        {
+            int hash = Spread(comparer.GetHashCode(value));
+
+            int start = (hash & 3) << 2;
+            int frequency = int.MaxValue;
+
+            for (int i = 0; i < 4; i++)
+            {
+                int index = IndexOf(hash, i);
+                int count = (int)(((ulong)table[index] >> ((start + i) << 2)) & 0xfL);
+                frequency = Math.Min(frequency, count);
+            }
+            return frequency;
+        }
+
+        private void IncrementStd(T value)
+        {
+            int hash = Spread(comparer.GetHashCode(value));
+            int start = (hash & 3) << 2;
+
+            // Loop unrolling improves throughput by 5m ops/s
+            int index0 = IndexOf(hash, 0);
+            int index1 = IndexOf(hash, 1);
+            int index2 = IndexOf(hash, 2);
+            int index3 = IndexOf(hash, 3);
+
+            bool added = IncrementAt(index0, start);
+            added |= IncrementAt(index1, start + 1);
+            added |= IncrementAt(index2, start + 2);
+            added |= IncrementAt(index3, start + 3);
+
+            if (added && (++size == sampleSize))
+            {
+                Reset();
+            }
+        }
+
+        private bool IncrementAt(int i, int j)
+        {
+            int offset = j << 2;
+            long mask = (0xfL << offset);
+            if ((table[i] & mask) != mask)
+            {
+                table[i] += (1L << offset);
+                return true;
+            }
+            return false;
+        }
+
+        private void Reset()
+        {
+            // unroll, almost 2x faster
+            int count0 = 0;
+            int count1 = 0;
+            int count2 = 0;
+            int count3 = 0;
+
+            for (int i = 0; i < table.Length; i += 4)
+            {
+                count0 += BitOps.BitCount(table[i] & OneMask);
+                count1 += BitOps.BitCount(table[i + 1] & OneMask);
+                count2 += BitOps.BitCount(table[i + 2] & OneMask);
+                count3 += BitOps.BitCount(table[i + 3] & OneMask);
+
+                table[i] = (long)((ulong)table[i] >> 1) & ResetMask;
+                table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask;
+                table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask;
+                table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask;
+            }
+
+            count0 = (count0 + count1) + (count2 + count3);
+
+            size = (size - (count0 >> 2)) >> 1;
+        }
+
+        private void EnsureCapacity(long maximumSize)
+        {
+            int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1);
+
+            // clamp to 4 as min size
+            maximum = Math.Max(4, maximum);
+
+            table = new long[(maximum == 0) ? 1 : BitOps.CeilingPowerOfTwo(maximum)];
+            tableMask = Math.Max(0, table.Length - 1);
+            sampleSize = (maximumSize == 0) ? 10 : (10 * maximum);
+
+            size = 0;
+        }
+
+        private int IndexOf(int item, int i)
+        {
+            ulong hash = ((ulong)item + Seed[i]) * Seed[i];
+            hash += (hash >> 32);
+            return ((int)hash) & tableMask;
+        }
+
+        private int Spread(int x)
+        {
+            uint y = (uint)x;
+            y = ((y >> 16) ^ y) * 0x45d9f3b;
+            y = ((y >> 16) ^ y) * 0x45d9f3b;
+            return (int)((y >> 16) ^ y);
+        }
+
+#if NETCOREAPP3_1_OR_GREATER
+        private unsafe int EstimateFrequencyAvx(T value)
+        {
+            int hash = Spread(comparer.GetHashCode(value));
+            int start = (hash & 3) << 2;
+
+            fixed (long* tablePtr = &table[0])
+            {
+                var tableVector = Avx2.GatherVector256(tablePtr, IndexesOfAvx(hash), 8).AsUInt64();
+
+                Vector256<ulong> starts = Vector256.Create(0UL, 1UL, 2UL, 3UL);
+                starts = Avx2.Add(starts, Vector256.Create((ulong)start));
+                starts = Avx2.ShiftLeftLogical(starts, 2);
+
+                tableVector = Avx2.ShiftRightLogicalVariable(tableVector, starts);
+                tableVector = Avx2.And(tableVector, Vector256.Create(0xfUL));
+
+                Vector256<int> permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7);
+                Vector128<ushort> lower = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask)
+                    .GetLower()
+                    .AsUInt16();
+
+                // set the zeroed high parts of the long value to ushort.Max
+                var masked = Avx2.Blend(lower, Vector128.Create(ushort.MaxValue), 0b10101010);
+                return Avx2.MinHorizontal(masked).GetElement(0);
+            }
+        }
+
+        private unsafe void IncrementAvx(T value)
+        {
+            int hash = Spread(comparer.GetHashCode(value));
+            int start = (hash & 3) << 2;
+
+            Vector128<int> indexes = IndexesOfAvx(hash);
+
+            fixed (long* tablePtr = &table[0])
+            {
+                var tableVector = Avx2.GatherVector256(tablePtr, indexes, 8);
+
+                // offset = j << 2, where j [start+0, start+1, start+2, start+3]
+                Vector256<ulong> offset = Vector256.Create((ulong)start);
+                Vector256<ulong> add = Vector256.Create(0UL, 1UL, 2UL, 3UL);
+                offset = Avx2.Add(offset, add);
+                offset = Avx2.ShiftLeftLogical(offset, 2);
+
+                // mask = (0xfL << offset)
+                Vector256<long> fifteen = Vector256.Create(0xfL);
+                Vector256<long> mask = Avx2.ShiftLeftLogicalVariable(fifteen, offset);
+
+                // (table[i] & mask) != mask)
+                // Note masked is 'equal' - therefore use AndNot below
+                Vector256<long> masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask);
+
+                // 1L << offset
+                Vector256<long> inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offset);
+
+                // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters)
+                inc = Avx2.AndNot(masked, inc);
+
+                *(tablePtr + indexes.GetElement(0)) += inc.GetElement(0);
+                *(tablePtr + indexes.GetElement(1)) += inc.GetElement(1);
+                *(tablePtr + indexes.GetElement(2)) += inc.GetElement(2);
+                *(tablePtr + indexes.GetElement(3)) += inc.GetElement(3);
+
+                Vector256<byte> result = Avx2.CompareEqual(masked.AsByte(), Vector256.Create(0).AsByte());
+                bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111));
+
+                if (wasInc && (++size == sampleSize))
+                {
+                    Reset();
+                }
+            }
+        }
+
+        private Vector128<int> IndexesOfAvx(int item)
+        {
+            Vector256<ulong> VectorSeed = Vector256.Create(0xc3a5c85c97cb3127L, 0xb492b66fbe98f273L, 0x9ae16a3b2f90404fL, 0xcbf29ce484222325L);
+            Vector256<ulong> hash = Vector256.Create((ulong)item);
+            hash = Avx2.Add(hash, VectorSeed);
+            hash = Multiply(hash, VectorSeed);
+
+            Vector256<ulong> shift = Vector256.Create(32UL);
+            Vector256<ulong> shifted = Avx2.ShiftRightLogicalVariable(hash, shift);
+            hash = Avx2.Add(hash, shifted);
+
+            // Move [a1, a2, b1, b2, c1, c2, d1, d2]
+            // To   [a1, b1, c1, d1, a2, b2, c2, d2]
+            // then GetLower() [a1, b1, c1, d1]
+            Vector256<int> permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7);
+            Vector128<int> f = Avx2.PermuteVar8x32(hash.AsInt32(), permuteMask)
+                .GetLower();
+
+            Vector128<int> maskVector = Vector128.Create(tableMask);
+            return Avx2.And(f, maskVector);
+        }
+
+        // taken from Agner Fog's vector library, see https://github.com/vectorclass/version2, vectori256.h
+        private static Vector256<ulong> Multiply(Vector256<ulong> a, Vector256<ulong> b)
+        {
+            // instruction does not exist. Split into 32-bit multiplies
+            Vector256<int> bswap = Avx2.Shuffle(b.AsInt32(), 0xB1);              // swap H<->L
+            Vector256<int> prodlh = Avx2.MultiplyLow(a.AsInt32(), bswap);        // 32 bit L*H products
+            Vector256<int> zero = Vector256.Create(0);                           // 0
+            Vector256<int> prodlh2 = Avx2.HorizontalAdd(prodlh, zero);           // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
+            Vector256<int> prodlh3 = Avx2.Shuffle(prodlh2, 0x73);                // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
+            Vector256<ulong> prodll = Avx2.Multiply(a.AsUInt32(), b.AsUInt32()); // a0Lb0L,a1Lb1L, 64 bit unsigned products
+            return Avx2.Add(prodll.AsInt64(), prodlh3.AsInt64()).AsUInt64();     // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
+        }
+#endif
+    }
+}
diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
index 9ac592fb..1c37b0e6 100644
--- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
+++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
@@ -13,39 +13,62 @@ public class SketchFrequency
     {
         const int sketchSize = 1_048_576;
         const int iterations = 1_048_576;
-
-        private static CmSketchCore<int, DisableHardwareIntrinsics> std = new CmSketchCore<int, DisableHardwareIntrinsics>(sketchSize, EqualityComparer<int>.Default);
-        private static CmSketchCore<int, DetectIsa> avx = new CmSketchCore<int, DetectIsa>(sketchSize, EqualityComparer<int>.Default);
+
+        private CmSketchFlat<int, DisableHardwareIntrinsics> flatStd;
+        private CmSketchFlat<int, DetectIsa> flatAvx;
+
+        private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
+        private CmSketchCore<int, DetectIsa> blockAvx;
+
+        [Params(32_768, 524_288, 8_388_608, 134_217_728)]
+        public int Size { get; set; }
 
         [GlobalSetup]
         public void Setup()
         {
-            for (int i = 0; i < iterations; i++)
-            {
-                if (i % 3 == 0)
-                {
-                    std.Increment(i);
-                    avx.Increment(i);
-                }
-            }
+            flatStd = new CmSketchFlat<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+            flatAvx = new CmSketchFlat<int, DetectIsa>(Size, EqualityComparer<int>.Default);
+
+            blockStd = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+            blockAvx = new CmSketchCore<int, DetectIsa>(Size, EqualityComparer<int>.Default);
         }
 
         [Benchmark(Baseline = true, OperationsPerInvoke = iterations)]
-        public int EstimateFrequency()
+        public int FrequencyFlat()
+        {
+            int count = 0;
+            for (int i = 0; i < iterations; i++)
+                count += flatStd.EstimateFrequency(i) > flatStd.EstimateFrequency(i + 1) ? 1: 0;
+
+            return count;
+        }
+
+        [Benchmark(OperationsPerInvoke = iterations)]
+        public int FrequencyFlatAvx()
+        {
+            int count = 0;
+            for (int i = 0; i < iterations; i++)
+                count += flatAvx.EstimateFrequency(i) > flatAvx.EstimateFrequency(i + 1) ? 1 : 0;
+
+            return count;
+        }
+
+        [Benchmark(OperationsPerInvoke = iterations)]
+        public int FrequencyBlock()
         {
             int count = 0;
             for (int i = 0; i < iterations; i++)
-                count += std.EstimateFrequency(i) > std.EstimateFrequency(i + 1) ? 1: 0;
+                count += blockStd.EstimateFrequency(i) > blockStd.EstimateFrequency(i + 1) ? 1 : 0;
 
             return count;
         }
 
         [Benchmark(OperationsPerInvoke = iterations)]
-        public int EstimateFrequencyAvx()
+        public int FrequencyBlockAvx()
         {
             int count = 0;
             for (int i = 0; i < iterations; i++)
-                count += avx.EstimateFrequency(i) > avx.EstimateFrequency(i + 1) ? 1 : 0;
+                count += blockAvx.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0;
 
             return count;
         }
diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
index 5e3a984f..eb005032 100644
--- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
+++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
@@ -11,26 +11,60 @@ namespace BitFaster.Caching.Benchmarks.Lfu
     [HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")]
     public class SketchIncrement
     {
-        const int sketchSize = 1_048_576;
         const int iterations = 1_048_576;
-        private static CmSketchCore<int, DisableHardwareIntrinsics> std = new CmSketchCore<int, DisableHardwareIntrinsics>(sketchSize, EqualityComparer<int>.Default);
-        private static CmSketchCore<int, DetectIsa> avx = new CmSketchCore<int, DetectIsa>(sketchSize, EqualityComparer<int>.Default);
+
+        private CmSketchFlat<int, DisableHardwareIntrinsics> flatStd;
+        private CmSketchFlat<int, DetectIsa> flatAvx;
+
+        private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
+        private CmSketchCore<int, DetectIsa> blockAvx;
+
+        [Params(32_768, 524_288, 8_388_608, 134_217_728)]
+        public int Size { get; set; }
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            flatStd = new CmSketchFlat<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+            flatAvx = new CmSketchFlat<int, DetectIsa>(Size, EqualityComparer<int>.Default);
+
+            blockStd = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+            blockAvx = new CmSketchCore<int, DetectIsa>(Size, EqualityComparer<int>.Default);
+        }
 
         [Benchmark(Baseline = true, OperationsPerInvoke = iterations)]
-        public void Inc()
+        public void IncFlat()
+        {
+            for (int i = 0; i < iterations; i++)
+            {
+                flatStd.Increment(i);
+            }
+        }
+
+        [Benchmark(OperationsPerInvoke = iterations)]
+        public void IncFlatAvx()
+        {
+            for (int i = 0; i < iterations; i++)
+            {
+                flatAvx.Increment(i);
+            }
+        }
+
+        [Benchmark(OperationsPerInvoke = iterations)]
+        public void IncBlock()
         {
             for (int i = 0; i < iterations; i++)
             {
-                std.Increment(i);
+                blockStd.Increment(i);
            }
         }
 
         [Benchmark(OperationsPerInvoke = iterations)]
-        public void IncAvx()
+        public void IncBlockAvx()
         {
             for (int i = 0; i < iterations; i++)
             {
-                avx.Increment(i);
+                blockAvx.Increment(i);
             }
         }
     }