diff --git a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
index fb11022f..b43da99c 100644
--- a/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
+++ b/BitFaster.Caching.Benchmarks/BitFaster.Caching.Benchmarks.csproj
@@ -3,6 +3,7 @@
<OutputType>Exe</OutputType>
<TargetFrameworks>net48;net6.0</TargetFrameworks>
+ <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchFlat.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchFlat.cs
new file mode 100644
index 00000000..f0523066
--- /dev/null
+++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchFlat.cs
@@ -0,0 +1,320 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+#if NETCOREAPP3_1_OR_GREATER
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
+
+namespace BitFaster.Caching.Benchmarks.Lfu
+{
+ internal class CmSketchFlat<T, I> where I : struct, IsaProbe
+ {
+ // A mixture of seeds from FNV-1a, CityHash, and Murmur3
+ private static readonly ulong[] Seed = { 0xc3a5c85c97cb3127L, 0xb492b66fbe98f273L, 0x9ae16a3b2f90404fL, 0xcbf29ce484222325L };
+ private static readonly long ResetMask = 0x7777777777777777L;
+ private static readonly long OneMask = 0x1111111111111111L;
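+ // Each 64-bit table entry packs sixteen 4-bit counters. OneMask selects the low bit of every
+ // nibble (used by Reset to count the odd counters truncated by halving); ResetMask clears the
+ // top bit of each nibble, discarding the bit that the right shift in Reset pushes down from
+ // the counter above.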
+
+ private int sampleSize;
+ private int tableMask;
+ private long[] table;
+ private int size;
+
+ private readonly IEqualityComparer<T> comparer;
+
+ /// <summary>
+ /// Initializes a new instance of the CmSketchFlat class with the specified maximum size and equality comparer.
+ /// </summary>
+ /// <param name="maximumSize">The maximum size.</param>
+ /// <param name="comparer">The equality comparer.</param>
+ public CmSketchFlat(long maximumSize, IEqualityComparer<T> comparer)
+ {
+ EnsureCapacity(maximumSize);
+ this.comparer = comparer;
+ }
+
+ /// <summary>
+ /// Gets the reset sample size.
+ /// </summary>
+ public int ResetSampleSize => this.sampleSize;
+
+ /// <summary>
+ /// Gets the size.
+ /// </summary>
+ public int Size => this.size;
+
+ /// <summary>
+ /// Estimate the frequency of the specified value.
+ /// </summary>
+ /// <param name="value">The value.</param>
+ /// <returns>The estimated frequency of the value.</returns>
+ public int EstimateFrequency(T value)
+ {
+#if !NETCOREAPP3_1_OR_GREATER
+ return EstimateFrequencyStd(value);
+#else
+
+ I isa = default;
+
+ if (isa.IsAvx2Supported)
+ {
+ return EstimateFrequencyAvx(value);
+ }
+ else
+ {
+ return EstimateFrequencyStd(value);
+ }
+#endif
+ }
+
+ /// <summary>
+ /// Increment the count of the specified value.
+ /// </summary>
+ /// <param name="value">The value.</param>
+ public void Increment(T value)
+ {
+#if !NETCOREAPP3_1_OR_GREATER
+ IncrementStd(value);
+#else
+
+ I isa = default;
+
+ if (isa.IsAvx2Supported)
+ {
+ IncrementAvx(value);
+ }
+ else
+ {
+ IncrementStd(value);
+ }
+#endif
+ }
+
+ /// <summary>
+ /// Clears the count for all items.
+ /// </summary>
+ public void Clear()
+ {
+ table = new long[table.Length];
+ size = 0;
+ }
+
+ private int EstimateFrequencyStd(T value)
+ {
+ int hash = Spread(comparer.GetHashCode(value));
+
+ int start = (hash & 3) << 2;
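+ // The low two hash bits select one of four counter groups inside each 64-bit word; depth i
+ // reads the 4-bit counter at nibble (start + i) of an independently indexed word.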
+ int frequency = int.MaxValue;
+
+ for (int i = 0; i < 4; i++)
+ {
+ int index = IndexOf(hash, i);
+ int count = (int)(((ulong)table[index] >> ((start + i) << 2)) & 0xfL);
+ frequency = Math.Min(frequency, count);
+ }
+ return frequency;
+ }
+
+ private void IncrementStd(T value)
+ {
+ int hash = Spread(comparer.GetHashCode(value));
+ int start = (hash & 3) << 2;
+
+ // Loop unrolling improves throughput by 5m ops/s
+ int index0 = IndexOf(hash, 0);
+ int index1 = IndexOf(hash, 1);
+ int index2 = IndexOf(hash, 2);
+ int index3 = IndexOf(hash, 3);
+
+ bool added = IncrementAt(index0, start);
+ added |= IncrementAt(index1, start + 1);
+ added |= IncrementAt(index2, start + 2);
+ added |= IncrementAt(index3, start + 3);
+
+ if (added && (++size == sampleSize))
+ {
+ Reset();
+ }
+ }
+
+ private bool IncrementAt(int i, int j)
+ {
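+ // Counters saturate at 15: only increment while the 4-bit nibble is below 0xf.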
+ int offset = j << 2;
+ long mask = (0xfL << offset);
+ if ((table[i] & mask) != mask)
+ {
+ table[i] += (1L << offset);
+ return true;
+ }
+ return false;
+ }
+
+ private void Reset()
+ {
+ // unroll, almost 2x faster
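+ // Halving: count odd counters via OneMask, shift each word right by one and clear the bits
+ // that cross nibble boundaries, then reduce size to compensate for the truncated counters.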
+ int count0 = 0;
+ int count1 = 0;
+ int count2 = 0;
+ int count3 = 0;
+
+ for (int i = 0; i < table.Length; i += 4)
+ {
+ count0 += BitOps.BitCount(table[i] & OneMask);
+ count1 += BitOps.BitCount(table[i + 1] & OneMask);
+ count2 += BitOps.BitCount(table[i + 2] & OneMask);
+ count3 += BitOps.BitCount(table[i + 3] & OneMask);
+
+ table[i] = (long)((ulong)table[i] >> 1) & ResetMask;
+ table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask;
+ table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask;
+ table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask;
+ }
+
+ count0 = (count0 + count1) + (count2 + count3);
+
+ size = (size - (count0 >> 2)) >> 1;
+ }
+
+ private void EnsureCapacity(long maximumSize)
+ {
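+ // Round the table length up to a power of two so IndexOf can mask rather than mod;
+ // counters are halved once roughly 10x maximumSize increments have been recorded.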
+ int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1);
+
+ // clamp to 4 as min size
+ maximum = Math.Max(4, maximum);
+
+ table = new long[(maximum == 0) ? 1 : BitOps.CeilingPowerOfTwo(maximum)];
+ tableMask = Math.Max(0, table.Length - 1);
+ sampleSize = (maximumSize == 0) ? 10 : (10 * maximum);
+
+ size = 0;
+ }
+
+ private int IndexOf(int item, int i)
+ {
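+ // Multiplicative hashing with a different seed per depth gives four independent table indexes.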
+ ulong hash = ((ulong)item + Seed[i]) * Seed[i];
+ hash += (hash >> 32);
+ return ((int)hash) & tableMask;
+ }
+
+ private int Spread(int x)
+ {
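+ // Bit mixer to spread low-quality hash codes before indexing.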
+ uint y = (uint)x;
+ y = ((y >> 16) ^ y) * 0x45d9f3b;
+ y = ((y >> 16) ^ y) * 0x45d9f3b;
+ return (int)((y >> 16) ^ y);
+ }
+
+#if NETCOREAPP3_1_OR_GREATER
+ private unsafe int EstimateFrequencyAvx(T value)
+ {
+ int hash = Spread(comparer.GetHashCode(value));
+ int start = (hash & 3) << 2;
+
+ fixed (long* tablePtr = &table[0])
+ {
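+ // Gather the four counter words with one instruction, shift each 64-bit lane so the target
+ // 4-bit counter lands in the low nibble, then take the horizontal minimum of the four lanes.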
+ var tableVector = Avx2.GatherVector256(tablePtr, IndexesOfAvx(hash), 8).AsUInt64();
+
+ Vector256<ulong> starts = Vector256.Create(0UL, 1UL, 2UL, 3UL);
+ starts = Avx2.Add(starts, Vector256.Create((ulong)start));
+ starts = Avx2.ShiftLeftLogical(starts, 2);
+
+ tableVector = Avx2.ShiftRightLogicalVariable(tableVector, starts);
+ tableVector = Avx2.And(tableVector, Vector256.Create(0xfUL));
+
+ Vector256<int> permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7);
+ Vector128<ushort> lower = Avx2.PermuteVar8x32(tableVector.AsInt32(), permuteMask)
+ .GetLower()
+ .AsUInt16();
+
+ // set the zeroed high parts of the long value to ushort.Max
+ var masked = Avx2.Blend(lower, Vector128.Create(ushort.MaxValue), 0b10101010);
+ return Avx2.MinHorizontal(masked).GetElement(0);
+ }
+ }
+
+ private unsafe void IncrementAvx(T value)
+ {
+ int hash = Spread(comparer.GetHashCode(value));
+ int start = (hash & 3) << 2;
+
+ Vector128<int> indexes = IndexesOfAvx(hash);
+
+ fixed (long* tablePtr = &table[0])
+ {
+ var tableVector = Avx2.GatherVector256(tablePtr, indexes, 8);
+
+ // offset = j << 2, where j [start+0, start+1, start+2, start+3]
+ Vector256<ulong> offset = Vector256.Create((ulong)start);
+ Vector256<ulong> add = Vector256.Create(0UL, 1UL, 2UL, 3UL);
+ offset = Avx2.Add(offset, add);
+ offset = Avx2.ShiftLeftLogical(offset, 2);
+
+ // mask = (0xfL << offset)
+ Vector256<long> fifteen = Vector256.Create(0xfL);
+ Vector256<long> mask = Avx2.ShiftLeftLogicalVariable(fifteen, offset);
+
+ // (table[i] & mask) != mask)
+ // Note masked is 'equal' - therefore use AndNot below
+ Vector256<long> masked = Avx2.CompareEqual(Avx2.And(tableVector, mask), mask);
+
+ // 1L << offset
+ Vector256<long> inc = Avx2.ShiftLeftLogicalVariable(Vector256.Create(1L), offset);
+
+ // Mask to zero out non matches (add zero below) - first operand is NOT then AND result (order matters)
+ inc = Avx2.AndNot(masked, inc);
+
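+ // AVX2 has no scatter instruction, so the four updated lanes are written back individually.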
+ *(tablePtr + indexes.GetElement(0)) += inc.GetElement(0);
+ *(tablePtr + indexes.GetElement(1)) += inc.GetElement(1);
+ *(tablePtr + indexes.GetElement(2)) += inc.GetElement(2);
+ *(tablePtr + indexes.GetElement(3)) += inc.GetElement(3);
+
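+ // Every compare lane zero means no counter was saturated, so all four were incremented.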
+ Vector256<byte> result = Avx2.CompareEqual(masked.AsByte(), Vector256.Create(0).AsByte());
+ bool wasInc = Avx2.MoveMask(result.AsByte()) == unchecked((int)(0b1111_1111_1111_1111_1111_1111_1111_1111));
+
+ if (wasInc && (++size == sampleSize))
+ {
+ Reset();
+ }
+ }
+ }
+
+ private Vector128<int> IndexesOfAvx(int item)
+ {
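+ // Vectorized IndexOf: computes the table index for all four depths at once.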
+ Vector256<ulong> VectorSeed = Vector256.Create(0xc3a5c85c97cb3127L, 0xb492b66fbe98f273L, 0x9ae16a3b2f90404fL, 0xcbf29ce484222325L);
+ Vector256<ulong> hash = Vector256.Create((ulong)item);
+ hash = Avx2.Add(hash, VectorSeed);
+ hash = Multiply(hash, VectorSeed);
+
+ Vector256<ulong> shift = Vector256.Create(32UL);
+ Vector256<ulong> shifted = Avx2.ShiftRightLogicalVariable(hash, shift);
+ hash = Avx2.Add(hash, shifted);
+
+ // Move [a1, a2, b1, b2, c1, c2, d1, d2]
+ // To [a1, b1, c1, d1, a2, b2, c2, d2]
+ // then GetLower() [a1, b1, c1, d1]
+ Vector256<int> permuteMask = Vector256.Create(0, 2, 4, 6, 1, 3, 5, 7);
+ Vector128<int> f = Avx2.PermuteVar8x32(hash.AsInt32(), permuteMask)
+ .GetLower();
+
+ Vector128<int> maskVector = Vector128.Create(tableMask);
+ return Avx2.And(f, maskVector);
+ }
+
+ // taken from Agner Fog's vector library, see https://github.com/vectorclass/version2, vectori256.h
+ private static Vector256<ulong> Multiply(Vector256<ulong> a, Vector256<ulong> b)
+ {
+ // a 64-bit packed multiply instruction does not exist in AVX2, so split into 32-bit multiplies
+ Vector256<int> bswap = Avx2.Shuffle(b.AsInt32(), 0xB1); // swap H<->L
+ Vector256<int> prodlh = Avx2.MultiplyLow(a.AsInt32(), bswap); // 32 bit L*H products
+ Vector256<int> zero = Vector256.Create(0); // 0
+ Vector256<int> prodlh2 = Avx2.HorizontalAdd(prodlh, zero); // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
+ Vector256<int> prodlh3 = Avx2.Shuffle(prodlh2, 0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
+ Vector256<ulong> prodll = Avx2.Multiply(a.AsUInt32(), b.AsUInt32()); // a0Lb0L,a1Lb1L, 64 bit unsigned products
+ return Avx2.Add(prodll.AsInt64(), prodlh3.AsInt64()).AsUInt64(); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
+ }
+#endif
+ }
+}
diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
index 9ac592fb..1c37b0e6 100644
--- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
+++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
@@ -13,39 +13,62 @@ public class SketchFrequency
{
const int sketchSize = 1_048_576;
const int iterations = 1_048_576;
-
- private static CmSketchCore<int, DisableHardwareIntrinsics> std = new CmSketchCore<int, DisableHardwareIntrinsics>(sketchSize, EqualityComparer<int>.Default);
- private static CmSketchCore<int, DetectIsaFeatures> avx = new CmSketchCore<int, DetectIsaFeatures>(sketchSize, EqualityComparer<int>.Default);
+
+ private CmSketchFlat<int, DisableHardwareIntrinsics> flatStd;
+ private CmSketchFlat<int, DetectIsaFeatures> flatAvx;
+
+ private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
+ private CmSketchCore<int, DetectIsaFeatures> blockAvx;
+
+ [Params(32_768, 524_288, 8_388_608, 134_217_728)]
+ public int Size { get; set; }
[GlobalSetup]
public void Setup()
{
- for (int i = 0; i < iterations; i++)
- {
- if (i % 3 == 0)
- {
- std.Increment(i);
- avx.Increment(i);
- }
- }
+ flatStd = new CmSketchFlat<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+ flatAvx = new CmSketchFlat<int, DetectIsaFeatures>(Size, EqualityComparer<int>.Default);
+
+ blockStd = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+ blockAvx = new CmSketchCore<int, DetectIsaFeatures>(Size, EqualityComparer<int>.Default);
}
[Benchmark(Baseline = true, OperationsPerInvoke = iterations)]
- public int EstimateFrequency()
+ public int FrequencyFlat()
+ {
+ int count = 0;
+ for (int i = 0; i < iterations; i++)
+ count += flatStd.EstimateFrequency(i) > flatStd.EstimateFrequency(i + 1) ? 1: 0;
+
+ return count;
+ }
+
+ [Benchmark(OperationsPerInvoke = iterations)]
+ public int FrequencyFlatAvx()
+ {
+ int count = 0;
+ for (int i = 0; i < iterations; i++)
+ count += flatAvx.EstimateFrequency(i) > flatAvx.EstimateFrequency(i + 1) ? 1 : 0;
+
+ return count;
+ }
+
+ [Benchmark(OperationsPerInvoke = iterations)]
+ public int FrequencyBlock()
{
int count = 0;
for (int i = 0; i < iterations; i++)
- count += std.EstimateFrequency(i) > std.EstimateFrequency(i + 1) ? 1: 0;
+ count += blockStd.EstimateFrequency(i) > blockStd.EstimateFrequency(i + 1) ? 1 : 0;
return count;
}
[Benchmark(OperationsPerInvoke = iterations)]
- public int EstimateFrequencyAvx()
+ public int FrequencyBlockAvx()
{
int count = 0;
for (int i = 0; i < iterations; i++)
- count += avx.EstimateFrequency(i) > avx.EstimateFrequency(i + 1) ? 1 : 0;
+ count += blockAvx.EstimateFrequency(i) > blockAvx.EstimateFrequency(i + 1) ? 1 : 0;
return count;
}
diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
index 5e3a984f..eb005032 100644
--- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
+++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
@@ -11,26 +11,60 @@ namespace BitFaster.Caching.Benchmarks.Lfu
[HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")]
public class SketchIncrement
{
- const int sketchSize = 1_048_576;
const int iterations = 1_048_576;
- private static CmSketchCore<int, DisableHardwareIntrinsics> std = new CmSketchCore<int, DisableHardwareIntrinsics>(sketchSize, EqualityComparer<int>.Default);
- private static CmSketchCore<int, DetectIsaFeatures> avx = new CmSketchCore<int, DetectIsaFeatures>(sketchSize, EqualityComparer<int>.Default);
+
+ private CmSketchFlat<int, DisableHardwareIntrinsics> flatStd;
+ private CmSketchFlat<int, DetectIsaFeatures> flatAvx;
+
+ private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
+ private CmSketchCore<int, DetectIsaFeatures> blockAvx;
+
+ [Params(32_768, 524_288, 8_388_608, 134_217_728)]
+ public int Size { get; set; }
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ flatStd = new CmSketchFlat<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+ flatAvx = new CmSketchFlat<int, DetectIsaFeatures>(Size, EqualityComparer<int>.Default);
+
+ blockStd = new CmSketchCore<int, DisableHardwareIntrinsics>(Size, EqualityComparer<int>.Default);
+ blockAvx = new CmSketchCore<int, DetectIsaFeatures>(Size, EqualityComparer<int>.Default);
+ }
[Benchmark(Baseline = true, OperationsPerInvoke = iterations)]
- public void Inc()
+ public void IncFlat()
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ flatStd.Increment(i);
+ }
+ }
+
+ [Benchmark(OperationsPerInvoke = iterations)]
+ public void IncFlatAvx()
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ flatAvx.Increment(i);
+ }
+ }
+
+ [Benchmark(OperationsPerInvoke = iterations)]
+ public void IncBlock()
{
for (int i = 0; i < iterations; i++)
{
- std.Increment(i);
+ blockStd.Increment(i);
}
}
[Benchmark(OperationsPerInvoke = iterations)]
- public void IncAvx()
+ public void IncBlockAvx()
{
for (int i = 0; i < iterations; i++)
{
- avx.Increment(i);
+ blockAvx.Increment(i);
}
}
}