From 977be98f662d3db860d59911144d235f60909a95 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 22 Sep 2022 11:15:53 -0700 Subject: [PATCH 1/4] unroll --- BitFaster.Caching.Benchmarks/Lfu/Reset.cs | 59 +++++++++++++++++++++++ BitFaster.Caching/Lfu/CmSketch.cs | 41 ++++++++++++++-- 2 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 BitFaster.Caching.Benchmarks/Lfu/Reset.cs diff --git a/BitFaster.Caching.Benchmarks/Lfu/Reset.cs b/BitFaster.Caching.Benchmarks/Lfu/Reset.cs new file mode 100644 index 00000000..e33a0b9b --- /dev/null +++ b/BitFaster.Caching.Benchmarks/Lfu/Reset.cs @@ -0,0 +1,59 @@ + +using BenchmarkDotNet.Attributes; + +namespace BitFaster.Caching.Benchmarks.Lfu +{ + public class Reset + { + static long ResetMask = 0x7777777777777777L; + static long OneMask = 0x1111111111111111L; + + long[] table; + + [Params(4, 128, 8192, 1048576)] + public int Size { get; set; } + + [GlobalSetup] + public void Setup() + { + table = new long[Size]; + } + + [Benchmark(Baseline = true)] + public int ResetStd() + { + int count = 0; + for (int i = 0; i < table.Length; i++) + { + count += BitOps.BitCount(table[i] & OneMask); + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + } + + return count; + } + + [Benchmark()] + public int Reset4() + { + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; + } + + return (count0 + count1) + (count2 + count3); + } + } +} diff --git a/BitFaster.Caching/Lfu/CmSketch.cs b/BitFaster.Caching/Lfu/CmSketch.cs index b0785be5..ce0b63b2 100644 --- a/BitFaster.Caching/Lfu/CmSketch.cs +++ b/BitFaster.Caching/Lfu/CmSketch.cs @@ -158,19 +158,37 @@ private bool IncrementAt(int i, int j) private void Reset() { - int count = 0; - for (int i = 0; i < table.Length; i++) + // unroll, almost 2x faster + int count0 = 0; + int count1 = 0; + int count2 = 0; + int count3 = 0; + + for (int i = 0; i < table.Length; i += 4) { - count += BitOps.BitCount(table[i] & OneMask); + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + count2 += BitOps.BitCount(table[i + 2] & OneMask); + count3 += BitOps.BitCount(table[i + 3] & OneMask); + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + table[i + 2] = (long)((ulong)table[i + 2] >> 1) & ResetMask; + table[i + 3] = (long)((ulong)table[i + 3] >> 1) & ResetMask; } - size = (size - (count >> 2)) >> 1; + + count0 = (count0 + count1) + (count2 + count3); + + size = (size - (count0 >> 2)) >> 1; } private void EnsureCapacity(long maximumSize) { int maximum = (int)Math.Min(maximumSize, int.MaxValue >> 1); + // clamp to 4 as min size + maximum = Math.Max(4, maximum); + table = new long[(maximum == 0) ? 1 : BitOps.CeilingPowerOfTwo(maximum)]; tableMask = Math.Max(0, table.Length - 1); sampleSize = (maximumSize == 0) ? 10 : (10 * maximum); @@ -301,6 +319,21 @@ private static Vector256 Multiply(Vector256 a, Vector256 b) Vector256 prodll = Avx2.Multiply(a.AsUInt32(), b.AsUInt32()); // a0Lb0L,a1Lb1L, 64 bit unsigned products return Avx2.Add(prodll.AsInt64(), prodlh3.AsInt64()).AsUInt64(); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 } + + // https://stackoverflow.com/questions/50081465/counting-1-bits-population-count-on-large-data-using-avx-512-or-avx-2 + private void ResetAvx() + { + Vector256 lut = Vector256.Create( + (byte) /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, + /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 + ); + } #endif } } From bbabc6aac3c7446b4ccd7d8b15697233bf417361 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 22 Sep 2022 11:17:48 -0700 Subject: [PATCH 2/4] cleanup --- BitFaster.Caching/Lfu/CmSketch.cs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/BitFaster.Caching/Lfu/CmSketch.cs b/BitFaster.Caching/Lfu/CmSketch.cs index ce0b63b2..c55030f9 100644 --- a/BitFaster.Caching/Lfu/CmSketch.cs +++ b/BitFaster.Caching/Lfu/CmSketch.cs @@ -319,21 +319,6 @@ private static Vector256 Multiply(Vector256 a, Vector256 b) Vector256 prodll = Avx2.Multiply(a.AsUInt32(), b.AsUInt32()); // a0Lb0L,a1Lb1L, 64 bit unsigned products return Avx2.Add(prodll.AsInt64(), prodlh3.AsInt64()).AsUInt64(); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 } - - // https://stackoverflow.com/questions/50081465/counting-1-bits-population-count-on-large-data-using-avx-512-or-avx-2 - private void ResetAvx() - { - Vector256 lut = Vector256.Create( - (byte) /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, - /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, - /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, - /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, - /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, - /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, - /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, - /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 - ); - } #endif } } From 69a0cc976cc92ce767891ead81ab015e49741e7b Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 22 Sep 2022 19:46:29 -0700 Subject: [PATCH 3/4] rename --- .../Lfu/{Reset.cs => SketchReset.cs} | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) rename BitFaster.Caching.Benchmarks/Lfu/{Reset.cs => SketchReset.cs} (72%) diff --git a/BitFaster.Caching.Benchmarks/Lfu/Reset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs similarity index 72% rename from BitFaster.Caching.Benchmarks/Lfu/Reset.cs rename to BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs index e33a0b9b..67864049 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/Reset.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs @@ -1,8 +1,10 @@  using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; namespace BitFaster.Caching.Benchmarks.Lfu { + [SimpleJob(RuntimeMoniker.Net60)] public class Reset { static long ResetMask = 0x7777777777777777L; @@ -20,7 +22,7 @@ public void Setup() } [Benchmark(Baseline = true)] - public int ResetStd() + public int Reset1() { int count = 0; for (int i = 0; i < table.Length; i++) @@ -32,6 +34,24 @@ public int ResetStd() return count; } + [Benchmark()] + public int Reset2() + { + int count0 = 0; + int count1 = 0; + + for (int i = 0; i < table.Length; i += 2) + { + count0 += BitOps.BitCount(table[i] & OneMask); + count1 += BitOps.BitCount(table[i + 1] & OneMask); + + table[i] = (long)((ulong)table[i] >> 1) & ResetMask; + table[i + 1] = (long)((ulong)table[i + 1] >> 1) & ResetMask; + } + + return count0 + count1; + } + [Benchmark()] public int Reset4() { From d2932e8e3e25b10857dc84329c65a6e33fa9d053 Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Thu, 22 Sep 2022 19:58:15 -0700 Subject: [PATCH 4/4] rename --- BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs index 67864049..0c774061 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchReset.cs @@ -5,7 +5,7 @@ namespace BitFaster.Caching.Benchmarks.Lfu { [SimpleJob(RuntimeMoniker.Net60)] - public class Reset + public class SketchReset { static long ResetMask = 0x7777777777777777L; static long OneMask = 0x1111111111111111L;