Updating count method to estimate unions of HyperLogLog counters

aaw, fancyremarker · Sep 27, 2012 · ccf469a
1 parent 754ef5a
commit ccf469a
Show file tree

Hide file tree

Showing 6 changed files with 77 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,6 +12,7 @@
 .pt
 docs/*
 log/*
+pkg/*
 .autotest
 *.watchr
 *.sublime-project

diff --git a/README.md b/README.md
@@ -14,7 +14,16 @@ instance is used for storing the counters. A simple example:
       counter.add('beatles', beatle)
     end
 
-    puts "There are approximately #{counter.count('beatles')} distinct beatles!"
+    puts "There are approximately #{counter.count('beatles')} distinct Beatles"
+
+You can also ask for an estimate from multiple counters and you'll get
+an estimate of the size of their union:
+
+    ['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|
+      counter.add('wings', wing_member)
+    end
+
+    puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"
 
 Each HyperLogLog counter uses a small, fixed amount of space but can
 estimate the cardinality of any set of up to around a billion values with

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.1.0
+0.2.0
diff --git a/lib/hyper_log_log.rb b/lib/hyper_log_log.rb
@@ -26,9 +26,12 @@ def add(counter_name, value)
      @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
   end
 
-  def count(counter_name)
-    all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
-    estimate_sum = all_estimates.map{ |f, score| 2 ** -score }.reduce(:+) || 0
+  def count(*counter_names)
+    all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
+                                 .reduce(:concat)
+                                 .group_by{ |value, score| value }
+                                 .map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
+    estimate_sum = all_estimates.reduce(:+) || 0
     estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
     if estimate <= 2.5 * @m
       if all_estimates.length == @m

diff --git a/spec/hyper_log_log_spec.rb b/spec/hyper_log_log_spec.rb
@@ -58,7 +58,7 @@
   # implementation, since it exercises all of the cases in HyperLogLog's
   # count method except for the correction for very large set sizes.
 
-  it "produces acceptable estimates" do
+  it "produces acceptable estimates for counts" do
     max_items = 1000
     redis = Redis.new
     (6..16).each do |b|
@@ -81,4 +81,59 @@
     end
   end
 
+  it "produces acceptable estimates for unions with few elements in common" do
+    b, max_items = 10, 2000
+    counter = HyperLogLog.new(Redis.new, b)
+    bad_estimates = 0
+    very_bad_estimates = 0
+    expected_relative_error = 1.04 / Math.sqrt(2 ** b)
+    max_items.times do |i|
+      value1 = Digest::MD5.hexdigest("value#{i}")
+      counter.add("mycounter1", value1)
+      value2 = Digest::MD5.hexdigest("value#{i}incounter2")
+      counter.add("mycounter2", value2)
+      value3 = Digest::MD5.hexdigest("this is value#{i}")
+      counter.add("mycounter3", value3)
+      actual = 3 * (i + 1)
+      approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
+      relative_error = (actual - approximate).abs / Float(actual)
+      bad_estimates += 1 if relative_error > expected_relative_error * 2
+      very_bad_estimates += 1 if relative_error > expected_relative_error * 3
+    end
+    bad_estimates.should < (3 * max_items) / 100.00
+    very_bad_estimates.should == 0
+  end
+
+  it "produces acceptable estimates for unions with many elements in common" do
+    b, max_items, intersection_size = 10, 1000, 2000
+    counter = HyperLogLog.new(Redis.new, b)
+    bad_estimates = 0
+    very_bad_estimates = 0
+    expected_relative_error = 1.04 / Math.sqrt(2 ** b)
+
+    intersection_size.times do |i|
+      value = Digest::MD5.hexdigest("test#{i}value")
+      ['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name|
+        counter.add(counter_name, value)
+      end
+    end
+
+    max_items.times do |i|
+      value1 = Digest::MD5.hexdigest("value#{i}")
+      counter.add("mycounter1", value1)
+      value2 = Digest::MD5.hexdigest("value#{i}isincounter2")
+      counter.add("mycounter2", value2)
+      value3 = Digest::MD5.hexdigest("this is value#{i}")
+      counter.add("mycounter3", value3)
+      actual = 3 * (i + 1) + intersection_size
+      approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
+      relative_error = (actual - approximate).abs / Float(actual)
+      bad_estimates += 1 if relative_error > expected_relative_error * 2
+      very_bad_estimates += 1 if relative_error > expected_relative_error * 3
+    end
+
+    bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00
+    very_bad_estimates.should == 0
+  end
+
 end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -23,4 +23,7 @@
   config.before(:each) do
     Redis.new.flushdb
   end
+  config.after(:each) do
+    Redis.new.flushdb
+  end
 end