Skip to content

Commit

Permalink
Updating count method to estimate unions of HyperLogLog counters
Browse files Browse the repository at this point in the history
  • Loading branch information
aaw committed Sep 27, 2012
1 parent 754ef5a commit ccf469a
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
.pt
docs/*
log/*
pkg/*
.autotest
*.watchr
*.sublime-project
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@ instance is used for storing the counters. A simple example:
counter.add('beatles', beatle)
end

puts "There are approximately #{counter.count('beatles')} distinct beatles!"
puts "There are approximately #{counter.count('beatles')} distinct Beatles"

You can also pass several counter names to `count`, and you'll get
an estimate of the size of their union:

['joe', 'denny', 'linda', 'jimmy', 'paul'].each do |wing_member|

This comment has been minimized.

Copy link
@fancyremarker

fancyremarker Sep 27, 2012

Should be wings_member.

counter.add('wings', wing_member)
end

puts "There are approximately #{counter.count('beatles', 'wings')} people who were in the Beatles or Wings"

Each HyperLogLog counter uses a small, fixed amount of space but can
estimate the cardinality of any set of up to around a billion values with
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1.0
0.2.0
9 changes: 6 additions & 3 deletions lib/hyper_log_log.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ def add(counter_name, value)
@redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
end

def count(counter_name)
all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
estimate_sum = all_estimates.map{ |f, score| 2 ** -score }.reduce(:+) || 0
def count(*counter_names)
all_estimates = counter_names.map{ |counter_name| @redis.zrange(counter_name, 0, -1, {withscores: true}) }
.reduce(:concat)
.group_by{ |value, score| value }
.map{ |group, counters| 2 ** -counters.map{ |x| x.last }.max }
estimate_sum = all_estimates.reduce(:+) || 0
estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
if estimate <= 2.5 * @m
if all_estimates.length == @m
Expand Down
57 changes: 56 additions & 1 deletion spec/hyper_log_log_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
# implementation, since it exercises all of the cases in HyperLogLog's
# count method except for the correction for very large set sizes.

it "produces acceptable estimates" do
it "produces acceptable estimates for counts" do
max_items = 1000
redis = Redis.new
(6..16).each do |b|
Expand All @@ -81,4 +81,59 @@
end
end

it "produces acceptable estimates for unions with few elements in common" do
b, max_items = 10, 2000
counter = HyperLogLog.new(Redis.new, b)
bad_estimates = 0
very_bad_estimates = 0
expected_relative_error = 1.04 / Math.sqrt(2 ** b)
max_items.times do |i|
value1 = Digest::MD5.hexdigest("value#{i}")
counter.add("mycounter1", value1)
value2 = Digest::MD5.hexdigest("value#{i}incounter2")
counter.add("mycounter2", value2)
value3 = Digest::MD5.hexdigest("this is value#{i}")
counter.add("mycounter3", value3)
actual = 3 * (i + 1)
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
relative_error = (actual - approximate).abs / Float(actual)
bad_estimates += 1 if relative_error > expected_relative_error * 2
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
end
bad_estimates.should < (3 * max_items) / 100.00
very_bad_estimates.should == 0
end

it "produces acceptable estimates for unions with many elements in common" do
b, max_items, intersection_size = 10, 1000, 2000
counter = HyperLogLog.new(Redis.new, b)
bad_estimates = 0
very_bad_estimates = 0
expected_relative_error = 1.04 / Math.sqrt(2 ** b)

intersection_size.times do |i|
value = Digest::MD5.hexdigest("test#{i}value")
['mycounter1', 'mycounter2', 'mycounter3'].each do |counter_name|
counter.add(counter_name, value)
end
end

max_items.times do |i|
value1 = Digest::MD5.hexdigest("value#{i}")
counter.add("mycounter1", value1)
value2 = Digest::MD5.hexdigest("value#{i}isincounter2")
counter.add("mycounter2", value2)
value3 = Digest::MD5.hexdigest("this is value#{i}")
counter.add("mycounter3", value3)
actual = 3 * (i + 1) + intersection_size
approximate = counter.count("mycounter1", "mycounter2", "mycounter3")
relative_error = (actual - approximate).abs / Float(actual)
bad_estimates += 1 if relative_error > expected_relative_error * 2
very_bad_estimates += 1 if relative_error > expected_relative_error * 3
end

bad_estimates.should < ((3 * max_items) + intersection_size) / 100.00
very_bad_estimates.should == 0
end

end
3 changes: 3 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@
config.before(:each) do
Redis.new.flushdb
end
config.after(:each) do
Redis.new.flushdb
end
end

0 comments on commit ccf469a

Please sign in to comment.