Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Initial commit

  • Loading branch information...
commit cb24b734f77152555674d01a26fda21081eaaef4 1 parent ed6d775
@aaw authored
View
63 .gitignore
@@ -1,48 +1,21 @@
-# rcov generated
-coverage
-
-# rdoc generated
-rdoc
-
-# yard generated
-doc
-.yardoc
-
-# bundler
+.DS_Store
+*~
+*#
+.#*
+.yardoc/*
+.pt
+.rvmrc
+.redcar
+.powrc
+.irbrc
.bundle
+.pt
+docs/*
+log/*
+.autotest
+*.watchr
+*.sublime-project
+*.sublime-workspace
+rspec.failures
-# jeweler generated
-pkg
-
-# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
-#
-# * Create a file at ~/.gitignore
-# * Include files you want ignored
-# * Run: git config --global core.excludesfile ~/.gitignore
-#
-# After doing this, these files will be ignored in all your git projects,
-# saving you from having to 'pollute' every project you touch with them
-#
-# Not sure what to needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line)
-#
-# For MacOS:
-#
-#.DS_Store
-
-# For TextMate
-#*.tmproj
-#tmtags
-
-# For emacs:
-#*~
-#\#*
-#.\#*
-
-# For vim:
-#*.swp
-
-# For redcar:
-#.redcar
-# For rubinius:
-#*.rbc
View
19 Gemfile
@@ -1,13 +1,10 @@
source "http://rubygems.org"
-# Add dependencies required to use your gem here.
-# Example:
-# gem "activesupport", ">= 2.3.5"
-# Add dependencies to develop your gem here.
-# Include everything needed to run rake, tests, features, etc.
-group :development do
- gem "rspec", "~> 2.3.0"
- gem "bundler", "~> 1.0.0"
- gem "jeweler", "~> 1.6.4"
- gem "rcov", ">= 0"
-end
+gem 'murmurhash3', '~> 0.1.3'
+gem 'redis', '~> 3.0.1'
+
+group :development, :test do
+ gem 'jeweler', '~> 1.8.4'
+ gem 'rake', '~> 0.9.2.2'
+ gem 'rspec', '~> 2.11.0'
+end
View
34 Gemfile.lock
@@ -0,0 +1,34 @@
+GEM
+ remote: http://rubygems.org/
+ specs:
+ diff-lcs (1.1.3)
+ git (1.2.5)
+ jeweler (1.8.4)
+ bundler (~> 1.0)
+ git (>= 1.2.5)
+ rake
+ rdoc
+ json (1.7.5)
+ murmurhash3 (0.1.3)
+ rake (0.9.2.2)
+ rdoc (3.12)
+ json (~> 1.4)
+ redis (3.0.1)
+ rspec (2.11.0)
+ rspec-core (~> 2.11.0)
+ rspec-expectations (~> 2.11.0)
+ rspec-mocks (~> 2.11.0)
+ rspec-core (2.11.1)
+ rspec-expectations (2.11.3)
+ diff-lcs (~> 1.1.3)
+ rspec-mocks (2.11.2)
+
+PLATFORMS
+ ruby
+
+DEPENDENCIES
+ jeweler (~> 1.8.4)
+ murmurhash3 (~> 0.1.3)
+ rake (~> 0.9.2.2)
+ redis (~> 3.0.1)
+ rspec (~> 2.11.0)
View
33 LICENSE.txt
@@ -1,20 +1,19 @@
-Copyright (c) 2012 Aaron Windsor
+Copyright (c) 2012 Art.sy, Inc.
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
View
46 README.md
@@ -0,0 +1,46 @@
+hyperloglog-redis
+=================
+
+This gem is an implementation of the HyperLogLog algorithm for estimating
+cardinalities of sets observed via a stream of events. A [Redis](http://redis.io)
+instance is used for storing the counters. A simple example:
+
+ require 'redis'
+ require 'hyperloglog-redis'
+
+ redis = Redis.new
+ counter = HyperLogLog.new(redis)
+ ['john', 'paul', 'george', 'ringo', 'john', 'paul'].each do |beatle|
+ counter.add('beatles', beatle)
+ end
+
+ puts "There are approximately #{counter.count('beatles')} distinct beatles!"
+
+Each HyperLogLog counter uses a small, fixed amount of space but can
+estimate the cardinality of any set of up to around a billion values with
+relative error of about 1.04 / Math.sqrt(2 ** b), where b is a parameter
+passed to the HyperLogLog initializer that defaults to 10. With b = 10,
+each counter is represented by a Redis sorted set with 2 ** b = 1024 values
+(a few KB of space) and we get an expected relative error of about 3%. Contrast this
+with the amount of space needed to compute set cardinality exactly, which is
+over 100 MB even for a bit vector representing a set with a billion values.
+
+The basic idea of HyperLogLog (and its predecessors PCSA and LogLog) is to apply
+a good hash function to each value you see in the stream and record the longest
+run of zeros that you've seen as a prefix of any hashed value. If the hash
+function is good, you'd expect that its bits are statistically independent, so
+seeing a value that starts with exactly X zeros should happen with probability
+2 ** -(X + 1). So if you've seen a run of 5 zeros in one of your hash values,
+you're likely to have around 2 ** 6 = 64 values in the underlying set. The actual
+implementation and analysis are much more advanced than this, but that's the idea.
+
+The HyperLogLog algorithm is described and analyzed in the paper
+["HyperLogLog: the analysis of a near-optimal cardinality estimation
+algorithm"](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf)
+by Flajolet, Fusy, Gandouet, and Meunier. Our implementation closely
+follows the program described in Section 4 of that paper.
+
+Installation
+============
+
+ gem install hyperloglog-redis
View
17 Rakefile
@@ -18,7 +18,7 @@ Jeweler::Tasks.new do |gem|
gem.homepage = "http://github.com/aaw/hyperloglog-redis"
gem.license = "MIT"
gem.summary = %Q{An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end}
- gem.description = %Q{TODO: longer description of your gem}
+ gem.description = %Q{An implementation of the HyperLogLog set cardinality estimation algorithm in Ruby using Redis as a back-end}
gem.email = "aaron.windsor@gmail.com"
gem.authors = ["Aaron Windsor"]
# dependencies defined in Gemfile
@@ -31,19 +31,4 @@ RSpec::Core::RakeTask.new(:spec) do |spec|
spec.pattern = FileList['spec/**/*_spec.rb']
end
-RSpec::Core::RakeTask.new(:rcov) do |spec|
- spec.pattern = 'spec/**/*_spec.rb'
- spec.rcov = true
-end
-
task :default => :spec
-
-require 'rake/rdoctask'
-Rake::RDocTask.new do |rdoc|
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
-
- rdoc.rdoc_dir = 'rdoc'
- rdoc.title = "hyperloglog-redis #{version}"
- rdoc.rdoc_files.include('README*')
- rdoc.rdoc_files.include('lib/**/*.rb')
-end
View
56 lib/hyper_log_log.rb
@@ -0,0 +1,56 @@
+require 'redis'
+require 'murmurhash3'
+
+class HyperLogLog
+ def initialize(redis, b=10)
+ raise "Accuracy not supported. Please choose a value of b between 4 and 16" if b < 4 || b > 16
+ @redis = redis
+ @bits_in_hash = 32 - b
+ @m = (2 ** b).to_i
+ if @m == 16
+ @alpha = 0.673
+ elsif @m == 32
+ @alpha = 0.697
+ elsif @m == 64
+ @alpha = 0.709
+ else
+ @alpha = 0.7213/(1 + 1.079/@m)
+ end
+ end
+
+ def add(counter_name, value)
+ hash = MurmurHash3::V32.murmur3_32_str_hash(value)
+ function_name = (hash % @m).to_s
+ w = hash / @m
+ max_run_of_zeros = @redis.zscore(counter_name, function_name)
+ @redis.zadd(counter_name, [(max_run_of_zeros || 0), rho(w)].max, function_name)
+ end
+
+ def count(counter_name)
+ all_estimates = @redis.zrange(counter_name, 0, -1, {withscores: true})
+ estimate_sum = all_estimates.map{ |f, score| 2 ** -score }.reduce(:+) || 0
+ estimate = @alpha * @m * @m * ((estimate_sum + @m - all_estimates.length) ** -1)
+ if estimate <= 2.5 * @m
+ if all_estimates.length == @m
+ estimate.round
+ else # Correction for small sets
+ (@m * Math.log(Float(@m)/(@m - all_estimates.length))).round
+ end
+ elsif estimate <= 2 ** 32 / 30.0
+ estimate.round
+ else # Correction for large sets
+ (-2**32 * Math.log(1 - estimate/(2.0**32))).round
+ end
+ end
+
+ # rho(i) is the position of the first 1 in the binary representation of i,
+ # reading from most significant to least significant bits. Some examples:
+ # rho(1...) = 1, rho(001...) = 3, rho(000...0) = @bits_in_hash + 1
+ def rho(i)
+ if i == 0
+ @bits_in_hash + 1
+ else
+ @bits_in_hash - Math.log(i, 2).floor
+ end
+ end
+end
View
1  lib/hyperloglog-redis.rb
@@ -0,0 +1 @@
+require "hyper_log_log"
View
84 spec/hyper_log_log_spec.rb
@@ -0,0 +1,84 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+
+describe HyperLogLog do
+
+ it "doesn't change its count when it sees values that it's already seen" do
+ redis = Redis.new
+ counter = HyperLogLog.new(redis, 10)
+ test_set = (1..100).map{ |x| x.to_s }
+ test_set.each{ |value| counter.add("mycounter", value) }
+ original_estimate = counter.count("mycounter")
+ 5.times do
+ test_set.each do |value|
+ counter.add("mycounter", value)
+ counter.count("mycounter").should == original_estimate
+ end
+ end
+ end
+
+ it "can maintain more than one logically distinct counter" do
+ redis = Redis.new
+ counter = HyperLogLog.new(redis, 10)
+ other_estimate = counter.count("counter2")
+ (1..100).each do |i|
+ counter.add("counter1", i.to_s)
+ counter.count("counter2").should == other_estimate
+ end
+ other_estimate = counter.count("counter1")
+ (101..200).each do |i|
+ counter.add("counter2", i.to_s)
+ counter.count("counter1").should == other_estimate
+ end
+ other_estimate = counter.count("counter2")
+ (201..300).each do |i|
+ counter.add("counter1", i.to_s)
+ counter.count("counter2").should == other_estimate
+ end
+ counter.count("counter1").should > 100
+ counter.count("counter2").should > 50
+ counter.count("counter1").should > counter.count("counter2")
+ end
+
+
+ # With parameter b, HyperLogLog should produce estimates that have
+ # relative error of 1.04 / Math.sqrt(2 ** b). Of course, this analysis
+ # is based on assumptions that aren't necessarily true in practice and
+ # the observed relative error will depend on the distribution of data
+ # we receive as well as the interaction of the murmur hash implementation
+ # with that data. Keeping that in mind, the following spec makes sure
+ # that in the process of adding 1000 values to a set, HyperLogLog only
+ # gives bad estimates (more than twice the expected relative error) in
+ # less than 1% of the cases and never gives very bad estimates (more than
+ # three times the expected relative error).
+ #
+ # It's fine to fudge these numbers a little if the implementation changes,
+ # since you can clearly find a different set of values that make this test
+ # fail even without changing the implementation. But it should serve as a
+ # good indication that there aren't any logical errors in the HyperLogLog
+ # implementation, since it exercises all of the cases in HyperLogLog's
+ # count method except for the correction for very large set sizes.
+
+ it "produces acceptable estimates" do
+ max_items = 1000
+ redis = Redis.new
+ (6..16).each do |b|
+ counter = HyperLogLog.new(redis, b)
+ redis.del('mycounter')
+ bad_estimates = 0
+ very_bad_estimates = 0
+ expected_relative_error = 1.04 / Math.sqrt(2 ** b)
+ max_items.times do |i|
+ value = Digest::MD5.hexdigest("value#{i}")
+ counter.add("mycounter", value)
+ actual = i + 1
+ approximate = counter.count("mycounter")
+ relative_error = (actual - approximate).abs / Float(actual)
+ bad_estimates += 1 if relative_error > expected_relative_error * 2
+ very_bad_estimates += 1 if relative_error > expected_relative_error * 3
+ end
+ bad_estimates.should < max_items / 100.00
+ very_bad_estimates.should == 0
+ end
+ end
+
+end
View
16 spec/spec_helper.rb
@@ -1,12 +1,26 @@
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rspec'
+require 'redis'
require 'hyperloglog-redis'
+db_number = ENV['REDIS_TEST_DATABASE'] || '15'
+ENV['REDIS_URL'] = "redis://localhost:6379/#{db_number}"
+redis = Redis.new
+if redis.keys('*').length > 0
+ puts "Warning! These specs use database #{db_number} on your local redis instance"
+ puts "running on port 6379. Your database #{db_number} seems to have keys in it."
+ puts "Please clear them before running the specs or set the environment"
+ puts "variable REDIS_TEST_DATABASE to use a different database number."
+ raise SystemExit
+end
+
# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
RSpec.configure do |config|
-
+ config.before(:each) do
+ Redis.new.flushdb
+ end
end
Please sign in to comment.
Something went wrong with that request. Please try again.