diff --git a/lib/tdigest/tdigest.rb b/lib/tdigest/tdigest.rb
index 17c09ac..bf25eba 100644
--- a/lib/tdigest/tdigest.rb
+++ b/lib/tdigest/tdigest.rb
@@ -16,6 +16,19 @@ def initialize(delta = 0.01, k = 25, cx = 1.1)
       reset!
     end
 
+    # Returns a new tdigest built from the centroids of both operands.
+    # The result uses delta, k and cx from the caller (the left operand);
+    # the parameters of +other+ are not consulted.
+    def +(other)
+      t = self.class.new(@delta, @k, @cx)
+      data = centroids.values + other.centroids.values
+      # Feed the combined centroids in random order. Array#shuffle does this
+      # in one pass, where deleting a random element per iteration is
+      # quadratic in the number of centroids.
+      data.shuffle.each { |centroid| t.push_centroid(centroid) }
+      t
+    end
+
     def as_bytes
       # compression as defined by Java implementation
       size = @centroids.size
@@ -117,6 +130,16 @@ def find_nearest(x)
       end
     end
 
+    # Merges the centroids of +other+ into this tdigest, in place.
+    # Uses delta, k and cx from the caller; the parameters of +other+
+    # are not consulted.
+    def merge!(other)
+      t = self + other
+      @centroids = t.centroids
+      # compress! replays the adopted centroids in random order
+      compress!
+    end
+
     def p_rank(x)
       is_array = x.is_a? Array
       x = [x] unless is_array
@@ -312,6 +335,14 @@ def _digest(x, n)
 
       _cumulate(false)
 
+      # If the number of centroids has grown to a very large size,
+      # it may be due to values being inserted in sorted order.
+      # We combat that by replaying the centroids in random order,
+      # which is what compress! does
+      if @centroids.size > (@k / @delta)
+        compress!
+      end
+
       nil
     end
 
diff --git a/test/tdigest_test.rb b/test/tdigest_test.rb
index a673b41..76508d4 100644
--- a/test/tdigest_test.rb
+++ b/test/tdigest_test.rb
@@ -82,9 +82,11 @@ def test_that_it_has_a_version_number
 
     describe 'with alot of uniformly distributed points' do
       it 'has minimal error' do
+        seed = srand(1234) # Makes the values a proper fixture
         N = 100_000
         maxerr = 0
         values = Array.new(N).map { rand }
+        srand(seed)
 
         tdigest.push(values)
         tdigest.compress!
@@ -135,6 +137,13 @@ def test_that_it_has_a_version_number
         123829787.23404256,
         103191489.36170213]
     end
+
+    it 'does not blow up if data comes in sorted' do
+      tdigest.push(0..10_000)
+      tdigest.centroids.size.must_be :<, 5_000
+      tdigest.compress!
+      tdigest.centroids.size.must_be :<, 1_000
+    end
   end
 
   describe '#size' do
@@ -145,4 +154,77 @@ def test_that_it_has_a_version_number
       tdigest.size.must_equal n
     end
   end
+
+  describe '#+' do
+    it 'works with empty tdigests' do
+      other = ::TDigest::TDigest.new(0.001, 50, 1.2)
+      (tdigest + other).centroids.size.must_equal 0
+    end
+
+    describe 'adding two tdigests' do
+      before do
+        @other = ::TDigest::TDigest.new(0.001, 50, 1.2)
+        [tdigest, @other].each do |td|
+          td.push(60, 100)
+          10.times { td.push(rand * 100) }
+        end
+      end
+
+      it 'has the parameters of the left argument (the calling tdigest)' do
+        new_tdigest = tdigest + @other
+        new_tdigest.instance_variable_get(:@delta).must_equal tdigest.instance_variable_get(:@delta)
+        new_tdigest.instance_variable_get(:@k).must_equal tdigest.instance_variable_get(:@k)
+        new_tdigest.instance_variable_get(:@cx).must_equal tdigest.instance_variable_get(:@cx)
+      end
+
+      it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do
+        new_tdigest = tdigest + @other
+        new_tdigest.centroids.size.must_be :<=, tdigest.centroids.size + @other.centroids.size
+      end
+
+      it 'has the size of the two digests combined' do
+        new_tdigest = tdigest + @other
+        new_tdigest.size.must_equal(tdigest.size + @other.size)
+      end
+    end
+  end
+
+  describe '#merge!' do
+    it 'works with empty tdigests' do
+      other = ::TDigest::TDigest.new(0.001, 50, 1.2)
+      tdigest.merge!(other)
+      tdigest.centroids.size.must_equal 0
+    end
+
+    describe 'with populated tdigests' do
+      before do
+        @other = ::TDigest::TDigest.new(0.001, 50, 1.2)
+        [tdigest, @other].each do |td|
+          td.push(60, 100)
+          10.times { td.push(rand * 100) }
+        end
+      end
+
+      it 'has the parameters of the calling tdigest' do
+        vars = [:@delta, :@k, :@cx]
+        expected = Hash[vars.map { |v| [v, tdigest.instance_variable_get(v)] }]
+        tdigest.merge!(@other)
+        vars.each do |v|
+          tdigest.instance_variable_get(v).must_equal expected[v]
+        end
+      end
+
+      it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do
+        combined_size = tdigest.centroids.size + @other.centroids.size
+        tdigest.merge!(@other)
+        tdigest.centroids.size.must_be :<=, combined_size
+      end
+
+      it 'has the size of the two digests combined' do
+        combined_size = tdigest.size + @other.size
+        tdigest.merge!(@other)
+        tdigest.size.must_equal combined_size
+      end
+    end
+  end
 end