From fed1597c00396b31871c27d9ece1beedc1f99d48 Mon Sep 17 00:00:00 2001 From: Bjarne Fruergaard Date: Mon, 18 Jan 2016 13:20:59 -0800 Subject: [PATCH 1/6] Adds functionality for compress! to (eventually, sometimes) trigger on inserts --- lib/tdigest/tdigest.rb | 8 ++++++++ test/tdigest_test.rb | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/lib/tdigest/tdigest.rb b/lib/tdigest/tdigest.rb index 17c09ac..3363a3e 100644 --- a/lib/tdigest/tdigest.rb +++ b/lib/tdigest/tdigest.rb @@ -312,6 +312,14 @@ def _digest(x, n) _cumulate(false) + # If the number of centroids has grown to a very large size, + # it may be due to values being inserted in sorted order. + # We combat that by replaying the centroids in random order, + # which is what compress! does + if @centroids.size > (@k / @delta) + compress! + end + nil end diff --git a/test/tdigest_test.rb b/test/tdigest_test.rb index a673b41..d7d83be 100644 --- a/test/tdigest_test.rb +++ b/test/tdigest_test.rb @@ -135,6 +135,13 @@ def test_that_it_has_a_version_number 123829787.23404256, 103191489.36170213] end + + it 'does not blow up if data comes in sorted' do + tdigest.push(0..10_000) + tdigest.centroids.size.must_be :<, 5_000 + tdigest.compress! + tdigest.centroids.size.must_be :<, 1_000 + end end describe '#size' do From 3acf536124001547682b53642323ddf6987e0929 Mon Sep 17 00:00:00 2001 From: Bjarne Fruergaard Date: Mon, 18 Jan 2016 13:21:43 -0800 Subject: [PATCH 2/6] Fixture in test case using randomized values --- test/tdigest_test.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/tdigest_test.rb b/test/tdigest_test.rb index d7d83be..435bee2 100644 --- a/test/tdigest_test.rb +++ b/test/tdigest_test.rb @@ -82,9 +82,11 @@ def test_that_it_has_a_version_number describe 'with alot of uniformly distributed points' do it 'has minimal error' do + seed = srand(1234) # Makes the values a proper fixture N = 100_000 maxerr = 0 values = Array.new(N).map { rand } + srand(seed) tdigest.push(values) tdigest.compress! From bbd0a0e2b8ae0a15db1676d8cee8d05cef677a65 Mon Sep 17 00:00:00 2001 From: Bjarne Fruergaard Date: Mon, 18 Jan 2016 13:22:00 -0800 Subject: [PATCH 3/6] Adds functionality to add tdigests --- lib/tdigest/tdigest.rb | 18 ++++++++++++++++++ test/tdigest_test.rb | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/lib/tdigest/tdigest.rb b/lib/tdigest/tdigest.rb index 3363a3e..9bdec31 100644 --- a/lib/tdigest/tdigest.rb +++ b/lib/tdigest/tdigest.rb @@ -16,6 +16,24 @@ def initialize(delta = 0.01, k = 25, cx = 1.1) reset! end + def +(other) + # Uses delta, k and cx from the caller + t = self.class.new(@delta, @k, @cx) + data = self.centroids.values + other.centroids.values + while data.length > 0 + t.push_centroid(data.delete_at(rand(data.length))) + end + t.compress! + t + end + + def merge!(other) + # Uses delta, k and cx from the caller + t = self + other + @centroids = t.centroids + compress! + end + def as_bytes # compression as defined by Java implementation size = @centroids.size diff --git a/test/tdigest_test.rb b/test/tdigest_test.rb index 435bee2..d68d6f1 100644 --- a/test/tdigest_test.rb +++ b/test/tdigest_test.rb @@ -154,4 +154,38 @@ def test_that_it_has_a_version_number tdigest.size.must_equal n end end + + describe '#+' do + it 'works with empty tdigests' do + other = ::TDigest::TDigest.new(0.001, 50, 1.2) + (tdigest + other).centroids.size.must_equal 0 + end + + describe 'adding two tdigests' do + before do + @other = ::TDigest::TDigest.new(0.001, 50, 1.2) + [tdigest, @other].each do |td| + td.push(60, 100) + 10.times { td.push(rand * 100) } + end + end + + it 'has the parameters of the left argument (the calling tdigest)' do + new_tdigest = tdigest + @other + new_tdigest.instance_variable_get(:@delta).must_equal tdigest.instance_variable_get(:@delta) + new_tdigest.instance_variable_get(:@k).must_equal tdigest.instance_variable_get(:@k) + new_tdigest.instance_variable_get(:@cx).must_equal tdigest.instance_variable_get(:@cx) + end + + it 'results in a tdigest with number of centroids less than or equal to the combined size' do + new_tdigest = tdigest + @other + new_tdigest.centroids.size.must_be :<=, tdigest.centroids.size + @other.centroids.size + end + + it 'has the size of the two digests combined' do + new_tdigest = tdigest + @other + new_tdigest.size.must_equal (tdigest.size + @other.size) + end + end + end end From 9e06dc19c476b455d711e85e56f87c4183d88c1c Mon Sep 17 00:00:00 2001 From: Bjarne Fruergaard Date: Mon, 18 Jan 2016 13:31:49 -0800 Subject: [PATCH 4/6] Adds merge! for merging another tdigest into an instance --- lib/tdigest/tdigest.rb | 14 +++++++------- test/tdigest_test.rb | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/lib/tdigest/tdigest.rb b/lib/tdigest/tdigest.rb index 9bdec31..225d4c5 100644 --- a/lib/tdigest/tdigest.rb +++ b/lib/tdigest/tdigest.rb @@ -27,13 +27,6 @@ def +(other) t end - def merge!(other) - # Uses delta, k and cx from the caller - t = self + other - @centroids = t.centroids - compress! - end - def as_bytes # compression as defined by Java implementation size = @centroids.size @@ -135,6 +128,13 @@ def find_nearest(x) end end + def merge!(other) + # Uses delta, k and cx from the caller + t = self + other + @centroids = t.centroids + compress! + end + def p_rank(x) is_array = x.is_a? Array x = [x] unless is_array diff --git a/test/tdigest_test.rb b/test/tdigest_test.rb index d68d6f1..ac19b26 100644 --- a/test/tdigest_test.rb +++ b/test/tdigest_test.rb @@ -177,7 +177,7 @@ def test_that_it_has_a_version_number new_tdigest.instance_variable_get(:@cx).must_equal tdigest.instance_variable_get(:@cx) end - it 'results in a tdigest with number of centroids less than or equal to the combined size' do + it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do new_tdigest = tdigest + @other new_tdigest.centroids.size.must_be :<=, tdigest.centroids.size + @other.centroids.size end @@ -188,4 +188,43 @@ def test_that_it_has_a_version_number end end end + + describe '#merge!' do + it 'works with empty tdigests' do + other = ::TDigest::TDigest.new(0.001, 50, 1.2) + tdigest.merge!(other) + (tdigest).centroids.size.must_equal 0 + end + + describe 'with populated tdigests' do + before do + @other = ::TDigest::TDigest.new(0.001, 50, 1.2) + [tdigest, @other].each do |td| + td.push(60, 100) + 10.times { td.push(rand * 100) } + end + end + + it 'has the parameters of the calling tdigest' do + vars = [:@delta, :@k, :@cs] + expected = vars.map { |v| [v, tdigest.instance_variable_get(v)] }.to_h + tdigest.merge!(@other) + vars.each do |v| + tdigest.instance_variable_get(v).must_equal expected[v] + end + end + + it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do + combined_size = tdigest.centroids.size + @other.centroids.size + tdigest.merge!(@other) + tdigest.centroids.size.must_be :<=, combined_size + end + + it 'has the size of the two digests combined' do + combined_size = tdigest.size + @other.size + tdigest.merge!(@other) + tdigest.size.must_equal combined_size + end + end + end end From aef412373e803edff60e9772f138707f9d392b45 Mon Sep 17 00:00:00 2001 From: Bjarne Fruergaard Date: Mon, 18 Jan 2016 13:41:37 -0800 Subject: [PATCH 5/6] Change .to_h to Hash[.] (ruby < 2 compatibile) --- test/tdigest_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tdigest_test.rb b/test/tdigest_test.rb index ac19b26..76508d4 100644 --- a/test/tdigest_test.rb +++ b/test/tdigest_test.rb @@ -207,7 +207,7 @@ def test_that_it_has_a_version_number it 'has the parameters of the calling tdigest' do vars = [:@delta, :@k, :@cs] - expected = vars.map { |v| [v, tdigest.instance_variable_get(v)] }.to_h + expected = Hash[vars.map { |v| [v, tdigest.instance_variable_get(v)] }] tdigest.merge!(@other) vars.each do |v| tdigest.instance_variable_get(v).must_equal expected[v] From bc98a59311510ed8136090b4b638702166799ab2 Mon Sep 17 00:00:00 2001 From: Bjarne Fruergaard Date: Mon, 18 Jan 2016 14:27:25 -0800 Subject: [PATCH 6/6] Compress in + is redundant when already inserting in random order --- lib/tdigest/tdigest.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/tdigest/tdigest.rb b/lib/tdigest/tdigest.rb index 225d4c5..bf25eba 100644 --- a/lib/tdigest/tdigest.rb +++ b/lib/tdigest/tdigest.rb @@ -23,7 +23,6 @@ def +(other) while data.length > 0 t.push_centroid(data.delete_at(rand(data.length))) end - t.compress! t end