Skip to content

Commit

Permalink
Merge pull request #3 from castle/tdigest-add-other
Browse files Browse the repository at this point in the history
Merge and add functionality
  • Loading branch information
wallin committed Jan 19, 2016
2 parents cae9238 + bc98a59 commit da4f313
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 0 deletions.
25 changes: 25 additions & 0 deletions lib/tdigest/tdigest.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ def initialize(delta = 0.01, k = 25, cx = 1.1)
reset!
end

# Combines this digest with +other+ into a newly created digest.
#
# The new digest is constructed with the receiver's @delta, @k and @cx;
# whatever parameters +other+ was built with are not consulted.
#
# @param other [#centroids] digest whose centroids are folded in
# @return [Object] a new instance of the receiver's class holding the
#   centroids of both operands
def +(other)
  merged = self.class.new(@delta, @k, @cx)
  combined = centroids.values + other.centroids.values
  # Replay the centroids in random order. Array#shuffle yields a uniform
  # permutation in linear time, whereas repeatedly removing a random index
  # with delete_at shifts the remaining elements on every step.
  combined.shuffle.each { |centroid| merged.push_centroid(centroid) }
  merged
end

def as_bytes
# compression as defined by Java implementation
size = @centroids.size
Expand Down Expand Up @@ -117,6 +127,13 @@ def find_nearest(x)
end
end

# Folds the centroids of +other+ into this digest in place.
#
# The combined digest is produced by #+, so the receiver's delta, k and cx
# stay in effect. Its centroids then replace the receiver's own, and
# compress! is run on the result.
#
# @param other [#centroids] digest to absorb
# @return whatever compress! returns
def merge!(other)
  @centroids = (self + other).centroids
  compress!
end

def p_rank(x)
is_array = x.is_a? Array
x = [x] unless is_array
Expand Down Expand Up @@ -312,6 +329,14 @@ def _digest(x, n)

_cumulate(false)

# If the number of centroids has grown to a very large size,
# it may be due to values being inserted in sorted order.
# We combat that by replaying the centroids in random order,
# which is what compress! does
if @centroids.size > (@k / @delta)
compress!
end

nil
end

Expand Down
82 changes: 82 additions & 0 deletions test/tdigest_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,11 @@ def test_that_it_has_a_version_number

describe 'with alot of uniformly distributed points' do
it 'has minimal error' do
seed = srand(1234) # Makes the values a proper fixture
N = 100_000
maxerr = 0
values = Array.new(N).map { rand }
srand(seed)

tdigest.push(values)
tdigest.compress!
Expand Down Expand Up @@ -135,6 +137,13 @@ def test_that_it_has_a_version_number
123829787.23404256,
103191489.36170213]
end

it 'does not blow up if data comes in sorted' do
  # Ascending input must not leave thousands of centroids behind: the count
  # has to stay below 5_000 right after the push, and below 1_000 once
  # compress! has run.
  # NOTE(review): a Range (not an Array) is handed to push here; confirm
  # that push expands it into individual points rather than storing the
  # Range as a single value.
  sorted_input = 0..10_000
  tdigest.push(sorted_input)
  tdigest.centroids.size.must_be :<, 5_000

  tdigest.compress!
  tdigest.centroids.size.must_be :<, 1_000
end
end

describe '#size' do
Expand All @@ -145,4 +154,77 @@ def test_that_it_has_a_version_number
tdigest.size.must_equal n
end
end

describe '#+' do
  # Adding two digests that hold no data must give a digest without centroids.
  it 'works with empty tdigests' do
    empty_other = ::TDigest::TDigest.new(0.001, 50, 1.2)
    sum = tdigest + empty_other
    sum.centroids.size.must_equal 0
  end

  describe 'adding two tdigests' do
    # @other is built with explicit delta/k/cx (0.001, 50, 1.2). Both
    # operands then receive the same push(60, 100) call followed by ten
    # random values in [0, 100).
    # NOTE(review): push(60, 100) presumably means value 60 with weight 100;
    # confirm against #push.
    before do
      @other = ::TDigest::TDigest.new(0.001, 50, 1.2)
      [tdigest, @other].each do |digest|
        digest.push(60, 100)
        10.times { digest.push(rand * 100) }
      end
    end

    it 'has the parameters of the left argument (the calling tdigest)' do
      sum = tdigest + @other
      # Compare each tuning parameter of the sum against the left operand.
      [:@delta, :@k, :@cx].each do |ivar|
        sum.instance_variable_get(ivar).must_equal tdigest.instance_variable_get(ivar)
      end
    end

    it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do
      sum = tdigest + @other
      upper_bound = tdigest.centroids.size + @other.centroids.size
      sum.centroids.size.must_be :<=, upper_bound
    end

    it 'has the size of the two digests combined' do
      sum = tdigest + @other
      expected_size = tdigest.size + @other.size
      sum.size.must_equal expected_size
    end
  end
end

describe '#merge!' do
  # Merging an empty digest into another empty digest must leave it empty.
  it 'works with empty tdigests' do
    other = ::TDigest::TDigest.new(0.001, 50, 1.2)
    tdigest.merge!(other)
    tdigest.centroids.size.must_equal 0
  end

  describe 'with populated tdigests' do
    # @other is built with explicit delta/k/cx (0.001, 50, 1.2). Both
    # digests then receive the same push(60, 100) call followed by ten
    # random values in [0, 100).
    before do
      @other = ::TDigest::TDigest.new(0.001, 50, 1.2)
      [tdigest, @other].each do |td|
        td.push(60, 100)
        10.times { td.push(rand * 100) }
      end
    end

    it 'has the parameters of the calling tdigest' do
      # The third parameter is stored in @cx. Reading a misspelled ivar
      # returns nil both before and after the merge, which would make the
      # comparison pass without checking anything.
      vars = [:@delta, :@k, :@cx]
      expected = Hash[vars.map { |v| [v, tdigest.instance_variable_get(v)] }]
      tdigest.merge!(@other)
      vars.each do |v|
        tdigest.instance_variable_get(v).must_equal expected[v]
      end
    end

    it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do
      # Capture the bound before merging, since merge! changes the receiver.
      combined_size = tdigest.centroids.size + @other.centroids.size
      tdigest.merge!(@other)
      tdigest.centroids.size.must_be :<=, combined_size
    end

    it 'has the size of the two digests combined' do
      # Capture the expected total before merging, since merge! changes the receiver.
      combined_size = tdigest.size + @other.size
      tdigest.merge!(@other)
      tdigest.size.must_equal combined_size
    end
  end
end
end

0 comments on commit da4f313

Please sign in to comment.