Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge and add functionality #3

Merged
merged 6 commits into from
Jan 19, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions lib/tdigest/tdigest.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ def initialize(delta = 0.01, k = 25, cx = 1.1)
reset!
end

# Builds a new digest containing the centroids of this digest and +other+.
#
# The result is created with the receiver's delta, k and cx (the left-hand
# operand); +other+'s parameters are ignored. Centroids from both digests are
# pushed into the new digest in random order, the same replay-in-random-order
# approach the class uses to keep sorted input from inflating the centroid
# count.
#
# @param other [TDigest] digest whose centroids are added to the result
# @return [TDigest] a new instance of the receiver's class
def +(other)
  merged = self.class.new(@delta, @k, @cx)
  # One shuffle is O(n); repeatedly deleting a random element from the array
  # shifts the remaining elements on every removal and is O(n^2) overall.
  (centroids.values + other.centroids.values).shuffle.each do |centroid|
    merged.push_centroid(centroid)
  end
  merged
end

def as_bytes
# compression as defined by Java implementation
size = @centroids.size
Expand Down Expand Up @@ -117,6 +127,13 @@ def find_nearest(x)
end
end

# Absorbs +other+ into this digest in place.
#
# The combined digest is produced by #+, so it carries the receiver's
# delta, k and cx. Its centroids replace this digest's centroids, after
# which the digest is recompressed.
#
# @param other [TDigest] digest to fold into this one
# @return the value returned by #compress!
def merge!(other)
  @centroids = (self + other).centroids
  compress!
end

def p_rank(x)
is_array = x.is_a? Array
x = [x] unless is_array
Expand Down Expand Up @@ -312,6 +329,14 @@ def _digest(x, n)

_cumulate(false)

# If the number of centroids has grown to a very large size,
# it may be due to values being inserted in sorted order.
# We combat that by replaying the centroids in random order,
# which is what compress! does
if @centroids.size > (@k / @delta)
compress!
end

nil
end

Expand Down
82 changes: 82 additions & 0 deletions test/tdigest_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,11 @@ def test_that_it_has_a_version_number

describe 'with alot of uniformly distributed points' do
it 'has minimal error' do
seed = srand(1234) # Makes the values a proper fixture
N = 100_000
maxerr = 0
values = Array.new(N).map { rand }
srand(seed)

tdigest.push(values)
tdigest.compress!
Expand Down Expand Up @@ -135,6 +137,13 @@ def test_that_it_has_a_version_number
123829787.23404256,
103191489.36170213]
end

# Regression check for sorted input: values arriving in ascending order must
# not leave the digest with an unbounded number of centroids.
it 'does not blow up if data comes in sorted' do
# Push the ascending range 0..10_000 (presumably expanded by #push into
# individual values -- confirm against the #push implementation).
tdigest.push(0..10_000)
# Upper bound on the centroid count before any explicit compression here.
tdigest.centroids.size.must_be :<, 5_000
# After an explicit compress! the centroid count must be below 1_000.
tdigest.compress!
tdigest.centroids.size.must_be :<, 1_000
end
end

describe '#size' do
Expand All @@ -145,4 +154,77 @@ def test_that_it_has_a_version_number
tdigest.size.must_equal n
end
end

# Specs for TDigest#+, which returns a new digest built from two operands.
describe '#+' do
  it 'works with empty tdigests' do
    empty = ::TDigest::TDigest.new(0.001, 50, 1.2)
    sum = tdigest + empty
    sum.centroids.size.must_equal 0
  end

  describe 'adding two tdigests' do
    # Populate both operands the same way: one fixed push of (60, 100)
    # (presumably value 60 with weight 100 -- confirm against #push) followed
    # by ten random values in [0, 100).
    before do
      @other = ::TDigest::TDigest.new(0.001, 50, 1.2)
      [tdigest, @other].each do |digest|
        digest.push(60, 100)
        10.times { digest.push(rand * 100) }
      end
    end

    it 'has the parameters of the left argument (the calling tdigest)' do
      sum = tdigest + @other
      %i[@delta @k @cx].each do |ivar|
        sum.instance_variable_get(ivar).must_equal tdigest.instance_variable_get(ivar)
      end
    end

    it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do
      sum = tdigest + @other
      sum.centroids.size.must_be :<=, tdigest.centroids.size + @other.centroids.size
    end

    it 'has the size of the two digests combined' do
      sum = tdigest + @other
      sum.size.must_equal(tdigest.size + @other.size)
    end
  end
end

# Specs for TDigest#merge!, which folds another digest into the receiver.
describe '#merge!' do
  it 'works with empty tdigests' do
    other = ::TDigest::TDigest.new(0.001, 50, 1.2)
    tdigest.merge!(other)
    tdigest.centroids.size.must_equal 0
  end

  describe 'with populated tdigests' do
    # Populate both digests the same way: one fixed push of (60, 100)
    # (presumably value 60 with weight 100 -- confirm against #push) followed
    # by ten random values in [0, 100).
    before do
      @other = ::TDigest::TDigest.new(0.001, 50, 1.2)
      [tdigest, @other].each do |td|
        td.push(60, 100)
        10.times { td.push(rand * 100) }
      end
    end

    it 'has the parameters of the calling tdigest' do
      # The third parameter lives in @cx. A misspelt ivar name would read back
      # nil both before and after the merge, so the check could never fail.
      vars = [:@delta, :@k, :@cx]
      # Snapshot the receiver's parameters before merging.
      expected = Hash[vars.map { |v| [v, tdigest.instance_variable_get(v)] }]
      tdigest.merge!(@other)
      vars.each do |v|
        tdigest.instance_variable_get(v).must_equal expected[v]
      end
    end

    it 'results in a tdigest with number of centroids less than or equal to the combined centroids size' do
      # Capture the sizes first: merge! changes the receiver.
      combined_size = tdigest.centroids.size + @other.centroids.size
      tdigest.merge!(@other)
      tdigest.centroids.size.must_be :<=, combined_size
    end

    it 'has the size of the two digests combined' do
      combined_size = tdigest.size + @other.size
      tdigest.merge!(@other)
      tdigest.size.must_equal combined_size
    end
  end
end
end