Skip to content

Commit

Permalink
reimplement the chunk index merging in C
Browse files Browse the repository at this point in the history
The Python code could take a rather long time, and most of that time was likely spent converting data from Python to C and back.
  • Loading branch information
ThomasWaldmann committed Aug 6, 2015
1 parent 7e21d95 commit a1e039b
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 2 deletions.
19 changes: 19 additions & 0 deletions borg/_hashindex.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,3 +385,22 @@ hashindex_summarize(HashIndex *index, long long *total_size, long long *total_cs
*total_unique_chunks = unique_chunks;
*total_chunks = chunks;
}

/*
 * Merge every entry of `other` into `index`.
 *
 * For each key found in `other`:
 *   - if `index` has no entry for that key, the value stored in `other`
 *     is inserted into `index` under that key;
 *   - otherwise only the first int32_t of the existing value in `index`
 *     is increased by the first int32_t of the value in `other`; the rest
 *     of the existing value is left untouched.
 *
 * `other` is only read, `index` is modified in place.
 *
 * NOTE(review): the value offset is computed from index->key_size, so this
 * assumes both indexes use the same key size - confirm at the call sites.
 * NOTE(review): the result of hashindex_set() is not checked here and this
 * function returns void - confirm whether an insert can fail.
 */
static void
hashindex_merge(HashIndex *index, HashIndex *other)
{
    int32_t key_size = index->key_size;
    const int32_t *other_values;
    int32_t *my_values;
    void *key = NULL;

    while((key = hashindex_next_key(other, key))) {
        /* The value sits directly behind the key. Step over the key via a
         * char pointer: arithmetic on void * is a GNU extension, not
         * standard C. */
        other_values = (const int32_t *)((const char *)key + key_size);
        my_values = hashindex_get(index, key);
        if(my_values == NULL) {
            /* key unknown to index: take over the entry from other */
            hashindex_set(index, key, other_values);
        } else {
            /* key already known: accumulate the first value field only */
            *my_values += *other_values;
        }
    }
}
3 changes: 1 addition & 2 deletions borg/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,8 +309,7 @@ def create_master_idx(chunk_idx, tf_in, tmp_dir):
tf_in.extract(archive_id_hex, tmp_dir)
chunk_idx_path = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
archive_chunk_idx = ChunkIndex.read(chunk_idx_path)
for chunk_id, (count, size, csize) in archive_chunk_idx.iteritems():
add(chunk_idx, chunk_id, size, csize, incr=count)
chunk_idx.merge(archive_chunk_idx)
os.unlink(chunk_idx_path)

self.begin_txn()
Expand Down
4 changes: 4 additions & 0 deletions borg/hashindex.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ cdef extern from "_hashindex.c":
void hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
long long *unique_size, long long *unique_csize,
long long *total_unique_chunks, long long *total_chunks)
void hashindex_merge(HashIndex *index, HashIndex *other)
int hashindex_get_size(HashIndex *index)
int hashindex_write(HashIndex *index, char *path)
void *hashindex_get(HashIndex *index, void *key)
Expand Down Expand Up @@ -190,6 +191,9 @@ cdef class ChunkIndex(IndexBase):
&total_unique_chunks, &total_chunks)
return total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks

def merge(self, ChunkIndex other not None):
    """Merge all entries of *other* into this index, in place.

    The work is delegated to the C function hashindex_merge().

    *other* must be a ChunkIndex instance. The ``not None`` clause makes
    Cython raise a TypeError for ``None``; without it ``None`` would be
    accepted for the typed argument and ``other.index`` would be an
    invalid access at C level.
    """
    hashindex_merge(self.index, other.index)


cdef class ChunkKeyIterator:
cdef ChunkIndex idx
Expand Down
22 changes: 22 additions & 0 deletions borg/testsuite/hashindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from . import BaseTestCase


def H(x):
    """Return an ASCII bytes key derived from the integer x.

    x is rendered as a decimal number zero-padded to 32 digits, so
    non-negative values of up to 32 digits give exactly 32 bytes.
    """
    padded_digits = '%-0.32d' % x
    return padded_digits.encode('ascii')


class HashIndexTestCase(BaseTestCase):

def _generic_test(self, cls, make_value, sha):
Expand Down Expand Up @@ -78,3 +83,20 @@ def test_iteritems(self):
second_half = list(idx.iteritems(marker=all[49][0]))
self.assert_equal(len(second_half), 50)
self.assert_equal(second_half, all[50:])

def test_chunkindex_merge(self):
    """Merging adds up the counts of shared keys and copies over new keys."""
    # (key number, (count, size, csize)) - the target index has no H(4) entry
    target_entries = (
        (1, (1, 100, 100)),
        (2, (2, 200, 200)),
        (3, (3, 300, 300)),
    )
    # the index being merged in has no H(3) entry
    source_entries = (
        (1, (4, 100, 100)),
        (2, (5, 200, 200)),
        (4, (6, 400, 400)),
    )
    # counts are summed for H(1) and H(2), H(3) stays as it was, H(4) is new
    expected_entries = (
        (1, (5, 100, 100)),
        (2, (7, 200, 200)),
        (3, (3, 300, 300)),
        (4, (6, 400, 400)),
    )

    idx1 = ChunkIndex()
    for number, value in target_entries:
        idx1[H(number)] = value
    idx2 = ChunkIndex()
    for number, value in source_entries:
        idx2[H(number)] = value

    idx1.merge(idx2)

    for number, value in expected_entries:
        assert idx1[H(number)] == value

0 comments on commit a1e039b

Please sign in to comment.