reimplement the chunk index merging in C

the python code could take a rather long time and likely most of it was converting stuff from python to C and back.
borgbackup · Aug 6, 2015 · a1e039b · a1e039b
1 parent 7e21d95
commit a1e039b
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 2 deletions.
diff --git a/borg/_hashindex.c b/borg/_hashindex.c
@@ -385,3 +385,22 @@ hashindex_summarize(HashIndex *index, long long *total_size, long long *total_cs
     *total_unique_chunks = unique_chunks;
     *total_chunks = chunks;
 }
+
+static void
+hashindex_merge(HashIndex *index, HashIndex *other)
+{
+    int32_t key_size = index->key_size;
+    const int32_t *other_values;
+    int32_t *my_values;
+    void *key = NULL;
+
+    while((key = hashindex_next_key(other, key))) {
+        other_values = key + key_size;
+        my_values = hashindex_get(index, key);
+        if(my_values == NULL) {
+            hashindex_set(index, key, other_values);
+        } else {
+            *my_values += *other_values;
+        }
+    }
+}
diff --git a/borg/cache.py b/borg/cache.py
@@ -309,8 +309,7 @@ def create_master_idx(chunk_idx, tf_in, tmp_dir):
                 tf_in.extract(archive_id_hex, tmp_dir)
                 chunk_idx_path = os.path.join(tmp_dir, archive_id_hex).encode('utf-8')
                 archive_chunk_idx = ChunkIndex.read(chunk_idx_path)
-                for chunk_id, (count, size, csize) in archive_chunk_idx.iteritems():
-                    add(chunk_idx, chunk_id, size, csize, incr=count)
+                chunk_idx.merge(archive_chunk_idx)
                 os.unlink(chunk_idx_path)
 
         self.begin_txn()

diff --git a/borg/hashindex.pyx b/borg/hashindex.pyx
@@ -14,6 +14,7 @@ cdef extern from "_hashindex.c":
     void hashindex_summarize(HashIndex *index, long long *total_size, long long *total_csize,
                              long long *unique_size, long long *unique_csize,
                              long long *total_unique_chunks, long long *total_chunks)
+    void hashindex_merge(HashIndex *index, HashIndex *other)
     int hashindex_get_size(HashIndex *index)
     int hashindex_write(HashIndex *index, char *path)
     void *hashindex_get(HashIndex *index, void *key)
@@ -190,6 +191,9 @@ cdef class ChunkIndex(IndexBase):
                             &total_unique_chunks, &total_chunks)
         return total_size, total_csize, unique_size, unique_csize, total_unique_chunks, total_chunks
 
+    def merge(self, ChunkIndex other):
+        hashindex_merge(self.index, other.index)
+
 
 cdef class ChunkKeyIterator:
     cdef ChunkIndex idx

diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py
@@ -6,6 +6,11 @@
 from . import BaseTestCase
 
 
+def H(x):
+    # make some 32byte long thing that depends on x
+    return bytes('%-0.32d' % x, 'ascii')
+
+
 class HashIndexTestCase(BaseTestCase):
 
     def _generic_test(self, cls, make_value, sha):
@@ -78,3 +83,20 @@ def test_iteritems(self):
         second_half = list(idx.iteritems(marker=all[49][0]))
         self.assert_equal(len(second_half), 50)
         self.assert_equal(second_half, all[50:])
+
+    def test_chunkindex_merge(self):
+        idx1 = ChunkIndex()
+        idx1[H(1)] = 1, 100, 100
+        idx1[H(2)] = 2, 200, 200
+        idx1[H(3)] = 3, 300, 300
+        # no H(4) entry
+        idx2 = ChunkIndex()
+        idx2[H(1)] = 4, 100, 100
+        idx2[H(2)] = 5, 200, 200
+        # no H(3) entry
+        idx2[H(4)] = 6, 400, 400
+        idx1.merge(idx2)
+        assert idx1[H(1)] == (5, 100, 100)
+        assert idx1[H(2)] == (7, 200, 200)
+        assert idx1[H(3)] == (3, 300, 300)
+        assert idx1[H(4)] == (6, 400, 400)