From 079424b881ecb47ac9c72c06d5459cd281cb3e8c Mon Sep 17 00:00:00 2001 From: Hauke Rehfeld Date: Thu, 30 Apr 2020 00:50:15 +0200 Subject: [PATCH 1/6] list: extract function for hashfunction name to hashfunction --- src/borg/helpers/parseformat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index cea722ee7b..6ee0e32a09 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -834,13 +834,16 @@ def calculate_csize(self, item): # note: does not support hardlink slaves, they will be csize 0 return item.get_size(compressed=True) - def hash_item(self, hash_function, item): - if 'chunks' not in item: - return "" + def prepare_hash_function(self, hash_function): if hash_function in hashlib.algorithms_guaranteed: hash = hashlib.new(hash_function) elif hash_function == 'xxh64': hash = self.xxh64() + return hash + def hash_item(self, hash_function, item): + if 'chunks' not in item: + return "" + hash = self.prepare_hash_function(hash_function) for data in self.archive.pipeline.fetch_many([c.id for c in item.chunks]): hash.update(data) return hash.hexdigest() From bc05f0893f691c3c2632001312527ebd71797f24 Mon Sep 17 00:00:00 2001 From: Hauke Rehfeld Date: Thu, 30 Apr 2020 00:51:18 +0200 Subject: [PATCH 2/6] list: expose checksum over chunkmetadata to list --format --- src/borg/helpers/parseformat.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 6ee0e32a09..5c5f69114d 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -688,6 +688,7 @@ class ItemFormatter(BaseFormatter): ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'), ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'), tuple(sorted(hash_algorithms)), + tuple(('chunk_ids_%s' % alg for alg in sorted(hash_algorithms))), ('archiveid', 'archivename', 'extra'), ('health', ) ) @@ -765,6 +766,7 @@ def __init__(self, archive, format, *, json_lines=False): } for hash_function in self.hash_algorithms: self.add_key(hash_function, partial(self.hash_item, hash_function)) + self.call_keys['chunk_ids_%s' % hash_function] = partial(self.hash_chunks, hash_function) self.used_call_keys = set(self.call_keys) & self.format_keys def format_item_json(self, item): @@ -840,6 +842,17 @@ def prepare_hash_function(self, hash_function): elif hash_function == 'xxh64': hash = self.xxh64() return hash + + def hash_chunks(self, hash_function, item): + if 'chunks' not in item: + return "" + hash = self.prepare_hash_function(hash_function) + for chunk in item.chunks: + hash.update(chunk.id) + hash.update(bytes(chunk.size)) + hash.update(bytes(chunk.csize)) + return hash.hexdigest() + def hash_item(self, hash_function, item): if 'chunks' not in item: return "" From 6c8d1f51021e10fea6dc8f362f51680d0ae2b093 Mon Sep 17 00:00:00 2001 From: Hauke Rehfeld Date: Thu, 30 Apr 2020 16:49:33 +0200 Subject: [PATCH 3/6] list formatter: also add chunker_params_ --- src/borg/helpers/parseformat.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 5c5f69114d..88a49181c1 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -688,7 +688,8 @@ class ItemFormatter(BaseFormatter): ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'), ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'), tuple(sorted(hash_algorithms)), - tuple(('chunk_ids_%s' % alg for alg in sorted(hash_algorithms))), + tuple(['chunk_ids_%s' % alg for alg in sorted(hash_algorithms)] + [ + 'chunker_params_%s' % alg for alg in sorted(hash_algorithms)]), ('archiveid', 'archivename', 'extra'), ('health', ) ) @@ -767,6 +768,7 @@ def __init__(self, archive, format, *, json_lines=False): for hash_function in self.hash_algorithms: self.add_key(hash_function, partial(self.hash_item, hash_function)) self.call_keys['chunk_ids_%s' % hash_function] = partial(self.hash_chunks, hash_function) + self.call_keys['chunker_params_%s' % hash_function] = partial(self.hash_chunker_params, hash_function) self.used_call_keys = set(self.call_keys) & self.format_keys def format_item_json(self, item): @@ -843,6 +845,14 @@ def prepare_hash_function(self, hash_function): hash = self.xxh64() return hash + def hash_chunker_params(self, hash_function, item): + hash = self.prepare_hash_function(hash_function) + + chunker_params = self.archive.metadata.get('chunker_params') + for info in chunker_params: + hash.update(bytes(info)) + return hash.hexdigest() + def hash_chunks(self, hash_function, item): if 'chunks' not in item: return "" From 07c2ae5a150f46042e5911f3c94bbc3154ce0395 Mon Sep 17 00:00:00 2001 From: Hauke Rehfeld Date: Thu, 30 Apr 2020 17:19:55 +0200 Subject: [PATCH 4/6] list formatter: fix: chunk ids are sufficient to hash. --- src/borg/helpers/parseformat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 88a49181c1..2da923fa43 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -859,8 +859,6 @@ def hash_chunks(self, hash_function, item): hash = self.prepare_hash_function(hash_function) for chunk in item.chunks: hash.update(chunk.id) - hash.update(bytes(chunk.size)) - hash.update(bytes(chunk.csize)) return hash.hexdigest() def hash_item(self, hash_function, item): From e760f34d6609f30a6dd398c0be5ef31c6fbe861c Mon Sep 17 00:00:00 2001 From: Hauke Rehfeld Date: Thu, 30 Apr 2020 17:26:29 +0200 Subject: [PATCH 5/6] list formatter: chunker_params are not hashed but given as plaintext --- src/borg/helpers/parseformat.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 2da923fa43..60f8731c41 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -688,8 +688,7 @@ class ItemFormatter(BaseFormatter): ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'), ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'), tuple(sorted(hash_algorithms)), - tuple(['chunk_ids_%s' % alg for alg in sorted(hash_algorithms)] + [ - 'chunker_params_%s' % alg for alg in sorted(hash_algorithms)]), + tuple(['chunker_params', *('chunk_ids_%s' % alg for alg in sorted(hash_algorithms))]), ('archiveid', 'archivename', 'extra'), ('health', ) ) @@ -756,6 +755,7 @@ def __init__(self, archive, format, *, json_lines=False): 'csize': self.calculate_csize, 'dsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.size), 'dcsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.csize), + 'chunker_params': self.hash_chunker_params, 'num_chunks': self.calculate_num_chunks, 'unique_chunks': partial(self.sum_unique_chunks_metadata, lambda chunk: 1), 'isomtime': partial(self.format_iso_time, 'mtime'), @@ -768,7 +768,6 @@ def __init__(self, archive, format, *, json_lines=False): for hash_function in self.hash_algorithms: self.add_key(hash_function, partial(self.hash_item, hash_function)) self.call_keys['chunk_ids_%s' % hash_function] = partial(self.hash_chunks, hash_function) - self.call_keys['chunker_params_%s' % hash_function] = partial(self.hash_chunker_params, hash_function) self.used_call_keys = set(self.call_keys) & self.format_keys def format_item_json(self, item): @@ -845,13 +844,9 @@ def prepare_hash_function(self, hash_function): hash = self.xxh64() return hash - def hash_chunker_params(self, hash_function, item): - hash = self.prepare_hash_function(hash_function) - + def hash_chunker_params(self, item): chunker_params = self.archive.metadata.get('chunker_params') - for info in chunker_params: - hash.update(bytes(info)) - return hash.hexdigest() + return '-'.join(map(repr, chunker_params)) def hash_chunks(self, hash_function, item): if 'chunks' not in item: From d7b3620caa7633ce937aaad0dce063ee2d9cf2f1 Mon Sep 17 00:00:00 2001 From: Hauke Rehfeld Date: Thu, 30 Apr 2020 17:38:21 +0200 Subject: [PATCH 6/6] list formatter: only expose sha256 variant of chunk_ids_checksum --- src/borg/helpers/parseformat.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 60f8731c41..88770b0556 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -688,7 +688,7 @@ class ItemFormatter(BaseFormatter): ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'), ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'), tuple(sorted(hash_algorithms)), - tuple(['chunker_params', *('chunk_ids_%s' % alg for alg in sorted(hash_algorithms))]), + tuple(['chunker_params', 'chunk_ids_checksum']), ('archiveid', 'archivename', 'extra'), ('health', ) ) @@ -756,6 +756,7 @@ def __init__(self, archive, format, *, json_lines=False): 'dsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.size), 'dcsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.csize), 'chunker_params': self.hash_chunker_params, + 'chunk_ids_checksum': self.hash_chunks, 'num_chunks': self.calculate_num_chunks, 'unique_chunks': partial(self.sum_unique_chunks_metadata, lambda chunk: 1), 'isomtime': partial(self.format_iso_time, 'mtime'), @@ -767,7 +768,6 @@ def __init__(self, archive, format, *, json_lines=False): } for hash_function in self.hash_algorithms: self.add_key(hash_function, partial(self.hash_item, hash_function)) - self.call_keys['chunk_ids_%s' % hash_function] = partial(self.hash_chunks, hash_function) self.used_call_keys = set(self.call_keys) & self.format_keys def format_item_json(self, item): @@ -848,10 +848,12 @@ def hash_chunker_params(self, item): chunker_params = self.archive.metadata.get('chunker_params') return '-'.join(map(repr, chunker_params)) - def hash_chunks(self, hash_function, item): + def hash_chunks(self, item): if 'chunks' not in item: return "" - hash = self.prepare_hash_function(hash_function) + hash_function = 'sha256' + assert hash_function in hashlib.algorithms_guaranteed, hashlib.algorithms_guaranteed + hash = hashlib.new(hash_function) for chunk in item.chunks: hash.update(chunk.id) return hash.hexdigest()