Merge pull request #7830 from ThomasWaldmann/typed-repoobjs
replace TAMs by typed repo objects, fixes #7670
ThomasWaldmann committed Sep 30, 2023
2 parents 4a688ee + bd1d734 commit 8fc0944
Showing 23 changed files with 295 additions and 494 deletions.
5 changes: 0 additions & 5 deletions docs/internals/data-structures.rst
@@ -365,7 +365,6 @@
or modified. It looks like this:

                'time': '2017-05-05T12:42:22.942864',
            },
        },
-       'tam': ...,
    }

The *version* field can be either 1 or 2. The versions differ in the
@@ -379,10 +378,6 @@
the repository. It is used by *borg check*, which verifies that all keys
in all items are a subset of these keys. Thus, an older version of *borg check*
supporting this mechanism can correctly detect keys introduced in later versions.

-The *tam* key is part of the :ref:`tertiary authentication mechanism <tam_description>`
-(formerly known as "tertiary authentication for metadata") and authenticates
-the manifest, since an ID check is not possible.
-
*config* is a general-purpose location for additional metadata. All versions
of Borg preserve its contents.
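For orientation, a borg 2 manifest after this change looks roughly like the
following sketch, assembled from the excerpt above and the surrounding docs
(keys shown for version 1; the removed *tam* member is indicated)::

    {
        'version': 1,
        'timestamp': '2017-05-05T12:42:22.942864',
        'item_keys': [...],   # version 1 only; lives in *config* for version 2
        'config': {},
        'archives': {
            '2017-05-05-system-backup': {
                'id': b'<32 byte binary object ID>',
                'time': '2017-05-05T12:42:22.942864',
            },
        },
        # 'tam': ...,   <- removed by this commit; typed repo objects replace it
    }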
54 changes: 16 additions & 38 deletions docs/internals/security.rst
@@ -67,43 +67,26 @@
in a particular part of its own data structure assigns this meaning.
This results in a directed acyclic graph of authentication from the manifest
to the data chunks of individual files.

-.. _tam_description:
+Above used to be all for borg 1.x and was the reason why it needed the
+tertiary authentication mechanism (TAM) for manifest and archives.

-.. rubric:: Authenticating the manifest
+borg 2 now stores the ro_type ("meaning") of a repo object's data into that
+object's metadata (like e.g.: manifest vs. archive vs. user file content data).
+When loading data from the repo, borg verifies that the type of object it got
+matches the type it wanted. borg 2 does not use TAMs any more.

-Since the manifest has a fixed ID (000...000) the aforementioned authentication
-does not apply to it, indeed, cannot apply to it; it is impossible to authenticate
-the root node of a DAG through its edges, since the root node has no incoming edges.
+As both the object's metadata and data are AEAD encrypted and also bound to
+the object ID (via giving the ID as AAD), there is no way an attacker (without
+access to the borg key) could change the type of the object or move content
+to a different object ID.

-With the scheme as described so far an attacker could easily replace the manifest,
-therefore Borg includes a tertiary authentication mechanism (TAM) that is applied
-to the manifest (see :ref:`tam_vuln`).
+This effectively 'anchors' the manifest (and also other metadata, like archives)
+to the key, which is controlled by the client, thereby anchoring the entire DAG,
+making it impossible for an attacker to add, remove or modify any part of the
+DAG without Borg being able to detect the tampering.

-TAM works by deriving a separate key through HKDF_ from the other encryption and
-authentication keys and calculating the HMAC of the metadata to authenticate [#]_::
-
-    # RANDOM(n) returns n random bytes
-    salt = RANDOM(64)
-
-    ikm = id_key || crypt_key
-    # *context* depends on the operation, for manifest authentication it is
-    # the ASCII string "borg-metadata-authentication-manifest".
-    tam_key = HKDF-SHA-512(ikm, salt, context)
-
-    # *data* is a dict-like structure
-    data[hmac] = zeroes
-    packed = pack(data)
-    data[hmac] = HMAC(tam_key, packed)
-    packed_authenticated = pack(data)
-
-Since an attacker cannot gain access to this key and also cannot make the
-client authenticate arbitrary data using this mechanism, the attacker is unable
-to forge the authentication.
-
-This effectively 'anchors' the manifest to the key, which is controlled by the
-client, thereby anchoring the entire DAG, making it impossible for an attacker
-to add, remove or modify any part of the DAG without Borg being able to detect
-the tampering.

Passphrase notes
----------------

Note that when using BORG_PASSPHRASE the attacker cannot swap the *entire*
repository against a new repository with e.g. repokey mode and no passphrase,
@@ -113,11 +96,6 @@
However, interactively a user might not notice this kind of attack
immediately, if she assumes that the reason for the absent passphrase
prompt is a set BORG_PASSPHRASE. See issue :issue:`2169` for details.

-.. [#] The reason why the authentication tag is stored in the packed
-       data itself is that older Borg versions can still read the
-       manifest this way, while a changed layout would have broken
-       compatibility.
.. _security_encryption:

Encryption
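To make the new borg 2 argument concrete, here is a minimal sketch of why an
AEAD cipher with the object ID as associated data detects content that was
moved to a different object ID. This is not Borg's actual code; it uses the
third-party ``cryptography`` package purely for illustration::

    import os
    from cryptography.hazmat.primitives.ciphers.aead import ChaCha20Poly1305
    from cryptography.exceptions import InvalidTag

    key = ChaCha20Poly1305.generate_key()
    aead = ChaCha20Poly1305(key)

    object_id = os.urandom(32)                   # the repo object's ID
    nonce = os.urandom(12)
    plaintext = b'{"type": "F"}' + b"file data"  # metadata (incl. ro_type) + payload

    cdata = aead.encrypt(nonce, plaintext, object_id)  # ciphertext bound to this ID

    try:
        aead.decrypt(nonce, cdata, os.urandom(32))  # replay under a different ID
    except InvalidTag:
        print("tampering detected: object is bound to its original ID")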
102 changes: 39 additions & 63 deletions src/borg/archive.py

Large diffs are not rendered by default.

20 changes: 11 additions & 9 deletions src/borg/archiver/debug_cmd.py
@@ -35,7 +35,7 @@ def do_debug_dump_archive_items(self, args, repository, manifest):
        repo_objs = manifest.repo_objs
        archive = Archive(manifest, args.name)
        for i, item_id in enumerate(archive.metadata.items):
-            _, data = repo_objs.parse(item_id, repository.get(item_id))
+            _, data = repo_objs.parse(item_id, repository.get(item_id), ro_type=ROBJ_ARCHIVE_STREAM)
            filename = "%06d_%s.items" % (i, bin_to_hex(item_id))
            print("Dumping", filename)
            with open(filename, "wb") as fd:
@@ -65,7 +65,8 @@ def output(fd):
            fd.write(do_indent(prepare_dump_dict(archive_meta_orig)))
            fd.write(",\n")

-            _, data = repo_objs.parse(archive_meta_orig["id"], repository.get(archive_meta_orig["id"]))
+            archive_id = archive_meta_orig["id"]
+            _, data = repo_objs.parse(archive_id, repository.get(archive_id), ro_type=ROBJ_ARCHIVE_META)
            archive_org_dict = msgpack.unpackb(data, object_hook=StableDict)

            fd.write(' "_meta":\n')
@@ -77,10 +78,10 @@ def output(fd):
            first = True
            items = []
            for chunk_id in archive_org_dict["item_ptrs"]:
-                _, data = repo_objs.parse(chunk_id, repository.get(chunk_id))
+                _, data = repo_objs.parse(chunk_id, repository.get(chunk_id), ro_type=ROBJ_ARCHIVE_CHUNKIDS)
                items.extend(msgpack.unpackb(data))
            for item_id in items:
-                _, data = repo_objs.parse(item_id, repository.get(item_id))
+                _, data = repo_objs.parse(item_id, repository.get(item_id), ro_type=ROBJ_ARCHIVE_STREAM)
                unpacker.feed(data)
                for item in unpacker:
                    item = prepare_dump_dict(item)
@@ -101,7 +102,7 @@ def output(fd):
    def do_debug_dump_manifest(self, args, repository, manifest):
        """dump decoded repository manifest"""
        repo_objs = manifest.repo_objs
-        _, data = repo_objs.parse(manifest.MANIFEST_ID, repository.get(manifest.MANIFEST_ID))
+        _, data = repo_objs.parse(manifest.MANIFEST_ID, repository.get(manifest.MANIFEST_ID), ro_type=ROBJ_MANIFEST)

        meta = prepare_dump_dict(msgpack.unpackb(data, object_hook=StableDict))

@@ -116,7 +117,7 @@ def do_debug_dump_repo_objs(self, args, repository):

        def decrypt_dump(i, id, cdata, tag=None, segment=None, offset=None):
            if cdata is not None:
-                _, data = repo_objs.parse(id, cdata)
+                _, data = repo_objs.parse(id, cdata, ro_type=ROBJ_DONTCARE)
            else:
                _, data = {}, b""
            tag_str = "" if tag is None else "_" + tag
@@ -211,7 +212,7 @@ def print_finding(info, wanted, data, offset):
                    break
            for id in ids:
                cdata = repository.get(id)
-                _, data = repo_objs.parse(id, cdata)
+                _, data = repo_objs.parse(id, cdata, ro_type=ROBJ_DONTCARE)

                # try to locate wanted sequence crossing the border of last_data and data
                boundary_data = last_data[-(len(wanted) - 1) :] + data[: len(wanted) - 1]
@@ -284,7 +285,7 @@ def do_debug_parse_obj(self, args, repository, manifest):
            cdata = f.read()

        repo_objs = manifest.repo_objs
-        meta, data = repo_objs.parse(id=id, cdata=cdata)
+        meta, data = repo_objs.parse(id=id, cdata=cdata, ro_type=ROBJ_DONTCARE)

        with open(args.json_path, "w") as f:
            json.dump(meta, f)
@@ -315,7 +316,8 @@ def do_debug_format_obj(self, args, repository, manifest):
            meta = json.load(f)

        repo_objs = manifest.repo_objs
-        data_encrypted = repo_objs.format(id=id, meta=meta, data=data)
+        # TODO: support misc repo object types other than ROBJ_FILE_STREAM
+        data_encrypted = repo_objs.format(id=id, meta=meta, data=data, ro_type=ROBJ_FILE_STREAM)

        with open(args.object_path, "wb") as f:
            f.write(data_encrypted)
7 changes: 4 additions & 3 deletions src/borg/archiver/rcompress_cmd.py
@@ -37,7 +37,7 @@ def find_chunks(repository, repo_objs, stats, ctype, clevel, olevel):
        if not chunk_ids:
            break
        for id, chunk_no_data in zip(chunk_ids, repository.get_many(chunk_ids, read_data=False)):
-            meta = repo_objs.parse_meta(id, chunk_no_data)
+            meta = repo_objs.parse_meta(id, chunk_no_data, ro_type=ROBJ_DONTCARE)
            compr_found = meta["ctype"], meta["clevel"], meta.get("olevel", -1)
            if compr_found != compr_wanted:
                recompress_ids.append(id)
@@ -57,13 +57,14 @@ def process_chunks(repository, repo_objs, stats, recompress_ids, olevel):
    for id, chunk in zip(recompress_ids, repository.get_many(recompress_ids, read_data=True)):
        old_size = len(chunk)
        stats["old_size"] += old_size
-        meta, data = repo_objs.parse(id, chunk)
+        meta, data = repo_objs.parse(id, chunk, ro_type=ROBJ_DONTCARE)
+        ro_type = meta.pop("type", None)
        compr_old = meta["ctype"], meta["clevel"], meta.get("olevel", -1)
        if olevel == -1:
            # if the chunk was obfuscated, but should not be in future, remove related metadata
            meta.pop("olevel", None)
            meta.pop("psize", None)
-        chunk = repo_objs.format(id, meta, data)
+        chunk = repo_objs.format(id, meta, data, ro_type=ro_type)
        compr_done = meta["ctype"], meta["clevel"], meta.get("olevel", -1)
        if compr_done != compr_old:
            # we actually changed something
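The pattern above (parse with ROBJ_DONTCARE, pop the stored type out of the
metadata, hand it back to format()) is what keeps a recompressed chunk's
meaning intact. A self-contained sketch of that flow, with stub functions
standing in for borg's RepoObj and zlib standing in for borg's compressors::

    import zlib

    ROBJ_FILE_STREAM = "F"

    def parse_chunk(chunk):
        # stand-in for repo_objs.parse(..., ro_type=ROBJ_DONTCARE):
        # returns the metadata (including the stored "type") and the raw data
        meta, cdata = chunk
        return dict(meta), zlib.decompress(cdata)

    def format_chunk(meta, data, ro_type):
        # stand-in for repo_objs.format(..., ro_type=...): re-tags the object
        assert ro_type is not None  # a chunk must never lose its type
        return dict(meta, type=ro_type, clevel=9), zlib.compress(data, 9)

    chunk = ({"type": ROBJ_FILE_STREAM, "ctype": 1, "clevel": 1}, zlib.compress(b"data", 1))
    meta, data = parse_chunk(chunk)
    ro_type = meta.pop("type", None)               # remember the object's meaning...
    new_chunk = format_chunk(meta, data, ro_type)  # ...and re-apply it when re-formatting
    assert new_chunk[0]["type"] == ROBJ_FILE_STREAM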
4 changes: 3 additions & 1 deletion src/borg/archiver/tar_cmds.py
@@ -115,7 +115,9 @@ def item_content_stream(item):
        """
        Return a file-like object that reads from the chunks of *item*.
        """
-        chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _ in item.chunks], is_preloaded=True)
+        chunk_iterator = archive.pipeline.fetch_many(
+            [chunk_id for chunk_id, _ in item.chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
+        )
        if pi:
            info = [remove_surrogates(item.path)]
            return ChunkIteratorFileWrapper(
18 changes: 15 additions & 3 deletions src/borg/archiver/transfer_cmd.py
@@ -111,7 +111,11 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non
                        # keep compressed payload same, verify via assert_id (that will
                        # decompress, but avoid needing to compress it again):
                        meta, data = other_manifest.repo_objs.parse(
-                            chunk_id, cdata, decompress=True, want_compressed=True
+                            chunk_id,
+                            cdata,
+                            decompress=True,
+                            want_compressed=True,
+                            ro_type=ROBJ_FILE_STREAM,
                        )
                        meta, data = upgrader.upgrade_compressed_chunk(meta, data)
                        chunk_entry = cache.add_chunk(
@@ -124,12 +128,20 @@ def do_transfer(self, args, *, repository, manifest, cache, other_repository=Non
                            size=size,
                            ctype=meta["ctype"],
                            clevel=meta["clevel"],
+                            ro_type=ROBJ_FILE_STREAM,
                        )
                    elif args.recompress == "always":
                        # always decompress and re-compress file data chunks
-                        meta, data = other_manifest.repo_objs.parse(chunk_id, cdata)
+                        meta, data = other_manifest.repo_objs.parse(
+                            chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
+                        )
                        chunk_entry = cache.add_chunk(
-                            chunk_id, meta, data, stats=archive.stats, wait=False
+                            chunk_id,
+                            meta,
+                            data,
+                            stats=archive.stats,
+                            wait=False,
+                            ro_type=ROBJ_FILE_STREAM,
                        )
                    else:
                        raise ValueError(f"unsupported recompress mode: {args.recompress}")
29 changes: 23 additions & 6 deletions src/borg/cache.py
@@ -12,7 +12,7 @@

files_cache_logger = create_logger("borg.debug.files_cache")

-from .constants import CACHE_README, FILES_CACHE_MODE_DISABLED
+from .constants import CACHE_README, FILES_CACHE_MODE_DISABLED, ROBJ_FILE_STREAM
from .hashindex import ChunkIndex, ChunkIndexEntry, CacheSynchronizer
from .helpers import Location
from .helpers import Error
@@ -755,7 +755,7 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
            nonlocal processed_item_metadata_chunks
            csize, data = decrypted_repository.get(archive_id)
            chunk_idx.add(archive_id, 1, len(data))
-            archive, _ = self.key.unpack_and_verify_archive(data)
+            archive = self.key.unpack_archive(data)
            archive = ArchiveItem(internal_dict=archive)
            if archive.version not in (1, 2):  # legacy
                raise Exception("Unknown archive metadata version")
@@ -939,7 +939,21 @@ def update_compatibility(self):
        self.cache_config.ignored_features.update(repo_features - my_features)
        self.cache_config.mandatory_features.update(repo_features & my_features)

-    def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None, ctype=None, clevel=None):
+    def add_chunk(
+        self,
+        id,
+        meta,
+        data,
+        *,
+        stats,
+        wait=True,
+        compress=True,
+        size=None,
+        ctype=None,
+        clevel=None,
+        ro_type=ROBJ_FILE_STREAM,
+    ):
+        assert ro_type is not None
        if not self.txn_active:
            self.begin_txn()
        if size is None and compress:
@@ -949,7 +963,9 @@ def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=Non
            return self.chunk_incref(id, stats)
        if size is None:
            raise ValueError("when giving compressed data for a new chunk, the uncompressed size must be given also")
-        cdata = self.repo_objs.format(id, meta, data, compress=compress, size=size, ctype=ctype, clevel=clevel)
+        cdata = self.repo_objs.format(
+            id, meta, data, compress=compress, size=size, ctype=ctype, clevel=clevel, ro_type=ro_type
+        )
        self.repository.put(id, cdata, wait=wait)
        self.chunks.add(id, 1, size)
        stats.update(size, not refcount)
@@ -1113,7 +1129,8 @@ def file_known_and_unchanged(self, hashed_path, path_hash, st):
    def memorize_file(self, hashed_path, path_hash, st, ids):
        pass

-    def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None):
+    def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=None, ro_type=ROBJ_FILE_STREAM):
+        assert ro_type is not None
        if not self._txn_active:
            self.begin_txn()
        if size is None and compress:
@@ -1123,7 +1140,7 @@ def add_chunk(self, id, meta, data, *, stats, wait=True, compress=True, size=Non
        refcount = self.seen_chunk(id, size)
        if refcount:
            return self.chunk_incref(id, stats, size=size)
-        cdata = self.repo_objs.format(id, meta, data, compress=compress)
+        cdata = self.repo_objs.format(id, meta, data, compress=compress, ro_type=ro_type)
        self.repository.put(id, cdata, wait=wait)
        self.chunks.add(id, 1, size)
        stats.update(size, not refcount)
8 changes: 8 additions & 0 deletions src/borg/constants.py
@@ -33,6 +33,14 @@
# forcing to 0o100XXX later
STDIN_MODE_DEFAULT = 0o660

+# RepoObj types
+ROBJ_MANIFEST = "M"  # Manifest (directory of archives, other metadata) object
+ROBJ_ARCHIVE_META = "A"  # main archive metadata object
+ROBJ_ARCHIVE_CHUNKIDS = "C"  # objects with a list of archive metadata stream chunkids
+ROBJ_ARCHIVE_STREAM = "S"  # archive metadata stream chunk (containing items)
+ROBJ_FILE_STREAM = "F"  # file content stream chunk (containing user data)
+ROBJ_DONTCARE = "*"  # used to parse without type assertion (= accept any type)
+
# in borg < 1.3, this has been defined like this:
# 20 MiB minus 41 bytes for a PUT header (because the "size" field in the Repository includes
# the header, and the total size was set to precisely 20 MiB for borg < 1.3).
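A toy round-trip illustrating what these constants buy: the writer stores the
type tag in the object's metadata, and the reader asserts it got the kind of
object it asked for. Illustration only (msgpack is a borg dependency); borg's
real RepoObj additionally compresses, encrypts and binds the object ID via AEAD::

    import msgpack

    ROBJ_MANIFEST = "M"
    ROBJ_FILE_STREAM = "F"
    ROBJ_DONTCARE = "*"

    def format_obj(meta: dict, data: bytes, ro_type: str) -> bytes:
        meta = dict(meta, type=ro_type)  # persist the object's meaning
        packed_meta = msgpack.packb(meta)
        return len(packed_meta).to_bytes(4, "big") + packed_meta + data

    def parse_obj(cdata: bytes, ro_type: str):
        meta_len = int.from_bytes(cdata[:4], "big")
        meta = msgpack.unpackb(cdata[4 : 4 + meta_len])
        if ro_type != ROBJ_DONTCARE and meta.get("type") != ro_type:
            raise ValueError(f"ro_type mismatch: wanted {ro_type!r}, got {meta.get('type')!r}")
        return meta, cdata[4 + meta_len :]

    cdata = format_obj({"size": 5}, b"hello", ROBJ_FILE_STREAM)
    meta, data = parse_obj(cdata, ROBJ_FILE_STREAM)  # ok: types match
    meta, data = parse_obj(cdata, ROBJ_DONTCARE)     # ok: any type accepted
    # parse_obj(cdata, ROBJ_MANIFEST) would raise: the stored type is "F"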
