Permalink
Browse files

Store metadata in the index, in bupindex.meta; only store unique values.

See DESIGN for more information.

Update the index format header to 'BUPI\0\0\0\5' (version 5).

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
Reviewed-by: Zoran Zaric <zz@zoranzaric.de>
  • Loading branch information...
1 parent 1aa4481 commit 8585613c1f45f3e20feec00b24fc7e3a948fa23e @rlbdv rlbdv committed Nov 13, 2012
Showing with 269 additions and 58 deletions.
  1. +19 −1 DESIGN
  2. +6 −4 Documentation/bup-save.md
  3. +27 −5 cmd/index-cmd.py
  4. +10 −4 cmd/save-cmd.py
  5. +97 −23 lib/bup/index.py
  6. +39 −21 lib/bup/t/tindex.py
  7. +71 −0 t/test-meta.sh
View
20 DESIGN
@@ -395,9 +395,27 @@ it did before the addition of metadata, and restore files using the
tree information.
The nice thing about this design is that you can walk through each
-file in a tree just by opening the tree and the .bupmeta contents, and
+file in a tree just by opening the tree and the .bupm contents, and
iterating through both at the same time.
+Since the contents of any .bupm file should match the state of the
+filesystem when it was *indexed*, bup must record the detailed
+metadata in the index. To do this, bup records four values in the
+index: the atime, mtime, and ctime (as timespecs), and an integer
+offset into a secondary "metadata store" which has the same name as
+the index, but with ".meta" appended. This secondary store contains
+the encoded Metadata object corresponding to each path in the index.
+
+Currently, in order to decrease the storage required for the metadata
+store, bup only writes unique values there, reusing offsets when
+appropriate across the index. The effectiveness of this approach
+relies on the expectation that there will be many duplicate metadata
+records. Storing the full timestamps in the index is intended to make
+that more likely, because it makes it unnecessary to record those
+values in the secondary store. So bup clears them before encoding the
+Metadata objects destined for the index, and timestamp differences
+don't contribute to the uniqueness of the metadata.
+
Bup supports recording and restoring hardlinks, and it does so by
tracking sets of paths that correspond to the same dev/inode pair when
indexing. This information is stored in an optional file with the
View
@@ -21,10 +21,12 @@ first update the index using `bup index`. The reasons
for separating the two steps are described in the man page
for `bup-index`(1).
-By default, metadata will be saved for every path. However, if
-`--strip`, `--strip-path`, or `--graft` is specified, metadata will
-not be saved for the root directory (*/*). See `bup-restore`(1) for
-more information about the handling of metadata.
+By default, metadata will be saved for every path, and the metadata
+for any unindexed parent directories of indexed paths will be taken
+directly from the filesystem. However, if `--strip`, `--strip-path`,
+or `--graft` is specified, metadata will not be saved for the root
+directory (*/*). See `bup-restore`(1) for more information about the
+handling of metadata.
# OPTIONS
View
@@ -1,6 +1,7 @@
#!/usr/bin/env python
+
import sys, stat, time, os
-from bup import options, git, index, drecurse, hlinkdb
+from bup import metadata, options, git, index, drecurse, hlinkdb
from bup.helpers import *
from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE
@@ -52,7 +53,8 @@ def update_index(top, excluded_paths):
# tmax and start must be epoch nanoseconds.
tmax = (time.time() - 1) * 10**9
ri = index.Reader(indexfile)
- wi = index.Writer(indexfile, tmax)
+ msw = index.MetaStoreWriter(indexfile + '.meta')
+ wi = index.Writer(indexfile, msw, tmax)
rig = IterHelper(ri.iter(name=top))
tstart = int(time.time()) * 10**9
@@ -87,7 +89,22 @@ def hashgen(name):
hlinks.del_path(rig.cur.name)
if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
hlinks.add_path(path, pst.st_dev, pst.st_ino)
- rig.cur.from_stat(pst, tstart)
+ meta = metadata.from_path(path, statinfo=pst)
+ # Clear these so they don't bloat the store -- they're
+ # already in the index (since they vary a lot and they're
+ # fixed length). If you've noticed "tmax", you might
+ # wonder why it's OK to do this, since that code may
+ # adjust (mangle) the index mtime and ctime -- producing
+ # fake values which must not end up in a .bupm. However,
+ # it looks like that shouldn't be possible: (1) When
+ # "save" validates the index entry, it always reads the
# metadata from the filesystem. (2) Metadata is only
+ # read/used from the index if hashvalid is true. (3) index
+ # always invalidates "faked" entries, because "old != new"
+ # in from_stat().
+ meta.ctime = meta.mtime = meta.atime = 0
+ meta_ofs = msw.store(meta)
+ rig.cur.from_stat(pst, meta_ofs, tstart)
if not (rig.cur.flags & index.IX_HASHVALID):
if hashgen:
(rig.cur.gitmode, rig.cur.sha) = hashgen(path)
@@ -97,7 +114,11 @@ def hashgen(name):
rig.cur.repack()
rig.next()
else: # new paths
- wi.add(path, pst, hashgen = hashgen)
+ meta = metadata.from_path(path, statinfo=pst)
+ # See same assignment to 0, above, for rationale.
+ meta.atime = meta.mtime = meta.ctime = 0
+ meta_ofs = msw.store(meta)
+ wi.add(path, pst, meta_ofs, hashgen = hashgen)
if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
hlinks.add_path(path, pst.st_dev, pst.st_ino)
@@ -115,7 +136,7 @@ def hashgen(name):
check_index(ri)
log('check: before merging: newfile\n')
check_index(wr)
- mi = index.Writer(indexfile, tmax)
+ mi = index.Writer(indexfile, msw, tmax)
for e in index.merge(ri, wr):
# FIXME: shouldn't we remove deleted entries eventually? When?
@@ -128,6 +149,7 @@ def hashgen(name):
else:
wi.close()
+ msw.close()
hlinks.commit_save()
View
@@ -180,6 +180,7 @@ def progress_report(n):
indexfile = opt.indexfile or git.repo('bupindex')
r = index.Reader(indexfile)
+msr = index.MetaStoreReader(indexfile + '.meta')
hlink_db = hlinkdb.HLinkDB(indexfile + '.hlink')
def already_saved(ent):
@@ -291,6 +292,7 @@ def find_hardlink_target(hlink_db, ent):
if first_root == None:
dir_name, fs_path = dirp[0]
first_root = dirp[0]
+ # Not indexed, so just grab the FS metadata or use empty metadata.
meta = metadata.from_path(fs_path) if fs_path else metadata.Metadata()
_push(dir_name, meta)
elif first_root != dirp[0]:
@@ -303,6 +305,7 @@ def find_hardlink_target(hlink_db, ent):
# If switching to a new sub-tree, start a new sub-tree.
for path_component in dirp[len(parts):]:
dir_name, fs_path = path_component
+ # Not indexed, so just grab the FS metadata or use empty metadata.
meta = metadata.from_path(fs_path) if fs_path else metadata.Metadata()
_push(dir_name, meta)
@@ -330,10 +333,11 @@ def find_hardlink_target(hlink_db, ent):
git_info = (ent.gitmode, git_name, id)
shalists[-1].append(git_info)
sort_key = git.shalist_item_sort_key((ent.mode, file, id))
- hlink = find_hardlink_target(hlink_db, ent)
- metalists[-1].append((sort_key,
- metadata.from_path(ent.name,
- hardlink_target=hlink)))
+ meta = msr.metadata_at(ent.meta_ofs)
+ meta.hardlink_target = find_hardlink_target(hlink_db, ent)
+ # Restore the times that were cleared to 0 in the metastore.
+ (meta.atime, meta.mtime, meta.ctime) = (ent.atime, ent.mtime, ent.ctime)
+ metalists[-1].append((sort_key, meta))
else:
if stat.S_ISREG(ent.mode):
try:
@@ -394,6 +398,7 @@ def find_hardlink_target(hlink_db, ent):
# Finish the root directory.
tree = _pop(force_tree = None,
+ # When there's a collision, use empty metadata for the root.
dir_metadata = metadata.Metadata() if root_collision else None)
if opt.tree:
@@ -404,6 +409,7 @@ def find_hardlink_target(hlink_db, ent):
if opt.commit:
print commit.encode('hex')
+msr.close()
w.close() # must close before we can update the ref
if opt.name:
Oops, something went wrong.

0 comments on commit 8585613

Please sign in to comment.