Permalink
Browse files

Overhaul restore destination handling, and stripping/grafting behavior.

Change restore to respond to source paths like this (assume outdir
corresponds to "." if there is no -C argument, or to -C outdir):

  /foo/what/ever - extract ever to outdir/ever
  /foo/what/ever/ - extract ever/* to outdir/*
  /foo/what/ever/. - extract ever/. to outdir/. (i.e. outdir == ever).

Also fix handling of top-level commit symlinks.  Previously bup would
just restore /foo/latest as a dummy symlink like this:

   latest -> ../.commit/SHA

Instead, dereference latest and restore the target.

Tighten up stripping/grafting with additional argument checks, and
handle any root collisions by creating a fake root dir (see comments
in save-cmd.py).  Bup doesn't yet handle other path collisions,
e.g. if both /foo/bar and /bar are remapped to /bar.

Signed-off-by: Rob Browning <rlb@defaultvalue.org>
Reviewed-by: Zoran Zaric <zz@zoranzaric.de>
  • Loading branch information...
1 parent 66ff902 commit 042eaac10b2650a71f7e8604cfac8213091ec1be @rlbdv rlbdv committed Aug 18, 2012
Showing with 396 additions and 104 deletions.
  1. +56 −8 cmd/restore-cmd.py
  2. +66 −30 cmd/save-cmd.py
  3. +42 −7 lib/bup/helpers.py
  4. +24 −2 lib/bup/t/thelpers.py
  5. +2 −2 lib/bup/vfs.py
  6. +206 −55 t/test.sh
View
@@ -116,8 +116,41 @@ def write_file_content(fullname, n):
outf.close()
+def do_root(n):
+ # Very similar to do_node(), except that this function doesn't
+ # create a path for n's destination directory (and so ignores
+ # n.fullname). It assumes the destination is '.', and restores
+ # n's metadata and content there.
+ global total_restored, opt
+ meta_stream = None
+ try:
+ # Directory metadata is the first entry in any .bupm file in
+ # the directory. Get it.
+ mfile = n.metadata_file() # VFS file -- cannot close().
+ if mfile:
+ meta_stream = mfile.open()
+ meta = metadata.Metadata.read(meta_stream)
+ print_info(n, '.')
+ total_restored += 1
+ plog('Restoring: %d\r' % total_restored)
+ for sub in n:
+ m = None
+ # Don't get metadata if this is a dir -- handled in sub do_node().
+ if meta_stream and not stat.S_ISDIR(sub.mode):
+ m = metadata.Metadata.read(meta_stream)
+ do_node(n, sub, m)
+ if meta:
+ meta.apply_to_path('.', restore_numeric_ids = opt.numeric_ids)
+ finally:
+ if meta_stream:
+ meta_stream.close()
+
+
def do_node(top, n, meta=None):
- # meta will be None for dirs, and when there is no .bupm (i.e. no metadata)
+ # Create n.fullname(), relative to the current directory, and
+ # restore all of its metadata, when available. The meta argument
+ # will be None for dirs, or when there is no .bupm (i.e. no
+ # metadata).
global total_restored, opt
meta_stream = None
try:
@@ -152,8 +185,7 @@ def do_node(top, n, meta=None):
m = metadata.Metadata.read(meta_stream)
do_node(top, sub, m)
if meta and not created_hardlink:
- meta.apply_to_path(fullname,
- restore_numeric_ids=opt.numeric_ids)
+ meta.apply_to_path(fullname, restore_numeric_ids = opt.numeric_ids)
finally:
if meta_stream:
meta_stream.close()
@@ -183,15 +215,31 @@ def do_node(top, n, meta=None):
continue
isdir = stat.S_ISDIR(n.mode)
if not name or name == '.':
- # trailing slash: extract children to cwd
+ # Source is /foo/what/ever/ or /foo/what/ever/. -- extract
+ # what/ever/* to the current directory, and if name == '.'
+ # (i.e. /foo/what/ever/.), then also restore what/ever's
+ # metadata to the current directory.
if not isdir:
add_error('%r: not a directory' % d)
else:
- for sub in n:
- do_node(n, sub)
+ if name == '.':
+ do_root(n)
+ else:
+ for sub in n:
+ do_node(n, sub)
else:
- # no trailing slash: extract node and its children to cwd
- do_node(n.parent, n)
+ # Source is /foo/what/ever -- extract ./ever to cwd.
+ if isinstance(n, vfs.FakeSymlink):
+ # Source is actually /foo/what, i.e. a top-level commit
+ # like /foo/latest, which is a symlink to ../.commit/SHA.
+ # So dereference it, and restore ../.commit/SHA/. to
+ # "./what/.".
+ target = n.dereference()
+ mkdirp(n.name)
+ os.chdir(n.name)
+ do_root(target)
+ else:
+ do_node(n.parent, n)
if not opt.quiet:
progress('Restoring: %d, done.\n' % total_restored)
View
@@ -100,23 +100,33 @@ def eatslash(dir):
# created. The sort_key must be computed using the element's real
# name and mode rather than the git mode and (possibly mangled) name.
-parts = ['']
-shalists = [[]]
-metalists = [[]]
+# Maintain a stack of information representing the current location in
+# the archive being constructed. The current path is recorded in
+# parts, which will be something like ['', 'home', 'someuser'], and
+# the accumulated content and metadata for each of the dirs in parts is
+# stored in parallel stacks in shalists and metalists.
+
+parts = [] # Current archive position (stack of dir names).
+shalists = [] # Hashes for each dir in parts.
+metalists = [] # Metadata for each dir in parts.
+
def _push(part, metadata):
- assert(part)
+ # Enter a new archive directory -- make it the current directory.
parts.append(part)
shalists.append([])
- # First entry is dir metadata, which is represented with an empty name.
- metalists.append([('', metadata)])
+ metalists.append([('', metadata)]) # This dir's metadata (no name).
+
-def _pop(force_tree):
+def _pop(force_tree, dir_metadata=None):
+ # Leave the current archive directory and add its tree to its parent.
assert(len(parts) >= 1)
part = parts.pop()
shalist = shalists.pop()
metalist = metalists.pop()
if metalist:
+ if dir_metadata: # Override the original metadata pushed for this dir.
+ metalist = [('', dir_metadata)] + metalist[1:]
sorted_metalist = sorted(metalist, key = lambda x : x[0])
metadata = ''.join([m[1].encode() for m in sorted_metalist])
shalist.append((0100644, '.bupm', w.new_blob(metadata)))
@@ -126,12 +136,9 @@ def _pop(force_tree):
git.mangle_name(part,
GIT_MODE_TREE, GIT_MODE_TREE),
tree))
- else:
- # This was the toplevel, so put it back for sanity (i.e. cd .. from /).
- shalists.append(shalist)
- metalists.append(metalist)
return tree
+
lastremain = None
def progress_report(n):
global count, subcount, lastremain
@@ -205,6 +212,19 @@ def find_hardlink_target(hlink_db, ent):
progress('Reading index: %d, done.\n' % ftotal)
hashsplit.progress_callback = progress_report
+# Root collisions occur when strip or graft options map more than one
+# path to the same directory (paths which originally had separate
+# parents). When that situation is detected, use empty metadata for
+# the parent. Otherwise, use the metadata for the common parent.
+# Collision example: "bup save ... --strip /foo /foo/bar /bar".
+
+# FIXME: Add collision tests, or handle collisions some other way.
+
+# FIXME: Detect/handle strip/graft name collisions (other than root),
+# i.e. if '/foo/bar' and '/bar' both map to '/'.
+
+first_root = None
+root_collision = None
tstart = time.time()
count = subcount = fcount = 0
lastskip_name = None
@@ -254,21 +274,42 @@ def find_hardlink_target(hlink_db, ent):
else:
dirp = path_components(dir)
+ # At this point, dirp contains a representation of the archive
+ # path that looks like [(archive_dir_name, real_fs_path), ...].
+ # So given "bup save ... --strip /foo/bar /foo/bar/baz", dirp
+ # might look like this at some point:
+ # [('', '/foo/bar'), ('baz', '/foo/bar/baz'), ...].
+
+ # This dual representation supports stripping/grafting, where the
+ # archive path may not have a direct correspondence with the
+ # filesystem. The root directory is represented by an initial
+ # component named '', and any component that doesn't have a
+ # corresponding filesystem directory (due to grafting, for
+ # example) will have a real_fs_path of None, i.e. [('', None),
+ # ...].
+
+ if first_root == None:
+ dir_name, fs_path = dirp[0]
+ first_root = dirp[0]
+ meta = metadata.from_path(fs_path) if fs_path else metadata.Metadata()
+ _push(dir_name, meta)
+ elif first_root != dirp[0]:
+ root_collision = True
+
+ # If switching to a new sub-tree, finish the current sub-tree.
while parts > [x[0] for x in dirp]:
_pop(force_tree = None)
- if dir != '/':
- for path_component in dirp[len(parts):]:
- dir_name, fs_path = path_component
- if fs_path:
- meta = metadata.from_path(fs_path)
- else:
- meta = metadata.Metadata()
- _push(dir_name, meta)
+ # If switching to a new sub-tree, start a new sub-tree.
+ for path_component in dirp[len(parts):]:
+ dir_name, fs_path = path_component
+ meta = metadata.from_path(fs_path) if fs_path else metadata.Metadata()
+ _push(dir_name, meta)
if not file:
- # no filename portion means this is a subdir. But
- # sub/parentdirectories already handled in the pop/push() part above.
+ if len(parts) == 1:
+ continue # We're at the top level -- keep the current root dir
+ # Since there's no filename, this is a subdir -- finish it.
oldtree = already_saved(ent) # may be None
newtree = _pop(force_tree = oldtree)
if not oldtree:
@@ -346,19 +387,14 @@ def find_hardlink_target(hlink_db, ent):
progress('Saving: %.2f%% (%d/%dk, %d/%d files), done. \n'
% (pct, count/1024, total/1024, fcount, ftotal))
-while len(parts) > 1: # _pop() all the parts above the indexed items.
+while len(parts) > 1: # _pop() all the parts above the root
_pop(force_tree = None)
assert(len(shalists) == 1)
assert(len(metalists) == 1)
-if not (opt.strip or opt.strip_path or graft_points):
- # For now, only save metadata for the root directory when there
- # isn't any path grafting or stripping that might create multiple
- # roots.
- shalist = shalists[-1]
- metadata = ''.join([metadata.from_path('/').encode()])
- shalist.append((0100644, '.bupm', w.new_blob(metadata)))
-tree = w.new_tree(shalists[-1])
+# Finish the root directory.
+tree = _pop(force_tree = None,
+ dir_metadata = metadata.Metadata() if root_collision else None)
if opt.tree:
print tree.encode('hex')
View
@@ -647,6 +647,14 @@ def parse_date_or_fatal(str, fatal):
return date
+# FIXME: Carefully consider the use of functions (os.path.*, etc.)
+# that resolve against the current filesystem in the strip/graft
+# functions for example, but elsewhere as well. I suspect bup's not
+# always being careful about that. For some cases, the contents of
+# the current filesystem should be irrelevant, and consulting it might
+# produce the wrong result, perhaps via unintended symlink resolution,
+# for example.
+
def path_components(path):
"""Break path into a list of pairs of the form (name,
full_path_to_name). Path must start with '/'.
@@ -688,20 +696,47 @@ def stripped_path_components(path, strip_prefixes):
def grafted_path_components(graft_points, path):
- # Find the first '/' after the graft prefix, match that to the
- # original source base dir, then move on.
+ # Create a result that consists of some number of faked graft
+ # directories before the graft point, followed by all of the real
+ # directories from path that are after the graft point. Arrange
+ # for the directory at the graft point in the result to correspond
+ # to the "orig" directory in --graft orig=new. See t/thelpers.py
+ # for some examples.
+
+ # Note that given --graft orig=new, orig and new have *nothing* to
+ # do with each other, even if some of their component names
+ # match. i.e. --graft /foo/bar/baz=/foo/bar/bax is semantically
+ # equivalent to --graft /foo/bar/baz=/x/y/z, or even
+ # /foo/bar/baz=/x.
+
+ # FIXME: This can't be the best solution...
clean_path = os.path.abspath(path)
for graft_point in graft_points:
old_prefix, new_prefix = graft_point
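+ # Normalize the prefixes (collapse redundant separators and any
+ # '.' or '..' components). Note that normpath does not make a
+ # relative prefix absolute.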
+ old_prefix = os.path.normpath(old_prefix)
+ new_prefix = os.path.normpath(new_prefix)
if clean_path.startswith(old_prefix):
- grafted_path = re.sub(r'^' + old_prefix, new_prefix,
- clean_path)
- result = [(p, None) for p in grafted_path.split('/')]
- result[-1] = (result[-1][0], clean_path)
+ escaped_prefix = re.escape(old_prefix)
+ grafted_path = re.sub(r'^' + escaped_prefix, new_prefix, clean_path)
+ # Handle /foo=/ (at least) -- which produces //whatever.
+ grafted_path = '/' + grafted_path.lstrip('/')
+ clean_path_components = path_components(clean_path)
+ # Count the components that were stripped.
+ strip_count = 0 if old_prefix == '/' else old_prefix.count('/')
+ new_prefix_parts = new_prefix.split('/')
+ result_prefix = grafted_path.split('/')[:new_prefix.count('/')]
+ result = [(p, None) for p in result_prefix] \
+ + clean_path_components[strip_count:]
+ # Now set the graft point name to match the end of new_prefix.
+ graft_point = len(result_prefix)
+ result[graft_point] = \
+ (new_prefix_parts[-1], clean_path_components[strip_count][1])
+ if new_prefix == '/': # --graft ...=/ is a special case.
+ return result[1:]
return result
return path_components(clean_path)
-
# hashlib is only available in python 2.5 or higher, but the 'sha' module
# produces a DeprecationWarning in python 2.6 or higher. We want to support
# python 2.4 and above without any stupid warnings, so let's try using hashlib
View
@@ -48,10 +48,32 @@ def test_stripped_path_components():
[('', '/foo/bar/baz')])
WVEXCEPT(Exception, stripped_path_components, 'foo', [])
+
@wvtest
def test_grafted_path_components():
WVPASSEQ(grafted_path_components([('/chroot', '/')], '/foo'),
[('', '/'), ('foo', '/foo')])
- WVPASSEQ(grafted_path_components([('/foo/bar', '')], '/foo/bar/baz/bax'),
- [('', None), ('baz', None), ('bax', '/foo/bar/baz/bax')])
+ WVPASSEQ(grafted_path_components([('/foo/bar', '/')], '/foo/bar/baz/bax'),
+ [('', '/foo/bar'),
+ ('baz', '/foo/bar/baz'),
+ ('bax', '/foo/bar/baz/bax')])
+ WVPASSEQ(grafted_path_components([('/foo/bar/baz', '/bax')],
+ '/foo/bar/baz/1/2'),
+ [('', None),
+ ('bax', '/foo/bar/baz'),
+ ('1', '/foo/bar/baz/1'),
+ ('2', '/foo/bar/baz/1/2')])
+ WVPASSEQ(grafted_path_components([('/foo', '/bar/baz/bax')],
+ '/foo/bar'),
+ [('', None),
+ ('bar', None),
+ ('baz', None),
+ ('bax', '/foo'),
+ ('bar', '/foo/bar')])
+ WVPASSEQ(grafted_path_components([('/foo/bar/baz', '/a/b/c')],
+ '/foo/bar/baz'),
+ [('', None), ('a', None), ('b', None), ('c', '/foo/bar/baz')])
+ WVPASSEQ(grafted_path_components([('/', '/a/b/c/')], '/foo/bar'),
+ [('', None), ('a', None), ('b', None), ('c', '/'),
+ ('foo', '/foo'), ('bar', '/foo/bar')])
WVEXCEPT(Exception, grafted_path_components, 'foo', [])
View
@@ -174,8 +174,8 @@ def __init__(self, parent, name, mode, hash):
self._metadata = None
def __repr__(self):
- return "<bup.vfs.Node object at X - name:%r hash:%s parent:%r>" \
- % (self.name, self.hash.encode('hex'),
+ return "<%s object at X - name:%r hash:%s parent:%r>" \
+ % (self.__class__, self.name, self.hash.encode('hex'),
self.parent.name if self.parent else None)
def __cmp__(a, b):
Oops, something went wrong.

0 comments on commit 042eaac

Please sign in to comment.