From c078168c16a21eb6cabf02f75a918a0874793976 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 21:34:51 -0800
Subject: [PATCH 01/12] options.py: make --usage just print the usage message.

This is a relatively common option in other programs, so let's make it work
in case someone tries to use it.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/options.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/bup/options.py b/lib/bup/options.py
index ec0f6ed67..9be711106 100644
--- a/lib/bup/options.py
+++ b/lib/bup/options.py
@@ -115,7 +115,7 @@ def __init__(self, optspec, optfunc=getopt.gnu_getopt,
         self.optfunc = optfunc
         self._aliases = {}
         self._shortopts = 'h?'
-        self._longopts = ['help']
+        self._longopts = ['help', 'usage']
         self._hasparms = {}
         self._defaults = {}
         self._usagestr = self._gen_usage()
@@ -214,7 +214,7 @@ def parse(self, args):
 
         for (k,v) in flags:
             k = k.lstrip('-')
-            if k in ('h', '?', 'help'):
+            if k in ('h', '?', 'help', 'usage'):
                 self.usage()
             if k.startswith('no-'):
                 k = self._aliases[k[3:]]

From 5a9bc0a6c31d2d0824261685b6ce7a359729436d Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 21:37:16 -0800
Subject: [PATCH 02/12] options.py: o.fatal(): print error after, not before,
 usage message.

git prints the error *before* the usage message, but the more I play with
it, the more I'm annoyed by that behaviour.  The usage message can be pretty
long, and the error gets lost way above the usage message.  The most
important thing *is* the error, so let's print it last.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/options.py |  7 ++++---
 main.py            | 13 ++++++-------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/lib/bup/options.py b/lib/bup/options.py
index 9be711106..c9a6cf52e 100644
--- a/lib/bup/options.py
+++ b/lib/bup/options.py
@@ -184,14 +184,15 @@ def _gen_usage(self):
     def usage(self, msg=""):
         """Print usage string to stderr and abort."""
         sys.stderr.write(self._usagestr)
+        if msg:
+            sys.stderr.write(msg)
         e = self._onabort and self._onabort(msg) or None
         if e:
             raise e
 
-    def fatal(self, s):
+    def fatal(self, msg):
         """Print an error message to stderr and abort with usage string."""
-        msg = 'error: %s\n' % s
-        sys.stderr.write(msg)
+        msg = '\nerror: %s\n' % msg
         return self.usage(msg)
 
     def parse(self, args):
diff --git a/main.py b/main.py
index bd6fb0b89..c8ffce287 100755
--- a/main.py
+++ b/main.py
@@ -29,7 +29,7 @@
 # after running 'bup newliner', the tty_width() ioctl won't work anymore
 os.environ['WIDTH'] = str(tty_width())
 
-def usage():
+def usage(msg=""):
     log('Usage: bup [-?|--help] [-d BUP_DIR] [--debug] [--profile] '
         '<command> [options...]\n\n')
     common = dict(
@@ -62,6 +62,8 @@ def usage():
     
     log("See 'bup help COMMAND' for more information on " +
         "a specific command.\n")
+    if msg:
+        log("\n%s\n" % msg)
     sys.exit(99)
 
 
@@ -73,8 +75,7 @@ def usage():
     optspec = ['help', 'version', 'debug', 'profile', 'bup-dir=']
     global_args, subcmd = getopt.getopt(argv[1:], '?VDd:', optspec)
 except getopt.GetoptError, ex:
-    log('error: ' + ex.msg + '\n')
-    usage()
+    usage('error: %s' % ex.msg)
 
 help_requested = None
 dest_dir = None
@@ -93,8 +94,7 @@ def usage():
     elif opt[0] in ['-d', '--bup-dir']:
         dest_dir = opt[1]
     else:
-        log('error: unexpected option "%s"\n' % opt[0])
-        usage()
+        usage('error: unexpected option "%s"' % opt[0])
 
 if len(subcmd) == 0:
     if help_requested:
@@ -124,8 +124,7 @@ def subpath(s):
 
 subcmd[0] = subpath(subcmd_name)
 if not os.path.exists(subcmd[0]):
-    log('error: unknown command "%s"\n' % subcmd_name)
-    usage()
+    usage('error: unknown command "%s"' % subcmd_name)
 
 already_fixed = atoi(os.environ.get('BUP_FORCE_TTY'))
 if subcmd_name in ['mux', 'ftp', 'help']:

From 7976e29118c97dd1f8dc3a733e8a74f6a95c0431 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Thu, 17 Feb 2011 01:56:31 -0800
Subject: [PATCH 03/12] hashsplit.py: simplify code and fix BLOB_MAX handling.

This reduces the number of lines without removing functionality.  I renamed
a few constants to make more sense.

The only functional change is that BLOB_MAX is now an actual maximum instead
of a variable number depending on buf.used().  Previously, it might have
been as large as BLOB_READ_SIZE = 1MB, which is much larger than BLOB_MAX =
16k.  Now BLOB_MAX is actually the max.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/hashsplit.py | 59 +++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 5de6a3fa1..938fcaaa2 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -2,15 +2,17 @@
 from bup import _helpers
 from bup.helpers import *
 
-BLOB_LWM = 8192*2
-BLOB_MAX = BLOB_LWM*2
-BLOB_HWM = 1024*1024
+BLOB_MAX = 8192*2   # 8192 is the "typical" blob size for bupsplit
+BLOB_READ_SIZE = 1024*1024
 MAX_PER_TREE = 256
 progress_callback = None
 max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
 max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
 fanout = 16
 
+# The purpose of this type of buffer is to avoid copying on peek(), get(),
+# and eat().  We do copy the buffer contents on put(), but that should
+# be ok if we always only put() large amounts of data at a time.
 class Buf:
     def __init__(self):
         self.data = ''
@@ -36,16 +38,7 @@ def used(self):
         return len(self.data) - self.start
 
 
-def splitbuf(buf):
-    b = buf.peek(buf.used())
-    (ofs, bits) = _helpers.splitbuf(b)
-    if ofs:
-        buf.eat(ofs)
-        return (buffer(b, 0, ofs), bits)
-    return (None, 0)
-
-
-def blobiter(files, progress=None):
+def readfile_iter(files, progress=None):
     for filenum,f in enumerate(files):
         ofs = 0
         b = ''
@@ -53,7 +46,7 @@ def blobiter(files, progress=None):
             if progress:
                 progress(filenum, len(b))
             fadvise_done(f, max(0, ofs - 1024*1024))
-            b = f.read(BLOB_HWM)
+            b = f.read(BLOB_READ_SIZE)
             ofs += len(b)
             if not b:
                 fadvise_done(f, ofs)
@@ -61,35 +54,29 @@ def blobiter(files, progress=None):
             yield b
 
 
-def drainbuf(buf, finalize):
+def _splitbuf(buf):
     while 1:
-        (blob, bits) = splitbuf(buf)
-        if blob:
-            yield (blob, bits)
+        b = buf.peek(buf.used())
+        (ofs, bits) = _helpers.splitbuf(b)
+        if ofs:
+            buf.eat(ofs)
+            yield buffer(b, 0, ofs), bits
         else:
             break
     if buf.used() > BLOB_MAX:
         # limit max blob size
-        yield (buf.get(buf.used()), 0)
-    elif finalize and buf.used():
-        yield (buf.get(buf.used()), 0)
+        yield buf.get(BLOB_MAX), 0
 
 
 def _hashsplit_iter(files, progress):
-    assert(BLOB_HWM > BLOB_MAX)
+    assert(BLOB_READ_SIZE > BLOB_MAX)
     buf = Buf()
-    fi = blobiter(files, progress)
-    while 1:
-        for i in drainbuf(buf, finalize=False):
-            yield i
-        while buf.used() < BLOB_HWM:
-            bnew = next(fi)
-            if not bnew:
-                # eof
-                for i in drainbuf(buf, finalize=True):
-                    yield i
-                return
-            buf.put(bnew)
+    for inblock in readfile_iter(files, progress):
+        buf.put(inblock)
+        for buf_and_bits in _splitbuf(buf):
+            yield buf_and_bits
+    if buf.used():
+        yield buf.get(buf.used()), 0
 
 
 def _hashsplit_iter_keep_boundaries(files, progress):
@@ -101,8 +88,8 @@ def prog(filenum, nbytes):
                 return progress(real_filenum, nbytes)
         else:
             prog = None
-        for i in _hashsplit_iter([f], progress=prog):
-            yield i
+        for buf_and_bits in _hashsplit_iter([f], progress=prog):
+            yield buf_and_bits
 
 
 def hashsplit_iter(files, keep_boundaries, progress):

From 84f4cf05c68f0fa3e594542520e9c71e459bfb66 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 20:33:36 -0800
Subject: [PATCH 04/12] hashsplit.py: okay, *really* fix BLOB_MAX.

In some conditions, we were still splitting into blobs larger than BLOB_MAX.
Fix that too.

Unfortunately adding an assertion about it in the 'bup split' main loop
slows things down by a measurable amount, so I can't easily add that to
prevent this from happening by accident again in the future.

After implementing this, it looks like 8192 (typical blob size) times two
isn't big enough to prevent this from kicking in in "normal" cases; let's
use 4x instead.  In my test file, we exceed this maximum much less.  (Every
time we exceed BLOB_MAX, it means the bupsplit algorithm isn't working, so
we won't be deduplicating as effectively.  So we want that to be rare.)

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/hashsplit.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 938fcaaa2..6134b6111 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -2,7 +2,7 @@
 from bup import _helpers
 from bup.helpers import *
 
-BLOB_MAX = 8192*2   # 8192 is the "typical" blob size for bupsplit
+BLOB_MAX = 8192*4   # 8192 is the "typical" blob size for bupsplit
 BLOB_READ_SIZE = 1024*1024
 MAX_PER_TREE = 256
 progress_callback = None
@@ -58,12 +58,14 @@ def _splitbuf(buf):
     while 1:
         b = buf.peek(buf.used())
         (ofs, bits) = _helpers.splitbuf(b)
+        if ofs > BLOB_MAX:
+            ofs = BLOB_MAX
         if ofs:
             buf.eat(ofs)
             yield buffer(b, 0, ofs), bits
         else:
             break
-    if buf.used() > BLOB_MAX:
+    while buf.used() >= BLOB_MAX:
         # limit max blob size
         yield buf.get(BLOB_MAX), 0
 

From eb65206d3d1eada4c7c291ee7433c3bfad350373 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Thu, 17 Feb 2011 02:30:47 -0800
Subject: [PATCH 05/12] hashsplit.py: convert from 'bits' to 'level' earlier in
 the sequence.

The hierarchy level is a more directly useful measurement than the bit count,
although right now neither is used very heavily.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 cmd/split-cmd.py     |  7 ++++---
 lib/bup/_helpers.c   |  1 +
 lib/bup/hashsplit.py | 33 +++++++++++++++------------------
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py
index 756d1b566..e016d1e8c 100755
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -130,9 +130,10 @@ def read_ids():
     tree = pack_writer.new_tree(shalist)
 else:
     last = 0
-    for (blob, bits) in hashsplit.hashsplit_iter(files,
-                                    keep_boundaries=opt.keep_boundaries,
-                                    progress=prog):
+    it = hashsplit.hashsplit_iter(files,
+                                  keep_boundaries=opt.keep_boundaries,
+                                  progress=prog)
+    for (blob, level) in it:
         hashsplit.total_split += len(blob)
         if opt.copy:
             sys.stdout.write(str(blob))
diff --git a/lib/bup/_helpers.c b/lib/bup/_helpers.c
index 4d12ddfe9..d077cd971 100644
--- a/lib/bup/_helpers.c
+++ b/lib/bup/_helpers.c
@@ -87,6 +87,7 @@ static PyObject *splitbuf(PyObject *self, PyObject *args)
     if (!PyArg_ParseTuple(args, "t#", &buf, &len))
 	return NULL;
     out = bupsplit_find_ofs(buf, len, &bits);
+    if (out) assert(bits >= BUP_BLOBBITS);
     return Py_BuildValue("ii", out, bits);
 }
 
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 6134b6111..f9d5a4dfe 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -54,7 +54,7 @@ def readfile_iter(files, progress=None):
             yield b
 
 
-def _splitbuf(buf):
+def _splitbuf(buf, basebits, fanbits):
     while 1:
         b = buf.peek(buf.used())
         (ofs, bits) = _helpers.splitbuf(b)
@@ -62,7 +62,8 @@ def _splitbuf(buf):
             ofs = BLOB_MAX
         if ofs:
             buf.eat(ofs)
-            yield buffer(b, 0, ofs), bits
+            level = (bits-basebits)//fanbits  # integer division
+            yield buffer(b, 0, ofs), level
         else:
             break
     while buf.used() >= BLOB_MAX:
@@ -72,11 +73,13 @@ def _splitbuf(buf):
 
 def _hashsplit_iter(files, progress):
     assert(BLOB_READ_SIZE > BLOB_MAX)
+    basebits = _helpers.blobbits()
+    fanbits = int(math.log(fanout or 128, 2))
     buf = Buf()
     for inblock in readfile_iter(files, progress):
         buf.put(inblock)
-        for buf_and_bits in _splitbuf(buf):
-            yield buf_and_bits
+        for buf_and_level in _splitbuf(buf, basebits, fanbits):
+            yield buf_and_level
     if buf.used():
         yield buf.get(buf.used()), 0
 
@@ -90,8 +93,8 @@ def prog(filenum, nbytes):
                 return progress(real_filenum, nbytes)
         else:
             prog = None
-        for buf_and_bits in _hashsplit_iter([f], progress=prog):
-            yield buf_and_bits
+        for buf_and_level in _hashsplit_iter([f], progress=prog):
+            yield buf_and_level
 
 
 def hashsplit_iter(files, keep_boundaries, progress):
@@ -104,14 +107,14 @@ def hashsplit_iter(files, keep_boundaries, progress):
 total_split = 0
 def _split_to_blobs(w, files, keep_boundaries, progress):
     global total_split
-    for (blob, bits) in hashsplit_iter(files, keep_boundaries, progress):
+    for (blob, level) in hashsplit_iter(files, keep_boundaries, progress):
         sha = w.new_blob(blob)
         total_split += len(blob)
         if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
             w.breakpoint()
         if progress_callback:
             progress_callback(len(blob))
-        yield (sha, len(blob), bits)
+        yield (sha, len(blob), level)
 
 
 def _make_shalist(l):
@@ -143,21 +146,15 @@ def split_to_shalist(w, files, keep_boundaries, progress=None):
     sl = _split_to_blobs(w, files, keep_boundaries, progress)
     if not fanout:
         shal = []
-        for (sha,size,bits) in sl:
+        for (sha,size,level) in sl:
             shal.append(('100644', sha, size))
         return _make_shalist(shal)[0]
     else:
-        base_bits = _helpers.blobbits()
-        fanout_bits = int(math.log(fanout, 2))
-        def bits_to_idx(n):
-            assert(n >= base_bits)
-            return (n - base_bits)/fanout_bits
         stacks = [[]]
-        for (sha,size,bits) in sl:
-            assert(bits <= 32)
+        for (sha,size,level) in sl:
             stacks[0].append(('100644', sha, size))
-            if bits > base_bits:
-                _squish(w, stacks, bits_to_idx(bits))
+            if level:
+                _squish(w, stacks, level)
         #log('stacks: %r\n' % [len(i) for i in stacks])
         _squish(w, stacks, len(stacks)-1)
         #log('stacks: %r\n' % [len(i) for i in stacks])

From a90f2ef771a19fcaf514aac9e7160674f1536eb0 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Thu, 17 Feb 2011 03:10:23 -0800
Subject: [PATCH 06/12] cmd/split: fixup progress message, and print -b output
 incrementally.

As a side effect, you can no longer combine -b with -t, -c, or -n.  But that
was kind of a pointless thing to do anyway, because it silently enforced
--fanout=0, which is almost certainly not what you wanted, precisely if you
were using -t, -c, or -n.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 cmd/split-cmd.py     | 30 ++++++++++++++++++------------
 lib/bup/hashsplit.py |  5 +++--
 lib/bup/helpers.py   | 16 +++++++++++++++-
 t/test.sh            |  2 +-
 4 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py
index e016d1e8c..363896f79 100755
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -5,20 +5,22 @@
 
 
 optspec = """
-bup split [-tcb] [-n name] [--bench] [filenames...]
+bup split <-t|-c|-b|-n name|--copy|--noop> [--bench] [filenames...]
 --
-r,remote=  remote repository path
+ Modes:
 b,blobs    output a series of blob ids
 t,tree     output a tree id
 c,commit   output a commit id
-n,name=    name of backup set to update (if any)
+n,name=    save the result under the given name
+noop       split the input, but throw away the result
+copy       split the input, copy it to stdout, don't save to repo
+ Options:
+r,remote=  remote repository path
 d,date=    date for the commit (seconds since the epoch)
 q,quiet    don't print progress messages
 v,verbose  increase log output (can be used more than once)
 git-ids    read a list of git object ids from stdin and split their contents
 keep-boundaries  don't let one chunk span two input files
-noop       don't actually save the data anywhere
-copy       just copy input to output, hashsplitting along the way
 bench      print benchmark timings to stderr
 max-pack-size=  maximum bytes in a single pack
 max-pack-objects=  maximum number of objects in a single pack
@@ -36,6 +38,8 @@
 if (opt.noop or opt.copy) and (opt.blobs or opt.tree or 
                                opt.commit or opt.name):
     o.fatal('-N and --copy are incompatible with -b, -t, -c, -n')
+if opt.blobs and (opt.tree or opt.commit or opt.name):
+    o.fatal('-b is incompatible with -t, -c, -n')
 if extra and opt.git_ids:
     o.fatal("don't provide filenames when using --git-ids")
 
@@ -123,7 +127,14 @@ def read_ids():
     # the input either comes from a series of files or from stdin.
     files = extra and (open(fn) for fn in extra) or [sys.stdin]
 
-if pack_writer:
+if pack_writer and opt.blobs:
+    shalist = hashsplit.split_to_blobs(pack_writer, files,
+                                       keep_boundaries=opt.keep_boundaries,
+                                       progress=prog)
+    for (sha, size, level) in shalist:
+        print sha.encode('hex')
+        reprogress()
+elif pack_writer:  # tree or commit or name
     shalist = hashsplit.split_to_shalist(pack_writer, files,
                                          keep_boundaries=opt.keep_boundaries,
                                          progress=prog)
@@ -139,15 +150,10 @@ def read_ids():
             sys.stdout.write(str(blob))
         megs = hashsplit.total_split/1024/1024
         if not opt.quiet and last != megs:
-            progress('%d Mbytes read\r' % megs)
             last = megs
-    progress('%d Mbytes read, done.\n' % megs)
 
 if opt.verbose:
     log('\n')
-if opt.blobs:
-    for (mode,name,bin) in shalist:
-        print bin.encode('hex')
 if opt.tree:
     print tree.encode('hex')
 if opt.commit or opt.name:
@@ -172,7 +178,7 @@ def read_ids():
 secs = time.time() - start_time
 size = hashsplit.total_split
 if opt.bench:
-    log('\nbup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
+    log('bup: %.2fkbytes in %.2f secs = %.2f kbytes/sec\n'
         % (size/1024., secs, size/1024./secs))
 
 if saved_errors:
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index f9d5a4dfe..439c63db6 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -105,7 +105,7 @@ def hashsplit_iter(files, keep_boundaries, progress):
 
 
 total_split = 0
-def _split_to_blobs(w, files, keep_boundaries, progress):
+def split_to_blobs(w, files, keep_boundaries, progress):
     global total_split
     for (blob, level) in hashsplit_iter(files, keep_boundaries, progress):
         sha = w.new_blob(blob)
@@ -143,7 +143,8 @@ def _squish(w, stacks, n):
 
 
 def split_to_shalist(w, files, keep_boundaries, progress=None):
-    sl = _split_to_blobs(w, files, keep_boundaries, progress)
+    sl = split_to_blobs(w, files, keep_boundaries, progress)
+    assert(fanout != 0)
     if not fanout:
         shal = []
         for (sha,size,level) in sl:
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index 5b0028364..ed976bfe3 100644
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -67,16 +67,20 @@ def debug2(s):
 
 
 istty = os.isatty(2) or atoi(os.environ.get('BUP_FORCE_TTY'))
+_last_progress = ''
 def progress(s):
     """Calls log() if stderr is a TTY.  Does nothing otherwise."""
+    global _last_progress
     if istty:
         log(s)
+        _last_progress = s
 
 
 def qprogress(s):
     """Calls progress() only if we haven't printed progress in a while.
     
-    This avoids overloading the stderr buffer with excess junk."""
+    This avoids overloading the stderr buffer with excess junk.
+    """
     global _last_prog
     now = time.time()
     if now - _last_prog > 0.1:
@@ -84,6 +88,16 @@ def qprogress(s):
         _last_prog = now
 
 
+def reprogress():
+    """Calls progress() to redisplay the most recent progress message.
+
+    Useful after you've printed some other message that wipes out the
+    progress line.
+    """
+    if _last_progress and _last_progress.endswith('\r'):
+        progress(_last_progress)
+
+
 def mkdirp(d, mode=None):
     """Recursively create directories on path 'd'.
 
diff --git a/t/test.sh b/t/test.sh
index 7186710c2..9b5e55aad 100755
--- a/t/test.sh
+++ b/t/test.sh
@@ -208,7 +208,7 @@ WVSTART "save/git-fsck"
     #git prune
     (cd "$TOP/t/sampledata" && WVPASS bup save -vvn master /) || WVFAIL
     n=$(git fsck --full --strict 2>&1 | 
-      egrep -v 'dangling (commit|tree)' |
+      egrep -v 'dangling (commit|tree|blob)' |
       tee -a /dev/stderr | 
       wc -l)
     WVPASS [ "$n" -eq 0 ]

From 9133f733cdde36d7ecd627d339f90d87b7d2b0e6 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Thu, 17 Feb 2011 04:22:50 -0800
Subject: [PATCH 07/12] hashsplit.py: remove PackWriter-specific knowledge.

Let's use callback functions explicitly instead of passing around special
objects; that makes the dependencies a bit more clear and hopefully opens
the way to some more refactoring for clarity.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 cmd/save-cmd.py      |  3 ++-
 cmd/split-cmd.py     | 10 ++++++----
 lib/bup/client.py    |  1 -
 lib/bup/git.py       |  6 +++++-
 lib/bup/hashsplit.py | 30 ++++++++++++++----------------
 5 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py
index 8c2ba40ab..7219ab4b5 100755
--- a/cmd/save-cmd.py
+++ b/cmd/save-cmd.py
@@ -263,7 +263,8 @@ def wantrecurse_during(ent):
                 lastskip_name = ent.name
             else:
                 try:
-                    (mode, id) = hashsplit.split_to_blob_or_tree(w, [f],
+                    (mode, id) = hashsplit.split_to_blob_or_tree(
+                                            w.new_blob, w.new_tree, [f],
                                             keep_boundaries=False)
                 except IOError, e:
                     add_error('%s: %s' % (ent.name, e))
diff --git a/cmd/split-cmd.py b/cmd/split-cmd.py
index 363896f79..b243c6140 100755
--- a/cmd/split-cmd.py
+++ b/cmd/split-cmd.py
@@ -47,9 +47,9 @@
     git.verbose = opt.verbose - 1
     opt.bench = 1
 if opt.max_pack_size:
-    hashsplit.max_pack_size = parse_num(opt.max_pack_size)
+    git.max_pack_size = parse_num(opt.max_pack_size)
 if opt.max_pack_objects:
-    hashsplit.max_pack_objects = parse_num(opt.max_pack_objects)
+    git.max_pack_objects = parse_num(opt.max_pack_objects)
 if opt.fanout:
     hashsplit.fanout = parse_num(opt.fanout)
 if opt.blobs:
@@ -128,14 +128,16 @@ def read_ids():
     files = extra and (open(fn) for fn in extra) or [sys.stdin]
 
 if pack_writer and opt.blobs:
-    shalist = hashsplit.split_to_blobs(pack_writer, files,
+    shalist = hashsplit.split_to_blobs(pack_writer.new_blob, files,
                                        keep_boundaries=opt.keep_boundaries,
                                        progress=prog)
     for (sha, size, level) in shalist:
         print sha.encode('hex')
         reprogress()
 elif pack_writer:  # tree or commit or name
-    shalist = hashsplit.split_to_shalist(pack_writer, files,
+    shalist = hashsplit.split_to_shalist(pack_writer.new_blob,
+                                         pack_writer.new_tree,
+                                         files,
                                          keep_boundaries=opt.keep_boundaries,
                                          progress=prog)
     tree = pack_writer.new_tree(shalist)
diff --git a/lib/bup/client.py b/lib/bup/client.py
index f94106723..556390556 100644
--- a/lib/bup/client.py
+++ b/lib/bup/client.py
@@ -177,7 +177,6 @@ def sync_indexes(self):
             self.sync_index(idx)
         git.auto_midx(self.cachedir)
 
-
     def sync_index(self, name):
         #debug1('requesting %r\n' % name)
         self.check_busy()
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 6db392a22..fd364b542 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -6,6 +6,8 @@
 from bup.helpers import *
 from bup import _helpers, path, midx, bloom
 
+max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
+max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
 SEEK_END=2  # os.SEEK_END is not defined in python 2.4
 
 verbose = 0
@@ -509,6 +511,8 @@ def _write(self, sha, type, content):
         if not sha:
             sha = calc_hash(type, content)
         size, crc = self._raw_write(_encode_packobj(type, content), sha=sha)
+        if self.outbytes >= max_pack_size or self.count >= max_pack_objects:
+            self.breakpoint()
         return sha
 
     def breakpoint(self):
@@ -531,10 +535,10 @@ def exists(self, id, want_source=False):
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
-        self._require_objcache()
         sha = calc_hash(type, content)
         if not self.exists(sha):
             self._write(sha, type, content)
+            self._require_objcache()
             self.objcache.add(sha)
         return sha
 
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 439c63db6..1819294d1 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -6,8 +6,6 @@
 BLOB_READ_SIZE = 1024*1024
 MAX_PER_TREE = 256
 progress_callback = None
-max_pack_size = 1000*1000*1000  # larger packs will slow down pruning
-max_pack_objects = 200*1000  # cache memory usage is about 83 bytes per object
 fanout = 16
 
 # The purpose of this type of buffer is to avoid copying on peek(), get(),
@@ -105,13 +103,11 @@ def hashsplit_iter(files, keep_boundaries, progress):
 
 
 total_split = 0
-def split_to_blobs(w, files, keep_boundaries, progress):
+def split_to_blobs(makeblob, files, keep_boundaries, progress):
     global total_split
     for (blob, level) in hashsplit_iter(files, keep_boundaries, progress):
-        sha = w.new_blob(blob)
+        sha = makeblob(blob)
         total_split += len(blob)
-        if w.outbytes >= max_pack_size or w.count >= max_pack_objects:
-            w.breakpoint()
         if progress_callback:
             progress_callback(len(blob))
         yield (sha, len(blob), level)
@@ -127,7 +123,7 @@ def _make_shalist(l):
     return (shalist, total)
 
 
-def _squish(w, stacks, n):
+def _squish(maketree, stacks, n):
     i = 0
     while i<n or len(stacks[i]) > MAX_PER_TREE:
         while len(stacks) <= i+1:
@@ -136,14 +132,15 @@ def _squish(w, stacks, n):
             stacks[i+1] += stacks[i]
         elif stacks[i]:
             (shalist, size) = _make_shalist(stacks[i])
-            tree = w.new_tree(shalist)
+            tree = maketree(shalist)
             stacks[i+1].append(('40000', tree, size))
         stacks[i] = []
         i += 1
 
 
-def split_to_shalist(w, files, keep_boundaries, progress=None):
-    sl = split_to_blobs(w, files, keep_boundaries, progress)
+def split_to_shalist(makeblob, maketree, files,
+                     keep_boundaries, progress=None):
+    sl = split_to_blobs(makeblob, files, keep_boundaries, progress)
     assert(fanout != 0)
     if not fanout:
         shal = []
@@ -155,21 +152,22 @@ def split_to_shalist(w, files, keep_boundaries, progress=None):
         for (sha,size,level) in sl:
             stacks[0].append(('100644', sha, size))
             if level:
-                _squish(w, stacks, level)
+                _squish(maketree, stacks, level)
         #log('stacks: %r\n' % [len(i) for i in stacks])
-        _squish(w, stacks, len(stacks)-1)
+        _squish(maketree, stacks, len(stacks)-1)
         #log('stacks: %r\n' % [len(i) for i in stacks])
         return _make_shalist(stacks[-1])[0]
 
 
-def split_to_blob_or_tree(w, files, keep_boundaries):
-    shalist = list(split_to_shalist(w, files, keep_boundaries))
+def split_to_blob_or_tree(makeblob, maketree, files, keep_boundaries):
+    shalist = list(split_to_shalist(makeblob, maketree,
+                                    files, keep_boundaries))
     if len(shalist) == 1:
         return (shalist[0][0], shalist[0][2])
     elif len(shalist) == 0:
-        return ('100644', w.new_blob(''))
+        return ('100644', makeblob(''))
     else:
-        return ('40000', w.new_tree(shalist))
+        return ('40000', maketree(shalist))
 
 
 def open_noatime(name):

From 252c21db981dfdc23bae4c515929c3a4f814ddea Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 17:57:48 -0800
Subject: [PATCH 08/12] git.py: rename treeparse to tree_decode() and add
 tree_encode().

tree_encode() gets most of its functionality from PackWriter.new_tree(),
which is now just a one liner that calls tree_encode().  We will soon want
to be able to calculate tree hashes without actually writing a tree to a
packfile, so let's split out that functionality.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/git.py | 86 +++++++++++++++++++++++++++-----------------------
 lib/bup/vfs.py |  4 +--
 2 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/lib/bup/git.py b/lib/bup/git.py
index fd364b542..d94de38a3 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -119,6 +119,49 @@ def demangle_name(name):
         return (name, BUP_NORMAL)
 
 
+def calc_hash(type, content):
+    """Calculate some content's hash in the Git fashion."""
+    header = '%s %d\0' % (type, len(content))
+    sum = Sha1(header)
+    sum.update(content)
+    return sum.digest()
+
+
+def _shalist_sort_key(ent):
+    (mode, name, id) = ent
+    if stat.S_ISDIR(int(mode, 8)):
+        return name + '/'
+    else:
+        return name
+
+
+def tree_encode(shalist):
+    """Generate a git tree object from (mode,name,hash) tuples."""
+    shalist = sorted(shalist, key = _shalist_sort_key)
+    l = []
+    for (mode,name,bin) in shalist:
+        assert(mode)
+        assert(mode != '0')
+        assert(mode[0] != '0')
+        assert(name)
+        assert(len(bin) == 20)
+        l.append('%s %s\0%s' % (mode,name,bin))
+    return ''.join(l)
+
+
+def tree_decode(buf):
+    """Generate a list of (mode,name,hash) from the git tree object in buf."""
+    ofs = 0
+    while ofs < len(buf):
+        z = buf[ofs:].find('\0')
+        assert(z > 0)
+        spl = buf[ofs:ofs+z].split(' ', 1)
+        assert(len(spl) == 2)
+        sha = buf[ofs+z+1:ofs+z+1+20]
+        ofs += z+1+20
+        yield (spl[0], spl[1], sha)
+
+
 def _encode_packobj(type, content):
     szout = ''
     sz = len(content)
@@ -405,22 +448,6 @@ def add(self, hash):
         self.also.add(hash)
 
 
-def calc_hash(type, content):
-    """Calculate some content's hash in the Git fashion."""
-    header = '%s %d\0' % (type, len(content))
-    sum = Sha1(header)
-    sum.update(content)
-    return sum.digest()
-
-
-def _shalist_sort_key(ent):
-    (mode, name, id) = ent
-    if stat.S_ISDIR(int(mode, 8)):
-        return name + '/'
-    else:
-        return name
-
-
 def open_idx(filename):
     if filename.endswith('.idx'):
         f = open(filename, 'rb')
@@ -548,16 +575,8 @@ def new_blob(self, blob):
 
     def new_tree(self, shalist):
         """Create a tree object in the pack."""
-        shalist = sorted(shalist, key = _shalist_sort_key)
-        l = []
-        for (mode,name,bin) in shalist:
-            assert(mode)
-            assert(mode != '0')
-            assert(mode[0] != '0')
-            assert(name)
-            assert(len(bin) == 20)
-            l.append('%s %s\0%s' % (mode,name,bin))
-        return self.maybe_write('tree', ''.join(l))
+        content = tree_encode(shalist)
+        return self.maybe_write('tree', content)
 
     def _new_commit(self, tree, parent, author, adate, committer, cdate, msg):
         l = []
@@ -817,19 +836,6 @@ def check_repo_or_die(path=None):
             sys.exit(15)
 
 
-def treeparse(buf):
-    """Generate a list of (mode, name, hash) tuples of objects from 'buf'."""
-    ofs = 0
-    while ofs < len(buf):
-        z = buf[ofs:].find('\0')
-        assert(z > 0)
-        spl = buf[ofs:ofs+z].split(' ', 1)
-        assert(len(spl) == 2)
-        sha = buf[ofs+z+1:ofs+z+1+20]
-        ofs += z+1+20
-        yield (spl[0], spl[1], sha)
-
-
 _ver = None
 def ver():
     """Get Git's version and ensure a usable version is installed.
@@ -989,7 +995,7 @@ def _join(self, it):
                 yield blob
         elif type == 'tree':
             treefile = ''.join(it)
-            for (mode, name, sha) in treeparse(treefile):
+            for (mode, name, sha) in tree_decode(treefile):
                 for blob in self.join(sha.encode('hex')):
                     yield blob
         elif type == 'commit':
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 16a8d33b8..0a17918e8 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -42,7 +42,7 @@ def _treeget(hash):
     it = cp().get(hash.encode('hex'))
     type = it.next()
     assert(type == 'tree')
-    return git.treeparse(''.join(it))
+    return git.tree_decode(''.join(it))
 
 
 def _tree_decode(hash):
@@ -383,7 +383,7 @@ def _mksubs(self):
             it = cp().get(self.hash.encode('hex') + ':')
             type = it.next()
         assert(type == 'tree')
-        for (mode,mangled_name,sha) in git.treeparse(''.join(it)):
+        for (mode,mangled_name,sha) in git.tree_decode(''.join(it)):
             mode = int(mode, 8)
             name = mangled_name
             (name,bupmode) = git.demangle_name(mangled_name)

From 48c6f484d8fdaf2dedcdb645e10c7fca2c164400 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 18:02:12 -0800
Subject: [PATCH 09/12] Replace 040000 and 0100644 constants with
 GIT_MODE_{TREE,FILE}

Those constants were scattered in *way* too many places.  While we're there,
fix the inconsistent usage of strings vs. ints when specifying the file
mode; there's no good reason to be passing strings around (except that I
foolishly did that in the original code in version 0.01).

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 cmd/index-cmd.py     |  4 ++--
 cmd/save-cmd.py      | 13 +++++++------
 lib/bup/git.py       | 13 ++++++++-----
 lib/bup/hashsplit.py | 14 +++++++++-----
 lib/bup/index.py     |  1 +
 lib/bup/vfs.py       | 18 +++++++++---------
 t/test.sh            |  4 ++++
 7 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/cmd/index-cmd.py b/cmd/index-cmd.py
index a04355096..89854113a 100755
--- a/cmd/index-cmd.py
+++ b/cmd/index-cmd.py
@@ -2,7 +2,7 @@
 import sys, stat, time, os
 from bup import options, git, index, drecurse
 from bup.helpers import *
-
+from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE
 
 class IterHelper:
     def __init__(self, l):
@@ -57,7 +57,7 @@ def update_index(top, excluded_paths):
     hashgen = None
     if opt.fake_valid:
         def hashgen(name):
-            return (0100644, index.FAKE_SHA)
+            return (GIT_MODE_FILE, index.FAKE_SHA)
 
     total = 0
     bup_dir = os.path.abspath(git.repo())
diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py
index 7219ab4b5..4b8ca1216 100755
--- a/cmd/save-cmd.py
+++ b/cmd/save-cmd.py
@@ -2,6 +2,7 @@
 import sys, stat, time, math
 from bup import hashsplit, git, options, index, client
 from bup.helpers import *
+from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE
 
 
 optspec = """
@@ -99,8 +100,9 @@ def _pop(force_tree):
     shalist = shalists.pop()
     tree = force_tree or w.new_tree(shalist)
     if shalists:
-        shalists[-1].append(('40000',
-                             git.mangle_name(part, 040000, 40000),
+        shalists[-1].append((GIT_MODE_TREE,
+                             git.mangle_name(part,
+                                             GIT_MODE_TREE, GIT_MODE_TREE),
                              tree))
     else:  # this was the toplevel, so put it back for sanity
         shalists.append(shalist)
@@ -237,7 +239,7 @@ def wantrecurse_during(ent):
             if lastskip_name and lastskip_name.startswith(ent.name):
                 ent.invalidate()
             else:
-                ent.validate(040000, newtree)
+                ent.validate(GIT_MODE_TREE, newtree)
             ent.repack()
         if exists and wasmissing:
             count += oldsize
@@ -246,9 +248,8 @@ def wantrecurse_during(ent):
     # it's not a directory
     id = None
     if hashvalid:
-        mode = '%o' % ent.gitmode
         id = ent.sha
-        shalists[-1].append((mode, 
+        shalists[-1].append((ent.gitmode, 
                              git.mangle_name(file, ent.mode, ent.gitmode),
                              id))
     else:
@@ -287,7 +288,7 @@ def wantrecurse_during(ent):
                 add_error(Exception('skipping special file "%s"' % ent.name))
                 lastskip_name = ent.name
         if id:
-            ent.validate(int(mode, 8), id)
+            ent.validate(mode, id)
             ent.repack()
             shalists[-1].append((mode,
                                  git.mangle_name(file, ent.mode, ent.gitmode),
diff --git a/lib/bup/git.py b/lib/bup/git.py
index d94de38a3..3d8b9dab5 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -129,7 +129,8 @@ def calc_hash(type, content):
 
 def _shalist_sort_key(ent):
     (mode, name, id) = ent
-    if stat.S_ISDIR(int(mode, 8)):
+    assert(mode+0 == mode)
+    if stat.S_ISDIR(mode):
         return name + '/'
     else:
         return name
@@ -141,11 +142,12 @@ def tree_encode(shalist):
     l = []
     for (mode,name,bin) in shalist:
         assert(mode)
-        assert(mode != '0')
-        assert(mode[0] != '0')
+        assert(mode+0 == mode)
         assert(name)
         assert(len(bin) == 20)
-        l.append('%s %s\0%s' % (mode,name,bin))
+        s = '%o %s\0%s' % (mode,name,bin)
+        assert(s[0] != '0')  # 0-padded octal is not acceptable in a git tree
+        l.append(s)
     return ''.join(l)
 
 
@@ -157,9 +159,10 @@ def tree_decode(buf):
         assert(z > 0)
         spl = buf[ofs:ofs+z].split(' ', 1)
         assert(len(spl) == 2)
+        mode,name = spl
         sha = buf[ofs+z+1:ofs+z+1+20]
         ofs += z+1+20
-        yield (spl[0], spl[1], sha)
+        yield (int(mode, 8), name, sha)
 
 
 def _encode_packobj(type, content):
diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 1819294d1..2b2163b99 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -8,6 +8,10 @@
 progress_callback = None
 fanout = 16
 
+GIT_MODE_FILE = 0100644
+GIT_MODE_TREE = 040000
+assert(GIT_MODE_TREE != 40000)  # 0xxx should be treated as octal
+
 # The purpose of this type of buffer is to avoid copying on peek(), get(),
 # and eat().  We do copy the buffer contents on put(), but that should
 # be ok if we always only put() large amounts of data at a time.
@@ -133,7 +137,7 @@ def _squish(maketree, stacks, n):
         elif stacks[i]:
             (shalist, size) = _make_shalist(stacks[i])
             tree = maketree(shalist)
-            stacks[i+1].append(('40000', tree, size))
+            stacks[i+1].append((GIT_MODE_TREE, tree, size))
         stacks[i] = []
         i += 1
 
@@ -145,12 +149,12 @@ def split_to_shalist(makeblob, maketree, files,
     if not fanout:
         shal = []
         for (sha,size,level) in sl:
-            shal.append(('100644', sha, size))
+            shal.append((GIT_MODE_FILE, sha, size))
         return _make_shalist(shal)[0]
     else:
         stacks = [[]]
         for (sha,size,level) in sl:
-            stacks[0].append(('100644', sha, size))
+            stacks[0].append((GIT_MODE_FILE, sha, size))
             if level:
                 _squish(maketree, stacks, level)
         #log('stacks: %r\n' % [len(i) for i in stacks])
@@ -165,9 +169,9 @@ def split_to_blob_or_tree(makeblob, maketree, files, keep_boundaries):
     if len(shalist) == 1:
         return (shalist[0][0], shalist[0][2])
     elif len(shalist) == 0:
-        return ('100644', makeblob(''))
+        return (GIT_MODE_FILE, makeblob(''))
     else:
-        return ('40000', maketree(shalist))
+        return (GIT_MODE_TREE, maketree(shalist))
 
 
 def open_noatime(name):
diff --git a/lib/bup/index.py b/lib/bup/index.py
index f35fdc87d..7483407fa 100644
--- a/lib/bup/index.py
+++ b/lib/bup/index.py
@@ -136,6 +136,7 @@ def invalidate(self):
     def validate(self, gitmode, sha):
         assert(sha)
         assert(gitmode)
+        assert(gitmode+0 == gitmode)
         self.gitmode = gitmode
         self.sha = sha
         self.flags |= IX_HASHVALID|IX_EXISTS
diff --git a/lib/bup/vfs.py b/lib/bup/vfs.py
index 0a17918e8..454355337 100644
--- a/lib/bup/vfs.py
+++ b/lib/bup/vfs.py
@@ -6,6 +6,7 @@
 import os, re, stat, time
 from bup import git
 from helpers import *
+from bup.hashsplit import GIT_MODE_TREE, GIT_MODE_FILE
 
 EMPTY_SHA='\0'*20
 
@@ -46,7 +47,7 @@ def _treeget(hash):
 
 
 def _tree_decode(hash):
-    tree = [(int(name,16),stat.S_ISDIR(int(mode,8)),sha)
+    tree = [(int(name,16),stat.S_ISDIR(mode),sha)
             for (mode,name,sha)
             in _treeget(hash)]
     assert(tree == list(sorted(tree)))
@@ -384,11 +385,10 @@ def _mksubs(self):
             type = it.next()
         assert(type == 'tree')
         for (mode,mangled_name,sha) in git.tree_decode(''.join(it)):
-            mode = int(mode, 8)
             name = mangled_name
             (name,bupmode) = git.demangle_name(mangled_name)
             if bupmode == git.BUP_CHUNKED:
-                mode = 0100644
+                mode = GIT_MODE_FILE
             if stat.S_ISDIR(mode):
                 self._subs[name] = Dir(self, name, mode, sha)
             elif stat.S_ISLNK(mode):
@@ -408,7 +408,7 @@ class CommitDir(Node):
     the number of commits grows big.
     """
     def __init__(self, parent, name):
-        Node.__init__(self, parent, name, 040000, EMPTY_SHA)
+        Node.__init__(self, parent, name, GIT_MODE_TREE, EMPTY_SHA)
 
     def _mksubs(self):
         self._subs = {}
@@ -436,13 +436,13 @@ def _mksubs(self):
 class CommitList(Node):
     """A list of commits with hashes that start with the current node's name."""
     def __init__(self, parent, name):
-        Node.__init__(self, parent, name, 040000, EMPTY_SHA)
+        Node.__init__(self, parent, name, GIT_MODE_TREE, EMPTY_SHA)
         self.commits = {}
 
     def _mksubs(self):
         self._subs = {}
         for (name, (hash, date)) in self.commits.items():
-            n1 = Dir(self, name, 040000, hash)
+            n1 = Dir(self, name, GIT_MODE_TREE, hash)
             n1.ctime = n1.mtime = date
             self._subs[name] = n1
 
@@ -450,7 +450,7 @@ def _mksubs(self):
 class TagDir(Node):
     """A directory that contains all tags in the repository."""
     def __init__(self, parent, name):
-        Node.__init__(self, parent, name, 040000, EMPTY_SHA)
+        Node.__init__(self, parent, name, GIT_MODE_TREE, EMPTY_SHA)
 
     def _mksubs(self):
         self._subs = {}
@@ -472,7 +472,7 @@ class BranchList(Node):
     /.commit/??/ . The symlink is named after the commit date.
     """
     def __init__(self, parent, name, hash):
-        Node.__init__(self, parent, name, 040000, hash)
+        Node.__init__(self, parent, name, GIT_MODE_TREE, hash)
 
     def _mksubs(self):
         self._subs = {}
@@ -514,7 +514,7 @@ class RefList(Node):
     that are reachable via a ref (e.g. a branch).  See CommitDir for details.
     """
     def __init__(self, parent):
-        Node.__init__(self, parent, '/', 040000, EMPTY_SHA)
+        Node.__init__(self, parent, '/', GIT_MODE_TREE, EMPTY_SHA)
 
     def _mksubs(self):
         self._subs = {}
diff --git a/t/test.sh b/t/test.sh
index 9b5e55aad..609714302 100755
--- a/t/test.sh
+++ b/t/test.sh
@@ -105,6 +105,10 @@ d/
 a
 ./"
 WVPASS bup save -t $D/d
+WVPASS bup index --fake-invalid $D/d/z
+WVPASS bup save -t $D/d/z
+WVPASS bup save -t $D/d/z  # test regenerating trees when no files are changed
+WVPASS bup save -t $D/d
 WVPASSEQ "$(cd $D && bup index -m)" \
 "f
 a

From d130210ac92511773b54219aaed4253ea63ba3ac Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 18:48:06 -0800
Subject: [PATCH 10/12] hashsplit: use shorter offset-filenames inside trees.

We previously zero-padded all the filenames (which are hexified versions of
the file offsets) to 16 characters, which corresponds to a maximum file size
that fits into a 64-bit integer.  I realized that there's no reason to
use a fixed padding length; just pad all the entries in a particular tree to
the length of the longest entry (to ensure that sorting
alphabetically is still equivalent to sorting numerically).

This saves a small amount of space in each tree, which is probably
irrelevant given that gzip compression can quite easily compress extra
zeroes.  But it also makes browsing the tree in git look a little prettier.

This is backwards compatible with old versions of vfs.py, since vfs.py has
always just treated the numbers as an ordered set of numbers, and doesn't
care how much zero padding they have.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 lib/bup/hashsplit.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/bup/hashsplit.py b/lib/bup/hashsplit.py
index 2b2163b99..914c2bb5a 100644
--- a/lib/bup/hashsplit.py
+++ b/lib/bup/hashsplit.py
@@ -119,11 +119,14 @@ def split_to_blobs(makeblob, files, keep_boundaries, progress):
 
 def _make_shalist(l):
     ofs = 0
+    l = list(l)
+    total = sum(size for mode,sha,size, in l)
+    vlen = len('%x' % total)
     shalist = []
     for (mode, sha, size) in l:
-        shalist.append((mode, '%016x' % ofs, sha))
+        shalist.append((mode, '%0*x' % (vlen,ofs), sha))
         ofs += size
-    total = ofs
+    assert(ofs == total)
     return (shalist, total)
 
 

From 35d4a6a5272c995545df1d6210129d7389505ba0 Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 20:48:15 -0800
Subject: [PATCH 11/12] cmd/newliner: restrict progress lines to the screen
 width.

Otherwise \r won't work as expected.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 cmd/newliner-cmd.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmd/newliner-cmd.py b/cmd/newliner-cmd.py
index 966725e8f..68d327b7e 100755
--- a/cmd/newliner-cmd.py
+++ b/cmd/newliner-cmd.py
@@ -15,6 +15,7 @@
 r = re.compile(r'([\r\n])')
 lastlen = 0
 all = ''
+width = options._tty_width() or 78
 while 1:
     l = r.split(all, 1)
     if len(l) <= 1:
@@ -32,6 +33,8 @@
     else:
         assert(len(l) == 3)
         (line, splitchar, all) = l
+        if splitchar == '\r':
+            line = line[:width]
         sys.stdout.write('%-*s%s' % (lastlen, line, splitchar))
         if splitchar == '\r':
             lastlen = len(line)

From 6f02181e8d333b1e7e81ebb56d23923cf7e4227c Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Sat, 19 Feb 2011 21:21:45 -0800
Subject: [PATCH 12/12] helpers: separately determine if stdout and stderr are
 ttys.

Previously we only cared if stderr was a tty (since we use that to determine
if we should print progress() or not).  But we might want to check stdout as
well, for the same reason that gzip does: we should be refusing to write
binary data to a terminal.

Signed-off-by: Avery Pennarun <apenwarr@gmail.com>
---
 cmd/fsck-cmd.py    | 4 ++--
 cmd/save-cmd.py    | 2 +-
 lib/bup/_helpers.c | 7 ++++---
 lib/bup/helpers.py | 5 +++--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/cmd/fsck-cmd.py b/cmd/fsck-cmd.py
index 44decd894..8ba2771d8 100755
--- a/cmd/fsck-cmd.py
+++ b/cmd/fsck-cmd.py
@@ -36,7 +36,7 @@ def par2_setup():
 
 def parv(lvl):
     if opt.verbose >= lvl:
-        if istty:
+        if istty2:
             return []
         else:
             return ['-q']
@@ -203,6 +203,6 @@ def do_pack(base, last):
     if not opt.verbose:
         progress('fsck (%d/%d)\r' % (count, len(extra)))
 
-if not opt.verbose and istty:
+if not opt.verbose and istty2:
     log('fsck done.           \n')
 sys.exit(code)
diff --git a/cmd/save-cmd.py b/cmd/save-cmd.py
index 4b8ca1216..24376af02 100755
--- a/cmd/save-cmd.py
+++ b/cmd/save-cmd.py
@@ -31,7 +31,7 @@
 if not extra:
     o.fatal("no filenames given")
 
-opt.progress = (istty and not opt.quiet)
+opt.progress = (istty2 and not opt.quiet)
 opt.smaller = parse_num(opt.smaller or 0)
 if opt.bwlimit:
     client.bwlimit = parse_num(opt.bwlimit)
diff --git a/lib/bup/_helpers.c b/lib/bup/_helpers.c
index d077cd971..af9d06f35 100644
--- a/lib/bup/_helpers.c
+++ b/lib/bup/_helpers.c
@@ -9,7 +9,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
-static int istty = 0;
+static int istty2 = 0;
 
 // Probably we should use autoconf or something and set HAVE_PY_GETARGCARGV...
 #if __WIN32__ || __CYGWIN__
@@ -400,7 +400,7 @@ static PyObject *merge_into(PyObject *self, PyObject *args)
     {
 	struct idx *idx;
 	uint32_t new_prefix;
-	if (count % 102424 == 0 && istty)
+	if (count % 102424 == 0 && istty2)
 	    fprintf(stderr, "midx: writing %.2f%% (%d/%d)\r",
 		    count*100.0/total, count, total);
 	idx = idxs[last_i];
@@ -655,7 +655,8 @@ static PyMethodDef faster_methods[] = {
 
 PyMODINIT_FUNC init_helpers(void)
 {
+    char *e = getenv("BUP_FORCE_TTY");
     Py_InitModule("_helpers", faster_methods);
-    istty = isatty(2) || getenv("BUP_FORCE_TTY");
+    istty2 = isatty(2) || (atoi(e ? e : "0") & 2);
     unpythonize_argv();
 }
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index ed976bfe3..1432b8b19 100644
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -66,12 +66,13 @@ def debug2(s):
         log(s)
 
 
-istty = os.isatty(2) or atoi(os.environ.get('BUP_FORCE_TTY'))
+istty1 = os.isatty(1) or (atoi(os.environ.get('BUP_FORCE_TTY')) & 1)
+istty2 = os.isatty(2) or (atoi(os.environ.get('BUP_FORCE_TTY')) & 2)
 _last_progress = ''
 def progress(s):
     """Calls log() if stderr is a TTY.  Does nothing otherwise."""
     global _last_progress
-    if istty:
+    if istty2:
         log(s)
         _last_progress = s