diff --git a/cmd/midx-cmd.py b/cmd/midx-cmd.py
index 8b2ea2b3d..4aa200995 100755
--- a/cmd/midx-cmd.py
+++ b/cmd/midx-cmd.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import sys, math, struct, glob, resource
+import tempfile, shutil
 from bup import options, git
 from bup.helpers import *
 
@@ -31,13 +32,20 @@ def max_files():
     return mf
 
 
-def merge(idxlist, bits, table):
-    count = 0
-    for e in git.idxmerge(idxlist, final_progress=False):
-        count += 1
-        prefix = git.extract_bits(e, bits)
-        table[prefix] = count
-        yield e
+def merge_into(tf_sha, tf_nmap, idxlist, bits, entries, total):
+    prefix = 0
+    it = git.idxmerge(idxlist, final_progress=False, total=total)
+    for i, (e, idx) in enumerate(it):
+        new_prefix = git.extract_bits(e, bits)
+        if new_prefix != prefix:
+            for p in xrange(prefix, new_prefix):
+                yield i
+            prefix = new_prefix
+        tf_sha.write(e)
+        tf_nmap.write(struct.pack('!I', idx))
+    i += 1
+    for p in xrange(prefix, entries):
+        yield i
 
 
 def _do_midx(outdir, outfilename, infilenames, prefixstr):
@@ -48,12 +56,12 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
 
     inp = []
     total = 0
-    allfilenames = {}
+    allfilenames = []
     for name in infilenames:
         ix = git.open_idx(name)
+        inp.append(ix.iter_with_idx_i(len(allfilenames)))
         for n in ix.idxnames:
-            allfilenames[n] = 1
-        inp.append(ix)
+            allfilenames.append(os.path.basename(n))
         total += len(ix)
 
     log('midx: %screating from %d files (%d objects).\n'
@@ -69,25 +77,32 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
     entries = 2**bits
     debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits))
-    table = [0]*entries
-
     try:
         os.unlink(outfilename)
     except OSError:
         pass
     f = open(outfilename + '.tmp', 'w+')
-    f.write('MIDX\0\0\0\2')
-    f.write(struct.pack('!I', bits))
+    f.write('MIDX')
+    f.write(struct.pack('!II', git.MIDX_VERSION, bits))
     assert(f.tell() == 12)
-    f.write('\0'*4*entries)
-
-    for e in merge(inp, bits, table):
-        f.write(e)
-
-    f.write('\0'.join(os.path.basename(p) for p in allfilenames.keys()))
-    f.seek(12)
-    f.write(struct.pack('!%dI' % entries, *table))
+    tf_sha = tempfile.TemporaryFile(dir=outdir)
+    tf_nmap = tempfile.TemporaryFile(dir=outdir)
+    for t in merge_into(tf_sha, tf_nmap, inp, bits, entries, total):
+        f.write(struct.pack('!I', t))
+    assert(f.tell() == 12 + 4*entries)
+
+    tf_sha.seek(0)
+    shutil.copyfileobj(tf_sha, f)
+    tf_sha.close()
+    assert(f.tell() == 12 + 4*entries + 20*t) # t may be < total due to dupes
+
+    tf_nmap.seek(0)
+    shutil.copyfileobj(tf_nmap, f)
+    tf_nmap.close()
+    assert(f.tell() == 12 + 4*entries + 24*t) # t may be < total due to dupes
+
+    f.write('\0'.join(allfilenames))
     f.close()
     os.rename(outfilename + '.tmp', outfilename)
 
@@ -97,12 +112,11 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
     assert(len(p.idxnames) == len(infilenames))
     print p.idxnames
     assert(len(p) == total)
-    pi = iter(p)
-    for i in merge(inp, total, bits, table):
+    for pe, e in p, git.idxmerge(inp, final_progress=False):
         assert(i == pi.next())
         assert(p.exists(i))
 
-    return total,outfilename
+    return total, outfilename
 
 
 def do_midx(outdir, outfilename, infilenames, prefixstr):
diff --git a/cmd/server-cmd.py b/cmd/server-cmd.py
index 3bc998bcf..a5e9abde1 100755
--- a/cmd/server-cmd.py
+++ b/cmd/server-cmd.py
@@ -89,18 +89,9 @@ def receive_objects_v2(conn, junk):
         #debug2('read %d bytes\n' % n)
         _check(w, n, len(buf), 'object read: expected %d bytes, got %d\n')
         if not dumb_server_mode:
-            oldpack = w.exists(shar)
+            oldpack = w.exists(shar, want_source=True)
             if oldpack:
-                if oldpack == True or oldpack.endswith('.midx'):
-                    # FIXME: we shouldn't really have to know about midx files
-                    # at this layer. But exists() on a midx doesn't return the
-                    # packname (since it doesn't know)... probably we should
-                    # just fix that deficiency of midx files eventually,
-                    # although it'll make the files bigger. This method is
-                    # certainly not very efficient.
-                    oldpack = w.objcache.packname_containing(shar)
-                    debug2('new suggestion: %r\n' % oldpack)
-                    w.objcache.refresh()
+                assert(not oldpack == True)
                 assert(oldpack.endswith('.idx'))
                 (dir,name) = os.path.split(oldpack)
                 if not (name in suggested):
diff --git a/lib/bup/git.py b/lib/bup/git.py
index 19fe9bc6b..a1a75625c 100644
--- a/lib/bup/git.py
+++ b/lib/bup/git.py
@@ -6,7 +6,7 @@
 from bup.helpers import *
 from bup import _helpers, path
 
-MIDX_VERSION = 2
+MIDX_VERSION = 4
 
 """Discussion of bloom constants for bup:
 
@@ -247,9 +247,11 @@ def find_offset(self, hash):
             return self._ofs_from_idx(idx)
         return None
 
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
         """Return nonempty if the object exists in this index."""
-        return hash and (self._idx_from_hash(hash) != None) and True or None
+        if hash and (self._idx_from_hash(hash) != None):
+            return want_source and self.name or True
+        return None
 
     def __len__(self):
         return int(self.fanout[255])
@@ -275,6 +277,10 @@ def _idx_from_hash(self, hash):
                 return mid
         return None
 
+    def iter_with_idx_i(self, idx_i):
+        for e in self:
+            yield e, idx_i
+
 
 class PackIdxV1(PackIdx):
     """Object representation of a Git pack index (version 1) file."""
@@ -475,9 +481,10 @@ def __init__(self, filename):
         self.entries = 2**self.bits
         self.fanout = buffer(self.map, 12, self.entries*4)
         shaofs = 12 + self.entries*4
-        nsha = self._fanget(self.entries-1)
+        self.nsha = nsha = self._fanget(self.entries-1)
         self.shatable = buffer(self.map, shaofs, nsha*20)
-        self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
+        self.whichlist = buffer(self.map, shaofs + nsha*20, nsha*4)
+        self.idxnames = str(self.map[shaofs + 24*nsha:]).split('\0')
 
     def _init_failed(self):
         self.bits = 0
@@ -494,7 +501,13 @@ def _fanget(self, i):
     def _get(self, i):
         return str(self.shatable[i*20:(i+1)*20])
 
-    def exists(self, hash):
+    def _get_idx_i(self, i):
+        return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]
+
+    def _get_idxname(self, i):
+        return self.idxnames[self._get_idx_i(i)]
+
+    def exists(self, hash, want_source=False):
         """Return nonempty if the object exists in the index files."""
         global _total_searches, _total_steps
         _total_searches += 1
@@ -525,9 +538,13 @@ def exists(self, hash):
                 end = mid
                 endv = _helpers.firstword(v)
             else: # got it!
-                return True
+                return want_source and self._get_idxname(mid) or True
         return None
 
+    def iter_with_idx_i(self, ofs):
+        for i in xrange(self._fanget(self.entries-1)):
+            yield buffer(self.shatable, i*20, 20), ofs+self._get_idx_i(i)
+
     def __iter__(self):
         for i in xrange(self._fanget(self.entries-1)):
             yield buffer(self.shatable, i*20, 20)
@@ -560,7 +577,7 @@ def __iter__(self):
     def __len__(self):
         return sum(len(pack) for pack in self.packs)
 
-    def exists(self, hash):
+    def exists(self, hash, want_source=False):
         """Return nonempty if the object exists in the index files."""
         global _total_searches
         _total_searches += 1
@@ -575,10 +592,11 @@ def exists(self, hash):
         for i in xrange(len(self.packs)):
             p = self.packs[i]
             _total_searches -= 1  # will be incremented by sub-pack
-            if p.exists(hash):
+            ix = p.exists(hash, want_source=want_source)
+            if ix:
                 # reorder so most recently used packs are searched first
                 self.packs = [p] + self.packs[:i] + self.packs[i+1:]
-                return p.name
+                return ix
         self.do_bloom = True
         return None
 
@@ -658,21 +676,6 @@ def refresh(self, skip_midx = False):
         debug1('PackIdxList: using %d index%s.\n'
               % (len(self.packs), len(self.packs)!=1 and 'es' or ''))
 
-    def packname_containing(self, hash):
-        # figure out which pack contains a given hash.
-        # FIXME: if the midx file format would just *store* this information,
-        # we could calculate it a lot more efficiently. But it's not needed
-        # often, so let's do it like this.
-        for f in glob.glob(os.path.join(self.dir,'*.idx')):
-            full = os.path.join(self.dir, f)
-            try:
-                ix = open_idx(full)
-            except GitError, e:
-                add_error(e)
-                continue
-            if ix.exists(hash):
-                return full
-
     def add(self, hash):
         """Insert an additional object in the list."""
         self.also.add(hash)
@@ -715,7 +718,7 @@ def open_idx(filename):
         raise GitError('idx filenames must end with .idx or .midx')
 
 
-def idxmerge(idxlist, final_progress=True):
+def idxmerge(idxlist, final_progress=True, total=None):
     """Generate a list of all the objects reachable in a PackIdxList."""
     def pfunc(count, total):
         progress('Reading indexes: %.2f%% (%d/%d)\r'
@@ -723,7 +726,7 @@ def pfunc(count, total):
     def pfinal(count, total):
         if final_progress:
             log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
-    return merge_iter(idxlist, 10024, pfunc, pfinal)
+    return merge_iter(idxlist, 10024, pfunc, pfinal, total=total)
 
 
 def _make_objcache():
@@ -800,10 +803,10 @@ def _require_objcache(self):
             raise GitError(
                 "PackWriter not opened or can't check exists w/o objcache")
 
-    def exists(self, id):
+    def exists(self, id, want_source=False):
         """Return non-empty if an object is found in the object cache."""
         self._require_objcache()
-        return self.objcache.exists(id)
+        return self.objcache.exists(id, want_source=want_source)
 
     def maybe_write(self, type, content):
         """Write an object to the pack file if not present and return its id."""
diff --git a/lib/bup/helpers.py b/lib/bup/helpers.py
index 7b5eeadaf..fdba7dd23 100644
--- a/lib/bup/helpers.py
+++ b/lib/bup/helpers.py
@@ -88,13 +88,13 @@ def next(it):
         return None
 
 
-def merge_iter(iters, pfreq, pfunc, pfinal, key=None):
+def merge_iter(iters, pfreq, pfunc, pfinal, key=None, total=None):
     if key:
         samekey = lambda e, pe: getattr(e, key) == getattr(pe, key, None)
     else:
         samekey = operator.eq
     count = 0
-    total = sum(len(it) for it in iters)
+    total = total or sum(len(it) for it in iters)
     iters = (iter(it) for it in iters)
     heap = ((next(it),it) for it in iters)
     heap = [(e,it) for e,it in heap if e]
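
Note (editor's sketch, not part of the patch): the v4 .midx layout written by _do_midx() above is a 12-byte header ('MIDX', version, bits), a fanout table of 2**bits 4-byte entries, nsha 20-byte SHA-1s, then the new nsha 4-byte "which idx" back-references, and finally the '\0'-joined .idx basenames. The read_midx4() below is a hypothetical helper for illustration only; it is not bup API, and bup's own PackMidx mmaps the file rather than reading it whole:

    import struct

    def read_midx4(path):
        # Parse the v4 .midx layout implied by PackMidx.__init__ above.
        data = open(path, 'rb').read()
        assert data[:4] == 'MIDX'
        version, bits = struct.unpack('!II', data[4:12])
        assert version == 4
        entries = 2 ** bits
        fanout = struct.unpack('!%dI' % entries, data[12:12 + 4*entries])
        nsha = fanout[-1]                  # total number of stored SHA-1s
        sha_ofs = 12 + 4*entries
        which_ofs = sha_ofs + 20*nsha      # v4 addition: one 4-byte index per SHA
        name_ofs = which_ofs + 4*nsha
        shas = [data[sha_ofs + 20*i : sha_ofs + 20*(i+1)] for i in xrange(nsha)]
        which = struct.unpack('!%dI' % nsha, data[which_ofs:name_ofs])
        idxnames = data[name_ofs:].split('\0')
        return shas, which, idxnames

Each entry of which is an index into idxnames; that back-reference is what lets exists(..., want_source=True) name the originating .idx without the packname_containing() scan removed above.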
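
A second sketch, on how the fanout table produced by merge_into() is consumed: the generator yields the running object count once for each prefix bucket it crosses, so slot p ends up holding the number of stored SHA-1s whose top `bits` bits are <= p, which gives the search bounds directly. This mirrors what PackMidx.exists() does with extract_bits() and _fanget(); lookup_range() here is hypothetical:

    import struct

    def lookup_range(fanout, bits, sha):
        # Candidates for this 20-byte sha live in shatable[start:end]
        # (20 bytes per entry), because fanout is cumulative by prefix.
        prefix = struct.unpack('!I', sha[:4])[0] >> (32 - bits)
        start = prefix and fanout[prefix - 1] or 0
        end = fanout[prefix]
        return start, end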