Skip to content

Commit

Permalink
midx4: midx2 with idx backreferences
Browse files Browse the repository at this point in the history
Like midx3, this adds a lookup table of 4 bytes per entry to
reference an entry in the idxnames list.  2 bytes should be plenty, but
disk is cheap and the table will only be referenced when bup server gets
an object that's already in the midx.

Signed-off-by: Brandon Low <lostlogic@lostlogicx.com>
  • Loading branch information
Brandon Low authored and apenwarr committed Feb 7, 2011
1 parent dde9f9a commit ccfa3bd
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 67 deletions.
64 changes: 39 additions & 25 deletions cmd/midx-cmd.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
import sys, math, struct, glob, resource
import tempfile, shutil
from bup import options, git
from bup.helpers import *

Expand Down Expand Up @@ -31,13 +32,20 @@ def max_files():
return mf


def merge(idxlist, bits, table):
    """Yield every entry of the merged indexes, filling `table` on the way.

    For each entry produced by git.idxmerge(), table[prefix] is set to the
    number of entries yielded so far (1-based), where prefix is
    git.extract_bits(entry, bits).
    """
    seen = 0
    for entry in git.idxmerge(idxlist, final_progress=False):
        seen = seen + 1
        table[git.extract_bits(entry, bits)] = seen
        yield entry
def merge_into(tf_sha, tf_nmap, idxlist, bits, entries, total):
    """Merge idxlist into two temp files and yield the fanout table values.

    Each merged item is an (entry, idx) pair.  The entry is appended to
    tf_sha and idx is appended to tf_nmap as a big-endian unsigned 32-bit
    integer, so the two files stay in the same order.

    One value is yielded per prefix slot: the number of entries written
    before the first entry whose prefix is larger than that slot.  Assuming
    the merged stream is sorted by prefix, exactly `entries` values are
    yielded in total, and the last one is the number of entries written.

    `total` is passed through to git.idxmerge().
    """
    prefix = 0
    # Explicit counter instead of the enumerate() loop variable: with an
    # empty merge the loop variable would never be bound and the trailing
    # fill below would raise UnboundLocalError.
    count = 0
    it = git.idxmerge(idxlist, final_progress=False, total=total)
    for e, idx in it:
        new_prefix = git.extract_bits(e, bits)
        if new_prefix != prefix:
            # Fill every skipped slot with the count seen so far.
            for p in xrange(prefix, new_prefix):
                yield count
            prefix = new_prefix
        tf_sha.write(e)
        tf_nmap.write(struct.pack('!I', idx))
        count += 1
    # Remaining slots (including the last used prefix) get the final count.
    for p in xrange(prefix, entries):
        yield count


def _do_midx(outdir, outfilename, infilenames, prefixstr):
Expand All @@ -48,12 +56,12 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):

inp = []
total = 0
allfilenames = {}
allfilenames = []
for name in infilenames:
ix = git.open_idx(name)
inp.append(ix.iter_with_idx_i(len(allfilenames)))
for n in ix.idxnames:
allfilenames[n] = 1
inp.append(ix)
allfilenames.append(os.path.basename(n))
total += len(ix)

log('midx: %screating from %d files (%d objects).\n'
Expand All @@ -69,25 +77,32 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
entries = 2**bits
debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits))

table = [0]*entries

try:
os.unlink(outfilename)
except OSError:
pass
f = open(outfilename + '.tmp', 'w+')
f.write('MIDX\0\0\0\2')
f.write(struct.pack('!I', bits))
f.write('MIDX')
f.write(struct.pack('!II', git.MIDX_VERSION, bits))
assert(f.tell() == 12)
f.write('\0'*4*entries)

for e in merge(inp, bits, table):
f.write(e)

f.write('\0'.join(os.path.basename(p) for p in allfilenames.keys()))

f.seek(12)
f.write(struct.pack('!%dI' % entries, *table))
tf_sha = tempfile.TemporaryFile(dir=outdir)
tf_nmap = tempfile.TemporaryFile(dir=outdir)
for t in merge_into(tf_sha, tf_nmap, inp, bits, entries, total):
f.write(struct.pack('!I', t))
assert(f.tell() == 12 + 4*entries)

tf_sha.seek(0)
shutil.copyfileobj(tf_sha, f)
tf_sha.close()
assert(f.tell() == 12 + 4*entries + 20*t) # t may be < total due to dupes

tf_nmap.seek(0)
shutil.copyfileobj(tf_nmap, f)
tf_nmap.close()
assert(f.tell() == 12 + 4*entries + 24*t) # t may be < total due to dupes

f.write('\0'.join(allfilenames))
f.close()
os.rename(outfilename + '.tmp', outfilename)

Expand All @@ -97,12 +112,11 @@ def _do_midx(outdir, outfilename, infilenames, prefixstr):
assert(len(p.idxnames) == len(infilenames))
print p.idxnames
assert(len(p) == total)
pi = iter(p)
for i in merge(inp, total, bits, table):
for pe, e in p, git.idxmerge(inp, final_progress=False):
assert(i == pi.next())
assert(p.exists(i))

return total,outfilename
return total, outfilename


def do_midx(outdir, outfilename, infilenames, prefixstr):
Expand Down
13 changes: 2 additions & 11 deletions cmd/server-cmd.py
Expand Up @@ -89,18 +89,9 @@ def receive_objects_v2(conn, junk):
#debug2('read %d bytes\n' % n)
_check(w, n, len(buf), 'object read: expected %d bytes, got %d\n')
if not dumb_server_mode:
oldpack = w.exists(shar)
oldpack = w.exists(shar, want_source=True)
if oldpack:
if oldpack == True or oldpack.endswith('.midx'):
# FIXME: we shouldn't really have to know about midx files
# at this layer. But exists() on a midx doesn't return the
# packname (since it doesn't know)... probably we should
# just fix that deficiency of midx files eventually,
# although it'll make the files bigger. This method is
# certainly not very efficient.
oldpack = w.objcache.packname_containing(shar)
debug2('new suggestion: %r\n' % oldpack)
w.objcache.refresh()
assert(not oldpack == True)
assert(oldpack.endswith('.idx'))
(dir,name) = os.path.split(oldpack)
if not (name in suggested):
Expand Down
61 changes: 32 additions & 29 deletions lib/bup/git.py
Expand Up @@ -6,7 +6,7 @@
from bup.helpers import *
from bup import _helpers, path

MIDX_VERSION = 2
MIDX_VERSION = 4

"""Discussion of bloom constants for bup:
Expand Down Expand Up @@ -247,9 +247,11 @@ def find_offset(self, hash):
return self._ofs_from_idx(idx)
return None

def exists(self, hash):
def exists(self, hash, want_source=False):
"""Return nonempty if the object exists in this index."""
return hash and (self._idx_from_hash(hash) != None) and True or None
if hash and (self._idx_from_hash(hash) != None):
return want_source and self.name or True
return None

def __len__(self):
return int(self.fanout[255])
Expand All @@ -275,6 +277,10 @@ def _idx_from_hash(self, hash):
return mid
return None

def iter_with_idx_i(self, idx_i):
    """Yield an (entry, idx_i) pair for each entry found by iterating self."""
    for entry in self:
        pair = (entry, idx_i)
        yield pair


class PackIdxV1(PackIdx):
"""Object representation of a Git pack index (version 1) file."""
Expand Down Expand Up @@ -475,9 +481,10 @@ def __init__(self, filename):
self.entries = 2**self.bits
self.fanout = buffer(self.map, 12, self.entries*4)
shaofs = 12 + self.entries*4
nsha = self._fanget(self.entries-1)
self.nsha = nsha = self._fanget(self.entries-1)
self.shatable = buffer(self.map, shaofs, nsha*20)
self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
self.whichlist = buffer(self.map, shaofs + nsha*20, nsha*4)
self.idxnames = str(self.map[shaofs + 24*nsha:]).split('\0')

def _init_failed(self):
self.bits = 0
Expand All @@ -494,7 +501,13 @@ def _fanget(self, i):
def _get(self, i):
return str(self.shatable[i*20:(i+1)*20])

def exists(self, hash):
def _get_idx_i(self, i):
return struct.unpack('!I', self.whichlist[i*4:(i+1)*4])[0]

def _get_idxname(self, i):
return self.idxnames[self._get_idx_i(i)]

def exists(self, hash, want_source=False):
"""Return nonempty if the object exists in the index files."""
global _total_searches, _total_steps
_total_searches += 1
Expand Down Expand Up @@ -525,9 +538,13 @@ def exists(self, hash):
end = mid
endv = _helpers.firstword(v)
else: # got it!
return True
return want_source and self._get_idxname(mid) or True
return None

def iter_with_idx_i(self, ofs):
    """Yield (sha buffer, ofs + idx number) for each entry of the sha table.

    The number of entries comes from the last fanout slot; each sha is a
    20-byte buffer over self.shatable.
    """
    nsha = self._fanget(self.entries-1)
    for pos in xrange(nsha):
        sha = buffer(self.shatable, pos*20, 20)
        yield sha, ofs+self._get_idx_i(pos)

def __iter__(self):
for i in xrange(self._fanget(self.entries-1)):
yield buffer(self.shatable, i*20, 20)
Expand Down Expand Up @@ -560,7 +577,7 @@ def __iter__(self):
def __len__(self):
return sum(len(pack) for pack in self.packs)

def exists(self, hash):
def exists(self, hash, want_source=False):
"""Return nonempty if the object exists in the index files."""
global _total_searches
_total_searches += 1
Expand All @@ -575,10 +592,11 @@ def exists(self, hash):
for i in xrange(len(self.packs)):
p = self.packs[i]
_total_searches -= 1 # will be incremented by sub-pack
if p.exists(hash):
ix = p.exists(hash, want_source=want_source)
if ix:
# reorder so most recently used packs are searched first
self.packs = [p] + self.packs[:i] + self.packs[i+1:]
return p.name
return ix
self.do_bloom = True
return None

Expand Down Expand Up @@ -658,21 +676,6 @@ def refresh(self, skip_midx = False):
debug1('PackIdxList: using %d index%s.\n'
% (len(self.packs), len(self.packs)!=1 and 'es' or ''))

def packname_containing(self, hash):
# figure out which pack contains a given hash.
# FIXME: if the midx file format would just *store* this information,
# we could calculate it a lot more efficiently. But it's not needed
# often, so let's do it like this.
# Scans every *.idx file under self.dir and returns the path of the first
# one whose exists(hash) is true; falls off the end (returning None)
# when no index matches.
for f in glob.glob(os.path.join(self.dir,'*.idx')):
# NOTE(review): glob.glob() already returns paths that include self.dir,
# so this join looks redundant, and for a relative self.dir it would
# repeat the directory prefix -- confirm before reusing this code.
full = os.path.join(self.dir, f)
try:
ix = open_idx(full)
except GitError, e:
# An index that cannot be opened is recorded and skipped, not fatal.
add_error(e)
continue
if ix.exists(hash):
return full

def add(self, hash):
"""Insert an additional object in the list."""
self.also.add(hash)
Expand Down Expand Up @@ -715,15 +718,15 @@ def open_idx(filename):
raise GitError('idx filenames must end with .idx or .midx')


def idxmerge(idxlist, final_progress=True):
def idxmerge(idxlist, final_progress=True, total=None):
"""Generate a list of all the objects reachable in a PackIdxList."""
def pfunc(count, total):
progress('Reading indexes: %.2f%% (%d/%d)\r'
% (count*100.0/total, count, total))
def pfinal(count, total):
if final_progress:
log('Reading indexes: %.2f%% (%d/%d), done.\n' % (100, total, total))
return merge_iter(idxlist, 10024, pfunc, pfinal)
return merge_iter(idxlist, 10024, pfunc, pfinal, total=total)


def _make_objcache():
Expand Down Expand Up @@ -800,10 +803,10 @@ def _require_objcache(self):
raise GitError(
"PackWriter not opened or can't check exists w/o objcache")

def exists(self, id):
def exists(self, id, want_source=False):
"""Return non-empty if an object is found in the object cache."""
self._require_objcache()
return self.objcache.exists(id)
return self.objcache.exists(id, want_source=want_source)

def maybe_write(self, type, content):
"""Write an object to the pack file if not present and return its id."""
Expand Down
4 changes: 2 additions & 2 deletions lib/bup/helpers.py
Expand Up @@ -88,13 +88,13 @@ def next(it):
return None


def merge_iter(iters, pfreq, pfunc, pfinal, key=None):
def merge_iter(iters, pfreq, pfunc, pfinal, key=None, total=None):
if key:
samekey = lambda e, pe: getattr(e, key) == getattr(pe, key, None)
else:
samekey = operator.eq
count = 0
total = sum(len(it) for it in iters)
total = total or sum(len(it) for it in iters)
iters = (iter(it) for it in iters)
heap = ((next(it),it) for it in iters)
heap = [(e,it) for e,it in heap if e]
Expand Down

0 comments on commit ccfa3bd

Please sign in to comment.