Skip to content

Commit

Permalink
vfs: take advantage of bup chunking to make file seeking faster.
Browse files Browse the repository at this point in the history
If you have a huge file, you can now seek around inside it (eg. in 'bup
fuse') without having to read its entire contents.  Calculating the file
size is also really fast now.

This makes a bup fuse-mounted filesystem much more useful for real-time
access.  For example, I was able to connect to an sqlite3 database and have
it work at a reasonable speed.  (Obviously, since 'bup fuse' is written in
python and doesn't currently support threading, the speed could still be
improved, but at least it's no longer godawful.)
  • Loading branch information
apenwarr committed Apr 23, 2010
1 parent 0ed56b2 commit 0e3565a
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 38 deletions.
4 changes: 3 additions & 1 deletion cmd/fuse-cmd.py
Expand Up @@ -93,7 +93,9 @@ def release(self, path, flags):
def read(self, path, size, offset):
log('--read(%r)\n' % path)
n = cache_get(self.top, path)
return n.readbytes(offset, size)
o = n.open()
o.seek(offset)
return o.read(size)


if not hasattr(fuse, '__version__'):
Expand Down
167 changes: 130 additions & 37 deletions lib/bup/vfs.py
Expand Up @@ -23,13 +23,103 @@ class TooManySymlinks(NodeError):
pass


class FileReader:
def __init__(self, node):
self.n = node
def _treeget(hash):
it = cp().get(hash.encode('hex'))
type = it.next()
assert(type == 'tree')
return git._treeparse(''.join(it))


def _tree_decode(hash):
tree = [(int(name,16),stat.S_ISDIR(int(mode,8)),sha)
for (mode,name,sha)
in _treeget(hash)]
assert(tree == list(sorted(tree)))
return tree


def _chunk_len(hash):
return sum(len(b) for b in cp().join(hash.encode('hex')))


def _last_chunk_info(hash):
tree = _tree_decode(hash)
assert(tree)
(ofs,isdir,sha) = tree[-1]
if isdir:
(subofs, sublen) = _last_chunk_info(sha)
return (ofs+subofs, sublen)
else:
return (ofs, _chunk_len(sha))


def _total_size(hash):
(lastofs, lastsize) = _last_chunk_info(hash)
return lastofs + lastsize


def _chunkiter(hash, startofs):
assert(startofs >= 0)
tree = _tree_decode(hash)

# skip elements before startofs
for i in xrange(len(tree)):
if i+1 >= len(tree) or tree[i+1][0] > startofs:
break
first = i

# iterate through what's left
for i in xrange(first, len(tree)):
(ofs,isdir,sha) = tree[i]
skipmore = startofs-ofs
if skipmore < 0:
skipmore = 0
if isdir:
for b in _chunkiter(sha, skipmore):
yield b
else:
yield ''.join(cp().join(sha.encode('hex')))[skipmore:]


class _ChunkReader:
def __init__(self, hash, isdir, startofs):
if isdir:
self.it = _chunkiter(hash, startofs)
self.blob = None
else:
self.it = None
self.blob = cp().get(hash.encode('hex'))

def next(self, size):
out = ''
while len(out) < size:
if self.it and not self.blob:
try:
self.blob = self.it.next()
except StopIteration:
self.it = None
if self.blob:
want = size - len(out)
out += self.blob[:want]
self.blob = self.blob[want:]
if not self.it:
break
log('next(%d) returned %d\n' % (size, len(out)))
return out


class _FileReader:
def __init__(self, hash, size, isdir):
self.hash = hash
self.ofs = 0
self.size = self.n.size()
self.size = size
self.isdir = isdir
self.reader = None

def seek(self, ofs):
if self.ofs == ofs:
return
self.reader = None
if ofs > self.size:
self.ofs = self.size
elif ofs < 0:
Expand All @@ -43,7 +133,13 @@ def tell(self):
def read(self, count = -1):
if count < 0:
count = self.size - self.ofs
buf = self.n.readbytes(self.ofs, count)
if not self.reader:
self.reader = _ChunkReader(self.hash, self.isdir, self.ofs)
try:
buf = self.reader.next(count)
except:
self.reader = None
raise # our offsets will be all screwed up otherwise
self.ofs += len(buf)
return buf

Expand Down Expand Up @@ -131,48 +227,45 @@ def size(self):

def open(self):
raise NotFile('%s is not a regular file' % self.name)

def readbytes(self, ofs, count):
raise NotFile('%s is not a regular file' % self.name)

def read(self, num = -1):
if num < 0:
num = self.size()
return self.readbytes(0, num)




class File(Node):
def __init__(self, parent, name, mode, hash):
def __init__(self, parent, name, mode, hash, bupmode):
Node.__init__(self, parent, name, mode, hash)
self.bupmode = bupmode
self._cached_size = None
self._filereader = None

def _content(self):
return cp().join(self.hash.encode('hex'))

def open(self):
return FileReader(self)
# You'd think FUSE might call this only once each time a file is
# opened, but no; it's really more of a refcount, and it's called
# once per read(). Thus, it's important to cache the filereader
# object here so we're not constantly re-seeking.
if not self._filereader:
self._filereader = _FileReader(self.hash, self.size(),
self.bupmode == git.BUP_CHUNKED)
return self._filereader

def size(self):
# FIXME inefficient. If a file is chunked, we could just check
# the offset + size of the very last chunk.
if self._cached_size == None:
self._cached_size = sum(len(blob) for blob in self._content())
log('<<<<File.size() is calculating\n')
if self.bupmode == git.BUP_CHUNKED:
self._cached_size = _total_size(self.hash)
else:
self._cached_size = _chunk_len(self.hash)
return self._cached_size

def readbytes(self, ofs, count):
# FIXME inefficient. If a file is chunked, we could choose to
# read only the required chunks rather than joining the whole thing.
buf = ''.join(self._content())
return buf[ofs:ofs+count]



_symrefs = 0
class Symlink(File):
def __init__(self, parent, name, hash):
File.__init__(self, parent, name, 0120000, hash)
def __init__(self, parent, name, hash, bupmode):
File.__init__(self, parent, name, 0120000, hash, bupmode)

def size(self):
return len(self.readlink())

def readlink(self):
return self.read(1024)
return ''.join(cp().join(self.hash.encode('hex')))

def dereference(self):
global _symrefs
Expand All @@ -191,10 +284,10 @@ def _lresolve(self, parts):

class FakeSymlink(Symlink):
def __init__(self, parent, name, toname):
Symlink.__init__(self, parent, name, EMPTY_SHA)
Symlink.__init__(self, parent, name, EMPTY_SHA, git.BUP_NORMAL)
self.toname = toname

def _content(self):
def readlink(self):
return self.toname


Expand All @@ -217,9 +310,9 @@ def _mksubs(self):
if stat.S_ISDIR(mode):
self._subs[name] = Dir(self, name, mode, sha)
elif stat.S_ISLNK(mode):
self._subs[name] = Symlink(self, name, sha)
self._subs[name] = Symlink(self, name, sha, bupmode)
else:
self._subs[name] = File(self, name, mode, sha)
self._subs[name] = File(self, name, mode, sha, bupmode)


class CommitList(Node):
Expand Down

0 comments on commit 0e3565a

Please sign in to comment.