Skip to content

Commit

Permalink
Split directory recursion stuff from cmd-index.py into drecurse.py.
Browse files Browse the repository at this point in the history
Also add a new command, 'bup drecurse', which just recurses through a
directory tree and prints all the filenames.  This is useful for timing
performance vs. the native 'find' command.

The result is a bit embarrassing; for my home directory of about 188000
files, drecurse is about 10x slower:

$ time bup drecurse -q ~
real	0m2.935s
user	0m2.312s
sys	0m0.580s

$ time find ~ -printf ''
real	0m0.385s
user	0m0.096s
sys	0m0.284s

$ time find ~ -printf '%s\n' >/dev/null
real	0m0.662s
user	0m0.208s
sys	0m0.456s
  • Loading branch information
apenwarr committed Feb 3, 2010
1 parent e27c726 commit 5db9f39
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 110 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ endif
default: all

all: bup-split bup-join bup-save bup-init bup-server bup-index bup-tick \
bup-midx bup-fuse bup-ls bup-damage bup-fsck bup-margin \
bup-midx bup-fuse bup-ls bup-damage bup-fsck bup-margin bup-drecurse \
bup memtest randomgen$(EXT) _hashsplit$(SOEXT)

randomgen$(EXT): randomgen.o
Expand Down
20 changes: 20 additions & 0 deletions cmd-drecurse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python
import options, drecurse
from helpers import *

optspec = """
bup drecurse <path>
--
x,xdev don't cross filesystem boundaries
q,quiet don't actually print filenames
"""
o = options.Options('bup drecurse', optspec)
(opt, flags, extra) = o.parse(sys.argv[1:])

if len(extra) != 1:
log("drecurse: exactly one filename expected\n")
o.usage()

for (name,st) in drecurse.recursive_dirlist(extra, opt.xdev):
if not opt.quiet:
print name
98 changes: 2 additions & 96 deletions cmd-index.py
Original file line number Diff line number Diff line change
@@ -1,103 +1,9 @@
#!/usr/bin/env python
import os, sys, stat, time
import options, git, index
import options, git, index, drecurse
from helpers import *


try:
    O_LARGEFILE = os.O_LARGEFILE
except AttributeError:
    # The os module has no O_LARGEFILE here; 0 adds nothing to the open flags.
    O_LARGEFILE = 0


class OsFile:
    """Wrapper around a read-only OS-level file descriptor.

    The descriptor is opened with O_NOFOLLOW and closed when the wrapper
    object is destroyed.
    """

    def __init__(self, path):
        # Assign fd before opening so __del__ is safe if os.open() raises.
        self.fd = None
        self.fd = os.open(path, os.O_RDONLY|O_LARGEFILE|os.O_NOFOLLOW)

    def __del__(self):
        # Test against None rather than truthiness: 0 is a valid descriptor
        # and would otherwise never be closed.
        if self.fd is not None:
            fd = self.fd
            self.fd = None
            os.close(fd)

    def fchdir(self):
        """Change the process's working directory to the opened descriptor."""
        os.fchdir(self.fd)


# Every error passed to add_error(), in the order reported.
saved_errors = []
def add_error(e):
    """Record error `e` in saved_errors, then report it through log()."""
    saved_errors.append(e)
    report = '\n%s\n' % e
    log(report)


# the use of fchdir() and lstat() are for two reasons:
# - help out the kernel by not making it repeatedly look up the absolute path
# - avoid race conditions caused by doing listdir() on a changing symlink
def dirlist(path):
l = []
try:
OsFile(path).fchdir()
except OSError, e:
add_error(e)
return l
for n in os.listdir('.'):
try:
st = os.lstat(n)
except OSError, e:
add_error(Exception('in %s: %s' % (index.realpath(path), str(e))))
continue
if stat.S_ISDIR(st.st_mode):
n += '/'
l.append((os.path.join(path, n), st))
l.sort(reverse=True)
return l


def _recursive_dirlist(path, xdev):
    """Generate (path, lstat_result) for everything below directory `path`.

    xdev -- device number the walk must stay on, or None for no restriction.
            Entries whose st_dev differs are logged and skipped.

    The contents of a subdirectory are yielded before the subdirectory
    itself.
    """
    # dirlist() changes the working directory; keep a handle on the starting
    # directory so it can be restored once this level is finished.
    olddir = OsFile('.')
    for (path,pst) in dirlist(path):
        if xdev is not None and pst.st_dev != xdev:
            log('Skipping %r: different filesystem.\n' % path)
            continue
        if stat.S_ISDIR(pst.st_mode):
            for i in _recursive_dirlist(path, xdev=xdev):
                yield i
        yield (path,pst)
    olddir.fchdir()


def _matchlen(a,b):
bi = iter(b)
count = 0
for ai in a:
try:
if bi.next() == ai:
count += 1
except StopIteration:
break
return count


def recursive_dirlist(paths):
    """Generate (path, lstat_result) for each of `paths` and all below them.

    The contents of a directory are yielded before the directory itself.
    Before a new path is processed, parent prefixes of the previous path that
    the new path does not share are yielded as (joined_prefix, None).  When
    the global opt.xdev is set, the walk stays on each top-level path's
    device.
    """
    prev_parents = ()
    for top in paths:
        parts = index.pathsplit(top)
        # Emit parents of the previous path until what remains is a prefix
        # of the current one.
        while _matchlen(parts, prev_parents) < len(prev_parents):
            yield (''.join(prev_parents), None)
            prev_parents.pop()
        top_st = os.lstat(top)
        dev = None
        if opt.xdev:
            dev = top_st.st_dev
        if stat.S_ISDIR(top_st.st_mode):
            for entry in _recursive_dirlist(top, xdev=dev):
                yield entry
        yield (top, top_st)
        prev_parents = parts[:-1]


def merge_indexes(out, r1, r2):
log('bup: merging indexes.\n')
for e in index._last_writer_wins_iter([r1, r2]):
Expand Down Expand Up @@ -132,7 +38,7 @@ def hashgen(name):

#log('doing: %r\n' % paths)

for (path,pst) in recursive_dirlist([top]):
for (path,pst) in drecurse.recursive_dirlist([top], xdev=opt.xdev):
#log('got: %r\n' % path)
if opt.verbose>=2 or (opt.verbose==1 and stat.S_ISDIR(pst.st_mode)):
sys.stdout.write('%s\n' % path)
Expand Down
92 changes: 92 additions & 0 deletions drecurse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import stat
from helpers import *

try:
    O_LARGEFILE = os.O_LARGEFILE
except AttributeError:
    # The os module has no O_LARGEFILE here; 0 adds nothing to the open flags.
    O_LARGEFILE = 0


class OsFile:
    """Wrapper around a read-only OS-level file descriptor.

    The descriptor is opened with O_NOFOLLOW and closed when the wrapper
    object is destroyed.
    """

    def __init__(self, path):
        # Assign fd before opening so __del__ is safe if os.open() raises.
        self.fd = None
        self.fd = os.open(path, os.O_RDONLY|O_LARGEFILE|os.O_NOFOLLOW)

    def __del__(self):
        # Test against None rather than truthiness: 0 is a valid descriptor
        # and would otherwise never be closed.
        if self.fd is not None:
            fd = self.fd
            self.fd = None
            os.close(fd)

    def fchdir(self):
        """Change the process's working directory to the opened descriptor."""
        os.fchdir(self.fd)


# the use of fchdir() and lstat() are for two reasons:
# - help out the kernel by not making it repeatedly look up the absolute path
# - avoid race conditions caused by doing listdir() on a changing symlink
def dirlist(path):
l = []
try:
OsFile(path).fchdir()
except OSError, e:
add_error(e)
return l
for n in os.listdir('.'):
try:
st = os.lstat(n)
except OSError, e:
add_error(Exception('in %s: %s' % (index.realpath(path), str(e))))
continue
if stat.S_ISDIR(st.st_mode):
n += '/'
l.append((os.path.join(path, n), st))
l.sort(reverse=True)
return l


def _recursive_dirlist(path, xdev):
    """Generate (path, lstat_result) for everything below directory `path`.

    xdev -- device number the walk must stay on, or None for no restriction.
            Entries whose st_dev differs are logged and skipped.

    The contents of a subdirectory are yielded before the subdirectory
    itself.
    """
    # dirlist() changes the working directory; keep a handle on the starting
    # directory so it can be restored once this level is finished.
    olddir = OsFile('.')
    for (path,pst) in dirlist(path):
        if xdev is not None and pst.st_dev != xdev:
            log('Skipping %r: different filesystem.\n' % path)
            continue
        if stat.S_ISDIR(pst.st_mode):
            for i in _recursive_dirlist(path, xdev=xdev):
                yield i
        yield (path,pst)
    olddir.fchdir()


def _matchlen(a,b):
bi = iter(b)
count = 0
for ai in a:
try:
if bi.next() == ai:
count += 1
except StopIteration:
break
return count


def recursive_dirlist(paths, xdev):
    """Generate (path, lstat_result) for each of `paths` and all below them.

    paths -- a sequence of path strings (a bare string is rejected)
    xdev  -- if true, the walk below each top-level path stays on that
             path's own device

    The contents of a directory are yielded before the directory itself.
    Before a new path is processed, parent prefixes of the previous path that
    the new path does not share are yielded as (joined_prefix, None).

    NOTE(review): dirlist() changes the working directory while entry paths
    are built relative to the original one, so relative `paths` look unlikely
    to resolve below the first level -- confirm callers pass absolute paths.
    """
    assert(not isinstance(paths, str))
    last = []
    for path in paths:
        ps = pathsplit(path)
        # Emit parents of the previous path until what remains is a prefix
        # of the current one.
        while _matchlen(ps, last) < len(last):
            yield (''.join(last), None)
            last.pop()
        pst = os.lstat(path)
        # Keep the device number apart from the xdev flag.  Overwriting the
        # flag would turn the restriction off for all later paths as soon as
        # one top-level path reported st_dev == 0.
        if xdev:
            dev = pst.st_dev
        else:
            dev = None
        if stat.S_ISDIR(pst.st_mode):
            for i in _recursive_dirlist(path, xdev=dev):
                yield i
        yield (path,pst)
        last = ps[:-1]


17 changes: 17 additions & 0 deletions helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ def readpipe(argv):
return r


# FIXME: this function isn't very generic, because it splits the filename
# in an odd way and depends on a terminating '/' to indicate directories.
# But it's used in a couple of places, so let's put it here.
def pathsplit(p):
    """Split path `p` into components, keeping each separating '/'.

    Every component except the last ends with '/'.  A trailing '/' on `p`
    produces no empty final component, so 'a/b/' gives ['a/', 'b/'] and
    'a/b' gives ['a/', 'b'].  The empty string gives [].
    """
    parts = p.split('/')
    l = [i+'/' for i in parts[:-1]]
    # The last piece is empty when p is '' or ends with '/'; drop it then.
    if parts[-1] != '':
        l.append(parts[-1])
    return l


_username = None
def username():
global _username
Expand Down Expand Up @@ -169,3 +180,9 @@ def mmap_read(f, len = 0):

def mmap_readwrite(f, len = 0):
return _mmap_do(f, len, mmap.MAP_SHARED, mmap.PROT_READ|mmap.PROT_WRITE)


# Every error passed to add_error(), in the order reported.
saved_errors = []
def add_error(e):
    """Record error `e` in saved_errors, then report it through log()."""
    saved_errors.append(e)
    report = '\n%s\n' % e
    log(report)
19 changes: 6 additions & 13 deletions index.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,13 @@ def __init__(self, filename):
if f:
b = f.read(len(INDEX_HDR))
if b != INDEX_HDR:
raise Error('%s: header: expected %r, got %r'
log('warning: %s: header: expected %r, got %r'
% (filename, INDEX_HDR, b))
st = os.fstat(f.fileno())
if st.st_size:
self.m = mmap_readwrite(f)
self.writable = True
else:
st = os.fstat(f.fileno())
if st.st_size:
self.m = mmap_readwrite(f)
self.writable = True

def __del__(self):
self.close()
Expand Down Expand Up @@ -216,14 +217,6 @@ def _last_writer_wins_iter(iters):
l = filter(None, l)


def pathsplit(p):
    """Split path `p` into components, keeping each separating '/'.

    Every component except the last ends with '/'.  A trailing '/' on `p`
    produces no empty final component, so 'a/b/' gives ['a/', 'b/'] and
    'a/b' gives ['a/', 'b'].  The empty string gives [].
    """
    parts = p.split('/')
    l = [i+'/' for i in parts[:-1]]
    # The last piece is empty when p is '' or ends with '/'; drop it then.
    if parts[-1] != '':
        l.append(parts[-1])
    return l


class Writer:
def __init__(self, filename):
self.stack = []
Expand Down

0 comments on commit 5db9f39

Please sign in to comment.