This repository has been archived by the owner on Nov 9, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
betterwalk.py
308 lines (257 loc) · 10.8 KB
/
betterwalk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""BetterWalk, a better and faster os.walk() for Python.
BetterWalk is a somewhat better and significantly faster version of Python's
os.walk(), as well as a generator version of os.listdir(). See README.md or
https://github.com/benhoyt/betterwalk for rationale and documentation.
BetterWalk is released under the new BSD 3-clause license. See LICENSE.txt for
the full license text.
"""
import ctypes
import fnmatch
import os
import stat
import sys
__version__ = '0.6'
__all__ = ['iterdir', 'iterdir_stat', 'walk']
# Windows implementation
if sys.platform == 'win32':
from ctypes import wintypes
# Various constants from windows.h
INVALID_HANDLE_VALUE = ctypes.c_void_p(-1).value
ERROR_FILE_NOT_FOUND = 2
ERROR_NO_MORE_FILES = 18
FILE_ATTRIBUTE_READONLY = 1
FILE_ATTRIBUTE_DIRECTORY = 16
FILE_ATTRIBUTE_REPARSE_POINT = 1024
# Numer of seconds between 1601-01-01 and 1970-01-01
SECONDS_BETWEEN_EPOCHS = 11644473600
kernel32 = ctypes.windll.kernel32
# ctypes wrappers for (wide string versions of) FindFirstFile,
# FindNextFile, and FindClose
FindFirstFile = kernel32.FindFirstFileW
FindFirstFile.argtypes = [
wintypes.LPCWSTR,
ctypes.POINTER(wintypes.WIN32_FIND_DATAW),
]
FindFirstFile.restype = wintypes.HANDLE
FindNextFile = kernel32.FindNextFileW
FindNextFile.argtypes = [
wintypes.HANDLE,
ctypes.POINTER(wintypes.WIN32_FIND_DATAW),
]
FindNextFile.restype = wintypes.BOOL
FindClose = kernel32.FindClose
FindClose.argtypes = [wintypes.HANDLE]
FindClose.restype = wintypes.BOOL
# The conversion functions below are taken more or less straight from
# CPython's Modules/posixmodule.c
def attributes_to_mode(attributes):
"""Convert Win32 dwFileAttributes to st_mode."""
mode = 0
if attributes & FILE_ATTRIBUTE_DIRECTORY:
mode |= stat.S_IFDIR | 0o111
else:
mode |= stat.S_IFREG
if attributes & FILE_ATTRIBUTE_READONLY:
mode |= 0o444
else:
mode |= 0o666
if attributes & FILE_ATTRIBUTE_REPARSE_POINT:
mode |= stat.S_IFLNK
return mode
def filetime_to_time(filetime):
"""Convert Win32 FILETIME to time since Unix epoch in seconds."""
total = filetime.dwHighDateTime << 32 | filetime.dwLowDateTime
return total / 10000000.0 - SECONDS_BETWEEN_EPOCHS
def find_data_to_stat(data):
"""Convert Win32 FIND_DATA struct to stat_result."""
st_mode = attributes_to_mode(data.dwFileAttributes)
st_size = data.nFileSizeHigh << 32 | data.nFileSizeLow
st_atime = filetime_to_time(data.ftLastAccessTime)
st_mtime = filetime_to_time(data.ftLastWriteTime)
st_ctime = filetime_to_time(data.ftCreationTime)
# These are set to zero rather than None, per CPython's posixmodule.c
st_ino = 0
st_dev = 0
st_nlink = 0
st_uid = 0
st_gid = 0
return os.stat_result((st_mode, st_ino, st_dev, st_nlink, st_uid,
st_gid, st_size, st_atime, st_mtime, st_ctime))
def win_error(error, filename):
exc = WindowsError(error, ctypes.FormatError(error))
exc.filename = filename
return exc
def iterdir_stat(path='.', pattern='*', fields=None):
"""See iterdir_stat.__doc__ below for docstring."""
# We can ignore "fields" in Windows, as FindFirst/Next gives full stat
if '[' in pattern or pattern.endswith('?'):
# Windows FindFirst/Next doesn't support bracket matching, and it
# doesn't handle ? at the end patterns as per fnmatch; use fnmatch
wildcard = '*'
else:
# Otherwise use built-in FindFirst/Next wildcard matching
wildcard = pattern
pattern = None
# Call FindFirstFile and handle errors
data = wintypes.WIN32_FIND_DATAW()
data_p = ctypes.byref(data)
filename = os.path.join(path, wildcard)
handle = FindFirstFile(filename, data_p)
if handle == INVALID_HANDLE_VALUE:
error = ctypes.GetLastError()
if error == ERROR_FILE_NOT_FOUND:
# No files, don't yield anything
return
raise win_error(error, path)
# Call FindNextFile in a loop, stopping when no more files
try:
while True:
# Skip '.' and '..' (current and parent directory), but
# otherwise yield (filename, stat_result) tuple
name = data.cFileName
if name not in ('.', '..'):
if pattern is None or fnmatch.fnmatch(name, pattern):
st = find_data_to_stat(data)
yield (name, st)
success = FindNextFile(handle, data_p)
if not success:
error = ctypes.GetLastError()
if error == ERROR_NO_MORE_FILES:
break
raise win_error(error, path)
finally:
if not FindClose(handle):
raise win_error(ctypes.GetLastError(), path)
# Linux, OS X, and BSD implementation
elif sys.platform.startswith(('linux', 'darwin')) or 'bsd' in sys.platform:
import ctypes.util
DIR_p = ctypes.c_void_p
# Rather annoying how the dirent struct is slightly different on each
# platform. The only fields we care about are d_name and d_type.
class dirent(ctypes.Structure):
if sys.platform.startswith('linux'):
_fields_ = (
('d_ino', ctypes.c_ulong),
('d_off', ctypes.c_long),
('d_reclen', ctypes.c_ushort),
('d_type', ctypes.c_byte),
('d_name', ctypes.c_char * 256),
)
else:
_fields_ = (
('d_ino', ctypes.c_uint32), # must be uint32, not ulong
('d_reclen', ctypes.c_ushort),
('d_type', ctypes.c_byte),
('d_namlen', ctypes.c_byte),
('d_name', ctypes.c_char * 256),
)
DT_UNKNOWN = 0
dirent_p = ctypes.POINTER(dirent)
dirent_pp = ctypes.POINTER(dirent_p)
libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True)
opendir = libc.opendir
opendir.argtypes = [ctypes.c_char_p]
opendir.restype = DIR_p
readdir_r = libc.readdir_r
readdir_r.argtypes = [DIR_p, dirent_p, dirent_pp]
readdir_r.restype = ctypes.c_int
closedir = libc.closedir
closedir.argtypes = [DIR_p]
closedir.restype = ctypes.c_int
file_system_encoding = sys.getfilesystemencoding()
def type_to_stat(d_type):
"""Convert dirent.d_type value to stat_result."""
st_mode = d_type << 12
return os.stat_result((st_mode,) + (None,) * 9)
def posix_error(filename):
errno = ctypes.get_errno()
exc = OSError(errno, os.strerror(errno))
exc.filename = filename
return exc
def iterdir_stat(path='.', pattern='*', fields=None):
"""See iterdir_stat.__doc__ below for docstring."""
# If we need more than just st_mode_type (dirent.d_type), we need to
# call stat() on each file
need_stat = fields is not None and set(fields) != set(['st_mode_type'])
dir_p = opendir(path.encode(file_system_encoding))
if not dir_p:
raise posix_error(path)
try:
entry = dirent()
result = dirent_p()
while True:
if readdir_r(dir_p, entry, result):
raise posix_error(path)
if not result:
break
name = entry.d_name.decode(file_system_encoding)
if name not in ('.', '..'):
if pattern == '*' or fnmatch.fnmatch(name, pattern):
if need_stat or entry.d_type == DT_UNKNOWN:
st = os.stat(os.path.join(path, name))
else:
st = type_to_stat(entry.d_type)
yield (name, st)
finally:
if closedir(dir_p):
raise posix_error(path)
# Some other system -- have to fall back to using os.listdir() and os.stat()
else:
def iterdir_stat(path='.', pattern='*', fields=None):
"""See iterdir_stat.__doc__ below for docstring."""
names = os.listdir(path)
if pattern != '*':
names = fnmatch.filter(names, pattern)
for name in names:
if fields is not None:
st = os.stat(os.path.join(path, name))
else:
st = os.stat_result((None,) * 10)
yield (name, st)
iterdir_stat.__doc__ = """
Yield tuples of (filename, stat_result) for each filename that matches
"pattern" in the directory given by "path". Like os.listdir(), '.' and '..'
are skipped, and the values are yielded in system-dependent order.
Pattern matching is done as per fnmatch.fnmatch(), but is more efficient if
the system's directory iteration supports pattern matching (like Windows).
The "fields" parameter specifies which fields to provide in each stat_result.
If None, only the fields the operating system can get "for free" are present
in stat_result. Otherwise "fields" must be an iterable of 'st_*' attribute
names that the caller wants in each stat_result. The only special attribute
name is 'st_mode_type', which means the type bits in the st_mode field.
In practice, all fields are provided for free on Windows; whereas only the
st_mode_type information is provided for free on Linux, Mac OS X, and BSD.
"""
def iterdir(path='.', pattern='*'):
"""Like iterdir_stat(), but only yield the filenames."""
for name, st in iterdir_stat(path, pattern=pattern):
yield name
def walk(top, topdown=True, onerror=None, followlinks=False):
"""Just like os.walk(), but faster, as it uses iterdir_stat internally."""
# Determine which are files and which are directories
dirs = []
dir_stats = []
nondirs = []
try:
for name, st in iterdir_stat(top, fields=['st_mode_type']):
if stat.S_ISDIR(st.st_mode):
dirs.append(name)
dir_stats.append(st)
else:
nondirs.append(name)
except OSError as err:
if onerror is not None:
onerror(err)
return
# Yield before recursion if going top down
if topdown:
yield top, dirs, nondirs
# Recurse into sub-directories, following symbolic links if "followlinks"
for name, st in zip(dirs, dir_stats):
new_path = os.path.join(top, name)
if followlinks or not stat.S_ISLNK(st.st_mode):
for x in walk(new_path, topdown, onerror, followlinks):
yield x
# Yield before recursion if going bottom up
if not topdown:
yield top, dirs, nondirs