
Commit 0dc3390
Use a process pool to calculate hashes and perform stat()
Fixes #23
ambv committed May 17, 2020
1 parent 104e07b commit 0dc3390
Showing 3 changed files with 62 additions and 38 deletions.
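
For orientation, the pattern this commit adopts, shown as a minimal standalone sketch rather than the project's own code (hash_file and the directory listing are illustrative stand-ins; the real worker, compute_one in the diff below, also stats the file and signals skips with BitrotException):

import hashlib
import os
from concurrent.futures import ProcessPoolExecutor, as_completed

def hash_file(path, chunk_size=16384):
    """Worker run in a separate process: stat and hash one file."""
    size = os.stat(path).st_size
    digest = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return path, size, digest.hexdigest()

if __name__ == "__main__":
    paths = [p for p in os.listdir('.') if os.path.isfile(p)]
    with ProcessPoolExecutor() as pool:
        futures = [pool.submit(hash_file, p) for p in paths]
        for future in as_completed(futures):  # results arrive as workers finish
            path, size, sha1 = future.result()
            print(path, size, sha1)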
10 changes: 8 additions & 2 deletions bin/bitrot
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-

# Copyright (C) 2013 by Łukasz Langa
#
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
@@ -26,5 +26,11 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from multiprocessing import freeze_support

from bitrot import run_from_command_line
run_from_command_line()


if __name__ == "__main__":
    freeze_support()
    run_from_command_line()
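
Background on the new guarded entry point (my summary, not text from the commit): with the spawn start method that multiprocessing uses on Windows and macOS, each worker process re-imports the entry-point module, so a module-level run_from_command_line() call would be re-executed in every child; freeze_support() additionally keeps frozen Windows executables working. A minimal sketch of the required shape:

from concurrent.futures import ProcessPoolExecutor
from multiprocessing import freeze_support

def square(n):
    return n * n

def main():
    with ProcessPoolExecutor() as pool:
        print(list(pool.map(square, range(8))))

if __name__ == "__main__":
    freeze_support()  # no-op unless running from a frozen executable
    main()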
2 changes: 1 addition & 1 deletion setup.py
@@ -58,8 +58,8 @@
    include_package_data = True,
    zip_safe = False, # if only because of the readme file
    install_requires = [
        'futures; python_version == "2.7"'
    ],

    classifiers = [
        'Development Status :: 4 - Beta',
        'License :: OSI Approved :: MIT License',
88 changes: 53 additions & 35 deletions src/bitrot.py
@@ -40,6 +40,8 @@
import time
import unicodedata

from concurrent.futures import ProcessPoolExecutor, wait, as_completed


DEFAULT_CHUNK_SIZE = 16384 # block size in HFS+; 4X the block size in ext4
DOT_THRESHOLD = 200
@@ -144,14 +146,51 @@ def list_existing_paths(directory, expected=(), ignored=(), follow_links=False):
    return paths, total_size


def compute_one(path, chunk_size):
    """Return a tuple with (unicode path, size, mtime, sha1). Takes a binary path."""
    p_uni = normalize_path(path)
    try:
        st = os.stat(path)
    except OSError as ex:
        if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
            # The file disappeared between listing existing paths and
            # this run or is (temporarily?) locked with different
            # permissions. We'll just skip it for now.
            print(
                '\rwarning: `{}` is currently unavailable for '
                'reading: {}'.format(
                    p_uni, ex,
                ),
                file=sys.stderr,
            )
            raise BitrotException

        raise # Not expected? https://github.com/ambv/bitrot/issues/

    new_mtime = int(st.st_mtime)

    try:
        new_sha1 = sha1(path, chunk_size)
    except (IOError, OSError) as e:
        print(
            '\rwarning: cannot compute hash of {} [{}]'.format(
                p_uni, errno.errorcode[e.args[0]],
            ),
            file=sys.stderr,
        )
        raise BitrotException

    return p_uni, st.st_size, int(st.st_mtime), new_sha1
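
A design note of mine, not from the commit: an exception raised inside a ProcessPoolExecutor worker is pickled and re-raised in the parent when future.result() is called, which is how run() further below can turn compute_one's BitrotException into a plain continue. A toy illustration (Skip stands in for BitrotException):

from concurrent.futures import ProcessPoolExecutor

class Skip(Exception):
    """Stand-in for BitrotException."""

def work(n):
    if n % 2:
        raise Skip()        # raised in the worker process...
    return n * 10

if __name__ == "__main__":
    with ProcessPoolExecutor() as pool:
        futures = [pool.submit(work, n) for n in range(4)]
        for f in futures:
            try:
                print(f.result())   # ...re-raised here in the parent
            except Skip:
                continue            # mirrors run()'s handling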


class BitrotException(Exception):
    pass


class Bitrot(object):
    def __init__(
        self, verbosity=1, test=False, follow_links=False, commit_interval=300,
        chunk_size=DEFAULT_CHUNK_SIZE,
        chunk_size=DEFAULT_CHUNK_SIZE, workers=os.cpu_count(),
    ):
        self.verbosity = verbosity
        self.test = test
@@ -160,6 +199,7 @@ def __init__(
        self.chunk_size = chunk_size
        self._last_reported_size = ''
        self._last_commit_ts = 0
        self.pool = ProcessPoolExecutor(max_workers=workers)

    def maybe_commit(self, conn):
        if time.time() < self._last_commit_ts + self.commit_interval:
@@ -195,44 +235,18 @@ def run(self):
            follow_links=self.follow_links,
        )
        paths_uni = set(normalize_path(p) for p in paths)
        futures = [self.pool.submit(compute_one, p, self.chunk_size) for p in paths]

        for p in sorted(paths):
            p_uni = normalize_path(p)
        for future in as_completed(futures):
            try:
                st = os.stat(p)
            except OSError as ex:
                if ex.errno in IGNORED_FILE_SYSTEM_ERRORS:
                    # The file disappeared between listing existing paths and
                    # this run or is (temporarily?) locked with different
                    # permissions. We'll just skip it for now.
                    print(
                        '\rwarning: `{}` is currently unavailable for '
                        'reading: {}'.format(
                            p_uni, ex,
                        ),
                        file=sys.stderr,
                    )
                    continue

                raise # Not expected? https://github.com/ambv/bitrot/issues/
                p_uni, new_size, new_mtime, new_sha1 = future.result()
            except BitrotException:
                continue

            new_mtime = int(st.st_mtime)
            current_size += st.st_size
            current_size += new_size
            if self.verbosity:
                self.report_progress(current_size, total_size)

            try:
                new_sha1 = sha1(p, self.chunk_size)
            except (IOError, OSError) as e:
                print(
                    '\rwarning: cannot compute hash of {} [{}]'.format(
                        p, errno.errorcode[e.args[0]],
                    ),
                    file=sys.stderr,
                )
                missing_paths.discard(p_uni)
                continue

            if p_uni not in missing_paths:
                # We are not expecting this path, it wasn't in the database yet.
                # It's either new or a rename. Let's handle that.
@@ -271,11 +285,11 @@ def run(self):
                continue

            if stored_sha1 != new_sha1:
                errors.append(p)
                errors.append(p_uni)
                print(
                    '\rerror: SHA1 mismatch for {}: expected {}, got {}.'
                    ' Last good hash checked on {}.'.format(
                        p.decode(FSENCODING), stored_sha1, new_sha1, stored_ts
                        p_uni, stored_sha1, new_sha1, stored_ts
                    ),
                    file=sys.stderr,
                )
@@ -538,6 +552,9 @@ def run_from_command_line():
        '--commit-interval', type=float, default=300,
        help='min time in seconds between commits '
             '(0 commits on every operation)')
    parser.add_argument(
        '-w', '--workers', type=int, default=os.cpu_count(),
        help='run this many workers (use -w1 for slow magnetic disks)')
    parser.add_argument(
        '--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE,
        help='read files this many bytes at a time')
@@ -563,6 +580,7 @@ def run_from_command_line():
        follow_links=args.follow_links,
        commit_interval=args.commit_interval,
        chunk_size=args.chunk_size,
        workers=args.workers,
    )
    if args.fsencoding:
        FSENCODING = args.fsencoding
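
Possible invocations of the new flag, assuming the installed bitrot script is on PATH (output omitted):

bitrot                    # default: one hashing worker per CPU core
bitrot -w 1               # single worker; the help text suggests this for slow magnetic disks
bitrot --workers 4 --chunk-size 1048576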
