Skip to content

Commit

Permalink
Merge pull request #140 from MattHardcastle/add-normalized-walk
Browse files Browse the repository at this point in the history
Add normalized_walk function to DeepScan

This is more efficient than reevaluating the platform (Darwin) in a busy loop.

Thanks to MattHardcastle!
  • Loading branch information
az0 committed Jan 1, 2017
2 parents 3367ffc + 37dfcae commit eafd2b7
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Expand Up @@ -24,6 +24,8 @@ install:
- cd ..
# coveralls.io
- pip install python-coveralls requests[security]
# install mock
- pip install mock
script: make tests COVERAGE="coverage run --include='bleachbit/*'"
after_success:
- coverage report
Expand Down
1 change: 1 addition & 0 deletions appveyor.yml
Expand Up @@ -42,6 +42,7 @@ cache:
test_script:
# shorten very long path because of error https://github.com/az0/bleachbit/issues/166
- 'set PATH=c:\windows\system32;c:\windows;c:\windows\system32\wbem'
- '%PYTHON_HOME%/Scripts/pip.exe install mock'
- '%PYTHON_HOME%/python.exe tests/TestAll.py'

artifacts:
Expand Down
35 changes: 25 additions & 10 deletions bleachbit/DeepScan.py
Expand Up @@ -27,21 +27,37 @@

import logging
import os
import platform
import re
import sys
import unicodedata


def normalize_filename(fn):
UTF8 = 'utf-8'


def to_unicode(s):
    """
    Return `s` as a unicode text object.

    A value that is already text is returned unchanged; any other value
    is decoded as UTF-8.  The text type is looked up at call time so the
    helper works both where the `unicode` builtin exists (Python 2) and
    where it does not (Python 3, where `str` is the text type).
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # Python 3: `unicode` builtin no longer exists
    if isinstance(s, text_type):
        return s
    return text_type(s, UTF8)


def normalized_walk(top, **kwargs):
"""
macOS uses decomposed UTF-8 to store filenames. This function
recomposes them on macOS.
macOS uses decomposed UTF-8 to store filenames. This function
is like `os.walk` but recomposes those decomposed filenames on
macOS.
"""
if 'darwin' == sys.platform:
return unicodedata.normalize(
'NFC', fn.decode('utf-8')).encode('utf-8')
if 'Darwin' == platform.system():
for dirpath, dirnames, filenames in os.walk(top, **kwargs):
yield dirpath, dirnames, [
unicodedata.normalize('NFC', to_unicode(fn)).encode(UTF8)
for fn in filenames
]
else:
return fn
for result in os.walk(top, **kwargs):
yield result


class DeepScan:
Expand All @@ -66,12 +82,11 @@ def scan(self):
yield_time = time.time()

for (top, regexes) in self.searches.items():
for (dirpath, dirnames, filenames) in os.walk(top):
for (dirpath, dirnames, filenames) in normalized_walk(top):
for regex in regexes:
# fixme, don't match filename twice
r = re.compile(regex)
for filename in filenames:
filename = normalize_filename(filename)
if r.search(filename):
yield os.path.join(dirpath, filename)
if time.time() - yield_time > 0.25:
Expand Down
25 changes: 24 additions & 1 deletion tests/TestDeepScan.py
Expand Up @@ -33,7 +33,7 @@
import common

sys.path.append('.')
from bleachbit.DeepScan import DeepScan
from bleachbit.DeepScan import DeepScan, normalized_walk
from bleachbit.Common import expanduser


Expand Down Expand Up @@ -133,6 +133,29 @@ def test_delete(self):
# clean up
shutil.rmtree(base)

def test_normalized_walk_darwin(self):
    """
    Check normalized_walk against a mocked os.walk.

    First, with platform.system patched to report 'Darwin', a decomposed
    UTF-8 filename must come back recomposed.  Second, with only os.walk
    patched (platform.system left as is), plain ASCII entries must come
    back equal to what os.walk produced.
    """
    import mock

    # 'a' followed by a combining grave accent (decomposed form)
    decomposed_tree = [
        ('/foo', ('bar',), ['ba\xcc\x80z']),
        ('/foo/bar', (), ['spam', 'eggs']),
    ]
    # the same tree with the precomposed character
    recomposed_tree = [
        ('/foo', ('bar',), ['b\xc3\xa0z']),
        ('/foo/bar', (), ['spam', 'eggs']),
    ]
    with mock.patch('os.walk', return_value=decomposed_tree):
        with mock.patch('platform.system', return_value='Darwin'):
            walked = list(normalized_walk('.'))
            self.assertEqual(walked, recomposed_tree)

    expected = [
        ('/foo', ('bar',), ['baz']),
        ('/foo/bar', (), ['spam', 'eggs']),
    ]
    with mock.patch('os.walk', return_value=expected):
        walked = list(normalized_walk('.'))
        self.assertEqual(walked, expected)


def suite():
    """
    Build the test suite holding every test method of DeepScanTestCase.

    Uses a TestLoader directly instead of the deprecated
    unittest.makeSuite helper; the default loader collects the same
    'test'-prefixed methods.
    """
    return unittest.TestLoader().loadTestsFromTestCase(DeepScanTestCase)
Expand Down

0 comments on commit eafd2b7

Please sign in to comment.