Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Fix bug when filtering s3 locations #531

Merged
merged 2 commits into from

2 participants

@jamesls
Owner

The pattern is evaluated against the entire bucket. This doesn't
matter for suffix searches, but for a prefix search, you'd have to
include the bucket name. This is inconsistent with filtering local
files. Now they're both consistent. Given:

  --exclude 'foo*'

This will filter any full path that starts with foo. So locally:

  rootdir/
    foo.txt      # yes
    foo1.txt     # yes
    foo/bar.txt  # yes
    bar.txt      # no

And on s3, we now have the same results:

  bucket/
    foo.txt      # yes
    foo1.txt     # yes
    foo/bar.txt  # yes
    bar.txt      # no

I also added debug logs to help customers troubleshoot why their
filters aren't working the way they expect. For example:

s3.filters - DEBUG - /private/tmp/syncme/level-1/file-7 matched exclude filter: /private/tmp/syncme/*
s3.filters - DEBUG - /private/tmp/syncme/level-1/file-7 did not match include filter: /private/tmp/syncme/f*
s3.filters - DEBUG - /private/tmp/syncme/level-1/file-7 matched include filter: /private/tmp/syncme/level*
s3.filters - DEBUG - =/private/tmp/syncme/level-1/file-7 final filtered status, should_include: True
jamesls added some commits
@jamesls jamesls Clean up existing filter unit tests
This makes the unit tests more concise and easier to
follow.  Removes a lot of the duplication.  Also makes it
easier to write new filter tests.
8c2941a
@jamesls jamesls Fix bug when filtering s3 locations
The pattern is evaluated against the entire bucket.  This doesn't
matter for suffix searches, but for a prefix search, you'd have to
include the bucket name.  This is inconsistent with filtering local
files.  Now they're both consistent.  Given:

  --exclude 'foo*'

This will filter any full path that starts with foo.  So locally:

  rootdir/
    foo.txt      # yes
    foo1.txt     # yes
    foo/bar.txt  # yes
    bar.txt      # no

And on s3, we now have the same results:

  bucket/
    foo.txt      # yes
    foo1.txt     # yes
    foo/bar.txt  # yes
    bar.txt      # no

I also added debug logs to help customers troubleshoot why their
filters aren't working the way they expect.
5063e56
@garnaat

LGTM

@jamesls jamesls merged commit 5063e56 into aws:develop
@jamesls jamesls deleted the jamesls:s3-fnmatch-bucket branch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Dec 4, 2013
  1. @jamesls

    Clean up existing filter unit tests

    jamesls authored
    This makes the unit tests more concise and easier to
    follow.  Removes a lot of the duplication.  Also makes it
    easier to write new filter tests.
Commits on Dec 5, 2013
  1. @jamesls

    Fix bug when filtering s3 locations

    jamesls authored
    The pattern is evaluated against the entire bucket.  This doesn't
    matter for suffix searches, but for a prefix search, you'd have to
    include the bucket name.  This is inconsistent with filtering local
    files.  Now they're both consistent.  Given:
    
      --exclude 'foo*'
    
    This will filter any full path that starts with foo.  So locally:
    
      rootdir/
        foo.txt      # yes
        foo1.txt     # yes
        foo/bar.txt  # yes
        bar.txt      # no
    
    And on s3, we now have the same results:
    
      bucket/
        foo.txt      # yes
        foo1.txt     # yes
        foo/bar.txt  # yes
        bar.txt      # no
    
    I also added debug logs to help customers troubleshoot why their
    filters aren't working the way they expect.
This page is out of date. Refresh to see the latest.
View
18 awscli/customizations/s3/filters.py
@@ -10,10 +10,14 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
+import logging
import fnmatch
import os
+LOG = logging.getLogger(__name__)
+
+
class Filter(object):
"""
This is a universal exclude/include filter.
@@ -55,13 +59,21 @@ def call(self, file_infos):
else:
path_pattern = pattern[1].replace(os.sep, '/')
- full_path_pattern = path_pattern
-
+ full_path_pattern = os.path.join(file_path.split('/')[0],
+ path_pattern)
is_match = fnmatch.fnmatch(file_path, full_path_pattern)
if is_match and pattern_type == '--include':
file_status = (file_info, True)
+ LOG.debug("%s matched include filter: %s",
+ file_path, full_path_pattern)
elif is_match and pattern_type == '--exclude':
file_status = (file_info, False)
-
+ LOG.debug("%s matched exclude filter: %s",
+ file_path, full_path_pattern)
+ else:
+ LOG.debug("%s did not match %s filter: %s",
+ file_path, pattern_type[2:], full_path_pattern)
+ LOG.debug("=%s final filtered status, should_include: %s",
+ file_path, file_status[1])
if file_status[1]:
yield file_info
View
211 tests/unit/customizations/s3/test_filters.py
@@ -19,136 +19,115 @@
class FiltersTest(unittest.TestCase):
def setUp(self):
- self.local_files = []
- self.loc_file1 = FileInfo(src=os.path.abspath('test.txt'), dest='',
- compare_key='', size=10,
- last_update=0, src_type='local',
- dest_type='s3', operation_name='',
- service=None, endpoint=None)
- self.loc_file2 = FileInfo(src=os.path.abspath('test.jpg'), dest='',
- compare_key='', size=10,
- last_update=0, src_type='local',
- dest_type='s3', operation_name='',
- service=None, endpoint=None)
- path = 'directory' + os.sep + 'test.jpg'
- self.loc_file3 = FileInfo(src=os.path.abspath(path), dest='',
- compare_key='', size=10,
- last_update=0, src_type='local',
- dest_type='s3', operation_name='',
- service=None, endpoint=None)
- self.local_files.append(self.loc_file1)
- self.local_files.append(self.loc_file2)
- self.local_files.append(self.loc_file3)
-
- self.s3_files = []
- self.s3_file1 = FileInfo('bucket/test.txt', dest='',
- compare_key='', size=10,
- last_update=0, src_type='s3',
- dest_type='s3', operation_name='',
- service=None, endpoint=None)
- self.s3_file2 = FileInfo('bucket/test.jpg', dest='',
- compare_key='', size=10,
- last_update=0, src_type='s3',
- dest_type='s3', operation_name='',
- service=None, endpoint=None)
- self.s3_file3 = FileInfo('bucket/key/test.jpg', dest='',
- compare_key='', size=10,
- last_update=0, src_type='s3',
- dest_type='s3', operation_name='',
- service=None, endpoint=None)
- self.s3_files.append(self.s3_file1)
- self.s3_files.append(self.s3_file2)
- self.s3_files.append(self.s3_file3)
+ self.local_files = [
+ self.file_info('test.txt'),
+ self.file_info('test.jpg'),
+ self.file_info(os.path.join('directory', 'test.jpg')),
+ ]
+ self.s3_files = [
+ self.file_info('bucket/test.txt'),
+ self.file_info('bucket/test.jpg'),
+ self.file_info('bucket/key/test.jpg'),
+ ]
+
+ def file_info(self, filename, src_type='local'):
+ if src_type == 'local':
+ filename = os.path.abspath(filename)
+ dest_type = 's3'
+ else:
+ dest_type = 'local'
+ return FileInfo(src=filename, dest='',
+ compare_key='', size=10,
+ last_update=0, src_type=src_type,
+ dest_type=dest_type, operation_name='',
+ service=None, endpoint=None)
def test_no_filter(self):
- """
- No filters
- """
- patterns = []
exc_inc_filter = Filter({})
- files = exc_inc_filter.call(iter(self.local_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, self.local_files)
-
- files = exc_inc_filter.call(iter(self.s3_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, self.s3_files)
+ matched_files = list(exc_inc_filter.call(self.local_files))
+ self.assertEqual(matched_files, self.local_files)
+
+ matched_files2 = list(exc_inc_filter.call(self.s3_files))
+ self.assertEqual(matched_files2, self.s3_files)
def test_include(self):
- """
- Only an include file
- """
patterns = [['--include', '*.txt']]
- exc_inc_filter = Filter({'filters': patterns})
- files = exc_inc_filter.call(iter(self.local_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, self.local_files)
-
- files = exc_inc_filter.call(iter(self.s3_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, self.s3_files)
+ include_filter = Filter({'filters': [['--include', '*.txt']]})
+ matched_files = list(include_filter.call(self.local_files))
+ self.assertEqual(matched_files, self.local_files)
+
+ matched_files2 = list(include_filter.call(self.s3_files))
+ self.assertEqual(matched_files2, self.s3_files)
def test_exclude(self):
- """
- Only an exclude filter
- """
- patterns = [['--exclude', '*']]
- exc_inc_filter = Filter({'filters': patterns})
- files = exc_inc_filter.call(iter(self.local_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, [])
-
- files = exc_inc_filter.call(iter(self.s3_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, [])
+ exclude_filter = Filter({'filters': [['--exclude', '*']]})
+ matched_files = list(exclude_filter.call(self.local_files))
+ self.assertEqual(matched_files, [])
+
+ matched_files = list(exclude_filter.call(self.s3_files))
+ self.assertEqual(matched_files, [])
def test_exclude_include(self):
- """
- Exclude everything and then include all .txt files
- """
patterns = [['--exclude', '*'], ['--include', '*.txt']]
- exc_inc_filter = Filter({'filters': patterns})
- files = exc_inc_filter.call(iter(self.local_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, [self.loc_file1])
-
- files = exc_inc_filter.call(iter(self.s3_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, [self.s3_file1])
+ exclude_include_filter = Filter({'filters': patterns})
+ matched_files = list(exclude_include_filter.call(self.local_files))
+ self.assertEqual(matched_files, [self.local_files[0]])
+
+ matched_files = list(exclude_include_filter.call(self.s3_files))
+ self.assertEqual(matched_files, [self.s3_files[0]])
def test_include_exclude(self):
- """
- Include all .txt files then exclude everything
- """
patterns = [['--include', '*.txt'], ['--exclude', '*']]
- exc_inc_filter = Filter({'filters': patterns})
- files = exc_inc_filter.call(iter(self.local_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, [])
-
- files = exc_inc_filter.call(iter(self.s3_files))
- result_list = []
- for filename in files:
- result_list.append(filename)
- self.assertEqual(result_list, [])
+ exclude_all_filter = Filter({'filters': patterns})
+ matched_files = list(exclude_all_filter.call(self.local_files))
+ self.assertEqual(matched_files, [])
+
+ matched_files = list(exclude_all_filter.call(self.s3_files))
+ self.assertEqual(matched_files, [])
+
+ def test_prefix_filtering_consistent(self):
+ # The same filter should work for both local and remote files.
+ # So if I have a directory with 2 files:
+ local_files = [
+ self.file_info('test1.txt'),
+ self.file_info('nottest1.txt'),
+ ]
+ # And the same 2 files remote (note that the way FileInfo objects
+ # are constructed, we'll have the bucket name but no leading '/'
+ # character):
+ remote_files = [
+ self.file_info('bucket/test1.txt', src_type='s3'),
+ self.file_info('bucket/nottest1.txt', src_type='s3'),
+ ]
+ # If I apply the filter to the local files.
+ exclude_filter = Filter({'filters': [['--exclude', 't*']]})
+ filtered_files = list(exclude_filter.call(local_files))
+ self.assertEqual(len(filtered_files), 1)
+ self.assertEqual(os.path.basename(filtered_files[0].src),
+ 'nottest1.txt')
+
+ # I should get the same result if I apply the same filter to s3
+ # objects.
+ same_filtered_files = list(exclude_filter.call(remote_files))
+ self.assertEqual(len(same_filtered_files), 1)
+ self.assertEqual(os.path.basename(same_filtered_files[0].src),
+ 'nottest1.txt')
+
+ def test_bucket_exclude_with_prefix(self):
+ s3_files = [
+ self.file_info('bucket/dir1/key1.txt', src_type='s3'),
+ self.file_info('bucket/dir1/key2.txt', src_type='s3'),
+ self.file_info('bucket/dir1/notkey3.txt', src_type='s3'),
+ ]
+ filtered_files = list(
+ Filter({'filters': [['--exclude', 'dir1/*']]}).call(s3_files))
+ self.assertEqual(filtered_files, [])
+
+ key_files = list(
+ Filter({'filters': [['--exclude', 'dir1/key*']]}).call(s3_files))
+ self.assertEqual(len(key_files), 1)
+ self.assertEqual(key_files[0].src, 'bucket/dir1/notkey3.txt')
+
if __name__ == "__main__":
unittest.main()
Something went wrong with that request. Please try again.