Fix S3 sync issue with keys containing urlencode values #755

Merged
merged 1 commit on Apr 18, 2014
41 changes: 33 additions & 8 deletions awscli/customizations/s3/utils.py
@@ -302,14 +302,39 @@ def list_objects(self, bucket, prefix=None):
kwargs = {'bucket': bucket, 'encoding_type': 'url'}
if prefix is not None:
kwargs['prefix'] = prefix
pages = self._operation.paginate(self._endpoint, **kwargs)
for response, page in pages:
contents = page['Contents']
for content in contents:
source_path = bucket + '/' + unquote_str(content['Key'])
size = content['Size']
last_update = self._date_parser(content['LastModified'])
yield source_path, size, last_update
# This event handler is needed because we use encoding_type url and
# we're paginating. The pagination token is the last Key of the
# Contents list. However, botocore does not know that the encoding
# type needs to be urldecoded.
with ScopedEventHandler(self._operation.session, 'after-call.s3.ListObjects',
self._decode_keys):
pages = self._operation.paginate(self._endpoint, **kwargs)
for response, page in pages:
contents = page['Contents']
for content in contents:
source_path = bucket + '/' + content['Key']
size = content['Size']
last_update = self._date_parser(content['LastModified'])
yield source_path, size, last_update

def _decode_keys(self, parsed, **kwargs):
for content in parsed['Contents']:
content['Key'] = unquote_str(content['Key'])


class ScopedEventHandler(object):
"""Register an event callback for the duration of a scope."""

def __init__(self, session, event_name, handler):
self._session = session
self._event_name = event_name
self._handler = handler

def __enter__(self):
self._session.register(self._event_name, self._handler)

def __exit__(self, exc_type, exc_value, traceback):
self._session.unregister(self._event_name, self._handler)


IORequest = namedtuple('IORequest', ['filename', 'offset', 'data'])
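
The comment in the hunk above captures the core of the fix: ListObjects is called with encoding_type='url', so keys (and the pagination marker taken from the last key in Contents) come back percent-encoded, and _decode_keys is registered for exactly the duration of the paginated call so the caller sees real key names. As an editorial aside (not part of this diff), a minimal usage sketch of ScopedEventHandler follows; the handler name and print statement are made up, while the class and the session.register/unregister calls it relies on are the ones shown above.

import botocore.session
from awscli.customizations.s3.utils import ScopedEventHandler

def _print_key_count(parsed, **kwargs):
    # Runs after every ListObjects call made while the scope is active.
    print('ListObjects returned %d keys' % len(parsed.get('Contents', [])))

session = botocore.session.get_session()
with ScopedEventHandler(session, 'after-call.s3.ListObjects', _print_key_count):
    pass  # any ListObjects request issued here would trigger _print_key_count
# On exit the handler is unregistered, so later calls are unaffected.
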
72 changes: 47 additions & 25 deletions tests/integration/customizations/s3/test_plugin.py
@@ -1,4 +1,3 @@

# -*- coding: utf-8 -*-
# Copyright 2013 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
@@ -25,6 +24,7 @@
import signal

import botocore.session
import six

from tests.integration import aws
from tests.unit.customizations.s3 import create_bucket as _create_bucket
@@ -68,6 +68,17 @@ def extra_teardown(self):
# Subclasses can use this to define extra teardown steps.
pass

def assert_key_contents_equal(self, bucket, key, expected_contents):
if isinstance(expected_contents, six.StringIO):
expected_contents = expected_contents.getvalue()
actual_contents = self.get_key_contents(bucket, key)
# The contents can be huge so we try to give helpful error messages
# without necessarily printing the actual contents.
self.assertEqual(len(actual_contents), len(expected_contents))
if actual_contents != expected_contents:
self.fail("Contents for %s/%s do not match (but they "
"have the same length)" % (bucket, key))

def create_bucket(self):
bucket_name = _create_bucket(self.session)
self.addCleanup(self.delete_bucket, bucket_name)
@@ -147,8 +158,7 @@ def test_mv_local_to_s3(self):
# When we move an object, the local file is gone:
self.assertTrue(not os.path.exists(full_path))
# And now resides in s3.
contents = self.get_key_contents(bucket_name, 'foo.txt')
self.assertEqual(contents, 'this is foo.txt')
self.assert_key_contents_equal(bucket_name, 'foo.txt', 'this is foo.txt')

def test_mv_s3_to_local(self):
bucket_name = self.create_bucket()
@@ -179,22 +189,21 @@ def test_mv_s3_to_s3(self):
def test_mv_s3_to_s3_multipart(self):
from_bucket = self.create_bucket()
to_bucket = self.create_bucket()
file_contents = 'abcd' * (1024 * 1024 * 10)
file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
self.put_object(from_bucket, 'foo.txt', file_contents)

p = aws('s3 mv s3://%s/foo.txt s3://%s/foo.txt' % (from_bucket,
to_bucket))
self.assert_no_errors(p)
contents = self.get_key_contents(to_bucket, 'foo.txt')
self.assertEqual(contents, file_contents)
self.assert_key_contents_equal(to_bucket, 'foo.txt', file_contents)
# And verify that the object no longer exists in the from_bucket.
self.assertTrue(not self.key_exists(from_bucket, key_name='foo.txt'))

def test_mv_s3_to_s3_multipart_recursive(self):
from_bucket = self.create_bucket()
to_bucket = self.create_bucket()

large_file_contents = 'abcd' * (1024 * 1024 * 10)
large_file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
small_file_contents = 'small file contents'
self.put_object(from_bucket, 'largefile', large_file_contents)
self.put_object(from_bucket, 'smallfile', small_file_contents)
@@ -211,29 +220,28 @@ def test_mv_s3_to_s3_multipart_recursive(self):
self.assertTrue(self.key_exists(to_bucket, key_name='smallfile'))

# And the contents are what we expect.
self.assertEqual(self.get_key_contents(to_bucket, 'smallfile'),
small_file_contents)
self.assertEqual(self.get_key_contents(to_bucket, 'largefile'),
large_file_contents)
self.assert_key_contents_equal(to_bucket, 'smallfile',
small_file_contents)
self.assert_key_contents_equal(to_bucket, 'largefile',
large_file_contents)

def test_mv_with_large_file(self):
bucket_name = self.create_bucket()
# 40MB will force a multipart upload.
file_contents = 'abcd' * (1024 * 1024 * 10)
foo_txt = self.files.create_file('foo.txt', file_contents)
file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
foo_txt = self.files.create_file('foo.txt', file_contents.getvalue())
p = aws('s3 mv %s s3://%s/foo.txt' % (foo_txt, bucket_name))
self.assert_no_errors(p)
# When we move an object, the local file is gone:
self.assertTrue(not os.path.exists(foo_txt))
# And now resides in s3.
contents = self.get_key_contents(bucket_name, 'foo.txt')
self.assertEqual(len(contents), len(file_contents))
self.assert_key_contents_equal(bucket_name, 'foo.txt', file_contents)

# Now verify we can download this file.
p = aws('s3 mv s3://%s/foo.txt %s' % (bucket_name, foo_txt))
self.assert_no_errors(p)
self.assertTrue(os.path.exists(foo_txt))
self.assertEqual(os.path.getsize(foo_txt), len(file_contents))
self.assertEqual(os.path.getsize(foo_txt), len(file_contents.getvalue()))

def test_mv_to_nonexistent_bucket(self):
full_path = self.files.create_file('foo.txt', 'this is foo.txt')
@@ -315,16 +323,12 @@ def test_cp_without_trailing_slash(self):
def test_cp_s3_s3_multipart(self):
from_bucket = self.create_bucket()
to_bucket = self.create_bucket()
file_contents = 'abcd' * (1024 * 1024 * 10)
file_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
self.put_object(from_bucket, 'foo.txt', file_contents)

p = aws('s3 cp s3://%s/foo.txt s3://%s/foo.txt' % (from_bucket, to_bucket))
self.assert_no_errors(p)
contents = self.get_key_contents(to_bucket, 'foo.txt')
# Don't use assertEqual() here, this will spit out a huge
# 20mb diff of 'abcd' chars. Just let the user know we failed.
if contents != file_contents:
self.fail("Downlaoded contents of 10mb file are not the same.")
self.assert_key_contents_equal(to_bucket, 'foo.txt', file_contents)
self.assertTrue(self.key_exists(from_bucket, key_name='foo.txt'))

def test_guess_mime_type(self):
@@ -342,18 +346,19 @@ def test_guess_mime_type(self):
def test_download_large_file(self):
# This will force a multipart download.
bucket_name = self.create_bucket()
foo_contents = 'abcd' * (1024 * 1024 * 10)
foo_contents = six.StringIO('abcd' * (1024 * 1024 * 10))
self.put_object(bucket_name, key_name='foo.txt', contents=foo_contents)
local_foo_txt = self.files.full_path('foo.txt')
p = aws('s3 cp s3://%s/foo.txt %s' % (bucket_name, local_foo_txt))
self.assert_no_errors(p)
self.assertEqual(os.path.getsize(local_foo_txt), len(foo_contents))
self.assertEqual(os.path.getsize(local_foo_txt),
len(foo_contents.getvalue()))

@unittest.skipIf(platform.system() not in ['Darwin', 'Linux'],
'SIGINT not supported on Windows.')
def test_download_ctrl_c_does_not_hang(self):
bucket_name = self.create_bucket()
foo_contents = 'abcd' * (1024 * 1024 * 20)
foo_contents = six.StringIO('abcd' * (1024 * 1024 * 20))
self.put_object(bucket_name, key_name='foo.txt', contents=foo_contents)
local_foo_txt = self.files.full_path('foo.txt')
process = aws('s3 cp s3://%s/foo.txt %s' % (bucket_name, local_foo_txt), wait_for_finish=False)
@@ -398,6 +403,23 @@ def test_download_non_existent_key(self):


class TestSync(BaseS3CLICommand):
def test_sync_with_plus_chars(self):
# 1. Create > 1000 files with '+' in the filename.
# 2. Sync up to s3.
# 3. Sync up to s3 again.
# 4. Verify nothing was uploaded to s3 in step 3.
bucket_name = self.create_bucket()
filenames = []
for i in range(2000):
# Create a file with a space char and a '+' char in the filename.
filenames.append(self.files.create_file('foo +%06d' % i, contents=''))
p = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
self.assert_no_errors(p)
time.sleep(1)
p2 = aws('s3 sync %s s3://%s/' % (self.files.rootdir, bucket_name))
self.assertNotIn('upload:', p2.stdout)
self.assertEqual('', p2.stdout)

def test_sync_to_from_s3(self):
bucket_name = self.create_bucket()
foo_txt = self.files.create_file('foo.txt', 'foo contents')
6 changes: 6 additions & 0 deletions tests/unit/customizations/s3/fake_session.py
@@ -70,6 +70,12 @@ def emit(self, *args, **kwargs):
def emit_first_non_none_response(self, *args, **kwargs):
pass

def register(self, name, handler):
pass

def unregister(self, name, handler):
pass


class FakeService(object):
"""
30 changes: 27 additions & 3 deletions tests/unit/customizations/s3/test_utils.py
@@ -6,11 +6,13 @@

import mock

from botocore.hooks import HierarchicalEmitter
from awscli.customizations.s3.utils import find_bucket_key, find_chunksize
from awscli.customizations.s3.utils import ReadFileChunk
from awscli.customizations.s3.utils import relative_path
from awscli.customizations.s3.utils import StablePriorityQueue
from awscli.customizations.s3.utils import BucketLister
from awscli.customizations.s3.utils import ScopedEventHandler
from awscli.customizations.s3.constants import MAX_SINGLE_UPLOAD_SIZE


@@ -196,13 +198,23 @@ def test_priority_attr_is_missing(self):
class TestBucketList(unittest.TestCase):
def setUp(self):
self.operation = mock.Mock()
self.emitter = HierarchicalEmitter()
self.operation.session.register = self.emitter.register
self.operation.session.unregister = self.emitter.unregister
self.endpoint = mock.sentinel.endpoint
self.date_parser = mock.Mock()
self.date_parser.return_value = mock.sentinel.now
self.responses = []

def fake_paginate(self, *args, **kwargs):
for response in self.responses:
self.emitter.emit('after-call.s3.ListObjects', parsed=response[1])
return self.responses

def test_list_objects(self):
now = mock.sentinel.now
self.operation.paginate.return_value = [
self.operation.paginate = self.fake_paginate
self.responses = [
(None, {'Contents': [
{'LastModified': '2014-02-27T04:20:38.000Z',
'Key': 'a', 'Size': 1},
@@ -224,7 +236,8 @@ def test_urlencoded_keys(self):
# them before yielding them. For example, note the %0D
# in bar.txt:
now = mock.sentinel.now
self.operation.paginate.return_value = [
self.operation.paginate = self.fake_paginate
self.responses = [
(None, {'Contents': [
{'LastModified': '2014-02-27T04:20:38.000Z',
'Key': 'bar%0D.txt', 'Size': 1}]}),
@@ -236,7 +249,8 @@ def test_urlencoded_with_unicode_keys(self):

def test_urlencoded_with_unicode_keys(self):
now = mock.sentinel.now
self.operation.paginate.return_value = [
self.operation.paginate = self.fake_paginate
self.responses = [
(None, {'Contents': [
{'LastModified': '2014-02-27T04:20:38.000Z',
'Key': '%E2%9C%93', 'Size': 1}]}),
@@ -246,5 +260,15 @@
# And note how it's been converted to '\r'.
self.assertEqual(objects, [(u'foo/\u2713', 1, now)])


class TestScopedEventHandler(unittest.TestCase):
def test_scoped_session_handler(self):
session = mock.Mock()
scoped = ScopedEventHandler(session, 'eventname', 'handler')
with scoped:
session.register.assert_called_with('eventname', 'handler')
session.unregister.assert_called_with('eventname', 'handler')


if __name__ == "__main__":
unittest.main()
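
As a closing aside on the unit-test wiring above: TestBucketList.setUp attaches a real HierarchicalEmitter to the mocked session so that fake_paginate can fire the same 'after-call.s3.ListObjects' event a genuine paginated call would, which is what lets _decode_keys run against the canned responses. A tiny illustration of that dispatch, assuming only botocore's HierarchicalEmitter register/emit API (the names record and seen are made up):

from botocore.hooks import HierarchicalEmitter

emitter = HierarchicalEmitter()
seen = []

def record(parsed, **kwargs):
    # Collects the parsed response, mirroring what _decode_keys receives.
    seen.append(parsed)

emitter.register('after-call.s3.ListObjects', record)
emitter.emit('after-call.s3.ListObjects', parsed={'Contents': []})
assert seen == [{'Contents': []}]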