Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Incorporate marcacohen's fixes for computing checksum when downloading large files on Windows
  • Loading branch information...
commit f83da62cae1f3f3be34658655830f8a171cf594f 1 parent dfc9809
@mfschwartz mfschwartz authored
View
24 boto/s3/key.py
@@ -31,6 +31,7 @@
from boto.provider import Provider
from boto.s3.user import User
from boto import UserAgent
+from boto.utils import compute_md5
try:
from hashlib import md5
except ImportError:
@@ -595,19 +596,16 @@ def compute_md5(self, fp):
as the first element and the base64 encoded version of the
plain digest as the second element.
"""
- m = md5()
- fp.seek(0)
- s = fp.read(self.BufferSize)
- while s:
- m.update(s)
- s = fp.read(self.BufferSize)
- hex_md5 = m.hexdigest()
- base64md5 = base64.encodestring(m.digest())
- if base64md5[-1] == '\n':
- base64md5 = base64md5[0:-1]
- self.size = fp.tell()
- fp.seek(0)
- return (hex_md5, base64md5)
+ tup = compute_md5(fp)
+ # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
+ # The internal implementation of compute_md5() needs to return the
+ # file size but we don't want to return that value to the external
+ # caller because it changes the class interface (i.e. it might
+ # break some code) so we consume the third tuple value here and
+ # return the remainder of the tuple to the caller, thereby preserving
+ # the existing interface.
+ self.size = tup[2]
+ return tup[0:2]
def set_contents_from_stream(self, fp, headers=None, replace=True,
cb=None, num_cb=10, policy=None,
View
26 boto/s3/resumable_download_handler.py
@@ -212,27 +212,6 @@ def _attempt_resumable_download(self, key, fp, headers, cb, num_cb,
override_num_retries=0)
fp.flush()
- def _check_final_md5(self, key, file_name):
- """
- Checks that etag from server agrees with md5 computed after the
- download completes. This is important, since the download could
- have spanned a number of hours and multiple processes (e.g.,
- gsutil runs), and the user could change some of the file and not
- realize they have inconsistent data.
- """
- fp = open(file_name, 'r')
- if key.bucket.connection.debug >= 1:
- print 'Checking md5 against etag.'
- hex_md5 = key.compute_md5(fp)[0]
- if hex_md5 != key.etag.strip('"\''):
- file_name = fp.name
- fp.close()
- os.unlink(file_name)
- raise ResumableDownloadException(
- 'File changed during download: md5 signature doesn\'t match '
- 'etag (incorrect downloaded file deleted)',
- ResumableTransferDisposition.ABORT)
-
def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
version_id=None):
"""
@@ -287,7 +266,10 @@ def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
torrent, version_id)
      # Download succeeded, so remove the tracker file (if have one).
self._remove_tracker_file()
- self._check_final_md5(key, fp.name)
+ # Previously, check_final_md5() was called here to validate
+ # downloaded file's checksum, however, to be consistent with
+ # non-resumable downloads, this call was removed. Checksum
+ # validation of file contents should be done by the caller.
if debug >= 1:
print 'Resumable download complete.'
return
View
37 boto/utils.py
@@ -55,6 +55,11 @@
from email.Utils import formatdate
from email import Encoders
import gzip
+import base64
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
try:
@@ -689,3 +694,35 @@ def guess_mime_type(content, deftype):
rtype = mimetype
break
return(rtype)
+
+def compute_md5(fp, buf_size=8192):
+ """
+ Compute MD5 hash on passed file and return results in a tuple of values.
+
+ :type fp: file
+ :param fp: File pointer to the file to MD5 hash. The file pointer
+ will be reset to the beginning of the file before the
+ method returns.
+
+ :type buf_size: integer
+ :param buf_size: Number of bytes per read request.
+
+ :rtype: tuple
+ :return: A tuple containing the hex digest version of the MD5 hash
+ as the first element, the base64 encoded version of the
+ plain digest as the second element and the file size as
+ the third element.
+ """
+ m = md5()
+ fp.seek(0)
+ s = fp.read(buf_size)
+ while s:
+ m.update(s)
+ s = fp.read(buf_size)
+ hex_md5 = m.hexdigest()
+ base64md5 = base64.encodestring(m.digest())
+ if base64md5[-1] == '\n':
+ base64md5 = base64md5[0:-1]
+ file_size = fp.tell()
+ fp.seek(0)
+ return (hex_md5, base64md5, file_size)
View
43 tests/s3/mock_storage_service.py
@@ -28,6 +28,13 @@
import copy
import boto
+import base64
+from boto.utils import compute_md5
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
NOT_IMPL = None
@@ -53,10 +60,12 @@ def __init__(self, bucket=None, name=None):
self.bucket = bucket
self.name = name
self.data = None
+ self.etag = None
self.size = None
self.content_encoding = None
self.content_type = None
self.last_modified = 'Wed, 06 Oct 2010 05:11:54 GMT'
+ self.BufferSize = 8192
def get_contents_as_string(self, headers=NOT_IMPL,
cb=NOT_IMPL, num_cb=NOT_IMPL,
@@ -93,6 +102,7 @@ def set_contents_from_file(self, fp, headers=None, replace=NOT_IMPL,
policy=NOT_IMPL, md5=NOT_IMPL,
res_upload_handler=NOT_IMPL):
self.data = fp.read()
+ self.set_etag()
self.size = len(self.data)
self._handle_headers(headers)
@@ -100,6 +110,7 @@ def set_contents_from_string(self, s, headers=NOT_IMPL, replace=NOT_IMPL,
cb=NOT_IMPL, num_cb=NOT_IMPL, policy=NOT_IMPL,
md5=NOT_IMPL, reduced_redundancy=NOT_IMPL):
self.data = copy.copy(s)
+ self.set_etag()
self.size = len(s)
self._handle_headers(headers)
@@ -118,6 +129,38 @@ def copy(self, dst_bucket_name, dst_key, metadata=NOT_IMPL,
return dst_bucket.copy_key(dst_key, self.bucket.name,
self.name, metadata)
+ def set_etag(self):
+ """
+ Set etag attribute by generating hex MD5 checksum on current
+ contents of mock key.
+ """
+ m = md5()
+ m.update(self.data)
+ hex_md5 = m.hexdigest()
+ self.etag = hex_md5
+
+ def compute_md5(self, fp):
+ """
+ :type fp: file
+ :param fp: File pointer to the file to MD5 hash. The file pointer
+ will be reset to the beginning of the file before the
+ method returns.
+
+ :rtype: tuple
+ :return: A tuple containing the hex digest version of the MD5 hash
+ as the first element and the base64 encoded version of the
+ plain digest as the second element.
+ """
+ tup = compute_md5(fp)
+ # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
+ # The internal implementation of compute_md5() needs to return the
+ # file size but we don't want to return that value to the external
+ # caller because it changes the class interface (i.e. it might
+ # break some code) so we consume the third tuple value here and
+ # return the remainder of the tuple to the caller, thereby preserving
+ # the existing interface.
+ self.size = tup[2]
+ return tup[0:2]
class MockBucket(object):
View
18 tests/s3/test_gsconnection.py
@@ -30,6 +30,7 @@
import unittest
import time
import os
+import re
from boto.gs.connection import GSConnection
from boto import storage_uri
@@ -218,12 +219,25 @@ def test_2_copy_key(self):
def test_3_default_object_acls(self):
"""test default object acls"""
+ # regexp for matching project-private default object ACL
+ project_private_re = '\s*<AccessControlList>\s*<Entries>\s*<Entry>' \
+ '\s*<Scope type="GroupById"><ID>[0-9a-fA-F]+</ID></Scope>' \
+ '\s*<Permission>FULL_CONTROL</Permission>\s*</Entry>\s*<Entry>' \
+ '\s*<Scope type="GroupById"><ID>[0-9a-fA-F]+</ID></Scope>' \
+ '\s*<Permission>FULL_CONTROL</Permission>\s*</Entry>\s*<Entry>' \
+ '\s*<Scope type="GroupById"><ID>[0-9a-fA-F]+</ID></Scope>' \
+ '\s*<Permission>READ</Permission></Entry>\s*</Entries>' \
+ '\s*</AccessControlList>\s*'
c = GSConnection()
# create a new bucket
bucket_name = 'test-%d' % int(time.time())
bucket = c.create_bucket(bucket_name)
# now call get_bucket to see if it's really there
bucket = c.get_bucket(bucket_name)
+ # get default acl and make sure it's project-private
+ acl = bucket.get_def_acl()
+ assert re.search(project_private_re, acl.to_xml())
+ # set default acl to a canned acl and verify it gets set
bucket.set_def_acl('public-read')
acl = bucket.get_def_acl()
# save public-read acl for later test
@@ -252,6 +266,10 @@ def test_3_default_object_acls(self):
bucket_name = 'test-%d' % int(time.time())
uri = storage_uri('gs://' + bucket_name)
uri.create_bucket()
+ # get default acl and make sure it's project-private
+ acl = uri.get_def_acl()
+ assert re.search(project_private_re, acl.to_xml())
+ # set default acl to a canned acl and verify it gets set
uri.set_def_acl('public-read')
acl = uri.get_def_acl()
# save public-read acl for later test
View
82 tests/s3/test_resumable_downloads.py
@@ -379,88 +379,6 @@ def test_zero_length_object_download(self):
self.dst_fp, res_download_handler=res_download_handler)
self.assertEqual(0, get_cur_file_size(self.dst_fp))
- def test_download_with_object_size_change_between_starts(self):
- """
- Tests resumable download on an object that changes sizes between initial
- download start and restart
- """
- harnass = CallbackTestHarnass(
- fail_after_n_bytes=self.larger_src_key_size/2, num_times_to_fail=2)
- # Set up first process' ResumableDownloadHandler not to do any
- # retries (initial download request will establish expected size to
- # download server).
- res_download_handler = ResumableDownloadHandler(
- tracker_file_name=self.tracker_file_name, num_retries=0)
- try:
- self.larger_src_key.get_contents_to_file(
- self.dst_fp, cb=harnass.call,
- res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # First abort (from harnass-forced failure) should be
- # ABORT_CUR_PROCESS.
- self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT_CUR_PROCESS)
- # Ensure a tracker file survived.
- self.assertTrue(os.path.exists(self.tracker_file_name))
- # Try it again, this time with different src key (simulating an
- # object that changes sizes between downloads).
- try:
- self.small_src_key.get_contents_to_file(
- self.dst_fp, res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # This abort should be a hard abort (object size changing during
- # transfer).
- self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT)
- self.assertNotEqual(
- e.message.find('md5 signature doesn\'t match etag'), -1)
-
- def test_download_with_file_content_change_during_download(self):
- """
- Tests resumable download on an object where the file content changes
- without changing length while download in progress
- """
- harnass = CallbackTestHarnass(
- fail_after_n_bytes=self.larger_src_key_size/2, num_times_to_fail=2)
- # Set up first process' ResumableDownloadHandler not to do any
- # retries (initial download request will establish expected size to
- # download server).
- res_download_handler = ResumableDownloadHandler(
- tracker_file_name=self.tracker_file_name, num_retries=0)
- dst_filename = self.dst_fp.name
- try:
- self.larger_src_key.get_contents_to_file(
- self.dst_fp, cb=harnass.call,
- res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # First abort (from harnass-forced failure) should be
- # ABORT_CUR_PROCESS.
- self.assertEqual(e.disposition,
- ResumableTransferDisposition.ABORT_CUR_PROCESS)
- # Ensure a tracker file survived.
- self.assertTrue(os.path.exists(self.tracker_file_name))
- # Before trying again change the first byte of the file fragment
- # that was already downloaded.
- orig_size = get_cur_file_size(self.dst_fp)
- self.dst_fp.seek(0, os.SEEK_SET)
- self.dst_fp.write('a')
- # Ensure the file size didn't change.
- self.assertEqual(orig_size, get_cur_file_size(self.dst_fp))
- try:
- self.larger_src_key.get_contents_to_file(
- self.dst_fp, cb=harnass.call,
- res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # This abort should be a hard abort (file content changing during
- # transfer).
- self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT)
- self.assertNotEqual(
- e.message.find('md5 signature doesn\'t match etag'), -1)
- # Ensure the bad data wasn't left around.
- self.assertFalse(os.path.exists(dst_filename))
-
def test_download_with_invalid_tracker_etag(self):
"""
Tests resumable download with a tracker file containing an invalid etag
Please sign in to comment.
Something went wrong with that request. Please try again.