Incorporate marcacohen's fixes for computing checksum when downloading large files on Windows
commit f83da62cae1f3f3be34658655830f8a171cf594f (1 parent: dfc9809)
Mike Schwartz (mfschwartz) authored
boto/s3/key.py (24 lines changed)
@@ -31,6 +31,7 @@
from boto.provider import Provider
from boto.s3.user import User
from boto import UserAgent
+from boto.utils import compute_md5
try:
from hashlib import md5
except ImportError:
@@ -595,19 +596,16 @@ def compute_md5(self, fp):
as the first element and the base64 encoded version of the
plain digest as the second element.
"""
- m = md5()
- fp.seek(0)
- s = fp.read(self.BufferSize)
- while s:
- m.update(s)
- s = fp.read(self.BufferSize)
- hex_md5 = m.hexdigest()
- base64md5 = base64.encodestring(m.digest())
- if base64md5[-1] == '\n':
- base64md5 = base64md5[0:-1]
- self.size = fp.tell()
- fp.seek(0)
- return (hex_md5, base64md5)
+ tup = compute_md5(fp)
+ # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
+ # The internal implementation of compute_md5() needs to return the
+ # file size, but we don't want to return that value to the external
+ # caller because it changes the class interface (i.e. it might
+ # break some code), so we consume the third tuple value here and
+ # return the remainder of the tuple to the caller, thereby preserving
+ # the existing interface.
+ self.size = tup[2]
+ return tup[0:2]
def set_contents_from_stream(self, fp, headers=None, replace=True,
cb=None, num_cb=10, policy=None,
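
Note on the change above: Key.compute_md5() now delegates to boto.utils.compute_md5() but keeps its original two-element return value, recording the file size on the key object as a side effect. A minimal sketch of the unchanged caller-facing interface (the bare Key() and the file path are illustrative only):

    import os
    from boto.s3.key import Key

    key = Key()                                  # no bucket needed just to hash
    fp = open('local_file', 'rb')                # placeholder path
    hex_md5, base64_md5 = key.compute_md5(fp)    # still a 2-tuple
    # The file size is stored on the key rather than returned:
    assert key.size == os.path.getsize('local_file')
    fp.close()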
boto/s3/resumable_download_handler.py (26 lines changed)
@@ -212,27 +212,6 @@ def _attempt_resumable_download(self, key, fp, headers, cb, num_cb,
override_num_retries=0)
fp.flush()
- def _check_final_md5(self, key, file_name):
- """
- Checks that etag from server agrees with md5 computed after the
- download completes. This is important, since the download could
- have spanned a number of hours and multiple processes (e.g.,
- gsutil runs), and the user could change some of the file and not
- realize they have inconsistent data.
- """
- fp = open(file_name, 'r')
- if key.bucket.connection.debug >= 1:
- print 'Checking md5 against etag.'
- hex_md5 = key.compute_md5(fp)[0]
- if hex_md5 != key.etag.strip('"\''):
- file_name = fp.name
- fp.close()
- os.unlink(file_name)
- raise ResumableDownloadException(
- 'File changed during download: md5 signature doesn\'t match '
- 'etag (incorrect downloaded file deleted)',
- ResumableTransferDisposition.ABORT)
-
def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
version_id=None):
"""
@@ -287,7 +266,10 @@ def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
torrent, version_id)
# Download succeeded, so remove the tracker file (if we have one).
self._remove_tracker_file()
- self._check_final_md5(key, fp.name)
+ # Previously, _check_final_md5() was called here to validate the
+ # downloaded file's checksum; however, to be consistent with
+ # non-resumable downloads, that call was removed. Checksum
+ # validation of the file contents should be done by the caller.
if debug >= 1:
print 'Resumable download complete.'
return
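
Note on the change above: since get_file() no longer checks the downloaded bytes against the object's etag, callers that relied on _check_final_md5() need to perform an equivalent check themselves. A rough caller-side sketch mirroring the removed logic; it assumes key is the downloaded boto Key, file_name is the local path, and the etag is a plain MD5 digest:

    import os

    fp = open(file_name, 'rb')
    try:
        hex_md5 = key.compute_md5(fp)[0]
    finally:
        fp.close()
    if hex_md5 != key.etag.strip('"\''):
        # Contents changed (or were corrupted) during the download; handle
        # this however the application requires, e.g. delete and retry.
        os.unlink(file_name)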
boto/utils.py (37 lines changed)
@@ -55,6 +55,11 @@
from email.Utils import formatdate
from email import Encoders
import gzip
+import base64
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
try:
@@ -689,3 +694,35 @@ def guess_mime_type(content, deftype):
rtype = mimetype
break
return(rtype)
+
+def compute_md5(fp, buf_size=8192):
+ """
+ Compute MD5 hash on passed file and return results in a tuple of values.
+
+ :type fp: file
+ :param fp: File pointer to the file to MD5 hash. The file pointer
+ will be reset to the beginning of the file before the
+ method returns.
+
+ :type buf_size: integer
+ :param buf_size: Number of bytes per read request.
+
+ :rtype: tuple
+ :return: A tuple containing the hex digest version of the MD5 hash
+ as the first element, the base64 encoded version of the
+ plain digest as the second element and the file size as
+ the third element.
+ """
+ m = md5()
+ fp.seek(0)
+ s = fp.read(buf_size)
+ while s:
+ m.update(s)
+ s = fp.read(buf_size)
+ hex_md5 = m.hexdigest()
+ base64md5 = base64.encodestring(m.digest())
+ if base64md5[-1] == '\n':
+ base64md5 = base64md5[0:-1]
+ file_size = fp.tell()
+ fp.seek(0)
+ return (hex_md5, base64md5, file_size)
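
A short usage sketch of the new helper (the temporary file is purely for illustration):

    import tempfile
    from boto.utils import compute_md5

    fp = tempfile.TemporaryFile()
    fp.write('some data')
    hex_md5, base64_md5, size = compute_md5(fp)
    assert size == len('some data')
    assert fp.tell() == 0     # compute_md5() rewinds the file pointer
    fp.close()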
tests/s3/mock_storage_service.py (43 lines changed)
@@ -28,6 +28,13 @@
import copy
import boto
+import base64
+from boto.utils import compute_md5
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
NOT_IMPL = None
@@ -53,10 +60,12 @@ def __init__(self, bucket=None, name=None):
self.bucket = bucket
self.name = name
self.data = None
+ self.etag = None
self.size = None
self.content_encoding = None
self.content_type = None
self.last_modified = 'Wed, 06 Oct 2010 05:11:54 GMT'
+ self.BufferSize = 8192
def get_contents_as_string(self, headers=NOT_IMPL,
cb=NOT_IMPL, num_cb=NOT_IMPL,
@@ -93,6 +102,7 @@ def set_contents_from_file(self, fp, headers=None, replace=NOT_IMPL,
policy=NOT_IMPL, md5=NOT_IMPL,
res_upload_handler=NOT_IMPL):
self.data = fp.read()
+ self.set_etag()
self.size = len(self.data)
self._handle_headers(headers)
@@ -100,6 +110,7 @@ def set_contents_from_string(self, s, headers=NOT_IMPL, replace=NOT_IMPL,
cb=NOT_IMPL, num_cb=NOT_IMPL, policy=NOT_IMPL,
md5=NOT_IMPL, reduced_redundancy=NOT_IMPL):
self.data = copy.copy(s)
+ self.set_etag()
self.size = len(s)
self._handle_headers(headers)
@@ -118,6 +129,38 @@ def copy(self, dst_bucket_name, dst_key, metadata=NOT_IMPL,
return dst_bucket.copy_key(dst_key, self.bucket.name,
self.name, metadata)
+ def set_etag(self):
+ """
+ Set etag attribute by generating hex MD5 checksum on current
+ contents of mock key.
+ """
+ m = md5()
+ m.update(self.data)
+ hex_md5 = m.hexdigest()
+ self.etag = hex_md5
+
+ def compute_md5(self, fp):
+ """
+ :type fp: file
+ :param fp: File pointer to the file to MD5 hash. The file pointer
+ will be reset to the beginning of the file before the
+ method returns.
+
+ :rtype: tuple
+ :return: A tuple containing the hex digest version of the MD5 hash
+ as the first element and the base64 encoded version of the
+ plain digest as the second element.
+ """
+ tup = compute_md5(fp)
+ # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
+ # The internal implementation of compute_md5() needs to return the
+ # file size, but we don't want to return that value to the external
+ # caller because it changes the class interface (i.e. it might
+ # break some code), so we consume the third tuple value here and
+ # return the remainder of the tuple to the caller, thereby preserving
+ # the existing interface.
+ self.size = tup[2]
+ return tup[0:2]
class MockBucket(object):
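
Note on the mock changes above: set_etag() makes the mock key report an etag equal to the hex MD5 of the stored data, so tests that compare a computed checksum against the etag see the same behavior as with the real service. A minimal sketch (assuming _handle_headers() tolerates its default headers argument, as the set_contents_* methods above imply):

    key = MockKey(bucket=None, name='obj')
    key.set_contents_from_string('hello')
    assert key.etag == md5('hello').hexdigest()   # md5 as imported above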
tests/s3/test_gsconnection.py (18 lines changed)
@@ -30,6 +30,7 @@
import unittest
import time
import os
+import re
from boto.gs.connection import GSConnection
from boto import storage_uri
@@ -218,12 +219,25 @@ def test_2_copy_key(self):
def test_3_default_object_acls(self):
"""test default object acls"""
+ # regexp for matching project-private default object ACL
+ project_private_re = '\s*<AccessControlList>\s*<Entries>\s*<Entry>' \
+ '\s*<Scope type="GroupById"><ID>[0-9a-fA-F]+</ID></Scope>' \
+ '\s*<Permission>FULL_CONTROL</Permission>\s*</Entry>\s*<Entry>' \
+ '\s*<Scope type="GroupById"><ID>[0-9a-fA-F]+</ID></Scope>' \
+ '\s*<Permission>FULL_CONTROL</Permission>\s*</Entry>\s*<Entry>' \
+ '\s*<Scope type="GroupById"><ID>[0-9a-fA-F]+</ID></Scope>' \
+ '\s*<Permission>READ</Permission></Entry>\s*</Entries>' \
+ '\s*</AccessControlList>\s*'
c = GSConnection()
# create a new bucket
bucket_name = 'test-%d' % int(time.time())
bucket = c.create_bucket(bucket_name)
# now call get_bucket to see if it's really there
bucket = c.get_bucket(bucket_name)
+ # get default acl and make sure it's project-private
+ acl = bucket.get_def_acl()
+ assert re.search(project_private_re, acl.to_xml())
+ # set default acl to a canned acl and verify it gets set
bucket.set_def_acl('public-read')
acl = bucket.get_def_acl()
# save public-read acl for later test
@@ -252,6 +266,10 @@ def test_3_default_object_acls(self):
bucket_name = 'test-%d' % int(time.time())
uri = storage_uri('gs://' + bucket_name)
uri.create_bucket()
+ # get default acl and make sure it's project-private
+ acl = uri.get_def_acl()
+ assert re.search(project_private_re, acl.to_xml())
+ # set default acl to a canned acl and verify it gets set
uri.set_def_acl('public-read')
acl = uri.get_def_acl()
# save public-read acl for later test
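
For reference, project_private_re above is written to match an ACL document of roughly the following shape (a hand-made illustration; the group IDs are placeholder hex values, and real responses contain additional whitespace):

    sample_acl = (
        '<AccessControlList><Entries>'
        '<Entry><Scope type="GroupById"><ID>0123abcd</ID></Scope>'
        '<Permission>FULL_CONTROL</Permission></Entry>'
        '<Entry><Scope type="GroupById"><ID>0123abcd</ID></Scope>'
        '<Permission>FULL_CONTROL</Permission></Entry>'
        '<Entry><Scope type="GroupById"><ID>0123abcd</ID></Scope>'
        '<Permission>READ</Permission></Entry>'
        '</Entries></AccessControlList>')
    assert re.search(project_private_re, sample_acl)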
tests/s3/test_resumable_downloads.py (82 lines changed)
@@ -379,88 +379,6 @@ def test_zero_length_object_download(self):
self.dst_fp, res_download_handler=res_download_handler)
self.assertEqual(0, get_cur_file_size(self.dst_fp))
- def test_download_with_object_size_change_between_starts(self):
- """
- Tests resumable download on an object that changes sizes between initial
- download start and restart
- """
- harnass = CallbackTestHarnass(
- fail_after_n_bytes=self.larger_src_key_size/2, num_times_to_fail=2)
- # Set up first process' ResumableDownloadHandler not to do any
- # retries (initial download request will establish expected size to
- # download server).
- res_download_handler = ResumableDownloadHandler(
- tracker_file_name=self.tracker_file_name, num_retries=0)
- try:
- self.larger_src_key.get_contents_to_file(
- self.dst_fp, cb=harnass.call,
- res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # First abort (from harnass-forced failure) should be
- # ABORT_CUR_PROCESS.
- self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT_CUR_PROCESS)
- # Ensure a tracker file survived.
- self.assertTrue(os.path.exists(self.tracker_file_name))
- # Try it again, this time with different src key (simulating an
- # object that changes sizes between downloads).
- try:
- self.small_src_key.get_contents_to_file(
- self.dst_fp, res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # This abort should be a hard abort (object size changing during
- # transfer).
- self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT)
- self.assertNotEqual(
- e.message.find('md5 signature doesn\'t match etag'), -1)
-
- def test_download_with_file_content_change_during_download(self):
- """
- Tests resumable download on an object where the file content changes
- without changing length while download in progress
- """
- harnass = CallbackTestHarnass(
- fail_after_n_bytes=self.larger_src_key_size/2, num_times_to_fail=2)
- # Set up first process' ResumableDownloadHandler not to do any
- # retries (initial download request will establish expected size to
- # download server).
- res_download_handler = ResumableDownloadHandler(
- tracker_file_name=self.tracker_file_name, num_retries=0)
- dst_filename = self.dst_fp.name
- try:
- self.larger_src_key.get_contents_to_file(
- self.dst_fp, cb=harnass.call,
- res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # First abort (from harnass-forced failure) should be
- # ABORT_CUR_PROCESS.
- self.assertEqual(e.disposition,
- ResumableTransferDisposition.ABORT_CUR_PROCESS)
- # Ensure a tracker file survived.
- self.assertTrue(os.path.exists(self.tracker_file_name))
- # Before trying again change the first byte of the file fragment
- # that was already downloaded.
- orig_size = get_cur_file_size(self.dst_fp)
- self.dst_fp.seek(0, os.SEEK_SET)
- self.dst_fp.write('a')
- # Ensure the file size didn't change.
- self.assertEqual(orig_size, get_cur_file_size(self.dst_fp))
- try:
- self.larger_src_key.get_contents_to_file(
- self.dst_fp, cb=harnass.call,
- res_download_handler=res_download_handler)
- self.fail('Did not get expected ResumableDownloadException')
- except ResumableDownloadException, e:
- # This abort should be a hard abort (file content changing during
- # transfer).
- self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT)
- self.assertNotEqual(
- e.message.find('md5 signature doesn\'t match etag'), -1)
- # Ensure the bad data wasn't left around.
- self.assertFalse(os.path.exists(dst_filename))
-
def test_download_with_invalid_tracker_etag(self):
"""
Tests resumable download with a tracker file containing an invalid etag