Merge pull request #1409 from yovadia12/compose

Support for GCS object composition and CRC32c hashes.
commit dcff946e220f22783d12d169dbbcbbdd7db80842 (2 parents: e1edf37 + 416c629)
@mfschwartz authored
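To make the new surface area concrete before diving into the diff, here is a rough usage sketch (not part of the commit). It assumes GCS credentials are configured in ~/.boto and uses placeholder bucket/object names.

import binascii
import boto

conn = boto.connect_gs()
bucket = conn.get_bucket('my-bucket')  # placeholder; bucket must already exist

# Upload two source objects.
part1 = bucket.new_key('part-1')
part1.set_contents_from_string('hello ')
part2 = bucket.new_key('part-2')
part2.set_contents_from_string('world!')

# Compose them into a single object server-side.
composite = bucket.new_key('composite')
composite.compose([part1, part2], content_type='text/plain')

# Re-fetch so the response headers (x-goog-hash, x-goog-component-count)
# populate cloud_hashes and component_count.
composite = bucket.get_key('composite')
print binascii.hexlify(composite.cloud_hashes['crc32c'])
print composite.component_count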
242 boto/gs/key.py
@@ -27,6 +27,7 @@
from boto.exception import BotoClientError
from boto.s3.key import Key as S3Key
from boto.s3.keyfile import KeyFile
+from boto.utils import compute_hash
class Key(S3Key):
"""
@@ -47,7 +48,7 @@ class Key(S3Key):
:ivar last_modified: The string timestamp representing the last
time this object was modified in GS.
:ivar owner: The ID of the owner of this object.
- :ivar storage_class: The storage class of the object. Currently, one of:
+ :ivar storage_class: The storage class of the object. Currently, one of:
STANDARD | DURABLE_REDUCED_AVAILABILITY.
:ivar md5: The MD5 hash of the contents of the object.
:ivar size: The size, in bytes, of the object.
@@ -55,9 +56,16 @@ class Key(S3Key):
:ivar metageneration: The generation number of the object metadata.
:ivar encrypted: Whether the object is encrypted while at rest on
the server.
+ :ivar cloud_hashes: Dictionary of checksums as supplied by the storage
+ provider.
"""
- generation = None
- metageneration = None
+
+ def __init__(self, bucket=None, name=None, generation=None):
+ super(Key, self).__init__(bucket=bucket, name=name)
+ self.generation = generation
+ self.metageneration = None
+ self.cloud_hashes = {}
+ self.component_count = None
def __repr__(self):
if self.generation and self.metageneration:
@@ -100,17 +108,164 @@ def handle_version_headers(self, resp, force=False):
self.metageneration = resp.getheader('x-goog-metageneration', None)
self.generation = resp.getheader('x-goog-generation', None)
+ def handle_addl_headers(self, headers):
+ for key, value in headers:
+ if key == 'x-goog-hash':
+ for hash_pair in value.split(','):
+ alg, b64_digest = hash_pair.strip().split('=', 1)
+ self.cloud_hashes[alg] = binascii.a2b_base64(b64_digest)
+ elif key == 'x-goog-component-count':
+ self.component_count = int(value)
+
+
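The x-goog-hash header parsed above carries one or more comma-separated algorithm=base64digest pairs. A self-contained sketch of that parsing with a made-up header value:

import binascii

value = 'crc32c=ywnrZw==, md5=XrY7u+Ae7tCTyyK7j1rNww=='  # illustrative only
cloud_hashes = {}
for hash_pair in value.split(','):
    alg, b64_digest = hash_pair.strip().split('=', 1)
    cloud_hashes[alg] = binascii.a2b_base64(b64_digest)
print binascii.hexlify(cloud_hashes['crc32c'])  # prints 'cb09eb67'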
def get_file(self, fp, headers=None, cb=None, num_cb=10,
torrent=False, version_id=None, override_num_retries=None,
- response_headers=None):
+ response_headers=None, hash_algs=None):
query_args = None
if self.generation:
query_args = ['generation=%s' % self.generation]
self._get_file_internal(fp, headers=headers, cb=cb, num_cb=num_cb,
override_num_retries=override_num_retries,
response_headers=response_headers,
+ hash_algs=hash_algs,
query_args=query_args)
+ def get_contents_to_file(self, fp, headers=None,
+ cb=None, num_cb=10,
+ torrent=False,
+ version_id=None,
+ res_download_handler=None,
+ response_headers=None,
+ hash_algs=None):
+ """
+ Retrieve an object from GCS using the name of the Key object as the
+ key in GCS. Write the contents of the object to the file pointed
+ to by 'fp'.
+
+ :type fp: file-like object
+ :param fp: the file object to which the object's contents will be written.
+
+ :type headers: dict
+ :param headers: additional HTTP headers that will be sent with
+ the GET request.
+
+ :type cb: function
+ :param cb: a callback function that will be called to report
+ progress on the download. The callback should accept two
+ integer parameters, the first representing the number of
+ bytes that have been successfully retrieved from GCS and
+ the second representing the total size of the object to
+ be transmitted.
+
+ :type num_cb: int
+ :param num_cb: (optional) If a callback is specified with the
+ cb parameter this parameter determines the granularity of
+ the callback by defining the maximum number of times the
+ callback will be called during the file transfer.
+
+ :type torrent: bool
+ :param torrent: If True, returns the contents of a torrent
+ file as a string.
+
+ :type res_download_handler: ResumableDownloadHandler
+ :param res_download_handler: If provided, this handler will
+ perform the download.
+
+ :type response_headers: dict
+ :param response_headers: A dictionary containing HTTP
+ headers/values that will override any headers associated
+ with the stored object in the response. See
+ http://goo.gl/sMkcC for details.
+ """
+ if self.bucket != None:
+ if res_download_handler:
+ res_download_handler.get_file(self, fp, headers, cb, num_cb,
+ torrent=torrent,
+ version_id=version_id,
+ hash_algs=hash_algs)
+ else:
+ self.get_file(fp, headers, cb, num_cb, torrent=torrent,
+ version_id=version_id,
+ response_headers=response_headers,
+ hash_algs=hash_algs)
+
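The hash_algs plumbing added here is what enables end-to-end integrity checking without MD5 (composite objects have no MD5). A hedged sketch, not from the commit, assuming credentials in ~/.boto, placeholder names, and the third-party crcmod package for a hashlib-style CRC32C implementation:

import binascii
import boto
import crcmod.predefined

conn = boto.connect_gs()
key = conn.get_bucket('my-bucket').get_key('my-object')  # placeholders

# Zero-argument constructor returning an object with update()/digest().
hash_algs = {'crc32c': lambda: crcmod.predefined.Crc('crc-32c')}

with open('/tmp/my-object.download', 'wb') as fp:
    key.get_contents_to_file(fp, hash_algs=hash_algs)

# local_hashes is filled while the body streams; cloud_hashes comes from
# the x-goog-hash response header parsed by handle_addl_headers().
local = key.local_hashes.get('crc32c')
remote = key.cloud_hashes.get('crc32c')
if local and remote and local != remote:
    raise ValueError('CRC32C mismatch: %s != %s' %
                     (binascii.hexlify(local), binascii.hexlify(remote)))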
+ def compute_hash(self, fp, algorithm, size=None):
+ """
+ :type fp: file
+ :param fp: File pointer to the file to hash. The file
+ pointer will be reset to the same position before the
+ method returns.
+
+ :type algorithm: zero-argument constructor for hash objects that
+ implements update() and digest() (e.g. hashlib.md5)
+
+ :type size: int
+ :param size: (optional) The maximum number of bytes to read
+ from the file pointer (fp). This is useful when uploading
+ a file in multiple parts where the file is being split
+ in place into different parts. Fewer bytes may be available.
+ """
+ hex_digest, b64_digest, data_size = compute_hash(
+ fp, size=size, hash_algorithm=algorithm)
+ # The internal implementation of compute_hash() needs to return the
+ # data size, but we don't want to return that value to the external
+ # caller because it changes the class interface (i.e. it might
+ # break some code), so we consume the third tuple value here and
+ # return the remainder of the tuple to the caller, thereby preserving
+ # the existing interface.
+ self.size = data_size
+ return (hex_digest, b64_digest)
+
+ def send_file(self, fp, headers=None, cb=None, num_cb=10,
+ query_args=None, chunked_transfer=False, size=None,
+ hash_algs=None):
+ """
+ Upload a file to GCS.
+
+ :type fp: file
+ :param fp: The file pointer to upload. The file pointer must
+ point at the offset from which you wish to upload,
+ i.e. if uploading the full file, it should point at the
+ start of the file. Normally when a file is opened for
+ reading, the fp will point at the first byte. See the
+ bytes parameter below for more info.
+
+ :type headers: dict
+ :param headers: The headers to pass along with the PUT request
+
+ :type num_cb: int
+ :param num_cb: (optional) If a callback is specified with the
+ cb parameter this parameter determines the granularity of
+ the callback by defining the maximum number of times the
+ callback will be called during the file
+ transfer. Providing a negative integer will cause your
+ callback to be called with each buffer read.
+
+ :type query_args: string
+ :param query_args: Arguments to pass in the query string.
+
+ :type chunked_transfer: boolean
+ :param chunked_transfer: (optional) If true, we use chunked
+ Transfer-Encoding.
+
+ :type size: int
+ :param size: (optional) The maximum number of bytes to read
+ from the file pointer (fp). This is useful when uploading
+ a file in multiple parts where you are splitting the file
+ up into different ranges to be uploaded. If not specified,
+ the default behaviour is to read all bytes from the file
+ pointer. Fewer bytes may be available.
+
+ :type hash_algs: dictionary
+ :param hash_algs: (optional) Dictionary mapping hash algorithm
+ names to the corresponding hashing classes, each of which must
+ implement update() and digest(). Defaults to {'md5': hashlib.md5}.
+ """
+ self._send_file_internal(fp, headers=headers, cb=cb, num_cb=num_cb,
+ query_args=query_args,
+ chunked_transfer=chunked_transfer, size=size,
+ hash_algs=hash_algs)
+
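The upload path follows the same pattern as the download path: build one digester per requested algorithm and feed every chunk to all of them. A self-contained sketch of that pattern against a local file (crcmod is a third-party assumption, not required by the commit):

import binascii
import hashlib
import crcmod.predefined  # third-party, hashlib-style CRC32C

hash_algs = {
    'md5': hashlib.md5,
    'crc32c': lambda: crcmod.predefined.Crc('crc-32c'),
}
digesters = dict((alg, hash_algs[alg]()) for alg in hash_algs or {})

with open('/etc/hosts', 'rb') as fp:  # any local file will do
    for chunk in iter(lambda: fp.read(8192), ''):
        for alg in digesters:
            digesters[alg].update(chunk)

for alg in digesters:
    print alg, binascii.hexlify(digesters[alg].digest())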
def delete(self):
return self.bucket.delete_key(self.name, version_id=self.version_id,
generation=self.generation)
@@ -289,7 +444,8 @@ def set_contents_from_file(self, fp, headers=None, replace=True,
provider = self.bucket.connection.provider
if res_upload_handler and size:
# could use size instead of file_length if provided but...
- raise BotoClientError('"size" param not supported for resumable uploads.')
+ raise BotoClientError(
+ '"size" param not supported for resumable uploads.')
headers = headers or {}
if policy:
headers[provider.acl_header] = policy
@@ -431,22 +587,21 @@ def set_contents_from_filename(self, filename, headers=None, replace=True,
this value. If set to the value 0, the object will only be written
if it doesn't already exist.
"""
- # Clear out any previously computed md5 hashes, since we are setting the content.
- self.md5 = None
- self.base64md5 = None
+ # Clear out any previously computed hashes, since we are setting the
+ # content.
+ self.local_hashes = {}
- fp = open(filename, 'rb')
- self.set_contents_from_file(fp, headers, replace, cb, num_cb,
- policy, md5, res_upload_handler,
- if_generation=if_generation)
- fp.close()
+ with open(filename, 'rb') as fp:
+ self.set_contents_from_file(fp, headers, replace, cb, num_cb,
+ policy, md5, res_upload_handler,
+ if_generation=if_generation)
def set_contents_from_string(self, s, headers=None, replace=True,
cb=None, num_cb=10, policy=None, md5=None,
if_generation=None):
"""
- Store an object in S3 using the name of the Key object as the
- key in S3 and the string 's' as the contents.
+ Store an object in GCS using the name of the Key object as the
+ key in GCS and the string 's' as the contents.
See set_contents_from_file method for details about the
parameters.
@@ -460,10 +615,10 @@ def set_contents_from_string(self, s, headers=None, replace=True,
:type cb: function
:param cb: a callback function that will be called to report
- progress on the upload. The callback should accept
+ progress on the upload. The callback should accept
two integer parameters, the first representing the
number of bytes that have been successfully
- transmitted to S3 and the second representing the
+ transmitted to GCS and the second representing the
size of the to be transmitted object.
:type cb: int
@@ -473,19 +628,19 @@ def set_contents_from_string(self, s, headers=None, replace=True,
the maximum number of times the callback will
be called during the file transfer.
- :type policy: :class:`boto.s3.acl.CannedACLStrings`
+ :type policy: :class:`boto.gs.acl.CannedACLStrings`
:param policy: A canned ACL policy that will be applied to the
- new key in S3.
+ new key in GCS.
:type md5: A tuple containing the hexdigest version of the MD5
checksum of the file as the first element and the
Base64-encoded version of the plain checksum as the
- second element. This is the same format returned by
+ second element. This is the same format returned by
the compute_md5 method.
:param md5: If you need to compute the MD5 for any reason prior
to upload, it's silly to have to do it twice so this
param, if present, will be used as the MD5 values
- of the file. Otherwise, the checksum will be computed.
+ of the file. Otherwise, the checksum will be computed.
:type if_generation: int
:param if_generation: (optional) If set to a generation number, the
@@ -550,12 +705,6 @@ def set_contents_from_stream(self, *args, **kwargs):
:param policy: A canned ACL policy that will be applied to the new key
in GS.
- :type reduced_redundancy: bool
- :param reduced_redundancy: If True, this will set the storage
- class of the new Key to be REDUCED_REDUNDANCY. The Reduced
- Redundancy Storage (RRS) feature of S3, provides lower
- redundancy at lower storage cost.
-
:type size: int
:param size: (optional) The Maximum number of bytes to read from
the file pointer (fp). This is useful when uploading a
@@ -702,3 +851,42 @@ def set_canned_acl(self, acl_str, headers=None, generation=None,
if_generation=if_generation,
if_metageneration=if_metageneration
)
+
+ def compose(self, components, content_type=None, headers=None):
+ """Create a new object from a sequence of existing objects.
+
+ The content of the object representing this Key will be the
+ concatenation of the given object sequence. For more detail, visit
+
+ https://developers.google.com/storage/docs/composite-objects
+
+ :type components: list of gs.Key objects
+ :param components: List of gs.Key objects representing the component
+ objects to be concatenated.
+
+ :type content_type: string
+ :param content_type: (optional) Content type for the new composite
+ object.
+ """
+ compose_req = []
+ for key in components:
+ if key.bucket.name != self.bucket.name:
+ raise BotoClientError(
+ 'GCS does not support inter-bucket composing')
+
+ generation_tag = ''
+ if key.generation:
+ generation_tag = ('<Generation>%s</Generation>'
+ % str(key.generation))
+ compose_req.append('<Component><Name>%s</Name>%s</Component>' %
+ (key.name, generation_tag))
+ compose_req_xml = ('<ComposeRequest>%s</ComposeRequest>' %
+ ''.join(compose_req))
+ headers = headers or {}
+ if content_type:
+ headers['Content-Type'] = content_type
+ resp = self.bucket.connection.make_request('PUT', self.bucket.name,
+ self.name, headers=headers,
+ query_args='compose',
+ data=compose_req_xml)
+ if resp.status < 200 or resp.status > 299:
+ raise self.bucket.connection.provider.storage_response_error(
+ resp.status, resp.reason, resp.read())
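To make the wire format concrete, a standalone sketch of the request body the loop above assembles, using made-up component names and a made-up generation number:

components = [('part-1', 1360887659323000), ('part-2', None)]
compose_req = []
for name, generation in components:
    generation_tag = ''
    if generation:
        generation_tag = '<Generation>%s</Generation>' % str(generation)
    compose_req.append('<Component><Name>%s</Name>%s</Component>' %
                       (name, generation_tag))
print '<ComposeRequest>%s</ComposeRequest>' % ''.join(compose_req)
# Emits (shown wrapped here; the actual body is a single line):
# <ComposeRequest><Component><Name>part-1</Name><Generation>1360887659323000
# </Generation></Component><Component><Name>part-2</Name></Component>
# </ComposeRequest>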
58 boto/gs/resumable_upload_handler.py
@@ -322,7 +322,7 @@ def _start_new_resumable_upload(self, key, headers=None):
self._save_tracker_uri_to_file()
def _upload_file_bytes(self, conn, http_conn, fp, file_length,
- total_bytes_uploaded, cb, num_cb, md5sum, headers):
+ total_bytes_uploaded, cb, num_cb, headers):
"""
Makes one attempt to upload file bytes, using an existing resumable
upload connection.
@@ -376,7 +376,8 @@ def _upload_file_bytes(self, conn, http_conn, fp, file_length,
http_conn.set_debuglevel(0)
while buf:
http_conn.send(buf)
- md5sum.update(buf)
+ for alg in self.digesters:
+ self.digesters[alg].update(buf)
total_bytes_uploaded += len(buf)
if cb:
i += 1
@@ -416,7 +417,7 @@ def _upload_file_bytes(self, conn, http_conn, fp, file_length,
(resp.status, resp.reason), disposition)
def _attempt_resumable_upload(self, key, fp, file_length, headers, cb,
- num_cb, md5sum):
+ num_cb):
"""
Attempts a resumable upload.
@@ -435,9 +436,9 @@ def _attempt_resumable_upload(self, key, fp, file_length, headers, cb,
if server_end:
# If the server already has some of the content, we need to
- # update the md5 with the bytes that have already been
+ # update the digesters with the bytes that have already been
# uploaded to ensure we get a complete hash in the end.
- print 'Catching up md5 for resumed upload'
+ print 'Catching up hash digest(s) for resumed upload'
fp.seek(0)
# Read local file's bytes through position server has. For
# example, if server has (0, 3) we want to read 3-0+1=4 bytes.
@@ -446,13 +447,14 @@ def _attempt_resumable_upload(self, key, fp, file_length, headers, cb,
chunk = fp.read(min(key.BufferSize, bytes_to_go))
if not chunk:
raise ResumableUploadException(
- 'Hit end of file during resumable upload md5 '
+ 'Hit end of file during resumable upload hash '
'catchup. This should not happen under\n'
'normal circumstances, as it indicates the '
'server has more bytes of this transfer\nthan'
' the current file size. Restarting upload.',
ResumableTransferDisposition.START_OVER)
- md5sum.update(chunk)
+ for alg in self.digesters:
+ self.digesters[alg].update(chunk)
bytes_to_go -= len(chunk)
if conn.debug >= 1:
@@ -492,7 +494,7 @@ def _attempt_resumable_upload(self, key, fp, file_length, headers, cb,
# and can report that progress on next attempt.
try:
return self._upload_file_bytes(conn, http_conn, fp, file_length,
- total_bytes_uploaded, cb, num_cb, md5sum,
+ total_bytes_uploaded, cb, num_cb,
headers)
except (ResumableUploadException, socket.error):
resp = self._query_server_state(conn, file_length)
@@ -556,9 +558,9 @@ def track_progress_less_iterations(self, server_had_bytes_before_attempt,
else:
self.progress_less_iterations += 1
if roll_back_md5:
- # Rollback any potential md5sum updates, as we did not
+ # Rollback any potential hash updates, as we did not
# make any progress in this iteration.
- self.md5sum = self.md5sum_before_attempt
+ self.digesters = self.digesters_before_attempt
if self.progress_less_iterations > self.num_retries:
# Don't retry any longer in the current process.
@@ -575,7 +577,7 @@ def track_progress_less_iterations(self, server_had_bytes_before_attempt,
(self.progress_less_iterations, sleep_time_secs))
time.sleep(sleep_time_secs)
- def send_file(self, key, fp, headers, cb=None, num_cb=10):
+ def send_file(self, key, fp, headers, cb=None, num_cb=10, hash_algs=None):
"""
Upload a file to a key into a bucket on GS, using GS resumable upload
protocol.
@@ -603,6 +605,12 @@ def send_file(self, key, fp, headers, cb=None, num_cb=10):
during the file transfer. Providing a negative integer will cause
your callback to be called with each buffer read.
+ :type hash_algs: dictionary
+ :param hash_algs: (optional) Dictionary mapping hash algorithm
+ descriptions to corresponding stateful hashing objects that
+ implement update(), digest(), and copy() (e.g. hashlib.md5()).
+ Defaults to {'md5': md5()}.
+
Raises ResumableUploadException if a problem occurs during the transfer.
"""
@@ -613,22 +621,25 @@ def send_file(self, key, fp, headers, cb=None, num_cb=10):
# that header.
CT = 'Content-Type'
if CT in headers and headers[CT] is None:
- del headers[CT]
+ del headers[CT]
headers['User-Agent'] = UserAgent
# Determine file size different ways for case where fp is actually a
# wrapper around a Key vs an actual file.
if isinstance(fp, KeyFile):
- file_length = fp.getkey().size
+ file_length = fp.getkey().size
else:
- fp.seek(0, os.SEEK_END)
- file_length = fp.tell()
- fp.seek(0)
+ fp.seek(0, os.SEEK_END)
+ file_length = fp.tell()
+ fp.seek(0)
debug = key.bucket.connection.debug
# Compute the MD5 checksum on the fly.
- self.md5sum = md5()
+ if hash_algs is None:
+ hash_algs = {'md5': md5}
+ self.digesters = dict(
+ (alg, hash_algs[alg]()) for alg in hash_algs or {})
# Use num-retries from constructor if one was provided; else check
# for a value specified in the boto config file; else default to 5.
@@ -638,19 +649,20 @@ def send_file(self, key, fp, headers, cb=None, num_cb=10):
while True: # Retry as long as we're making progress.
server_had_bytes_before_attempt = self.server_has_bytes
- self.md5sum_before_attempt = self.md5sum.copy()
+ self.digesters_before_attempt = dict(
+ (alg, self.digesters[alg].copy())
+ for alg in self.digesters)
try:
# Save generation and metageneration in class state so caller
# can find these values, for use in preconditions of future
# operations on the uploaded object.
(etag, self.generation, self.metageneration) = (
self._attempt_resumable_upload(key, fp, file_length,
- headers, cb, num_cb,
- self.md5sum))
+ headers, cb, num_cb))
- # Get the final md5 for the uploaded content.
- hd = self.md5sum.hexdigest()
- key.md5, key.base64md5 = key.get_md5_from_hexdigest(hd)
+ # Get the final digests for the uploaded content.
+ for alg in self.digesters:
+ key.local_hashes[alg] = self.digesters[alg].digest()
# Upload succceded, so remove the tracker file (if have one).
self._remove_tracker_file()
3  boto/s3/bucket.py
@@ -209,6 +209,7 @@ def _get_key_internal(self, key_name, headers, query_args_l):
k.handle_version_headers(response)
k.handle_encryption_headers(response)
k.handle_restore_headers(response)
+ k.handle_addl_headers(response.getheaders())
return k, response
else:
if response.status == 404:
@@ -622,6 +623,7 @@ def _delete_key_internal(self, key_name, headers=None, version_id=None,
k = self.key_class(self)
k.name = key_name
k.handle_version_headers(response)
+ k.handle_addl_headers(response.getheaders())
return k
def copy_key(self, new_key_name, src_bucket_name,
@@ -715,6 +717,7 @@ def copy_key(self, new_key_name, src_bucket_name,
if hasattr(key, 'Error'):
raise provider.storage_copy_error(key.Code, key.Message, body)
key.handle_version_headers(response)
+ key.handle_addl_headers(response.getheaders())
if preserve_acl:
self.set_xml_acl(acl, new_key_name)
return key
154 boto/s3/key.py
@@ -109,8 +109,6 @@ def __init__(self, bucket=None, name=None):
self.last_modified = None
self.owner = None
self.storage_class = 'STANDARD'
- self.md5 = None
- self.base64md5 = None
self.path = None
self.resp = None
self.mode = None
@@ -126,6 +124,7 @@ def __init__(self, bucket=None, name=None):
# restored object.
self.ongoing_restore = None
self.expiry_date = None
+ self.local_hashes = {}
def __repr__(self):
if self.bucket:
@@ -133,18 +132,6 @@ def __repr__(self):
else:
return '<Key: None,%s>' % self.name
- def __getattr__(self, name):
- if name == 'key':
- return self.name
- else:
- raise AttributeError
-
- def __setattr__(self, name, value):
- if name == 'key':
- self.__dict__['name'] = value
- else:
- self.__dict__[name] = value
-
def __iter__(self):
return self
@@ -155,6 +142,38 @@ def provider(self):
provider = self.bucket.connection.provider
return provider
+ @property
+ def key(self):
+ return self.name
+
+ @key.setter
+ def key(self, value):
+ self.name = value
+
+ @property
+ def md5(self):
+ if 'md5' in self.local_hashes and self.local_hashes['md5']:
+ return binascii.b2a_hex(self.local_hashes['md5'])
+
+ @md5.setter
+ def md5(self, value):
+ if value:
+ self.local_hashes['md5'] = binascii.a2b_hex(value)
+ elif 'md5' in self.local_hashes:
+ self.local_hashes.pop('md5', None)
+
+ @property
+ def base64md5(self):
+ if 'md5' in self.local_hashes and self.local_hashes['md5']:
+ return binascii.b2a_base64(self.local_hashes['md5']).rstrip('\n')
+
+ @base64md5.setter
+ def base64md5(self, value):
+ if value:
+ self.local_hashes['md5'] = binascii.a2b_base64(value)
+ elif 'md5' in self.local_hashes:
+ del self.local_hashes['md5']
+
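The md5 and base64md5 properties above are now just two views over the raw digest bytes stored in local_hashes. A quick standalone illustration using the well-known digest of 'hello world':

import binascii

local_hashes = {}
# Assigning the hex form stores raw bytes...
local_hashes['md5'] = binascii.a2b_hex('5eb63bbbe01eeed093cb22bb8f5acdc3')
# ...from which both representations are recovered on demand.
print binascii.b2a_hex(local_hashes['md5'])
# 5eb63bbbe01eeed093cb22bb8f5acdc3
print binascii.b2a_base64(local_hashes['md5']).rstrip('\n')
# XrY7u+Ae7tCTyyK7j1rNww==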
def get_md5_from_hexdigest(self, md5_hexdigest):
"""
A utility function to create the 2-tuple (md5hexdigest, base64md5)
@@ -169,7 +188,8 @@ def get_md5_from_hexdigest(self, md5_hexdigest):
def handle_encryption_headers(self, resp):
provider = self.bucket.connection.provider
if provider.server_side_encryption_header:
- self.encrypted = resp.getheader(provider.server_side_encryption_header, None)
+ self.encrypted = resp.getheader(
+ provider.server_side_encryption_header, None)
else:
self.encrypted = None
@@ -202,6 +222,13 @@ def handle_restore_headers(self, response):
elif key == 'expiry-date':
self.expiry_date = val
+ def handle_addl_headers(self, headers):
+ """
+ Used by Key subclasses to do additional, provider-specific
+ processing of response headers. No-op for this base class.
+ """
+ pass
+
def open_read(self, headers=None, query_args='',
override_num_retries=None, response_headers=None):
"""
@@ -265,6 +292,7 @@ def open_read(self, headers=None, query_args='',
self.content_disposition = value
self.handle_version_headers(self.resp)
self.handle_encryption_headers(self.resp)
+ self.handle_addl_headers(self.resp.getheaders())
def open_write(self, headers=None, override_num_retries=None):
"""
@@ -646,20 +674,12 @@ def send_file(self, fp, headers=None, cb=None, num_cb=10,
point point at the offset from which you wish to upload.
ie. if uploading the full file, it should point at the
start of the file. Normally when a file is opened for
- reading, the fp will point at the first byte. See the
+ reading, the fp will point at the first byte. See the
bytes parameter below for more info.
:type headers: dict
:param headers: The headers to pass along with the PUT request
- :type cb: function
- :param cb: a callback function that will be called to report
- progress on the upload. The callback should accept two
- integer parameters, the first representing the number of
- bytes that have been successfully transmitted to S3 and
- the second representing the size of the to be transmitted
- object.
-
:type num_cb: int
:param num_cb: (optional) If a callback is specified with the
cb parameter this parameter determines the granularity of
@@ -668,6 +688,13 @@ def send_file(self, fp, headers=None, cb=None, num_cb=10,
transfer. Providing a negative integer will cause your
callback to be called with each buffer read.
+ :type query_args: string
+ :param query_args: (optional) Arguments to pass in the query string.
+
+ :type chunked_transfer: boolean
+ :param chunked_transfer: (optional) If true, we use chunked
+ Transfer-Encoding.
+
:type size: int
:param size: (optional) The Maximum number of bytes to read
from the file pointer (fp). This is useful when uploading
@@ -676,6 +703,13 @@ def send_file(self, fp, headers=None, cb=None, num_cb=10,
the default behaviour is to read all bytes from the file
pointer. Less bytes may be available.
"""
+ self._send_file_internal(fp, headers=headers, cb=cb, num_cb=num_cb,
+ query_args=query_args,
+ chunked_transfer=chunked_transfer, size=size)
+
+ def _send_file_internal(self, fp, headers=None, cb=None, num_cb=10,
+ query_args=None, chunked_transfer=False, size=None,
+ hash_algs=None):
provider = self.bucket.connection.provider
try:
spos = fp.tell()
@@ -683,6 +717,12 @@ def send_file(self, fp, headers=None, cb=None, num_cb=10,
spos = None
self.read_from_stream = False
+ # If hash_algs is unset and the MD5 hasn't already been computed,
+ # default to an MD5 hash_alg to hash the data on-the-fly.
+ if hash_algs is None and not self.md5:
+ hash_algs = {'md5': md5}
+ digesters = dict((alg, hash_algs[alg]()) for alg in hash_algs or {})
+
def sender(http_conn, method, path, data, headers):
# This function is called repeatedly for temporary retries
# so we must be sure the file pointer is pointing at the
@@ -701,12 +741,6 @@ def sender(http_conn, method, path, data, headers):
http_conn.putheader(key, headers[key])
http_conn.endheaders()
- # Calculate all MD5 checksums on the fly, if not already computed
- if not self.base64md5:
- m = md5()
- else:
- m = None
-
save_debug = self.bucket.connection.debug
self.bucket.connection.debug = 0
# If the debuglevel < 4 we don't want to show connection
@@ -729,7 +763,8 @@ def sender(http_conn, method, path, data, headers):
# of data transferred, except when we know size.
cb_count = (1024 * 1024) / self.BufferSize
elif num_cb > 1:
- cb_count = int(math.ceil(cb_size / self.BufferSize / (num_cb - 1.0)))
+ cb_count = int(
+ math.ceil(cb_size / self.BufferSize / (num_cb - 1.0)))
elif num_cb < 0:
cb_count = -1
else:
@@ -754,8 +789,8 @@ def sender(http_conn, method, path, data, headers):
http_conn.send('\r\n')
else:
http_conn.send(chunk)
- if m:
- m.update(chunk)
+ for alg in digesters:
+ digesters[alg].update(chunk)
if bytes_togo:
bytes_togo -= chunk_len
if bytes_togo <= 0:
@@ -772,10 +807,8 @@ def sender(http_conn, method, path, data, headers):
self.size = data_len
- if m:
- # Use the chunked trailer for the digest
- hd = m.hexdigest()
- self.md5, self.base64md5 = self.get_md5_from_hexdigest(hd)
+ for alg in digesters:
+ self.local_hashes[alg] = digesters[alg].digest()
if chunked_transfer:
http_conn.send('0\r\n')
@@ -846,6 +879,7 @@ def sender(http_conn, method, path, data, headers):
sender=sender,
query_args=query_args)
self.handle_version_headers(resp, force=True)
+ self.handle_addl_headers(resp.getheaders())
def compute_md5(self, fp, size=None):
"""
@@ -858,14 +892,9 @@ def compute_md5(self, fp, size=None):
:param size: (optional) The Maximum number of bytes to read
from the file pointer (fp). This is useful when uploading
a file in multiple parts where the file is being split
- inplace into different parts. Less bytes may be available.
-
- :rtype: tuple
- :return: A tuple containing the hex digest version of the MD5
- hash as the first element and the base64 encoded version
- of the plain digest as the second element.
+ in place into different parts. Fewer bytes may be available.
"""
- tup = compute_md5(fp, size=size)
+ hex_digest, b64_digest, data_size = compute_md5(fp, size=size)
# Returned values are MD5 hash, base64 encoded MD5 hash, and data size.
# The internal implementation of compute_md5() needs to return the
# data size but we don't want to return that value to the external
@@ -873,8 +902,8 @@ def compute_md5(self, fp, size=None):
# break some code) so we consume the third tuple value here and
# return the remainder of the tuple to the caller, thereby preserving
# the existing interface.
- self.size = tup[2]
- return tup[0:2]
+ self.size = data_size
+ return (hex_digest, b64_digest)
def set_contents_from_stream(self, fp, headers=None, replace=True,
cb=None, num_cb=10, policy=None,
@@ -1203,14 +1232,11 @@ class of the new Key to be REDUCED_REDUNDANCY. The Reduced
:rtype: int
:return: The number of bytes written to the key.
"""
- fp = open(filename, 'rb')
- try:
- return self.set_contents_from_file(fp, headers, replace,
- cb, num_cb, policy,
- md5, reduced_redundancy,
+ with open(filename, 'rb') as fp:
+ return self.set_contents_from_file(fp, headers, replace, cb,
+ num_cb, policy, md5,
+ reduced_redundancy,
encrypt_key=encrypt_key)
- finally:
- fp.close()
def set_contents_from_string(self, s, headers=None, replace=True,
cb=None, num_cb=10, policy=None, md5=None,
@@ -1321,11 +1347,12 @@ def get_file(self, fp, headers=None, cb=None, num_cb=10,
torrent=torrent, version_id=version_id,
override_num_retries=override_num_retries,
response_headers=response_headers,
+ hash_algs=None,
query_args=None)
def _get_file_internal(self, fp, headers=None, cb=None, num_cb=10,
torrent=False, version_id=None, override_num_retries=None,
- response_headers=None, query_args=None):
+ response_headers=None, hash_algs=None, query_args=None):
if headers is None:
headers = {}
save_debug = self.bucket.connection.debug
@@ -1335,9 +1362,11 @@ def _get_file_internal(self, fp, headers=None, cb=None, num_cb=10,
query_args = query_args or []
if torrent:
query_args.append('torrent')
- m = None
- else:
- m = md5()
+
+ if hash_algs is None and not torrent:
+ hash_algs = {'md5': md5}
+ digesters = dict((alg, hash_algs[alg]()) for alg in hash_algs or {})
+
# If a version_id is passed in, use that. If not, check to see
# if the Key object has an explicit version_id and, if so, use that.
# Otherwise, don't pass a version_id query param.
@@ -1347,7 +1376,8 @@ def _get_file_internal(self, fp, headers=None, cb=None, num_cb=10,
query_args.append('versionId=%s' % version_id)
if response_headers:
for key in response_headers:
- query_args.append('%s=%s' % (key, urllib.quote(response_headers[key])))
+ query_args.append('%s=%s' % (
+ key, urllib.quote(response_headers[key])))
query_args = '&'.join(query_args)
self.open('r', headers, query_args=query_args,
override_num_retries=override_num_retries)
@@ -1373,8 +1403,8 @@ def _get_file_internal(self, fp, headers=None, cb=None, num_cb=10,
for bytes in self:
fp.write(bytes)
data_len += len(bytes)
- if m:
- m.update(bytes)
+ for alg in digesters:
+ digesters[alg].update(bytes)
if cb:
if cb_size > 0 and data_len >= cb_size:
break
@@ -1384,8 +1414,8 @@ def _get_file_internal(self, fp, headers=None, cb=None, num_cb=10,
i = 0
if cb and (cb_count <= 1 or i > 0) and data_len > 0:
cb(data_len, cb_size)
- if m:
- self.md5 = m.hexdigest()
+ for alg in digesters:
+ self.local_hashes[alg] = digesters[alg].digest()
if self.size is None and not torrent and "Range" not in headers:
self.size = data_len
self.close()
27 boto/s3/resumable_download_handler.py
@@ -90,7 +90,7 @@ class ResumableDownloadHandler(object):
Handler for resumable downloads.
"""
- ETAG_REGEX = '([a-z0-9]{32})\n'
+ MIN_ETAG_LEN = 5
RETRYABLE_EXCEPTIONS = (httplib.HTTPException, IOError, socket.error,
socket.gaierror)
@@ -127,11 +127,11 @@ def _load_tracker_file_etag(self):
f = None
try:
f = open(self.tracker_file_name, 'r')
- etag_line = f.readline()
- m = re.search(self.ETAG_REGEX, etag_line)
- if m:
- self.etag_value_for_current_download = m.group(1)
- else:
+ self.etag_value_for_current_download = f.readline().rstrip('\n')
+ # We used to match an MD5-based regex to ensure that the etag was
+ # read correctly. Since ETags need not be MD5s, we now do a simple
+ # length sanity check instead.
+ if len(self.etag_value_for_current_download) < self.MIN_ETAG_LEN:
print('Couldn\'t read etag in tracker file (%s). Restarting '
'download from scratch.' % self.tracker_file_name)
except IOError, e:
@@ -173,7 +173,7 @@ def _remove_tracker_file(self):
os.unlink(self.tracker_file_name)
def _attempt_resumable_download(self, key, fp, headers, cb, num_cb,
- torrent, version_id):
+ torrent, version_id, hash_algs):
"""
Attempts a resumable download.
@@ -213,11 +213,11 @@ def _attempt_resumable_download(self, key, fp, headers, cb, num_cb,
# Disable AWSAuthConnection-level retry behavior, since that would
# cause downloads to restart from scratch.
key.get_file(fp, headers, cb, num_cb, torrent, version_id,
- override_num_retries=0)
+ override_num_retries=0, hash_algs=hash_algs)
fp.flush()
def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
- version_id=None):
+ version_id=None, hash_algs=None):
"""
Retrieves a file from a Key
:type key: :class:`boto.s3.key.Key` or subclass
@@ -249,6 +249,11 @@ def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
:type version_id: string
:param version_id: The version ID (optional)
+ :type hash_algs: dictionary
+ :param hash_algs: (optional) Dictionary mapping hash algorithm
+ names to the corresponding hashing classes, each of which must
+ implement update() and digest(). Defaults to {'md5': hashlib.md5}.
+
Raises ResumableDownloadException if a problem occurs during
the transfer.
"""
@@ -267,7 +272,7 @@ def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
had_file_bytes_before_attempt = get_cur_file_size(fp)
try:
self._attempt_resumable_download(key, fp, headers, cb, num_cb,
- torrent, version_id)
+ torrent, version_id, hash_algs)
# Download succceded, so remove the tracker file (if have one).
self._remove_tracker_file()
# Previously, check_final_md5() was called here to validate
@@ -286,7 +291,7 @@ def get_file(self, key, fp, headers, cb=None, num_cb=10, torrent=False,
# so we need to close and reopen the key before resuming
# the download.
key.get_file(fp, headers, cb, num_cb, torrent, version_id,
- override_num_retries=0)
+ override_num_retries=0, hash_algs=hash_algs)
except ResumableDownloadException, e:
if (e.disposition ==
ResumableTransferDisposition.ABORT_CUR_PROCESS):
23 boto/storage_uri.py
@@ -195,12 +195,20 @@ def get_contents_to_stream(self, fp, headers=None, version_id=None):
def get_contents_to_file(self, fp, headers=None, cb=None, num_cb=10,
torrent=False, version_id=None,
- res_download_handler=None, response_headers=None):
+ res_download_handler=None, response_headers=None,
+ hash_algs=None):
self._check_object_uri('get_contents_to_file')
key = self.get_key(None, headers)
self.check_response(key, 'key', self.uri)
- key.get_contents_to_file(fp, headers, cb, num_cb, torrent, version_id,
- res_download_handler, response_headers)
+ if hash_algs:
+ key.get_contents_to_file(fp, headers, cb, num_cb, torrent,
+ version_id, res_download_handler,
+ response_headers,
+ hash_algs=hash_algs)
+ else:
+ key.get_contents_to_file(fp, headers, cb, num_cb, torrent,
+ version_id, res_download_handler,
+ response_headers)
def get_contents_as_string(self, validate=False, headers=None, cb=None,
num_cb=10, torrent=False, version_id=None):
@@ -742,6 +750,15 @@ def set_metadata(self, metadata_plus, metadata_minus, preserve_acl,
preserve_acl,
headers=headers)
+ def compose(self, components, content_type=None, headers=None):
+ self._check_object_uri('compose')
+ component_keys = []
+ for suri in components:
+ component_keys.append(suri.new_key())
+ component_keys[-1].generation = suri.generation
+ self.new_key().compose(
+ component_keys, content_type=content_type, headers=headers)
+
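Mirroring the integration test further down, a minimal sketch of driving composition through StorageUri (placeholder bucket name, credentials assumed configured):

from boto import storage_uri

bucket_uri = storage_uri('gs://my-bucket')  # placeholder; must already exist
part1 = bucket_uri.clone_replace_name('part-1')
part1.set_contents_from_string('hello ')
part2 = bucket_uri.clone_replace_name('part-2')
part2.set_contents_from_string('world!')

composite = bucket_uri.clone_replace_name('composite')
composite.compose([part1, part2], content_type='text/plain')
print composite.get_contents_as_string()  # 'hello world!'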
def exists(self, headers=None):
"""Returns True if the object exists or False if it doesn't"""
if not self.object_name:
4 boto/utils.py
@@ -90,7 +90,9 @@
# GET bucket?storageClass is not part of the S3 API.)
'storageClass',
# websiteConfig is a QSA for buckets in Google Cloud Storage.
- 'websiteConfig']
+ 'websiteConfig',
+ # compose is a QSA for objects in Google Cloud Storage.
+ 'compose']
_first_cap_regex = re.compile('(.)([A-Z][a-z]+)')
4 tests/integration/gs/test_resumable_downloads.py
@@ -113,9 +113,7 @@ def test_failed_download_with_persistent_tracker(self):
self.assertTrue(os.path.exists(tracker_file_name))
f = open(tracker_file_name)
etag_line = f.readline()
- m = re.search(ResumableDownloadHandler.ETAG_REGEX, etag_line)
- f.close()
- self.assertTrue(m)
+ f.close()
+ self.assertEquals(etag_line.rstrip('\n'),
+ small_src_key.etag.strip('"\''))
def test_retryable_exception_recovery(self):
"""
38 tests/integration/gs/test_storage_uri.py
@@ -23,10 +23,12 @@
"""Unit tests for StorageUri interface."""
+import binascii
import re
import StringIO
from boto import storage_uri
+from boto.exception import BotoClientError
from boto.gs.acl import SupportedPermissions as perms
from tests.integration.gs.testcase import GSTestCase
@@ -79,7 +81,7 @@ def testSetAclXml(self):
"<Permission>READ</Permission></Entry>")
acl_string = re.sub(r"</Entries>",
all_users_read_permission + "</Entries>",
- bucket_acl.to_xml())
+ bucket_acl.to_xml())
# Test-generated owner IDs are not currently valid for buckets
acl_no_owner_string = re.sub(r"<Owner>.*</Owner>", "", acl_string)
@@ -123,3 +125,37 @@ def testPropertiesUpdated(self):
k = b.get_key("obj")
self.assertEqual(k.generation, key_uri.generation)
self.assertEquals(k.get_contents_as_string(), "data3")
+
+ def testCompose(self):
+ data1 = 'hello '
+ data2 = 'world!'
+ expected_crc = 1238062967
+
+ b = self._MakeBucket()
+ bucket_uri = storage_uri("gs://%s" % b.name)
+ key_uri1 = bucket_uri.clone_replace_name("component1")
+ key_uri1.set_contents_from_string(data1)
+ key_uri2 = bucket_uri.clone_replace_name("component2")
+ key_uri2.set_contents_from_string(data2)
+
+ # Simple compose.
+ key_uri_composite = bucket_uri.clone_replace_name("composite")
+ components = [key_uri1, key_uri2]
+ key_uri_composite.compose(components, content_type='text/plain')
+ self.assertEquals(key_uri_composite.get_contents_as_string(),
+ data1 + data2)
+ composite_key = key_uri_composite.get_key()
+ cloud_crc32c = binascii.hexlify(
+ composite_key.cloud_hashes['crc32c'])
+ self.assertEquals(cloud_crc32c, hex(expected_crc)[2:])
+ self.assertEquals(composite_key.content_type, 'text/plain')
+
+ # Compose disallowed between buckets.
+ key_uri1.bucket_name += '2'
+ try:
+ key_uri_composite.compose(components)
+ self.fail('Composing between buckets didn\'t fail as expected.')
+ except BotoClientError as err:
+ self.assertEquals(
+ err.reason, 'GCS does not support inter-bucket composing')
+