Permalink
Browse files

Reduce memory usage for chunk_hashes

On a 180MB file this reduced total memory
usage by approximately 40%.  It was also
marginally faster.

I've also started a unit test suite for the writer module,
beginning with some very basic unit tests for the chunk_hashes
function.
  • Loading branch information...
jamesls committed Sep 6, 2012
1 parent f1b007e commit f117db58ae25a788a0ab522e89986cea6bded31a
Showing with 37 additions and 9 deletions.
  1. +11 −9 boto/glacier/writer.py
  2. +26 −0 tests/unit/glacier/test_writer.py
View
@@ -28,15 +28,17 @@
import json
-def chunk_hashes(str):
- """
- Break up the byte-string into 1MB chunks and return sha256 hashes
- for each.
- """
- chunk = 1024 * 1024
- chunk_count = int(math.ceil(len(str) / float(chunk)))
- chunks = [str[i * chunk:(i + 1) * chunk] for i in range(chunk_count)]
- return [hashlib.sha256(x).digest() for x in chunks]
_ONE_MEGABYTE = 1024 * 1024


def chunk_hashes(bytestring, chunk_size=_ONE_MEGABYTE):
    """
    Break up the byte-string into chunks and return a sha256 hash for each.

    :param bytestring: The bytes to hash.
    :param chunk_size: Size of each chunk in bytes (defaults to 1MB).
        The final chunk holds whatever is left over and may be shorter.
    :return: A list of raw sha256 digests, one per chunk, in order.  An
        empty ``bytestring`` yields a single digest (that of the empty
        string) instead of an empty list, so the result always contains
        at least one hash.
    """
    if not bytestring:
        # Hashing nothing is the same as hashing the empty string.
        return [hashlib.sha256().digest()]
    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
    hashes = []
    # Slice and hash one chunk at a time so only a single chunk-sized copy
    # of the data is alive at any moment, rather than a full list of chunks.
    # ``range`` (not ``xrange``) keeps this working on Python 2 and 3; the
    # number of chunks is small, so the Python 2 list costs next to nothing.
    for i in range(chunk_count):
        start = i * chunk_size
        end = start + chunk_size
        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
    return hashes
def tree_hash(fo):
@@ -0,0 +1,26 @@
+from hashlib import sha256
+
+from tests.unit import unittest
+import mock
+
+from boto.glacier.writer import Writer, chunk_hashes
+
+
class TestChunking(unittest.TestCase):
    """Checks on how ``chunk_hashes`` splits its input into hashed chunks.

    Every test relies on the function's default chunk size.
    """

    def test_chunk_hashes_exact(self):
        # The input is exactly two full chunks, so there is no leftover.
        chunks = chunk_hashes('a' * (2 * 1024 * 1024))
        self.assertEqual(len(chunks), 2)
        self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest())
        # Both chunks hold the same data; verify the second one too so a
        # slip at the chunk boundary does not go unnoticed.
        self.assertEqual(chunks[1], sha256('a' * 1024 * 1024).digest())

    def test_chunks_with_leftovers(self):
        # Two full chunks plus 20 trailing bytes give a short third chunk.
        bytestring = 'a' * (2 * 1024 * 1024 + 20)
        chunks = chunk_hashes(bytestring)
        self.assertEqual(len(chunks), 3)
        self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest())
        self.assertEqual(chunks[1], sha256('a' * 1024 * 1024).digest())
        self.assertEqual(chunks[2], sha256('a' * 20).digest())

    def test_less_than_one_chunk(self):
        # Input shorter than one chunk is hashed as a single chunk.
        chunks = chunk_hashes('aaaa')
        self.assertEqual(len(chunks), 1)
        self.assertEqual(chunks[0], sha256('aaaa').digest())

0 comments on commit f117db5

Please sign in to comment.