Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions sdks/python/apache_beam/io/avroio.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#
"""Implements a source for reading Avro files."""

import cStringIO as StringIO
import cStringIO
import os
import zlib

Expand Down Expand Up @@ -198,8 +198,10 @@ def _decompress_bytes(data, codec):
raise ValueError('Snappy does not seem to be installed.')

# Compressed data includes a 4-byte CRC32 checksum which we verify.
result = snappy.decompress(data[:-4])
avroio.BinaryDecoder(StringIO.StringIO(data[-4:])).check_crc32(result)
# We take care to avoid extra copies of data while slicing large objects
# by use of a buffer.
result = snappy.decompress(buffer(data)[:-4])
avroio.BinaryDecoder(cStringIO.StringIO(data[-4:])).check_crc32(result)
return result
else:
raise ValueError('Unknown codec: %r', codec)
Expand All @@ -209,7 +211,7 @@ def num_records(self):

def records(self):
decoder = avroio.BinaryDecoder(
StringIO.StringIO(self._decompressed_block_bytes))
cStringIO.StringIO(self._decompressed_block_bytes))
reader = avroio.DatumReader(
writers_schema=self._schema, readers_schema=self._schema)

Expand Down
8 changes: 4 additions & 4 deletions sdks/python/apache_beam/io/filebasedsource_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#

import bz2
import cStringIO as StringIO
import cStringIO
import gzip
import logging
import math
Expand Down Expand Up @@ -451,7 +451,7 @@ def test_read_pattern_gzip(self):
chunks = [lines[splits[i-1]:splits[i]] for i in xrange(1, len(splits))]
compressed_chunks = []
for c in chunks:
out = StringIO.StringIO()
out = cStringIO.StringIO()
with gzip.GzipFile(fileobj=out, mode="w") as f:
f.write('\n'.join(c))
compressed_chunks.append(out.getvalue())
Expand Down Expand Up @@ -498,7 +498,7 @@ def test_read_auto_pattern(self):
chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
compressed_chunks = []
for c in chunks:
out = StringIO.StringIO()
out = cStringIO.StringIO()
with gzip.GzipFile(fileobj=out, mode="w") as f:
f.write('\n'.join(c))
compressed_chunks.append(out.getvalue())
Expand All @@ -518,7 +518,7 @@ def test_read_auto_pattern_compressed_and_uncompressed(self):
chunks_to_write = []
for i, c in enumerate(chunks):
if i%2 == 0:
out = StringIO.StringIO()
out = cStringIO.StringIO()
with gzip.GzipFile(fileobj=out, mode="w") as f:
f.write('\n'.join(c))
chunks_to_write.append(out.getvalue())
Expand Down
2 changes: 1 addition & 1 deletion sdks/python/apache_beam/io/fileio.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,7 @@ class _CompressedFile(object):
def __init__(self,
fileobj,
compression_type=CompressionTypes.GZIP,
read_size=16384):
read_size=gcsio.DEFAULT_READ_BUFFER_SIZE):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently this is 16 KB while gcsio.DEFAULT_READ_BUFFER_SIZE is 16 MB. Could you mention that change in the PR description.

Also I don't think we should refer to a constant defined in one file system (GCS) from fileio. Redefine a constant here ?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The change is already mentioned in the PR description (and the individual commit that introduces it). Would you like me to rephrase it in some way?

As for the constant, there are already several parts of fileio that depend on GcsIO (including constants like gcsio.MAX_BATCH_OPERATION_SIZE). And this might make sense from a performance perspective (indeed this PR is moving towards that), so if we think we should refactor constants, it should probably be done as a separate PR that does the refactoring?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please clearly mention the increase of buffer size in PR description (16 KB to 16 MB).

Most other references to gcsio seems to be in ChannelFactory which is fine. Only other problematic reference I see is gcsio.MAX_BATCH_OPERATION_SIZE which probably should be removed at some point as well. I would like not to add more such references if possible :)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created https://issues.apache.org/jira/browse/BEAM-1222 to remove GCS specific constants from fileio.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I'll add more info there. Leaving DEFAULT_READ_BUFFER_SIZE here for now since its comments are GCS specific (and probably not worth moving into fileio), and otherwise we would need two constants to be kept in sync. I.e., it makes sense to stay as-is until the JIRA mentioned above is resolved.

if not fileobj:
raise ValueError('fileobj must be opened file but was %s' % fileobj)
self._validate_compression_type(compression_type)
Expand Down
6 changes: 3 additions & 3 deletions sdks/python/apache_beam/io/gcsio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
https://github.com/GoogleCloudPlatform/appengine-gcs-client.
"""

import cStringIO as StringIO
import cStringIO
import errno
import fnmatch
import logging
Expand Down Expand Up @@ -418,7 +418,7 @@ def __init__(self,
get_request.generation = metadata.generation

# Initialize read buffer state.
self.download_stream = StringIO.StringIO()
self.download_stream = cStringIO.StringIO()
self.downloader = transfer.Download(
self.download_stream, auto_transfer=False, chunksize=buffer_size)
self.client.objects.Get(get_request, download=self.downloader)
Expand Down Expand Up @@ -558,7 +558,7 @@ def _get_segment(self, start, size):
end = start + size - 1
self.downloader.GetRange(start, end)
value = self.download_stream.getvalue()
# Clear the StringIO object after we've read its contents.
# Clear the cStringIO object after we've read its contents.
self.download_stream.truncate(0)
assert len(value) == size
return value
Expand Down