attempted to replace StringIO in tests, not entirely successful
David Marin committed Apr 12, 2015
1 parent f3fa37d commit f54da15
Showing 19 changed files with 152 additions and 132 deletions.
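The theme of this commit is Python 3's split between text and binary I/O: io.BytesIO only accepts bytes, while logging and print() emit str. A minimal illustration of the distinction driving these swaps (an editorial sketch, not part of the commit):

    from io import BytesIO, StringIO

    BytesIO().write(b'data\n')      # file-like mocks want bytes
    StringIO().write('log line\n')  # logging and print() want str

    try:
        BytesIO().write('log line\n')  # mixing the two fails on Python 3
    except TypeError as err:
        print(err)  # "a bytes-like object is required, not 'str'"

On Python 2 the two rarely clash, because str is already a byte string; that is why the old StringIO-based tests passed there.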
2 changes: 1 addition & 1 deletion mrjob/inline.py
@@ -154,7 +154,7 @@ def _run_step(self, step_num, step_type, input_path, output_path,
 
         if has_combiner:
             sorted_lines = sorted(child_stdout.getvalue().splitlines())
-            combiner_stdin = BytesIO('\n'.join(sorted_lines))
+            combiner_stdin = BytesIO(b'\n'.join(sorted_lines))
         else:
             child_stdout.flush()
 
4 changes: 2 additions & 2 deletions mrjob/launch.py
@@ -662,11 +662,11 @@ def sandbox(self, stdin=None, stdout=None, stderr=None):
         ``stdin`` is empty by default. You can set it to anything that yields
         lines::
 
-            mr_job.sandbox(stdin=BytesIO('some_data\\n'))
+            mr_job.sandbox(stdin=BytesIO(b'some_data\\n'))
 
         or, equivalently::
 
-            mr_job.sandbox(stdin=['some_data\\n'])
+            mr_job.sandbox(stdin=[b'some_data\\n'])
 
         For convenience, this sandbox() returns self, so you can do::
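For context, a full sandboxed run might look like this sketch (it assumes mrjob's bundled MRWordFreqCount example; any MRJob subclass would do):

    from io import BytesIO

    from mrjob.examples.mr_word_freq_count import MRWordFreqCount

    mr_job = MRWordFreqCount(['--no-conf'])
    mr_job.sandbox(stdin=BytesIO(b'one fish\ntwo fish\n'))
    with mr_job.make_runner() as runner:
        runner.run()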
4 changes: 2 additions & 2 deletions tests/compress.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 """Utilities to compress data in memory."""
 import gzip
-from StringIO import StringIO
+from io import BytesIO
 
 
 # use bz2.compress() to compress bz2 data
 
 def gzip_compress(data):
     """return the gzip-compressed version of the given bytes."""
-    s = StringIO()
+    s = BytesIO()
     g = gzip.GzipFile(fileobj=s, mode='wb')
     g.write(data)
     g.close()
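A round-trip check of the helper above, assuming gzip_compress ends by returning the buffer's contents (its return statement falls outside the displayed hunk):

    import gzip
    from io import BytesIO

    from tests.compress import gzip_compress

    data = b'foo\nbar\n'
    # decompress with the stdlib to confirm the helper emits valid gzip bytes
    with gzip.GzipFile(fileobj=BytesIO(gzip_compress(data))) as f:
        assert f.read() == data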
14 changes: 8 additions & 6 deletions tests/fs/__init__.py
@@ -11,7 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from StringIO import StringIO
+from io import BytesIO
+
+from mrjob.py2 import stdin
 
 from tests.sandbox import SandboxedTestCase

@@ -38,11 +40,11 @@ class MockPopen(object):
 
     def __init__(self, args, stdin=None, stdout=None, stderr=None):
         self.args = args
-        self.stdin = stdin if stdin is not None else StringIO()
+        self.stdin = stdin if stdin is not None else BytesIO()
 
         # discard incoming stdout/stderr objects
-        self.stdout = StringIO()
-        self.stderr = StringIO()
+        self.stdout = BytesIO()
+        self.stderr = BytesIO()
 
         if stdin is None:
             self._run()
@@ -62,8 +64,8 @@ def _run(self):
         outer.io_log.append((self.stdout_result, self.stderr_result))
 
         # expose the results as readable file objects
-        self.stdout = StringIO(self.stdout_result)
-        self.stderr = StringIO(self.stderr_result)
+        self.stdout = BytesIO(self.stdout_result)
+        self.stderr = BytesIO(self.stderr_result)
 
     def communicate(self, stdin=None):
         if stdin is not None:
2 changes: 1 addition & 1 deletion tests/mockhadoop.py
@@ -64,7 +64,7 @@ def add_mock_hadoop_output(parts):
     Args:
     parts -- a list of the contents of parts files, which should be iterables
-             that return lines (e.g. lists, StringIOs).
+             that return lines (e.g. lists, BytesIOs).
 
     The environment variable MOCK_HADOOP_OUTPUT must be set.
     """
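A hypothetical call matching that docstring (the temp-dir setup is an assumption; the helper's internals are not shown here):

    import os
    import tempfile

    from tests.mockhadoop import add_mock_hadoop_output

    os.environ['MOCK_HADOOP_OUTPUT'] = tempfile.mkdtemp()
    # one inner iterable per part file; per this commit, lines are bytes
    add_mock_hadoop_output([[b'1\tfoo\n'], [b'1\tbar\n']])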
14 changes: 14 additions & 0 deletions tests/py2.py
@@ -0,0 +1,14 @@
"""More Python 2/3 compatibility stuff that is only used for testing."""
from mrjob.py2 import IN_PY2

# a StringIO that you can safely set sys.stdout or sys.stderr to
# (for logging or printing)
#
# Don't use this for mocking out files or subprocess stdout/stderr;
# use io.BytesIO instead
#
# TODO: maybe move this to tests.py2?
if IN_PY2:
from StringIO import StringIO
else:
from io import StringIO
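A usage sketch for this shim (editorial, not from the diff), capturing log output that arrives as str on Python 3:

    import logging

    from tests.py2 import StringIO

    stderr = StringIO()  # text buffer under both major versions
    logging.getLogger('mrjob.emr').addHandler(logging.StreamHandler(stderr))

    logging.getLogger('mrjob.emr').warning('watch out')
    assert 'watch out' in stderr.getvalue()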
52 changes: 26 additions & 26 deletions tests/test_emr.py
@@ -13,21 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tests for EMRJobRunner"""
-from contextlib import contextmanager
-from contextlib import nested
 import copy
-from datetime import datetime
-from datetime import timedelta
 import getpass
 import itertools
 import logging
 import os
 import os.path
 import posixpath
 import shutil
-from StringIO import StringIO
 import tempfile
 import time
+from contextlib import contextmanager
+from contextlib import nested
+from datetime import datetime
+from datetime import timedelta
+from io import BytesIO
 
 from mock import patch
 from mock import Mock
@@ -224,7 +224,7 @@ def teardown_ssh(self, master_ssh_root):
     def run_and_get_job_flow(self, *args):
         # set up a job flow without caring about what the job is or what its
         # inputs are.
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
         mr_job = MRTwoStepJob(
             ['-r', 'emr', '-v'] + list(args))
         mr_job.sandbox(stdin=stdin)
@@ -245,7 +245,7 @@ class EMRJobRunnerEndToEndTestCase(MockEMRAndS3TestCase):
 
     def test_end_to_end(self):
         # read from STDIN, a local file, and a remote file
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
 
         local_input_path = os.path.join(self.tmp_dir, 'input')
         with open(local_input_path, 'w') as local_input_file:
@@ -347,7 +347,7 @@ def test_failed_job(self):
         self.mock_emr_failures = {('j-MOCKJOBFLOW0', 0): None}
 
         with no_handlers_for_logger('mrjob.emr'):
-            stderr = StringIO()
+            stderr = BytesIO()
             log_to_stream('mrjob.emr', stderr)
 
             with mr_job.make_runner() as runner:
@@ -377,7 +377,7 @@ def test_failed_job(self):
 
     def _test_remote_scratch_cleanup(self, mode, scratch_len, log_len):
         self.add_mock_s3_data({'walrus': {'logs/j-MOCKJOBFLOW0/1': '1\n'}})
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
 
         mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                                '--s3-log-uri', 's3://walrus/logs',
@@ -432,7 +432,7 @@ def test_cleanup_error(self):
     def test_args_version_018(self):
         self.add_mock_s3_data({'walrus': {'logs/j-MOCKJOBFLOW0/1': '1\n'}})
         # read from STDIN, a local file, and a remote file
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
 
         mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                                '--hadoop-version=0.18', '--ami-version=1.0'])
@@ -449,7 +449,7 @@ def test_args_version_020_205(self):
     def test_args_version_020_205(self):
         self.add_mock_s3_data({'walrus': {'logs/j-MOCKJOBFLOW0/1': '1\n'}})
         # read from STDIN, a local file, and a remote file
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
 
         mr_job = MRTwoStepJob(['-r', 'emr', '-v', '--ami-version=2.0'])
         mr_job.sandbox(stdin=stdin)
@@ -522,7 +522,7 @@ def test_attach_to_existing_job_flow(self):
             name='Development Job Flow', log_uri=None,
             keep_alive=True)
 
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
         self.mock_emr_output = {(emr_job_flow_id, 1): [
             '1\t"bar"\n1\t"foo"\n2\tnull\n']}
 
@@ -598,7 +598,7 @@ def test_visible(self):
 class IAMJobFlowRoleTestCase(MockEMRAndS3TestCase):
 
     def run_and_get_job_flow(self, *args):
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
         mr_job = MRTwoStepJob(
             ['-r', 'emr', '-v'] + list(args))
         mr_job.sandbox(stdin=stdin)
@@ -881,7 +881,7 @@ def test_region_bucket_match(self):
     def test_region_bucket_does_not_match(self):
         # aws_region specified, bucket specified with incorrect location
         with no_handlers_for_logger():
-            stderr = StringIO()
+            stderr = BytesIO()
             log = logging.getLogger('mrjob.emr')
             log.addHandler(logging.StreamHandler(stderr))
             log.setLevel(logging.WARNING)
@@ -1256,7 +1256,7 @@ def test_task_type_defaults_to_core_type(self):
                 task=(20, 'c1.medium', None))
 
     def test_mixing_instance_number_opts_on_cmd_line(self):
-        stderr = StringIO()
+        stderr = BytesIO()
         with no_handlers_for_logger():
             log_to_stream('mrjob.emr', stderr)
             self._test_instance_groups(
@@ -1272,7 +1272,7 @@ def test_mixing_instance_number_opts_in_mrjob_conf(self):
             num_ec2_core_instances=5,
             num_ec2_task_instances=9)
 
-        stderr = StringIO()
+        stderr = BytesIO()
         with no_handlers_for_logger():
             log_to_stream('mrjob.emr', stderr)
             self._test_instance_groups(
@@ -1287,7 +1287,7 @@ def test_cmd_line_instance_numbers_beat_mrjob_conf(self):
         self.set_in_mrjob_conf(num_ec2_core_instances=5,
                                num_ec2_task_instances=9)
 
-        stderr = StringIO()
+        stderr = BytesIO()
         with no_handlers_for_logger():
             log_to_stream('mrjob.emr', stderr)
             self._test_instance_groups(
@@ -1382,7 +1382,7 @@ def test_python_exception(self):
             }})
         self.assertEqual(
             self.runner._find_probable_cause_of_failure([1]),
-            {'lines': list(StringIO(TRACEBACK_START + PY_EXCEPTION)),
+            {'lines': list(BytesIO(TRACEBACK_START + PY_EXCEPTION)),
              'log_file_uri': BUCKET_URI + ATTEMPT_0_DIR + 'stderr',
             'input_uri': BUCKET_URI + 'input.gz'})
 
@@ -1393,7 +1393,7 @@ def test_python_exception_without_input_uri(self):
             }})
         self.assertEqual(
             self.runner._find_probable_cause_of_failure([1]),
-            {'lines': list(StringIO(TRACEBACK_START + PY_EXCEPTION)),
+            {'lines': list(BytesIO(TRACEBACK_START + PY_EXCEPTION)),
             'log_file_uri': BUCKET_URI + ATTEMPT_0_DIR + 'stderr',
             'input_uri': None})
 
@@ -1409,7 +1409,7 @@ def test_java_exception(self):
             }})
         self.assertEqual(
             self.runner._find_probable_cause_of_failure([1]),
-            {'lines': list(StringIO(JAVA_STACK_TRACE)),
+            {'lines': list(BytesIO(JAVA_STACK_TRACE)),
             'log_file_uri': BUCKET_URI + ATTEMPT_0_DIR + 'syslog',
             'input_uri': BUCKET_URI + 'input.gz'})
 
@@ -1422,7 +1422,7 @@ def test_java_exception_without_input_uri(self):
             }})
         self.assertEqual(
             self.runner._find_probable_cause_of_failure([1]),
-            {'lines': list(StringIO(JAVA_STACK_TRACE)),
+            {'lines': list(BytesIO(JAVA_STACK_TRACE)),
             'log_file_uri': BUCKET_URI + ATTEMPT_0_DIR + 'syslog',
             'input_uri': None})
 
@@ -1576,7 +1576,7 @@ def tearDown(self):
     def test_empty_counters_running_job(self):
         self.runner._describe_jobflow().state = 'RUNNING'
         with no_handlers_for_logger():
-            stderr = StringIO()
+            stderr = BytesIO()
             log_to_stream('mrjob.emr', stderr)
             self.runner._fetch_counters([1], skip_s3_wait=True)
             self.assertIn('5 minutes', stderr.getvalue())
@@ -2172,7 +2172,7 @@ def rm_tmp_dir(self):
 
     def test_no_mapper(self):
         # read from STDIN, a local file, and a remote file
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
 
         local_input_path = os.path.join(self.tmp_dir, 'input')
         with open(local_input_path, 'w') as local_input_file:
@@ -3061,7 +3061,7 @@ def die_ssh(*args, **kwargs):
 
         with no_handlers_for_logger('mrjob.emr'):
             r = self._quick_runner()
-            stderr = StringIO()
+            stderr = BytesIO()
             log_to_stream('mrjob.emr', stderr)
             with patch.object(mrjob.emr, 'ssh_terminate_single_job',
                               side_effect=die_ssh):
@@ -3076,7 +3076,7 @@ def die_io(*args, **kwargs):
             r = self._quick_runner()
             with patch.object(mrjob.emr, 'ssh_terminate_single_job',
                               side_effect=die_io):
-                stderr = StringIO()
+                stderr = BytesIO()
                 log_to_stream('mrjob.emr', stderr)
                 r._cleanup_job()
         self.assertIn('Unable to kill job', stderr.getvalue())
@@ -3225,7 +3225,7 @@ def setUp(self):
         super(BuildStreamingStepTestCase, self).setUp()
         with patch_fs_s3():
             self.runner = EMRJobRunner(
-                mr_job_script='my_job.py', conf_paths=[], stdin=StringIO())
+                mr_job_script='my_job.py', conf_paths=[], stdin=BytesIO())
             self.runner._steps = []  # don't actually run `my_job.py --steps`
             self.runner._add_job_files_for_upload()
 
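Note that many hunks above point log_to_stream() at a BytesIO. That is fine on Python 2, where str is bytes, but logging handlers write str on Python 3, so these spots are likely the "not entirely successful" part of the commit message. The text-mode shim from tests/py2.py reads as the intended follow-up (a guess, not stated in the diff):

    from tests.py2 import StringIO

    stderr = StringIO()  # safe target for logging on both 2 and 3
    log_to_stream('mrjob.emr', stderr)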
6 changes: 3 additions & 3 deletions tests/test_hadoop.py
@@ -13,10 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Test the hadoop job runner."""
-from StringIO import StringIO
 import getpass
 import os
 import pty
+from io import BytesIO
 from subprocess import CalledProcessError
 from subprocess import check_call
 
@@ -146,7 +146,7 @@ class HadoopJobRunnerEndToEndTestCase(MockHadoopTestCase):
 
     def _test_end_to_end(self, args=()):
         # read from STDIN, a local file, and a remote file
-        stdin = StringIO('foo\nbar\n')
+        stdin = BytesIO(b'foo\nbar\n')
 
         local_input_path = os.path.join(self.tmp_dir, 'input')
         with open(local_input_path, 'w') as local_input_file:
@@ -274,7 +274,7 @@ def setUp(self):
         super(StreamingArgsTestCase, self).setUp()
         self.runner = HadoopJobRunner(
             hadoop_bin='hadoop', hadoop_streaming_jar='streaming.jar',
-            mr_job_script='my_job.py', stdin=StringIO())
+            mr_job_script='my_job.py', stdin=BytesIO())
         self.runner._add_job_files_for_upload()
 
         self.runner._hadoop_version='0.20.204'
