go back to using "string" (per offline discussion with @buzkor)
David Marin committed May 6, 2015
1 parent 56395ef commit 8fe93e0
Showing 6 changed files with 45 additions and 46 deletions.
8 changes: 4 additions & 4 deletions mrjob/fs/hadoop.py
@@ -21,7 +21,7 @@
from subprocess import CalledProcessError

from mrjob.fs.base import Filesystem
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
@@ -85,7 +85,7 @@ def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
log_func = log.debug if proc.returncode == 0 else log.error
if not return_stdout:
for line in BytesIO(stdout):
-log_func('STDOUT: ' + to_text(line.rstrip(b'\r\n')))
+log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

# check if STDERR is okay
stderr_is_ok = False
@@ -97,7 +97,7 @@ def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,

if not stderr_is_ok:
for line in BytesIO(stderr):
-log_func('STDERR: ' + to_text(line.rstrip(b'\r\n')))
+log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

ok_returncodes = ok_returncodes or [0]

@@ -163,7 +163,7 @@ def ls(self, path_glob):
if not path_index:
raise IOError("Could not locate path in string %r" % line)

-path = to_text(line.split(b' ', path_index)[-1])
+path = to_string(line.split(b' ', path_index)[-1])
# handle fully qualified URIs from newer versions of Hadoop ls
# (see Pull Request #577)
if is_uri(path):
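For context on the ls() change above: `hadoop fs -ls` emits one status line per file, and the code splits off everything before a precomputed path_index so only the path remains. A rough illustration with hypothetical values (the sample line and index are assumptions, not part of this commit):

    # hypothetical `hadoop fs -ls` output line, single-space separated:
    line = b'-rw-r--r-- 3 dave supergroup 1366 2015-05-06 12:00 hdfs://nn/user/dave/part-00000'
    path_index = 7  # number of b' ' separators before the path field
    path = to_string(line.split(b' ', path_index)[-1])
    # path == 'hdfs://nn/user/dave/part-00000' -- a fully qualified URI,
    # hence the is_uri() check for newer Hadoop versions (Pull Request #577)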
6 changes: 3 additions & 3 deletions mrjob/hadoop.py
@@ -42,7 +42,7 @@
from mrjob.logparsers import scan_for_counters_in_files
from mrjob.logparsers import best_error_from_logs
from mrjob.parse import HADOOP_STREAMING_JAR_RE
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string
from mrjob.parse import is_uri
from mrjob.runner import MRJobRunner
from mrjob.runner import RunnerOptionStore
@@ -326,7 +326,7 @@ def _run_job_in_hadoop(self):

# there shouldn't be much output to STDOUT
for line in step_proc.stdout:
-log.error('STDOUT: ' + to_text(line.strip(b'\n')))
+log.error('STDOUT: ' + to_string(line.strip(b'\n')))

returncode = step_proc.wait()
else:
@@ -387,7 +387,7 @@ def treat_eio_as_eof(iter):

for line in treat_eio_as_eof(stderr):
line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
-log.info('HADOOP: ' + to_text(line))
+log.info('HADOOP: ' + to_string(line))

if b'Streaming Job Failed!' in line:
raise Exception(line)
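The HADOOP_STREAMING_OUTPUT_RE match above strips Hadoop's log prefix before re-logging the message; presumably group(2) is the message body. A hypothetical line for illustration (not taken from mrjob's actual regex or logs):

    # e.g. line = b'15/05/06 12:34:56 INFO streaming.StreamJob:  map 100%  reduce 100%'
    # after the match, group(2) would be something like b' map 100%  reduce 100%',
    # which to_string() decodes before it reaches log.info()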
28 changes: 14 additions & 14 deletions mrjob/parse.py
@@ -24,7 +24,7 @@
from mrjob.compat import uses_020_counters
from mrjob.py2 import PY2
from mrjob.py2 import ParseResult
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string
from mrjob.py2 import urlparse as urlparse_buggy

try:
@@ -215,7 +215,7 @@ def find_python_traceback(lines):

for line in lines:
# don't return bytes in Python 3
-line = to_text(line)
+line = to_string(line)

if in_traceback:
tb_lines.append(line)
@@ -278,7 +278,7 @@ def find_hadoop_java_stack_trace(lines):
if not line.startswith(b' at '):
break
st_lines.append(line)
-return [to_text(line) for line in st_lines]
+return [to_string(line) for line in st_lines]
else:
return None

@@ -301,7 +301,7 @@ def find_input_uri_for_mapper(lines):
for line in lines:
match = _OPENING_FOR_READING_RE.match(line)
if match:
-val = to_text(match.group(1))
+val = to_string(match.group(1))
return val


@@ -326,7 +326,7 @@ def find_interesting_hadoop_streaming_error(lines):
_HADOOP_STREAMING_ERROR_RE.match(line) or
_HADOOP_STREAMING_ERROR_RE_2.match(line))
if match:
-msg = to_text(match.group(1))
+msg = to_string(match.group(1))
if msg != 'Job not Successful!':
return msg
return None
@@ -379,7 +379,7 @@ def find_job_log_multiline_error(lines):
if line.strip() == b'"':
break
st_lines.append(line)
-return [to_text(line) for line in st_lines]
+return [to_string(line) for line in st_lines]
return None


@@ -444,8 +444,8 @@ def parse_mr_job_stderr(stderr, counters=None):
group, counter, amount_str = m.groups()

# don't leave these as bytes on Python 3
-group = to_text(group)
-counter = to_text(counter)
+group = to_string(group)
+counter = to_string(counter)

counters.setdefault(group, {})
counters[group].setdefault(counter, 0)
@@ -455,10 +455,10 @@
m = _STATUS_RE.match(line.rstrip(b'\r\n'))
if m:
# don't leave as bytes on Python 3
-statuses.append(to_text(m.group(1)))
+statuses.append(to_string(m.group(1)))
continue

-other.append(to_text(line))
+other.append(to_string(line))

return {'counters': counters, 'statuses': statuses, 'other': other}

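For reference, parse_mr_job_stderr() consumes the Hadoop Streaming reporter protocol, in which tasks report counters and statuses on stderr as `reporter:counter:<group>,<counter>,<amount>` and `reporter:status:<message>` lines. A minimal sketch of the round trip (illustrative input, not from this commit):

    from mrjob.parse import parse_mr_job_stderr

    out = parse_mr_job_stderr([
        b'reporter:counter:Word Stats,words,3\n',
        b'reporter:status:reducing\n',
        b'something unparseable\n',
    ])
    # with to_string() applied throughout, every value is text, not bytes:
    # out == {'counters': {'Word Stats': {'words': 3}},
    #         'statuses': ['reducing'],
    #         'other': ['something unparseable\n']}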
@@ -515,8 +515,8 @@ def _parse_counters_0_18(counter_string):
log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

for m in groups:
-yield (to_text(m.group('group')),
-to_text(m.group('name')),
+yield (to_string(m.group('group')),
+to_string(m.group('name')),
int(m.group('value')))


@@ -534,14 +534,14 @@ def _parse_counters_0_20(counter_string):
group_name = counter_unescape(group_name)
except ValueError:
log.warning("Could not decode group name %r" % group_name)
-group_name = to_text(group_name)
+group_name = to_string(group_name)

for counter_id, counter_name, counter_value in matches:
try:
counter_name = counter_unescape(counter_name)
except ValueError:
log.warning("Could not decode counter name %r" % counter_name)
-counter_name = to_text(counter_name)
+counter_name = to_string(counter_name)

yield group_name, counter_name, int(counter_value)

21 changes: 10 additions & 11 deletions mrjob/py2.py
@@ -23,7 +23,7 @@
Python 2, where str (bytes) and unicode can be used interchangeably.
So really our string datatypes fall into two categories, bytes, and
"text", which is either ``str`` (i.e., bytes) or ``unicode`` in Python 2,
"strings", which is either ``str`` (i.e., bytes) or ``unicode`` in Python 2,
and ``str`` (i.e. unicode) in Python 3.
These things should always be bytes:
@@ -41,7 +41,7 @@
Note that both Python 2.6+ and Python 3.3+ have the ``bytes`` type and
``b''`` constants built-in.
-These things should always be text:
+These things should always be strings:
- streams that you print() to (e.g. ``sys.stdout`` if you mock it out)
- streams that you log to
@@ -53,29 +53,29 @@
- Hadoop status messages
- anything else we parse out of log files
-These things are text because it makes for simpler code:
+These things are strings because it makes for simpler code:
- contents of config files
- contents of scripts output by mrjob (e.g. the setup wrapper script)
- contents of empty files
-Use the ``StringIO`` from this module to deal with text (it's
+Use the ``StringIO`` from this module to deal with strings (it's
``StringIO.StringIO`` in Python 2 and ``io.StringIO`` in Python 3).
Please use ``%`` for format strings and not ``format()``, which is much more
picky about mixing unicode and bytes.
This module provides a ``string_types`` tuple (name from the ``six`` library)
-so you can check if something is text.
+so you can check if something is a string.
We don't provide a ``unicode`` constant:
- Use ``not isinstance(..., bytes)`` to check if a string is Unicode
- To convert ``bytes`` to ``unicode``, use ``.decode('utf_8')`.
- Python 3.3+ has ``u''`` literals; please use sparingly
-If you need to convert bytes of unknown encoding to text (e.g. to ``print()``
-or log them), use ``to_text()``.
+If you need to convert bytes of unknown encoding to a string (e.g. to
+``print()`` or log them), use ``to_string()`` from this module.
Iterables
---------
@@ -151,11 +151,10 @@
ParseResult


-def to_text(s):
-"""On Python 3, convert ``bytes`` to ``str`` (i.e. unicode).
+def to_string(s):
+"""Convert ``bytes`` to ``str``, leaving ``unicode`` unchanged.
-(In Python 2, either ``str`` or ``unicode`` is acceptable as text, so
-do nothing.)
+(This means on Python 2, ``to_string()`` does nothing.)
Use this if you need to ``print()`` or log bytes of an unknown encoding,
or to parse strings out of bytes of unknown encoding (e.g. a log file).
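The docstring and the tests in tests/test_py2.py below pin down the behavior well enough to sketch it. This is a hedged reconstruction for the Python 3 case, not the verbatim mrjob implementation; the UTF-8-first, latin-1-fallback decode is an assumption implied by the b'caf\xe9' test:

    def to_string(s):
        # sketch of the Python 3 branch; on Python 2 it would return s unchanged
        if isinstance(s, bytes):
            try:
                return s.decode('utf_8')
            except UnicodeDecodeError:
                # latin-1 maps every byte, so undecodable input is never lost
                return s.decode('latin_1')
        elif isinstance(s, str):
            return s
        else:
            raise TypeError('expected bytes or str, not %r' % (s,))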
12 changes: 6 additions & 6 deletions mrjob/ssh.py
@@ -23,7 +23,7 @@
from subprocess import Popen
from subprocess import PIPE

-from mrjob.py2 import to_text
+from mrjob.py2 import to_string

SSH_PREFIX = 'ssh://'
SSH_LOG_ROOT = '/mnt/var/log/hadoop'
@@ -141,7 +141,7 @@ def ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):

cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
args = ['bash -c "%s"' % cmd]
-ips = to_text(check_output(
+ips = to_string(check_output(
*ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
return [ip for ip in ips.split('\n') if ip]

@@ -177,7 +177,7 @@ def ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None):
:param keyfile: Name of the EMR private key file on the master node in case
``path`` exists on one of the slave nodes
"""
-out = to_text(check_output(*ssh_run_with_recursion(
+out = to_string(check_output(*ssh_run_with_recursion(
ssh_bin, address, ec2_key_pair_file, keyfile,
['find', '-L', path, '-type', 'f'])))
if 'No such file or directory' in out:
@@ -198,7 +198,7 @@ def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
:return: ``True`` if successful, ``False`` if no job was running
"""
-job_list_out = to_text(check_output(*ssh_run(
+job_list_out = to_string(check_output(*ssh_run(
ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
job_list_lines = job_list_out.splitlines()

@@ -218,9 +218,9 @@ def job_list_output_error():
job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
if not job_info_match:
job_list_output_error()
-job_id = to_text(job_info_match.group(1))
+job_id = to_string(job_info_match.group(1))

-job_kill_out = to_text(check_output(*ssh_run(
+job_kill_out = to_string(check_output(*ssh_run(
ssh_bin, address, ec2_key_pair_file,
['hadoop', 'job', '-kill', job_id])))

16 changes: 8 additions & 8 deletions tests/test_py2.py
@@ -13,27 +13,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string

from tests.py2 import TestCase


-class ToTextTestCase(TestCase):
+class ToStringTestCase(TestCase):

def test_None(self):
-self.assertRaises(TypeError, to_text, None)
+self.assertRaises(TypeError, to_string, None)

def test_ascii_bytes(self):
-self.assertEqual(to_text(b'foo'), 'foo')
+self.assertEqual(to_string(b'foo'), 'foo')

def test_utf_8_bytes(self):
-self.assertEqual(to_text(b'caf\xc3\xa9'), 'café')
+self.assertEqual(to_string(b'caf\xc3\xa9'), 'café')

def test_latin_1_bytes(self):
-self.assertEqual(to_text(b'caf\xe9'), 'caf\xe9')
+self.assertEqual(to_string(b'caf\xe9'), 'caf\xe9')

def test_ascii_unicode(self):
-self.assertEqual(to_text(u'foo'), u'foo')
+self.assertEqual(to_string(u'foo'), u'foo')

def test_non_ascii_unicode(self):
-self.assertEqual(to_text(u'café'), u'café')
+self.assertEqual(to_string(u'café'), u'café')
