go back to using "string" (per offline discussion with @buzkor)
David Marin committed May 6, 2015
1 parent 56395ef commit 8fe93e0
Showing 6 changed files with 45 additions and 46 deletions.
8 changes: 4 additions & 4 deletions mrjob/fs/hadoop.py
@@ -21,7 +21,7 @@
from subprocess import CalledProcessError

from mrjob.fs.base import Filesystem
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
@@ -85,7 +85,7 @@ def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
log_func = log.debug if proc.returncode == 0 else log.error
if not return_stdout:
for line in BytesIO(stdout):
-log_func('STDOUT: ' + to_text(line.rstrip(b'\r\n')))
+log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

# check if STDERR is okay
stderr_is_ok = False
@@ -97,7 +97,7 @@ def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,

if not stderr_is_ok:
for line in BytesIO(stderr):
-log_func('STDERR: ' + to_text(line.rstrip(b'\r\n')))
+log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

ok_returncodes = ok_returncodes or [0]

@@ -163,7 +163,7 @@ def ls(self, path_glob):
if not path_index:
raise IOError("Could not locate path in string %r" % line)

-path = to_text(line.split(b' ', path_index)[-1])
+path = to_string(line.split(b' ', path_index)[-1])
# handle fully qualified URIs from newer versions of Hadoop ls
# (see Pull Request #577)
if is_uri(path):
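For context on the ls() change above: `hadoop fs -ls` emits one status line per file, and the code splits off everything before a precomputed path_index so only the path remains. A rough illustration with hypothetical values (the sample line and index are assumptions, not part of this commit):

    # hypothetical `hadoop fs -ls` output line, single-space separated:
    line = b'-rw-r--r-- 3 dave supergroup 1366 2015-05-06 12:00 hdfs://nn/user/dave/part-00000'
    path_index = 7  # number of b' ' separators before the path field
    path = to_string(line.split(b' ', path_index)[-1])
    # path == 'hdfs://nn/user/dave/part-00000' -- a fully qualified URI,
    # hence the is_uri() check for newer Hadoop versions (Pull Request #577)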
6 changes: 3 additions & 3 deletions mrjob/hadoop.py
@@ -42,7 +42,7 @@
from mrjob.logparsers import scan_for_counters_in_files
from mrjob.logparsers import best_error_from_logs
from mrjob.parse import HADOOP_STREAMING_JAR_RE
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string
from mrjob.parse import is_uri
from mrjob.runner import MRJobRunner
from mrjob.runner import RunnerOptionStore
@@ -326,7 +326,7 @@ def _run_job_in_hadoop(self):

# there shouldn't be much output to STDOUT
for line in step_proc.stdout:
-log.error('STDOUT: ' + to_text(line.strip(b'\n')))
+log.error('STDOUT: ' + to_string(line.strip(b'\n')))

returncode = step_proc.wait()
else:
@@ -387,7 +387,7 @@ def treat_eio_as_eof(iter):

for line in treat_eio_as_eof(stderr):
line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
-log.info('HADOOP: ' + to_text(line))
+log.info('HADOOP: ' + to_string(line))

if b'Streaming Job Failed!' in line:
raise Exception(line)
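The HADOOP_STREAMING_OUTPUT_RE match above strips Hadoop's log prefix before re-logging the message; presumably group(2) is the message body. A hypothetical line for illustration (not taken from mrjob's actual regex or logs):

    # e.g. line = b'15/05/06 12:34:56 INFO streaming.StreamJob:  map 100%  reduce 100%'
    # after the match, group(2) would be something like b' map 100%  reduce 100%',
    # which to_string() decodes before it reaches log.info()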
28 changes: 14 additions & 14 deletions mrjob/parse.py
@@ -24,7 +24,7 @@
from mrjob.compat import uses_020_counters
from mrjob.py2 import PY2
from mrjob.py2 import ParseResult
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string
from mrjob.py2 import urlparse as urlparse_buggy

try:
@@ -215,7 +215,7 @@ def find_python_traceback(lines):

for line in lines:
# don't return bytes in Python 3
-line = to_text(line)
+line = to_string(line)

if in_traceback:
tb_lines.append(line)
@@ -278,7 +278,7 @@ def find_hadoop_java_stack_trace(lines):
if not line.startswith(b' at '):
break
st_lines.append(line)
-return [to_text(line) for line in st_lines]
+return [to_string(line) for line in st_lines]
else:
return None

@@ -301,7 +301,7 @@ def find_input_uri_for_mapper(lines):
for line in lines:
match = _OPENING_FOR_READING_RE.match(line)
if match:
-val = to_text(match.group(1))
+val = to_string(match.group(1))
return val


@@ -326,7 +326,7 @@ def find_interesting_hadoop_streaming_error(lines):
_HADOOP_STREAMING_ERROR_RE.match(line) or
_HADOOP_STREAMING_ERROR_RE_2.match(line))
if match:
-msg = to_text(match.group(1))
+msg = to_string(match.group(1))
if msg != 'Job not Successful!':
return msg
return None
@@ -379,7 +379,7 @@ def find_job_log_multiline_error(lines):
if line.strip() == b'"':
break
st_lines.append(line)
-return [to_text(line) for line in st_lines]
+return [to_string(line) for line in st_lines]
return None


@@ -444,8 +444,8 @@ def parse_mr_job_stderr(stderr, counters=None):
group, counter, amount_str = m.groups()

# don't leave these as bytes on Python 3
-group = to_text(group)
-counter = to_text(counter)
+group = to_string(group)
+counter = to_string(counter)

counters.setdefault(group, {})
counters[group].setdefault(counter, 0)
@@ -455,10 +455,10 @@
m = _STATUS_RE.match(line.rstrip(b'\r\n'))
if m:
# don't leave as bytes on Python 3
-statuses.append(to_text(m.group(1)))
+statuses.append(to_string(m.group(1)))
continue

-other.append(to_text(line))
+other.append(to_string(line))

return {'counters': counters, 'statuses': statuses, 'other': other}

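For reference, parse_mr_job_stderr() consumes the Hadoop Streaming reporter protocol, in which tasks report counters and statuses on stderr as `reporter:counter:<group>,<counter>,<amount>` and `reporter:status:<message>` lines. A minimal sketch of the round trip (illustrative input, not from this commit):

    from mrjob.parse import parse_mr_job_stderr

    out = parse_mr_job_stderr([
        b'reporter:counter:Word Stats,words,3\n',
        b'reporter:status:reducing\n',
        b'something unparseable\n',
    ])
    # with to_string() applied throughout, every value is text, not bytes:
    # out == {'counters': {'Word Stats': {'words': 3}},
    #         'statuses': ['reducing'],
    #         'other': ['something unparseable\n']}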
@@ -515,8 +515,8 @@ def _parse_counters_0_18(counter_string):
log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

for m in groups:
-yield (to_text(m.group('group')),
-to_text(m.group('name')),
+yield (to_string(m.group('group')),
+to_string(m.group('name')),
int(m.group('value')))


@@ -534,14 +534,14 @@ def _parse_counters_0_20(counter_string):
group_name = counter_unescape(group_name)
except ValueError:
log.warning("Could not decode group name %r" % group_name)
-group_name = to_text(group_name)
+group_name = to_string(group_name)

for counter_id, counter_name, counter_value in matches:
try:
counter_name = counter_unescape(counter_name)
except ValueError:
log.warning("Could not decode counter name %r" % counter_name)
-counter_name = to_text(counter_name)
+counter_name = to_string(counter_name)

yield group_name, counter_name, int(counter_value)

21 changes: 10 additions & 11 deletions mrjob/py2.py
@@ -23,7 +23,7 @@
Python 2, where str (bytes) and unicode can be used interchangeably.
So really our string datatypes fall into two categories, bytes, and
"text", which is either ``str`` (i.e., bytes) or ``unicode`` in Python 2,
"strings", which is either ``str`` (i.e., bytes) or ``unicode`` in Python 2,
and ``str`` (i.e. unicode) in Python 3.
These things should always be bytes:
@@ -41,7 +41,7 @@
Note that both Python 2.6+ and Python 3.3+ have the ``bytes`` type and
``b''`` constants built-in.
-These things should always be text:
+These things should always be strings:
- streams that you print() to (e.g. ``sys.stdout`` if you mock it out)
- streams that you log to
@@ -53,29 +53,29 @@
- Hadoop status messages
- anything else we parse out of log files
-These things are text because it makes for simpler code:
+These things are strings because it makes for simpler code:
- contents of config files
- contents of scripts output by mrjob (e.g. the setup wrapper script)
- contents of empty files
-Use the ``StringIO`` from this module to deal with text (it's
+Use the ``StringIO`` from this module to deal with strings (it's
``StringIO.StringIO`` in Python 2 and ``io.StringIO`` in Python 3).
Please use ``%`` for format strings and not ``format()``, which is much more
picky about mixing unicode and bytes.
This module provides a ``string_types`` tuple (name from the ``six`` library)
-so you can check if something is text.
+so you can check if something is a string.
We don't provide a ``unicode`` constant:
- Use ``not isinstance(..., bytes)`` to check if a string is Unicode
- To convert ``bytes`` to ``unicode``, use ``.decode('utf_8')`.
- Python 3.3+ has ``u''`` literals; please use sparingly
-If you need to convert bytes of unknown encoding to text (e.g. to ``print()``
-or log them), use ``to_text()``.
+If you need to convert bytes of unknown encoding to a string (e.g. to
+``print()`` or log them), use ``to_string()`` from this module.
Iterables
---------
@@ -151,11 +151,10 @@
ParseResult


-def to_text(s):
-"""On Python 3, convert ``bytes`` to ``str`` (i.e. unicode).
+def to_string(s):
+"""Convert ``bytes`` to ``str``, leaving ``unicode`` unchanged.
-(In Python 2, either ``str`` or ``unicode`` is acceptable as text, so
-do nothing.)
+(This means on Python 2, ``to_string()`` does nothing.)
Use this if you need to ``print()`` or log bytes of an unknown encoding,
or to parse strings out of bytes of unknown encoding (e.g. a log file).
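The docstring and the tests in tests/test_py2.py below pin down the behavior well enough to sketch it. This is a hedged reconstruction for the Python 3 case, not the verbatim mrjob implementation; the UTF-8-first, latin-1-fallback decode is an assumption implied by the b'caf\xe9' test:

    def to_string(s):
        # sketch of the Python 3 branch; on Python 2 it would return s unchanged
        if isinstance(s, bytes):
            try:
                return s.decode('utf_8')
            except UnicodeDecodeError:
                # latin-1 maps every byte, so undecodable input is never lost
                return s.decode('latin_1')
        elif isinstance(s, str):
            return s
        else:
            raise TypeError('expected bytes or str, not %r' % (s,))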
12 changes: 6 additions & 6 deletions mrjob/ssh.py
@@ -23,7 +23,7 @@
from subprocess import Popen
from subprocess import PIPE

-from mrjob.py2 import to_text
+from mrjob.py2 import to_string

SSH_PREFIX = 'ssh://'
SSH_LOG_ROOT = '/mnt/var/log/hadoop'
@@ -141,7 +141,7 @@ def ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):

cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
args = ['bash -c "%s"' % cmd]
-ips = to_text(check_output(
+ips = to_string(check_output(
*ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
return [ip for ip in ips.split('\n') if ip]

@@ -177,7 +177,7 @@ def ssh_ls(ssh_bin, address, ec2_key_pair_file, path, keyfile=None):
:param keyfile: Name of the EMR private key file on the master node in case
``path`` exists on one of the slave nodes
"""
-out = to_text(check_output(*ssh_run_with_recursion(
+out = to_string(check_output(*ssh_run_with_recursion(
ssh_bin, address, ec2_key_pair_file, keyfile,
['find', '-L', path, '-type', 'f'])))
if 'No such file or directory' in out:
@@ -198,7 +198,7 @@ def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
:return: ``True`` if successful, ``False`` if no job was running
"""
-job_list_out = to_text(check_output(*ssh_run(
+job_list_out = to_string(check_output(*ssh_run(
ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
job_list_lines = job_list_out.splitlines()

@@ -218,9 +218,9 @@ def job_list_output_error():
job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
if not job_info_match:
job_list_output_error()
-job_id = to_text(job_info_match.group(1))
+job_id = to_string(job_info_match.group(1))

-job_kill_out = to_text(check_output(*ssh_run(
+job_kill_out = to_string(check_output(*ssh_run(
ssh_bin, address, ec2_key_pair_file,
['hadoop', 'job', '-kill', job_id])))

16 changes: 8 additions & 8 deletions tests/test_py2.py
@@ -13,27 +13,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from mrjob.py2 import to_text
+from mrjob.py2 import to_string

from tests.py2 import TestCase


-class ToTextTestCase(TestCase):
+class ToStringTestCase(TestCase):

def test_None(self):
-self.assertRaises(TypeError, to_text, None)
+self.assertRaises(TypeError, to_string, None)

def test_ascii_bytes(self):
-self.assertEqual(to_text(b'foo'), 'foo')
+self.assertEqual(to_string(b'foo'), 'foo')

def test_utf_8_bytes(self):
-self.assertEqual(to_text(b'caf\xc3\xa9'), 'café')
+self.assertEqual(to_string(b'caf\xc3\xa9'), 'café')

def test_latin_1_bytes(self):
-self.assertEqual(to_text(b'caf\xe9'), 'caf\xe9')
+self.assertEqual(to_string(b'caf\xe9'), 'caf\xe9')

def test_ascii_unicode(self):
-self.assertEqual(to_text(u'foo'), u'foo')
+self.assertEqual(to_string(u'foo'), u'foo')

def test_non_ascii_unicode(self):
-self.assertEqual(to_text(u'café'), u'café')
+self.assertEqual(to_string(u'café'), u'café')
