Skip to content

Commit

Permalink
replaced mrjob_tar_gz with mrjob_zip. Hand-tested locally on EMR
Browse files Browse the repository at this point in the history
  • Loading branch information
David Marin committed Dec 2, 2016
1 parent 7d05296 commit 8c913db
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion docs/guides/configs-all-runners.rst
Expand Up @@ -41,7 +41,7 @@ options related to file uploading.
:set: all
:default: (automatic)

Should we automatically tar up the mrjob library and install it when we run
Should we automatically zip up the mrjob library and install it when we run
a job? By default, we do unless :mrjob-opt:`interpreter` is set.

Set this to ``False`` if you've already installed ``mrjob`` on your
Expand Down
8 changes: 4 additions & 4 deletions mrjob/dataproc.py
Expand Up @@ -462,7 +462,7 @@ def _add_bootstrap_files_for_upload(self):
Create the master bootstrap script if necessary.
"""
# lazily create mrjob.tar.gz
# lazily create mrjob.zip
if self._bootstrap_mrjob():
self._create_mrjob_zip()
self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)
Expand Down Expand Up @@ -846,7 +846,7 @@ def _create_master_bootstrap_script_if_needed(self):
if not (self._bootstrap or self._bootstrap_mrjob()):
return

# create mrjob.tar.gz if we need it, and add commands to install it
# create mrjob.zip if we need it, and add commands to install it
mrjob_bootstrap = []
if self._bootstrap_mrjob():
assert self._mrjob_zip_path
Expand All @@ -860,9 +860,9 @@ def _create_master_bootstrap_script_if_needed(self):
"'from distutils.sysconfig import get_python_lib;"
" print(get_python_lib())')" %
cmd_line(self._python_bin())])
# un-tar mrjob.tar.gz
# unzip mrjob.zip
mrjob_bootstrap.append(
['sudo tar xfz ', path_dict, ' -C $__mrjob_PYTHON_LIB'])
['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])
# re-compile pyc files now, since mappers/reducers can't
# write to this directory. Don't fail if there is extra
# un-compileable crud in the zip file (this would matter if
Expand Down
14 changes: 7 additions & 7 deletions mrjob/emr.py
Expand Up @@ -942,7 +942,7 @@ def _add_bootstrap_files_for_upload(self, persistent=False):
persistent -- set by make_persistent_cluster()
"""
# lazily create mrjob.tar.gz
# lazily create mrjob.zip
if self._bootstrap_mrjob():
self._create_mrjob_zip()
self._bootstrap_dir_mgr.add('file', self._mrjob_zip_path)
Expand Down Expand Up @@ -2403,7 +2403,7 @@ def _create_master_bootstrap_script_if_needed(self):
self._bootstrap_mrjob()):
return

# create mrjob.tar.gz if we need it, and add commands to install it
# create mrjob.zip if we need it, and add commands to install it
mrjob_bootstrap = []
if self._bootstrap_mrjob():
# _add_bootstrap_files_for_upload() should have done this
Expand All @@ -2418,9 +2418,9 @@ def _create_master_bootstrap_script_if_needed(self):
"'from distutils.sysconfig import get_python_lib;"
" print(get_python_lib())')" %
cmd_line(self._python_bin())])
# un-tar mrjob.tar.gz
# unzip mrjob.zip
mrjob_bootstrap.append(
['sudo tar xfz ', path_dict, ' -C $__mrjob_PYTHON_LIB'])
['sudo unzip ', path_dict, ' -d $__mrjob_PYTHON_LIB'])
# re-compile pyc files now, since mappers/reducers can't
# write to this directory. Don't fail if there is extra
# un-compileable crud in the zip file (this would matter if
Expand All @@ -2429,7 +2429,7 @@ def _create_master_bootstrap_script_if_needed(self):
['sudo %s -m compileall -f $__mrjob_PYTHON_LIB/mrjob && true' %
cmd_line(self._python_bin())])

# TODO: isn't it b.sh now?
# TODO: shouldn't it be b.sh now?
# we call the script b.py because there's a character limit on
# bootstrap script names (or there was at one time, anyway)
path = os.path.join(self._get_local_tmp_dir(), 'b.py')
Expand Down Expand Up @@ -3117,9 +3117,9 @@ def _pool_hash(self):
(pooling requires the exact same version of :py:mod:`mrjob` anyway).
"""
things_to_hash = [
# exclude mrjob.tar.gz because it's only created if the
# exclude mrjob.zip because it's only created if the
# job starts its own cluster (also, its hash changes every time
# since the tarball contains different timestamps).
# since the zip file contains different timestamps).
# The filenames/md5sums are sorted because we need to
# ensure the order they're added doesn't affect the hash
# here. Previously this used a dict, but Python doesn't
Expand Down
4 changes: 2 additions & 2 deletions mrjob/options.py
Expand Up @@ -267,14 +267,14 @@ def _port_range_callback(option, opt_str, value, parser):
switches=[
(['--bootstrap-mrjob'], dict(
action='store_true',
help=("Automatically tar up the mrjob library and install it"
help=("Automatically zip up the mrjob library and install it"
" when we run the mrjob. This is the default. Use"
" --no-bootstrap-mrjob if you've already installed"
" mrjob on your Hadoop cluster."),
)),
(['--no-bootstrap-mrjob'], dict(
action='store_false',
help=("Don't automatically tar up the mrjob library and"
help=("Don't automatically zip up the mrjob library and"
" install it when we run this job. Use this if you've"
" already installed mrjob on your Hadoop cluster."),
)),
Expand Down
8 changes: 5 additions & 3 deletions mrjob/runner.py
Expand Up @@ -886,9 +886,11 @@ def _create_setup_wrapper_script(
setup = self._setup

if self._bootstrap_mrjob() and self.BOOTSTRAP_MRJOB_IN_SETUP:
# patch setup to add mrjob.tar.gz to PYTYHONPATH
# patch setup to add mrjob.zip to PYTHONPATH
mrjob_zip = self._create_mrjob_zip()
path_dict = {'type': 'archive', 'name': None, 'path': mrjob_zip}
# this is a file, not an archive, since Python can import directly
# from .zip files
path_dict = {'type': 'file', 'name': None, 'path': mrjob_zip}
self._working_dir_mgr.add(**path_dict)
setup = [['export PYTHONPATH=', path_dict, ':$PYTHONPATH']] + setup

Expand Down Expand Up @@ -919,7 +921,7 @@ def _parse_setup(self):
:py:func:`mrjob.setup.parse_setup_cmd()`.
If *bootstrap_mrjob* and ``self.BOOTSTRAP_MRJOB_IN_SETUP`` are both
true, create mrjob.tar.gz (if it doesn't exist already) and
true, create mrjob.zip (if it doesn't exist already) and
prepend a setup command that adds it to PYTHONPATH.
Patch in *py_files*.
Expand Down
4 changes: 2 additions & 2 deletions mrjob/tools/emr/create_cluster.py
Expand Up @@ -50,11 +50,11 @@
File to upload to the master node before running
bootstrap_cmds (for example, debian packages). You can
use --bootstrap-file more than once.
--bootstrap-mrjob Automatically tar up the mrjob library and install it
--bootstrap-mrjob Automatically zip up the mrjob library and install it
when we run the mrjob. This is the default. Use --no-
bootstrap-mrjob if you've already installed mrjob on
your Hadoop cluster.
--no-bootstrap-mrjob Don't automatically tar up the mrjob library and
--no-bootstrap-mrjob Don't automatically zip up the mrjob library and
install it when we run this job. Use this if you've
already installed mrjob on your Hadoop cluster.
--bootstrap-python Attempt to install a compatible version of Python at
Expand Down

0 comments on commit 8c913db

Please sign in to comment.