Skip to content

Commit

Permalink
Merge pull request #1817 from davidmarin/ebs-root-volume-gb
Browse files Browse the repository at this point in the history
add ebs_root_volume_gb option (fixes #1812)
  • Loading branch information
David Marin committed Aug 20, 2018
2 parents b61e7dc + 279f4e2 commit 98f2d6b
Show file tree
Hide file tree
Showing 9 changed files with 133 additions and 3 deletions.
20 changes: 19 additions & 1 deletion docs/guides/emr-opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ files takes precedence, and the command line beats config files. In
the case of a tie, `instance_fleets` beats `instance_groups` beats
other instance options.

You may set :mrjob-opt:`ebs_root_volume_gb` regardless of which style
of instance configuration you use.


.. mrjob-opt::
:config: instance_fleets
:switch: --instance-fleet
Expand Down Expand Up @@ -161,7 +165,8 @@ other instance options.
string on the command line or use data structures in the config file
(which is itself basically JSON).

This is the primary way to configure EBS volumes. For example:
This allows for more fine-tuned EBS volume configuration than
:mrjob-opt:`ebs_root_volume_gb`. For example:

.. code-block:: yaml
Expand Down Expand Up @@ -230,6 +235,19 @@ other instance options.

This option used to be named *ec2_task_instance_bid_price*.

.. mrjob-opt::
:config: ebs_root_volume_gb
:switch: --ebs-root-volume-gb
:type: integer
:set: emr
:default: ``None``

When specified (and not zero), sets the size of the root EBS volume,
in GiB.

.. versionadded:: 0.6.5


Cluster software configuration
------------------------------

Expand Down
12 changes: 11 additions & 1 deletion docs/guides/pooling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,17 @@ additional jobs, and eventually shut itself down after it has been idle
for a certain amount of time (by default, ten minutes; see
:mrjob-opt:`max_mins_idle`).

.. note::
.. warning::

When using cluster pooling prior to v0.6.0, make sure to set
:mrjob-opt:`max_hours_idle`, or your cluster will never shut down.

.. note::

Pooling is a way to reduce latency, not to save money. Though
pooling was originally created to optimize AWS's practice of billing by
the full hour, this `ended in October 2017 <https://aws.amazon.com/about-aws/whats-new/2017/10/amazon-emr-now-supports-per-second-billing/>`_.

Pooling is designed so that jobs run against the same :py:mod:`mrjob.conf` can
share the same clusters. This means that the version of :py:mod:`mrjob` and
bootstrap configuration. Other options that affect which cluster a job can
Expand Down Expand Up @@ -46,6 +52,10 @@ Pooling is also somewhat flexible about EBS volumes (see
but larger volumes or volumes with more I/O ops per second are acceptable,
as are additional volumes of any type.

Pooling cannot match configurations with explicitly set
:mrjob-opt:`ebs_root_volume_gb` against clusters that use the default (or vice
versa) because the EMR API does not report what the default value is.

If you are using :mrjob-opt:`instance_fleets`, your jobs will only join other
clusters which use instance fleets. The rules are similar, but jobs will
only join clusters whose fleets use the same set of instances or a subset;
Expand Down
2 changes: 1 addition & 1 deletion mrjob/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,4 @@
'Andrea Zonca <andrea.zonca@gmail.com>',
]

__version__ = '0.6.4'
__version__ = '0.6.5.dev0'
18 changes: 18 additions & 0 deletions mrjob/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ class EMRJobRunner(HadoopInTheCloudJobRunner, LogInterpretationMixin):
'bootstrap_spark',
'cloud_log_dir',
'core_instance_bid_price',
'ebs_root_volume_gb',
'ec2_key_pair',
'ec2_key_pair_file',
'emr_action_on_failure',
Expand Down Expand Up @@ -1162,6 +1163,10 @@ def _cluster_kwargs(self, persistent=False):
else:
Instances['InstanceGroups'] = self._instance_groups()

# EBS Root volume size
if self._opts['ebs_root_volume_gb']:
kwargs['EbsRootVolumeSize'] = self._opts['ebs_root_volume_gb']

# bootstrap actions
kwargs['BootstrapActions'] = BootstrapActions = []

Expand Down Expand Up @@ -2386,6 +2391,19 @@ def add_if_match(cluster):
max_steps = map_version(
image_version, _IMAGE_VERSION_TO_MAX_STEPS)

if self._opts['ebs_root_volume_gb']:
if 'EbsRootVolumeSize' not in cluster:
log.debug(' EBS root volume size not set')
return
elif (cluster['EbsRootVolumeSize'] <
self._opts['ebs_root_volume_gb']):
log.debug(' EBS root volume size too small')
return
else:
if 'EbsRootVolumeSize' in cluster:
log.debug(' uses non-default EBS root volume size')
return

applications = self._applications()
if applications:
# use case-insensitive mapping (see #1417)
Expand Down
10 changes: 10 additions & 0 deletions mrjob/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,16 @@ def __call__(self, parser, namespace, value, option_string=None):
)),
],
),
ebs_root_volume_gb=dict(
cloud_role='launch',
switches=[
(['--ebs-root-volume-gb'], dict(
help=('Size of root EBS volume, in GiB. Must be an integer.'
'Set to 0 to use the default'),
type=int,
)),
],
),
ec2_key_pair=dict(
cloud_role='launch',
switches=[
Expand Down
7 changes: 7 additions & 0 deletions tests/mock_boto3/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,13 @@ def _error(message):
_validate_param(kwargs, 'VisibleToAllUsers', bool)
cluster['VisibleToAllUsers'] = kwargs.pop('VisibleToAllUsers')

# EbsRootVolumeSize
if 'EbsRootVolumeSize' in kwargs:
# whatever the default EbsRootVolumeSize is, the API doesn't
# report it if you don't set it
_validate_param(kwargs, 'EbsRootVolumeSize', integer_types)
cluster['EbsRootVolumeSize'] = kwargs.pop('EbsRootVolumeSize')

# pass BootstrapActions off to helper
if 'BootstrapActions' in kwargs:
self._add_bootstrap_actions(
Expand Down
26 changes: 26 additions & 0 deletions tests/test_emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,32 @@ def test_dont_take_down_cluster_on_failure(self):
self.assertEqual(cluster['Status']['State'], 'WAITING')


class EbsRootVolumeGBTestCase(MockBoto3TestCase):

def test_defaults(self):
cluster = self.run_and_get_cluster()
self.assertNotIn('EbsRootVolumeSize', cluster)

def test_set_to_integer(self):
cluster = self.run_and_get_cluster('--ebs-root-volume-gb', '999')
self.assertEqual(cluster['EbsRootVolumeSize'], 999)

def test_zero_means_default(self):
cluster = self.run_and_get_cluster('--ebs-root-volume-gb', '0')
self.assertNotIn('EbsRootVolumeSize', cluster)

def test_must_be_integer_on_cmd_line(self):
self.assertRaises(
ValueError,
self.run_and_get_cluster, '--ebs-root-volume-gb', '99.99')

def test_must_be_integer_in_config(self):
BAD_MRJOB_CONF = {'runners': {'emr': {'ebs_root_volume_gb': 99.99}}}

with mrjob_conf_patcher(BAD_MRJOB_CONF):
self.assertRaises(ParamValidationError, self.run_and_get_cluster)


class VisibleToAllUsersTestCase(MockBoto3TestCase):

def test_defaults(self):
Expand Down
40 changes: 40 additions & 0 deletions tests/test_emr_pooling.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,46 @@ def test_unknown_instance_type_against_other_instance_types(self):
'--instance-type', 'a1.sauce',
'--num-core-instances', '2'])

# note that ebs_root_volume_gb is independent from EBS config on
# instance fleets and instance groups

def test_join_cluster_with_same_ebs_root_volume_gb(self):
_, cluster_id = self.make_pooled_cluster(
ebs_root_volume_gb=123)

self.assertJoins(cluster_id, [
'-r', 'emr', '--pool-clusters',
'--ebs-root-volume-gb', '123'])

def test_join_cluster_with_larger_ebs_root_volume_gb(self):
_, cluster_id = self.make_pooled_cluster(
ebs_root_volume_gb=456)

self.assertJoins(cluster_id, [
'-r', 'emr', '--pool-clusters',
'--ebs-root-volume-gb', '123'])

def test_dont_join_cluster_with_smaller_ebs_root_volume_gb(self):
_, cluster_id = self.make_pooled_cluster(
ebs_root_volume_gb=11)

self.assertDoesNotJoin(cluster_id, [
'-r', 'emr', '--pool-clusters',
'--ebs-root-volume-gb', '123'])

def test_dont_join_cluster_with_default_ebs_root_volume_gb(self):
_, cluster_id = self.make_pooled_cluster()

self.assertDoesNotJoin(cluster_id, [
'-r', 'emr', '--pool-clusters',
'--ebs-root-volume-gb', '123'])

def test_dont_join_cluster_with_non_default_ebs_root_volume_gb(self):
_, cluster_id = self.make_pooled_cluster(
ebs_root_volume_gb=123)

self.assertDoesNotJoin(cluster_id, ['-r', 'emr', '--pool-clusters'])

def _ig_with_ebs_config(
self, device_configs=(), iops=None,
num_volumes=None,
Expand Down
1 change: 1 addition & 0 deletions tests/tools/emr/test_create_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def test_runner_kwargs(self):
'conf_paths': None,
'core_instance_bid_price': None,
'core_instance_type': None,
'ebs_root_volume_gb': None,
'ec2_key_pair': None,
'emr_api_params': None,
'emr_configurations': None,
Expand Down

0 comments on commit 98f2d6b

Please sign in to comment.