Skip to content

Commit

Permalink
Merge pull request #1821 from davidmarin/image-id
Browse files Browse the repository at this point in the history
add image_id option (fixes #1805)
  • Loading branch information
David Marin committed Aug 21, 2018
2 parents 98f2d6b + 31ced21 commit 7e87065
Show file tree
Hide file tree
Showing 12 changed files with 146 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/_templates/indexsidebar.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ <h3>Quick Links</h3>
Config options (all runners)</a></li>
<li><a href="{{ pathto('guides/configs-hadoopy-runners') }}">
Config options (Hadoop)</a></li>
<li><a href="{{ pathto('guides/emr-opts') }}">
<li><a href="{{ pathto('guides/cloud-opts') }}">
Config options (cloud services)</a></li>
<ul>
<li><a href="{{ pathto('guides/emr-opts') }}">
Expand Down
24 changes: 24 additions & 0 deletions docs/guides/cloud-opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,30 @@ Number and type of instances
Cluster software configuration
------------------------------

.. mrjob-opt::
:config: image_id
:switch: --image-id
:type: :ref:`string <data-type-string>`
:set: cloud
:default: None

ID of a custom machine image.

On EMR, this is complementary to :mrjob-opt:`image_version`; you
can install packages and libraries on your custom AMI, but it's up to
EMR to install Hadoop, create the ``hadoop`` user, etc.
When using a custom AMI, :mrjob-opt:`image_version` must be at least 5.7.0.

For more details about how to create a custom AMI that works with EMR, see
`Best Practices and Considerations
<https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-custom-ami.html#emr-custom-ami-considerations>`_.

.. note::

This is not yet implemented in the Dataproc runner.

.. versionadded:: 0.6.5

.. mrjob-opt::
:config: image_version
:switch: --image-version
Expand Down
3 changes: 2 additions & 1 deletion docs/guides/emr-opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@ of instance configuration you use.
Cluster software configuration
------------------------------

See also :mrjob-opt:`image_version` and :mrjob-opt:`bootstrap`.
See also :mrjob-opt:`bootstrap`, :mrjob-opt:`image_id`, and
:mrjob-opt:`image_version`.

.. mrjob-opt::
:config: applications
Expand Down
1 change: 1 addition & 0 deletions mrjob/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ class HadoopInTheCloudJobRunner(MRJobBinRunner):
'core_instance_type',
'extra_cluster_params',
'hadoop_streaming_jar',
'image_id',
'image_version',
'instance_type',
'master_instance_type',
Expand Down
5 changes: 5 additions & 0 deletions mrjob/dataproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,11 @@ def __init__(self, **kwargs):
raise DataprocException(
'Dataproc v1 expects core/task instance types to be identical')

# see #1820
if self._opts['image_id']:
log.warning('mrjob does not yet support custom machine images'
' on Dataproc')

# load credentials and project ID
self._credentials, auth_project_id = google.auth.default(
scopes=[_FULL_SCOPE]) # needed for $GOOGLE_APPLICATION_CREDENTIALS
Expand Down
17 changes: 14 additions & 3 deletions mrjob/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
# for new (since October 10, 2012) accounts (see #1025)
_DEFAULT_EMR_REGION = 'us-west-2'

# default AMI to use on EMR. This will be updated with each version
# default AMI to use on EMR. This may be updated with each version
_DEFAULT_IMAGE_VERSION = '5.8.0'

# first AMI version that we can't run bash -e on (see #1548)
Expand Down Expand Up @@ -373,12 +373,16 @@ def __init__(self, **kwargs):

# check AMI version
if self._opts['image_version'].startswith('1.'):
log.warning('1.x AMIs will probably not work because they use'
log.warning('1.x AMIs will not work because they use'
' Python 2.5. Use a later AMI version or mrjob v0.4.2')
elif not version_gte(self._opts['image_version'], '2.4.3'):
log.warning("AMIs prior to 2.4.3 probably will not work because"
" they don't support Python 2.7. Use a later AMI"
" version or mrjob v0.5.11")
elif not self._image_version_gte('5.7.0'):
if self._opts['image_id']:
log.warning('AMIs prior to 5.7.0 will probably not work'
' with custom machine images')

if self._opts['emr_api_params'] is not None:
log.warning('emr_api_params is deprecated and does nothing.'
Expand Down Expand Up @@ -1152,6 +1156,9 @@ def _cluster_kwargs(self, persistent=False):
else:
kwargs['AmiVersion'] = self._opts['image_version']

if self._opts['image_id']:
kwargs['CustomAmiId'] = self._opts['image_id']

# capitalizing Instances because it's just an API parameter
kwargs['Instances'] = Instances = {}

Expand Down Expand Up @@ -2281,7 +2288,7 @@ def _usable_clusters(self, exclude=None, num_steps=1):
same setup as our own, that is:
- same bootstrap setup (including mrjob version)
- have the same AMI version
- have the same AMI version and custom AMI ID (if any)
- install the same applications (if we requested any)
- same number and type of instances
Expand Down Expand Up @@ -2391,6 +2398,10 @@ def add_if_match(cluster):
max_steps = map_version(
image_version, _IMAGE_VERSION_TO_MAX_STEPS)

if self._opts['image_id'] != cluster.get('CustomAmiId'):
log.debug(' custom image ID mismatch')
return

if self._opts['ebs_root_volume_gb']:
if 'EbsRootVolumeSize' not in cluster:
log.debug(' EBS root volume size not set')
Expand Down
3 changes: 3 additions & 0 deletions mrjob/examples/mr_phone_to_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
To find the latest crawl:
``aws s3 ls s3://commoncrawl/crawl-data/ | grep CC-MAIN``
WET data is often added after a release; usually the second-most recent
release is a safe bet.
"""
import re
from itertools import islice
Expand Down
10 changes: 9 additions & 1 deletion mrjob/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,11 +727,19 @@ def __call__(self, parser, namespace, value, option_string=None):
)),
],
),
image_id=dict(
cloud_role='launch',
switches=[
(['--image-id'], dict(
help='ID of custom AWS machine image (AMI) to use',
)),
],
),
image_version=dict(
cloud_role='launch',
switches=[
(['--image-version'], dict(
help='EMR/Dataproc machine image to launch clusters with',
help='version of EMR/Dataproc machine image to run',
)),
],
),
Expand Down
6 changes: 6 additions & 0 deletions tests/mock_boto3/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,12 @@ def _error(message):
'Must specify exactly one of the following:'
' release label, AMI version, or Hadoop version.')

# CustomAmiId
if kwargs.get('CustomAmiId'):
if not version_gte(running_ami_version, '5.7.0'):
raise _error('Custom AMI is not allowed')
cluster['CustomAmiId'] = kwargs.pop('CustomAmiId')

# Applications
hadoop_version = map_version(
running_ami_version, AMI_HADOOP_VERSION_UPDATES)
Expand Down
37 changes: 37 additions & 0 deletions tests/test_emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,43 @@ def test_hadoop_version_option_does_nothing(self):
self.assertEqual(runner.get_hadoop_version(), '2.7.3')


# Tests for how the --image-id switch is reflected on the launched cluster
# (as the 'CustomAmiId' field of the cluster description), and for what
# happens when it is combined with an image version / release label older
# than 5.7.0.
class CustomAmiTestCase(MockBoto3TestCase):

# Without --image-id, the described cluster must have no 'CustomAmiId' key.
def test_default(self):
with self.make_runner() as runner:
runner.run()

cluster = runner._describe_cluster()
self.assertNotIn('CustomAmiId', cluster)

# With --image-id, the same ID must show up as the cluster's 'CustomAmiId'.
def test_custom_ami(self):
with self.make_runner('--image-id', 'ami-blanchin') as runner:
runner.run()

cluster = runner._describe_cluster()
self.assertEqual(cluster['CustomAmiId'], 'ami-blanchin')

# --image-id together with --image-version 5.6.0: a warning is expected on
# the (patched) mrjob.emr logger once the runner exists, and run() is
# expected to raise ClientError.
def test_image_version_too_low(self):
# replace mrjob.emr's logger so calls to log.warning() can be inspected
log = self.start(patch('mrjob.emr.log'))

# must be at least 5.7.0
with self.make_runner('--image-id', 'ami-blanchin',
'--image-version', '5.6.0') as runner:
self.assertTrue(log.warning.called)

self.assertRaises(ClientError, runner.run)

# Same check as above, but the too-old version is given through
# --release-label ('emr-5.6.0') instead of --image-version.
def test_release_label_too_low(self):
# replace mrjob.emr's logger so calls to log.warning() can be inspected
log = self.start(patch('mrjob.emr.log'))

# must be at least 5.7.0
with self.make_runner('--image-id', 'ami-blanchin',
'--release-label', 'emr-5.6.0') as runner:
self.assertTrue(log.warning.called)

self.assertRaises(ClientError, runner.run)


class AvailabilityZoneTestCase(MockBoto3TestCase):

MRJOB_CONF_CONTENTS = {'runners': {'emr': {
Expand Down
43 changes: 43 additions & 0 deletions tests/test_emr_pooling.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,49 @@ def test_release_label_hides_ami_version(self):
'--release-label', 'emr-4.0.0',
'--image-version', '1.0.0'])

def test_pooling_with_custom_ami(self):
    # pooled cluster is created with a custom image ID; a job asking for
    # that same image ID should join it
    _, pooled_id = self.make_pooled_cluster(image_id='ami-blanchin')

    job_args = [
        '-r', 'emr', '--pool-clusters',
        '--image-id', 'ami-blanchin',
    ]
    self.assertJoins(pooled_id, job_args)

def test_dont_join_pool_with_wrong_custom_ami(self):
    # pooled cluster uses one custom image ID; a job asking for a
    # different one should not join it
    _, pooled_id = self.make_pooled_cluster(image_id='ami-blanchin')

    job_args = [
        '-r', 'emr', '--pool-clusters',
        '--image-id', 'ami-awake',
    ]
    self.assertDoesNotJoin(pooled_id, job_args)

def test_dont_join_pool_with_non_custom_ami(self):
    # pooled cluster is created without an image ID; a job that asks for
    # a custom image ID should not join it
    _, pooled_id = self.make_pooled_cluster()

    job_args = [
        '-r', 'emr', '--pool-clusters',
        '--image-id', 'ami-blanchin',
    ]
    self.assertDoesNotJoin(pooled_id, job_args)

def test_dont_join_pool_with_custom_ami_if_not_set(self):
    # pooled cluster is created with a custom image ID; a job that does
    # not pass --image-id should not join it
    _, pooled_id = self.make_pooled_cluster(image_id='ami-blanchin')

    job_args = ['-r', 'emr', '--pool-clusters']
    self.assertDoesNotJoin(pooled_id, job_args)

def test_join_pool_with_matching_custom_ami_and_ami_version(self):
    # pooled cluster: custom image ID plus image version 5.10.0; the job
    # asks for the same image ID and names the version via
    # --release-label, and should join
    _, pooled_id = self.make_pooled_cluster(
        image_id='ami-blanchin',
        image_version='5.10.0',
    )

    job_args = [
        '-r', 'emr', '--pool-clusters',
        '--image-id', 'ami-blanchin', '--release-label', 'emr-5.10.0',
    ]
    self.assertJoins(pooled_id, job_args)

def test_dont_join_pool_with_right_custom_ami_but_wrong_version(self):
    # pooled cluster: custom image ID plus image version 5.9.0; the job
    # asks for the same image ID but version 5.10.0, and should not join
    _, pooled_id = self.make_pooled_cluster(
        image_id='ami-blanchin',
        image_version='5.9.0',
    )

    job_args = [
        '-r', 'emr', '--pool-clusters',
        '--image-id', 'ami-blanchin', '--image-version', '5.10.0',
    ]
    self.assertDoesNotJoin(pooled_id, job_args)

def test_matching_applications(self):
_, cluster_id = self.make_pooled_cluster(
image_version='4.0.0', applications=['Mahout'])
Expand Down
1 change: 1 addition & 0 deletions tests/tools/emr/test_create_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def test_runner_kwargs(self):
'iam_endpoint': None,
'iam_instance_profile': None,
'iam_service_role': None,
'image_id': None,
'image_version': None,
'instance_fleets': None,
'instance_groups': None,
Expand Down

0 comments on commit 7e87065

Please sign in to comment.