Skip to content

Commit

Permalink
Merge pull request #976 from davidmarin/emr_action_on_failure
Browse files Browse the repository at this point in the history
--emr-action-on-failure
  • Loading branch information
Zach Musgrave committed Mar 12, 2015
2 parents 98cf82c + 8d4caeb commit dc05321
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 6 deletions.
21 changes: 21 additions & 0 deletions docs/guides/emr-opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,26 @@ Number and type of instances
Choosing/creating a job flow to join
------------------------------------

.. mrjob-opt::
:config: emr_action_on_failure
:switch: --emr-action-on-failure
:type: :ref:`string <data-type-string>`
:set: emr
:default: (automatic)

What happens if a step of your job fails.

* ``'CANCEL_AND_WAIT'`` cancels all steps on the job flow
* ``'CONTINUE'`` continues to the next step (useful when submitting several
jobs to the same job flow)
* ``'TERMINATE_CLUSTER'`` shuts down the job flow entirely

The default is ``'CANCEL_AND_WAIT'`` when using pooling (see
:mrjob-opt:`pool_emr_job_flows`) or an existing job flow (see
:mrjob-opt:`emr_job_flow_id`), and ``'TERMINATE_CLUSTER'`` otherwise.

.. versionadded:: 0.4.3

.. mrjob-opt::
:config: emr_job_flow_id
:switch: --emr-job-flow-id
Expand Down Expand Up @@ -513,6 +533,7 @@ Choosing/creating a job flow to join
flow every 30 seconds until this many minutes have passed, then start a new
job flow instead of joining one.


S3 paths and options
--------------------
MRJob uses boto to manipulate/access S3. Older versions of boto prior to 2.25.0
Expand Down
7 changes: 5 additions & 2 deletions mrjob/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ class EMRRunnerOptionStore(RunnerOptionStore):
'emr_endpoint',
'emr_job_flow_id',
'emr_job_flow_pool_name',
'emr_action_on_failure',
'enable_emr_debugging',
'hadoop_streaming_jar_on_emr',
'hadoop_version',
Expand Down Expand Up @@ -1296,11 +1297,13 @@ def _job_flow_args(self, persistent=False, steps=None):
@property
def _action_on_failure(self):
    """Return the EMR ActionOnFailure value to attach to submitted steps.

    Precedence:

    1. The explicit :mrjob-opt:`emr_action_on_failure` option, if set.
    2. ``'CANCEL_AND_WAIT'`` when joining a job flow we don't own
       (an explicit :mrjob-opt:`emr_job_flow_id` or job-flow pooling) —
       we must not shut down other people's job flows.
    3. ``'TERMINATE_CLUSTER'`` for a job flow we created ourselves.
    """
    # an explicitly configured action always wins
    if self._opts['emr_action_on_failure']:
        return self._opts['emr_action_on_failure']
    # don't terminate other people's job flows
    elif (self._opts['emr_job_flow_id'] or
          self._opts['pool_emr_job_flows']):
        return 'CANCEL_AND_WAIT'
    else:
        return 'TERMINATE_CLUSTER'

def _build_steps(self):
"""Return a list of boto Step objects corresponding to the
Expand Down
5 changes: 5 additions & 0 deletions mrjob/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,11 @@ def add_emr_opts(opt_group):
'--emr-job-flow-id', dest='emr_job_flow_id', default=None,
help='ID of an existing EMR job flow to use'),

opt_group.add_option(
'--emr-action-on-failure', dest='emr_action_on_failure', default=None,
help=('Action to take when a step fails'
' (e.g. TERMINATE_CLUSTER | CANCEL_AND_WAIT | CONTINUE)')),

opt_group.add_option(
'--enable-emr-debugging', dest='enable_emr_debugging',
default=None, action='store_true',
Expand Down
8 changes: 5 additions & 3 deletions tests/mockboto.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ def run_jobflow(self,
name, log_uri, ec2_keyname=None, availability_zone=None,
master_instance_type='m1.small',
slave_instance_type='m1.small', num_instances=1,
action_on_failure='TERMINATE_JOB_FLOW', keep_alive=False,
action_on_failure='TERMINATE_CLUSTER', keep_alive=False,
enable_debugging=False,
hadoop_version=None,
steps=None,
Expand Down Expand Up @@ -558,7 +558,7 @@ def make_fake_action(real_action):

if enable_debugging:
debugging_step = JarStep(name='Setup Hadoop Debugging',
action_on_failure='TERMINATE_JOB_FLOW',
action_on_failure='TERMINATE_CLUSTER',
main_class=None,
jar=EmrConnection.DebuggingJar,
step_args=EmrConnection.DebuggingArgs)
Expand Down Expand Up @@ -746,7 +746,9 @@ def simulate_progress(self, jobflow_id, now=None):
reason = self.mock_emr_failures[(jobflow_id, step_num)]
if reason:
job_flow.reason = reason
if step.actiononfailure == 'TERMINATE_JOB_FLOW':
# TERMINATE_JOB_FLOW is the old name for TERMINATE_CLUSTER
if step.actiononfailure in (
'TERMINATE_CLUSTER','TERMINATE_JOB_FLOW'):
job_flow.state = 'SHUTTING_DOWN'
if not reason:
job_flow.reason = 'Shut down as step failed'
Expand Down
31 changes: 31 additions & 0 deletions tests/test_emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3453,3 +3453,34 @@ def test_input_output_interpolation(self):
streaming_input_arg = streaming_args[
streaming_args.index('-input') + 1]
self.assertEqual(jar_output_arg, streaming_input_arg)


class ActionOnFailureTestCase(MockEMRAndS3TestCase):
    """Verify defaults and overrides for the runner's _action_on_failure."""

    def test_default(self):
        # a job flow we create ourselves may be shut down on failure
        self.assertEqual(
            EMRJobRunner()._action_on_failure, 'TERMINATE_CLUSTER')

    def test_default_with_job_flow_id(self):
        # never terminate a job flow we merely joined
        self.assertEqual(
            EMRJobRunner(emr_job_flow_id='j-JOBFLOW')._action_on_failure,
            'CANCEL_AND_WAIT')

    def test_default_with_pooling(self):
        # pooled job flows are shared, so cancel and wait instead
        self.assertEqual(
            EMRJobRunner(pool_emr_job_flows=True)._action_on_failure,
            'CANCEL_AND_WAIT')

    def test_option(self):
        # an explicit emr_action_on_failure option overrides the default
        self.assertEqual(
            EMRJobRunner(emr_action_on_failure='CONTINUE')._action_on_failure,
            'CONTINUE')

    def test_switch(self):
        # the --emr-action-on-failure command-line switch reaches the runner
        mr_job = MRWordCount(
            ['-r', 'emr', '--emr-action-on-failure', 'CONTINUE'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            self.assertEqual(runner._action_on_failure, 'CONTINUE')
2 changes: 1 addition & 1 deletion tests/tools/emr/test_terminate_idle_job_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
start_hours_ago=None,
end_hours_ago=None,
name='Streaming Step',
action_on_failure='TERMINATE_JOB_FLOW',
action_on_failure='TERMINATE_CLUSTER',
**kwargs):
if create_hours_ago:
kwargs['creationdatetime'] = to_iso8601(
Expand Down

0 comments on commit dc05321

Please sign in to comment.