Merge pull request #1735 from davidmarin/idle-grace-period

Idle termination scripts now have 10-minute grace period. Fixes #1694
Yelp · Feb 23, 2018 · a4bd934 · a4bd934
2 parents 96dc2f3 + d57c06a
commit a4bd934
Show file tree

Hide file tree

Showing 6 changed files with 55 additions and 26 deletions.
diff --git a/docs/guides/emr-opts.rst b/docs/guides/emr-opts.rst
@@ -372,7 +372,7 @@ Cluster creation and configuration
     :switch: --max-mins-idle
     :type: float
     :set: emr
-    :default: 5
+    :default: 10.0
 
     Automatically terminate persistent/pooled clusters that have been idle at
     least this many minutes.
@@ -383,6 +383,12 @@ Cluster creation and configuration
        versions, you needed to set :mrjob-opt:`max_hours_idle`, set this
        option explicitly, or use :ref:`terminate-idle-clusters`.
 
+    .. versionchanged:: 0.6.2
+
+       No matter how small a value you set this to, there is a grace period
+       of 10 minutes between when the idle termination daemon launches
+       and when it may first terminate the cluster, to allow Hadoop to
+       accept your first job.
 
 .. mrjob-opt::
     :config: max_hours_idle

diff --git a/mrjob/bootstrap/terminate_idle_cluster_dataproc.sh b/mrjob/bootstrap/terminate_idle_cluster_dataproc.sh
@@ -43,25 +43,38 @@
 MAX_SECS_IDLE=$(/usr/share/google/get_metadata_value attributes/mrjob-max-secs-idle)
 if [ -z "${MAX_SECS_IDLE}" ]; then MAX_SECS_IDLE=300; fi
 
+GRACE_PERIOD_SECS=$(/usr/share/google/get_metadata_value attributes/mrjob-grace-period-secs)
+if [ -z "${GRACE_PERIOD_SECS}" ]; then GRACE_PERIOD_SECS=600; fi
+
+
 (
 while true  # the only way out is to SHUT DOWN THE MACHINE
 do
     # get the uptime as an integer (expr can't handle decimals)
     UPTIME=$(cat /proc/uptime | cut -f 1 -d .)
 
+    if [ -z "$START" ]
+    then
+        START=${UPTIME}
+    fi
+
     if [ -z "${LAST_ACTIVE}" ] || \
         (which yarn > /dev/null && \
             nice yarn application -list 2> /dev/null | \
             grep -v 'Total number' | grep -q RUNNING)
     then
         LAST_ACTIVE=${UPTIME}
     else
-	# the cluster is idle! how long has this been going on?
-        SECS_IDLE=$(expr ${UPTIME} - ${LAST_ACTIVE})
-        if expr ${SECS_IDLE} '>' ${MAX_SECS_IDLE} > /dev/null
+        SECS_RUN=$(expr ${UPTIME} - ${START})
+        if expr ${SECS_RUN} '>' ${GRACE_PERIOD_SECS} > /dev/null
         then
-            yes | gcloud dataproc clusters delete $(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name) --async
-            exit
+            # the cluster is idle! how long has this been going on?
+            SECS_IDLE=$(expr ${UPTIME} - ${LAST_ACTIVE})
+            if expr ${SECS_IDLE} '>' ${MAX_SECS_IDLE} > /dev/null
+            then
+                yes | gcloud dataproc clusters delete $(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name) --async
+                exit
+            fi
         fi
     fi
 

diff --git a/mrjob/bootstrap/terminate_idle_cluster.sh b/mrjob/bootstrap/terminate_idle_cluster_emr.sh
@@ -3,8 +3,8 @@
 # Copyright 2013 Lyft
 # Copyright 2014 Alex Konradi
 # Copyright 2015 Yelp and Contributors
-# Copyright 2016 Yelp
-# Copyright 2017 Yelp
+# Copyright 2016-2017 Yelp
+# Copyright 2018 Yelp
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,8 +28,8 @@
 # enough AND we're close enough to the end of an EC2 billing hour, we
 # shut down the master node, which kills the cluster.
 
-# By default, we allow an idle time of 15 minutes, and shut down within
-# the last 5 minutes of the hour.
+# By default, we allow an idle time of 5 minutes. Also, by default, we
+# won't terminate the cluster in the first 10 minutes this script runs.
 
 # Caveats:
 
@@ -43,13 +43,16 @@
 
 # full usage:
 #
-# ./terminate_idle_cluster.sh [ max_secs_idle  ]
+# ./terminate_idle_cluster_emr.sh [ max_secs_idle [ grace_period_secs ] ]
 #
 # Both arguments must be integers
 
 MAX_SECS_IDLE=$1
 if [ -z "$MAX_SECS_IDLE" ]; then MAX_SECS_IDLE=300; fi
 
+GRACE_PERIOD_SECS=$2
+if [ -z "$GRACE_PERIOD_SECS" ]; then GRACE_PERIOD_SECS=600; fi
+
 # exit if this isn't the master node
 grep -q 'isMaster.*false' /mnt/var/lib/info/instance.json && exit 0
 
@@ -59,6 +62,11 @@ do
     # get the uptime as an integer (expr can't handle decimals)
     UPTIME=$(cat /proc/uptime | cut -f 1 -d .)
 
+    if [ -z "$START" ]
+    then
+        START=$UPTIME
+    fi
+
     # if LAST_ACTIVE hasn't been initialized, hadoop hasn't been installed
     # yet (this happens on 4.x AMIs), or there are jobs running, just set
     # LAST_ACTIVE to UPTIME. This also checks yarn application if it
@@ -72,15 +80,23 @@ do
     then
         LAST_ACTIVE=$UPTIME
     else
-	# the cluster is idle! how long has this been going on?
-        SECS_IDLE=$(expr $UPTIME - $LAST_ACTIVE)
-
-        if expr $SECS_IDLE '>' $MAX_SECS_IDLE > /dev/null
+        SECS_RUN=$(expr $UPTIME - $START)
+        if expr $SECS_RUN '>' $GRACE_PERIOD_SECS > /dev/null
         then
-            sudo shutdown -h now
-            exit
+
+            # the cluster is idle! how long has this been going on?
+            SECS_IDLE=$(expr $UPTIME - $LAST_ACTIVE)
+
+            if expr $SECS_IDLE '>' $MAX_SECS_IDLE > /dev/null
+            then
+                sudo shutdown -h now
+                exit
+            fi
         fi
     fi
+
+    # sleep so we don't peg the CPU
+    sleep 5
 done
 # close file handles to daemonize the script; otherwise bootstrapping
 # never finishes

diff --git a/mrjob/cloud.py b/mrjob/cloud.py
@@ -112,12 +112,6 @@ def _fix_opts(self, opts, source=None):
             else:
                 opts['max_mins_idle'] = _DEFAULT_MAX_MINS_IDLE
 
-        # warn about issues with
-        if opts['max_mins_idle'] < _DEFAULT_MAX_MINS_IDLE:
-            log.warning('Setting max_mins_idle to less than %.1f may result'
-                        ' in cluster shutting down before job can run' %
-                        _DEFAULT_MAX_MINS_IDLE)
-
         return opts
 
     def _combine_opts(self, opt_list):

diff --git a/mrjob/emr.py b/mrjob/emr.py
@@ -152,7 +152,7 @@
 _MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH = os.path.join(
     os.path.dirname(mrjob.__file__),
     'bootstrap',
-    'terminate_idle_cluster.sh')
+    'terminate_idle_cluster_emr.sh')
 
 # default AWS region to use for EMR. Using us-west-2 because it is the default
 # for new (since October 10, 2012) accounts (see #1025)

diff --git a/tests/test_emr.py b/tests/test_emr.py
@@ -1573,7 +1573,7 @@ def test_bootstrap_actions_get_added(self):
         # check for idle timeout script
         self.assertTrue(actions[3]['ScriptPath'].startswith('s3://mrjob-'))
         self.assertTrue(actions[3]['ScriptPath'].endswith(
-            'terminate_idle_cluster.sh'))
+            'terminate_idle_cluster_emr.sh'))
         self.assertEqual(actions[3]['Args'], ['600'])
         self.assertEqual(actions[3]['Name'], 'idle timeout')
 
@@ -1628,7 +1628,7 @@ def test_local_bootstrap_action(self):
         # check for idle timeout script
         self.assertTrue(actions[2]['ScriptPath'].startswith('s3://mrjob-'))
         self.assertTrue(actions[2]['ScriptPath'].endswith(
-            'terminate_idle_cluster.sh'))
+            'terminate_idle_cluster_emr.sh'))
         self.assertEqual(actions[2]['Args'], ['600'])
         self.assertEqual(actions[2]['Name'], 'idle timeout')