Skip to content

Commit

Permalink
Merge pull request #1735 from davidmarin/idle-grace-period
Browse files Browse the repository at this point in the history
Idle termination scripts now have 10-minute grace period. Fixes #1694
  • Loading branch information
David Marin committed Feb 23, 2018
2 parents 96dc2f3 + d57c06a commit a4bd934
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 26 deletions.
8 changes: 7 additions & 1 deletion docs/guides/emr-opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ Cluster creation and configuration
:switch: --max-mins-idle
:type: float
:set: emr
:default: 5
:default: 10.0
Automatically terminate persistent/pooled clusters that have been idle at
least this many minutes.
Expand All @@ -383,6 +383,12 @@ Cluster creation and configuration
versions, you needed to set :mrjob-opt:`max_hours_idle`, set this
option explicitly, or use :ref:`terminate-idle-clusters`.
.. versionchanged:: 0.6.2
No matter how small a value you set this to, there is a grace period
of 10 minutes between when the idle termination daemon launches
and when it may first terminate the cluster, to allow Hadoop to
accept your first job.
.. mrjob-opt::
:config: max_hours_idle
Expand Down
23 changes: 18 additions & 5 deletions mrjob/bootstrap/terminate_idle_cluster_dataproc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,38 @@
MAX_SECS_IDLE=$(/usr/share/google/get_metadata_value attributes/mrjob-max-secs-idle)
if [ -z "${MAX_SECS_IDLE}" ]; then MAX_SECS_IDLE=300; fi

GRACE_PERIOD_SECS=$(/usr/share/google/get_metadata_value attributes/mrjob-grace-period-secs)
if [ -z "${GRACE_PERIOD_SECS}" ]; then GRACE_PERIOD_SECS=600; fi


(
while true # the only way out is to SHUT DOWN THE MACHINE
do
# get the uptime as an integer (expr can't handle decimals)
UPTIME=$(cat /proc/uptime | cut -f 1 -d .)

if [ -z "$START" ]
then
START=${UPTIME}
fi

if [ -z "${LAST_ACTIVE}" ] || \
(which yarn > /dev/null && \
nice yarn application -list 2> /dev/null | \
grep -v 'Total number' | grep -q RUNNING)
then
LAST_ACTIVE=${UPTIME}
else
# the cluster is idle! how long has this been going on?
SECS_IDLE=$(expr ${UPTIME} - ${LAST_ACTIVE})
if expr ${SECS_IDLE} '>' ${MAX_SECS_IDLE} > /dev/null
SECS_RUN=$(expr ${UPTIME} - ${START})
if expr ${SECS_RUN} '>' ${GRACE_PERIOD_SECS} > /dev/null
then
yes | gcloud dataproc clusters delete $(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name) --async
exit
# the cluster is idle! how long has this been going on?
SECS_IDLE=$(expr ${UPTIME} - ${LAST_ACTIVE})
if expr ${SECS_IDLE} '>' ${MAX_SECS_IDLE} > /dev/null
then
yes | gcloud dataproc clusters delete $(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name) --async
exit
fi
fi
fi

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# Copyright 2013 Lyft
# Copyright 2014 Alex Konradi
# Copyright 2015 Yelp and Contributors
# Copyright 2016 Yelp
# Copyright 2017 Yelp
# Copyright 2016-2017 Yelp
# Copyright 2018 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -28,8 +28,8 @@
# enough AND we're close enough to the end of an EC2 billing hour, we
# shut down the master node, which kills the cluster.

# By default, we allow an idle time of 15 minutes, and shut down within
# the last 5 minutes of the hour.
# By default, we allow an idle time of 5 minutes. Also, by default, we
# won't terminate the cluster in the first 10 minutes this script runs.

# Caveats:

Expand All @@ -43,13 +43,16 @@

# full usage:
#
# ./terminate_idle_cluster.sh [ max_secs_idle ]
# ./terminate_idle_cluster_emr.sh [ max_secs_idle [ grace_period_secs ] ]
#
# Both arguments must be integers

MAX_SECS_IDLE=$1
if [ -z "$MAX_SECS_IDLE" ]; then MAX_SECS_IDLE=300; fi

GRACE_PERIOD_SECS=$2
if [ -z "$GRACE_PERIOD_SECS" ]; then GRACE_PERIOD_SECS=600; fi

# exit if this isn't the master node
grep -q 'isMaster.*false' /mnt/var/lib/info/instance.json && exit 0

Expand All @@ -59,6 +62,11 @@ do
# get the uptime as an integer (expr can't handle decimals)
UPTIME=$(cat /proc/uptime | cut -f 1 -d .)

if [ -z "$START" ]
then
START=$UPTIME
fi

# if LAST_ACTIVE hasn't been initialized, hadoop hasn't been installed
# yet (this happens on 4.x AMIs), or there are jobs running, just set
# LAST_ACTIVE to UPTIME. This also checks yarn application if it
Expand All @@ -72,15 +80,23 @@ do
then
LAST_ACTIVE=$UPTIME
else
# the cluster is idle! how long has this been going on?
SECS_IDLE=$(expr $UPTIME - $LAST_ACTIVE)

if expr $SECS_IDLE '>' $MAX_SECS_IDLE > /dev/null
SECS_RUN=$(expr $UPTIME - $START)
if expr $SECS_RUN '>' $GRACE_PERIOD_SECS > /dev/null
then
sudo shutdown -h now
exit

# the cluster is idle! how long has this been going on?
SECS_IDLE=$(expr $UPTIME - $LAST_ACTIVE)

if expr $SECS_IDLE '>' $MAX_SECS_IDLE > /dev/null
then
sudo shutdown -h now
exit
fi
fi
fi

# sleep so we don't peg the CPU
sleep 5
done
# close file handles to daemonize the script; otherwise bootstrapping
# never finishes
Expand Down
6 changes: 0 additions & 6 deletions mrjob/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,6 @@ def _fix_opts(self, opts, source=None):
else:
opts['max_mins_idle'] = _DEFAULT_MAX_MINS_IDLE

# warn about issues with setting max_mins_idle below the default
if opts['max_mins_idle'] < _DEFAULT_MAX_MINS_IDLE:
log.warning('Setting max_mins_idle to less than %.1f may result'
' in cluster shutting down before job can run' %
_DEFAULT_MAX_MINS_IDLE)

return opts

def _combine_opts(self, opt_list):
Expand Down
2 changes: 1 addition & 1 deletion mrjob/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
_MAX_MINS_IDLE_BOOTSTRAP_ACTION_PATH = os.path.join(
os.path.dirname(mrjob.__file__),
'bootstrap',
'terminate_idle_cluster.sh')
'terminate_idle_cluster_emr.sh')

# default AWS region to use for EMR. Using us-west-2 because it is the default
# for new (since October 10, 2012) accounts (see #1025)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1573,7 +1573,7 @@ def test_bootstrap_actions_get_added(self):
# check for idle timeout script
self.assertTrue(actions[3]['ScriptPath'].startswith('s3://mrjob-'))
self.assertTrue(actions[3]['ScriptPath'].endswith(
'terminate_idle_cluster.sh'))
'terminate_idle_cluster_emr.sh'))
self.assertEqual(actions[3]['Args'], ['600'])
self.assertEqual(actions[3]['Name'], 'idle timeout')

Expand Down Expand Up @@ -1628,7 +1628,7 @@ def test_local_bootstrap_action(self):
# check for idle timeout script
self.assertTrue(actions[2]['ScriptPath'].startswith('s3://mrjob-'))
self.assertTrue(actions[2]['ScriptPath'].endswith(
'terminate_idle_cluster.sh'))
'terminate_idle_cluster_emr.sh'))
self.assertEqual(actions[2]['Args'], ['600'])
self.assertEqual(actions[2]['Name'], 'idle timeout')

Expand Down

0 comments on commit a4bd934

Please sign in to comment.