From b58fd50206f6085eb2ebd31dd0703990a11cc3d1 Mon Sep 17 00:00:00 2001 From: Tom Duffield Date: Fri, 25 Jan 2019 14:50:50 -0600 Subject: [PATCH] Add new "TerminateInstanceAfterJob" configuration This new setting will allow you to stop (and terminate) an instance after it has completed a job. Signed-off-by: Tom Duffield --- README.md | 16 +++++++++++++++- packer/conf/bin/bk-install-elastic-stack.sh | 15 +++++++++++++++ .../buildkite-agent/scripts/mark-asg-unhealthy | 8 ++++++++ packer/scripts/install-buildkite-agent.sh | 3 ++- templates/aws-stack.yml | 18 ++++++++++++++++++ 5 files changed, 58 insertions(+), 2 deletions(-) create mode 100755 packer/conf/buildkite-agent/scripts/mark-asg-unhealthy diff --git a/README.md b/README.md index a1b6a745c..9edf4daf9 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Features: - [What Type of Builds Does This Support?](#what-type-of-builds-does-this-support) - [Multiple Instances of the Stack](#multiple-instances-of-the-stack) - [Autoscaling](#autoscaling) +- [Terminating the instance after job is complete](#terminating-the-instance-after-job-is-complete) - [Docker Registry Support](#docker-registry-support) - [Versions](#versions) - [Updating Your Stack](#updating-your-stack) @@ -141,6 +142,19 @@ This means you can scale down to zero when idle, which means you can use larger Metrics are collected with a Lambda function, polling every minute. +## Terminating the instance after job is complete + +You may set `BuildkiteTerminateInstanceAfterJob` to `true` to force the instance to terminate after it completes a job. Setting this value to `true` tells the stack to apply the following configurations: + +1. Sets `disconnect-after-job` and `disconnect-after-job-timeout` in the `buildkite-agent.cfg` file. +2. Adds `ExecStopPost` steps to the agent's systemd service to mark the instance as unhealthy and to stop the instance. + +While not enforced, it is highly recommended you also set your `AgentsPerInstance` value to `1`. 
+ +You may configure `BuildkiteTerminateInstanceAfterJobTimeout` to control how long an instance will wait for a job before terminating itself. You can use this setting to tune your ASG to optimize the queue for availability based on your tolerance for scaling events. The default value is 30 minutes (1800 seconds). + +We strongly encourage you to find an alternative to this setting if at all possible. The turnaround time for replacing these instances is currently slow (5-10 minutes depending on other stack configuration settings). If you need single-use jobs, we suggest looking at our container plugins like `docker`, `docker-compose`, and `ecs`, all of which can be found [here](https://buildkite.com/plugins). + ## Docker Registry Support If you want to push or pull from registries such as [Docker Hub](https://hub.docker.com/) or [Quay](https://quay.io/) you can use the `environment` hook in your secrets bucket to export the following environment variables: @@ -194,7 +208,7 @@ Within each stream the logs are grouped by instance id. To debug an agent first find the instance id from the agent in Buildkite, head to your [CloudWatch Logs Dashboard](https://console.aws.amazon.com/cloudwatch/home?#logs:), choose either the system or Buildkite Agent log group, and then search for the instance id in the list of log streams. -# Customizing Instances with a Bootstrap Script +## Customizing Instances with a Bootstrap Script You can customize your stack’s instances by using the `BootstrapScriptUrl` stack parameter to run a bash script on instance boot. To set up a bootstrap script, create an S3 bucket with the script, and set the `BootstrapScriptUrl` parameter, for example `s3://my_bucket_name/my_bootstrap.sh`. 
diff --git a/packer/conf/bin/bk-install-elastic-stack.sh b/packer/conf/bin/bk-install-elastic-stack.sh index 1945a201e..c41a61936 100755 --- a/packer/conf/bin/bk-install-elastic-stack.sh +++ b/packer/conf/bin/bk-install-elastic-stack.sh @@ -98,6 +98,21 @@ plugins-path=/var/lib/buildkite-agent/plugins experiment="${BUILDKITE_AGENT_EXPERIMENTS}" EOF +if [[ "${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB:-false}" == "true" ]] ; then + cat << EOF >> /etc/buildkite-agent/buildkite-agent.cfg +disconnect-after-job=true +disconnect-after-job-timeout=${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB_TIMEOUT} +EOF + + mkdir -p /etc/systemd/system/buildkite-agent@.service.d/ + + cat << EOF > /etc/systemd/system/buildkite-agent@.service.d/10-power-off-stop.conf +[Service] +ExecStopPost=/usr/local/bin/mark-asg-unhealthy +ExecStopPost=/bin/sudo poweroff +EOF +fi + chown buildkite-agent: /etc/buildkite-agent/buildkite-agent.cfg if [[ -n "${BUILDKITE_AUTHORIZED_USERS_URL}" ]] ; then diff --git a/packer/conf/buildkite-agent/scripts/mark-asg-unhealthy b/packer/conf/buildkite-agent/scripts/mark-asg-unhealthy new file mode 100755 index 000000000..f9dafab91 --- /dev/null +++ b/packer/conf/buildkite-agent/scripts/mark-asg-unhealthy @@ -0,0 +1,8 @@ +#!/bin/bash + +set -euo pipefail + +instance_id=$(curl -fsSL http://169.254.169.254/latest/meta-data/instance-id) +region=$(curl -fsSL http://169.254.169.254/latest/meta-data/placement/availability-zone | head -c -1) + +aws autoscaling set-instance-health --instance-id "$instance_id" --region "$region" --health-status "Unhealthy" diff --git a/packer/scripts/install-buildkite-agent.sh b/packer/scripts/install-buildkite-agent.sh index ddf01150f..1745b29eb 100755 --- a/packer/scripts/install-buildkite-agent.sh +++ b/packer/scripts/install-buildkite-agent.sh @@ -49,8 +49,9 @@ sudo chown -R buildkite-agent: /var/lib/buildkite-agent/plugins echo "Adding systemd service template..." 
sudo cp /tmp/conf/buildkite-agent/systemd/buildkite-agent@.service /etc/systemd/system/buildkite-agent@.service -echo "Adding termination script..." +echo "Adding termination scripts..." sudo cp /tmp/conf/buildkite-agent/scripts/stop-agent-gracefully /usr/local/bin/stop-agent-gracefully +sudo cp /tmp/conf/buildkite-agent/scripts/mark-asg-unhealthy /usr/local/bin/mark-asg-unhealthy echo "Copying built-in plugins..." sudo mkdir -p /usr/local/buildkite-aws-stack/plugins diff --git a/templates/aws-stack.yml b/templates/aws-stack.yml index 13559cdb9..71a5eef0a 100644 --- a/templates/aws-stack.yml +++ b/templates/aws-stack.yml @@ -18,6 +18,8 @@ Metadata: - BuildkiteAgentTags - BuildkiteAgentTimestampLines - BuildkiteAgentExperiments + - BuildkiteTerminateInstanceAfterJob + - BuildkiteTerminateInstanceAfterJobTimeout - Label: default: Network Configuration @@ -118,6 +120,20 @@ Parameters: Type: String Default: "" + BuildkiteTerminateInstanceAfterJob: + Description: Whether or not to terminate the instance after the job has completed. + Type: String + AllowedValues: + - "true" + - "false" + Default: "false" + + BuildkiteTerminateInstanceAfterJobTimeout: + Description: When BuildkiteTerminateInstanceAfterJob is "true", how many seconds to wait for a job before terminating the instance. + Type: Number + Default: 1800 + MinValue: 1 + BuildkiteQueue: Description: Queue name that agents will use, targeted in pipeline steps using "queue={value}" Type: String @@ -684,6 +700,8 @@ Resources: BUILDKITE_AUTHORIZED_USERS_URL="${AuthorizedUsersUrl}" \ BUILDKITE_ECR_POLICY=${ECRAccessPolicy} \ BUILDKITE_LIFECYCLE_TOPIC=${AgentLifecycleTopic} \ + BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=${BuildkiteTerminateInstanceAfterJob} \ + BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB_TIMEOUT=${BuildkiteTerminateInstanceAfterJobTimeout} \ AWS_DEFAULT_REGION=${AWS::Region} \ SECRETS_PLUGIN_ENABLED=${EnableSecretsPlugin} \ ECR_PLUGIN_ENABLED=${EnableECRPlugin} \