diff --git a/README.md b/README.md index a1b6a745c..9edf4daf9 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Features: - [What Type of Builds Does This Support?](#what-type-of-builds-does-this-support) - [Multiple Instances of the Stack](#multiple-instances-of-the-stack) - [Autoscaling](#autoscaling) +- [Terminating the instance after job is complete](#terminating-the-instance-after-job-is-complete) - [Docker Registry Support](#docker-registry-support) - [Versions](#versions) - [Updating Your Stack](#updating-your-stack) @@ -141,6 +142,19 @@ This means you can scale down to zero when idle, which means you can use larger Metrics are collected with a Lambda function, polling every minute. +## Terminating the instance after job is complete + +You may set `BuildkiteTerminateInstanceAfterJob` to `true` to force the instance to terminate after it completes a job. Setting this value to `true` tells the stack to apply the following configurations: + +1. Sets `disconnect-after-job` and `disconnect-after-job-timeout` in the `buildkite-agent.cfg` file. +2. Adds `ExecStopPost` steps to the agent's systemd service to terminate the instance through its Auto Scaling group and then power it off. + +While not enforced, it is highly recommended you also set your `AgentsPerInstance` value to `1`. + +You may configure `BuildkiteTerminateInstanceAfterJobTimeout` to control how long an instance will wait for a job before terminating itself. You can use this setting to tune your ASG to optimize the queue for availability based on your tolerance for scaling events. The default value is 30 minutes (1800 seconds). + +We strongly encourage you to find an alternative to this setting if at all possible. The turnaround time for replacing these instances is currently slow (5-10 minutes depending on other stack configuration settings). 
If you need single-use jobs, we suggest looking at our container plugins like `docker`, `docker-compose`, and `ecs`, all of which can be found [here](https://buildkite.com/plugins). + ## Docker Registry Support If you want to push or pull from registries such as [Docker Hub](https://hub.docker.com/) or [Quay](https://quay.io/) you can use the `environment` hook in your secrets bucket to export the following environment variables: @@ -194,7 +208,7 @@ Within each stream the logs are grouped by instance id. To debug an agent first find the instance id from the agent in Buildkite, head to your [CloudWatch Logs Dashboard](https://console.aws.amazon.com/cloudwatch/home?#logs:), choose either the system or Buildkite Agent log group, and then search for the instance id in the list of log streams. -# Customizing Instances with a Bootstrap Script +## Customizing Instances with a Bootstrap Script You can customize your stack’s instances by using the `BootstrapScriptUrl` stack parameter to run a bash script on instance boot. To set up a bootstrap script, create an S3 bucket with the script, and set the `BootstrapScriptUrl` parameter, for example `s3://my_bucket_name/my_bootstrap.sh`. 
diff --git a/packer/conf/bin/bk-install-elastic-stack.sh b/packer/conf/bin/bk-install-elastic-stack.sh index 65c3e580f..e752f08af 100755 --- a/packer/conf/bin/bk-install-elastic-stack.sh +++ b/packer/conf/bin/bk-install-elastic-stack.sh @@ -98,6 +98,21 @@ plugins-path=/var/lib/buildkite-agent/plugins experiment="${BUILDKITE_AGENT_EXPERIMENTS}" EOF +if [[ "${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB:-false}" == "true" ]] ; then + cat << EOF >> /etc/buildkite-agent/buildkite-agent.cfg +disconnect-after-job=true +disconnect-after-job-timeout=${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB_TIMEOUT} +EOF + + mkdir -p /etc/systemd/system/buildkite-agent@.service.d/ + + cat << EOF > /etc/systemd/system/buildkite-agent@.service.d/10-power-off-stop.conf +[Service] +ExecStopPost=/usr/local/bin/terminate-instance ${BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB_DECREASE_DESIRED_CAPACITY} +ExecStopPost=/bin/sudo poweroff +EOF +fi + chown buildkite-agent: /etc/buildkite-agent/buildkite-agent.cfg if [[ -n "${BUILDKITE_AUTHORIZED_USERS_URL}" ]] ; then diff --git a/packer/conf/buildkite-agent/scripts/terminate-instance b/packer/conf/buildkite-agent/scripts/terminate-instance new file mode 100755 index 000000000..ed847f8ea --- /dev/null +++ b/packer/conf/buildkite-agent/scripts/terminate-instance @@ -0,0 +1,14 @@ +#!/bin/bash + +set -euo pipefail + +if [[ "${1:-false}" == "true" ]] ; then + should_decrement="should" +else + should_decrement="no-should" +fi + +instance_id=$(curl -fsSL http://169.254.169.254/latest/meta-data/instance-id) +region=$(curl -fsSL http://169.254.169.254/latest/meta-data/placement/availability-zone | head -c -1) + +aws autoscaling terminate-instance-in-auto-scaling-group --region "$region" --instance-id "$instance_id" "--${should_decrement}-decrement-desired-capacity" diff --git a/packer/scripts/install-buildkite-agent.sh b/packer/scripts/install-buildkite-agent.sh index 09baba190..42736643f 100755 --- a/packer/scripts/install-buildkite-agent.sh +++ 
b/packer/scripts/install-buildkite-agent.sh @@ -49,8 +49,9 @@ sudo chown -R buildkite-agent: /var/lib/buildkite-agent/plugins echo "Adding systemd service template..." sudo cp /tmp/conf/buildkite-agent/systemd/buildkite-agent@.service /etc/systemd/system/buildkite-agent@.service -echo "Adding termination script..." +echo "Adding termination scripts..." sudo cp /tmp/conf/buildkite-agent/scripts/stop-agent-gracefully /usr/local/bin/stop-agent-gracefully +sudo cp /tmp/conf/buildkite-agent/scripts/terminate-instance /usr/local/bin/terminate-instance echo "Copying built-in plugins..." sudo mkdir -p /usr/local/buildkite-aws-stack/plugins diff --git a/templates/aws-stack.yml b/templates/aws-stack.yml index a0e770af1..88fbc3c8e 100644 --- a/templates/aws-stack.yml +++ b/templates/aws-stack.yml @@ -19,6 +19,9 @@ Metadata: - BuildkiteAgentTags - BuildkiteAgentTimestampLines - BuildkiteAgentExperiments + - BuildkiteTerminateInstanceAfterJob + - BuildkiteTerminateInstanceAfterJobTimeout + - BuildkiteTerminateInstanceAfterJobDecreaseDesiredCapacity - Label: default: Network Configuration @@ -124,6 +127,28 @@ Parameters: Type: String Default: "" + BuildkiteTerminateInstanceAfterJob: + Description: Set to "true" to terminate the instance after a job has completed. + Type: String + AllowedValues: + - "true" + - "false" + Default: "false" + + BuildkiteTerminateInstanceAfterJobTimeout: + Description: When BuildkiteTerminateInstanceAfterJob is "true", how many seconds to wait for a job before terminating the instance. + Type: Number + Default: 1800 + MinValue: 1 + + BuildkiteTerminateInstanceAfterJobDecreaseDesiredCapacity: + Description: Set to "true" to decrease the desired capacity of the autoscaling group by 1 when terminating an instance after the job completes. 
+ Type: String + AllowedValues: + - "true" + - "false" + Default: "false" + BuildkiteQueue: Description: Queue name that agents will use, targeted in pipeline steps using "queue={value}" Type: String @@ -697,6 +722,9 @@ Resources: BUILDKITE_AUTHORIZED_USERS_URL="${AuthorizedUsersUrl}" \ BUILDKITE_ECR_POLICY=${ECRAccessPolicy} \ BUILDKITE_LIFECYCLE_TOPIC=${AgentLifecycleTopic} \ + BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=${BuildkiteTerminateInstanceAfterJob} \ + BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB_TIMEOUT=${BuildkiteTerminateInstanceAfterJobTimeout} \ + BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB_DECREASE_DESIRED_CAPACITY=${BuildkiteTerminateInstanceAfterJobDecreaseDesiredCapacity} \ AWS_DEFAULT_REGION=${AWS::Region} \ SECRETS_PLUGIN_ENABLED=${EnableSecretsPlugin} \ ECR_PLUGIN_ENABLED=${EnableECRPlugin} \