Skip to content

Commit

Permalink
Merge pull request #1777 from jonahbull/retry-server-errors-for-startjob
Browse files Browse the repository at this point in the history
retry 5xx responses when attempting to start a job
  • Loading branch information
moskyb committed Oct 10, 2022
2 parents 5d3e56e + 6e01873 commit 00f2048
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
14 changes: 8 additions & 6 deletions agent/job_runner.go
Expand Up @@ -683,20 +683,22 @@ func (r *JobRunner) executePreBootstrapHook(hook string) (bool, error) {
}

// Starts the job in the Buildkite Agent API. We'll retry on connection-related
// issues, but if a connection succeeds and we get an error response back from
// issues, but if a connection succeeds and we get an client error response back from
// Buildkite, we won't bother retrying. For example, a "no such host" will
// retry, but a 422 from Buildkite won't.
// retry, but an HTTP response from Buildkite that isn't retryable won't.
func (r *JobRunner) startJob(startedAt time.Time) error {
r.job.StartedAt = startedAt.UTC().Format(time.RFC3339Nano)

return roko.NewRetrier(
roko.WithMaxAttempts(30),
roko.WithStrategy(roko.Constant(5*time.Second)),
roko.WithMaxAttempts(7),
roko.WithStrategy(roko.Exponential(2*time.Second, 0)),
).Do(func(rtr *roko.Retrier) error {
_, err := r.apiClient.StartJob(r.job)
response, err := r.apiClient.StartJob(r.job)

if err != nil {
if api.IsRetryableError(err) {
if response != nil && api.IsRetryableStatus(response) {
r.logger.Warn("%s (%s)", err, rtr)
} else if api.IsRetryableError(err) {
r.logger.Warn("%s (%s)", err, rtr)
} else {
r.logger.Warn("Buildkite rejected the call to start the job (%s)", err)
Expand Down
16 changes: 16 additions & 0 deletions api/retryable.go
Expand Up @@ -3,9 +3,12 @@ package api
import (
"io"
"net"
"net/http"
"net/url"
"strings"
"syscall"

"golang.org/x/exp/slices"
)

var retrableErrorSuffixes = []string{
Expand All @@ -18,6 +21,19 @@ var retrableErrorSuffixes = []string{
io.EOF.Error(),
}

var retryableStatuses = []int{
http.StatusTooManyRequests, // 429
http.StatusInternalServerError, // 500
http.StatusBadGateway, // 502
http.StatusServiceUnavailable, // 503
http.StatusGatewayTimeout, // 504
}

// IsRetryableStatus returns true if the response's StatusCode is one that we should retry.
func IsRetryableStatus(r *Response) bool {
return slices.Contains(retryableStatuses, r.StatusCode)
}

// Looks at a bunch of connection related errors, and returns true if the error
// matches one of them.
func IsRetryableError(err error) bool {
Expand Down

0 comments on commit 00f2048

Please sign in to comment.