Skip to content

Commit

Permalink
BES: make uploader retry attempts configurable
Browse files Browse the repository at this point in the history
Depending on the Build Event Service setup, there can be different failure modes, and a given setup may tolerate fewer or more failures when uploading build events.

Allow users to tweak these numbers without having to pass custom JVM args or ship a fork of Bazel with the numbers changed.

Closes #16305.

PiperOrigin-RevId: 482303303
Change-Id: I71d9aeaf7527b0ff1a81af069390eedee2c22aa0
  • Loading branch information
sluongng authored and Copybara-Service committed Oct 19, 2022
1 parent 58edc17 commit e7218d5
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 17 deletions.
Expand Up @@ -93,12 +93,6 @@
public final class BuildEventServiceUploader implements Runnable {
private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();

/** Configuration knobs related to RPC retries. Values chosen by good judgement. */
private static final int MAX_NUM_RETRIES =
Integer.parseInt(System.getProperty("BAZEL_BES_NUM_RETRIES_ON_RPC_FAILURE", "4"));

private static final int DELAY_MILLIS = 1000;

private final BuildEventServiceClient besClient;
private final BuildEventArtifactUploader buildEventUploader;
private final BuildEventServiceProtoUtil besProtoUtil;
Expand Down Expand Up @@ -544,7 +538,7 @@ private void publishBuildEvents()
BuildProgress.Code.BES_STREAM_NOT_RETRYING_FAILURE,
message);
}
if (retryAttempt == MAX_NUM_RETRIES) {
if (retryAttempt == buildEventProtocolOptions.besUploadMaxRetries) {
String message =
String.format(
"Not retrying publishBuildEvents, no more attempts left: status='%s'",
Expand Down Expand Up @@ -629,7 +623,7 @@ private void publishLifecycleEvent(PublishLifecycleEventRequest request)
throws DetailedStatusException, InterruptedException {
int retryAttempt = 0;
StatusException cause = null;
while (retryAttempt <= MAX_NUM_RETRIES) {
while (retryAttempt <= this.buildEventProtocolOptions.besUploadMaxRetries) {
try {
besClient.publish(request);
return;
Expand All @@ -656,7 +650,7 @@ private void publishLifecycleEvent(PublishLifecycleEventRequest request)
throw withFailureDetail(
cause,
BuildProgress.Code.BES_UPLOAD_RETRY_LIMIT_EXCEEDED_FAILURE,
"All retry attempts failed.");
String.format("All %d retry attempts failed.", retryAttempt - 1));
}

private void ensureUploadThreadStarted() {
Expand Down Expand Up @@ -723,9 +717,12 @@ private static boolean shouldRetryStatus(Status status) {
&& !status.getCode().equals(Code.FAILED_PRECONDITION);
}

/**
 * Computes a delay, in milliseconds, that grows exponentially with the attempt number: the base
 * delay multiplied by {@code 1.6^attempt}, truncated to a {@code long}.
 *
 * <p>NOTE(review): this excerpt interleaves the pre-change lines (static method using the
 * {@code DELAY_MILLIS} constant) with the post-change lines (instance method reading
 * {@code besUploadRetryInitialDelay} from {@code buildEventProtocolOptions}).
 *
 * @param attempt the retry attempt number; must be nonnegative (checked via
 *     {@code Preconditions.checkArgument})
 * @return the computed delay in milliseconds
 */
private static long retrySleepMillis(int attempt) {
private long retrySleepMillis(int attempt) {
Preconditions.checkArgument(attempt >= 0, "attempt must be nonnegative: %s", attempt);
// This somewhat matches the backoff used for gRPC connection backoffs.
return (long) (DELAY_MILLIS * Math.pow(1.6, attempt));
return (long)
(this.buildEventProtocolOptions.besUploadRetryInitialDelay.toMillis()
* Math.pow(1.6, attempt));
}

private DetailedStatusException withFailureDetail(
Expand Down
Expand Up @@ -18,6 +18,7 @@
import com.google.devtools.common.options.OptionDocumentationCategory;
import com.google.devtools.common.options.OptionEffectTag;
import com.google.devtools.common.options.OptionsBase;
import java.time.Duration;

/** Options used to configure the build event protocol. */
public class BuildEventProtocolOptions extends OptionsBase {
Expand All @@ -34,14 +35,31 @@ public class BuildEventProtocolOptions extends OptionsBase {
public boolean legacyImportantOutputs;

/**
 * Name of the strategy used to upload artifacts referenced in the build event protocol; the
 * option's default value is the string {@code "null"}.
 *
 * <p>NOTE(review): the two annotation bodies below are the before and after of a formatting-only
 * change to the same option; their tokens are identical.
 */
@Option(
name = "experimental_build_event_upload_strategy",
defaultValue = "null",
documentationCategory = OptionDocumentationCategory.LOGGING,
effectTags = {OptionEffectTag.AFFECTS_OUTPUTS},
help = "Selects how to upload artifacts referenced in the build event protocol."
)
name = "experimental_build_event_upload_strategy",
defaultValue = "null",
documentationCategory = OptionDocumentationCategory.LOGGING,
effectTags = {OptionEffectTag.AFFECTS_OUTPUTS},
help = "Selects how to upload artifacts referenced in the build event protocol.")
public String buildEventUploadStrategy;

/**
 * Maximum number of times a build event upload is retried; defaults to 4.
 *
 * <p>NOTE(review): no range validation is visible on this option. Confirm how consumers behave
 * when a negative value is supplied.
 */
@Option(
name = "experimental_build_event_upload_max_retries",
defaultValue = "4",
documentationCategory = OptionDocumentationCategory.LOGGING,
effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
help = "The maximum number of times Bazel should retry uploading a build event.")
public int besUploadMaxRetries;

/**
 * Base delay for the exponential backoff applied between upload retries; defaults to 1s.
 *
 * <p>The flag is named {@code ..._retry_minimum_delay} while the field is named "initial delay":
 * both describe the same value, which the uploader multiplies by {@code 1.6^attempt}.
 */
@Option(
name = "experimental_build_event_upload_retry_minimum_delay",
defaultValue = "1s",
documentationCategory = OptionDocumentationCategory.LOGGING,
effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
help =
"Initial, minimum delay for exponential backoff retries when BEP upload fails. (exponent:"
+ " 1.6)")
public Duration besUploadRetryInitialDelay;

@Option(
name = "experimental_stream_log_file_uploads",
defaultValue = "false",
Expand Down
Expand Up @@ -232,6 +232,16 @@ public void testCreatesStreamerForBesTransport() throws Exception {
.isInstanceOf(BuildEventServiceTransport.class);
}

/**
 * Runs a build against a BES backend address that is presumably unreachable, with the retry limit
 * set to 3, and checks that the reported upload error mentions that limit.
 */
@Test
public void testRetryCount() throws Exception {
  String backendFlag = "--bes_backend=does.not.exist:1234";
  String maxRetriesFlag = "--experimental_build_event_upload_max_retries=3";
  String expectedError = "The Build Event Protocol upload failed: All 3 retry attempts failed";

  runBuildWithOptions(backendFlag, maxRetriesFlag);
  afterBuildCommand();

  events.assertContainsError(expectedError);
}

@Test
public void testConnectivityFailureDisablesBesStreaming() throws Exception {
class FailingConnectivityStatusProvider extends BlazeModule
Expand Down

0 comments on commit e7218d5

Please sign in to comment.