Skip to content

Commit 6115d94

Browse files
aherrmanncopybara-github
authored andcommitted
Retry on HTTP remote cache fetch failure
Bazel's previous behavior was to rebuild an artifact locally if fetching it from an HTTP remote cache failed. This behavior is different from GRPC remote cache case where Bazel will retry the fetch. The lack of retry is an issue for multiple reasons: On one hand rebuilding locally can be slower than fetching from the remote cache, on the other hand if a build action is not bit reproducible, as is the case with some compilers, then the local rebuild will trigger cache misses on further build actions that depend on the current artifact. This change aims to avoid theses issues by retrying the fetch in the HTTP cache case similarly to how the GRPC cache client does it. Some care needs to be taken due to the design of Bazel's internal remote cache client API. For a fetch the client is given an `OutputStream` object that it is expected to write the fetched data to. This may be a temporary file on disk that will be moved to the final location after the fetch completed. On retry, we need to be careful to not duplicate previously written data when writing into this `OutputStream`. Due to the generality of the `OutputStream` interface we cannot reset the file handle or write pointer to start fresh. Instead, this change follows the same pattern used in the GRPC cache client. Namely, keep track of the data previously written and continue from that offset on retry. With this change the HTTP cache client will attempt to fetch the data from the remote cache via an HTTP range request. So that the server only needs to send the data that is still missing. If the server replies with a 206 Partial Content response, then we write the received data directly into the output stream, if the server does not support range requests and instead replies with the full data, then we drop the duplicate prefix and only write into the output stream from the required offset. This patch has been running successfully in production [here](digital-asset/daml#11238). cc @cocreature Closes #14258. PiperOrigin-RevId: 508604846 Change-Id: I10a5d2a658e9c32a9d9fcd6bd29f6a0b95e84566
1 parent 69d43b4 commit 6115d94

File tree

11 files changed

+443
-45
lines changed

11 files changed

+443
-45
lines changed

src/main/java/com/google/devtools/build/lib/remote/RemoteCacheClientFactory.java

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,22 @@ public static RemoteCacheClient create(
5959
@Nullable Credentials creds,
6060
AuthAndTLSOptions authAndTlsOptions,
6161
Path workingDirectory,
62-
DigestUtil digestUtil)
62+
DigestUtil digestUtil,
63+
RemoteRetrier retrier)
6364
throws IOException {
6465
Preconditions.checkNotNull(workingDirectory, "workingDirectory");
6566
if (isHttpCache(options) && isDiskCache(options)) {
6667
return createDiskAndHttpCache(
67-
workingDirectory, options.diskCache, options, creds, authAndTlsOptions, digestUtil);
68+
workingDirectory,
69+
options.diskCache,
70+
options,
71+
creds,
72+
authAndTlsOptions,
73+
digestUtil,
74+
retrier);
6875
}
6976
if (isHttpCache(options)) {
70-
return createHttp(options, creds, authAndTlsOptions, digestUtil);
77+
return createHttp(options, creds, authAndTlsOptions, digestUtil, retrier);
7178
}
7279
if (isDiskCache(options)) {
7380
return createDiskCache(
@@ -90,7 +97,8 @@ private static RemoteCacheClient createHttp(
9097
RemoteOptions options,
9198
Credentials creds,
9299
AuthAndTLSOptions authAndTlsOptions,
93-
DigestUtil digestUtil) {
100+
DigestUtil digestUtil,
101+
RemoteRetrier retrier) {
94102
Preconditions.checkNotNull(options.remoteCache, "remoteCache");
95103

96104
try {
@@ -109,6 +117,7 @@ private static RemoteCacheClient createHttp(
109117
options.remoteVerifyDownloads,
110118
ImmutableList.copyOf(options.remoteHeaders),
111119
digestUtil,
120+
retrier,
112121
creds,
113122
authAndTlsOptions);
114123
} else {
@@ -122,6 +131,7 @@ private static RemoteCacheClient createHttp(
122131
options.remoteVerifyDownloads,
123132
ImmutableList.copyOf(options.remoteHeaders),
124133
digestUtil,
134+
retrier,
125135
creds,
126136
authAndTlsOptions);
127137
}
@@ -151,15 +161,16 @@ private static RemoteCacheClient createDiskAndHttpCache(
151161
RemoteOptions options,
152162
Credentials cred,
153163
AuthAndTLSOptions authAndTlsOptions,
154-
DigestUtil digestUtil)
164+
DigestUtil digestUtil,
165+
RemoteRetrier retrier)
155166
throws IOException {
156167
Path cacheDir =
157168
workingDirectory.getRelative(Preconditions.checkNotNull(diskCachePath, "diskCachePath"));
158169
if (!cacheDir.exists()) {
159170
cacheDir.createDirectoryAndParents();
160171
}
161172

162-
RemoteCacheClient httpCache = createHttp(options, cred, authAndTlsOptions, digestUtil);
173+
RemoteCacheClient httpCache = createHttp(options, cred, authAndTlsOptions, digestUtil, retrier);
163174
return createDiskAndRemoteClient(
164175
workingDirectory,
165176
diskCachePath,

src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import com.google.devtools.build.lib.remote.common.RemoteCacheClient;
6363
import com.google.devtools.build.lib.remote.common.RemoteExecutionClient;
6464
import com.google.devtools.build.lib.remote.downloader.GrpcRemoteDownloader;
65+
import com.google.devtools.build.lib.remote.http.HttpException;
6566
import com.google.devtools.build.lib.remote.logging.LoggingInterceptor;
6667
import com.google.devtools.build.lib.remote.options.RemoteBuildEventUploadMode;
6768
import com.google.devtools.build.lib.remote.options.RemoteOptions;
@@ -101,17 +102,20 @@
101102
import io.grpc.Channel;
102103
import io.grpc.ClientInterceptor;
103104
import io.grpc.ManagedChannel;
105+
import io.netty.handler.codec.http.HttpResponseStatus;
104106
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
105107
import java.io.IOException;
106108
import java.net.URI;
107109
import java.net.URISyntaxException;
110+
import java.nio.channels.ClosedChannelException;
108111
import java.util.List;
109112
import java.util.Optional;
110113
import java.util.concurrent.ExecutorService;
111114
import java.util.concurrent.Executors;
112115
import java.util.concurrent.LinkedBlockingQueue;
113116
import java.util.concurrent.ThreadFactory;
114117
import java.util.concurrent.ThreadPoolExecutor;
118+
import java.util.function.Predicate;
115119
import java.util.regex.Pattern;
116120
import javax.annotation.Nullable;
117121

@@ -216,6 +220,29 @@ private static ServerCapabilities getAndVerifyServerCapabilities(
216220
return capabilities;
217221
}
218222

223+
public static final Predicate<? super Exception> RETRIABLE_HTTP_ERRORS =
224+
e -> {
225+
boolean retry = false;
226+
if (e instanceof ClosedChannelException) {
227+
retry = true;
228+
} else if (e instanceof HttpException) {
229+
int status = ((HttpException) e).response().status().code();
230+
retry =
231+
status == HttpResponseStatus.INTERNAL_SERVER_ERROR.code()
232+
|| status == HttpResponseStatus.BAD_GATEWAY.code()
233+
|| status == HttpResponseStatus.SERVICE_UNAVAILABLE.code()
234+
|| status == HttpResponseStatus.GATEWAY_TIMEOUT.code();
235+
} else if (e instanceof IOException) {
236+
String msg = e.getMessage().toLowerCase();
237+
if (msg.contains("connection reset by peer")) {
238+
retry = true;
239+
} else if (msg.contains("operation timed out")) {
240+
retry = true;
241+
}
242+
}
243+
return retry;
244+
};
245+
219246
private void initHttpAndDiskCache(
220247
CommandEnvironment env,
221248
Credentials credentials,
@@ -230,7 +257,9 @@ private void initHttpAndDiskCache(
230257
credentials,
231258
authAndTlsOptions,
232259
Preconditions.checkNotNull(env.getWorkingDirectory(), "workingDirectory"),
233-
digestUtil);
260+
digestUtil,
261+
new RemoteRetrier(
262+
remoteOptions, RETRIABLE_HTTP_ERRORS, retryScheduler, Retrier.ALLOW_ALL_CALLS));
234263
} catch (IOException e) {
235264
handleInitFailure(env, e, Code.CACHE_INIT_FAILURE);
236265
return;

src/main/java/com/google/devtools/build/lib/remote/http/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ java_library(
2020
deps = [
2121
"//src/main/java/com/google/devtools/build/lib/analysis:blaze_version_info",
2222
"//src/main/java/com/google/devtools/build/lib/authandtls",
23+
"//src/main/java/com/google/devtools/build/lib/remote:Retrier",
2324
"//src/main/java/com/google/devtools/build/lib/remote/common",
2425
"//src/main/java/com/google/devtools/build/lib/remote/util",
2526
"//src/main/java/com/google/devtools/build/lib/vfs",

src/main/java/com/google/devtools/build/lib/remote/http/DownloadCommand.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,18 @@ final class DownloadCommand {
2525
private final boolean casDownload;
2626
private final Digest digest;
2727
private final OutputStream out;
28+
private final long offset;
2829

29-
DownloadCommand(URI uri, boolean casDownload, Digest digest, OutputStream out) {
30+
DownloadCommand(URI uri, boolean casDownload, Digest digest, OutputStream out, long offset) {
3031
this.uri = Preconditions.checkNotNull(uri);
3132
this.casDownload = casDownload;
3233
this.digest = Preconditions.checkNotNull(digest);
3334
this.out = Preconditions.checkNotNull(out);
35+
this.offset = offset;
36+
}
37+
38+
DownloadCommand(URI uri, boolean casDownload, Digest digest, OutputStream out) {
39+
this(uri, casDownload, digest, out, 0);
3440
}
3541

3642
public URI uri() {
@@ -48,4 +54,8 @@ public Digest digest() {
4854
public OutputStream out() {
4955
return out;
5056
}
57+
58+
public long offset() {
59+
return offset;
60+
}
5161
}

src/main/java/com/google/devtools/build/lib/remote/http/HttpCacheClient.java

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import com.google.common.util.concurrent.MoreExecutors;
2626
import com.google.common.util.concurrent.SettableFuture;
2727
import com.google.devtools.build.lib.authandtls.AuthAndTLSOptions;
28+
import com.google.devtools.build.lib.remote.RemoteRetrier;
2829
import com.google.devtools.build.lib.remote.common.CacheNotFoundException;
2930
import com.google.devtools.build.lib.remote.common.RemoteActionExecutionContext;
3031
import com.google.devtools.build.lib.remote.common.RemoteCacheClient;
@@ -81,8 +82,10 @@
8182
import java.util.List;
8283
import java.util.Map.Entry;
8384
import java.util.NoSuchElementException;
85+
import java.util.Optional;
8486
import java.util.concurrent.TimeUnit;
8587
import java.util.concurrent.atomic.AtomicBoolean;
88+
import java.util.concurrent.atomic.AtomicLong;
8689
import java.util.function.Function;
8790
import java.util.regex.Pattern;
8891
import javax.annotation.Nullable;
@@ -129,6 +132,7 @@ public final class HttpCacheClient implements RemoteCacheClient {
129132
private final boolean useTls;
130133
private final boolean verifyDownloads;
131134
private final DigestUtil digestUtil;
135+
private final RemoteRetrier retrier;
132136

133137
private final Object closeLock = new Object();
134138

@@ -150,6 +154,7 @@ public static HttpCacheClient create(
150154
boolean verifyDownloads,
151155
ImmutableList<Entry<String, String>> extraHttpHeaders,
152156
DigestUtil digestUtil,
157+
RemoteRetrier retrier,
153158
@Nullable final Credentials creds,
154159
AuthAndTLSOptions authAndTlsOptions)
155160
throws Exception {
@@ -162,6 +167,7 @@ public static HttpCacheClient create(
162167
verifyDownloads,
163168
extraHttpHeaders,
164169
digestUtil,
170+
retrier,
165171
creds,
166172
authAndTlsOptions,
167173
null);
@@ -175,6 +181,7 @@ public static HttpCacheClient create(
175181
boolean verifyDownloads,
176182
ImmutableList<Entry<String, String>> extraHttpHeaders,
177183
DigestUtil digestUtil,
184+
RemoteRetrier retrier,
178185
@Nullable final Credentials creds,
179186
AuthAndTLSOptions authAndTlsOptions)
180187
throws Exception {
@@ -189,6 +196,7 @@ public static HttpCacheClient create(
189196
verifyDownloads,
190197
extraHttpHeaders,
191198
digestUtil,
199+
retrier,
192200
creds,
193201
authAndTlsOptions,
194202
domainSocketAddress);
@@ -202,6 +210,7 @@ public static HttpCacheClient create(
202210
verifyDownloads,
203211
extraHttpHeaders,
204212
digestUtil,
213+
retrier,
205214
creds,
206215
authAndTlsOptions,
207216
domainSocketAddress);
@@ -219,6 +228,7 @@ private HttpCacheClient(
219228
boolean verifyDownloads,
220229
ImmutableList<Entry<String, String>> extraHttpHeaders,
221230
DigestUtil digestUtil,
231+
RemoteRetrier retrier,
222232
@Nullable final Credentials creds,
223233
AuthAndTLSOptions authAndTlsOptions,
224234
@Nullable SocketAddress socketAddress)
@@ -284,6 +294,7 @@ public void channelCreated(Channel ch) {
284294
this.extraHttpHeaders = extraHttpHeaders;
285295
this.verifyDownloads = verifyDownloads;
286296
this.digestUtil = digestUtil;
297+
this.retrier = retrier;
287298
}
288299

289300
@SuppressWarnings("FutureReturnValueIgnored")
@@ -441,8 +452,11 @@ public ListenableFuture<Void> downloadBlob(
441452
RemoteActionExecutionContext context, Digest digest, OutputStream out) {
442453
final DigestOutputStream digestOut =
443454
verifyDownloads ? digestUtil.newDigestOutputStream(out) : null;
455+
final AtomicLong casBytesDownloaded = new AtomicLong();
444456
return Futures.transformAsync(
445-
get(digest, digestOut != null ? digestOut : out, /* casDownload= */ true),
457+
retrier.executeAsync(
458+
() ->
459+
get(digest, digestOut != null ? digestOut : out, Optional.of(casBytesDownloaded))),
446460
(v) -> {
447461
try {
448462
if (digestOut != null) {
@@ -458,7 +472,8 @@ public ListenableFuture<Void> downloadBlob(
458472
}
459473

460474
@SuppressWarnings("FutureReturnValueIgnored")
461-
private ListenableFuture<Void> get(Digest digest, final OutputStream out, boolean casDownload) {
475+
private ListenableFuture<Void> get(
476+
Digest digest, final OutputStream out, Optional<AtomicLong> casBytesDownloaded) {
462477
final AtomicBoolean dataWritten = new AtomicBoolean();
463478
OutputStream wrappedOut =
464479
new OutputStream() {
@@ -469,12 +484,18 @@ private ListenableFuture<Void> get(Digest digest, final OutputStream out, boolea
469484
@Override
470485
public void write(byte[] b, int offset, int length) throws IOException {
471486
dataWritten.set(true);
487+
if (casBytesDownloaded.isPresent()) {
488+
casBytesDownloaded.get().addAndGet(length);
489+
}
472490
out.write(b, offset, length);
473491
}
474492

475493
@Override
476494
public void write(int b) throws IOException {
477495
dataWritten.set(true);
496+
if (casBytesDownloaded.isPresent()) {
497+
casBytesDownloaded.get().incrementAndGet();
498+
}
478499
out.write(b);
479500
}
480501

@@ -483,7 +504,12 @@ public void flush() throws IOException {
483504
out.flush();
484505
}
485506
};
486-
DownloadCommand downloadCmd = new DownloadCommand(uri, casDownload, digest, wrappedOut);
507+
long offset = 0;
508+
if (casBytesDownloaded.isPresent()) {
509+
offset = casBytesDownloaded.get().get();
510+
}
511+
DownloadCommand downloadCmd =
512+
new DownloadCommand(uri, casBytesDownloaded.isPresent(), digest, wrappedOut, offset);
487513
SettableFuture<Void> outerF = SettableFuture.create();
488514
acquireDownloadChannel()
489515
.addListener(
@@ -575,8 +601,11 @@ private void getAfterCredentialRefresh(DownloadCommand cmd, SettableFuture<Void>
575601
public ListenableFuture<CachedActionResult> downloadActionResult(
576602
RemoteActionExecutionContext context, ActionKey actionKey, boolean inlineOutErr) {
577603
return Futures.transform(
578-
Utils.downloadAsActionResult(
579-
actionKey, (digest, out) -> get(digest, out, /* casDownload= */ false)),
604+
retrier.executeAsync(
605+
() ->
606+
Utils.downloadAsActionResult(
607+
actionKey,
608+
(digest, out) -> get(digest, out, /* casBytesDownloaded= */ Optional.empty()))),
580609
CachedActionResult::remote,
581610
MoreExecutors.directExecutor());
582611
}
@@ -670,20 +699,28 @@ private void uploadAfterCredentialRefresh(UploadCommand upload, SettableFuture<V
670699
@Override
671700
public ListenableFuture<Void> uploadFile(
672701
RemoteActionExecutionContext context, Digest digest, Path file) {
673-
try {
674-
return uploadAsync(
675-
digest.getHash(), digest.getSizeBytes(), file.getInputStream(), /* casUpload= */ true);
676-
} catch (IOException e) {
677-
// Can be thrown from file.getInputStream.
678-
return Futures.immediateFailedFuture(e);
679-
}
702+
return retrier.executeAsync(
703+
() -> {
704+
try {
705+
return uploadAsync(
706+
digest.getHash(),
707+
digest.getSizeBytes(),
708+
file.getInputStream(),
709+
/* casUpload= */ true);
710+
} catch (IOException e) {
711+
// Can be thrown from file.getInputStream.
712+
return Futures.immediateFailedFuture(e);
713+
}
714+
});
680715
}
681716

682717
@Override
683718
public ListenableFuture<Void> uploadBlob(
684719
RemoteActionExecutionContext context, Digest digest, ByteString data) {
685-
return uploadAsync(
686-
digest.getHash(), digest.getSizeBytes(), data.newInput(), /* casUpload= */ true);
720+
return retrier.executeAsync(
721+
() ->
722+
uploadAsync(
723+
digest.getHash(), digest.getSizeBytes(), data.newInput(), /* casUpload= */ true));
687724
}
688725

689726
@Override

0 commit comments

Comments
 (0)