Skip to content

Commit

Permalink
Fix a race condition in Bazel's Windows process management.
Browse files Browse the repository at this point in the history
(Second version of this PR that fixes the issues from the last time and refactors the code a bit to make it easier to understand and maintain.)

While we did correctly terminate the job object of a process when calling Subprocess.destroy and we did correctly wait for the process itself to exit in Subprocess.waitFor, we didn't wait for its children to exit.

This is usually not a big problem, except when they keep open some file handles to files that Bazel immediately tris to delete after killing / waiting for the main process, in which case the files might still be in use. This is the reason why we're seeing errors like this on CI:

https://buildkite.com/bazel/bazel-bazel/builds/5774#27afa2c9-9ff1-4224-9df7-ba0253e7e317/193-303

"Caught I/O exception: Cannot delete path (something below $TEST_TMPDIR): Access is denied."

This CL modifies the behavior to explicitly terminate the job object in Subprocess.waitFor when the main process has exited and then wait for all transitive subprocesses to exit, before continuing.

Due to some interesting semantics, one cannot simply use WaitForSingleObject on the Job handle, instead one has to use an IoCompletionPort that gets a signal when the last job in the group has exited: https://blogs.msdn.microsoft.com/oldnewthing/20130405-00/?p=4743

Closes #6860.

PiperOrigin-RevId: 229371044
  • Loading branch information
philwo authored and Copybara-Service committed Jan 15, 2019
1 parent d5aa8f6 commit 4473bb1
Show file tree
Hide file tree
Showing 2 changed files with 633 additions and 471 deletions.
Expand Up @@ -14,17 +14,17 @@

package com.google.devtools.build.lib.windows;

import com.google.common.base.Throwables;
import com.google.devtools.build.lib.shell.Subprocess;
import com.google.devtools.build.lib.windows.jni.WindowsProcesses;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

/**
Expand All @@ -34,6 +34,11 @@ public class WindowsSubprocess implements Subprocess {
// For debugging purposes.
private String commandLine;

private static enum WaitResult {
SUCCESS,
TIMEOUT
}

/**
* Output stream for writing to the stdin of a Windows process.
*/
Expand Down Expand Up @@ -125,9 +130,9 @@ public Thread newThread(Runnable runnable) {
private final OutputStream stdinStream;
private final ProcessInputStream stdoutStream;
private final ProcessInputStream stderrStream;
private final CountDownLatch waitLatch;
private final Future<WaitResult> processFuture;
private final long timeoutMillis;
private final AtomicBoolean timedout = new AtomicBoolean(false);
private boolean timedout = false;

WindowsSubprocess(long nativeProcess, String commandLine, boolean stdoutRedirected,
boolean stderrRedirected, long timeoutMillis) {
Expand All @@ -140,35 +145,34 @@ public Thread newThread(Runnable runnable) {
stderrStream =
stderrRedirected ? null : new ProcessInputStream(WindowsProcesses.getStderr(nativeProcess));
stdinStream = new ProcessOutputStream();
waitLatch = new CountDownLatch(1);
// Every Windows process we start consumes a thread here. This is suboptimal, but seems to be
// the sanest way to reconcile WaitForMultipleObjects() and Java-style interruption.
@SuppressWarnings("unused")
Future<?> possiblyIgnoredError = WAITER_POOL.submit(this::waiterThreadFunc);
processFuture = WAITER_POOL.submit(this::waiterThreadFunc);
}

private void waiterThreadFunc() {
// Waits for the process to finish.
private WaitResult waiterThreadFunc() {
switch (WindowsProcesses.waitFor(nativeProcess, timeoutMillis)) {
case 0:
// Excellent, process finished in time.
break;
return WaitResult.SUCCESS;

case 1:
// Timeout. Terminate the process if we can.
timedout.set(true);
WindowsProcesses.terminate(nativeProcess);
break;
// Timeout. We don't need to call `terminate` here, because waitFor
// automatically terminates the process in case of a timeout.
return WaitResult.TIMEOUT;

case 2:
default:
// Error. There isn't a lot we can do -- the process is still alive but
// WaitForMultipleObjects() failed for some odd reason. We'll pretend it terminated and
// log a message to jvm.out .
System.err.println(
"Waiting for process " + WindowsProcesses.getProcessPid(nativeProcess) + " failed");
break;
// WaitForSingleObject() failed for some odd reason. This should
// basically never happen, but if it does... let's get a stack trace.
String errorMessage = WindowsProcesses.processGetLastError(nativeProcess);
throw new IllegalStateException(
"Waiting for process "
+ WindowsProcesses.getProcessPid(nativeProcess)
+ " failed: "
+ errorMessage);
}

waitLatch.countDown();
}

@Override
Expand Down Expand Up @@ -200,17 +204,24 @@ public synchronized int exitValue() {

@Override
public boolean finished() {
return waitLatch.getCount() == 0;
return processFuture.isDone();
}

@Override
public boolean timedout() {
return timedout.get();
return timedout;
}

@Override
public void waitFor() throws InterruptedException {
waitLatch.await();
try {
timedout = processFuture.get() == WaitResult.TIMEOUT;
} catch (ExecutionException e) {
Throwables.throwIfUnchecked(e.getCause());
// This should never happen, because waiterThreadFunc does not throw any
// checked exceptions.
throw new IllegalStateException("Unexpected exception", e);
}
}

@Override
Expand Down

0 comments on commit 4473bb1

Please sign in to comment.