Permalink
Browse files

Raise the soft limits on the number of processes and open files to their hard limits.

Under macOS, the default soft resource limits for open files and concurrent
processes are pretty low, but their corresponding hard defaults are
reasonable.  Because the soft limits are low, Bazel sometimes fails during
large builds -- not because of Bazel itself, but because the executed
actions do "too much work" or because the --jobs setting was high enough
to cause all parallel tasks to exceed the limits.

Instead of trying to fix the actions themselves, start by trying to raise
the system limits as a best-effort operation.  And, given that this code
is fairly portable, try to do it on all POSIX systems and not just macOS.
Note that, for non-macOS systems, this might still not do what's promised
in all circumstances because I'm currently only implementing
GetExplicitSystemLimit on macOS.

RELNOTES: None.
PiperOrigin-RevId: 161401482
  • Loading branch information...
jmmv authored and laszlocsomor committed Jul 10, 2017
1 parent 1fb46ce commit a96369c1c66df624bbee0d9e1c7ad38fea55cd26
@@ -1334,6 +1334,12 @@ int Main(int argc, const char *argv[], WorkspaceLayout *workspace_layout,
globals = new GlobalVariables(option_processor);
blaze::SetupStdStreams();

// Best-effort operation to raise the resource limits from soft to hard. We
// do this early during the main program instead of just before execing the
// Blaze server binary, because it's easier (for testing purposes) and because
// the Blaze client also benefits from this (e.g. during installation).
UnlimitResources();

// Must be done before command line parsing.
ComputeWorkspace(workspace_layout);
globals->binary_path = CheckAndGetBinaryPath(argv[0]);
@@ -12,17 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/main/cpp/blaze_util_platform.h"

#include <sys/types.h>
#include <sys/resource.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/un.h>

#include <libproc.h>
#include <signal.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

#include <CoreFoundation/CoreFoundation.h>

#include <cerrno>
#include <cstdio>
#include <cstring>

#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/blaze_util_platform.h"
#include "src/main/cpp/util/errors.h"
#include "src/main/cpp/util/exit_code.h"
#include "src/main/cpp/util/file.h"
@@ -225,4 +234,33 @@ void ExcludePathFromBackup(const string &path) {
}
}

// Returns the explicit kernel-level limit for the given resource, as reported
// by sysctl.
//
// "resource" is one of the RLIMIT_* constants. Only RLIMIT_NOFILE and
// RLIMIT_NPROC are mapped to a sysctl property; any other value (including
// the -1 probe value) yields 0.
//
// Returns the limit on success, or 0 if the resource is not supported or the
// sysctl value could not be read. A warning is printed to stderr when the
// sysctl query itself fails or returns data of an unexpected size.
int32_t GetExplicitSystemLimit(const int resource) {
  const char* sysctl_name;
  switch (resource) {
    case RLIMIT_NOFILE:
      sysctl_name = "kern.maxfilesperproc";
      break;
    case RLIMIT_NPROC:
      sysctl_name = "kern.maxprocperuid";
      break;
    default:
      // No sysctl property is known for this resource.
      return 0;
  }

  int32_t limit;
  size_t len = sizeof(limit);
  if (sysctlbyname(sysctl_name, &limit, &len, nullptr, 0) == -1) {
    fprintf(stderr, "Warning: failed to get value of sysctl %s: %s\n",
            sysctl_name, std::strerror(errno));
    return 0;
  }
  if (len != sizeof(limit)) {
    // Both values are size_t, so they must be printed with the unsigned %zu
    // conversion rather than the signed %zd.
    fprintf(stderr, "Warning: failed to get value of sysctl %s: returned "
            "data length %zu did not match expected size %zu\n",
            sysctl_name, len, sizeof(limit));
    return 0;
  }
  return limit;
}

} // namespace blaze.
@@ -168,4 +168,8 @@ bool KillServerProcess(int pid) {
void ExcludePathFromBackup(const string &path) {
}

// Stub: explicit system limits are not queried in this implementation, so
// every resource reports the "not implemented" sentinel.
int32_t GetExplicitSystemLimit(const int resource) {
  static_cast<void>(resource);  // Intentionally ignored.
  const int32_t not_implemented = -1;
  return not_implemented;
}

} // namespace blaze
@@ -273,4 +273,8 @@ bool KillServerProcess(int pid) {
void ExcludePathFromBackup(const string &path) {
}

// Stub: there is no explicit system limit lookup in this implementation. The
// requested resource is ignored and -1 ("not implemented") is always returned.
int32_t GetExplicitSystemLimit(const int resource) {
  static_cast<void>(resource);  // Intentionally ignored.
  const int32_t not_implemented = -1;
  return not_implemented;
}

} // namespace blaze
@@ -211,6 +211,22 @@ bool IsStderrStandardTerminal();
// connected, or 80 if there is no such terminal.
int GetStderrTerminalColumns();

// Gets the system-wide explicit limit for the given resource.
//
// The resource is one of the RLIMIT_* constants defined in sys/resource.h.
//
// Return value:
//   > 0  the explicit limit for the resource.
//   0    the limit could not be fetched.
//   -1   the function is not implemented for this platform.
//
// Both sentinel values are non-positive, so callers should keep the result in
// a signed type before comparing it; converting it to an unsigned type first
// would hide the -1 sentinel.
//
// It is OK to call this function with a parameter of -1 to check if the
// function is implemented for the platform.
int32_t GetExplicitSystemLimit(const int resource);

// Raises soft system resource limits to hard limits in an attempt to let
// large builds work. This is a best-effort operation and may or may not be
// implemented for a given platform; failures are not fatal and callers may
// ignore the result.
//
// Returns true if all limits were properly raised (or if there was nothing to
// do); false otherwise.
bool UnlimitResources();

} // namespace blaze

#endif // BAZEL_SRC_MAIN_CPP_BLAZE_UTIL_PLATFORM_H_
@@ -15,6 +15,7 @@
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h> // PATH_MAX
#include <poll.h>
#include <pwd.h>
@@ -26,9 +27,12 @@
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cassert>

#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/blaze_util_platform.h"
#include "src/main/cpp/global_variables.h"
@@ -684,4 +688,58 @@ int GetStderrTerminalColumns() {
return 80; // default if not a terminal.
}

// Raises a resource limit to the maximum allowed value.
//
// This function raises the limit of the resource given in "resource" (one of
// the RLIMIT_* constants) from its soft limit to its hard limit. If the hard
// limit is unlimited, uses the explicit system limit reported by
// GetExplicitSystemLimit() instead, because setting the soft limit to
// unlimited may not work.
//
// Returns true if the soft limit already matched the hard limit or was raised
// successfully; false if the limits could not be read or updated.
//
// Note that this is a best-effort operation. Any failure during this process
// will result in a warning but execution will continue.
static bool UnlimitResource(const int resource) {
  struct rlimit rl;
  if (getrlimit(resource, &rl) == -1) {
    fprintf(stderr, "Warning: failed to get resource limit %d: %s\n", resource,
            strerror(errno));
    return false;
  }

  if (rl.rlim_cur == rl.rlim_max) {
    // Nothing to do. Return early to prevent triggering any warnings caused by
    // the code below. This way, we will only show warnings the first time the
    // Blaze server is started and not on each command invocation.
    return true;
  }

  rl.rlim_cur = rl.rlim_max;
  if (rl.rlim_cur == RLIM_INFINITY) {
    // Keep the result in its signed type: rlim_t may be unsigned, in which
    // case the -1 "not implemented" sentinel would turn into a huge positive
    // value and be mistaken for a real limit.
    const int32_t explicit_limit = GetExplicitSystemLimit(resource);
    if (explicit_limit > 0) {
      rl.rlim_cur = static_cast<rlim_t>(explicit_limit);
    }
    // Otherwise the lookup is not implemented (-1) or failed (0): leave the
    // soft limit equal to the hard one and try to apply it. This might fail,
    // but it's good to try anyway.
  }

  if (setrlimit(resource, &rl) == -1) {
    fprintf(stderr, "Warning: failed to raise resource limit %d to %" PRIdMAX
            ": %s\n", resource, static_cast<intmax_t>(rl.rlim_cur),
            strerror(errno));
    return false;
  }

  return true;
}

// Attempts to raise the soft limits for open files and for processes to their
// maximum. Both resources are always attempted, even if the first one fails.
// Returns true only if both were raised successfully.
bool UnlimitResources() {
  const bool files_raised = UnlimitResource(RLIMIT_NOFILE);
  const bool procs_raised = UnlimitResource(RLIMIT_NPROC);
  return files_raised && procs_raised;
}

} // namespace blaze.
@@ -1435,4 +1435,8 @@ int GetStderrTerminalColumns() {
return 80; // default if not a terminal.
}

// Stub: no resource limits are adjusted in this implementation, so there is
// nothing that can fail and success is reported unconditionally.
bool UnlimitResources() {
  const bool nothing_failed = true;
  return nothing_failed;
}

} // namespace blaze
@@ -23,6 +23,7 @@ cc_test(
],
"//conditions:default": [
"blaze_util_test.cc",
"blaze_util_posix_test.cc",
],
}),
deps = [
@@ -0,0 +1,181 @@
// Copyright 2017 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <sys/types.h>
#include <sys/resource.h>
#include <sys/wait.h>

#include <inttypes.h>
#include <unistd.h>

#include <cerrno>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "src/main/cpp/blaze_util.h"
#include "src/main/cpp/blaze_util_platform.h"
#include "gtest/gtest.h"

namespace blaze {

// Test fixture for the UnlimitResources function.
//
// The test cases in this fixture are special because the setup forks a
// subprocess and the actual testing is supposed to happen in such subprocess.
// This is because resource limits are process-wide so we must ensure that our
// testing does not interfere with other tests in this fixture or with other
// tests in the whole test program.
//
// What this means is that each test case function must check if IsChild() is
// false first. If it is, the function must return immediately. If it is not,
// then the function can proceed to execute the test but care must be taken: the
// test function cannot use any of the gunit functions, nor it can use std::exit
// to terminate. Instead, the function must use Die() to exit on a failure.
class UnlimitResourcesTest : public testing::Test {
 protected:
  // Forks the subprocess in which the body of the test case is meant to run.
  // A fork failure is recorded as a test failure and leaves pid_ set to -1.
  UnlimitResourcesTest() {
    pid_ = fork();
    EXPECT_NE(-1, pid_);
  }

  // In the child, terminates the subprocess successfully (a test body that
  // got this far did not call Die()). In the parent, reaps the child and
  // fails the test case unless the child exited with EXIT_SUCCESS.
  ~UnlimitResourcesTest() override {
    if (IsChild()) {
      _exit(EXIT_SUCCESS);
    }

    if (pid_ == -1) {
      // fork() failed and the constructor already reported it. There is no
      // child to reap, and waitpid(-1, ...) would wait for *any* child of
      // this process instead.
      return;
    }

    int status = 0;
    const pid_t reaped = waitpid(pid_, &status, 0);
    EXPECT_NE(-1, reaped);
    if (reaped == -1) {
      // status was not filled in; do not inspect it.
      return;
    }
    EXPECT_TRUE(WIFEXITED(status));
    if (WIFEXITED(status)) {
      // The exit status is only meaningful for a normal termination.
      EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
    }
  }

  // Returns true if the test function is running in the child subprocess.
  bool IsChild() {
    return pid_ == 0;
  }

  // Description of the resource limits to test for. The table is terminated
  // by an entry whose name is nullptr.
  static struct limits_spec {
    const char* name;
    const int resource;
  } limits_[];

  // Aborts execution with the given printf-style message and fails the test
  // case by exiting the subprocess with EXIT_FAILURE.
  // This can only be called when IsChild() is true.
  static void Die(const char* fmt, ...) ATTRIBUTE_NORETURN {
    va_list ap;
    va_start(ap, fmt);
    std::vfprintf(stderr, fmt, ap);
    va_end(ap);
    _exit(EXIT_FAILURE);
  }

  // Version of getrlimit(3) that fails the test on error.
  // This can only be called when IsChild() is true.
  static struct rlimit GetrlimitOrDie(const int resource) {
    struct rlimit rl;
    if (getrlimit(resource, &rl) == -1) {
      Die("getrlimit(%d) failed: %s\n", resource, std::strerror(errno));
    }
    return rl;
  }

  // Version of setrlimit(3) that fails the test on error.
  // This can only be called when IsChild() is true.
  static void SetrlimitOrDie(const int resource, struct rlimit rl) {
    if (setrlimit(resource, &rl) == -1) {
      Die("setrlimit(%d) failed with cur=%" PRIdMAX ", max=%" PRIdMAX ": %s\n",
          resource, static_cast<intmax_t>(rl.rlim_cur),
          static_cast<intmax_t>(rl.rlim_max), std::strerror(errno));
    }
  }

 private:
  // PID of the test subprocess, 0 if we are the subprocess, or -1 if the
  // subprocess could not be created.
  pid_t pid_;
};

struct UnlimitResourcesTest::limits_spec UnlimitResourcesTest::limits_[] = {
{ "RLIMIT_NOFILE", RLIMIT_NOFILE },
{ "RLIMIT_NPROC", RLIMIT_NPROC },
{ nullptr, 0 },
};

// Verifies that, with finite hard limits in place, UnlimitResources raises
// each soft limit all the way up to its hard limit.
TEST_F(UnlimitResourcesTest, SuccessWithExplicitLimits) {
  // Everything below runs only in the forked subprocess, so it must report
  // failures through Die() and not through the test framework.
  if (!IsChild()) {
    return;
  }

  // Start from small, finite soft and hard limits that should always be
  // accepted.
  for (struct limits_spec* spec = limits_; spec->name != nullptr; ++spec) {
    struct rlimit lowered = GetrlimitOrDie(spec->resource);
    lowered.rlim_cur = 1;
    lowered.rlim_max = 8;
    SetrlimitOrDie(spec->resource, lowered);
  }

  const bool raised = blaze::UnlimitResources();
  if (!raised) {
    Die("UnlimitResources returned error; see output for diagnostics\n");
  }

  // Every soft limit must now match the hard limit configured above.
  for (struct limits_spec* spec = limits_; spec->name != nullptr; ++spec) {
    const struct rlimit current = GetrlimitOrDie(spec->resource);
    if (current.rlim_cur == current.rlim_max) {
      continue;
    }
    Die("UnlimitResources did not increase the soft %s to its hard limit\n",
        spec->name);
  }
}

// Verifies that, when the hard limits are (presumably) infinite, the soft
// limits end up at some explicit finite value rather than staying at the
// lowered value or becoming infinite.
TEST_F(UnlimitResourcesTest, SuccessWithPossiblyInfiniteLimits) {
  // Everything below runs only in the forked subprocess, so it must report
  // failures through Die() and not through the test framework.
  if (!IsChild()) {
    return;
  }

  const bool lookup_unimplemented = GetExplicitSystemLimit(-1) == -1;
  if (lookup_unimplemented) {
    fprintf(stderr, "GetExplicitSystemLimit not implemented for this platform; "
            "cannot verify the behavior of UnlimitResources\n");
    return;
  }

  // Drop only the soft limits and leave the hard limits alone, on the
  // assumption that they are infinite; we may lack the permissions needed to
  // raise them ourselves, so a finite hard limit is only reported.
  for (struct limits_spec* spec = limits_; spec->name != nullptr; ++spec) {
    struct rlimit lowered = GetrlimitOrDie(spec->resource);
    const bool hard_is_finite = lowered.rlim_max != RLIM_INFINITY;
    if (hard_is_finite) {
      fprintf(stderr, "Hard resource limit for %s is not infinity; will not "
              "be able to meaningfully test anything\n", spec->name);
    }
    lowered.rlim_cur = 1;
    SetrlimitOrDie(spec->resource, lowered);
  }

  const bool raised = blaze::UnlimitResources();
  if (!raised) {
    Die("UnlimitResources returned error; see output for diagnostics\n");
  }

  // The soft limits must have moved away from the lowered value and must be
  // an explicit number, not infinity.
  for (struct limits_spec* spec = limits_; spec->name != nullptr; ++spec) {
    const struct rlimit current = GetrlimitOrDie(spec->resource);
    const bool still_lowered = current.rlim_cur == 1;
    const bool is_infinite = current.rlim_cur == RLIM_INFINITY;
    if (still_lowered || is_infinite) {
      Die("UnlimitResources did not increase the soft %s to the system limit\n",
          spec->name);
    }
  }
}

} // namespace blaze

0 comments on commit a96369c

Please sign in to comment.