From b4dbed0c599862cc3886fc644cda95d0a38b4f70 Mon Sep 17 00:00:00 2001 From: Googler Date: Thu, 18 Aug 2022 08:47:46 -0700 Subject: [PATCH] Add system load average to profile, guarded by --experimental_collect_load_average_in_profiler flag. PiperOrigin-RevId: 468468092 Change-Id: Ic69b06efa15e3cb85a22b4c5b3561a8be2aba22c --- .../profiler/CollectLocalResourceUsage.java | 36 ++++++-- .../devtools/build/lib/profiler/Profiler.java | 19 ++-- .../build/lib/profiler/ProfilerTask.java | 1 + .../build/lib/runtime/BlazeRuntime.java | 2 +- .../lib/runtime/CommonCommandOptions.java | 10 ++- .../buildtool/util/BlazeRuntimeWrapper.java | 1 + .../build/lib/profiler/ProfilerTest.java | 19 +++- src/test/shell/integration/profiler_test.sh | 88 +++++++++++++++++++ 8 files changed, 161 insertions(+), 15 deletions(-) create mode 100755 src/test/shell/integration/profiler_test.sh diff --git a/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java b/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java index 4653899083c844..8cd3842ce1d9a9 100644 --- a/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java +++ b/src/main/java/com/google/devtools/build/lib/profiler/CollectLocalResourceUsage.java @@ -28,6 +28,7 @@ import java.lang.management.ManagementFactory; import java.lang.management.MemoryMXBean; import java.time.Duration; +import java.util.Objects; import java.util.concurrent.TimeUnit; /** Thread to collect local resource usage data and log into JSON profile. */ @@ -38,6 +39,7 @@ public class CollectLocalResourceUsage extends Thread { private final BugReporter bugReporter; private final boolean collectWorkerDataInProfiler; + private final boolean collectLoadAverage; private volatile boolean stopLocalUsageCollection; private volatile boolean profilingStarted; @@ -57,6 +59,9 @@ public class CollectLocalResourceUsage extends Thread { @GuardedBy("this") private TimeSeries workersMemoryUsage; + @GuardedBy("this") + private TimeSeries systemLoadAverage; + private Stopwatch stopwatch; private final WorkerMetricsCollector workerMetricsCollector; @@ -64,10 +69,12 @@ public class CollectLocalResourceUsage extends Thread { CollectLocalResourceUsage( BugReporter bugReporter, WorkerMetricsCollector workerMetricsCollector, - boolean collectWorkerDataInProfiler) { + boolean collectWorkerDataInProfiler, + boolean collectLoadAverage) { this.bugReporter = checkNotNull(bugReporter); this.collectWorkerDataInProfiler = collectWorkerDataInProfiler; this.workerMetricsCollector = workerMetricsCollector; + this.collectLoadAverage = collectLoadAverage; } @Override @@ -92,6 +99,11 @@ public void run() { new TimeSeries( /* startTimeMillis= */ stopwatch.elapsed().toMillis(), BUCKET_DURATION.toMillis()); } + if (collectLoadAverage) { + systemLoadAverage = + new TimeSeries( + /* startTimeMillis= */ stopwatch.elapsed().toMillis(), BUCKET_DURATION.toMillis()); + } } OperatingSystemMXBean osBean = (OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean(); @@ -108,8 +120,8 @@ public void run() { Duration nextElapsed = stopwatch.elapsed(); long nextCpuTimeNanos = osBean.getProcessCpuTime(); - double systemLoad = osBean.getSystemCpuLoad(); - double systemUsage = systemLoad * numProcessors; + double systemCpuLoad = osBean.getSystemCpuLoad(); + double systemUsage = systemCpuLoad * numProcessors; long systemMemoryUsageMb = -1; if (OS.getCurrent() == OS.LINUX) { @@ -147,11 +159,15 @@ public void run() { workerMemoryUsageMb = this.workerMetricsCollector.collectMetrics().stream() .map(WorkerMetric::getWorkerStat) - .filter(x -> x != null) + .filter(Objects::nonNull) .mapToInt(WorkerMetric.WorkerStat::getUsedMemoryInKB) .sum() / 1024; } + double loadAverage = 0; + if (collectLoadAverage) { + loadAverage = osBean.getSystemLoadAverage(); + } double deltaNanos = nextElapsed.minus(previousElapsed).toNanos(); double cpuLevel = (nextCpuTimeNanos - previousCpuTimeNanos) / deltaNanos; @@ -173,7 +189,11 @@ public void run() { } if (collectWorkerDataInProfiler && (workersMemoryUsage != null)) { workersMemoryUsage.addRange( - previousElapsed.toMillis(), nextElapsed.toMillis(), (double) workerMemoryUsageMb); + previousElapsed.toMillis(), nextElapsed.toMillis(), workerMemoryUsageMb); + } + if (collectLoadAverage && loadAverage > 0) { + systemLoadAverage.addRange( + previousElapsed.toMillis(), nextElapsed.toMillis(), loadAverage); } } previousElapsed = nextElapsed; @@ -217,6 +237,12 @@ synchronized void logCollectedData() { profiler, workersMemoryUsage, ProfilerTask.WORKERS_MEMORY_USAGE, startTimeNanos, len); } workersMemoryUsage = null; + + if (collectLoadAverage) { + logCollectedData( + profiler, systemLoadAverage, ProfilerTask.SYSTEM_LOAD_AVERAGE, startTimeNanos, len); + } + systemLoadAverage = null; } private static void logCollectedData( diff --git a/src/main/java/com/google/devtools/build/lib/profiler/Profiler.java b/src/main/java/com/google/devtools/build/lib/profiler/Profiler.java index eaa0641994d959..ed7e03e5bee257 100644 --- a/src/main/java/com/google/devtools/build/lib/profiler/Profiler.java +++ b/src/main/java/com/google/devtools/build/lib/profiler/Profiler.java @@ -375,6 +375,7 @@ public static Duration getProcessCpuTimeMaybe() { * some tasks may get aggregated if they finished quick enough * @param clock a {@code BlazeClock.instance()} * @param execStartTimeNanos execution start time in nanos obtained from {@code clock.nanoTime()} + * @param collectLoadAverage If true, collects system load average (as seen in uptime(1)) */ public synchronized void start( ImmutableSet profiledTasks, @@ -390,6 +391,7 @@ public synchronized void start( boolean includeTargetLabel, boolean collectTaskHistograms, boolean collectWorkerDataInProfiler, + boolean collectLoadAverage, WorkerMetricsCollector workerMetricsCollector, BugReporter bugReporter) throws IOException { @@ -448,7 +450,7 @@ public synchronized void start( // Start collecting Bazel and system-wide CPU metric collection. resourceUsageThread = new CollectLocalResourceUsage( - bugReporter, workerMetricsCollector, collectWorkerDataInProfiler); + bugReporter, workerMetricsCollector, collectWorkerDataInProfiler, collectLoadAverage); resourceUsageThread.setDaemon(true); resourceUsageThread.start(); } @@ -1157,7 +1159,8 @@ public void run() { || data.type == ProfilerTask.ACTION_COUNTS || data.type == ProfilerTask.SYSTEM_CPU_USAGE || data.type == ProfilerTask.SYSTEM_MEMORY_USAGE - || data.type == ProfilerTask.WORKERS_MEMORY_USAGE) { + || data.type == ProfilerTask.WORKERS_MEMORY_USAGE + || data.type == ProfilerTask.SYSTEM_LOAD_AVERAGE) { // Skip counts equal to zero. They will show up as a thin line in the profile. if ("0.0".equals(data.description)) { continue; @@ -1166,12 +1169,10 @@ public void run() { writer.beginObject(); writer.setIndent(""); writer.name("name").value(data.type.description); - if (data.type == ProfilerTask.LOCAL_MEMORY_USAGE) { - writer.name("cname").value("olive"); - } // Pick acceptable counter colors manually, unfortunately we have to pick from these - // weird reserved names. + // weird reserved names from + // https://github.com/catapult-project/catapult/blob/master/tracing/tracing/base/color_scheme.html switch (data.type) { case LOCAL_CPU_USAGE: writer.name("cname").value("good"); @@ -1188,6 +1189,9 @@ public void run() { case WORKERS_MEMORY_USAGE: writer.name("cname").value("rail_animation"); break; + case SYSTEM_LOAD_AVERAGE: + writer.name("cname").value("generic_work"); + break; default: // won't happen } @@ -1220,6 +1224,9 @@ public void run() { case WORKERS_MEMORY_USAGE: writer.name("workers memory").value(data.description); break; + case SYSTEM_LOAD_AVERAGE: + writer.name("load").value(data.description); + break; default: // won't happen } diff --git a/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java b/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java index 9bfea6cc8d4692..64d0bbd88dcc60 100644 --- a/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java +++ b/src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java @@ -70,6 +70,7 @@ public enum ProfilerTask { LOCAL_MEMORY_USAGE("Memory usage (Bazel)"), SYSTEM_MEMORY_USAGE("Memory usage (total)"), WORKERS_MEMORY_USAGE("Workers memory usage"), + SYSTEM_LOAD_AVERAGE("System load average"), STARLARK_PARSER("Starlark Parser", Threshold.FIFTY_MILLIS), STARLARK_USER_FN("Starlark user function call", Threshold.FIFTY_MILLIS), STARLARK_BUILTIN_FN("Starlark builtin function call", Threshold.FIFTY_MILLIS), diff --git a/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java b/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java index 32c006da0326af..ee93b33ec0d44f 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/BlazeRuntime.java @@ -53,7 +53,6 @@ import com.google.devtools.build.lib.clock.Clock; import com.google.devtools.build.lib.events.Event; import com.google.devtools.build.lib.events.ExtendedEventHandler; -import com.google.devtools.build.lib.events.ExtendedEventHandler.Postable; import com.google.devtools.build.lib.events.OutputFilter; import com.google.devtools.build.lib.exec.BinTools; import com.google.devtools.build.lib.jni.JniLoader; @@ -399,6 +398,7 @@ ProfilerStartedEvent initProfiler( options.profileIncludeTargetLabel, options.alwaysProfileSlowOperations, options.collectWorkerDataInProfiler, + options.collectLoadAverageInProfiler, WorkerMetricsCollector.instance(), bugReporter); // Instead of logEvent() we're calling the low level function to pass the timings we took in diff --git a/src/main/java/com/google/devtools/build/lib/runtime/CommonCommandOptions.java b/src/main/java/com/google/devtools/build/lib/runtime/CommonCommandOptions.java index 942a61c0c7efe6..8e4ffdca330754 100644 --- a/src/main/java/com/google/devtools/build/lib/runtime/CommonCommandOptions.java +++ b/src/main/java/com/google/devtools/build/lib/runtime/CommonCommandOptions.java @@ -313,9 +313,17 @@ public String getTypeDescription() { defaultValue = "false", documentationCategory = OptionDocumentationCategory.UNDOCUMENTED, effectTags = {OptionEffectTag.AFFECTS_OUTPUTS, OptionEffectTag.BAZEL_MONITORING}, - help = "If enabled, profiler collects worker's aggregated resource data.") + help = "If enabled, the profiler collects worker's aggregated resource data.") public boolean collectWorkerDataInProfiler; + @Option( + name = "experimental_collect_load_average_in_profiler", + defaultValue = "false", + documentationCategory = OptionDocumentationCategory.UNDOCUMENTED, + effectTags = {OptionEffectTag.AFFECTS_OUTPUTS, OptionEffectTag.BAZEL_MONITORING}, + help = "If enabled, the profiler collects the system's overall load average.") + public boolean collectLoadAverageInProfiler; + @Option( name = "memory_profile", defaultValue = "null", diff --git a/src/test/java/com/google/devtools/build/lib/buildtool/util/BlazeRuntimeWrapper.java b/src/test/java/com/google/devtools/build/lib/buildtool/util/BlazeRuntimeWrapper.java index 17dcdab04149ec..0db96eb41bfab8 100644 --- a/src/test/java/com/google/devtools/build/lib/buildtool/util/BlazeRuntimeWrapper.java +++ b/src/test/java/com/google/devtools/build/lib/buildtool/util/BlazeRuntimeWrapper.java @@ -324,6 +324,7 @@ public void executeBuild(List targets) throws Exception { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), runtime.getBugReporter()); diff --git a/src/test/java/com/google/devtools/build/lib/profiler/ProfilerTest.java b/src/test/java/com/google/devtools/build/lib/profiler/ProfilerTest.java index f1dd686cbe6dd9..ebd59a8c62f758 100644 --- a/src/test/java/com/google/devtools/build/lib/profiler/ProfilerTest.java +++ b/src/test/java/com/google/devtools/build/lib/profiler/ProfilerTest.java @@ -110,6 +110,7 @@ private ByteArrayOutputStream start(ImmutableSet tasks, Profiler.F /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); return buffer; @@ -130,6 +131,7 @@ private void startUnbuffered(ImmutableSet tasks) throws IOExceptio /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); } @@ -235,6 +237,7 @@ public void testProfilerRecordingAllEvents() throws Exception { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); try (SilentCloseable c = profiler.profile(ProfilerTask.ACTION, "action task")) { @@ -316,6 +319,7 @@ public void testProfilerWorkerMetrics() throws Exception { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ true, + /*collectLoadAverage=*/ false, workerMetricsCollector, BugReporter.defaultInstance()); Thread.sleep(400); @@ -347,6 +351,7 @@ public void testProfilerRecordingOnlySlowestEvents() throws Exception { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); profiler.logSimpleTask(10000, 20000, ProfilerTask.VFS_STAT, "stat"); @@ -472,6 +477,7 @@ public void testProfilerRecordsNothing() throws Exception { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); profiler.logSimpleTask(10000, 20000, ProfilerTask.VFS_STAT, "stat"); @@ -609,10 +615,13 @@ public void testPhaseTasks() throws Exception { .hasSize(102); // thread2b + last task + finishing } - // Filter out CPU and memory usage events. These are non-deterministic depending on the duration + // Filter out CPU usage/memory usage/load average events. These are non-deterministic depending on + // the duration // of the profile. private static List removeUsageEvents(List events) { - return events.stream().filter(e -> !e.name().contains("usage")).collect(Collectors.toList()); + return events.stream() + .filter(e -> (!e.name().contains("usage") && !e.name().contains("load average"))) + .collect(toImmutableList()); } /** @@ -672,6 +681,7 @@ public long nanoTime() { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); profiler.logSimpleTask(badClock.nanoTime(), ProfilerTask.INFO, "some task"); @@ -733,6 +743,7 @@ public void write(int b) throws IOException { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); profiler.logSimpleTaskDuration( @@ -764,6 +775,7 @@ public void write(int b) throws IOException { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); profiler.logSimpleTaskDuration( @@ -790,6 +802,7 @@ public void testPrimaryOutputForAction() throws Exception { /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); try (SilentCloseable c = profiler.profileAction(ProfilerTask.ACTION, "test", "foo.out", "")) { @@ -824,6 +837,7 @@ public void testTargetLabelForAction() throws Exception { /*includeTargetLabel=*/ true, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); try (SilentCloseable c = @@ -857,6 +871,7 @@ private ByteArrayOutputStream getJsonProfileOutputStream(boolean slimProfile) th /*includeTargetLabel=*/ false, /*collectTaskHistograms=*/ true, /*collectWorkerDataInProfiler=*/ false, + /*collectLoadAverage=*/ false, WorkerMetricsCollector.instance(), BugReporter.defaultInstance()); long curTime = Profiler.nanoTimeMaybe(); diff --git a/src/test/shell/integration/profiler_test.sh b/src/test/shell/integration/profiler_test.sh new file mode 100755 index 00000000000000..01938890cc3646 --- /dev/null +++ b/src/test/shell/integration/profiler_test.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# +# Copyright 2022 The Bazel Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# --- begin runfiles.bash initialization --- +# Copy-pasted from Bazel's Bash runfiles library (tools/bash/runfiles/runfiles.bash). +if [[ ! -d "${RUNFILES_DIR:-/dev/null}" && ! -f "${RUNFILES_MANIFEST_FILE:-/dev/null}" ]]; then + if [[ -f "$0.runfiles_manifest" ]]; then + export RUNFILES_MANIFEST_FILE="$0.runfiles_manifest" + elif [[ -f "$0.runfiles/MANIFEST" ]]; then + export RUNFILES_MANIFEST_FILE="$0.runfiles/MANIFEST" + elif [[ -f "$0.runfiles/bazel_tools/tools/bash/runfiles/runfiles.bash" ]]; then + export RUNFILES_DIR="$0.runfiles" + fi +fi +if [[ -f "${RUNFILES_DIR:-/dev/null}/bazel_tools/tools/bash/runfiles/runfiles.bash" ]]; then + source "${RUNFILES_DIR}/bazel_tools/tools/bash/runfiles/runfiles.bash" +elif [[ -f "${RUNFILES_MANIFEST_FILE:-/dev/null}" ]]; then + source "$(grep -m1 "^bazel_tools/tools/bash/runfiles/runfiles.bash " \ + "$RUNFILES_MANIFEST_FILE" | cut -d ' ' -f 2-)" +else + echo >&2 "ERROR: cannot find @bazel_tools//tools/bash/runfiles:runfiles.bash" + exit 1 +fi +# --- end runfiles.bash initialization --- + +source "$(rlocation "io_bazel/src/test/shell/integration_test_setup.sh")" \ + || { echo "integration_test_setup.sh not found!" >&2; exit 1; } + +cd "$TEST_TMPDIR" + +function set_up() { + cd ${WORKSPACE_DIR} +} + +function tear_down() { + # Start with a fresh bazel server. + bazel shutdown +} + +function helper() { + startup_option="$1" + command_option="$2" + # Name of the calling function to end up using distinct packages. + local -r pkg=${FUNCNAME[1]} + mkdir $pkg || fail "mkdir $pkg" + cat > "$pkg/BUILD" << 'EOF' +genrule( + name = "foo", + srcs = ["whocares.in"], + outs = ["foo.out"], + cmd = "touch $@", +) +EOF + touch "$pkg/whocares.in" + echo Testing in $pkg + bazel $startup_option build $command_option "//$pkg:foo" \ + --profile=/tmp/profile.log &> "$TEST_log" || fail "Expected success." + grep '"ph":"C"' /tmp/profile.log > "$TEST_log" \ + || fail "Missing profile file." +} + +function test_metrics() { + helper "" "" + expect_log 'CPU usage (Bazel).*"cpu":"[0-9.]\+"' + expect_log 'CPU usage (total).*"system cpu":"[0-9.]\+"' + expect_log 'Memory usage (Bazel).*"memory":"[0-9.]\+"' + expect_log 'Memory usage (total).*"system memory":"[0-9.]\+"' +} + +function test_metrics_with_load_average() { + helper "" "--experimental_collect_load_average_in_profiler" + expect_log 'System load average.*"load":"[0-9.]\+"' +} + +run_suite "Integration tests for profiler data."