Skip to content

Commit

Permalink
Add system load average to profile, guarded by --experimental_collect…
Browse files Browse the repository at this point in the history
…_load_average_in_profiler flag.

PiperOrigin-RevId: 468468092
Change-Id: Ic69b06efa15e3cb85a22b4c5b3561a8be2aba22c
  • Loading branch information
larsrc-google authored and Copybara-Service committed Aug 18, 2022
1 parent 6d7cd1a commit b4dbed0
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 15 deletions.
Expand Up @@ -28,6 +28,7 @@
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.time.Duration;
import java.util.Objects;
import java.util.concurrent.TimeUnit;

/** Thread to collect local resource usage data and log into JSON profile. */
Expand All @@ -38,6 +39,7 @@ public class CollectLocalResourceUsage extends Thread {

private final BugReporter bugReporter;
private final boolean collectWorkerDataInProfiler;
private final boolean collectLoadAverage;

private volatile boolean stopLocalUsageCollection;
private volatile boolean profilingStarted;
Expand All @@ -57,17 +59,22 @@ public class CollectLocalResourceUsage extends Thread {
@GuardedBy("this")
private TimeSeries workersMemoryUsage;

@GuardedBy("this")
private TimeSeries systemLoadAverage;

private Stopwatch stopwatch;

private final WorkerMetricsCollector workerMetricsCollector;

CollectLocalResourceUsage(
BugReporter bugReporter,
WorkerMetricsCollector workerMetricsCollector,
boolean collectWorkerDataInProfiler) {
boolean collectWorkerDataInProfiler,
boolean collectLoadAverage) {
this.bugReporter = checkNotNull(bugReporter);
this.collectWorkerDataInProfiler = collectWorkerDataInProfiler;
this.workerMetricsCollector = workerMetricsCollector;
this.collectLoadAverage = collectLoadAverage;
}

@Override
Expand All @@ -92,6 +99,11 @@ public void run() {
new TimeSeries(
/* startTimeMillis= */ stopwatch.elapsed().toMillis(), BUCKET_DURATION.toMillis());
}
if (collectLoadAverage) {
systemLoadAverage =
new TimeSeries(
/* startTimeMillis= */ stopwatch.elapsed().toMillis(), BUCKET_DURATION.toMillis());
}
}
OperatingSystemMXBean osBean =
(OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean();
Expand All @@ -108,8 +120,8 @@ public void run() {
Duration nextElapsed = stopwatch.elapsed();
long nextCpuTimeNanos = osBean.getProcessCpuTime();

double systemLoad = osBean.getSystemCpuLoad();
double systemUsage = systemLoad * numProcessors;
double systemCpuLoad = osBean.getSystemCpuLoad();
double systemUsage = systemCpuLoad * numProcessors;

long systemMemoryUsageMb = -1;
if (OS.getCurrent() == OS.LINUX) {
Expand Down Expand Up @@ -147,11 +159,15 @@ public void run() {
workerMemoryUsageMb =
this.workerMetricsCollector.collectMetrics().stream()
.map(WorkerMetric::getWorkerStat)
.filter(x -> x != null)
.filter(Objects::nonNull)
.mapToInt(WorkerMetric.WorkerStat::getUsedMemoryInKB)
.sum()
/ 1024;
}
double loadAverage = 0;
if (collectLoadAverage) {
loadAverage = osBean.getSystemLoadAverage();
}

double deltaNanos = nextElapsed.minus(previousElapsed).toNanos();
double cpuLevel = (nextCpuTimeNanos - previousCpuTimeNanos) / deltaNanos;
Expand All @@ -173,7 +189,11 @@ public void run() {
}
if (collectWorkerDataInProfiler && (workersMemoryUsage != null)) {
workersMemoryUsage.addRange(
previousElapsed.toMillis(), nextElapsed.toMillis(), (double) workerMemoryUsageMb);
previousElapsed.toMillis(), nextElapsed.toMillis(), workerMemoryUsageMb);
}
if (collectLoadAverage && loadAverage > 0) {
systemLoadAverage.addRange(
previousElapsed.toMillis(), nextElapsed.toMillis(), loadAverage);
}
}
previousElapsed = nextElapsed;
Expand Down Expand Up @@ -217,6 +237,12 @@ synchronized void logCollectedData() {
profiler, workersMemoryUsage, ProfilerTask.WORKERS_MEMORY_USAGE, startTimeNanos, len);
}
workersMemoryUsage = null;

if (collectLoadAverage) {
logCollectedData(
profiler, systemLoadAverage, ProfilerTask.SYSTEM_LOAD_AVERAGE, startTimeNanos, len);
}
systemLoadAverage = null;
}

private static void logCollectedData(
Expand Down
19 changes: 13 additions & 6 deletions src/main/java/com/google/devtools/build/lib/profiler/Profiler.java
Expand Up @@ -375,6 +375,7 @@ public static Duration getProcessCpuTimeMaybe() {
* some tasks may get aggregated if they finished quick enough
* @param clock a {@code BlazeClock.instance()}
* @param execStartTimeNanos execution start time in nanos obtained from {@code clock.nanoTime()}
* @param collectLoadAverage If true, collects system load average (as seen in uptime(1))
*/
public synchronized void start(
ImmutableSet<ProfilerTask> profiledTasks,
Expand All @@ -390,6 +391,7 @@ public synchronized void start(
boolean includeTargetLabel,
boolean collectTaskHistograms,
boolean collectWorkerDataInProfiler,
boolean collectLoadAverage,
WorkerMetricsCollector workerMetricsCollector,
BugReporter bugReporter)
throws IOException {
Expand Down Expand Up @@ -448,7 +450,7 @@ public synchronized void start(
// Start collecting Bazel and system-wide CPU metric collection.
resourceUsageThread =
new CollectLocalResourceUsage(
bugReporter, workerMetricsCollector, collectWorkerDataInProfiler);
bugReporter, workerMetricsCollector, collectWorkerDataInProfiler, collectLoadAverage);
resourceUsageThread.setDaemon(true);
resourceUsageThread.start();
}
Expand Down Expand Up @@ -1157,7 +1159,8 @@ public void run() {
|| data.type == ProfilerTask.ACTION_COUNTS
|| data.type == ProfilerTask.SYSTEM_CPU_USAGE
|| data.type == ProfilerTask.SYSTEM_MEMORY_USAGE
|| data.type == ProfilerTask.WORKERS_MEMORY_USAGE) {
|| data.type == ProfilerTask.WORKERS_MEMORY_USAGE
|| data.type == ProfilerTask.SYSTEM_LOAD_AVERAGE) {
// Skip counts equal to zero. They will show up as a thin line in the profile.
if ("0.0".equals(data.description)) {
continue;
Expand All @@ -1166,12 +1169,10 @@ public void run() {
writer.beginObject();
writer.setIndent("");
writer.name("name").value(data.type.description);
if (data.type == ProfilerTask.LOCAL_MEMORY_USAGE) {
writer.name("cname").value("olive");
}

// Pick acceptable counter colors manually, unfortunately we have to pick from these
// weird reserved names.
// weird reserved names from
// https://github.com/catapult-project/catapult/blob/master/tracing/tracing/base/color_scheme.html
switch (data.type) {
case LOCAL_CPU_USAGE:
writer.name("cname").value("good");
Expand All @@ -1188,6 +1189,9 @@ public void run() {
case WORKERS_MEMORY_USAGE:
writer.name("cname").value("rail_animation");
break;
case SYSTEM_LOAD_AVERAGE:
writer.name("cname").value("generic_work");
break;
default:
// won't happen
}
Expand Down Expand Up @@ -1220,6 +1224,9 @@ public void run() {
case WORKERS_MEMORY_USAGE:
writer.name("workers memory").value(data.description);
break;
case SYSTEM_LOAD_AVERAGE:
writer.name("load").value(data.description);
break;
default:
// won't happen
}
Expand Down
Expand Up @@ -70,6 +70,7 @@ public enum ProfilerTask {
LOCAL_MEMORY_USAGE("Memory usage (Bazel)"),
SYSTEM_MEMORY_USAGE("Memory usage (total)"),
WORKERS_MEMORY_USAGE("Workers memory usage"),
SYSTEM_LOAD_AVERAGE("System load average"),
STARLARK_PARSER("Starlark Parser", Threshold.FIFTY_MILLIS),
STARLARK_USER_FN("Starlark user function call", Threshold.FIFTY_MILLIS),
STARLARK_BUILTIN_FN("Starlark builtin function call", Threshold.FIFTY_MILLIS),
Expand Down
Expand Up @@ -53,7 +53,6 @@
import com.google.devtools.build.lib.clock.Clock;
import com.google.devtools.build.lib.events.Event;
import com.google.devtools.build.lib.events.ExtendedEventHandler;
import com.google.devtools.build.lib.events.ExtendedEventHandler.Postable;
import com.google.devtools.build.lib.events.OutputFilter;
import com.google.devtools.build.lib.exec.BinTools;
import com.google.devtools.build.lib.jni.JniLoader;
Expand Down Expand Up @@ -399,6 +398,7 @@ ProfilerStartedEvent initProfiler(
options.profileIncludeTargetLabel,
options.alwaysProfileSlowOperations,
options.collectWorkerDataInProfiler,
options.collectLoadAverageInProfiler,
WorkerMetricsCollector.instance(),
bugReporter);
// Instead of logEvent() we're calling the low level function to pass the timings we took in
Expand Down
Expand Up @@ -313,9 +313,17 @@ public String getTypeDescription() {
defaultValue = "false",
documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
effectTags = {OptionEffectTag.AFFECTS_OUTPUTS, OptionEffectTag.BAZEL_MONITORING},
help = "If enabled, profiler collects worker's aggregated resource data.")
help = "If enabled, the profiler collects worker's aggregated resource data.")
public boolean collectWorkerDataInProfiler;

@Option(
name = "experimental_collect_load_average_in_profiler",
defaultValue = "false",
documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
effectTags = {OptionEffectTag.AFFECTS_OUTPUTS, OptionEffectTag.BAZEL_MONITORING},
help = "If enabled, the profiler collects the system's overall load average.")
public boolean collectLoadAverageInProfiler;

@Option(
name = "memory_profile",
defaultValue = "null",
Expand Down
Expand Up @@ -324,6 +324,7 @@ public void executeBuild(List<String> targets) throws Exception {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
runtime.getBugReporter());

Expand Down
Expand Up @@ -110,6 +110,7 @@ private ByteArrayOutputStream start(ImmutableSet<ProfilerTask> tasks, Profiler.F
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
return buffer;
Expand All @@ -130,6 +131,7 @@ private void startUnbuffered(ImmutableSet<ProfilerTask> tasks) throws IOExceptio
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
}
Expand Down Expand Up @@ -235,6 +237,7 @@ public void testProfilerRecordingAllEvents() throws Exception {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
try (SilentCloseable c = profiler.profile(ProfilerTask.ACTION, "action task")) {
Expand Down Expand Up @@ -316,6 +319,7 @@ public void testProfilerWorkerMetrics() throws Exception {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ true,
/*collectLoadAverage=*/ false,
workerMetricsCollector,
BugReporter.defaultInstance());
Thread.sleep(400);
Expand Down Expand Up @@ -347,6 +351,7 @@ public void testProfilerRecordingOnlySlowestEvents() throws Exception {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
profiler.logSimpleTask(10000, 20000, ProfilerTask.VFS_STAT, "stat");
Expand Down Expand Up @@ -472,6 +477,7 @@ public void testProfilerRecordsNothing() throws Exception {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
profiler.logSimpleTask(10000, 20000, ProfilerTask.VFS_STAT, "stat");
Expand Down Expand Up @@ -609,10 +615,13 @@ public void testPhaseTasks() throws Exception {
.hasSize(102); // thread2b + last task + finishing
}

// Filter out CPU and memory usage events. These are non-deterministic depending on the duration
// Filter out CPU usage/memory usage/load average events. These are non-deterministic depending on
// the duration
// of the profile.
private static List<TraceEvent> removeUsageEvents(List<TraceEvent> events) {
return events.stream().filter(e -> !e.name().contains("usage")).collect(Collectors.toList());
return events.stream()
.filter(e -> (!e.name().contains("usage") && !e.name().contains("load average")))
.collect(toImmutableList());
}

/**
Expand Down Expand Up @@ -672,6 +681,7 @@ public long nanoTime() {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
profiler.logSimpleTask(badClock.nanoTime(), ProfilerTask.INFO, "some task");
Expand Down Expand Up @@ -733,6 +743,7 @@ public void write(int b) throws IOException {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
profiler.logSimpleTaskDuration(
Expand Down Expand Up @@ -764,6 +775,7 @@ public void write(int b) throws IOException {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
profiler.logSimpleTaskDuration(
Expand All @@ -790,6 +802,7 @@ public void testPrimaryOutputForAction() throws Exception {
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
try (SilentCloseable c = profiler.profileAction(ProfilerTask.ACTION, "test", "foo.out", "")) {
Expand Down Expand Up @@ -824,6 +837,7 @@ public void testTargetLabelForAction() throws Exception {
/*includeTargetLabel=*/ true,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
try (SilentCloseable c =
Expand Down Expand Up @@ -857,6 +871,7 @@ private ByteArrayOutputStream getJsonProfileOutputStream(boolean slimProfile) th
/*includeTargetLabel=*/ false,
/*collectTaskHistograms=*/ true,
/*collectWorkerDataInProfiler=*/ false,
/*collectLoadAverage=*/ false,
WorkerMetricsCollector.instance(),
BugReporter.defaultInstance());
long curTime = Profiler.nanoTimeMaybe();
Expand Down

0 comments on commit b4dbed0

Please sign in to comment.