Skip to content

Commit

Permalink
[FLINK-7812][metrics] Add system resources metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
pnowojski committed Aug 2, 2018
1 parent f18cf7f commit 46d7f75
Show file tree
Hide file tree
Showing 14 changed files with 759 additions and 4 deletions.
10 changes: 10 additions & 0 deletions docs/_includes/generated/task_manager_configuration.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@
<td style="word-wrap: break-word;">true</td>
<td>Enable SSL support for the taskmanager data transport. This is applicable only when the global flag for internal SSL (security.ssl.internal.enabled) is set to true</td>
</tr>
<tr>
<td><h5>taskmanager.debug.additional-logging</h5></td>
<td style="word-wrap: break-word;">false</td>
<td></td>
</tr>
<tr>
<td><h5>taskmanager.debug.additional-logging-interval</h5></td>
<td style="word-wrap: break-word;">5000</td>
<td></td>
</tr>
<tr>
<td><h5>taskmanager.debug.memory.log</h5></td>
<td style="word-wrap: break-word;">false</td>
Expand Down
141 changes: 141 additions & 0 deletions docs/monitoring/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -1396,6 +1396,147 @@ Thus, in order to infer the metric identifier:
</tbody>
</table>

### System resources

Logging of system resources is disabled by default. When `taskmanager.debug.additional-logging`
is enabled on a TaskManager additional metrics listed below will be available. System
resources metrics are updated periodically and they present average values for a configured
interval (`taskmanager.debug.additional-logging-interval`).

Logging of system resources require couple of optional dependencies to be present on the
classpath (for example placed in Flink's `lib` directory):

- `com.github.oshi:oshi-core:3.4.0` (licensed under EPL 1.0 license)
- `net.java.dev.jna:jna-platform:jar:4.2.2`
- `net.java.dev.jna:jna:jar:4.2.2`

Failures in this regard will be reported as warning messages like `NoClassDefFoundError`
logged by `SystemResourcesMetricsInitializer` during the startup.

#### System CPU

<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Scope</th>
<th class="text-left" style="width: 25%">Infix</th>
<th class="text-left" style="width: 23%">Metrics</th>
<th class="text-left" style="width: 32%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="12"><strong>TaskManager</strong></th>
<td rowspan="12">System.CPU</td>
<td>Usage</td>
<td>Overall % of CPU usage on the machine.</td>
</tr>
<tr>
<td>Idle</td>
<td>% of CPU Idle usage on the machine.</td>
</tr>
<tr>
<td>Sys</td>
<td>% of System CPU usage on the machine.</td>
</tr>
<tr>
<td>User</td>
<td>% of User CPU usage on the machine.</td>
</tr>
<tr>
<td>IOWait</td>
<td>% of IOWait CPU usage on the machine.</td>
</tr>
<tr>
<td>Irq</td>
<td>% of Irq CPU usage on the machine.</td>
</tr>
<tr>
<td>SoftIrq</td>
<td>% of SoftIrq CPU usage on the machine.</td>
</tr>
<tr>
<td>Nice</td>
<td>% of Nice Idle usage on the machine.</td>
</tr>
<tr>
<td>Load1min</td>
<td>Average CPU load over 1 minute</td>
</tr>
<tr>
<td>Load5min</td>
<td>Average CPU load over 5 minute</td>
</tr>
<tr>
<td>Load15min</td>
<td>Average CPU load over 15 minute</td>
</tr>
<tr>
<td>UsageCPU*</td>
<td>% of CPU usage per each processor</td>
</tr>
</tbody>
</table>

#### System memory

<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Scope</th>
<th class="text-left" style="width: 25%">Infix</th>
<th class="text-left" style="width: 23%">Metrics</th>
<th class="text-left" style="width: 32%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="4"><strong>TaskManager</strong></th>
<td rowspan="2">System.Memory</td>
<td>Available</td>
<td>Available memory in bytes</td>
</tr>
<tr>
<td>Total</td>
<td>Total memory in bytes</td>
</tr>
<tr>
<td rowspan="2">System.Swap</td>
<td>Used</td>
<td>Used swap bytes</td>
</tr>
<tr>
<td>Total</td>
<td>Total swap in bytes</td>
</tr>
</tbody>
</table>

#### System network

<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Scope</th>
<th class="text-left" style="width: 25%">Infix</th>
<th class="text-left" style="width: 23%">Metrics</th>
<th class="text-left" style="width: 32%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="2"><strong>TaskManager</strong></th>
<td rowspan="2">System.Network.INTERFACE_NAME</td>
<td>ReceiveRate</td>
<td>Average receive rate in bytes per second</td>
</tr>
<tr>
<td>SendRate</td>
<td>Average send rate in bytes per second</td>
</tr>
</tbody>
</table>

## Latency tracking

Flink allows to track the latency of records traveling through the system. To enable the latency tracking
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,21 @@ public class TaskManagerOptions {
.withDeprecatedKeys("taskmanager.debug.memory.logIntervalMs")
.withDescription("The interval (in ms) for the log thread to log the current memory usage.");

/**
* Whether TaskManager should log additional metrics (such as system resources).
*/
public static final ConfigOption<Boolean> ADDITIONAL_LOGGING =
key("taskmanager.debug.additional-logging")
.defaultValue(false);

/**
* Interval between probing of additional metrics specified in milliseconds. Has an effect only when
* {@link #ADDITIONAL_LOGGING} is enabled.
*/
public static final ConfigOption<Long> ADDITIONAL_LOGGING_INTERVAL =
key("taskmanager.debug.additional-logging-interval")
.defaultValue(5000L);

// ------------------------------------------------------------------------
// Managed Memory Options
// ------------------------------------------------------------------------
Expand Down
7 changes: 7 additions & 0 deletions flink-runtime/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,13 @@ under the License.
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
</dependency>

<!-- Used only for additional logging. Optional because of unclear EPL 1.0 license compatibility. -->
<dependency>
<groupId>com.github.oshi</groupId>
<artifactId>oshi-core</artifactId>
<optional>true</optional>
</dependency>
</dependencies>

<!-- Dependency Management to converge transitive dependency versions -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.apache.flink.runtime.rpc.akka.AkkaRpcServiceUtils;
import org.apache.flink.runtime.security.SecurityConfiguration;
import org.apache.flink.runtime.security.SecurityUtils;
import org.apache.flink.runtime.taskexecutor.utils.SystemResourcesMetricsInitializer;
import org.apache.flink.runtime.taskmanager.MemoryLogger;
import org.apache.flink.runtime.util.EnvironmentInformation;
import org.apache.flink.runtime.util.ExecutorThreadFactory;
Expand Down Expand Up @@ -371,6 +372,12 @@ public static TaskExecutor startTaskManager(

TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);

if (taskManagerServicesConfiguration.isSystemResourcesLoggingEnabled()) {
SystemResourcesMetricsInitializer.instantiateSystemMetrics(
taskManagerMetricGroup,
taskManagerServicesConfiguration.getSystemResourcesLoggingInterval());
}

return new TaskExecutor(
rpcService,
taskManagerConfiguration,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package org.apache.flink.runtime.taskexecutor;

import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.CheckpointingOptions;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
Expand Down Expand Up @@ -83,6 +84,10 @@ public class TaskManagerServicesConfiguration {

private final boolean localRecoveryEnabled;

private boolean systemResourcesLoggingEnabled;

private Time systemResourcesLoggingInterval;

public TaskManagerServicesConfiguration(
InetAddress taskManagerAddress,
String[] tmpDirPaths,
Expand All @@ -95,7 +100,9 @@ public TaskManagerServicesConfiguration(
MemoryType memoryType,
boolean preAllocateMemory,
float memoryFraction,
long timerServiceShutdownTimeout) {
long timerServiceShutdownTimeout,
boolean systemResourcesLoggingEnabled,
Time systemResourcesLoggingInterval) {

this.taskManagerAddress = checkNotNull(taskManagerAddress);
this.tmpDirPaths = checkNotNull(tmpDirPaths);
Expand All @@ -113,6 +120,9 @@ public TaskManagerServicesConfiguration(
checkArgument(timerServiceShutdownTimeout >= 0L, "The timer " +
"service shutdown timeout must be greater or equal to 0.");
this.timerServiceShutdownTimeout = timerServiceShutdownTimeout;

this.systemResourcesLoggingEnabled = systemResourcesLoggingEnabled;
this.systemResourcesLoggingInterval = checkNotNull(systemResourcesLoggingInterval);
}

// --------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -179,6 +189,14 @@ public long getTimerServiceShutdownTimeout() {
return timerServiceShutdownTimeout;
}

public boolean isSystemResourcesLoggingEnabled() {
return systemResourcesLoggingEnabled;
}

public Time getSystemResourcesLoggingInterval() {
return systemResourcesLoggingInterval;
}

// --------------------------------------------------------------------------------------------
// Parsing of Flink configuration
// --------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -264,6 +282,10 @@ public static TaskManagerServicesConfiguration fromConfiguration(

long timerServiceShutdownTimeout = AkkaUtils.getTimeout(configuration).toMillis();

boolean systemResourcesLoggingEnabled = configuration.getBoolean(TaskManagerOptions.ADDITIONAL_LOGGING);

Time systemResourcesLoggingInterval = Time.milliseconds(configuration.getLong(TaskManagerOptions.ADDITIONAL_LOGGING_INTERVAL));

return new TaskManagerServicesConfiguration(
remoteAddress,
tmpDirs,
Expand All @@ -276,7 +298,9 @@ public static TaskManagerServicesConfiguration fromConfiguration(
memType,
preAllocateMemory,
memoryFraction,
timerServiceShutdownTimeout);
timerServiceShutdownTimeout,
systemResourcesLoggingEnabled,
systemResourcesLoggingInterval);
}

// --------------------------------------------------------------------------
Expand Down

0 comments on commit 46d7f75

Please sign in to comment.