Skip to content

Commit

Permalink
YARN-10471. Prevent logs for any container from becoming larger than …
Browse files Browse the repository at this point in the history
…a configurable size. Contributed by Eric Payne
  • Loading branch information
jbrennan333 committed Oct 29, 2020
1 parent f17e067 commit 8ee6bc2
Show file tree
Hide file tree
Showing 6 changed files with 347 additions and 4 deletions.
Expand Up @@ -78,4 +78,9 @@ public class ContainerExitStatus {
*/
public static final int KILLED_BY_CONTAINER_SCHEDULER = -108;

/**
 * Container was terminated for generating excess log data.
 * Reported as the exit status of the kill event raised by the NodeManager's
 * container log monitor when a container's log directory size exceeds the
 * per-directory limit or its accumulated log size exceeds the total limit.
 */
public static final int KILLED_FOR_EXCESS_LOGS = -109;

}
Expand Up @@ -1912,6 +1912,25 @@ public static boolean isAclEnabled(Configuration conf) {
public static final String APPLICATION_TAG_BASED_PLACEMENT_USER_WHITELIST =
APPLICATION_TAG_BASED_PLACEMENT_PREFIX + ".username.whitelist";

/**
 * Enable switch for container log monitoring. When enabled, the NodeManager
 * starts a background thread that checks container log directory sizes.
 */
public static final String NM_CONTAINER_LOG_MONITOR_ENABLED =
NM_PREFIX + "container-log-monitor.enable";
/** Container log monitoring is disabled unless explicitly configured. */
public static final boolean DEFAULT_NM_CONTAINER_LOG_MONITOR_ENABLED = false;
/** How often to monitor logs generated by containers, in milliseconds. */
public static final String NM_CONTAINER_LOG_MON_INTERVAL_MS =
NM_PREFIX + "container-log-monitor.interval-ms";
/** Default log monitoring interval: 60000 ms (one minute). */
public static final int DEFAULT_NM_CONTAINER_LOG_MON_INTERVAL_MS = 60000;
/**
 * The disk space limit, in bytes, for a single container log directory.
 * A container whose log directory grows beyond this size is killed.
 */
public static final String NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES =
NM_PREFIX + "container-log-monitor.dir-size-limit-bytes";
/** Default single log directory limit: 1,000,000,000 bytes. */
public static final long DEFAULT_NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES =
1000000000L;
/**
 * The disk space limit, in bytes, for all of a container's logs, i.e. the
 * sum over all of its log directories. A container whose accumulated log
 * size grows beyond this value is killed.
 */
public static final String NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES =
NM_PREFIX + "container-log-monitor.total-size-limit-bytes";
/** Default total log size limit: 10,000,000,000 bytes. */
public static final long DEFAULT_NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES =
10000000000L;

/** Enable/disable container metrics. */
@Private
public static final String NM_CONTAINER_METRICS_ENABLE =
Expand Down
Expand Up @@ -1665,6 +1665,34 @@
<name>yarn.nodemanager.container-monitor.interval-ms</name>
</property>

<property>
<description>Flag to enable the container log monitor which enforces
container log directory size limits.</description>
<name>yarn.nodemanager.container-log-monitor.enable</name>
<value>false</value>
</property>

<property>
<description>How often, in milliseconds, to check the usage of a container's
log directories.</description>
<name>yarn.nodemanager.container-log-monitor.interval-ms</name>
<value>60000</value>
</property>

<property>
<description>The disk space limit, in bytes, for a single
container log directory</description>
<name>yarn.nodemanager.container-log-monitor.dir-size-limit-bytes</name>
<value>1000000000</value>
</property>

<property>
<description>The disk space limit, in bytes, for all of a container's
logs</description>
<name>yarn.nodemanager.container-log-monitor.total-size-limit-bytes</name>
<value>10000000000</value>
</property>

<property>
<description>Class that calculates containers current resource utilization.
If not set, the value for yarn.nodemanager.resource-calculator.class will
Expand Down
Expand Up @@ -28,6 +28,7 @@
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
Expand All @@ -45,11 +46,14 @@
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
import org.apache.hadoop.yarn.server.nodemanager.webapp.ContainerLogsUtils;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;

import java.util.Arrays;
import java.io.File;
import java.util.Map;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;

Expand All @@ -67,6 +71,10 @@ public class ContainersMonitorImpl extends AbstractService implements

private long monitoringInterval;
private MonitoringThread monitoringThread;
private int logCheckInterval;
private LogMonitorThread logMonitorThread;
private long logDirSizeLimit;
private long logTotalSizeLimit;
private CGroupElasticMemoryController oomListenerThread;
private boolean containerMetricsEnabled;
private long containerMetricsPeriodMs;
Expand Down Expand Up @@ -94,6 +102,7 @@ public class ContainersMonitorImpl extends AbstractService implements
private boolean elasticMemoryEnforcement;
private boolean strictMemoryEnforcement;
private boolean containersMonitorEnabled;
private boolean logMonitorEnabled;

private long maxVCoresAllottedForContainers;

Expand Down Expand Up @@ -122,6 +131,8 @@ public ContainersMonitorImpl(ContainerExecutor exec,

this.monitoringThread = new MonitoringThread();

this.logMonitorThread = new LogMonitorThread();

this.containersUtilization = ResourceUtilization.newInstance(0, 0, 0.0f);
}

Expand All @@ -133,6 +144,16 @@ protected void serviceInit(Configuration myConf) throws Exception {
this.conf.getLong(YarnConfiguration.NM_RESOURCE_MON_INTERVAL_MS,
YarnConfiguration.DEFAULT_NM_RESOURCE_MON_INTERVAL_MS));

this.logCheckInterval =
conf.getInt(YarnConfiguration.NM_CONTAINER_LOG_MON_INTERVAL_MS,
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_MON_INTERVAL_MS);
this.logDirSizeLimit =
conf.getLong(YarnConfiguration.NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES,
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_DIR_SIZE_LIMIT_BYTES);
this.logTotalSizeLimit =
conf.getLong(YarnConfiguration.NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES,
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_TOTAL_SIZE_LIMIT_BYTES);

this.resourceCalculatorPlugin =
ResourceCalculatorPlugin.getContainersMonitorPlugin(this.conf);
LOG.info("Using ResourceCalculatorPlugin: {}",
Expand Down Expand Up @@ -214,6 +235,11 @@ protected void serviceInit(Configuration myConf) throws Exception {
isContainerMonitorEnabled() && monitoringInterval > 0;
LOG.info("ContainersMonitor enabled: {}", containersMonitorEnabled);

logMonitorEnabled =
conf.getBoolean(YarnConfiguration.NM_CONTAINER_LOG_MONITOR_ENABLED,
YarnConfiguration.DEFAULT_NM_CONTAINER_LOG_MONITOR_ENABLED);
LOG.info("Container Log Monitor Enabled: "+ logMonitorEnabled);

nodeCpuPercentageForYARN =
NodeManagerHardwareUtils.getNodeCpuPercentage(this.conf);

Expand Down Expand Up @@ -284,13 +310,16 @@ protected void serviceStart() throws Exception {
if (oomListenerThread != null) {
oomListenerThread.start();
}
if (logMonitorEnabled) {
this.logMonitorThread.start();
}
super.serviceStart();
}

@Override
protected void serviceStop() throws Exception {
stopped = true;
if (containersMonitorEnabled) {
stopped = true;
this.monitoringThread.interrupt();
try {
this.monitoringThread.join();
Expand All @@ -306,6 +335,13 @@ protected void serviceStop() throws Exception {
}
}
}
if (logMonitorEnabled) {
this.logMonitorThread.interrupt();
try {
this.logMonitorThread.join();
} catch (InterruptedException e) {
}
}
super.serviceStop();
}

Expand Down Expand Up @@ -752,7 +788,8 @@ && isProcessTreeOverLimit(containerId.toString(),
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
}

if (isMemoryOverLimit) {
if (isMemoryOverLimit
&& trackingContainers.remove(containerId) != null) {
// Virtual or physical memory over limit. Fail the container and
// remove
// the corresponding process tree
Expand All @@ -766,7 +803,6 @@ && isProcessTreeOverLimit(containerId.toString(),
eventDispatcher.getEventHandler().handle(
new ContainerKillEvent(containerId,
containerExitStatus, msg));
trackingContainers.remove(containerId);
LOG.info("Removed ProcessTree with root {}", pId);
}
}
Expand Down Expand Up @@ -834,6 +870,72 @@ private String formatUsageString(long currentVmemUsage, long vmemLimit,
}
}

/**
 * Background thread that periodically measures the disk usage of each
 * tracked container's log directories and requests that the container be
 * killed with {@link ContainerExitStatus#KILLED_FOR_EXCESS_LOGS} when either
 * a single log directory exceeds {@code logDirSizeLimit} or the accumulated
 * size of the directories measured so far exceeds {@code logTotalSizeLimit}.
 *
 * The thread runs until the enclosing service sets {@code stopped} or the
 * thread is interrupted, sleeping {@code logCheckInterval} milliseconds
 * between passes.
 */
private class LogMonitorThread extends Thread {
  LogMonitorThread() {
    super("Container Log Monitor");
  }

  @Override
  public void run() {
    while (!stopped && !Thread.currentThread().isInterrupted()) {
      for (Entry<ContainerId, ProcessTreeInfo> entry :
          trackingContainers.entrySet()) {
        ContainerId containerId = entry.getKey();
        ProcessTreeInfo ptInfo = entry.getValue();
        Container container = context.getContainers().get(containerId);
        if (container == null) {
          // Tracked by the monitor but no longer known to the context;
          // there is nothing to measure.
          continue;
        }
        try {
          List<File> logDirs = ContainerLogsUtils.getContainerLogDirs(
              containerId, container.getUser(), context);
          long totalLogDataBytes = 0;
          for (File dir : logDirs) {
            long currentDirSizeBytes = FileUtil.getDU(dir);
            totalLogDataBytes += currentDirSizeBytes;
            String killMsg = null;
            if (currentDirSizeBytes > logDirSizeLimit) {
              killMsg = String.format(
                  "Container [pid=%s,containerID=%s] is logging beyond "
                  + "the container single log directory limit.%n"
                  + "Limit: %d Log Directory Size: %d Log Directory: %s"
                  + "%nKilling container.%n",
                  ptInfo.getPID(), containerId, logDirSizeLimit,
                  currentDirSizeBytes, dir);
            } else if (totalLogDataBytes > logTotalSizeLimit) {
              // The reported total is a lower bound (">="): the scan stops
              // at the first violation, so any remaining directories are
              // not added in.
              killMsg = String.format(
                  "Container [pid=%s,containerID=%s] is logging beyond "
                  + "the container total log limit.%n"
                  + "Limit: %d Total Size: >=%d"
                  + "%nKilling container.%n",
                  ptInfo.getPID(), containerId, logTotalSizeLimit,
                  totalLogDataBytes);
            }
            // Only raise the kill event if this thread is the one that
            // untracked the container; a null result means the entry was
            // already removed elsewhere.
            if (killMsg != null
                && trackingContainers.remove(containerId) != null) {
              LOG.warn(killMsg);
              eventDispatcher.getEventHandler().handle(
                  new ContainerKillEvent(containerId,
                      ContainerExitStatus.KILLED_FOR_EXCESS_LOGS, killMsg));
              LOG.info("Removed ProcessTree with root {}", ptInfo.getPID());
              break;
            }
          }
        } catch (Exception e) {
          // Best effort: a failure while measuring one container must not
          // stop monitoring of the others.
          LOG.warn("Uncaught exception in Container Log Monitor "
              + "while monitoring log usage for " + containerId, e);
        }
      }
      try {
        Thread.sleep(logCheckInterval);
      } catch (InterruptedException e) {
        LOG.info("Log monitor thread was interrupted. "
            + "Stopping container log monitoring.");
        // Thread.sleep clears the interrupt status when it throws. Restore
        // it so the loop condition sees the interrupt and the thread
        // actually stops, even if 'stopped' has not been set.
        Thread.currentThread().interrupt();
      }
    }
  }
}

private void updateContainerMetrics(ContainersMonitorEvent monitoringEvent) {
if (!containerMetricsEnabled || monitoringEvent == null) {
return;
Expand Down

0 comments on commit 8ee6bc2

Please sign in to comment.