Skip to content

Commit

Permalink
YARN-3809. Failed to launch new attempts because ApplicationMasterLau…
Browse files Browse the repository at this point in the history
…ncher's threads all hang. Contributed by Jun Gong
  • Loading branch information
jlowe committed Jun 24, 2015
1 parent 72d08a0 commit 2a20dd9
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 3 deletions.
3 changes: 3 additions & 0 deletions hadoop-yarn-project/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,9 @@ Release 2.7.1 - UNRELEASED
YARN-3842. NMProxy should retry on NMNotYetReadyException.
(Robert Kanter via kasha)

YARN-3809. Failed to launch new attempts because
ApplicationMasterLauncher's threads all hang (Jun Gong via jlowe)

Release 2.7.0 - 2015-04-20

INCOMPATIBLE CHANGES
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,16 @@ private static void addDeprecatedKeys() {
RM_PREFIX + "client.thread-count";
public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50;

/**
 * Configuration key for the number of threads used to launch and clean up
 * ApplicationMasters. ApplicationMasterLauncher uses this value as both the
 * core and maximum size of its launcher thread pool.
 */
public static final String RM_AMLAUNCHER_THREAD_COUNT =
RM_PREFIX + "amlauncher.thread-count";
/** Default value for {@link #RM_AMLAUNCHER_THREAD_COUNT}. */
public static final int DEFAULT_RM_AMLAUNCHER_THREAD_COUNT = 50;

/**
 * Configuration key for the number of retries when connecting to a
 * NodeManager. ApplicationMasterLauncher copies this value into the IPC
 * client's max-retries-on-socket-timeouts setting of its own configuration.
 *
 * NOTE(review): "RETIRES" in the constant name looks like a misspelling of
 * "RETRIES"; the name is public, so renaming it would break callers.
 */
public static final String RM_NODEMANAGER_CONNECT_RETIRES =
RM_PREFIX + "nodemanager-connect-retries";
/** Default value for {@link #RM_NODEMANAGER_CONNECT_RETIRES}. */
public static final int DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES = 10;

/** The Kerberos principal for the resource manager.*/
public static final String RM_PRINCIPAL =
RM_PREFIX + "principal";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@
<value>50</value>
</property>

<property>
<description>Number of threads used to launch/cleanup AM.</description>
<name>yarn.resourcemanager.amlauncher.thread-count</name>
<value>50</value>
</property>

<property>
<description>Number of retries when connecting to a NodeManager (applied as the IPC client's maximum retries on socket timeouts).</description>
<name>yarn.resourcemanager.nodemanager-connect-retries</name>
<value>10</value>
</property>

<property>
<description>The expiry interval for application master reporting.</description>
<name>yarn.am.liveness-monitor.expiry-interval-ms</name>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,17 @@
package org.apache.hadoop.yarn.server.resourcemanager.amlauncher;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
Expand All @@ -34,7 +39,7 @@ public class ApplicationMasterLauncher extends AbstractService implements
EventHandler<AMLauncherEvent> {
private static final Log LOG = LogFactory.getLog(
ApplicationMasterLauncher.class);
private final ThreadPoolExecutor launcherPool;
private ThreadPoolExecutor launcherPool;
private LauncherThread launcherHandlingThread;

private final BlockingQueue<Runnable> masterEvents
Expand All @@ -45,11 +50,30 @@ public class ApplicationMasterLauncher extends AbstractService implements
/**
 * Creates the launcher service, registered under this class's name, and
 * prepares the thread that handles launcher events.
 *
 * NOTE(review): this view interleaves lines from before and after the
 * change. serviceInit assigns launcherPool again with a configurable size,
 * so the fixed-size pool built here is presumably the removed code; confirm
 * against the final file.
 *
 * @param context the ResourceManager context kept in {@code this.context}
 */
public ApplicationMasterLauncher(RMContext context) {
super(ApplicationMasterLauncher.class.getName());
this.context = context;
// Pool with core and maximum size both fixed at 10, 1-hour keep-alive,
// and an unbounded work queue.
this.launcherPool = new ThreadPoolExecutor(10, 10, 1,
TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>());
this.launcherHandlingThread = new LauncherThread();
}

/**
 * Sizes the AM launcher thread pool from configuration and installs a
 * service-local configuration that carries the NodeManager connect-retry
 * limit.
 *
 * @param conf the configuration supplied to this service; it is copied, not
 *             modified
 * @throws Exception if the parent service initialization fails
 */
@Override
protected void serviceInit(Configuration conf) throws Exception {
  final int poolSize = conf.getInt(
      YarnConfiguration.RM_AMLAUNCHER_THREAD_COUNT,
      YarnConfiguration.DEFAULT_RM_AMLAUNCHER_THREAD_COUNT);

  // Name the worker threads so they are easy to spot in thread dumps.
  final ThreadFactory namedThreads = new ThreadFactoryBuilder()
      .setNameFormat("ApplicationMasterLauncher #%d")
      .build();

  // Core and maximum size are equal; the work queue is unbounded.
  launcherPool = new ThreadPoolExecutor(poolSize, poolSize, 1,
      TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>(), namedThreads);

  final int nmConnectRetries = conf.getInt(
      YarnConfiguration.RM_NODEMANAGER_CONNECT_RETIRES,
      YarnConfiguration.DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES);

  // Work on a copy so the retry override stays local to this service's
  // configuration instead of changing the caller's Configuration object.
  final Configuration launcherConf = new YarnConfiguration(conf);
  launcherConf.setInt(
      CommonConfigurationKeysPublic
          .IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
      nmConnectRetries);

  setConfig(launcherConf);
  super.serviceInit(launcherConf);
}

@Override
protected void serviceStart() throws Exception {
launcherHandlingThread.start();
Expand Down

0 comments on commit 2a20dd9

Please sign in to comment.