From f2667f274d1ee3d1c643f19221b9e83141f2137a Mon Sep 17 00:00:00 2001 From: xuchenCN Date: Mon, 29 Jun 2015 18:53:04 +0800 Subject: [PATCH 1/2] [SPARK-8374] [YARN] Job frequently hangs after YARN preemption --- .../scala/org/apache/spark/deploy/yarn/YarnAllocator.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 940873fbd046c..c951eb8652b4e 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -397,10 +397,12 @@ private[yarn] class YarnAllocator( completedContainer.getState, completedContainer.getExitStatus)) // Hadoop 2.2.X added a ContainerExitStatus we should switch to use - // there are some exit status' we shouldn't necessarily count against us, but for - // now I think its ok as none of the containers are expected to exit + // there are some exit status' we shouldn't necessarily count against us. 
+ // So we should keep targetNumExecutors == numExecutorsRunning + // to avoid application starve because YARN scheduler PREEMPTED if (completedContainer.getExitStatus == ContainerExitStatus.PREEMPTED) { logInfo("Container preempted: " + containerId) + numExecutorsRunning -= 1 } else if (completedContainer.getExitStatus == -103) { // vmem limit exceeded logWarning(memLimitExceededLogMessage( completedContainer.getDiagnostics, From 9555dd52c66efc5fa4b748ff9253309ac70b9c0a Mon Sep 17 00:00:00 2001 From: xuchenCN Date: Mon, 29 Jun 2015 19:04:42 +0800 Subject: [PATCH 2/2] [SPARK-8374] [YARN] Job frequently hangs after YARN preemption --- .../main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index c951eb8652b4e..35b2b7d747b12 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -399,7 +399,7 @@ private[yarn] class YarnAllocator( // Hadoop 2.2.X added a ContainerExitStatus we should switch to use // there are some exit status' we shouldn't necessarily count against us. // So we should keep targetNumExecutors == numExecutorsRunning - // to avoid application starve because YARN scheduler PREEMPTED + // to avoid application starvation caused by YARN scheduler preemption if (completedContainer.getExitStatus == ContainerExitStatus.PREEMPTED) { logInfo("Container preempted: " + containerId) numExecutorsRunning -= 1