From 4198b3b7380c7ea70e7a502f89ca45484f512e86 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 26 Nov 2014 10:13:54 -0800 Subject: [PATCH 1/2] [SPARK-4584] [yarn] Remove security manager from Yarn AM. The security manager adds a lot of overhead to the runtime of the app, and causes a severe performance regression. Even stubbing out all unneeded methods (all except checkExit()) does not help. So, instead, penalize users who do an explicit System.exit() by leaving them in "undefined behavior" territory: if they do that, the Yarn backend won't be able to report the final app status to the RM. The result is that the final status of the application might not match the user's expectations. One side-effect of the change is that users who do an explicit System.exit() will lose the AM retry functionality. Since there is no way to know if the exit was because of success or failure, the AM right now errs on the side of it being a successful exit. --- .../spark/deploy/yarn/ApplicationMaster.scala | 39 +------------------ 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index e90672c004d4b..f2f75915e2f16 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -60,7 +60,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, @volatile private var exitCode = 0 @volatile private var unregistered = false @volatile private var finished = false - @volatile private var finalStatus = FinalApplicationStatus.UNDEFINED + @volatile private var finalStatus = FinalApplicationStatus.SUCCEEDED @volatile private var finalMsg: String = "" @volatile private var userClassThread: Thread = _ @@ -214,7 +214,6 @@ private[spark] class ApplicationMaster(args: 
ApplicationMasterArguments, private def runDriver(securityMgr: SecurityManager): Unit = { addAmIpFilter() - setupSystemSecurityManager() userClassThread = startUserClass() // This a bit hacky, but we need to wait until the spark.driver.port property has @@ -402,42 +401,6 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, } } - /** - * This system security manager applies to the entire process. - * It's main purpose is to handle the case if the user code does a System.exit. - * This allows us to catch that and properly set the YARN application status and - * cleanup if needed. - */ - private def setupSystemSecurityManager(): Unit = { - try { - var stopped = false - System.setSecurityManager(new java.lang.SecurityManager() { - override def checkExit(paramInt: Int) { - if (!stopped) { - logInfo("In securityManager checkExit, exit code: " + paramInt) - if (paramInt == 0) { - finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS) - } else { - finish(FinalApplicationStatus.FAILED, - paramInt, - "User class exited with non-zero exit code") - } - stopped = true - } - } - // required for the checkExit to work properly - override def checkPermission(perm: java.security.Permission): Unit = {} - }) - } - catch { - case e: SecurityException => - finish(FinalApplicationStatus.FAILED, - ApplicationMaster.EXIT_SECURITY, - "Error in setSecurityManager") - logError("Error in setSecurityManager:", e) - } - } - /** * Start the user class, which contains the spark driver, in a separate Thread. * If the main routine exits cleanly or exits with System.exit(0) we From 21f2502947eb2a242168cc5a50225df5c36c9161 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 26 Nov 2014 13:16:00 -0800 Subject: [PATCH 2/2] Do not retry apps that use System.exit(). 
--- .../spark/deploy/yarn/ApplicationMaster.scala | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index f2f75915e2f16..18e4a497e86b5 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -106,10 +106,14 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts if (!finished) { - // this shouldn't ever happen, but if it does assume weird failure - finish(FinalApplicationStatus.FAILED, + // This happens when the user application calls System.exit(). We have the choice + // of either failing or succeeding at this point. We report success to avoid + // retrying applications that have succeeded (System.exit(0)), which means that + // applications that explicitly exit with a non-zero status will also show up as + // succeeded in the RM UI. 
+ finish(finalStatus, ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION, - "shutdown hook called without cleanly finishing") + "Shutdown hook called before final status was reported.") } if (!unregistered) { @@ -164,17 +168,18 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments, final def finish(status: FinalApplicationStatus, code: Int, msg: String = null) = synchronized { if (!finished) { + val inShutdown = Utils.inShutdown() logInfo(s"Final app status: ${status}, exitCode: ${code}" + Option(msg).map(msg => s", (reason: $msg)").getOrElse("")) exitCode = code finalStatus = status finalMsg = msg finished = true - if (Thread.currentThread() != reporterThread && reporterThread != null) { + if (!inShutdown && Thread.currentThread() != reporterThread && reporterThread != null) { logDebug("shutting down reporter thread") reporterThread.interrupt() } - if (Thread.currentThread() != userClassThread && userClassThread != null) { + if (!inShutdown && Thread.currentThread() != userClassThread && userClassThread != null) { logDebug("shutting down user thread") userClassThread.interrupt() }