5 changes: 5 additions & 0 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -46,6 +46,7 @@ import org.apache.spark._
import org.apache.spark.api.r.RUtils
import org.apache.spark.deploy.rest._
import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.scheduler.{KerberosUser, KerberosUtil}
import org.apache.spark.util._

/**
@@ -151,9 +152,13 @@ object SparkSubmit extends CommandLineUtils {
val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)

def doRunMain(): Unit = {
if (args.principal != null && args.keytab != null) {
KerberosUser.securize(args.principal, args.keytab)
Contributor:
This will cause multiple UGI.loginUserFromKeytab calls (in the YARN case it already happens in SparkSubmit.prepareSubmitEnvironment), which causes various issues.
An application must call UGI.loginUserFromKeytab only once; with more than one call, random things fail (DFS client, metastore, etc.) because of the way the loginUser is cached/used and, unfortunately, how Hadoop IPC handles renewal.
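As an illustration of that constraint (not part of the original review comment), here is a minimal sketch that guards the keytab login so it runs at most once per JVM; the KerberosLoginGuard helper and its flag are hypothetical, not code from this PR:

```scala
// Hypothetical helper (not from this PR): ensure UGI.loginUserFromKeytab is
// invoked at most once per JVM, per the concern above.
import org.apache.hadoop.security.UserGroupInformation

object KerberosLoginGuard {
  @volatile private var loggedIn = false

  def loginOnce(principal: String, keytab: String): Unit = synchronized {
    // Skip the login if this JVM already authenticated from a keytab
    // (e.g. via SparkSubmit.prepareSubmitEnvironment in the YARN case).
    if (!loggedIn && !UserGroupInformation.isLoginKeytabBased) {
      UserGroupInformation.loginUserFromKeytab(principal, keytab)
      loggedIn = true
    }
  }
}
```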

}
if (args.proxyUser != null) {
val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
UserGroupInformation.getCurrentUser())
KerberosUtil.proxyUser = Option(proxyUser)
try {
proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
override def run(): Unit = {
@@ -257,10 +257,6 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S
"either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
}
}

if (proxyUser != null && principal != null) {
SparkSubmit.printErrorAndExit("Only one of --proxy-user or --principal can be provided.")
}
}

private def validateKillArguments(): Unit = {
@@ -1006,7 +1006,7 @@ class DAGScheduler(
runningStages -= stage
return
}

val tokens = KerberosUtil.getHadoopDelegationTokens
Contributor:
See my comment below about how current Spark (on YARN) handles security: https://github.com/apache/spark/pull/16788/files#r105319503

@tgravescs or @vanzin can correct me if I am wrong (in case I am misremembering):
on a secure HDFS it is not necessary to provide a principal/keytab if the job finishes before token renewal is needed.

Given the above, the call chain in KerberosUtil.getHadoopDelegationTokens will throw an exception when they are missing and UGI security is enabled.
I am not sure whether this is a requirement on Mesos, but it is not on YARN.
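To illustrate that point (a sketch only, not a proposed patch for this PR), the token fetch at this call site could be made best-effort, so a secure-HDFS job submitted without --principal/--keytab still runs if it finishes before renewal is needed:

```scala
// Illustrative sketch (not the PR's code): fetch delegation tokens only when
// security is on, and fall back to the existing credentials if the fetch fails.
val tokens: Option[Array[Byte]] =
  if (UserGroupInformation.isSecurityEnabled()) {
    try {
      Some(KerberosUtil.getHadoopDelegationTokens)
    } catch {
      case e: Exception =>
        logWarning("Could not obtain Hadoop delegation tokens; " +
          "continuing with existing credentials", e)
        None
    }
  } else {
    None
  }
```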

val tasks: Seq[Task[_]] = try {
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
@@ -1018,7 +1018,7 @@
stage.pendingPartitions += id
new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId)
Option(sc.applicationId), sc.applicationAttemptId, Option(tokens))
}

case stage: ResultStage =>
@@ -1028,7 +1028,7 @@
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptId,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId, Option(tokens))
}
Contributor:
The current Spark model on YARN manages tokens out of band from the actual tasks (unlike Tez/MR, IIRC, where the execution model itself is different). The tasks themselves do not propagate credentials; the executors update their credentials directly based on driver updates.

This allows very long-running Spark tasks (for example > 24 hours) to run, which per-task tokens might not allow.
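For context, a minimal sketch of that out-of-band pattern (the helper name, credentials file location, and refresh interval are assumptions for illustration, not Spark's actual implementation): each executor periodically reloads credentials that the driver has refreshed, instead of receiving tokens inside every task.

```scala
// Hypothetical executor-side refresher: periodically merge credentials that
// the driver has written out, so long-running tasks keep valid tokens without
// shipping tokens inside each task.
import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

def scheduleCredentialRefresh(credentialsFile: Path, hadoopConf: Configuration): Unit = {
  val scheduler = Executors.newSingleThreadScheduledExecutor()
  scheduler.scheduleAtFixedRate(new Runnable {
    override def run(): Unit = {
      val fs = credentialsFile.getFileSystem(hadoopConf)
      val in = fs.open(credentialsFile)
      try {
        val creds = new Credentials()
        creds.readTokenStorageStream(in)
        // Merging (rather than replacing) keeps tokens the UGI already holds.
        UserGroupInformation.getCurrentUser.addCredentials(creds)
      } finally {
        in.close()
      }
    }
  }, 1, 1, TimeUnit.HOURS)
}
```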

}
} catch {
30 changes: 30 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/KerberosFunction.scala
@@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.scheduler

import org.apache.spark.internal.Logging

object KerberosFunction extends Logging {

def executeSecure[U, T](tokens: Option[Array[Byte]], funct: (U => T), inputParameters: U): T = {
if (tokens.isDefined) {
KerberosUtil.useTokenAuth(tokens.get)
}
funct(inputParameters)
}
}
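As a reading aid (not part of the diff), a minimal usage sketch of executeSecure with placeholder values; Task.scala below calls it the same way with runTask and the task context:

```scala
// Illustrative call: when tokens are present they are installed via
// KerberosUtil.useTokenAuth, then the supplied function runs.
val length: Int = KerberosFunction.executeSecure(
  tokens = None,                    // Option[Array[Byte]] of serialized tokens
  funct = (s: String) => s.length,  // any U => T function
  inputParameters = "spark")
```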
33 changes: 33 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/KerberosUser.scala
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.scheduler

import org.apache.hadoop.security.{Credentials, UserGroupInformation}
import org.apache.spark.{SparkConf, SparkEnv, SparkException}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging

object KerberosUser extends Logging {

def securize (principal: String, keytab: String) : Unit = {
val hadoopConf = SparkHadoopUtil.get.newConfiguration(new SparkConf())
hadoopConf.set("hadoop.security.authentication", "Kerberos")
UserGroupInformation.setConfiguration(hadoopConf)
UserGroupInformation.loginUserFromKeytab(principal, keytab)
}
Contributor:
This method is duplicated in KerberosUtil?

}
108 changes: 108 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/KerberosUtil.scala
@@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.scheduler

import java.security.PrivilegedExceptionAction

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.mapred.Master
import org.apache.hadoop.security.{Credentials, UserGroupInformation}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{SparkConf, SparkEnv, SparkException}
import org.apache.spark.internal.Logging

object KerberosUtil extends Logging {
var proxyUser : Option[UserGroupInformation] = None
def securize (principal: String, keytab: String) : Unit = {
val hadoopConf = SparkHadoopUtil.get.newConfiguration(new SparkConf())
hadoopConf.set("hadoop.security.authentication", "Kerberos")
UserGroupInformation.setConfiguration(hadoopConf)
UserGroupInformation.loginUserFromKeytab(principal, keytab)
}


def getHadoopDelegationTokens : Array[Byte] = {
Contributor:
Given how frequently this method is invoked (and from where), how expensive is it?
+CC @tgravescs, @vanzin

val ugi = proxyUser match {
case Some(user) => user
case None => UserGroupInformation.getLoginUser
}
val principal = ugi.getUserName
val hadoopConf = SparkHadoopUtil.get.conf
val namenodes = Set(FileSystem.get(hadoopConf).getHomeDirectory())
logInfo(s"Found these HDFS namenodes: $namenodes")
val ugiCreds = ugi.getCredentials
ugi.doAs(new PrivilegedExceptionAction[Unit] {
override def run() = {
// use the job principal itself to renew the tokens
obtainTokensForNamenodes(namenodes, hadoopConf, ugiCreds, Some(principal))
}
})
// write tokens into a memory file to transfer it to the executors
val tokenBuf = new java.io.ByteArrayOutputStream(1024 * 1024)
ugiCreds.writeTokenStorageToStream(new java.io.DataOutputStream(tokenBuf))
logDebug(s"Wrote ${tokenBuf.size()} bytes of token data")

hadoopConf.set("hadoop.security.authentication", "Kerberos")
tokenBuf.toByteArray
}
def obtainTokensForNamenodes(
paths: Set[Path],
conf: Configuration,
creds: Credentials,
renewer: Option[String] = None
): Unit = {
if (UserGroupInformation.isSecurityEnabled()) {
val delegTokenRenewer = renewer.getOrElse(getTokenRenewer(conf))
paths.foreach { dst =>
val dstFs = dst.getFileSystem(conf)
logInfo("getting token for namenode: " + dst)
dstFs.addDelegationTokens(delegTokenRenewer, creds)
}
}

}
def getTokenRenewer(conf: Configuration): String = {
val delegTokenRenewer = Master.getMasterPrincipal(conf)
logDebug("delegation token renewer is: " + delegTokenRenewer)
if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
val errorMessage = "Can't get Master Kerberos principal for use as renewer"
logError(errorMessage)
throw new SparkException(errorMessage)
}
delegTokenRenewer
}

def useTokenAuth(tokens: Array[Byte]) {
val sparkConf = SparkEnv.get.conf
logInfo(s"Found delegation tokens of ${tokens.length} bytes")

// configure to use tokens for HDFS login
val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf)
hadoopConf.set("hadoop.security.authentication", "Token")
UserGroupInformation.setConfiguration(hadoopConf)

// decode tokens and add them to the credentials
val creds = UserGroupInformation.getCurrentUser.getCredentials
val tokensBuf = new java.io.ByteArrayInputStream(tokens)
creds.readTokenStorageStream(new java.io.DataInputStream(tokensBuf))
UserGroupInformation.getCurrentUser.addCredentials(creds)
}

}
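To tie the two halves of this file together (a sketch, not part of the diff): the driver serializes delegation tokens when it submits tasks, and the executor installs them before the task body touches HDFS.

```scala
// Driver side: obtain and serialize delegation tokens for the current user.
val tokenBytes: Array[Byte] = KerberosUtil.getHadoopDelegationTokens
// ...tokenBytes travel to the executor inside the serialized Task (see the
// Task/ResultTask/ShuffleMapTask changes below)...
// Executor side: switch the UGI to token-based auth before running the task.
KerberosUtil.useTokenAuth(tokenBytes)
```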

@@ -60,9 +60,10 @@ private[spark] class ResultTask[T, U](
serializedTaskMetrics: Array[Byte],
jobId: Option[Int] = None,
appId: Option[String] = None,
appAttemptId: Option[String] = None)
appAttemptId: Option[String] = None,
tokens: Option[Array[Byte]] = None)
extends Task[U](stageId, stageAttemptId, partition.index, localProperties, serializedTaskMetrics,
jobId, appId, appAttemptId)
jobId, appId, appAttemptId, tokens)
Contributor:
The changes to both *Task.scala files change the security model not just for Mesos but also for YARN, and they are incompatible with the existing public API (credential managers, etc.), unless this PR intends to overhaul security in Spark for all cluster managers.

with Serializable {

@transient private[this] val preferredLocs: Seq[TaskLocation] = {
@@ -60,9 +60,10 @@ private[spark] class ShuffleMapTask(
serializedTaskMetrics: Array[Byte],
jobId: Option[Int] = None,
appId: Option[String] = None,
appAttemptId: Option[String] = None)
appAttemptId: Option[String] = None,
tokens: Option[Array[Byte]] = None)
extends Task[MapStatus](stageId, stageAttemptId, partition.index, localProperties,
serializedTaskMetrics, jobId, appId, appAttemptId)
serializedTaskMetrics, jobId, appId, appAttemptId, tokens)
with Logging {

/** A constructor used only in test suites. This does not require passing in an RDD. */
5 changes: 3 additions & 2 deletions core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -65,7 +65,8 @@ private[spark] abstract class Task[T](
SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array(),
val jobId: Option[Int] = None,
val appId: Option[String] = None,
val appAttemptId: Option[String] = None) extends Serializable {
val appAttemptId: Option[String] = None,
val tokens: Option[Array[Byte]] = None) extends Serializable {

@transient lazy val metrics: TaskMetrics =
SparkEnv.get.closureSerializer.newInstance().deserialize(ByteBuffer.wrap(serializedTaskMetrics))
@@ -110,7 +111,7 @@ private[spark] abstract class Task[T](
Option(attemptNumber)).setCurrentContext()

try {
runTask(context)
KerberosFunction.executeSecure(tokens, runTask, context)
} catch {
case e: Throwable =>
// Catch all errors; run task failure callbacks, and rethrow the exception.
@@ -389,11 +389,10 @@ class SparkSession private(
Dataset.ofRows(self, LogicalRelation(baseRelation))
}

/* ------------------------------- *
| Methods for creating DataSets |
* ------------------------------- */

/**
/* ------------------------------- *
| Methods for creating DataSets |
* ------------------------------- */
/**
* :: Experimental ::
* Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an
* encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation)