From 9f7ed8f62ad1bfb1cf7dd7c5b3f9863f68ec6fe9 Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Tue, 24 Jun 2014 15:45:14 -0700
Subject: [PATCH 1/3] Clarify client vs cluster deploy mode + add safeguards

The existing docs are highly misleading. For standalone mode, for
example, they encourage the user to use standalone-cluster mode, which
is not officially supported. The safeguards have been added in Spark
submit itself to prevent bad documentation from leading users down the
wrong path in the future.
---
 .../org/apache/spark/deploy/SparkSubmit.scala | 17 ++++++++++++++---
 .../spark/deploy/SparkSubmitArguments.scala   |  5 +++--
 docs/running-on-mesos.md                      |  3 ++-
 docs/spark-standalone.md                      |  8 +++-----
 docs/submitting-applications.md               | 12 +++++++++++-
 5 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 7e9a9344e61f9..517835cb25f07 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -19,7 +19,7 @@ package org.apache.spark.deploy
 
 import java.io.{File, PrintStream}
 import java.lang.reflect.InvocationTargetException
-import java.net.{URI, URL}
+import java.net.URL
 
 import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
 
@@ -117,14 +117,25 @@ object SparkSubmit {
     val isPython = args.isPython
     val isYarnCluster = clusterManager == YARN && deployOnCluster
 
+    // For Mesos, only client mode is supported
     if (clusterManager == MESOS && deployOnCluster) {
-      printErrorAndExit("Cannot currently run driver on the cluster in Mesos")
+      printErrorAndExit("Mesos cluster mode is currently not supported.")
+    }
+
+    // For standalone, only client mode is supported
+    if (clusterManager == STANDALONE && deployOnCluster) {
+      printErrorAndExit("Standalone cluster mode is currently not supported.")
+    }
+
+    // For shells, only client mode is applicable
+    if (isShell(args.primaryResource) && deployOnCluster) {
+      printErrorAndExit("Cluster mode is not applicable to Spark shells.")
     }
 
     // If we're running a python app, set the main class to our specific python runner
     if (isPython) {
       if (deployOnCluster) {
-        printErrorAndExit("Cannot currently run Python driver programs on cluster")
+        printErrorAndExit("Cluster mode is currently not supported for Python.")
       }
       if (args.primaryResource == PYSPARK_SHELL) {
         args.mainClass = "py4j.GatewayServer"
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index f1032ea8dbada..ff45e2ce228ea 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -338,8 +338,9 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
       """Usage: spark-submit [options] <app jar | python file> [app options]
         |Options:
         |  --master MASTER_URL         spark://host:port, mesos://host:port, yarn, or local.
-        |  --deploy-mode DEPLOY_MODE   Where to run the driver program: either "client" to run
-        |                              on the local machine, or "cluster" to run inside cluster.
+        |  --deploy-mode DEPLOY_MODE   Whether to launch the driver program locally (client) or
+        |                              on one of the worker machines inside the cluster (cluster)
+        |                              (Default: client).
         |  --class CLASS_NAME          Your application's main class (for Java / Scala apps).
         |  --name NAME                 A name of your application.
         |  --jars JARS                 Comma-separated list of local jars to include on the driver
diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md
index e3c8922404365..bd046cfc1837d 100644
--- a/docs/running-on-mesos.md
+++ b/docs/running-on-mesos.md
@@ -127,7 +127,8 @@ val sc = new SparkContext(conf)
 {% endhighlight %}
 
 (You can also use [`spark-submit`](submitting-applications.html) and configure `spark.executor.uri`
-in the [conf/spark-defaults.conf](configuration.html#loading-default-configurations) file.)
+in the [conf/spark-defaults.conf](configuration.html#loading-default-configurations) file. Note
+that `spark-submit` currently only supports deploying the Spark driver in `client` mode for Mesos.)
 
 When running a shell, the `spark.executor.uri` parameter is inherited from `SPARK_EXECUTOR_URI`, so
 it does not need to be redundantly passed in as a system property.
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index 3c1ce06083ede..7eed1ffb972f9 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -235,11 +235,9 @@ You can also pass an option `--cores <numCores>` to control the number of cores
 
 # Launching Compiled Spark Applications
 
-Spark supports two deploy modes: applications may run with the driver inside the client process or
-entirely inside the cluster. The
-[`spark-submit` script](submitting-applications.html) provides the
-most straightforward way to submit a compiled Spark application to the cluster in either deploy
-mode.
+The [`spark-submit` script](submitting-applications.html) provides the most straightforward way to
+submit a compiled Spark application to the cluster. For standalone clusters, Spark currently
+only supports deploying the driver inside the client process.
 
 If your application is launched through Spark submit, then the application jar is automatically
 distributed to all worker nodes. For any additional jars that your application depends on, you
diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md
index d2864fe4c2f65..78f4c246fe4cf 100644
--- a/docs/submitting-applications.md
+++ b/docs/submitting-applications.md
@@ -42,10 +42,20 @@ Some of the commonly used options are:
 
 * `--class`: The entry point for your application (e.g. `org.apache.spark.examples.SparkPi`)
 * `--master`: The [master URL](#master-urls) for the cluster (e.g. `spark://23.195.26.187:7077`)
-* `--deploy-mode`: Whether to deploy your driver program within the cluster or run it locally as an external client (either `cluster` or `client`)
+* `--deploy-mode`: Whether to deploy your driver on the worker nodes (`cluster`) or locally as an external client (`client`) (default: `client`)*
 * `application-jar`: Path to a bundled jar including your application and all dependencies. The URL must be globally visible inside of your cluster, for instance, an `hdfs://` path or a `file://` path that is present on all nodes.
 * `application-arguments`: Arguments passed to the main method of your main class, if any
 
+*In general, if you are hosting your own cluster, `cluster` mode provides little benefit over its
+alternative. The main use case for `cluster` mode is when the machine that launches the application
+is far from the worker machines, in which case the communication between the driver and the
+executors suffers from high network latency. Note that `cluster` mode is currently not supported
+for standalone clusters, Mesos clusters, and Python applications.
+
+In `client` mode, on the other hand, the driver is launched directly within the client
+`spark-submit` process, with the input and output of the application attached to the console.
+Thus, this mode is especially suitable for applications that involve the REPL (e.g. Spark shell).
+
 For Python applications, simply pass a `.py` file in the place of `<application-jar>` instead of a JAR,
 and add Python `.zip`, `.egg` or `.py` files to the search path with `--py-files`.

From c827f32a1ef1b57e966509a4b38f033f40e6ae33 Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Thu, 26 Jun 2014 18:50:02 -0700
Subject: [PATCH 2/3] Clarify spark submit messages

---
 .../main/scala/org/apache/spark/deploy/SparkSubmit.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index 517835cb25f07..b050dccb6d57f 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -119,23 +119,23 @@ object SparkSubmit {
 
     // For Mesos, only client mode is supported
     if (clusterManager == MESOS && deployOnCluster) {
-      printErrorAndExit("Mesos cluster mode is currently not supported.")
+      printErrorAndExit("Cluster deploy mode is currently not supported for Mesos clusters.")
     }
 
     // For standalone, only client mode is supported
     if (clusterManager == STANDALONE && deployOnCluster) {
-      printErrorAndExit("Standalone cluster mode is currently not supported.")
+      printErrorAndExit("Cluster deploy mode is currently not supported for standalone clusters.")
     }
 
     // For shells, only client mode is applicable
     if (isShell(args.primaryResource) && deployOnCluster) {
-      printErrorAndExit("Cluster mode is not applicable to Spark shells.")
+      printErrorAndExit("Cluster deploy mode is not applicable to Spark shells.")
     }
 
     // If we're running a python app, set the main class to our specific python runner
     if (isPython) {
       if (deployOnCluster) {
-        printErrorAndExit("Cluster mode is currently not supported for Python.")
+        printErrorAndExit("Cluster deploy mode is currently not supported for Python.")
       }
       if (args.primaryResource == PYSPARK_SHELL) {
         args.mainClass = "py4j.GatewayServer"

From 5ea2460086f0682a9de67da0ba1f417dc4e4a986 Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Fri, 27 Jun 2014 16:10:12 -0700
Subject: [PATCH 3/3] Rephrase cluster vs client explanation

---
 .../spark/deploy/SparkSubmitArguments.scala |  4 ++--
 docs/spark-standalone.md                    |  3 ++-
 docs/submitting-applications.md             | 20 ++++++++++---------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index ff45e2ce228ea..57655aa4c32b1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -338,8 +338,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
       """Usage: spark-submit [options] <app jar | python file> [app options]
         |Options:
         |  --master MASTER_URL         spark://host:port, mesos://host:port, yarn, or local.
-        |  --deploy-mode DEPLOY_MODE   Whether to launch the driver program locally (client) or
-        |                              on one of the worker machines inside the cluster (cluster)
+        |  --deploy-mode DEPLOY_MODE   Whether to launch the driver program locally ("client") or
+        |                              on one of the worker machines inside the cluster ("cluster")
         |                              (Default: client).
         |  --class CLASS_NAME          Your application's main class (for Java / Scala apps).
         |  --name NAME                 A name of your application.
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index 7eed1ffb972f9..f5c0f7cef83d2 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -237,7 +237,8 @@ You can also pass an option `--cores <numCores>` to control the number of cores
 
 The [`spark-submit` script](submitting-applications.html) provides the most straightforward way to
 submit a compiled Spark application to the cluster. For standalone clusters, Spark currently
-only supports deploying the driver inside the client process.
+only supports deploying the driver inside the client process that is submitting the application
+(`client` deploy mode).
 
 If your application is launched through Spark submit, then the application jar is automatically
 distributed to all worker nodes. For any additional jars that your application depends on, you
diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md
index 78f4c246fe4cf..e05883072bfa8 100644
--- a/docs/submitting-applications.md
+++ b/docs/submitting-applications.md
@@ -46,15 +46,17 @@ Some of the commonly used options are:
 * `application-jar`: Path to a bundled jar including your application and all dependencies. The URL must be globally visible inside of your cluster, for instance, an `hdfs://` path or a `file://` path that is present on all nodes.
 * `application-arguments`: Arguments passed to the main method of your main class, if any
 
-*In general, if you are hosting your own cluster, `cluster` mode provides little benefit over its
-alternative. The main use case for `cluster` mode is when the machine that launches the application
-is far from the worker machines, in which case the communication between the driver and the
-executors suffers from high network latency. Note that `cluster` mode is currently not supported
-for standalone clusters, Mesos clusters, and Python applications.
-
-In `client` mode, on the other hand, the driver is launched directly within the client
-`spark-submit` process, with the input and output of the application attached to the console.
-Thus, this mode is especially suitable for applications that involve the REPL (e.g. Spark shell).
+*A common deployment strategy is to submit your application from a gateway machine that is
+physically co-located with your worker machines (e.g. Master node in a standalone EC2 cluster).
+In this setup, `client` mode is appropriate. In `client` mode, the driver is launched directly
+within the client `spark-submit` process, with the input and output of the application attached
+to the console. Thus, this mode is especially suitable for applications that involve the REPL
+(e.g. Spark shell).
+
+Alternatively, if your application is submitted from a machine far from the worker machines (e.g.
+locally on your laptop), it is common to use `cluster` mode to minimize network latency between
+the driver and the executors. Note that `cluster` mode is currently not supported for standalone
+clusters, Mesos clusters, or Python applications.
 
 For Python applications, simply pass a `.py` file in the place of `<application-jar>` instead of a JAR,
 and add Python `.zip`, `.egg` or `.py` files to the search path with `--py-files`.
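
For reference, below is a minimal, self-contained Scala sketch of the deploy-mode
safeguard logic that the three patches above converge on. The object name
`DeployModeSafeguards` and the pure `validate` helper are invented for this sketch
and are not Spark's actual API: in the real patches the checks live inline in
SparkSubmit.scala and terminate the JVM via `printErrorAndExit`.

// Hypothetical sketch of the deploy-mode safeguards in this series; the names
// below are illustrative, not Spark's actual API.
object DeployModeSafeguards {
  private val Standalone = "standalone"
  private val Mesos = "mesos"

  /** Returns Some(error message) if the requested combination is unsupported, else None. */
  def validate(
      clusterManager: String,
      deployOnCluster: Boolean,
      isShell: Boolean,
      isPython: Boolean): Option[String] = {
    if (!deployOnCluster) {
      None // client deploy mode is supported for every cluster manager
    } else if (clusterManager == Mesos) {
      Some("Cluster deploy mode is currently not supported for Mesos clusters.")
    } else if (clusterManager == Standalone) {
      Some("Cluster deploy mode is currently not supported for standalone clusters.")
    } else if (isShell) {
      Some("Cluster deploy mode is not applicable to Spark shells.")
    } else if (isPython) {
      Some("Cluster deploy mode is currently not supported for Python.")
    } else {
      None // e.g. a compiled application in YARN cluster mode passes all checks
    }
  }

  def main(args: Array[String]): Unit = {
    // Rejected: standalone master combined with cluster deploy mode
    println(validate(Standalone, deployOnCluster = true, isShell = false, isPython = false))
    // Accepted: compiled application in YARN cluster mode
    println(validate("yarn", deployOnCluster = true, isShell = false, isPython = false))
  }
}

Expressed as a pure function, the precedence of the checks (cluster manager first,
then shells, then Python) is explicit and easy to unit-test. Running `main` prints
Some(Cluster deploy mode is currently not supported for standalone clusters.)
followed by None.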