[SPARK-13576][BUILD] Don't create assembly for examples.
As part of the goal to stop creating assemblies in Spark, this change
modifies the mvn and sbt builds to not create an assembly for examples.

Instead, dependencies are copied to the build directory (under
target/scala-xx/jars) and, in the final archive, into the "examples/jars"
directory.
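
For illustration only, the resulting layout looks roughly like this (the
Scala version and the "dist" directory name below are assumptions, not fixed
by this change):

  examples/target/scala-2.11/jars/   <- dependencies copied during the build
  dist/examples/jars/                <- the same jars inside the final archive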

To avoid having to deal too much with Windows batch files, I made examples
run through the launcher library: the spark-submit launcher now has a
special mode to run examples, which adds all the necessary jars to the
spark-submit command line. This replaces the bash and batch scripts that
were previously used to run examples; those scripts are now just thin
wrappers around spark-submit. Another advantage is that all spark-submit
options are now supported.
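
As a sketch of the new entry point (the example class and arguments here are
illustrative), the following two invocations should be equivalent:

  ./bin/run-example --master local[4] SparkPi 100
  ./bin/spark-submit run-example --master local[4] SparkPi 100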

There are a few glitches; in the mvn build, a lot of duplicated dependencies
get copied, because they are promoted to "compile" scope due to extra
dependencies in the examples module (such as HBase). In the sbt build,
all dependencies are copied, because there doesn't seem to be an easy
way to filter things.

I plan to clean some of this up when the rest of the tasks are finished.
When the main assembly is replaced with jars, we can remove duplicate jars
from the examples directory during packaging.

Tested by running SparkPi in the maven build, the sbt build, and a dist
created by make-distribution.sh.

Finally: note that running the "assembly" target in sbt doesn't build
the examples anymore. You need to run "package" for that.
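
A rough sketch of how the three test scenarios above can be exercised (the
build flags and the "dist" directory name are assumptions, not prescribed by
this change):

  build/mvn -DskipTests package && ./bin/run-example SparkPi
  build/sbt package && ./bin/run-example SparkPi
  ./dev/make-distribution.sh && (cd dist && ./bin/run-example SparkPi)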

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #11452 from vanzin/SPARK-13576.
Marcelo Vanzin committed Mar 15, 2016
1 parent bd5365b commit 48978ab
Showing 9 changed files with 157 additions and 179 deletions.
55 changes: 2 additions & 53 deletions bin/run-example
@@ -21,56 +21,5 @@ if [ -z "${SPARK_HOME}" ]; then
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi

EXAMPLES_DIR="${SPARK_HOME}"/examples

. "${SPARK_HOME}"/bin/load-spark-env.sh

if [ -n "$1" ]; then
EXAMPLE_CLASS="$1"
shift
else
echo "Usage: ./bin/run-example <example-class> [example-args]" 1>&2
echo " - set MASTER=XX to use a specific master" 1>&2
echo " - can use abbreviated example class name relative to com.apache.spark.examples" 1>&2
echo " (e.g. SparkPi, mllib.LinearRegression, streaming.KinesisWordCountASL)" 1>&2
exit 1
fi

if [ -f "${SPARK_HOME}/RELEASE" ]; then
JAR_PATH="${SPARK_HOME}/lib"
else
JAR_PATH="${EXAMPLES_DIR}/target/scala-${SPARK_SCALA_VERSION}"
fi

JAR_COUNT=0

for f in "${JAR_PATH}"/spark-examples-*hadoop*.jar; do
if [[ ! -e "$f" ]]; then
echo "Failed to find Spark examples assembly in ${SPARK_HOME}/lib or ${SPARK_HOME}/examples/target" 1>&2
echo "You need to build Spark before running this program" 1>&2
exit 1
fi
SPARK_EXAMPLES_JAR="$f"
JAR_COUNT=$((JAR_COUNT+1))
done

if [ "$JAR_COUNT" -gt "1" ]; then
echo "Found multiple Spark examples assembly jars in ${JAR_PATH}" 1>&2
ls "${JAR_PATH}"/spark-examples-*hadoop*.jar 1>&2
echo "Please remove all but one jar." 1>&2
exit 1
fi

export SPARK_EXAMPLES_JAR

EXAMPLE_MASTER=${MASTER:-"local[*]"}

if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS"
fi

exec "${SPARK_HOME}"/bin/spark-submit \
--master $EXAMPLE_MASTER \
--class $EXAMPLE_CLASS \
"$SPARK_EXAMPLES_JAR" \
"$@"
export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]"
exec "${SPARK_HOME}"/bin/spark-submit run-example "$@"
7 changes: 3 additions & 4 deletions bin/run-example.cmd
@@ -17,7 +17,6 @@ rem See the License for the specific language governing permissions and
rem limitations under the License.
rem

rem This is the entry point for running a Spark example. To avoid polluting
rem the environment, it just launches a new cmd to do the real work.

cmd /V /E /C "%~dp0run-example2.cmd" %*
set SPARK_HOME=%~dp0..
set _SPARK_CMD_USAGE=Usage: ./bin/run-example [options] example-class [example args]
cmd /V /E /C "%~dp0spark-submit.cmd" run-example %*
85 changes: 0 additions & 85 deletions bin/run-example2.cmd

This file was deleted.

5 changes: 4 additions & 1 deletion dev/make-distribution.sh
@@ -166,11 +166,14 @@ echo "Build flags: $@" >> "$DISTDIR/RELEASE"

# Copy jars
cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
cp "$SPARK_HOME"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
# This will fail if the -Pyarn profile is not provided
# In this case, silence the error and ignore the return code of this command
cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :

# Copy examples and dependencies
mkdir -p "$DISTDIR/examples/jars"
cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars"

# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r "$SPARK_HOME"/examples/src/main "$DISTDIR/examples/src/"
54 changes: 27 additions & 27 deletions examples/pom.xml
@@ -322,36 +322,36 @@
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<id>prepare-test-jar</id>
<phase>none</phase>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar</outputFile>
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
<resource>log4j.properties</resource>
</transformer>
</transformers>
<outputDirectory>${jars.target.dir}</outputDirectory>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<includeScope>runtime</includeScope>
<outputDirectory>${jars.target.dir}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<profiles>
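
Since the copy-dependencies execution above is bound to the package phase, a
normal module build should populate the jars directory; a sketch (the module
selector and Scala version are assumptions):

  build/mvn -pl examples -am -DskipTests package
  ls examples/target/scala-2.11/jars/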
launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
@@ -30,7 +30,8 @@
* driver-side options and special parsing behavior needed for the special-casing certain internal
* Spark applications.
* <p>
* This class has also some special features to aid launching pyspark.
* This class has also some special features to aid launching shells (pyspark and sparkR) and also
* examples.
*/
class SparkSubmitCommandBuilder extends AbstractCommandBuilder {

@@ -62,6 +63,17 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
*/
static final String SPARKR_SHELL_RESOURCE = "sparkr-shell";

/**
* Name of app resource used to identify examples. When running examples, args[0] should be
* this name. The app resource will identify the example class to run.
*/
static final String RUN_EXAMPLE = "run-example";

/**
* Prefix for example class names.
*/
static final String EXAMPLE_CLASS_PREFIX = "org.apache.spark.examples.";

/**
* This map must match the class names for available special classes, since this modifies the way
* command line parsing works. This maps the class name to the resource to use when calling
@@ -78,6 +90,7 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {

final List<String> sparkArgs;
private final boolean printInfo;
private final boolean isExample;

/**
* Controls whether mixing spark-submit arguments with app arguments is allowed. This is needed
@@ -89,10 +102,13 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
SparkSubmitCommandBuilder() {
this.sparkArgs = new ArrayList<>();
this.printInfo = false;
this.isExample = false;
}

SparkSubmitCommandBuilder(List<String> args) {
this.sparkArgs = new ArrayList<>();
this.allowsMixedArguments = false;

boolean isExample = false;
List<String> submitArgs = args;
if (args.size() > 0 && args.get(0).equals(PYSPARK_SHELL)) {
this.allowsMixedArguments = true;
@@ -102,10 +118,14 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
this.allowsMixedArguments = true;
appResource = SPARKR_SHELL_RESOURCE;
submitArgs = args.subList(1, args.size());
} else {
this.allowsMixedArguments = false;
} else if (args.size() > 0 && args.get(0).equals(RUN_EXAMPLE)) {
isExample = true;
submitArgs = args.subList(1, args.size());
}

this.sparkArgs = new ArrayList<>();
this.isExample = isExample;

OptionParser parser = new OptionParser();
parser.parse(submitArgs);
this.printInfo = parser.infoRequested;
@@ -155,6 +175,10 @@ List<String> buildSparkSubmitArgs() {
args.add(propertiesFile);
}

if (isExample) {
jars.addAll(findExamplesJars());
}

if (!jars.isEmpty()) {
args.add(parser.JARS);
args.add(join(",", jars));
@@ -170,6 +194,9 @@ List<String> buildSparkSubmitArgs() {
args.add(join(",", pyFiles));
}

if (!printInfo) {
checkArgument(!isExample || mainClass != null, "Missing example class name.");
}
if (mainClass != null) {
args.add(parser.CLASS);
args.add(mainClass);
@@ -308,6 +335,25 @@ private boolean isThriftServer(String mainClass) {
mainClass.equals("org.apache.spark.sql.hive.thriftserver.HiveThriftServer2"));
}

private List<String> findExamplesJars() {
List<String> examplesJars = new ArrayList<>();
String sparkHome = getSparkHome();

File jarsDir;
if (new File(sparkHome, "RELEASE").isFile()) {
jarsDir = new File(sparkHome, "examples/jars");
} else {
jarsDir = new File(sparkHome,
String.format("examples/target/scala-%s/jars", getScalaVersion()));
}
checkState(jarsDir.isDirectory(), "Examples jars directory '%s' does not exist.",
jarsDir.getAbsolutePath());

for (File f: jarsDir.listFiles()) {
examplesJars.add(f.getAbsolutePath());
}
return examplesJars;
}

private class OptionParser extends SparkSubmitOptionParser {

@@ -367,6 +413,14 @@ protected boolean handleUnknown(String opt) {
if (allowsMixedArguments) {
appArgs.add(opt);
return true;
} else if (isExample) {
String className = opt;
if (!className.startsWith(EXAMPLE_CLASS_PREFIX)) {
className = EXAMPLE_CLASS_PREFIX + className;
}
mainClass = className;
appResource = "spark-internal";
return false;
} else {
checkArgument(!opt.startsWith("-"), "Unrecognized option: %s", opt);
sparkArgs.add(opt);
@@ -376,8 +430,10 @@

@Override
protected void handleExtraArgs(List<String> extra) {
for (String arg : extra) {
sparkArgs.add(arg);
if (isExample) {
appArgs.addAll(extra);
} else {
sparkArgs.addAll(extra);
}
}

