From 144af84f727cb11befc11723152e65e5e84cbb16 Mon Sep 17 00:00:00 2001
From: Matei Zaharia
Date: Sun, 18 May 2014 16:44:26 -0700
Subject: [PATCH 1/4] Update Windows scripts to match latest binary package layout

Also fixed an issue where SparkSubmit was trying to parse local files
as URLs, which fails on Windows because they contain backslashes. We
didn't need to treat those as URLs to check if a file exists. (A short
illustration of the URL-parsing failure appears in the note at the end
of this series.)
---
 README.md | 7 +--
 bin/compute-classpath.cmd | 17 ++++++-
 bin/run-example | 23 ++++-----
 bin/run-example2.cmd | 51 ++++++++++++++-----
 bin/spark-class2.cmd | 2 +
 .../org/apache/spark/deploy/SparkSubmit.scala | 2 +-
 6 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 9c2e32b90f162..6211a5889a3f5 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,14 @@ You can find the latest Spark documentation, including a programming guide,
 on the project webpage at <http://spark.apache.org/documentation.html>.
 This README file only contains basic setup instructions.
 
-
 ## Building Spark
 
 Spark is built on Scala 2.10. To build Spark and its example programs, run:
 
     ./sbt/sbt assembly
 
+(You do not need to do this if you downloaded a pre-built package.)
+
 ## Interactive Scala Shell
 
 The easiest way to start using Spark is through the Scala shell:
@@ -41,9 +42,9 @@ And run the following command, which should also return 1000:
 Spark also comes with several sample programs in the `examples` directory.
 To run one of them, use `./bin/run-example <class> [params]`. For example:
 
-    ./bin/run-example org.apache.spark.examples.SparkLR
+    ./bin/run-example SparkPi
 
-will run the Logistic Regression example locally.
+will run the Pi example locally.
 
 You can set the MASTER environment variable when running examples to submit
 examples to a cluster. This can be a mesos:// or spark:// URL,
diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd
index 065553eb31939..a987bf56908e7 100644
--- a/bin/compute-classpath.cmd
+++ b/bin/compute-classpath.cmd
@@ -31,7 +31,7 @@ if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"
 rem Build up classpath
 set CLASSPATH=%FWDIR%conf
 if exist "%FWDIR%RELEASE" (
-  for %%d in ("%FWDIR%jars\spark-assembly*.jar") do (
+  for %%d in ("%FWDIR%lib\spark-assembly*.jar") do (
     set ASSEMBLY_JAR=%%d
   )
 ) else (
@@ -42,6 +42,21 @@ if exist "%FWDIR%RELEASE" (
 
 set CLASSPATH=%CLASSPATH%;%ASSEMBLY_JAR%
 
+rem When Hive support is needed, Datanucleus jars must be included on the classpath.
+rem Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+rem Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+rem built with Hive, so look for them there.
+if exist "%FWDIR%RELEASE" (
+  set datanucleus_dir=%FWDIR%\lib
+) else (
+  set datanucleus_dir=%FWDIR%\lib_managed\jars
+)
+set "datanucleus_jars="
+for %%d in ("%datanucleus_dir%\datanucleus-*.jar") do (
+  set datanucleus_jars=!datanucleus_jars!;%%d
+)
+set CLASSPATH=%CLASSPATH%;%datanucleus_jars%
+
 set SPARK_CLASSES=%FWDIR%core\target\scala-%SCALA_VERSION%\classes
 set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%repl\target\scala-%SCALA_VERSION%\classes
 set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%mllib\target\scala-%SCALA_VERSION%\classes
diff --git a/bin/run-example b/bin/run-example
index 146951ac0ee56..7caab31daef39 100755
--- a/bin/run-example
+++ b/bin/run-example
@@ -23,6 +23,16 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 export SPARK_HOME="$FWDIR"
 EXAMPLES_DIR="$FWDIR"/examples
 
+if [ -n "$1" ]; then
+  EXAMPLE_CLASS="$1"
+  shift
+else
+  echo "Usage: ./bin/run-example <example-class> [example-args]"
+  echo "  - set MASTER=XX to use a specific master"
+  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)"
+  exit 1
+fi
+
 if [ -f "$FWDIR/RELEASE" ]; then
   export SPARK_EXAMPLES_JAR=`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar`
 elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar ]; then
@@ -37,17 +47,6 @@ fi
 
 EXAMPLE_MASTER=${MASTER:-"local[*]"}
 
-if [ -n "$1" ]; then
-  EXAMPLE_CLASS="$1"
-  shift
-else
-  echo "usage: ./bin/run-example <example-class> [example-args]"
-  echo "  - set MASTER=XX to use a specific master"
-  echo "  - can use abbreviated example class name (e.g. SparkPi, mllib.MovieLensALS)"
-  echo
-  exit -1
-fi
-
 if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
   EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS"
 fi
@@ -55,5 +54,5 @@ fi
 ./bin/spark-submit \
   --master $EXAMPLE_MASTER \
   --class $EXAMPLE_CLASS \
-  $SPARK_EXAMPLES_JAR \
+  "$SPARK_EXAMPLES_JAR" \
   "$@"
diff --git a/bin/run-example2.cmd b/bin/run-example2.cmd
index 40abb9af74246..5832d1fa5f774 100644
--- a/bin/run-example2.cmd
+++ b/bin/run-example2.cmd
@@ -30,7 +30,9 @@ if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"
 
 rem Test that an argument was given
 if not "x%1"=="x" goto arg_given
-  echo Usage: run-example ^<example-class^> [^<args^>]
+  echo Usage: run-example ^<example-class^> [^<example-args^>]
+  echo   - set MASTER=XX to use a specific master
+  echo   - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)
   goto exit
 :arg_given
 
@@ -38,8 +40,14 @@ set EXAMPLES_DIR=%FWDIR%examples
 
 rem Figure out the JAR file that our examples were packaged into.
 set SPARK_EXAMPLES_JAR=
-for %%d in ("%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\spark-examples*assembly*.jar") do (
-  set SPARK_EXAMPLES_JAR=%%d
+if exist "%FWDIR%RELEASE" (
+  for %%d in ("%FWDIR%lib\spark-examples*.jar") do (
+    set SPARK_EXAMPLES_JAR=%%d
+  )
+) else (
+  for %%d in ("%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\spark-examples*.jar") do (
+    set SPARK_EXAMPLES_JAR=%%d
+  )
 )
 if "x%SPARK_EXAMPLES_JAR%"=="x" (
   echo Failed to find Spark examples assembly JAR.
@@ -47,15 +55,34 @@ if "x%SPARK_EXAMPLES_JAR%"=="x" (
   goto exit
 )
 
-rem Compute Spark classpath using external script
-set DONT_PRINT_CLASSPATH=1
-call "%FWDIR%bin\compute-classpath.cmd"
-set DONT_PRINT_CLASSPATH=0
-set CLASSPATH=%SPARK_EXAMPLES_JAR%;%CLASSPATH%
+rem Set master from MASTER environment variable if given
+if "x%MASTER%"=="x" (
+  set EXAMPLE_MASTER=local[*]
+) else (
+  set EXAMPLE_MASTER=%MASTER%
+)
+
+rem If the EXAMPLE_CLASS does not start with org.apache.spark.examples, add that
+set EXAMPLE_CLASS=%1
+set PREFIX=%EXAMPLE_CLASS:~0,25%
+if not %PREFIX%==org.apache.spark.examples (
+  set EXAMPLE_CLASS=org.apache.spark.examples.%EXAMPLE_CLASS%
+)
+
+rem Get the tail of the argument list, to skip the first one. This is surprisingly
+rem complicated on Windows.
+set "ARGS="
+:top
+shift
+if "%~1" neq "" (
+  set ARGS=%ARGS% "%~1"
+  goto :top
+)
+if defined ARGS set ARGS=%ARGS:~1%
 
-rem Figure out where java is.
-set RUNNER=java
-if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
+call "%FWDIR%bin\spark-submit.cmd" ^
+  --master %EXAMPLE_MASTER% ^
+  --class %EXAMPLE_CLASS% ^
+  "%SPARK_EXAMPLES_JAR%" %ARGS%
 
-"%RUNNER%" -cp "%CLASSPATH%" %JAVA_OPTS% %*
 :exit
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index 4302c1b6b7ff4..266edd9fa9835 100755
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -17,6 +17,8 @@ rem See the License for the specific language governing permissions and
 rem limitations under the License.
 rem
 
+setlocal enabledelayedexpansion
+
 set SCALA_VERSION=2.10
 
 rem Figure out where the Spark framework is installed
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index a99b2176e2b5e..c54331c00fab8 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -299,7 +299,7 @@ object SparkSubmit {
   }
 
   private def addJarToClasspath(localJar: String, loader: ExecutorURLClassLoader) {
-    val localJarFile = new File(new URI(localJar).getPath)
+    val localJarFile = new File(localJar)
     if (!localJarFile.exists()) {
       printWarning(s"Jar $localJar does not exist, skipping.")
     }

From d3b71c7611e5d52519fecb4e9268b0ec362ad65d Mon Sep 17 00:00:00 2001
From: Matei Zaharia
Date: Sun, 18 May 2014 17:16:40 -0700
Subject: [PATCH 2/4] Properly exclude datanucleus files in Maven assembly

They are excluded in SBT, but the rule added in Maven didn't actually
remove the files from the JAR. The JARs built still worked despite this,
but it's better to remove them than have 2 copies on the classpath.
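
The shade plugin's filter excludes match file paths inside the jar, not
Maven coordinates, which is why org/datanucleus/** takes effect where
org.datanucleus:* silently matched nothing. Not part of the patch, but
as a sanity check, a minimal Scala sketch along these lines lists any
datanucleus entries left in an assembly (the default jar path below is
hypothetical; pass the real one as an argument):

    import java.util.jar.JarFile
    import scala.collection.JavaConverters._

    object CheckAssembly {
      def main(args: Array[String]): Unit = {
        // Hypothetical default; pass the real assembly jar path as the first argument.
        val jarPath = args.headOption.getOrElse("assembly/target/spark-assembly.jar")
        val jar = new JarFile(jarPath)
        // Shade filter patterns apply to entry names like "org/datanucleus/...",
        // so scan the entry list for that prefix.
        val leaked = jar.entries().asScala.map(_.getName)
          .filter(_.startsWith("org/datanucleus/")).toList
        jar.close()
        if (leaked.isEmpty) println("OK: no datanucleus files in the assembly")
        else leaked.foreach(e => println("still present: " + e))
      }
    }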
---
 assembly/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assembly/pom.xml b/assembly/pom.xml
index abd8935339992..963357b9ab167 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -96,7 +96,7 @@
               <artifact>*:*</artifact>
               <excludes>
-                <exclude>org.datanucleus:*</exclude>
+                <exclude>org/datanucleus/**</exclude>
                 <exclude>META-INF/*.SF</exclude>
                 <exclude>META-INF/*.DSA</exclude>
                 <exclude>META-INF/*.RSA</exclude>
               </excludes>

From 228577bc9dd3c884c2dc40dc9f0405790467dbcd Mon Sep 17 00:00:00 2001
From: Matei Zaharia
Date: Mon, 19 May 2014 00:52:26 -0700
Subject: [PATCH 3/4] Review comments

---
 bin/compute-classpath.cmd | 11 +++++++++--
 bin/run-example2.cmd | 2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd
index a987bf56908e7..45a58f24bfb87 100644
--- a/bin/compute-classpath.cmd
+++ b/bin/compute-classpath.cmd
@@ -20,6 +20,13 @@ rem
 rem This script computes Spark's classpath and prints it to stdout; it's used by both the "run"
 rem script and the ExecutorRunner in standalone cluster mode.
 
+rem If we're called from spark-class2.cmd, it already set enabledelayedexpansion and setting
+rem it here would stop us from affecting its copy of the CLASSPATH variable; otherwise we
+rem need to set it here because we use !classpath! below.
+if "%DONT_PRINT_CLASSPATH%"=="1" goto skip_delayed_expansion
+setlocal enabledelayedexpansion
+:skip_delayed_expansion
+
 set SCALA_VERSION=2.10
 
 rem Figure out where the Spark framework is installed
@@ -47,9 +54,9 @@ rem Datanucleus jars do not work if only included in the uber jar as plugin.xml
 rem Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
 rem built with Hive, so look for them there.
 if exist "%FWDIR%RELEASE" (
-  set datanucleus_dir=%FWDIR%\lib
+  set datanucleus_dir=%FWDIR%lib
 ) else (
-  set datanucleus_dir=%FWDIR%\lib_managed\jars
+  set datanucleus_dir=%FWDIR%lib_managed\jars
 )
 set "datanucleus_jars="
 for %%d in ("%datanucleus_dir%\datanucleus-*.jar") do (
diff --git a/bin/run-example2.cmd b/bin/run-example2.cmd
index 5832d1fa5f774..eadedd7fa61ff 100644
--- a/bin/run-example2.cmd
+++ b/bin/run-example2.cmd
@@ -30,7 +30,7 @@ if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"
 
 rem Test that an argument was given
 if not "x%1"=="x" goto arg_given
-  echo Usage: run-example ^<example-class^> [^<example-args^>]
+  echo Usage: run-example ^<example-class^> [example-args]
   echo   - set MASTER=XX to use a specific master
   echo   - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)
   goto exit

From d558f960d8b7fd1d92e0ff8d34b6d6b60f52e94f Mon Sep 17 00:00:00 2001
From: Matei Zaharia
Date: Mon, 19 May 2014 01:03:09 -0700
Subject: [PATCH 4/4] Fix comment

---
 bin/compute-classpath.cmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/compute-classpath.cmd b/bin/compute-classpath.cmd
index 45a58f24bfb87..58710cd1bd548 100644
--- a/bin/compute-classpath.cmd
+++ b/bin/compute-classpath.cmd
@@ -22,7 +22,7 @@ rem script and the ExecutorRunner in standalone cluster mode.
 
 rem If we're called from spark-class2.cmd, it already set enabledelayedexpansion and setting
 rem it here would stop us from affecting its copy of the CLASSPATH variable; otherwise we
-rem need to set it here because we use !classpath! below.
+rem need to set it here because we use !datanucleus_jars! below.
 if "%DONT_PRINT_CLASSPATH%"=="1" goto skip_delayed_expansion
 setlocal enabledelayedexpansion
 :skip_delayed_expansion
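
A note on the SparkSubmit change in PATCH 1/4: java.net.URI rejects the
backslashes in a native Windows path outright, so the old code threw a
URISyntaxException before the existence check ever ran, while java.io.File
accepts native paths on any platform. A minimal sketch of the difference
(the jar path is hypothetical):

    import java.io.File
    import java.net.{URI, URISyntaxException}

    object WindowsJarPath {
      def main(args: Array[String]): Unit = {
        // Hypothetical local jar path as it would appear on Windows.
        val localJar = "C:\\spark\\lib\\spark-examples.jar"
        // Old behavior: backslashes are illegal in URIs, so parsing fails
        // before we can even ask whether the file exists.
        try {
          new File(new URI(localJar).getPath)
        } catch {
          case e: URISyntaxException => println("URI parse failed: " + e.getMessage)
        }
        // New behavior: File takes the native path directly.
        println("exists? " + new File(localJar).exists())
      }
    }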