Merge branch 'master' into SPARK-2889
Conflicts:
	core/src/main/scala/org/apache/spark/util/FileLogger.scala
	yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
Marcelo Vanzin committed Aug 20, 2014
2 parents 3f26760 + b3ec51b commit 0ac3fdf
Showing 378 changed files with 11,217 additions and 4,691 deletions.
1 change: 1 addition & 0 deletions .rat-excludes
@@ -25,6 +25,7 @@ log4j-defaults.properties
bootstrap-tooltip.js
jquery-1.11.1.min.js
sorttable.js
.*avsc
.*txt
.*json
.*data
32 changes: 0 additions & 32 deletions .travis.yml

This file was deleted.

5 changes: 5 additions & 0 deletions README.md
@@ -115,6 +115,11 @@ If your project is built with Maven, add this to your POM file's `<dependencies>
</dependency>


## A Note About Thrift JDBC server and CLI for Spark SQL

Spark SQL supports a Thrift JDBC server and a CLI.
See sql-programming-guide.md for more information about using the JDBC server.

## Configuration

Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html)
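For readers trying out the JDBC server mentioned in the new README note, a minimal local run looks roughly like the following, assuming the `sbin/start-thriftserver.sh` and `bin/beeline` launchers bundled with this release (the port, options, and query are illustrative):

```bash
# Start the Thrift JDBC server locally (assumed launcher; it accepts
# spark-submit style options in this release).
./sbin/start-thriftserver.sh --master local[2]

# Connect with the bundled Beeline client on the default HiveServer2 port
# and run a query.
./bin/beeline -u jdbc:hive2://localhost:10000 -e "SHOW TABLES;"
```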
5 changes: 0 additions & 5 deletions assembly/pom.xml
@@ -163,11 +163,6 @@
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>hive-thriftserver</id>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive-thriftserver_${scala.binary.version}</artifactId>
18 changes: 14 additions & 4 deletions bin/pyspark
@@ -23,12 +23,18 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

source $FWDIR/bin/utils.sh

SCALA_VERSION=2.10

if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
function usage() {
echo "Usage: ./bin/pyspark [options]" 1>&2
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
}

if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
usage
fi

# Exit if the user hasn't compiled Spark
@@ -66,10 +72,11 @@ fi
# Build up arguments list manually to preserve quotes and backslashes.
# We export Spark submit arguments as an environment variable because shell.py must run as a
# PYTHONSTARTUP script, which does not take in arguments. This is required for IPython notebooks.

SUBMIT_USAGE_FUNCTION=usage
gatherSparkSubmitOpts "$@"
PYSPARK_SUBMIT_ARGS=""
whitespace="[[:space:]]"
for i in "$@"; do
for i in "${SUBMISSION_OPTS[@]}"; do
if [[ $i =~ \" ]]; then i=$(echo $i | sed 's/\"/\\\"/g'); fi
if [[ $i =~ $whitespace ]]; then i=\"$i\"; fi
PYSPARK_SUBMIT_ARGS="$PYSPARK_SUBMIT_ARGS $i"
@@ -90,7 +97,10 @@ fi
if [[ "$1" =~ \.py$ ]]; then
echo -e "\nWARNING: Running python applications through ./bin/pyspark is deprecated as of Spark 1.0." 1>&2
echo -e "Use ./bin/spark-submit <python file>\n" 1>&2
exec $FWDIR/bin/spark-submit "$@"
primary=$1
shift
gatherSparkSubmitOpts "$@"
exec $FWDIR/bin/spark-submit "${SUBMISSION_OPTS[@]}" $primary "${APPLICATION_OPTS[@]}"
else
# Only use ipython if no command line arguments were provided [SPARK-1134]
if [[ "$IPYTHON" = "1" ]]; then
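The pyspark changes above lean on `gatherSparkSubmitOpts` from the newly sourced `bin/utils.sh`, which splits the command line into `SUBMISSION_OPTS` (flags meant for spark-submit) and `APPLICATION_OPTS` (everything handed to the application). A simplified sketch of that splitting, with an illustrative rather than exhaustive option list:

```bash
# Simplified sketch of gatherSparkSubmitOpts (assumed behavior of bin/utils.sh).
function gatherSparkSubmitOpts() {
  SUBMISSION_OPTS=()
  APPLICATION_OPTS=()
  while (($#)); do
    case "$1" in
      --master | --deploy-mode | --class | --name | --jars | --py-files | \
      --conf | --properties-file | --driver-memory | --driver-java-options | \
      --driver-library-path | --driver-class-path | --executor-memory | \
      --queue | --num-executors | --executor-cores | --total-executor-cores)
        # spark-submit options that take a value: keep the flag and its value together.
        if [[ $# -lt 2 ]]; then
          "$SUBMIT_USAGE_FUNCTION"   # caller-provided usage function, e.g. usage()
          exit 1
        fi
        SUBMISSION_OPTS+=("$1"); shift
        SUBMISSION_OPTS+=("$1"); shift
        ;;
      --verbose | -v | --supervise)
        SUBMISSION_OPTS+=("$1"); shift
        ;;
      *)
        # Anything else is treated as an application argument.
        APPLICATION_OPTS+=("$1"); shift
        ;;
    esac
  done
  export SUBMISSION_OPTS
  export APPLICATION_OPTS
}
```

Keeping the two groups in Bash arrays and later expanding them with `"${SUBMISSION_OPTS[@]}"` preserves quoting, which is why the loops above now iterate over the arrays instead of `"$@"`.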
49 changes: 38 additions & 11 deletions bin/spark-class
@@ -17,6 +17,8 @@
# limitations under the License.
#

# NOTE: Any changes to this file must be reflected in SparkSubmitDriverBootstrapper.scala!

cygwin=false
case "`uname`" in
CYGWIN*) cygwin=true;;
@@ -39,7 +41,7 @@ fi

if [ -n "$SPARK_MEM" ]; then
echo -e "Warning: SPARK_MEM is deprecated, please use a more specific config option" 1>&2
echo -e "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)." 1>&2
echo -e "(e.g., spark.executor.memory or spark.driver.memory)." 1>&2
fi

# Use SPARK_MEM or 512m as the default memory, to be overridden by specific options
@@ -73,11 +75,17 @@ case "$1" in
OUR_JAVA_MEM=${SPARK_EXECUTOR_MEMORY:-$DEFAULT_MEM}
;;

# Spark submit uses SPARK_SUBMIT_OPTS and SPARK_JAVA_OPTS
'org.apache.spark.deploy.SparkSubmit')
OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_SUBMIT_OPTS \
-Djava.library.path=$SPARK_SUBMIT_LIBRARY_PATH"
# Spark submit uses SPARK_JAVA_OPTS + SPARK_SUBMIT_OPTS +
# SPARK_DRIVER_MEMORY + SPARK_SUBMIT_DRIVER_MEMORY.
'org.apache.spark.deploy.SparkSubmit')
OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_SUBMIT_OPTS"
OUR_JAVA_MEM=${SPARK_DRIVER_MEMORY:-$DEFAULT_MEM}
if [ -n "$SPARK_SUBMIT_LIBRARY_PATH" ]; then
OUR_JAVA_OPTS="$OUR_JAVA_OPTS -Djava.library.path=$SPARK_SUBMIT_LIBRARY_PATH"
fi
if [ -n "$SPARK_SUBMIT_DRIVER_MEMORY" ]; then
OUR_JAVA_MEM="$SPARK_SUBMIT_DRIVER_MEMORY"
fi
;;

*)
@@ -101,11 +109,12 @@ fi
# Set JAVA_OPTS to be able to load native libraries and to set heap size
JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"

# Load extra JAVA_OPTS from conf/java-opts, if it exists
if [ -e "$FWDIR/conf/java-opts" ] ; then
JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
fi
export JAVA_OPTS

# Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!

TOOLS_DIR="$FWDIR"/tools
@@ -146,10 +155,28 @@ if $cygwin; then
fi
export CLASSPATH

if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
echo -n "Spark Command: " 1>&2
echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
echo -e "========================================\n" 1>&2
# In Spark submit client mode, the driver is launched in the same JVM as Spark submit itself.
# Here we must parse the properties file for relevant "spark.driver.*" configs before launching
# the driver JVM itself. Instead of handling this complexity in Bash, we launch a separate JVM
# to prepare the launch environment of this driver JVM.

if [ -n "$SPARK_SUBMIT_BOOTSTRAP_DRIVER" ]; then
# This is used only if the properties file actually contains these special configs
# Export the environment variables needed by SparkSubmitDriverBootstrapper
export RUNNER
export CLASSPATH
export JAVA_OPTS
export OUR_JAVA_MEM
export SPARK_CLASS=1
shift # Ignore main class (org.apache.spark.deploy.SparkSubmit) and use our own
exec "$RUNNER" org.apache.spark.deploy.SparkSubmitDriverBootstrapper "$@"
else
# Note: The format of this command is closely echoed in SparkSubmitDriverBootstrapper.scala
if [ -n "$SPARK_PRINT_LAUNCH_COMMAND" ]; then
echo -n "Spark Command: " 1>&2
echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
echo -e "========================================\n" 1>&2
fi
exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
fi

exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
20 changes: 14 additions & 6 deletions bin/spark-shell
@@ -31,13 +31,21 @@ set -o posix
## Global script variables
FWDIR="$(cd `dirname $0`/..; pwd)"

function usage() {
echo "Usage: ./bin/spark-shell [options]"
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
}

if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
echo "Usage: ./bin/spark-shell [options]"
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
usage
fi

function main(){
source $FWDIR/bin/utils.sh
SUBMIT_USAGE_FUNCTION=usage
gatherSparkSubmitOpts "$@"

function main() {
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
@@ -46,11 +54,11 @@ function main(){
# (see https://github.com/sbt/sbt/issues/562).
stty -icanon min 1 -echo > /dev/null 2>&1
export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main spark-shell "$@"
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
stty icanon echo > /dev/null 2>&1
else
export SPARK_SUBMIT_OPTS
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main spark-shell "$@"
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
fi
}

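With the split above, spark-submit flags are placed before the `spark-shell` primary resource and any remaining arguments after it. An illustrative expansion, assuming all supplied flags are spark-submit options:

```bash
# An invocation such as
./bin/spark-shell --master yarn-client --driver-memory 2g

# now ends up roughly as
./bin/spark-submit --class org.apache.spark.repl.Main \
  --master yarn-client --driver-memory 2g spark-shell
```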
2 changes: 1 addition & 1 deletion bin/spark-shell.cmd
@@ -19,4 +19,4 @@ rem

set SPARK_HOME=%~dp0..

cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd spark-shell --class org.apache.spark.repl.Main %*
cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd --class org.apache.spark.repl.Main %* spark-shell
20 changes: 10 additions & 10 deletions bin/spark-sql
@@ -29,7 +29,7 @@ CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
FWDIR="$(cd `dirname $0`/..; pwd)"

function usage {
echo "Usage: ./sbin/spark-sql [options] [cli option]"
echo "Usage: ./bin/spark-sql [options] [cli option]"
pattern="usage"
pattern+="\|Spark assembly has been built with Hive"
pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set"
@@ -65,30 +65,30 @@ while (($#)); do
case $1 in
-d | --define | --database | -f | -h | --hiveconf | --hivevar | -i | -p)
ensure_arg_number $# 2
CLI_ARGS+=($1); shift
CLI_ARGS+=($1); shift
CLI_ARGS+=("$1"); shift
CLI_ARGS+=("$1"); shift
;;

-e)
ensure_arg_number $# 2
CLI_ARGS+=($1); shift
CLI_ARGS+=(\"$1\"); shift
CLI_ARGS+=("$1"); shift
CLI_ARGS+=("$1"); shift
;;

-s | --silent)
CLI_ARGS+=($1); shift
CLI_ARGS+=("$1"); shift
;;

-v | --verbose)
# Both SparkSubmit and SparkSQLCLIDriver recognize -v | --verbose
CLI_ARGS+=($1)
SUBMISSION_ARGS+=($1); shift
CLI_ARGS+=("$1")
SUBMISSION_ARGS+=("$1"); shift
;;

*)
SUBMISSION_ARGS+=($1); shift
SUBMISSION_ARGS+=("$1"); shift
;;
esac
done

eval exec "$FWDIR"/bin/spark-submit --class $CLASS ${SUBMISSION_ARGS[*]} spark-internal ${CLI_ARGS[*]}
exec "$FWDIR"/bin/spark-submit --class $CLASS "${SUBMISSION_ARGS[@]}" spark-internal "${CLI_ARGS[@]}"
28 changes: 23 additions & 5 deletions bin/spark-submit
@@ -17,14 +17,18 @@
# limitations under the License.
#

# NOTE: Any changes in this file must be reflected in SparkClassLauncher.scala!

export SPARK_HOME="$(cd `dirname $0`/..; pwd)"
ORIG_ARGS=("$@")

while (($#)); do
if [ "$1" = "--deploy-mode" ]; then
DEPLOY_MODE=$2
SPARK_SUBMIT_DEPLOY_MODE=$2
elif [ "$1" = "--properties-file" ]; then
SPARK_SUBMIT_PROPERTIES_FILE=$2
elif [ "$1" = "--driver-memory" ]; then
DRIVER_MEMORY=$2
export SPARK_SUBMIT_DRIVER_MEMORY=$2
elif [ "$1" = "--driver-library-path" ]; then
export SPARK_SUBMIT_LIBRARY_PATH=$2
elif [ "$1" = "--driver-class-path" ]; then
@@ -35,10 +39,24 @@ while (($#)); do
shift
done

DEPLOY_MODE=${DEPLOY_MODE:-"client"}
DEFAULT_PROPERTIES_FILE="$SPARK_HOME/conf/spark-defaults.conf"
export SPARK_SUBMIT_DEPLOY_MODE=${SPARK_SUBMIT_DEPLOY_MODE:-"client"}
export SPARK_SUBMIT_PROPERTIES_FILE=${SPARK_SUBMIT_PROPERTIES_FILE:-"$DEFAULT_PROPERTIES_FILE"}

# For client mode, the driver will be launched in the same JVM that launches
# SparkSubmit, so we may need to read the properties file for any extra class
# paths, library paths, java options and memory early on. Otherwise, it will
# be too late by the time the driver JVM has started.

if [ -n "$DRIVER_MEMORY" ] && [ $DEPLOY_MODE == "client" ]; then
export SPARK_DRIVER_MEMORY=$DRIVER_MEMORY
if [[ "$SPARK_SUBMIT_DEPLOY_MODE" == "client" && -f "$SPARK_SUBMIT_PROPERTIES_FILE" ]]; then
# Parse the properties file only if the special configs exist
contains_special_configs=$(
grep -e "spark.driver.extra*\|spark.driver.memory" "$SPARK_SUBMIT_PROPERTIES_FILE" | \
grep -v "^[[:space:]]*#"
)
if [ -n "$contains_special_configs" ]; then
export SPARK_SUBMIT_BOOTSTRAP_DRIVER=1
fi
fi

exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}"
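To make the bootstrap trigger above concrete: in client mode, spark-submit only sets `SPARK_SUBMIT_BOOTSTRAP_DRIVER` when the properties file defines `spark.driver.extra*` or `spark.driver.memory` settings. A hypothetical properties file and invocation (the jar path and example class are illustrative):

```bash
# A scratch properties file containing the "special" driver configs.
cat > /tmp/spark-defaults.conf <<'EOF'
spark.driver.memory            2g
spark.driver.extraClassPath    /opt/libs/extra.jar
spark.driver.extraJavaOptions  -XX:+UseCompressedOops
EOF

# Client mode (the default) plus these configs causes spark-submit to export
# SPARK_SUBMIT_BOOTSTRAP_DRIVER=1, so bin/spark-class launches
# SparkSubmitDriverBootstrapper instead of running SparkSubmit directly.
./bin/spark-submit --properties-file /tmp/spark-defaults.conf \
  --class org.apache.spark.examples.SparkPi \
  lib/spark-examples-assembly.jar 10   # jar path illustrative
```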