Merge branch 'master' into reorder_keys

apache · Aug 2, 2020 · commit 268326b
2 parents: 8308649 + 71aea02

Showing 711 changed files with 18,988 additions and 7,385 deletions.
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
@@ -154,15 +154,18 @@ jobs:
         python3.8 -m pip install numpy pyarrow pandas scipy
         python3.8 -m pip list
     # SparkR
-    - name: Install R 3.6
-      uses: r-lib/actions/setup-r@v1
+    - name: Install R 4.0
       if: contains(matrix.modules, 'sparkr')
-      with:
-        r-version: 3.6
+      run: |
+        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+        sudo apt-get update
+        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
     - name: Install R packages
       if: contains(matrix.modules, 'sparkr')
       run: |
-        sudo apt-get install -y libcurl4-openssl-dev
+        # qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
+        sudo apt-get install -y libcurl4-openssl-dev qpdf
         sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
         # Show installed packages in R.
         sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
@@ -200,11 +203,15 @@ jobs:
         architecture: x64
     - name: Install Python linter dependencies
       run: |
-        pip3 install flake8 sphinx numpy
-    - name: Install R 3.6
-      uses: r-lib/actions/setup-r@v1
-      with:
-        r-version: 3.6
+        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+        #   See also https://github.com/sphinx-doc/sphinx/issues/7551.
+        pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
+    - name: Install R 4.0
+      run: |
+        sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+        curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+        sudo apt-get update
+        sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
     - name: Install R linter dependencies and SparkR
       run: |
         sudo apt-get install -y libcurl4-openssl-dev
@@ -218,7 +225,9 @@ jobs:
     - name: Install dependencies for documentation generation
       run: |
         sudo apt-get install -y libcurl4-openssl-dev pandoc
-        pip install sphinx mkdocs numpy
+        # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+        #   See also https://github.com/sphinx-doc/sphinx/issues/7551.
+        pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
         gem install jekyll jekyll-redirect-from rouge
         sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
     - name: Scala linter
@@ -237,3 +246,28 @@ jobs:
       run: |
         cd docs
         jekyll build
+
+  java11:
+    name: Java 11 build
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+    - name: Cache Maven local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.m2/repository
+        key: java11-maven-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          java11-maven-
+    - name: Install Java 11
+      uses: actions/setup-java@v1
+      with:
+        java-version: 11
+    - name: Build with Maven
+      run: |
+        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+        export MAVEN_CLI_OPTS="--no-transfer-progress"
+        mkdir -p ~/.m2
+        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
+        rm -rf ~/.m2/repository/org/apache/spark
diff --git a/.gitignore b/.gitignore
@@ -64,6 +64,7 @@ python/lib/pyspark.zip
 python/.eggs/
 python/deps
 python/docs/_site/
+python/docs/source/reference/api/
 python/test_coverage/coverage_data
 python/test_coverage/htmlcov
 python/pyspark/python

diff --git a/LICENSE b/LICENSE
@@ -222,14 +222,13 @@ external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaRe
 Python Software Foundation License
 ----------------------------------
 
-pyspark/heapq3.py
-python/docs/_static/copybutton.js
+python/docs/source/_static/copybutton.js
 
 BSD 3-Clause
 ------------
 
 python/lib/py4j-*-src.zip
-python/pyspark/cloudpickle.py
+python/pyspark/cloudpickle/*.py
 python/pyspark/join.py
 core/src/main/resources/org/apache/spark/ui/static/d3.min.js
 

diff --git a/LICENSE-binary b/LICENSE-binary
@@ -557,12 +557,6 @@ jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api
 org.glassfish.hk2.external:jakarta.inject
 
 
-Python Software Foundation License
-----------------------------------
-
-pyspark/heapq3.py
-
-
 Public Domain
 -------------
 

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
@@ -23,7 +23,7 @@ Suggests:
     testthat,
     e1071,
     survival,
-    arrow (>= 0.15.1)
+    arrow (>= 1.0.0)
 Collate:
     'schema.R'
     'generics.R'

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -1233,7 +1233,7 @@ setMethod("collect",
                   port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout)
                 output <- tryCatch({
                   doServerAuth(conn, authSecret)
-                  arrowTable <- arrow::read_arrow(readRaw(conn))
+                  arrowTable <- arrow::read_ipc_stream(readRaw(conn))
                   # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
                   if (exists("as_tibble", envir = asNamespace("arrow"))) {
                     as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)

diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R
@@ -139,7 +139,7 @@ test_that("utility function can be called", {
   expect_true(TRUE)
 })
 
-test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
+test_that("getClientModeSparkSubmitOpts() returns spark-submit args from allowList", {
   e <- new.env()
   e[["spark.driver.memory"]] <- "512m"
   ops <- getClientModeSparkSubmitOpts("sparkrmain", e)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -3921,14 +3921,14 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki
   # before creating a SparkSession with enableHiveSupport = T at the top of this test file
   # (filesBefore). The test here is to compare that (filesBefore) against the list of files before
   # any test is run in run-all.R (sparkRFilesBefore).
-  # sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs,
+  # sparkRAllowedSQLDirs is also defined in run-all.R, and should contain only 2 allowed dirs,
   # here allow the first value, spark-warehouse, in the diff, everything else should be exactly the
   # same as before any test is run.
-  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
+  compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRAllowedSQLDirs[[1]]))
   # third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
   # note: as the note above, after running all tests in this file while enableHiveSupport = T, we
-  # check the list of files again. This time we allow both whitelisted dirs to be in the diff.
-  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
+  # check the list of files again. This time we allow both dirs to be in the diff.
+  compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRAllowedSQLDirs))
 })
 
 unlink(parquetPath)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -312,4 +312,22 @@ test_that("Arrow optimization - unsupported types", {
   })
 })
 
+test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", {
+  skip_if_not_installed("arrow")
+  df <- createDataFrame(list(list(a = 1L, b = "a")))
+
+  conf <- callJMethod(sparkSession, "conf")
+  arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]
+
+  callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
+  tryCatch({
+    expect_error(
+    count(gapply(df, "a", function(key, group) { group }, structType("a int, b int"))),
+    "expected IntegerType, IntegerType, got IntegerType, StringType")
+  },
+  finally = {
+    callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
+  })
+})
+
 sparkR.session.stop()
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
@@ -35,8 +35,8 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
   install.spark(overwrite = TRUE)
 
   sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
-  sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
-  invisible(lapply(sparkRWhitelistSQLDirs,
+  sparkRAllowedSQLDirs <- c("spark-warehouse", "metastore_db")
+  invisible(lapply(sparkRAllowedSQLDirs,
                    function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
   sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 

diff --git a/bin/find-spark-home b/bin/find-spark-home
@@ -33,9 +33,9 @@ elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then
   export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)"
 else
   # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
-  # Default to standard python interpreter unless told otherwise
+  # Default to standard python3 interpreter unless told otherwise
   if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
-     PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
+     PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python3"}"
   fi
   export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")
 fi
diff --git a/bin/find-spark-home.cmd b/bin/find-spark-home.cmd
@@ -20,8 +20,8 @@ rem
 rem Path to Python script finding SPARK_HOME
 set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py
 
-rem Default to standard python interpreter unless told otherwise
-set PYTHON_RUNNER=python
+rem Default to standard python3 interpreter unless told otherwise
+set PYTHON_RUNNER=python3
 rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version
 if not "x%PYSPARK_DRIVER_PYTHON%"=="x" (
   set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON%

diff --git a/bin/load-spark-env.cmd b/bin/load-spark-env.cmd
@@ -21,42 +21,42 @@ rem This script loads spark-env.cmd if it exists, and ensures it is only loaded
 rem spark-env.cmd is loaded from SPARK_CONF_DIR if set, or within the current directory's
 rem conf\ subdirectory.
 
-set SPARK_ENV_CMD=spark-env.cmd
-if [%SPARK_ENV_LOADED%] == [] (
+if not defined SPARK_ENV_LOADED (
   set SPARK_ENV_LOADED=1
 
   if [%SPARK_CONF_DIR%] == [] (
     set SPARK_CONF_DIR=%~dp0..\conf
   )
 
-  set SPARK_ENV_CMD=%SPARK_CONF_DIR%\%SPARK_ENV_CMD%
-  if exist %SPARK_ENV_CMD% (
-    call %SPARK_ENV_CMD%
-  )
+  call :LoadSparkEnv
 )
 
 rem Setting SPARK_SCALA_VERSION if not already set.
 
-rem TODO: revisit for Scala 2.13 support
-set SPARK_SCALA_VERSION=2.12
-rem if [%SPARK_SCALA_VERSION%] == [] (
-rem   set SCALA_VERSION_1=2.12
-rem   set SCALA_VERSION_2=2.11
-rem
-rem   set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
-rem   set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
-rem   set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
-rem   if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
-rem     echo "Presence of build for multiple Scala versions detected (%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%)."
-rem     echo "Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in %SPARK_ENV_CMD%."
-rem     echo "Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd."
-rem     echo "Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd."
-rem     exit 1
-rem   )
-rem   if exist %ASSEMBLY_DIR1% (
-rem     set SPARK_SCALA_VERSION=%SCALA_VERSION_1%
-rem   ) else (
-rem     set SPARK_SCALA_VERSION=%SCALA_VERSION_2%
-rem   )
-rem )
+set SCALA_VERSION_1=2.13
+set SCALA_VERSION_2=2.12
+
+set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
+set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
+set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
+
+if not defined SPARK_SCALA_VERSION (
+  if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
+    echo Presence of build for multiple Scala versions detected ^(%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%^).
+    echo Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in spark-env.cmd.
+    echo Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd.
+    echo Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd.
+    exit 1
+  )
+  if exist %ASSEMBLY_DIR1% (
+    set SPARK_SCALA_VERSION=%SCALA_VERSION_1%
+  ) else (
+    set SPARK_SCALA_VERSION=%SCALA_VERSION_2%
+  )
+)
 exit /b 0
+
+:LoadSparkEnv
+if exist "%SPARK_CONF_DIR%\spark-env.cmd" (
+  call "%SPARK_CONF_DIR%\spark-env.cmd"
+)
diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh
@@ -43,25 +43,23 @@ fi
 
 # Setting SPARK_SCALA_VERSION if not already set.
 
-# TODO: revisit for Scala 2.13 support
-export SPARK_SCALA_VERSION=2.12
-#if [ -z "$SPARK_SCALA_VERSION" ]; then
-#  SCALA_VERSION_1=2.12
-#  SCALA_VERSION_2=2.11
-#
-#  ASSEMBLY_DIR_1="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_1}"
-#  ASSEMBLY_DIR_2="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_2}"
-#  ENV_VARIABLE_DOC="https://spark.apache.org/docs/latest/configuration.html#environment-variables"
-#  if [[ -d "$ASSEMBLY_DIR_1" && -d "$ASSEMBLY_DIR_2" ]]; then
-#    echo "Presence of build for multiple Scala versions detected ($ASSEMBLY_DIR_1 and $ASSEMBLY_DIR_2)." 1>&2
-#    echo "Remove one of them or, export SPARK_SCALA_VERSION=$SCALA_VERSION_1 in ${SPARK_ENV_SH}." 1>&2
-#    echo "Visit ${ENV_VARIABLE_DOC} for more details about setting environment variables in spark-env.sh." 1>&2
-#    exit 1
-#  fi
-#
-#  if [[ -d "$ASSEMBLY_DIR_1" ]]; then
-#    export SPARK_SCALA_VERSION=${SCALA_VERSION_1}
-#  else
-#    export SPARK_SCALA_VERSION=${SCALA_VERSION_2}
-#  fi
-#fi
+if [ -z "$SPARK_SCALA_VERSION" ]; then
+  SCALA_VERSION_1=2.13
+  SCALA_VERSION_2=2.12
+
+  ASSEMBLY_DIR_1="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_1}"
+  ASSEMBLY_DIR_2="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_2}"
+  ENV_VARIABLE_DOC="https://spark.apache.org/docs/latest/configuration.html#environment-variables"
+  if [[ -d "$ASSEMBLY_DIR_1" && -d "$ASSEMBLY_DIR_2" ]]; then
+    echo "Presence of build for multiple Scala versions detected ($ASSEMBLY_DIR_1 and $ASSEMBLY_DIR_2)." 1>&2
+    echo "Remove one of them or, export SPARK_SCALA_VERSION=$SCALA_VERSION_1 in ${SPARK_ENV_SH}." 1>&2
+    echo "Visit ${ENV_VARIABLE_DOC} for more details about setting environment variables in spark-env.sh." 1>&2
+    exit 1
+  fi
+
+  if [[ -d "$ASSEMBLY_DIR_1" ]]; then
+    export SPARK_SCALA_VERSION=${SCALA_VERSION_1}
+  else
+    export SPARK_SCALA_VERSION=${SCALA_VERSION_2}
+  fi
+fi
diff --git a/bin/pyspark b/bin/pyspark
@@ -37,9 +37,9 @@ if [[ -n "$IPYTHON" || -n "$IPYTHON_OPTS" ]]; then
   exit 1
 fi
 
-# Default to standard python interpreter unless told otherwise
+# Default to standard python3 interpreter unless told otherwise
 if [[ -z "$PYSPARK_PYTHON" ]]; then
-  PYSPARK_PYTHON=python
+  PYSPARK_PYTHON=python3
 fi
 if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
   PYSPARK_DRIVER_PYTHON=$PYSPARK_PYTHON