Merge branch 'master' into pushdown

sigmod committed Jun 1, 2021
2 parents 13e3f92 + 1dd0ca2 commit d95d332
Showing 128 changed files with 1,545 additions and 1,234 deletions.
96 changes: 84 additions & 12 deletions .github/workflows/build_and_test.yml
@@ -152,7 +152,7 @@ jobs:
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20201025
image: dongjoon/apache-spark-github-action-image:20210530
strategy:
fail-fast: false
matrix:
@@ -217,16 +217,6 @@ jobs:
run: |
python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner plotly>=4.8
python3.6 -m pip list
# TODO(SPARK-35507) Move Python 3.9 installation to the docker image
- name: Install Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
architecture: x64
- name: Install Python packages (Python 3.9)
run: |
python3.9 -m pip install numpy 'pyarrow<5.0.0' pandas scipy xmlrunner plotly>=4.8
python3.9 -m pip list
- name: Install Conda for pip packaging test
run: |
curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
@@ -319,7 +309,7 @@ jobs:
LC_ALL: C.UTF-8
LANG: C.UTF-8
container:
image: dongjoon/apache-spark-github-action-image:20201025
image: dongjoon/apache-spark-github-action-image:20210530
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -625,3 +615,85 @@ jobs:
with:
name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
path: "**/target/unit-tests.log"

docker-integration-tests:
name: Run docker integration tests
runs-on: ubuntu-20.04
env:
HADOOP_PROFILE: hadoop3.2
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
ORACLE_DOCKER_IMAGE_NAME: oracle/database:18.4.0-xe
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: master
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
id: sync-branch
run: |
apache_spark_ref=`git rev-parse HEAD`
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
docker-integration-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Cache Oracle docker-images repository
id: cache-oracle-docker-images
uses: actions/cache@v2
with:
path: ./oracle/docker-images
# key should contain the commit hash of the Oracle docker images to be checked out.
key: oracle-docker-images-3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1
- name: Checkout Oracle docker-images repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: oracle/docker-images
ref: 3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1
path: ./oracle/docker-images
- name: Install Oracle Docker image
run: |
cd oracle/docker-images/OracleDatabase/SingleInstance/dockerfiles
./buildContainerImage.sh -v 18.4.0 -x
- name: Run tests
run: |
export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
./dev/run-tests --parallelism 2 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-docker-integration--8-hadoop3.2-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-docker-integration--8-hadoop3.2-hive2.3
path: "**/target/unit-tests.log"
10 changes: 9 additions & 1 deletion .github/workflows/update_build_status.yml
@@ -63,7 +63,15 @@ jobs:
const params = JSON.parse(cr.output.text)
// Get the workflow run in the forked repository
const run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
let run
try {
run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
} catch (error) {
console.error(error)
// Run not found. This can happen when the PR author removes GitHub Actions runs or
// disables GitHub Actions.
continue
}
// Keep syncing the status of the checks
if (run.data.status == 'completed') {
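
The guard added above keeps the status-sync loop going when the forked repository's workflow run has disappeared (for example, the PR author deleted the run or disabled Actions). A hedged Python analog of the same pattern — the actual step runs JavaScript through actions/github-script, and the token handling below is an assumption:

import requests

def get_workflow_run(owner, repo, run_id, token):
    # GET /repos/{owner}/{repo}/actions/runs/{run_id}, returning None instead
    # of raising when the run is gone, so callers can skip it and keep going.
    url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}"
    try:
        resp = requests.get(url, headers={"Authorization": f"token {token}"}, timeout=10)
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as err:
        print(err)
        return None
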
2 changes: 1 addition & 1 deletion R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1451,7 +1451,7 @@ test_that("column functions", {
expect_equal(collect(df2)[[3, 2]], TRUE)

# Test that input_file_name()
actual_names <- sort(collect(distinct(select(df, input_file_name()))))
actual_names <- collect(distinct(select(df, input_file_name())))
expect_equal(length(actual_names), 1)
expect_equal(basename(actual_names[1, 1]), basename(jsonPath))

6 changes: 3 additions & 3 deletions R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -68,7 +68,7 @@ test_that("createDataFrame/collect Arrow optimization - type specification", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})

expect_equal(collect(createDataFrame(rdf)), expected)
expect_true(all(collect(createDataFrame(rdf)) == expected))
})

test_that("dapply() Arrow optimization", {
@@ -140,7 +140,7 @@ test_that("dapply() Arrow optimization - type specification (date and timestamp)
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
ret <- dapply(df, function(rdf) { rdf }, schema(df))
expect_equal(collect(ret), rdf)
expect_true(all(collect(ret) == rdf))
})

test_that("gapply() Arrow optimization", {
@@ -226,7 +226,7 @@ test_that("gapply() Arrow optimization - type specification (date and timestamp)
ret <- gapply(df,
"a",
function(key, grouped) { grouped }, schema(df))
expect_equal(collect(ret), rdf)
expect_true(all(collect(ret) == rdf))
})

test_that("Arrow optimization - unsupported types", {
14 changes: 12 additions & 2 deletions R/pkg/tests/fulltests/test_utils.R
@@ -64,7 +64,12 @@ test_that("cleanClosure on R functions", {
actual <- get("y", envir = env, inherits = FALSE)
expect_equal(actual, y)
actual <- get("g", envir = env, inherits = FALSE)
expect_equal(actual, g)
if (as.numeric(R.Version()$major) >= 4 && !startsWith(R.Version()$minor, "0")) {
# 4.1+ checks environment in the function
expect_true(all.equal(actual, g, check.environment = FALSE))
} else {
expect_equal(actual, g)
}

# Test for nested enclosures and package variables.
env2 <- new.env()
@@ -77,7 +82,12 @@ test_that("cleanClosure on R functions", {
actual <- get("y", envir = env, inherits = FALSE)
expect_equal(actual, y)
actual <- get("g", envir = env, inherits = FALSE)
expect_equal(actual, g)
if (as.numeric(R.Version()$major) >= 4 && !startsWith(R.Version()$minor, "0")) {
# 4.1+ checks environment in the function
expect_true(all.equal(actual, g, check.environment = FALSE))
} else {
expect_equal(actual, g)
}

base <- c(1, 2, 3)
l <- list(field = matrix(1))
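
The version guard above exists because R 4.1+ makes all.equal() compare closure environments, so those branches pass check.environment = FALSE. A Python paraphrase of the predicate, for illustration only:

def is_r_4_1_or_later(major: str, minor: str) -> bool:
    # Mirrors the R check: major version at least 4 and a minor version that
    # does not start with "0" (i.e. 4.1.x and newer, but not 4.0.x).
    return float(major) >= 4 and not minor.startswith("0")

assert is_r_4_1_or_later("4", "1.0")
assert not is_r_4_1_or_later("4", "0.5")
assert not is_r_4_1_or_later("3", "6.3")
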
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -720,7 +720,7 @@ private[spark] class MapOutputTrackerMaster(
}
}

def registerMergeResult(shuffleId: Int, reduceId: Int, status: MergeStatus) {
def registerMergeResult(shuffleId: Int, reduceId: Int, status: MergeStatus): Unit = {
shuffleStatuses(shuffleId).addMergeResult(reduceId, status)
}

@@ -745,7 +745,7 @@
shuffleId: Int,
reduceId: Int,
bmAddress: BlockManagerId,
mapId: Option[Int] = None) {
mapId: Option[Int] = None): Unit = {
shuffleStatuses.get(shuffleId) match {
case Some(shuffleStatus) =>
val mergeStatus = shuffleStatus.mergeStatuses(reduceId)
@@ -220,7 +220,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite
testSimpleSpilling()
}

private def testSimpleSpillingForAllCodecs(encrypt: Boolean) {
private def testSimpleSpillingForAllCodecs(encrypt: Boolean): Unit = {
// Keep track of which compression codec we're using to report in test failure messages
var lastCompressionCodec: Option[String] = None
try {
16 changes: 8 additions & 8 deletions dev/deps/spark-deps-hadoop-2.7-hive-2.3
@@ -106,15 +106,15 @@ httpclient/4.5.13//httpclient-4.5.13.jar
httpcore/4.4.14//httpcore-4.4.14.jar
istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
ivy/2.4.0//ivy-2.4.0.jar
jackson-annotations/2.12.2//jackson-annotations-2.12.2.jar
jackson-annotations/2.12.3//jackson-annotations-2.12.3.jar
jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
jackson-core/2.12.2//jackson-core-2.12.2.jar
jackson-databind/2.12.2//jackson-databind-2.12.2.jar
jackson-dataformat-yaml/2.12.2//jackson-dataformat-yaml-2.12.2.jar
jackson-core/2.12.3//jackson-core-2.12.3.jar
jackson-databind/2.12.3//jackson-databind-2.12.3.jar
jackson-dataformat-yaml/2.12.3//jackson-dataformat-yaml-2.12.3.jar
jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar
jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar
jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.12/2.12.2//jackson-module-scala_2.12-2.12.2.jar
jackson-module-scala_2.12/2.12.3//jackson-module-scala_2.12-2.12.3.jar
jackson-xc/1.9.13//jackson-xc-1.9.13.jar
jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar
jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar
@@ -212,10 +212,10 @@ protobuf-java/2.5.0//protobuf-java-2.5.0.jar
py4j/0.10.9.2//py4j-0.10.9.2.jar
pyrolite/4.30//pyrolite-4.30.jar
scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar
scala-compiler/2.12.10//scala-compiler-2.12.10.jar
scala-library/2.12.10//scala-library-2.12.10.jar
scala-compiler/2.12.14//scala-compiler-2.12.14.jar
scala-library/2.12.14//scala-library-2.12.14.jar
scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar
scala-reflect/2.12.10//scala-reflect-2.12.10.jar
scala-reflect/2.12.14//scala-reflect-2.12.14.jar
scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar
shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar
shims/0.9.0//shims-0.9.0.jar
16 changes: 8 additions & 8 deletions dev/deps/spark-deps-hadoop-3.2-hive-2.3
@@ -84,14 +84,14 @@ httpclient/4.5.13//httpclient-4.5.13.jar
httpcore/4.4.14//httpcore-4.4.14.jar
istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
ivy/2.4.0//ivy-2.4.0.jar
jackson-annotations/2.12.2//jackson-annotations-2.12.2.jar
jackson-annotations/2.12.3//jackson-annotations-2.12.3.jar
jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
jackson-core/2.12.2//jackson-core-2.12.2.jar
jackson-databind/2.12.2//jackson-databind-2.12.2.jar
jackson-dataformat-yaml/2.12.2//jackson-dataformat-yaml-2.12.2.jar
jackson-core/2.12.3//jackson-core-2.12.3.jar
jackson-databind/2.12.3//jackson-databind-2.12.3.jar
jackson-dataformat-yaml/2.12.3//jackson-dataformat-yaml-2.12.3.jar
jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar
jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.12/2.12.2//jackson-module-scala_2.12-2.12.2.jar
jackson-module-scala_2.12/2.12.3//jackson-module-scala_2.12-2.12.3.jar
jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar
jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar
jakarta.servlet-api/4.0.3//jakarta.servlet-api-4.0.3.jar
@@ -183,10 +183,10 @@ protobuf-java/2.5.0//protobuf-java-2.5.0.jar
py4j/0.10.9.2//py4j-0.10.9.2.jar
pyrolite/4.30//pyrolite-4.30.jar
scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar
scala-compiler/2.12.10//scala-compiler-2.12.10.jar
scala-library/2.12.10//scala-library-2.12.10.jar
scala-compiler/2.12.14//scala-compiler-2.12.14.jar
scala-library/2.12.14//scala-library-2.12.14.jar
scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar
scala-reflect/2.12.10//scala-reflect-2.12.10.jar
scala-reflect/2.12.14//scala-reflect-2.12.14.jar
scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar
shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar
shims/0.9.0//shims-0.9.0.jar
16 changes: 9 additions & 7 deletions dev/run-tests.py
@@ -122,19 +122,21 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
['graphx', 'examples']
>>> [x.name for x in determine_modules_to_test([modules.sql])]
... # doctest: +NORMALIZE_WHITESPACE
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-ml']
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples',
'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr',
'pyspark-mllib', 'pyspark-pandas', 'pyspark-ml']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sparkr, modules.sql], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml',
'pyspark-mllib', 'pyspark-pandas', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib',
'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-sql',
'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sql, modules.core], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver',
'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive',
'hive-thriftserver', 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
'pyspark-pandas', 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
'streaming-kinesis-asl']
"""
15 changes: 15 additions & 0 deletions dev/sparktestsupport/modules.py
@@ -17,6 +17,7 @@

from functools import total_ordering
import itertools
import os
import re

all_modules = []
@@ -745,6 +746,20 @@ def __hash__(self):
]
)

docker_integration_tests = Module(
name="docker-integration-tests",
dependencies=[],
build_profile_flags=["-Pdocker-integration-tests"],
source_file_regexes=["external/docker-integration-tests"],
sbt_test_goals=["docker-integration-tests/test"],
environ=None if "GITHUB_ACTIONS" not in os.environ else {
"ENABLE_DOCKER_INTEGRATION_TESTS": "1"
},
test_tags=[
"org.apache.spark.tags.DockerTest"
]
)
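
The new module ties the Docker tests to a build profile, a source directory, an SBT goal, and a JUnit tag, and only sets ENABLE_DOCKER_INTEGRATION_TESTS when running under GitHub Actions. A hypothetical sketch of how a module like this can be matched against changed files — not the actual dev/run-tests logic:

import re

def module_is_affected(module, changed_files):
    # A module is considered affected when any changed file path matches one
    # of its source regexes (e.g. "external/docker-integration-tests").
    return any(
        re.match(regex, path)
        for regex in module.source_file_regexes
        for path in changed_files
    )

# e.g. module_is_affected(docker_integration_tests,
#          ["external/docker-integration-tests/src/test/scala/Foo.scala"])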

# The root module is a dummy module which is used to run all of the tests.
# No other modules should directly depend on this module.
root = Module(
2 changes: 1 addition & 1 deletion docs/_config.yml
@@ -22,7 +22,7 @@ include:
SPARK_VERSION: 3.2.0-SNAPSHOT
SPARK_VERSION_SHORT: 3.2.0
SCALA_BINARY_VERSION: "2.12"
SCALA_VERSION: "2.12.10"
SCALA_VERSION: "2.12.14"
MESOS_VERSION: 1.0.0
SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK
SPARK_GITHUB_URL: https://github.com/apache/spark
6 changes: 3 additions & 3 deletions docs/cluster-overview.md
@@ -28,7 +28,7 @@ Spark applications run as independent sets of processes on a cluster, coordinate
object in your main program (called the _driver program_).

Specifically, to run on a cluster, the SparkContext can connect to several types of _cluster managers_
(either Spark's own standalone cluster manager, Mesos or YARN), which allocate resources across
(either Spark's own standalone cluster manager, Mesos, YARN or Kubernetes), which allocate resources across
applications. Once connected, Spark acquires *executors* on nodes in the cluster, which are
processes that run computations and store data for your application.
Next, it sends your application code (defined by JAR or Python files passed to SparkContext) to
@@ -48,7 +48,7 @@ There are several useful things to note about this architecture:
writing it to an external storage system.
2. Spark is agnostic to the underlying cluster manager. As long as it can acquire executor
processes, and these communicate with each other, it is relatively easy to run it even on a
cluster manager that also supports other applications (e.g. Mesos/YARN).
cluster manager that also supports other applications (e.g. Mesos/YARN/Kubernetes).
3. The driver program must listen for and accept incoming connections from its executors throughout
its lifetime (e.g., see [spark.driver.port in the network config
section](configuration.html#networking)). As such, the driver program must be network
@@ -117,7 +117,7 @@ The following table summarizes terms you'll see used to refer to cluster concept
</tr>
<tr>
<td>Cluster manager</td>
<td>An external service for acquiring resources on the cluster (e.g. standalone manager, Mesos, YARN)</td>
<td>An external service for acquiring resources on the cluster (e.g. standalone manager, Mesos, YARN, Kubernetes)</td>
</tr>
<tr>
<td>Deploy mode</td>
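
For concreteness, a minimal PySpark snippet showing the driver side of the picture described above — it assumes a reachable cluster manager (the master URL below is a placeholder) and is not part of this commit:

from pyspark.sql import SparkSession

# The master URL selects the cluster manager: a standalone master
# ("spark://host:7077"), "yarn", a Kubernetes API server ("k8s://https://..."),
# or "local[*]" for a single-machine test run.
spark = (
    SparkSession.builder
    .master("spark://host:7077")
    .appName("cluster-overview-example")
    .getOrCreate()
)
print(spark.range(10).count())
spark.stop()
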
