Merge branch 'master' into pushdown

sigmod committed Jun 1, 2021
2 parents 13e3f92 + 1dd0ca2 commit d95d332
Showing 128 changed files with 1,545 additions and 1,234 deletions.
96 changes: 84 additions & 12 deletions .github/workflows/build_and_test.yml
@@ -152,7 +152,7 @@ jobs:
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20201025
image: dongjoon/apache-spark-github-action-image:20210530
strategy:
fail-fast: false
matrix:
@@ -217,16 +217,6 @@ jobs:
run: |
python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner plotly>=4.8
python3.6 -m pip list
# TODO(SPARK-35507) Move Python 3.9 installation to the docker image
- name: Install Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
architecture: x64
- name: Install Python packages (Python 3.9)
run: |
python3.9 -m pip install numpy 'pyarrow<5.0.0' pandas scipy xmlrunner plotly>=4.8
python3.9 -m pip list
- name: Install Conda for pip packaging test
run: |
curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
@@ -319,7 +309,7 @@ jobs:
LC_ALL: C.UTF-8
LANG: C.UTF-8
container:
image: dongjoon/apache-spark-github-action-image:20201025
image: dongjoon/apache-spark-github-action-image:20210530
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -625,3 +615,85 @@ jobs:
with:
name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
path: "**/target/unit-tests.log"

docker-integration-tests:
name: Run docker integration tests
runs-on: ubuntu-20.04
env:
HADOOP_PROFILE: hadoop3.2
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
ORACLE_DOCKER_IMAGE_NAME: oracle/database:18.4.0-xe
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: master
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
id: sync-branch
run: |
apache_spark_ref=`git rev-parse HEAD`
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
docker-integration-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Cache Oracle docker-images repository
id: cache-oracle-docker-images
uses: actions/cache@v2
with:
path: ./oracle/docker-images
# key should contain the commit hash of the Oracle docker images to be checked out.
key: oracle-docker-images-3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1
- name: Checkout Oracle docker-images repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: oracle/docker-images
ref: 3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1
path: ./oracle/docker-images
- name: Install Oracle Docker image
run: |
cd oracle/docker-images/OracleDatabase/SingleInstance/dockerfiles
./buildContainerImage.sh -v 18.4.0 -x
- name: Run tests
run: |
export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
./dev/run-tests --parallelism 2 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-docker-integration--8-hadoop3.2-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-docker-integration--8-hadoop3.2-hive2.3
path: "**/target/unit-tests.log"
10 changes: 9 additions & 1 deletion .github/workflows/update_build_status.yml
@@ -63,7 +63,15 @@ jobs:
const params = JSON.parse(cr.output.text)
// Get the workflow run in the forked repository
const run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
let run
try {
run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
} catch (error) {
console.error(error)
// Run not found. This can happen when the PR author removes GitHub Actions runs or
// disables GitHub Actions.
continue
}
// Keep syncing the status of the checks
if (run.data.status == 'completed') {
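
The guard added above keeps the status-sync loop going when the forked repository's workflow run has disappeared (for example, the PR author deleted the run or disabled Actions). A hedged Python analog of the same pattern — the actual step runs JavaScript through actions/github-script, and the token handling below is an assumption:

import requests

def get_workflow_run(owner, repo, run_id, token):
    # GET /repos/{owner}/{repo}/actions/runs/{run_id}, returning None instead
    # of raising when the run is gone, so callers can skip it and keep going.
    url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}"
    try:
        resp = requests.get(url, headers={"Authorization": f"token {token}"}, timeout=10)
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as err:
        print(err)
        return None
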
2 changes: 1 addition & 1 deletion R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1451,7 +1451,7 @@ test_that("column functions", {
expect_equal(collect(df2)[[3, 2]], TRUE)

# Test that input_file_name()
actual_names <- sort(collect(distinct(select(df, input_file_name()))))
actual_names <- collect(distinct(select(df, input_file_name())))
expect_equal(length(actual_names), 1)
expect_equal(basename(actual_names[1, 1]), basename(jsonPath))

6 changes: 3 additions & 3 deletions R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -68,7 +68,7 @@ test_that("createDataFrame/collect Arrow optimization - type specification", {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})

expect_equal(collect(createDataFrame(rdf)), expected)
expect_true(all(collect(createDataFrame(rdf)) == expected))
})

test_that("dapply() Arrow optimization", {
@@ -140,7 +140,7 @@ test_that("dapply() Arrow optimization - type specification (date and timestamp)
b = as.POSIXct("1990-02-24 12:34:56"))))
df <- createDataFrame(rdf)
ret <- dapply(df, function(rdf) { rdf }, schema(df))
expect_equal(collect(ret), rdf)
expect_true(all(collect(ret) == rdf))
})

test_that("gapply() Arrow optimization", {
@@ -226,7 +226,7 @@ test_that("gapply() Arrow optimization - type specification (date and timestamp)
ret <- gapply(df,
"a",
function(key, grouped) { grouped }, schema(df))
expect_equal(collect(ret), rdf)
expect_true(all(collect(ret) == rdf))
})

test_that("Arrow optimization - unsupported types", {
14 changes: 12 additions & 2 deletions R/pkg/tests/fulltests/test_utils.R
@@ -64,7 +64,12 @@ test_that("cleanClosure on R functions", {
actual <- get("y", envir = env, inherits = FALSE)
expect_equal(actual, y)
actual <- get("g", envir = env, inherits = FALSE)
expect_equal(actual, g)
if (as.numeric(R.Version()$major) >= 4 && !startsWith(R.Version()$minor, "0")) {
# 4.1+ checks environment in the function
expect_true(all.equal(actual, g, check.environment = FALSE))
} else {
expect_equal(actual, g)
}

# Test for nested enclosures and package variables.
env2 <- new.env()
@@ -77,7 +82,12 @@ test_that("cleanClosure on R functions", {
actual <- get("y", envir = env, inherits = FALSE)
expect_equal(actual, y)
actual <- get("g", envir = env, inherits = FALSE)
expect_equal(actual, g)
if (as.numeric(R.Version()$major) >= 4 && !startsWith(R.Version()$minor, "0")) {
# 4.1+ checks environment in the function
expect_true(all.equal(actual, g, check.environment = FALSE))
} else {
expect_equal(actual, g)
}

base <- c(1, 2, 3)
l <- list(field = matrix(1))
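
The version guard above exists because R 4.1+ makes all.equal() compare closure environments, so those branches pass check.environment = FALSE. A Python paraphrase of the predicate, for illustration only:

def is_r_4_1_or_later(major: str, minor: str) -> bool:
    # Mirrors the R check: major version at least 4 and a minor version that
    # does not start with "0" (i.e. 4.1.x and newer, but not 4.0.x).
    return float(major) >= 4 and not minor.startswith("0")

assert is_r_4_1_or_later("4", "1.0")
assert not is_r_4_1_or_later("4", "0.5")
assert not is_r_4_1_or_later("3", "6.3")
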
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -720,7 +720,7 @@ private[spark] class MapOutputTrackerMaster(
}
}

def registerMergeResult(shuffleId: Int, reduceId: Int, status: MergeStatus) {
def registerMergeResult(shuffleId: Int, reduceId: Int, status: MergeStatus): Unit = {
shuffleStatuses(shuffleId).addMergeResult(reduceId, status)
}

@@ -745,7 +745,7 @@
shuffleId: Int,
reduceId: Int,
bmAddress: BlockManagerId,
mapId: Option[Int] = None) {
mapId: Option[Int] = None): Unit = {
shuffleStatuses.get(shuffleId) match {
case Some(shuffleStatus) =>
val mergeStatus = shuffleStatus.mergeStatuses(reduceId)
@@ -220,7 +220,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite
testSimpleSpilling()
}

private def testSimpleSpillingForAllCodecs(encrypt: Boolean) {
private def testSimpleSpillingForAllCodecs(encrypt: Boolean): Unit = {
// Keep track of which compression codec we're using to report in test failure messages
var lastCompressionCodec: Option[String] = None
try {
16 changes: 8 additions & 8 deletions dev/deps/spark-deps-hadoop-2.7-hive-2.3
@@ -106,15 +106,15 @@ httpclient/4.5.13//httpclient-4.5.13.jar
httpcore/4.4.14//httpcore-4.4.14.jar
istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
ivy/2.4.0//ivy-2.4.0.jar
jackson-annotations/2.12.2//jackson-annotations-2.12.2.jar
jackson-annotations/2.12.3//jackson-annotations-2.12.3.jar
jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
jackson-core/2.12.2//jackson-core-2.12.2.jar
jackson-databind/2.12.2//jackson-databind-2.12.2.jar
jackson-dataformat-yaml/2.12.2//jackson-dataformat-yaml-2.12.2.jar
jackson-core/2.12.3//jackson-core-2.12.3.jar
jackson-databind/2.12.3//jackson-databind-2.12.3.jar
jackson-dataformat-yaml/2.12.3//jackson-dataformat-yaml-2.12.3.jar
jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar
jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar
jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.12/2.12.2//jackson-module-scala_2.12-2.12.2.jar
jackson-module-scala_2.12/2.12.3//jackson-module-scala_2.12-2.12.3.jar
jackson-xc/1.9.13//jackson-xc-1.9.13.jar
jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar
jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar
@@ -212,10 +212,10 @@ protobuf-java/2.5.0//protobuf-java-2.5.0.jar
py4j/0.10.9.2//py4j-0.10.9.2.jar
pyrolite/4.30//pyrolite-4.30.jar
scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar
scala-compiler/2.12.10//scala-compiler-2.12.10.jar
scala-library/2.12.10//scala-library-2.12.10.jar
scala-compiler/2.12.14//scala-compiler-2.12.14.jar
scala-library/2.12.14//scala-library-2.12.14.jar
scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar
scala-reflect/2.12.10//scala-reflect-2.12.10.jar
scala-reflect/2.12.14//scala-reflect-2.12.14.jar
scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar
shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar
shims/0.9.0//shims-0.9.0.jar
16 changes: 8 additions & 8 deletions dev/deps/spark-deps-hadoop-3.2-hive-2.3
@@ -84,14 +84,14 @@ httpclient/4.5.13//httpclient-4.5.13.jar
httpcore/4.4.14//httpcore-4.4.14.jar
istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
ivy/2.4.0//ivy-2.4.0.jar
jackson-annotations/2.12.2//jackson-annotations-2.12.2.jar
jackson-annotations/2.12.3//jackson-annotations-2.12.3.jar
jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar
jackson-core/2.12.2//jackson-core-2.12.2.jar
jackson-databind/2.12.2//jackson-databind-2.12.2.jar
jackson-dataformat-yaml/2.12.2//jackson-dataformat-yaml-2.12.2.jar
jackson-core/2.12.3//jackson-core-2.12.3.jar
jackson-databind/2.12.3//jackson-databind-2.12.3.jar
jackson-dataformat-yaml/2.12.3//jackson-dataformat-yaml-2.12.3.jar
jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar
jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.12/2.12.2//jackson-module-scala_2.12-2.12.2.jar
jackson-module-scala_2.12/2.12.3//jackson-module-scala_2.12-2.12.3.jar
jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar
jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar
jakarta.servlet-api/4.0.3//jakarta.servlet-api-4.0.3.jar
@@ -183,10 +183,10 @@ protobuf-java/2.5.0//protobuf-java-2.5.0.jar
py4j/0.10.9.2//py4j-0.10.9.2.jar
pyrolite/4.30//pyrolite-4.30.jar
scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar
scala-compiler/2.12.10//scala-compiler-2.12.10.jar
scala-library/2.12.10//scala-library-2.12.10.jar
scala-compiler/2.12.14//scala-compiler-2.12.14.jar
scala-library/2.12.14//scala-library-2.12.14.jar
scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar
scala-reflect/2.12.10//scala-reflect-2.12.10.jar
scala-reflect/2.12.14//scala-reflect-2.12.14.jar
scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar
shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar
shims/0.9.0//shims-0.9.0.jar
16 changes: 9 additions & 7 deletions dev/run-tests.py
@@ -122,19 +122,21 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
['graphx', 'examples']
>>> [x.name for x in determine_modules_to_test([modules.sql])]
... # doctest: +NORMALIZE_WHITESPACE
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver',
'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-ml']
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples',
'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr',
'pyspark-mllib', 'pyspark-pandas', 'pyspark-ml']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sparkr, modules.sql], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml',
'pyspark-mllib', 'pyspark-pandas', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib',
'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-sql',
'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
>>> sorted([x.name for x in determine_modules_to_test(
... [modules.sql, modules.core], deduplicated=False)])
... # doctest: +NORMALIZE_WHITESPACE
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver',
'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive',
'hive-thriftserver', 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib',
'pyspark-pandas', 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
'streaming-kinesis-asl']
"""
15 changes: 15 additions & 0 deletions dev/sparktestsupport/modules.py
@@ -17,6 +17,7 @@

from functools import total_ordering
import itertools
import os
import re

all_modules = []
@@ -745,6 +746,20 @@ def __hash__(self):
]
)

docker_integration_tests = Module(
name="docker-integration-tests",
dependencies=[],
build_profile_flags=["-Pdocker-integration-tests"],
source_file_regexes=["external/docker-integration-tests"],
sbt_test_goals=["docker-integration-tests/test"],
environ=None if "GITHUB_ACTIONS" not in os.environ else {
"ENABLE_DOCKER_INTEGRATION_TESTS": "1"
},
test_tags=[
"org.apache.spark.tags.DockerTest"
]
)
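
The new module ties the Docker tests to a build profile, a source directory, an SBT goal, and a JUnit tag, and only sets ENABLE_DOCKER_INTEGRATION_TESTS when running under GitHub Actions. A hypothetical sketch of how a module like this can be matched against changed files — not the actual dev/run-tests logic:

import re

def module_is_affected(module, changed_files):
    # A module is considered affected when any changed file path matches one
    # of its source regexes (e.g. "external/docker-integration-tests").
    return any(
        re.match(regex, path)
        for regex in module.source_file_regexes
        for path in changed_files
    )

# e.g. module_is_affected(docker_integration_tests,
#          ["external/docker-integration-tests/src/test/scala/Foo.scala"])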

# The root module is a dummy module which is used to run all of the tests.
# No other modules should directly depend on this module.
root = Module(
2 changes: 1 addition & 1 deletion docs/_config.yml
@@ -22,7 +22,7 @@ include:
SPARK_VERSION: 3.2.0-SNAPSHOT
SPARK_VERSION_SHORT: 3.2.0
SCALA_BINARY_VERSION: "2.12"
SCALA_VERSION: "2.12.10"
SCALA_VERSION: "2.12.14"
MESOS_VERSION: 1.0.0
SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK
SPARK_GITHUB_URL: https://github.com/apache/spark
6 changes: 3 additions & 3 deletions docs/cluster-overview.md
@@ -28,7 +28,7 @@ Spark applications run as independent sets of processes on a cluster, coordinate
object in your main program (called the _driver program_).

Specifically, to run on a cluster, the SparkContext can connect to several types of _cluster managers_
(either Spark's own standalone cluster manager, Mesos or YARN), which allocate resources across
(either Spark's own standalone cluster manager, Mesos, YARN or Kubernetes), which allocate resources across
applications. Once connected, Spark acquires *executors* on nodes in the cluster, which are
processes that run computations and store data for your application.
Next, it sends your application code (defined by JAR or Python files passed to SparkContext) to
@@ -48,7 +48,7 @@ There are several useful things to note about this architecture:
writing it to an external storage system.
2. Spark is agnostic to the underlying cluster manager. As long as it can acquire executor
processes, and these communicate with each other, it is relatively easy to run it even on a
cluster manager that also supports other applications (e.g. Mesos/YARN).
cluster manager that also supports other applications (e.g. Mesos/YARN/Kubernetes).
3. The driver program must listen for and accept incoming connections from its executors throughout
its lifetime (e.g., see [spark.driver.port in the network config
section](configuration.html#networking)). As such, the driver program must be network
@@ -117,7 +117,7 @@ The following table summarizes terms you'll see used to refer to cluster concept
</tr>
<tr>
<td>Cluster manager</td>
<td>An external service for acquiring resources on the cluster (e.g. standalone manager, Mesos, YARN)</td>
<td>An external service for acquiring resources on the cluster (e.g. standalone manager, Mesos, YARN, Kubernetes)</td>
</tr>
<tr>
<td>Deploy mode</td>
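
For concreteness, a minimal PySpark snippet showing the driver side of the picture described above — it assumes a reachable cluster manager (the master URL below is a placeholder) and is not part of this commit:

from pyspark.sql import SparkSession

# The master URL selects the cluster manager: a standalone master
# ("spark://host:7077"), "yarn", a Kubernetes API server ("k8s://https://..."),
# or "local[*]" for a single-machine test run.
spark = (
    SparkSession.builder
    .master("spark://host:7077")
    .appName("cluster-overview-example")
    .getOrCreate()
)
print(spark.range(10).count())
spark.stop()
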
