Skip to content

Commit

Permalink
Merge branch 'master' into reorder_keys
Browse files Browse the repository at this point in the history
  • Loading branch information
imback82 committed Aug 2, 2020
2 parents 8308649 + 71aea02 commit 268326b
Show file tree
Hide file tree
Showing 711 changed files with 18,988 additions and 7,385 deletions.
56 changes: 45 additions & 11 deletions .github/workflows/master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,18 @@ jobs:
python3.8 -m pip install numpy pyarrow pandas scipy
python3.8 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
- name: Install R 4.0
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
# qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
sudo apt-get install -y libcurl4-openssl-dev qpdf
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
Expand Down Expand Up @@ -200,11 +203,15 @@ jobs:
architecture: x64
- name: Install Python linter dependencies
run: |
pip3 install flake8 sphinx numpy
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
- name: Install R 4.0
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
Expand All @@ -218,7 +225,9 @@ jobs:
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
pip install sphinx mkdocs numpy
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
Expand All @@ -237,3 +246,28 @@ jobs:
run: |
cd docs
jekyll build
# Build-only job: compiles and installs Spark with Java 11 through ./build/mvn.
# Tests are not run here (-DskipTests).
java11:
name: Java 11 build
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# Reuse ~/.m2/repository between runs. The key changes whenever any pom.xml
# changes; restore-keys then falls back to a cache whose key starts with
# "java11-maven-".
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: java11-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
java11-maven-
- name: Install Java 11
uses: actions/setup-java@v1
with:
java-version: 11
# Runs the Maven `install` goal with the yarn, mesos, kubernetes, hive,
# hive-thriftserver and hadoop-cloud profiles and -Djava.version=11.
# Afterwards the org/apache/spark directory is deleted from the local
# repository (presumably so Spark's own artifacts are not stored in the
# cache saved from ~/.m2/repository - TODO confirm).
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
mkdir -p ~/.m2
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
rm -rf ~/.m2/repository/org/apache/spark
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ python/lib/pyspark.zip
python/.eggs/
python/deps
python/docs/_site/
python/docs/source/reference/api/
python/test_coverage/coverage_data
python/test_coverage/htmlcov
python/pyspark/python
Expand Down
5 changes: 2 additions & 3 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -222,14 +222,13 @@ external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaRe
Python Software Foundation License
----------------------------------

pyspark/heapq3.py
python/docs/_static/copybutton.js
python/docs/source/_static/copybutton.js

BSD 3-Clause
------------

python/lib/py4j-*-src.zip
python/pyspark/cloudpickle.py
python/pyspark/cloudpickle/*.py
python/pyspark/join.py
core/src/main/resources/org/apache/spark/ui/static/d3.min.js

Expand Down
6 changes: 0 additions & 6 deletions LICENSE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -557,12 +557,6 @@ jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api
org.glassfish.hk2.external:jakarta.inject


Python Software Foundation License
----------------------------------

pyspark/heapq3.py


Public Domain
-------------

Expand Down
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Suggests:
testthat,
e1071,
survival,
arrow (>= 0.15.1)
arrow (>= 1.0.0)
Collate:
'schema.R'
'generics.R'
Expand Down
2 changes: 1 addition & 1 deletion R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,7 @@ setMethod("collect",
port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout)
output <- tryCatch({
doServerAuth(conn, authSecret)
arrowTable <- arrow::read_arrow(readRaw(conn))
arrowTable <- arrow::read_ipc_stream(readRaw(conn))
# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
if (exists("as_tibble", envir = asNamespace("arrow"))) {
as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)
Expand Down
2 changes: 1 addition & 1 deletion R/pkg/tests/fulltests/test_context.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ test_that("utility function can be called", {
expect_true(TRUE)
})

test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
test_that("getClientModeSparkSubmitOpts() returns spark-submit args from allowList", {
e <- new.env()
e[["spark.driver.memory"]] <- "512m"
ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
Expand Down
8 changes: 4 additions & 4 deletions R/pkg/tests/fulltests/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -3921,14 +3921,14 @@ test_that("No extra files are created in SPARK_HOME by starting session and maki
# before creating a SparkSession with enableHiveSupport = T at the top of this test file
# (filesBefore). The test here is to compare that (filesBefore) against the list of files before
# any test is run in run-all.R (sparkRFilesBefore).
# sparkRWhitelistSQLDirs is also defined in run-all.R, and should contain only 2 whitelisted dirs,
# sparkRAllowedSQLDirs is also defined in run-all.R, and should contain only 2 allowed dirs,
# here allow the first value, spark-warehouse, in the diff, everything else should be exactly the
# same as before any test is run.
compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRWhitelistSQLDirs[[1]]))
compare_list(sparkRFilesBefore, setdiff(filesBefore, sparkRAllowedSQLDirs[[1]]))
# third, ensure only spark-warehouse and metastore_db are created when enableHiveSupport = T
# note: as the note above, after running all tests in this file while enableHiveSupport = T, we
# check the list of files again. This time we allow both whitelisted dirs to be in the diff.
compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRWhitelistSQLDirs))
# check the list of files again. This time we allow both dirs to be in the diff.
compare_list(sparkRFilesBefore, setdiff(filesAfter, sparkRAllowedSQLDirs))
})

unlink(parquetPath)
Expand Down
18 changes: 18 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL_arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -312,4 +312,22 @@ test_that("Arrow optimization - unsupported types", {
})
})

# SPARK-32478: with the SparkR Arrow optimization enabled, gapply() is expected to fail
# with a message naming the expected and actual column types when the function's output
# does not match the declared schema.
test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", {
skip_if_not_installed("arrow")
# Column b holds a string, while the schema passed to gapply() below declares it as int.
df <- createDataFrame(list(list(a = 1L, b = "a")))

conf <- callJMethod(sparkSession, "conf")
# Remember the current setting so the finally handler can put it back.
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]

callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
# The grouped function returns each group unchanged, so b is still a string;
# count() is the action applied to the gapply() result inside expect_error().
expect_error(
count(gapply(df, "a", function(key, group) { group }, structType("a int, b int"))),
"expected IntegerType, IntegerType, got IntegerType, StringType")
},
# Restore the saved Arrow setting whether or not the expectation held.
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})

sparkR.session.stop()
4 changes: 2 additions & 2 deletions R/pkg/tests/run-all.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
install.spark(overwrite = TRUE)

sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
invisible(lapply(sparkRWhitelistSQLDirs,
sparkRAllowedSQLDirs <- c("spark-warehouse", "metastore_db")
invisible(lapply(sparkRAllowedSQLDirs,
function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)

Expand Down
4 changes: 2 additions & 2 deletions bin/find-spark-home
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then
export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)"
else
# We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
# Default to standard python interpreter unless told otherwise
# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python3"}"
fi
export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")
fi
4 changes: 2 additions & 2 deletions bin/find-spark-home.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ rem
rem Path to Python script finding SPARK_HOME
set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py

rem Default to standard python interpreter unless told otherwise
set PYTHON_RUNNER=python
rem Default to standard python3 interpreter unless told otherwise
set PYTHON_RUNNER=python3
rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version
if not "x%PYSPARK_DRIVER_PYTHON%"=="x" (
set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON%
Expand Down
56 changes: 28 additions & 28 deletions bin/load-spark-env.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,42 @@ rem This script loads spark-env.cmd if it exists, and ensures it is only loaded
rem spark-env.cmd is loaded from SPARK_CONF_DIR if set, or within the current directory's
rem conf\ subdirectory.

set SPARK_ENV_CMD=spark-env.cmd
if [%SPARK_ENV_LOADED%] == [] (
if not defined SPARK_ENV_LOADED (
set SPARK_ENV_LOADED=1

if [%SPARK_CONF_DIR%] == [] (
set SPARK_CONF_DIR=%~dp0..\conf
)

set SPARK_ENV_CMD=%SPARK_CONF_DIR%\%SPARK_ENV_CMD%
if exist %SPARK_ENV_CMD% (
call %SPARK_ENV_CMD%
)
call :LoadSparkEnv
)

rem Setting SPARK_SCALA_VERSION if not already set.

rem TODO: revisit for Scala 2.13 support
set SPARK_SCALA_VERSION=2.12
rem if [%SPARK_SCALA_VERSION%] == [] (
rem set SCALA_VERSION_1=2.12
rem set SCALA_VERSION_2=2.11
rem
rem set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
rem set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
rem set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
rem if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
rem echo "Presence of build for multiple Scala versions detected (%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%)."
rem echo "Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in %SPARK_ENV_CMD%."
rem echo "Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd."
rem echo "Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd."
rem exit 1
rem )
rem if exist %ASSEMBLY_DIR1% (
rem set SPARK_SCALA_VERSION=%SCALA_VERSION_1%
rem ) else (
rem set SPARK_SCALA_VERSION=%SCALA_VERSION_2%
rem )
rem )
rem Candidate Scala versions. SCALA_VERSION_1 is chosen when its assembly
rem directory exists; otherwise SCALA_VERSION_2 is used.
set SCALA_VERSION_1=2.13
set SCALA_VERSION_2=2.12

rem Directories under %SPARK_HOME%\assembly\target that are checked for each
rem candidate version, and the documentation URL printed in the error message.
set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables

rem Detection runs only when SPARK_SCALA_VERSION is not already defined.
rem If both assembly directories exist the choice is ambiguous: guidance is
rem printed and the script exits with code 1. Parentheses in the first echo are
rem escaped with ^ because the echo sits inside a parenthesised block.
if not defined SPARK_SCALA_VERSION (
if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
echo Presence of build for multiple Scala versions detected ^(%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%^).
echo Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in spark-env.cmd.
echo Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd.
echo Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd.
exit 1
)
if exist %ASSEMBLY_DIR1% (
set SPARK_SCALA_VERSION=%SCALA_VERSION_1%
) else (
set SPARK_SCALA_VERSION=%SCALA_VERSION_2%
)
)
rem Return here so execution does not fall through into the subroutine below.
exit /b 0

rem Subroutine: runs spark-env.cmd from %SPARK_CONF_DIR% when that file exists.
rem The paths are quoted, so a SPARK_CONF_DIR containing spaces is handled.
:LoadSparkEnv
if exist "%SPARK_CONF_DIR%\spark-env.cmd" (
call "%SPARK_CONF_DIR%\spark-env.cmd"
)
42 changes: 20 additions & 22 deletions bin/load-spark-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,25 +43,23 @@ fi

# Setting SPARK_SCALA_VERSION if not already set.

# TODO: revisit for Scala 2.13 support
export SPARK_SCALA_VERSION=2.12
#if [ -z "$SPARK_SCALA_VERSION" ]; then
# SCALA_VERSION_1=2.12
# SCALA_VERSION_2=2.11
#
# ASSEMBLY_DIR_1="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_1}"
# ASSEMBLY_DIR_2="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_2}"
# ENV_VARIABLE_DOC="https://spark.apache.org/docs/latest/configuration.html#environment-variables"
# if [[ -d "$ASSEMBLY_DIR_1" && -d "$ASSEMBLY_DIR_2" ]]; then
# echo "Presence of build for multiple Scala versions detected ($ASSEMBLY_DIR_1 and $ASSEMBLY_DIR_2)." 1>&2
# echo "Remove one of them or, export SPARK_SCALA_VERSION=$SCALA_VERSION_1 in ${SPARK_ENV_SH}." 1>&2
# echo "Visit ${ENV_VARIABLE_DOC} for more details about setting environment variables in spark-env.sh." 1>&2
# exit 1
# fi
#
# if [[ -d "$ASSEMBLY_DIR_1" ]]; then
# export SPARK_SCALA_VERSION=${SCALA_VERSION_1}
# else
# export SPARK_SCALA_VERSION=${SCALA_VERSION_2}
# fi
#fi
# Pick SPARK_SCALA_VERSION from the assembly directories present under SPARK_HOME,
# but only when the variable is unset or empty.
if [ -z "$SPARK_SCALA_VERSION" ]; then
# Candidate versions; SCALA_VERSION_1 is chosen when its assembly directory exists.
SCALA_VERSION_1=2.13
SCALA_VERSION_2=2.12

ASSEMBLY_DIR_1="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_1}"
ASSEMBLY_DIR_2="${SPARK_HOME}/assembly/target/scala-${SCALA_VERSION_2}"
ENV_VARIABLE_DOC="https://spark.apache.org/docs/latest/configuration.html#environment-variables"
# Both directories exist, so the choice is ambiguous: print guidance to stderr and
# exit with status 1. SPARK_ENV_SH is not set in this block (defined outside this excerpt).
if [[ -d "$ASSEMBLY_DIR_1" && -d "$ASSEMBLY_DIR_2" ]]; then
echo "Presence of build for multiple Scala versions detected ($ASSEMBLY_DIR_1 and $ASSEMBLY_DIR_2)." 1>&2
echo "Remove one of them or, export SPARK_SCALA_VERSION=$SCALA_VERSION_1 in ${SPARK_ENV_SH}." 1>&2
echo "Visit ${ENV_VARIABLE_DOC} for more details about setting environment variables in spark-env.sh." 1>&2
exit 1
fi

# Use SCALA_VERSION_1 when its directory exists; otherwise fall back to SCALA_VERSION_2,
# including the case where neither directory is present.
if [[ -d "$ASSEMBLY_DIR_1" ]]; then
export SPARK_SCALA_VERSION=${SCALA_VERSION_1}
else
export SPARK_SCALA_VERSION=${SCALA_VERSION_2}
fi
fi
4 changes: 2 additions & 2 deletions bin/pyspark
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ if [[ -n "$IPYTHON" || -n "$IPYTHON_OPTS" ]]; then
exit 1
fi

# Default to standard python interpreter unless told otherwise
# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_PYTHON" ]]; then
PYSPARK_PYTHON=python
PYSPARK_PYTHON=python3
fi
if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
PYSPARK_DRIVER_PYTHON=$PYSPARK_PYTHON
Expand Down
Loading

0 comments on commit 268326b

Please sign in to comment.