diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 0000000000000..d1c404e6fe35a --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,372 @@ +name: Build and test + +on: + push: + branches: + - branch-3.0 + pull_request: + branches: + - branch-3.0 + +jobs: + # Build: build Spark and run the tests for specified modules. + build: + name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" + # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04. + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + java: + - 8 + hadoop: + - hadoop2.7 + hive: + - hive2.3 + # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now. + # Kinesis tests depend on the external Amazon Kinesis service. + # Note that the modules below are from sparktestsupport/modules.py. + modules: + - >- + core, unsafe, kvstore, avro, + network-common, network-shuffle, repl, launcher, + examples, sketch, graphx + - >- + catalyst, hive-thriftserver + - >- + streaming, sql-kafka-0-10, streaming-kafka-0-10, + mllib-local, mllib, + yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl + # Here, we split the Hive and SQL tests into the slow ones and the rest. + included-tags: [""] + # Some tests are disabled in GitHub Actions. Ideally, we should remove this tag + # and run all tests. + excluded-tags: ["org.apache.spark.tags.GitHubActionsUnstableTest"] + comment: [""] + include: + # Hive tests + - modules: hive + java: 8 + hadoop: hadoop2.7 + hive: hive2.3 + included-tags: org.apache.spark.tags.SlowHiveTest + comment: "- slow tests" + - modules: hive + java: 8 + hadoop: hadoop2.7 + hive: hive2.3 + excluded-tags: org.apache.spark.tags.SlowHiveTest,org.apache.spark.tags.GitHubActionsUnstableTest + comment: "- other tests" + # SQL tests + - modules: sql + java: 8 + hadoop: hadoop2.7 + hive: hive2.3 + included-tags: org.apache.spark.tags.ExtendedSQLTest + comment: "- slow tests" + - modules: sql + java: 8 + hadoop: hadoop2.7 + hive: hive2.3 + excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.GitHubActionsUnstableTest + comment: "- other tests" + env: + MODULES_TO_TEST: ${{ matrix.modules }} + EXCLUDED_TAGS: ${{ matrix.excluded-tags }} + INCLUDED_TAGS: ${{ matrix.included-tags }} + HADOOP_PROFILE: ${{ matrix.hadoop }} + HIVE_PROFILE: ${{ matrix.hive }} + # GitHub Actions' default miniconda to use in pip packaging test. + CONDA_PREFIX: /usr/share/miniconda + GITHUB_PREV_SHA: ${{ github.event.before }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + # In order to fetch changed files + with: + fetch-depth: 0 + # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/zinc-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Ivy local repository + uses: actions/cache@v2 + with: + path: ~/.ivy2/cache + key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-ivy- + - name: Install Java ${{ matrix.java }} + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + - name: Install Python 3.8 + uses: actions/setup-python@v2 + # We should install one Python that is higher than 3 for SQL and Yarn because: + # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. + # - Yarn has a Python specific test too, for example, YarnClusterSuite. + if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + with: + python-version: 3.8 + architecture: x64 + - name: Install Python packages (Python 3.8) + if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) + run: | + python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner + python3.8 -m pip list + # Run the tests. + - name: Run tests + run: | + # Hive and SQL tests become flaky when running in parallel as it's too intensive. + if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi + ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" + - name: Upload test results to report + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} + path: "**/target/test-reports/*.xml" + - name: Upload unit tests log files + if: failure() + uses: actions/upload-artifact@v2 + with: + name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} + path: "**/target/unit-tests.log" + + pyspark: + name: "Build modules: ${{ matrix.modules }}" + runs-on: ubuntu-20.04 + container: + image: dongjoon/apache-spark-github-action-image:20201025 + strategy: + fail-fast: false + matrix: + modules: + - >- + pyspark-sql, pyspark-mllib + - >- + pyspark-core, pyspark-streaming, pyspark-ml + env: + MODULES_TO_TEST: ${{ matrix.modules }} + HADOOP_PROFILE: hadoop2.7 + HIVE_PROFILE: hive2.3 + # GitHub Actions' default miniconda to use in pip packaging test. + CONDA_PREFIX: /usr/share/miniconda + GITHUB_PREV_SHA: ${{ github.event.before }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + # In order to fetch changed files + with: + fetch-depth: 0 + # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/zinc-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Ivy local repository + uses: actions/cache@v2 + with: + path: ~/.ivy2/cache + key: pyspark-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + pyspark-ivy- + - name: Install Python 2.7 + uses: actions/setup-python@v2 + with: + python-version: 2.7 + architecture: x64 + - name: Install Python packages (Python 2.7) + run: | + python2.7 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner + python2.7 -m pip list + # Run the tests. + - name: Run tests + run: | + ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" + - name: Upload test results to report + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-results-${{ matrix.modules }}--8-hadoop2.7-hive2.3 + path: "**/target/test-reports/*.xml" + - name: Upload unit tests log files + if: failure() + uses: actions/upload-artifact@v2 + with: + name: unit-tests-log-${{ matrix.modules }}--8-hadoop2.7-hive2.3 + path: "**/target/unit-tests.log" + + sparkr: + name: "Build modules: sparkr" + runs-on: ubuntu-20.04 + container: + image: dongjoon/apache-spark-github-action-image:20201025 + env: + HADOOP_PROFILE: hadoop2.7 + HIVE_PROFILE: hive2.3 + GITHUB_PREV_SHA: ${{ github.event.before }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + # In order to fetch changed files + with: + fetch-depth: 0 + # Cache local repositories. Note that GitHub Actions cache has a 2G limit. + - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/zinc-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Ivy local repository + uses: actions/cache@v2 + with: + path: ~/.ivy2/cache + key: sparkr-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + sparkr-ivy- + - name: Run tests + run: | + # The following are also used by `r-lib/actions/setup-r` to avoid + # R issues in the Docker environment + export TZ=UTC + export _R_CHECK_SYSTEM_CLOCK_=FALSE + ./dev/run-tests --parallelism 2 --modules sparkr + - name: Upload test results to report + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-results-sparkr--8-hadoop2.7-hive2.3 + path: "**/target/test-reports/*.xml" + - name: Upload unit tests log files + if: failure() + uses: actions/upload-artifact@v2 + with: + name: unit-tests-log-sparkr--8-hadoop2.7-hive2.3 + path: "**/target/unit-tests.log" + + # Static analysis and documentation build + lint: + name: Linters, licenses, dependencies and documentation generation + runs-on: ubuntu-20.04 + container: + image: dongjoon/apache-spark-github-action-image:20201025 + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
+ - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/zinc-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Ivy local repository + uses: actions/cache@v2 + with: + path: ~/.ivy2/cache + key: docs-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + docs-ivy- + - name: Cache Maven local repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: docs-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + docs-maven- + - name: Install Python 3.6 + uses: actions/setup-python@v2 + with: + python-version: 3.6 + architecture: x64 + - name: Install Python linter dependencies + run: | + python3.6 -m pip install flake8 sphinx numpy + - name: Install R linter dependencies and SparkR + run: | + apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev + Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" + Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" + ./R/install-dev.sh + - name: Install dependencies for documentation generation + run: | + apt-get install -y libcurl4-openssl-dev pandoc + python3.6 -m pip install sphinx mkdocs numpy + apt-get update -y + apt-get install -y ruby ruby-dev + gem install jekyll jekyll-redirect-from rouge + Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" + - name: Scala linter + run: ./dev/lint-scala + - name: Java linter + run: ./dev/lint-java + - name: Python linter + run: ./dev/lint-python + - name: R linter + run: ./dev/lint-r + - name: License test + run: ./dev/check-license + - name: Dependencies test + run: ./dev/test-dependencies.sh + - name: Run documentation build + run: | + cd docs + export LC_ALL=C.UTF-8 + export LANG=C.UTF-8 + jekyll build + + java-11: + name: Java 11 build with Maven + runs-on: ubuntu-20.04 + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + - name: Cache Maven local repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: java11-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + java11-maven- + - name: Install Java 11 + uses: actions/setup-java@v1 + with: + java-version: 11 + - name: Build with Maven + run: | + export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" + export MAVEN_CLI_OPTS="--no-transfer-progress" + # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
+ ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install + rm -rf ~/.m2/repository/org/apache/spark diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml deleted file mode 100644 index d53119ad75599..0000000000000 --- a/.github/workflows/master.yml +++ /dev/null @@ -1,119 +0,0 @@ -name: master - -on: - push: - branches: - - master - pull_request: - branches: - - master - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - java: [ '1.8', '11' ] - hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ] - hive: [ 'hive-1.2', 'hive-2.3' ] - exclude: - - java: '11' - hive: 'hive-1.2' - - hadoop: 'hadoop-3.2' - hive: 'hive-1.2' - name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }} - - steps: - - uses: actions/checkout@master - # We split caches because GitHub Action Cache has a 400MB-size limit. - - uses: actions/cache@v1 - with: - path: build - key: build-${{ hashFiles('**/pom.xml') }} - restore-keys: | - build- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/com - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/org - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/net - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net- - - uses: actions/cache@v1 - with: - path: ~/.m2/repository/io - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io- - - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} - - name: Build with Maven - run: | - export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" - export MAVEN_CLI_OPTS="--no-transfer-progress" - mkdir -p ~/.m2 - ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install - rm -rf ~/.m2/repository/org/apache/spark - - - lint: - runs-on: ubuntu-latest - name: Linters (Java/Scala/Python), licenses, dependencies - steps: - - uses: actions/checkout@master - - uses: actions/setup-java@v1 - with: - java-version: '11' - - uses: actions/setup-python@v1 - with: - python-version: '3.x' - architecture: 'x64' - - name: Scala - run: ./dev/lint-scala - - name: Java - run: ./dev/lint-java - - name: Python - run: | - pip install flake8 sphinx numpy - ./dev/lint-python - - name: License - run: ./dev/check-license - - name: Dependencies - run: ./dev/test-dependencies.sh - - lintr: - runs-on: ubuntu-latest - name: Linter (R) - steps: - - uses: actions/checkout@master - - uses: actions/setup-java@v1 - with: - java-version: '11' - - name: install R - run: | - echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' | sudo tee -a /etc/apt/sources.list - curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add - sudo apt-get update - sudo apt-get install -y r-base r-base-dev 
libcurl4-openssl-dev - - name: install R packages - run: | - sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" - sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" - - name: package and install SparkR - run: ./R/install-dev.sh - - name: lint-r - run: ./dev/lint-r diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml new file mode 100644 index 0000000000000..5d4fdf4c97ded --- /dev/null +++ b/.github/workflows/publish_snapshot.yml @@ -0,0 +1,33 @@ +name: Publish Snapshot + +on: + push: + branches: + - branch-3.0 + +jobs: + publish-snapshot: + if: github.repository == 'apache/spark' + runs-on: ubuntu-latest + steps: + - name: Checkout Spark repository + uses: actions/checkout@master + - name: Cache Maven local repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: snapshot-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + snapshot-maven- + - name: Install Java 8 + uses: actions/setup-java@v1 + with: + java-version: 8 + - name: Publish snapshot + env: + ASF_USERNAME: ${{ secrets.NEXUS_USER }} + ASF_PASSWORD: ${{ secrets.NEXUS_PW }} + GPG_KEY: "not_used" + GPG_PASSPHRASE: "not_used" + GIT_REF: "branch-3.0" + run: ./dev/create-release/release-build.sh publish-snapshot diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml new file mode 100644 index 0000000000000..93cdb86687261 --- /dev/null +++ b/.github/workflows/test_report.yml @@ -0,0 +1,24 @@ +name: Report test results +on: + workflow_run: + workflows: ["Build and test"] + types: + - completed + +jobs: + test_report: + runs-on: ubuntu-latest + steps: + - name: Download test results to report + uses: dawidd6/action-download-artifact@v2 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + workflow: ${{ github.event.workflow_run.workflow_id }} + commit: ${{ github.event.workflow_run.head_commit.id }} + - name: Publish test report + uses: scacap/action-surefire-report@v1 + with: + check_name: Report test results + github_token: ${{ secrets.GITHUB_TOKEN }} + report_paths: "**/target/test-reports/*.xml" + commit: ${{ github.event.workflow_run.head_commit.id }} diff --git a/.gitignore b/.gitignore index 798e8acc4d43b..1f4b323c926ac 100644 --- a/.gitignore +++ b/.gitignore @@ -72,7 +72,6 @@ scalastyle-on-compile.generated.xml scalastyle-output.xml scalastyle.txt spark-*-bin-*.tgz -spark-resources/ spark-tests.log src_managed/ streaming-tests.log @@ -80,6 +79,7 @@ target/ unit-tests.log work/ docs/.jekyll-metadata +docs/.jekyll-cache # For Hive TempStatsStore/ diff --git a/R/WINDOWS.md b/R/WINDOWS.md index dbc27178bdb8c..9fe4a22bf22b2 100644 --- a/R/WINDOWS.md +++ b/R/WINDOWS.md @@ -22,8 +22,8 @@ To build SparkR on Windows, the following steps are required 1. Make sure `bash` is available and in `PATH` if you already have a built-in `bash` on Windows. If you do not have, install [Cygwin](https://www.cygwin.com/). -2. Install R (>= 3.1) and [Rtools](https://cloud.r-project.org/bin/windows/Rtools/). Make sure to -include Rtools and R in `PATH`. Note that support for R prior to version 3.4 is deprecated as of Spark 3.0.0. +2. Install R (>= 3.5) and [Rtools](https://cloud.r-project.org/bin/windows/Rtools/). Make sure to +include Rtools and R in `PATH`. 3. Install JDK that SparkR supports (see `R/pkg/DESCRIPTION`), and set `JAVA_HOME` in the system environment variables. 
diff --git a/R/create-rd.sh b/R/create-rd.sh index ff622a41a46c0..aaad3b1aafa0a 100755 --- a/R/create-rd.sh +++ b/R/create-rd.sh @@ -34,4 +34,4 @@ pushd "$FWDIR" > /dev/null . "$FWDIR/find-r.sh" # Generate Rd files if devtools is installed -"$R_SCRIPT_PATH/Rscript" -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }' +"$R_SCRIPT_PATH/Rscript" -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); setwd("'$FWDIR'"); devtools::document(pkg="./pkg", roclets=c("rd")) }' diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 95d3e52bef3a9..a17dea9093771 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 3.0.0 +Version: 3.0.2 Title: R Front End for 'Apache Spark' Description: Provides an R Front end for 'Apache Spark' . Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), @@ -15,7 +15,7 @@ URL: https://www.apache.org/ https://spark.apache.org/ BugReports: https://spark.apache.org/contributing.html SystemRequirements: Java (>= 8, < 12) Depends: - R (>= 3.1), + R (>= 3.5), methods Suggests: knitr, @@ -23,7 +23,7 @@ Suggests: testthat, e1071, survival, - arrow + arrow (>= 0.15.1) Collate: 'schema.R' 'generics.R' @@ -62,3 +62,4 @@ Collate: RoxygenNote: 5.0.1 VignetteBuilder: knitr NeedsCompilation: no +Encoding: UTF-8 diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 7ed2e36d59531..9fd7bb4c6ff2b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -28,6 +28,7 @@ importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "u # S3 methods exported export("sparkR.session") +export("sparkR.init") export("sparkR.session.stop") export("sparkR.stop") export("sparkR.conf") @@ -41,6 +42,9 @@ export("sparkR.callJStatic") export("install.spark") +export("sparkRSQL.init", + "sparkRHive.init") + # MLlib integration exportMethods("glm", "spark.glm", @@ -148,6 +152,7 @@ exportMethods("arrange", "printSchema", "randomSplit", "rbind", + "registerTempTable", "rename", "repartition", "repartitionByRange", @@ -420,8 +425,10 @@ export("as.DataFrame", "cacheTable", "clearCache", "createDataFrame", + "createExternalTable", "createTable", "currentDatabase", + "dropTempTable", "dropTempView", "listColumns", "listDatabases", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 593d3ca16220d..4195ec8f07b8b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -271,7 +271,7 @@ setMethod("show", "SparkDataFrame", paste(l, collapse = ":") }) s <- paste(cols, collapse = ", ") - cat(paste(class(object), "[", s, "]\n", sep = "")) + cat(paste0(class(object), "[", s, "]\n")) } }) @@ -431,7 +431,7 @@ setMethod("coltypes", if (is.null(type)) { specialtype <- specialtypeshandle(x) if (is.null(specialtype)) { - stop(paste("Unsupported data type: ", x)) + stop("Unsupported data type: ", x) } type <- PRIMITIVE_TYPES[[specialtype]] } @@ -521,6 +521,32 @@ setMethod("createOrReplaceTempView", invisible(callJMethod(x@sdf, "createOrReplaceTempView", viewName)) }) +#' (Deprecated) Register Temporary Table +#' +#' Registers a SparkDataFrame as a Temporary Table in the SparkSession +#' @param x A SparkDataFrame +#' @param tableName A character vector containing the name of the table +#' +#' @seealso \link{createOrReplaceTempView} +#' @rdname registerTempTable-deprecated +#' @name registerTempTable +#' @aliases registerTempTable,SparkDataFrame,character-method +#' @examples +#'\dontrun{ +#' sparkR.session() +#' path <- 
"path/to/file.json" +#' df <- read.json(path) +#' registerTempTable(df, "json_df") +#' new_df <- sql("SELECT * FROM json_df") +#'} +#' @note registerTempTable since 1.4.0 +setMethod("registerTempTable", + signature(x = "SparkDataFrame", tableName = "character"), + function(x, tableName) { + .Deprecated("createOrReplaceTempView") + invisible(callJMethod(x@sdf, "createOrReplaceTempView", tableName)) + }) + #' insertInto #' #' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession. @@ -803,8 +829,8 @@ setMethod("repartitionByRange", jcol <- lapply(cols, function(c) { c@jc }) sdf <- callJMethod(x@sdf, "repartitionByRange", numToInt(numPartitions), jcol) } else { - stop(paste("numPartitions and col must be numeric and Column; however, got", - class(numPartitions), "and", class(col))) + stop("numPartitions and col must be numeric and Column; however, got ", + class(numPartitions), " and ", class(col)) } } else if (!is.null(col)) { # only columns are specified @@ -813,7 +839,7 @@ setMethod("repartitionByRange", jcol <- lapply(cols, function(c) { c@jc }) sdf <- callJMethod(x@sdf, "repartitionByRange", jcol) } else { - stop(paste("col must be Column; however, got", class(col))) + stop("col must be Column; however, got ", class(col)) } } else if (!is.null(numPartitions)) { # only numPartitions is specified @@ -1042,10 +1068,10 @@ setMethod("sample", signature(x = "SparkDataFrame"), function(x, withReplacement = FALSE, fraction, seed) { if (!is.numeric(fraction)) { - stop(paste("fraction must be numeric; however, got", class(fraction))) + stop("fraction must be numeric; however, got ", class(fraction)) } if (!is.logical(withReplacement)) { - stop(paste("withReplacement must be logical; however, got", class(withReplacement))) + stop("withReplacement must be logical; however, got ", class(withReplacement)) } if (!missing(seed)) { @@ -1185,11 +1211,10 @@ setMethod("collect", checkSchemaInArrow(schema(x)) TRUE }, error = function(e) { - warning(paste0("The conversion from Spark DataFrame to R DataFrame was attempted ", - "with Arrow optimization because ", - "'spark.sql.execution.arrow.sparkr.enabled' is set to true; ", - "however, failed, attempting non-optimization. Reason: ", - e)) + warning("The conversion from Spark DataFrame to R DataFrame was attempted ", + "with Arrow optimization because ", + "'spark.sql.execution.arrow.sparkr.enabled' is set to true; ", + "however, failed, attempting non-optimization. Reason: ", e) FALSE }) } @@ -1200,27 +1225,23 @@ setMethod("collect", # empty data.frame with 0 columns and 0 rows data.frame() } else if (useArrow) { - requireNamespace1 <- requireNamespace - if (requireNamespace1("arrow", quietly = TRUE)) { - read_arrow <- get("read_arrow", envir = asNamespace("arrow"), inherits = FALSE) - # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190. 
- useAsTibble <- exists("as_tibble", envir = asNamespace("arrow")) - + if (requireNamespace("arrow", quietly = TRUE)) { portAuth <- callJMethod(x@sdf, "collectAsArrowToR") port <- portAuth[[1]] authSecret <- portAuth[[2]] conn <- socketConnection( port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout) + version <- packageVersion("arrow") output <- tryCatch({ doServerAuth(conn, authSecret) - arrowTable <- read_arrow(readRaw(conn)) - if (useAsTibble) { - as_tibble <- get("as_tibble", envir = asNamespace("arrow")) - as.data.frame(as_tibble(arrowTable), stringsAsFactors = stringsAsFactors) + if (version$minor >= 17 || version$major >= 1) { + arrowTable <- arrow::read_ipc_stream(readRaw(conn)) } else { - as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors) + arrowTable <- arrow::read_arrow(readRaw(conn)) } - }, finally = { + as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors) + }, + finally = { close(conn) }) return(output) @@ -1487,8 +1508,8 @@ dapplyInternal <- function(x, func, schema) { if (inherits(schema, "structType")) { checkSchemaInArrow(schema) } else if (is.null(schema)) { - stop(paste0("Arrow optimization does not support 'dapplyCollect' yet. Please disable ", - "Arrow optimization or use 'collect' and 'dapply' APIs instead.")) + stop("Arrow optimization does not support 'dapplyCollect' yet. Please disable ", + "Arrow optimization or use 'collect' and 'dapply' APIs instead.") } else { stop("'schema' should be DDL-formatted string or structType.") } @@ -1631,9 +1652,7 @@ setMethod("dapplyCollect", #' #' @param cols grouping columns. #' @param func a function to be applied to each group partition specified by grouping -#' column of the SparkDataFrame. The function \code{func} takes as argument -#' a key - grouping columns and a data frame - a local R data.frame. -#' The output of \code{func} is a local R data.frame. +#' column of the SparkDataFrame. See Details. #' @param schema the schema of the resulting SparkDataFrame after the function is applied. #' The schema must match to output of \code{func}. It has to be defined for each #' output column with preferred output column name and corresponding data type. @@ -1643,29 +1662,43 @@ setMethod("dapplyCollect", #' @aliases gapply,SparkDataFrame-method #' @rdname gapply #' @name gapply +#' @details +#' \code{func} is a function of two arguments. The first, usually named \code{key} +#' (though this is not enforced) corresponds to the grouping key, will be an +#' unnamed \code{list} of \code{length(cols)} length-one objects corresponding +#' to the grouping columns' values for the current group. +#' +#' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the +#' columns of the input not in \code{cols} for the rows corresponding to \code{key}. +#' +#' The output of \code{func} must be a \code{data.frame} matching \code{schema} -- +#' in particular this means the names of the output \code{data.frame} are irrelevant +#' #' @seealso \link{gapplyCollect} #' @examples #' #' \dontrun{ -#' Computes the arithmetic mean of the second column by grouping -#' on the first and third columns. Output the grouping values and the average. +#' # Computes the arithmetic mean of the second column by grouping +#' # on the first and third columns. Output the grouping values and the average. 
#' #' df <- createDataFrame ( #' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)), #' c("a", "b", "c", "d")) #' -#' Here our output contains three columns, the key which is a combination of two -#' columns with data types integer and string and the mean which is a double. +#' # Here our output contains three columns, the key which is a combination of two +#' # columns with data types integer and string and the mean which is a double. #' schema <- structType(structField("a", "integer"), structField("c", "string"), #' structField("avg", "double")) #' result <- gapply( #' df, #' c("a", "c"), #' function(key, x) { +#' # key will either be list(1L, '1') (for the group where a=1L,c='1') or +#' # list(3L, '3') (for the group where a=3L,c='3') #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) #' }, schema) #' -#' The schema also can be specified in a DDL-formatted string. +#' # The schema also can be specified in a DDL-formatted string. #' schema <- "a INT, c STRING, avg DOUBLE" #' result <- gapply( #' df, @@ -1674,8 +1707,8 @@ setMethod("dapplyCollect", #' y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) #' }, schema) #' -#' We can also group the data and afterwards call gapply on GroupedData. -#' For Example: +#' # We can also group the data and afterwards call gapply on GroupedData. +#' # For example: #' gdf <- group_by(df, "a", "c") #' result <- gapply( #' gdf, @@ -1684,15 +1717,15 @@ setMethod("dapplyCollect", #' }, schema) #' collect(result) #' -#' Result -#' ------ -#' a c avg -#' 3 3 3.0 -#' 1 1 1.5 +#' # Result +#' # ------ +#' # a c avg +#' # 3 3 3.0 +#' # 1 1 1.5 #' -#' Fits linear models on iris dataset by grouping on the 'Species' column and -#' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length' -#' and 'Petal_Width' as training features. +#' # Fits linear models on iris dataset by grouping on the 'Species' column and +#' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length' +#' # and 'Petal_Width' as training features. #' #' df <- createDataFrame (iris) #' schema <- structType(structField("(Intercept)", "double"), @@ -1708,12 +1741,12 @@ setMethod("dapplyCollect", #' }, schema) #' collect(df1) #' -#' Result -#' --------- -#' Model (Intercept) Sepal_Width Petal_Length Petal_Width -#' 1 0.699883 0.3303370 0.9455356 -0.1697527 -#' 2 1.895540 0.3868576 0.9083370 -0.6792238 -#' 3 2.351890 0.6548350 0.2375602 0.2521257 +#' # Result +#' # --------- +#' # Model (Intercept) Sepal_Width Petal_Length Petal_Width +#' # 1 0.699883 0.3303370 0.9455356 -0.1697527 +#' # 2 1.895540 0.3868576 0.9083370 -0.6792238 +#' # 3 2.351890 0.6548350 0.2375602 0.2521257 #' #'} #' @note gapply(SparkDataFrame) since 2.0.0 @@ -1731,20 +1764,30 @@ setMethod("gapply", #' #' @param cols grouping columns. #' @param func a function to be applied to each group partition specified by grouping -#' column of the SparkDataFrame. The function \code{func} takes as argument -#' a key - grouping columns and a data frame - a local R data.frame. -#' The output of \code{func} is a local R data.frame. +#' column of the SparkDataFrame. See Details. #' @return A data.frame. #' @family SparkDataFrame functions #' @aliases gapplyCollect,SparkDataFrame-method #' @rdname gapplyCollect #' @name gapplyCollect +#' @details +#' \code{func} is a function of two arguments. 
The first, usually named \code{key} +#' (though this is not enforced) corresponds to the grouping key, will be an +#' unnamed \code{list} of \code{length(cols)} length-one objects corresponding +#' to the grouping columns' values for the current group. +#' +#' The second, herein \code{x}, will be a local \code{\link{data.frame}} with the +#' columns of the input not in \code{cols} for the rows corresponding to \code{key}. +#' +#' The output of \code{func} must be a \code{data.frame} matching \code{schema} -- +#' in particular this means the names of the output \code{data.frame} are irrelevant +#' #' @seealso \link{gapply} #' @examples #' #' \dontrun{ -#' Computes the arithmetic mean of the second column by grouping -#' on the first and third columns. Output the grouping values and the average. +#' # Computes the arithmetic mean of the second column by grouping +#' # on the first and third columns. Output the grouping values and the average. #' #' df <- createDataFrame ( #' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)), @@ -1759,8 +1802,8 @@ setMethod("gapply", #' y #' }) #' -#' We can also group the data and afterwards call gapply on GroupedData. -#' For Example: +#' # We can also group the data and afterwards call gapply on GroupedData. +#' # For example: #' gdf <- group_by(df, "a", "c") #' result <- gapplyCollect( #' gdf, @@ -1770,15 +1813,15 @@ setMethod("gapply", #' y #' }) #' -#' Result -#' ------ -#' key_a key_c mean_b -#' 3 3 3.0 -#' 1 1 1.5 +#' # Result +#' # ------ +#' # key_a key_c mean_b +#' # 3 3 3.0 +#' # 1 1 1.5 #' -#' Fits linear models on iris dataset by grouping on the 'Species' column and -#' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length' -#' and 'Petal_Width' as training features. +#' # Fits linear models on iris dataset by grouping on the 'Species' column and +#' # using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length' +#' # and 'Petal_Width' as training features. #' #' df <- createDataFrame (iris) #' result <- gapplyCollect( @@ -1790,12 +1833,12 @@ setMethod("gapply", #' data.frame(t(coef(m))) #' }) #' -#' Result -#'--------- -#' Model X.Intercept. Sepal_Width Petal_Length Petal_Width -#' 1 0.699883 0.3303370 0.9455356 -0.1697527 -#' 2 1.895540 0.3868576 0.9083370 -0.6792238 -#' 3 2.351890 0.6548350 0.2375602 0.2521257 +#' # Result +#' # --------- +#' # Model X.Intercept. 
Sepal_Width Petal_Length Petal_Width +#' # 1 0.699883 0.3303370 0.9455356 -0.1697527 +#' # 2 1.895540 0.3868576 0.9083370 -0.6792238 +#' # 3 2.351890 0.6548350 0.2375602 0.2521257 #' #'} #' @note gapplyCollect(SparkDataFrame) since 2.0.0 @@ -1969,8 +2012,8 @@ setMethod("[", signature(x = "SparkDataFrame"), x } else { if (class(i) != "Column") { - stop(paste0("Expressions other than filtering predicates are not supported ", - "in the first parameter of extract operator [ or subset() method.")) + stop("Expressions other than filtering predicates are not supported ", + "in the first parameter of extract operator [ or subset() method.") } filter(x, i) } @@ -2561,18 +2604,17 @@ setMethod("join", if (is.null(joinType)) { sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc) } else { - if (joinType %in% c("inner", "cross", + validJoinTypes <- c("inner", "cross", "outer", "full", "fullouter", "full_outer", "left", "leftouter", "left_outer", "right", "rightouter", "right_outer", - "semi", "left_semi", "leftsemi", "anti", "left_anti", "leftanti")) { - joinType <- gsub("_", "", joinType) + "semi", "leftsemi", "left_semi", "anti", "leftanti", "left_anti") + if (joinType %in% validJoinTypes) { + joinType <- gsub("_", "", joinType, fixed = TRUE) sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType) } else { - stop(paste("joinType must be one of the following types:", - "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',", - "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", - "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")) + stop("joinType must be one of the following types: ", + "'", paste(validJoinTypes, collapse = "', '"), "'") } } } @@ -2707,10 +2749,10 @@ setMethod("merge", colY <- joinY[[i]] if (colX %in% by) { - colX <- paste(colX, suffixes[1], sep = "") + colX <- paste0(colX, suffixes[1]) } if (colY %in% by) { - colY <- paste(colY, suffixes[2], sep = "") + colY <- paste0(colY, suffixes[2]) } colX <- getColumn(xsel, colX) @@ -2725,7 +2767,7 @@ setMethod("merge", # sorts the result by 'by' columns if sort = TRUE if (sort && length(by) > 0) { - colNameWithSuffix <- paste(by, suffixes[2], sep = "") + colNameWithSuffix <- paste0(by, suffixes[2]) joinRes <- do.call("arrange", c(joinRes, colNameWithSuffix, decreasing = FALSE)) } @@ -2748,7 +2790,7 @@ genAliasesForIntersectedCols <- function(x, intersectedColNames, suffix) { cols <- lapply(allColNames, function(colName) { col <- getColumn(x, colName) if (colName %in% intersectedColNames) { - newJoin <- paste(colName, suffix, sep = "") + newJoin <- paste0(colName, suffix) if (newJoin %in% allColNames) { stop("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.", "Please use different suffixes for the intersected columns.") diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 6e89b4bb4d964..7a1d157bb8a36 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -69,7 +69,7 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode, setMethod("showRDD", "RDD", function(object) { - cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep = "")) + cat(paste0(callJMethod(getJRDD(object), "toString"), "\n")) }) setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) { @@ -947,7 +947,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", MAXINT <- .Machine$integer.max if (num < 0) - stop(paste("Negative number of elements requested")) + stop("Negative number of elements requested") if (initialCount > MAXINT 
- 1) { maxSelected <- MAXINT - 1 diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index f48a334ed6766..c0ac68332ec41 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -34,7 +34,7 @@ getInternalType <- function(x) { Date = "date", POSIXlt = "timestamp", POSIXct = "timestamp", - stop(paste("Unsupported type for SparkDataFrame:", class(x)))) + stop("Unsupported type for SparkDataFrame: ", class(x))) } #' return the SparkSession @@ -110,10 +110,11 @@ sparkR.conf <- function(key, defaultValue) { value <- if (missing(defaultValue)) { tryCatch(callJMethod(conf, "get", key), error = function(e) { - if (any(grep("java.util.NoSuchElementException", as.character(e)))) { - stop(paste0("Config '", key, "' is not set")) + estr <- as.character(e) + if (any(grepl("java.util.NoSuchElementException", estr, fixed = TRUE))) { + stop("Config '", key, "' is not set") } else { - stop(paste0("Unknown error: ", as.character(e))) + stop("Unknown error: ", estr) } }) } else { @@ -205,9 +206,9 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { # SPAKR-SQL does not support '.' in column name, so replace it with '_' # TODO(davies): remove this once SPARK-2775 is fixed names <- lapply(names, function(n) { - nn <- gsub("[.]", "_", n) + nn <- gsub(".", "_", n, fixed = TRUE) if (nn != n) { - warning(paste("Use", nn, "instead of", n, "as column name")) + warning("Use ", nn, " instead of ", n, " as column name") } nn }) @@ -289,10 +290,9 @@ createDataFrame <- function(data, schema = NULL, samplingRatio = 1.0, TRUE }, error = function(e) { - warning(paste0("createDataFrame attempted Arrow optimization because ", - "'spark.sql.execution.arrow.sparkr.enabled' is set to true; however, ", - "failed, attempting non-optimization. Reason: ", - e)) + warning("createDataFrame attempted Arrow optimization because ", + "'spark.sql.execution.arrow.sparkr.enabled' is set to true; however, ", + "failed, attempting non-optimization. Reason: ", e) FALSE }) } @@ -325,7 +325,7 @@ createDataFrame <- function(data, schema = NULL, samplingRatio = 1.0, } else if (inherits(data, "RDD")) { rdd <- data } else { - stop(paste("unexpected type:", class(data))) + stop("unexpected type: ", class(data)) } schema <- getSchema(schema, firstRow, rdd) @@ -556,7 +556,6 @@ tableToDF <- function(tableName) { #' stringSchema <- "name STRING, info MAP" #' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE) #' } -#' @name read.df #' @note read.df since 1.4.0 read.df <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) { if (!is.null(path) && !is.character(path)) { @@ -687,7 +686,6 @@ read.jdbc <- function(url, tableName, #' stringSchema <- "name STRING, info MAP" #' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, maxFilesPerTrigger = 1) #' } -#' @name read.stream #' @note read.stream since 2.2.0 #' @note experimental read.stream <- function(source = NULL, schema = NULL, ...) { diff --git a/R/pkg/R/catalog.R b/R/pkg/R/catalog.R index 7641f8a7a0432..275737f804bde 100644 --- a/R/pkg/R/catalog.R +++ b/R/pkg/R/catalog.R @@ -17,6 +17,35 @@ # catalog.R: SparkSession catalog functions +#' (Deprecated) Create an external table +#' +#' Creates an external table based on the dataset in a data source, +#' Returns a SparkDataFrame associated with the external table. +#' +#' The data source is specified by the \code{source} and a set of options(...). +#' If \code{source} is not specified, the default data source configured by +#' "spark.sql.sources.default" will be used. 
+#' +#' @param tableName a name of the table. +#' @param path the path of files to load. +#' @param source the name of external data source. +#' @param schema the schema of the data required for some data sources. +#' @param ... additional argument(s) passed to the method. +#' @return A SparkDataFrame. +#' @rdname createExternalTable-deprecated +#' @seealso \link{createTable} +#' @examples +#'\dontrun{ +#' sparkR.session() +#' df <- createExternalTable("myjson", path="path/to/json", source="json", schema) +#' } +#' @name createExternalTable +#' @note createExternalTable since 1.4.0 +createExternalTable <- function(tableName, path = NULL, source = NULL, schema = NULL, ...) { + .Deprecated("createTable", old = "createExternalTable") + createTable(tableName, path, source, schema, ...) +} + #' Creates a table based on the dataset in a data source #' #' Creates a table based on the dataset in a data source. Returns a SparkDataFrame associated with @@ -130,6 +159,31 @@ clearCache <- function() { invisible(callJMethod(catalog, "clearCache")) } +#' (Deprecated) Drop Temporary Table +#' +#' Drops the temporary table with the given table name in the catalog. +#' If the table has been cached/persisted before, it's also unpersisted. +#' +#' @param tableName The name of the SparkSQL table to be dropped. +#' @seealso \link{dropTempView} +#' @rdname dropTempTable-deprecated +#' @examples +#' \dontrun{ +#' sparkR.session() +#' df <- read.df(path, "parquet") +#' createOrReplaceTempView(df, "table") +#' dropTempTable("table") +#' } +#' @name dropTempTable +#' @note dropTempTable since 1.4.0 +dropTempTable <- function(tableName) { + .Deprecated("dropTempView", old = "dropTempTable") + if (class(tableName) != "character") { + stop("tableName must be a string.") + } + dropTempView(tableName) +} + #' Drops the temporary view with the given view name in the catalog. #' #' Drops the temporary view with the given view name in the catalog. diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 2ff68ab7b9d77..797a5c7da1549 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -65,8 +65,8 @@ checkJavaVersion <- function() { javaHome <- Sys.getenv("JAVA_HOME") javaReqs <- utils::packageDescription(utils::packageName(), fields = c("SystemRequirements")) sparkJavaVersions <- strsplit(javaReqs, "[(,)]")[[1]] - minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= ")[[1]][[2]]) - maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< ")[[1]][[2]]) + minJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[2]], ">= ", fixed = TRUE)[[1]][[2]]) + maxJavaVersion <- as.numeric(strsplit(sparkJavaVersions[[3]], "< ", fixed = TRUE)[[1]][[2]]) if (javaHome != "") { javaBin <- file.path(javaHome, "bin", javaBin) } @@ -89,23 +89,22 @@ checkJavaVersion <- function() { }) javaVersionFilter <- Filter( function(x) { - grepl(" version", x) + grepl(" version", x, fixed = TRUE) }, javaVersionOut) - javaVersionStr <- strsplit(javaVersionFilter[[1]], "[\"]")[[1L]][2] + javaVersionStr <- strsplit(javaVersionFilter[[1]], '"', fixed = TRUE)[[1L]][2] # javaVersionStr is of the form 1.8.0_92/9.0.x/11.0.x. # We are using 8, 9, 10, 11 for sparkJavaVersion. 
- versions <- strsplit(javaVersionStr, "[.]")[[1L]] + versions <- strsplit(javaVersionStr, ".", fixed = TRUE)[[1L]] if ("1" == versions[1]) { javaVersionNum <- as.integer(versions[2]) } else { javaVersionNum <- as.integer(versions[1]) } if (javaVersionNum < minJavaVersion || javaVersionNum >= maxJavaVersion) { - stop(paste0("Java version, greater than or equal to ", minJavaVersion, - " and less than ", maxJavaVersion, - ", is required for this package; found version: ", - javaVersionStr)) + stop("Java version, greater than or equal to ", minJavaVersion, + " and less than ", maxJavaVersion, ", is required for this ", + "package; found version: ", javaVersionStr) } return(javaVersionNum) } diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index d96a287f818a2..e3c9d9f8793d6 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -144,13 +144,13 @@ parallelize <- function(sc, coll, numSlices = 1) { if ((!is.list(coll) && !is.vector(coll)) || is.data.frame(coll)) { # nolint end if (is.data.frame(coll)) { - message(paste("context.R: A data frame is parallelized by columns.")) + message("context.R: A data frame is parallelized by columns.") } else { if (is.matrix(coll)) { - message(paste("context.R: A matrix is parallelized by elements.")) + message("context.R: A matrix is parallelized by elements.") } else { - message(paste("context.R: parallelize() currently only supports lists and vectors.", - "Calling as.list() to coerce coll into a list.")) + message("context.R: parallelize() currently only supports lists and vectors. ", + "Calling as.list() to coerce coll into a list.") } } coll <- as.list(coll) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index ca4a6e342d772..5d22340fb62a0 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -57,7 +57,7 @@ readTypedObject <- function(con, type) { "s" = readStruct(con), "n" = NULL, "j" = getJobj(readString(con)), - stop(paste("Unsupported type for deserialization", type))) + stop("Unsupported type for deserialization ", type)) } readStringData <- function(con, len) { @@ -233,24 +233,13 @@ readMultipleObjectsWithKeys <- function(inputCon) { readDeserializeInArrow <- function(inputCon) { if (requireNamespace("arrow", quietly = TRUE)) { - # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190. - useAsTibble <- exists("as_tibble", envir = asNamespace("arrow")) - - # Currently, there looks no way to read batch by batch by socket connection in R side, # See ARROW-4512. Therefore, it reads the whole Arrow streaming-formatted binary at once # for now. dataLen <- readInt(inputCon) arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big") batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches() - - if (useAsTibble) { - as_tibble <- get("as_tibble", envir = asNamespace("arrow")) - # Read all groupped batches. Tibble -> data.frame is cheap. - lapply(batches, function(batch) as.data.frame(as_tibble(batch))) - } else { - lapply(batches, function(batch) as.data.frame(batch)) - } + lapply(batches, function(batch) as.data.frame(batch)) } else { stop("'arrow' package should be installed.") } diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 48f69d5769620..4cb23d72ef51a 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -77,7 +77,13 @@ NULL #' days to be added to or subtracted from \code{y}. For class \code{character}, it is #' \itemize{ #' \item \code{date_format}: date format specification. -#' \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: time zone to use. 
+#' \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: A string detailing +#' the time zone ID that the input should be adjusted to. It should be in the format +#' of either region-based zone IDs or zone offsets. Region IDs must have the form +#' 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in the format +#' (+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported +#' as aliases of '+00:00'. Other short names are not recommended to use +#' because they can be ambiguous. #' \item \code{next_day}: day of the week string. #' } #' @param ... additional argument(s). @@ -1792,7 +1798,7 @@ setMethod("radians", #' @details #' \code{to_date}: Converts the column into a DateType. You may optionally specify #' a format according to the rules in: -#' \url{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}. +#' \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{Datetime Pattern} #' If the string cannot be parsed according to the specified format (or default), #' the value of the column will be null. #' By default, it follows casting rules to a DateType if the format is omitted @@ -1888,7 +1894,7 @@ setMethod("to_csv", signature(x = "Column"), #' @details #' \code{to_timestamp}: Converts the column into a TimestampType. You may optionally specify #' a format according to the rules in: -#' \url{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}. +#' \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{Datetime Pattern} #' If the string cannot be parsed according to the specified format (or default), #' the value of the column will be null. #' By default, it follows casting rules to a TimestampType if the format is omitted @@ -2193,7 +2199,7 @@ setMethod("pmod", signature(y = "Column"), column(jc) }) -#' @param rsd maximum estimation error allowed (default = 0.05). +#' @param rsd maximum relative standard deviation allowed (default = 0.05). #' #' @rdname column_aggregate_functions #' @aliases approx_count_distinct,Column-method @@ -2760,8 +2766,8 @@ setMethod("format_string", signature(format = "character", x = "Column"), #' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) #' to a string representing the timestamp of that moment in the current system time zone in the JVM #' in the given format. -#' See \href{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}{ -#' Customizing Formats} for available options. +#' See \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{ +#' Datetime Pattern} for available options. #' #' @rdname column_datetime_functions #' @@ -2776,7 +2782,7 @@ setMethod("format_string", signature(format = "character", x = "Column"), #' head(tmp)} #' @note from_unixtime since 1.5.0 setMethod("from_unixtime", signature(x = "Column"), - function(x, format = "uuuu-MM-dd HH:mm:ss") { + function(x, format = "yyyy-MM-dd HH:mm:ss") { jc <- callJStatic("org.apache.spark.sql.functions", "from_unixtime", x@jc, format) @@ -2882,7 +2888,7 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), #' @details #' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) -#' samples from U[0.0, 1.0]. +#' samples uniformly distributed in [0.0, 1.0). #' Note: the function is non-deterministic in general case. 
#' #' @rdname column_nonaggregate_functions @@ -3062,7 +3068,7 @@ setMethod("unix_timestamp", signature(x = "Column", format = "missing"), #' @aliases unix_timestamp,Column,character-method #' @note unix_timestamp(Column, character) since 1.5.0 setMethod("unix_timestamp", signature(x = "Column", format = "character"), - function(x, format = "uuuu-MM-dd HH:mm:ss") { + function(x, format = "yyyy-MM-dd HH:mm:ss") { jc <- callJStatic("org.apache.spark.sql.functions", "unix_timestamp", x@jc, format) column(jc) }) @@ -4022,7 +4028,8 @@ setMethod("date_trunc", }) #' @details -#' \code{current_date}: Returns the current date as a date column. +#' \code{current_date}: Returns the current date at the start of query evaluation as a date column. +#' All calls of current_date within the same query return the same value. #' #' @rdname column_datetime_functions #' @aliases current_date current_date,missing-method @@ -4038,7 +4045,8 @@ setMethod("current_date", }) #' @details -#' \code{current_timestamp}: Returns the current timestamp as a timestamp column. +#' \code{current_timestamp}: Returns the current timestamp at the start of query evaluation as +#' a timestamp column. All calls of current_timestamp within the same query return the same value. #' #' @rdname column_datetime_functions #' @aliases current_timestamp current_timestamp,missing-method diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 4134d5cecc888..89e0982429b5f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -528,6 +528,9 @@ setGeneric("persist", function(x, newLevel) { standardGeneric("persist") }) #' @rdname printSchema setGeneric("printSchema", function(x) { standardGeneric("printSchema") }) +#' @rdname registerTempTable-deprecated +setGeneric("registerTempTable", function(x, tableName) { standardGeneric("registerTempTable") }) + #' @rdname rename setGeneric("rename", function(x, ...) { standardGeneric("rename") }) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 2b7995e1e37f6..99d62240a3b2a 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -234,8 +234,8 @@ gapplyInternal <- function(x, func, schema) { if (inherits(schema, "structType")) { checkSchemaInArrow(schema) } else if (is.null(schema)) { - stop(paste0("Arrow optimization does not support 'gapplyCollect' yet. Please disable ", - "Arrow optimization or use 'collect' and 'gapply' APIs instead.")) + stop("Arrow optimization does not support 'gapplyCollect' yet. 
Please disable ", + "Arrow optimization or use 'collect' and 'gapply' APIs instead.") } else { stop("'schema' should be DDL-formatted string or structType.") } diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 6d1edf6b6f3cf..ea2c0b4c0f42f 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -89,8 +89,8 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } if (overwrite) { - message(paste0("Overwrite = TRUE: download and overwrite the tar file", - "and Spark package directory if they exist.")) + message("Overwrite = TRUE: download and overwrite the tar file", + "and Spark package directory if they exist.") } releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL") @@ -103,12 +103,11 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { if (releaseUrl != "") { - message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir)) + message(packageName, " found, setting SPARK_HOME to ", packageLocalDir) } else { - fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s" - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageLocalDir) - message(msg) + message(version, " for Hadoop ", + if (hadoopVersion == "without") "Free build" else hadoopVersion, + " found, setting SPARK_HOME to ", packageLocalDir) } Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) @@ -127,26 +126,23 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, success <- downloadUrl(releaseUrl, packageLocalPath) if (!success) { unlink(packageLocalPath) - stop(paste0("Fetch failed from ", releaseUrl)) + stop("Fetch failed from ", releaseUrl) } } else { robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } } - message(sprintf("Installing to %s", localDir)) + message("Installing to ", localDir) # There are two ways untar can fail - untar could stop() on errors like incomplete block on file # or, tar command can return failure code success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0, error = function(e) { - message(e) - message() + message(e, "\n") FALSE }, warning = function(w) { - # Treat warning as error, add an empty line with message() - message(w) - message() + message(w, "\n") FALSE }) if (!tarExists || overwrite || !success) { @@ -160,7 +156,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (!success) stop("Extract archive failed.") message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) - message(paste("SPARK_HOME set to", packageLocalDir)) + message("SPARK_HOME set to ", packageLocalDir) invisible(packageLocalDir) } @@ -173,7 +169,7 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa if (success) { return() } else { - message(paste0("Unable to download from mirrorUrl: ", mirrorUrl)) + message("Unable to download from mirrorUrl: ", mirrorUrl) } } else { message("MirrorUrl not provided.") @@ -201,11 +197,9 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa # remove any partially downloaded file unlink(packageLocalPath) message("Unable to download from default mirror site: ", mirrorUrl) - msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", - "Please check network connection, Hadoop version,", - "or provide other mirror sites."), - version, ifelse(hadoopVersion == "without", "Free build", 
hadoopVersion)) - stop(msg) + stop("Unable to download Spark ", version, + " for Hadoop ", if (hadoopVersion == "without") "Free build" else hadoopVersion, + ". Please check network connection, Hadoop version, or provide other mirror sites.") } } @@ -214,15 +208,15 @@ getPreferredMirror <- function(version, packageName) { file.path("spark", version, packageName), ".tgz&as_json=1") textLines <- readLines(jsonUrl, warn = FALSE) - rowNum <- grep("\"preferred\"", textLines) + rowNum <- grep('"preferred"', textLines, fixed = TRUE) linePreferred <- textLines[rowNum] - matchInfo <- regexpr("\"[A-Za-z][A-Za-z0-9+-.]*://.+\"", linePreferred) + matchInfo <- regexpr('"[A-Za-z][A-Za-z0-9+-.]*://.+"', linePreferred) if (matchInfo != -1) { startPos <- matchInfo + 1 endPos <- matchInfo + attr(matchInfo, "match.length") - 2 mirrorPreferred <- base::substr(linePreferred, startPos, endPos) mirrorPreferred <- paste0(mirrorPreferred, "spark") - message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) + message("Preferred mirror site found: ", mirrorPreferred) } else { mirrorPreferred <- NULL } @@ -231,24 +225,20 @@ getPreferredMirror <- function(version, packageName) { directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") - fmt <- "Downloading %s for Hadoop %s from:\n- %s" - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageRemotePath) - message(msg) + message("Downloading ", version, " for Hadoop ", + if (hadoopVersion == "without") "Free build" else hadoopVersion, + " from:\n- ", packageRemotePath) downloadUrl(packageRemotePath, packageLocalPath) } downloadUrl <- function(remotePath, localPath) { isFail <- tryCatch(download.file(remotePath, localPath), error = function(e) { - message(e) - message() + message(e, "\n") TRUE }, warning = function(w) { - # Treat warning as error, add an empty line with message() - message(w) - message() + message(w, "\n") TRUE }) !isFail @@ -279,9 +269,9 @@ sparkCachePath <- function() { winAppPath <- Sys.getenv("USERPROFILE", unset = NA) } if (is.na(winAppPath)) { - stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.", - "Please define the environment variable", - "or restart and enter an installation path in localDir.")) + stop("%LOCALAPPDATA% and %USERPROFILE% not found. 
", + "Please define the environment variable ", + "or restart and enter an installation path in localDir.") } else { path <- file.path(winAppPath, "Apache", "Spark", "Cache") } @@ -293,7 +283,7 @@ sparkCachePath <- function() { Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), "spark") } } else { - stop(sprintf("Unknown OS: %s", .Platform$OS.type)) + stop("Unknown OS: ", .Platform$OS.type) } normalizePath(path, mustWork = FALSE) } @@ -322,7 +312,7 @@ installInstruction <- function(mode) { "If you need further help, ", "contact the administrators of the cluster.") } else { - stop(paste0("No instruction found for ", mode, " mode.")) + stop("No instruction found for mode ", mode) } } diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 3ad824e1e634a..53c9775963d23 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -331,8 +331,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients) || col != ncol(upperBoundsOnCoefficients))) { - stop(paste0("dimension of upperBoundsOnCoefficients ", - "is not the same as lowerBoundsOnCoefficients", sep = "")) + stop("dimension of upperBoundsOnCoefficients ", + "is not the same as lowerBoundsOnCoefficients") } if (is.null(lowerBoundsOnCoefficients)) { diff --git a/R/pkg/R/mllib_stat.R b/R/pkg/R/mllib_stat.R index f8c3329359961..6db4d5d4831dd 100644 --- a/R/pkg/R/mllib_stat.R +++ b/R/pkg/R/mllib_stat.R @@ -69,8 +69,7 @@ setMethod("spark.kstest", signature(data = "SparkDataFrame"), function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) { tryCatch(match.arg(nullHypothesis), error = function(e) { - msg <- paste("Distribution", nullHypothesis, "is not supported.") - stop(msg) + stop("Distribution ", nullHypothesis, " is not supported.") }) if (nullHypothesis == "norm") { distParams <- as.numeric(distParams) diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index 9c2e57d3067db..b29381bb900fb 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -906,7 +906,7 @@ setMethod("sampleByKey", for (elem in fractions) { if (elem < 0.0) { - stop(paste("Negative fraction value ", fractions[which(fractions == elem)])) + stop("Negative fraction value ", fractions[which(fractions == elem)]) } } diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 9831fc3cc6d01..7044ede0cc58b 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -99,10 +99,9 @@ print.structType <- function(x, ...) 
{ cat("StructType\n", sapply(x$fields(), function(field) { - paste("|-", "name = \"", field$name(), - "\", type = \"", field$dataType.toString(), - "\", nullable = ", field$nullable(), "\n", - sep = "") + paste0("|-", "name = \"", field$name(), + "\", type = \"", field$dataType.toString(), + "\", nullable = ", field$nullable(), "\n") }), sep = "") } @@ -183,7 +182,7 @@ checkType <- function(type) { # strsplit does not return the final empty string, so check if # the final char is "," if (substr(fieldsString, nchar(fieldsString), nchar(fieldsString)) != ",") { - fields <- strsplit(fieldsString, ",")[[1]] + fields <- strsplit(fieldsString, ",", fixed = TRUE)[[1]] for (field in fields) { m <- regexec("^(.+):(.+)$", field) matchedStrings <- regmatches(field, m) @@ -200,7 +199,7 @@ checkType <- function(type) { }) } - stop(paste("Unsupported type for SparkDataframe:", type)) + stop("Unsupported type for SparkDataframe: ", type) } #' @param type The data type of the field diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index cb3c1c59d12ed..7760d9be16f0b 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -84,7 +84,7 @@ writeObject <- function(con, object, writeType = TRUE) { Date = writeDate(con, object), POSIXlt = writeTime(con, object), POSIXct = writeTime(con, object), - stop(paste("Unsupported type for serialization", type))) + stop("Unsupported type for serialization ", type)) } writeVoid <- function(con) { @@ -158,7 +158,7 @@ writeType <- function(con, class) { Date = "D", POSIXlt = "t", POSIXct = "t", - stop(paste("Unsupported type for serialization", class))) + stop("Unsupported type for serialization ", class)) writeBin(charToRaw(type), con) } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index cdb59093781fb..e4a11a5f78a71 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -88,6 +88,49 @@ sparkR.stop <- function() { sparkR.session.stop() } +#' (Deprecated) Initialize a new Spark Context +#' +#' This function initializes a new SparkContext. +#' +#' @param master The Spark master URL +#' @param appName Application name to register with cluster manager +#' @param sparkHome Spark Home directory +#' @param sparkEnvir Named list of environment variables to set on worker nodes +#' @param sparkExecutorEnv Named list of environment variables to be used when launching executors +#' @param sparkJars Character vector of jar files to pass to the worker nodes +#' @param sparkPackages Character vector of package coordinates +#' @seealso \link{sparkR.session} +#' @rdname sparkR.init-deprecated +#' @examples +#'\dontrun{ +#' sc <- sparkR.init("local[2]", "SparkR", "/home/spark") +#' sc <- sparkR.init("local[2]", "SparkR", "/home/spark", +#' list(spark.executor.memory="1g")) +#' sc <- sparkR.init("yarn-client", "SparkR", "/home/spark", +#' list(spark.executor.memory="4g"), +#' list(LD_LIBRARY_PATH="/directory of JVM libraries (libjvm.so) on workers/"), +#' c("one.jar", "two.jar", "three.jar"), +#' c("com.databricks:spark-avro_2.11:2.0.1")) +#'} +#' @note sparkR.init since 1.4.0 +sparkR.init <- function( + master = "", + appName = "SparkR", + sparkHome = Sys.getenv("SPARK_HOME"), + sparkEnvir = list(), + sparkExecutorEnv = list(), + sparkJars = "", + sparkPackages = "") { + .Deprecated("sparkR.session") + sparkR.sparkContext(master, + appName, + sparkHome, + convertNamedListToEnv(sparkEnvir), + convertNamedListToEnv(sparkExecutorEnv), + sparkJars, + sparkPackages) +} + # Internal function to handle creating the SparkContext. 
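The R hunks above (group.R, install.R, schema.R, sparkR.R) keep dropping paste()/paste0()/sprintf() wrappers because message(), warning() and stop() already concatenate their ... arguments; the only catch is that separators must now be written out, which is why the replacement strings carry explicit leading or trailing spaces. A minimal sketch with made-up values, not taken from the patch:

pkg <- "spark-3.0.2-bin-hadoop2.7"   # illustrative values only
dir <- "~/spark/cache"
message(paste(pkg, "found, setting SPARK_HOME to", dir))   # old style: paste() inserts the spaces
message(pkg, " found, setting SPARK_HOME to ", dir)        # new style: same text, spaces written explicitly
# stop() and warning() behave the same way, e.g.
# stop("Fetch failed from ", url) is equivalent to stop(paste0("Fetch failed from ", url)).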
sparkR.sparkContext <- function( master = "", @@ -111,8 +154,8 @@ sparkR.sparkContext <- function( connectionTimeout <- as.numeric(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) if (existingPort != "") { if (length(packages) != 0) { - warning(paste("sparkPackages has no effect when using spark-submit or sparkR shell", - " please use the --packages commandline instead", sep = ",")) + warning("sparkPackages has no effect when using spark-submit or sparkR shell, ", + "please use the --packages commandline instead") } backendPort <- existingPort authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET") @@ -201,7 +244,7 @@ sparkR.sparkContext <- function( uriSep <- "////" } localJarPaths <- lapply(jars, - function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) }) + function(j) { utils::URLencode(paste0("file:", uriSep, j)) }) # Set the start time to identify jobjs # Seconds resolution is good enough for this purpose, so use ints @@ -229,6 +272,61 @@ sparkR.sparkContext <- function( sc } +#' (Deprecated) Initialize a new SQLContext +#' +#' This function creates a SparkContext from an existing JavaSparkContext and +#' then uses it to initialize a new SQLContext +#' +#' Starting SparkR 2.0, a SparkSession is initialized and returned instead. +#' This API is deprecated and kept for backward compatibility only. +#' +#' @param jsc The existing JavaSparkContext created with SparkR.init() +#' @seealso \link{sparkR.session} +#' @rdname sparkRSQL.init-deprecated +#' @examples +#'\dontrun{ +#' sc <- sparkR.init() +#' sqlContext <- sparkRSQL.init(sc) +#'} +#' @note sparkRSQL.init since 1.4.0 +sparkRSQL.init <- function(jsc = NULL) { + .Deprecated("sparkR.session") + + if (exists(".sparkRsession", envir = .sparkREnv)) { + return(get(".sparkRsession", envir = .sparkREnv)) + } + + # Default to without Hive support for backward compatibility. + sparkR.session(enableHiveSupport = FALSE) +} + +#' (Deprecated) Initialize a new HiveContext +#' +#' This function creates a HiveContext from an existing JavaSparkContext +#' +#' Starting SparkR 2.0, a SparkSession is initialized and returned instead. +#' This API is deprecated and kept for backward compatibility only. +#' +#' @param jsc The existing JavaSparkContext created with SparkR.init() +#' @seealso \link{sparkR.session} +#' @rdname sparkRHive.init-deprecated +#' @examples +#'\dontrun{ +#' sc <- sparkR.init() +#' sqlContext <- sparkRHive.init(sc) +#'} +#' @note sparkRHive.init since 1.4.0 +sparkRHive.init <- function(jsc = NULL) { + .Deprecated("sparkR.session") + + if (exists(".sparkRsession", envir = .sparkREnv)) { + return(get(".sparkRsession", envir = .sparkREnv)) + } + + # Default to without Hive support for backward compatibility. + sparkR.session(enableHiveSupport = TRUE) +} + #' Get the existing SparkSession or initialize a new SparkSession. #' #' SparkSession is the entry point into SparkR. \code{sparkR.session} gets the existing @@ -337,12 +435,13 @@ sparkR.session <- function( # Check if version number of SparkSession matches version number of SparkR package jvmVersion <- callJMethod(sparkSession, "version") # Remove -SNAPSHOT from jvm versions - jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion) + jvmVersionStrip <- gsub("-SNAPSHOT", "", jvmVersion, fixed = TRUE) rPackageVersion <- paste0(packageVersion("SparkR")) if (jvmVersionStrip != rPackageVersion) { - warning(paste("Version mismatch between Spark JVM and SparkR package. 
JVM version was", - jvmVersion, ", while R package version was", rPackageVersion)) + warning("Version mismatch between Spark JVM and SparkR package. ", + "JVM version was ", jvmVersion, + ", while R package version was ", rPackageVersion) } sparkSession @@ -508,7 +607,7 @@ getClientModeSparkSubmitOpts <- function(submitOps, sparkEnvirMap) { # process only if --option is not already specified if (!is.null(opsValue) && nchar(opsValue) > 1 && - !grepl(sparkConfToSubmitOps[[conf]], submitOps)) { + !grepl(sparkConfToSubmitOps[[conf]], submitOps, fixed = TRUE)) { # put "" around value in case it has spaces paste0(sparkConfToSubmitOps[[conf]], " \"", opsValue, "\" ") } else { diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 55f75508e88ff..5d48a9eee2799 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -88,33 +88,27 @@ specialtypeshandle <- function(type) { checkSchemaInArrow <- function(schema) { stopifnot(inherits(schema, "structType")) - requireNamespace1 <- requireNamespace - if (!requireNamespace1("arrow", quietly = TRUE)) { + if (!requireNamespace("arrow", quietly = TRUE)) { stop("'arrow' package should be installed.") } # Both cases below produce a corrupt value for unknown reason. It needs to be investigated. - if (any(sapply(schema$fields(), function(x) x$dataType.toString() == "FloatType"))) { + field_strings <- sapply(schema$fields(), function(x) x$dataType.toString()) + if (any(field_strings == "FloatType")) { stop("Arrow optimization in R does not support float type yet.") } - if (any(sapply(schema$fields(), function(x) x$dataType.toString() == "BinaryType"))) { + if (any(field_strings == "BinaryType")) { stop("Arrow optimization in R does not support binary type yet.") } - if (any(sapply(schema$fields(), - function(x) startsWith(x$dataType.toString(), - "ArrayType")))) { + if (any(startsWith(field_strings, "ArrayType"))) { stop("Arrow optimization in R does not support array type yet.") } # Arrow optimization in Spark does not yet support both cases below. - if (any(sapply(schema$fields(), - function(x) startsWith(x$dataType.toString(), - "StructType")))) { + if (any(startsWith(field_strings, "StructType"))) { stop("Arrow optimization in R does not support nested struct type yet.") } - if (any(sapply(schema$fields(), - function(x) startsWith(x$dataType.toString(), - "MapType")))) { + if (any(startsWith(field_strings, "MapType"))) { stop("Arrow optimization in R does not support map type yet.") } } diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index a8c1ddb3dd20b..cef2fa9b47440 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -46,9 +46,9 @@ convertJListToRList <- function(jList, flatten, logicalUpperBound = NULL, res <- list(unserialize(keyBytes), unserialize(valBytes)) } else { - stop(paste("utils.R: convertJListToRList only supports", - "RDD[Array[Byte]] and", - "JavaPairRDD[Array[Byte], Array[Byte]] for now")) + stop("utils.R: convertJListToRList only supports ", + "RDD[Array[Byte]] and ", + "JavaPairRDD[Array[Byte], Array[Byte]] for now") } } else { if (inherits(obj, "raw")) { @@ -137,7 +137,7 @@ hashCode <- function(key) { as.integer(hashC) } } else { - warning(paste("Could not hash object, returning 0", sep = "")) + warning("Could not hash object, returning 0") as.integer(0) } } @@ -354,8 +354,8 @@ varargsToStrEnv <- function(...) { } else { value <- pairs[[name]] if (!(is.logical(value) || is.numeric(value) || is.character(value) || is.null(value))) { - stop(paste0("Unsupported type for ", name, " : ", class(value), - ". 
Supported types are logical, numeric, character and NULL."), call. = FALSE) + stop("Unsupported type for ", name, " : ", toString(class(value)), ". ", + "Supported types are logical, numeric, character and NULL.", call. = FALSE) } if (is.logical(value)) { env[[name]] <- tolower(as.character(value)) @@ -369,8 +369,7 @@ varargsToStrEnv <- function(...) { } if (length(ignoredNames) != 0) { - warning(paste0("Unnamed arguments ignored: ", paste(ignoredNames, collapse = ", "), "."), - call. = FALSE) + warning("Unnamed arguments ignored: ", toString(ignoredNames), ".", call. = FALSE) } env } @@ -449,7 +448,7 @@ storageLevelToString <- function(levelObj) { # the user to type (for example) `5` instead of `5L` to avoid a confusing error message. numToInt <- function(num) { if (as.integer(num) != num) { - warning(paste("Coercing", as.list(sys.call())[[2]], "to integer.")) + warning("Coercing ", as.list(sys.call())[[2L]], " to integer.") } as.integer(num) } @@ -530,7 +529,10 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { # Namespaces other than "SparkR" will not be searched. if (!isNamespace(func.env) || (getNamespaceName(func.env) == "SparkR" && - !(nodeChar %in% getNamespaceExports("SparkR")))) { + !(nodeChar %in% getNamespaceExports("SparkR")) && + # Note that generic S4 methods should not be set to the environment of + # cleaned closure. It does not work with R 4.0.0+. See also SPARK-31918. + nodeChar != "" && !methods::isGeneric(nodeChar, func.env))) { # Only include SparkR internals. # Set parameter 'inherits' to FALSE since we do not need to search in @@ -650,8 +652,8 @@ mergePartitions <- function(rdd, zip) { # For zip operation, check if corresponding partitions # of both RDDs have the same number of elements. if (zip && lengthOfKeys != lengthOfValues) { - stop(paste("Can only zip RDDs with same number of elements", - "in each pair of corresponding partitions.")) + stop("Can only zip RDDs with same number of elements ", + "in each pair of corresponding partitions.") } if (lengthOfKeys > 1) { @@ -804,7 +806,7 @@ handledCallJMethod <- function(obj, method, ...) { captureJVMException <- function(e, method) { rawmsg <- as.character(e) - if (any(grep("^Error in .*?: ", rawmsg))) { + if (any(grepl("^Error in .*?: ", rawmsg))) { # If the exception message starts with "Error in ...", this is possibly # "Error in invokeJava(...)". Here, it replaces the characters to # `paste("Error in", method, ":")` in order to identify which function @@ -818,54 +820,58 @@ captureJVMException <- function(e, method) { } # StreamingQueryException could wrap an IllegalArgumentException, so look for that first - if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ", stacktrace))) { + if (any(grepl("org.apache.spark.sql.streaming.StreamingQueryException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.streaming.StreamingQueryException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "streaming query error - ", first), call. = FALSE) - } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace))) { + stop(rmsg, "streaming query error - ", first, call. = FALSE) + } else if (any(grepl("java.lang.IllegalArgumentException: ", stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "java.lang.IllegalArgumentException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. 
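The captureJVMException rewrite here switches any(grep(...)) to any(grepl(...)) and matches the JVM exception class names with fixed = TRUE. A short sketch against a fabricated stack trace (the string is not from the patch):

stacktrace <- "org.apache.spark.sql.AnalysisException: cannot resolve 'x'"  # fabricated example
# grepl() returns a logical vector, which is what any() expects, and fixed = TRUE
# matches the class name literally instead of letting "." act as a regex wildcard.
any(grepl("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))  # TRUE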
rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "illegal argument - ", first), call. = FALSE) - } else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace))) { + stop(rmsg, "illegal argument - ", first, call. = FALSE) + } else if (any(grepl("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.AnalysisException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "analysis error - ", first), call. = FALSE) + stop(rmsg, "analysis error - ", first, call. = FALSE) } else - if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", stacktrace))) { + if (any(grepl("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "no such database - ", first), call. = FALSE) + stop(rmsg, "no such database - ", first, call. = FALSE) } else - if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", stacktrace))) { + if (any(grepl("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "no such table - ", first), call. = FALSE) - } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ", stacktrace))) { + stop(rmsg, "no such table - ", first, call. = FALSE) + } else if (any(grepl("org.apache.spark.sql.catalyst.parser.ParseException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.parser.ParseException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "parse error - ", first), call. = FALSE) + stop(rmsg, "parse error - ", first, call. = FALSE) } else { stop(stacktrace, call. 
= FALSE) } diff --git a/R/pkg/inst/profile/general.R b/R/pkg/inst/profile/general.R index 3efb460846fc2..8c75c19ca7ac3 100644 --- a/R/pkg/inst/profile/general.R +++ b/R/pkg/inst/profile/general.R @@ -16,10 +16,6 @@ # .First <- function() { - if (utils::compareVersion(paste0(R.version$major, ".", R.version$minor), "3.4.0") == -1) { - warning("Support for R prior to version 3.4 is deprecated since Spark 3.0.0") - } - packageDir <- Sys.getenv("SPARKR_PACKAGE_DIR") dirs <- strsplit(packageDir, ",")[[1]] .libPaths(c(dirs, .libPaths())) diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R index e4e0d032997de..f6c20e1a5ebc3 100644 --- a/R/pkg/inst/profile/shell.R +++ b/R/pkg/inst/profile/shell.R @@ -16,10 +16,6 @@ # .First <- function() { - if (utils::compareVersion(paste0(R.version$major, ".", R.version$minor), "3.4.0") == -1) { - warning("Support for R prior to version 3.4 is deprecated since Spark 3.0.0") - } - home <- Sys.getenv("SPARK_HOME") .libPaths(c(file.path(home, "R", "lib"), .libPaths())) Sys.setenv(NOAWT = 1) diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index b9139154bc165..f86872d727a1d 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -25,7 +25,10 @@ test_that("Check masked functions", { namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", "summary", "transform", "drop", "window", "as.data.frame", "union", "not") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + version <- packageVersion("base") + is33Above <- as.numeric(version$major) >= 3 && as.numeric(version$minor) >= 3 + is40Above <- as.numeric(version$major) >= 4 + if (is33Above || is40Above) { namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) } masked <- conflicts(detail = TRUE)$`package:SparkR` diff --git a/R/pkg/tests/fulltests/test_includePackage.R b/R/pkg/tests/fulltests/test_includePackage.R index 916361ff4c797..1d16b260c4c52 100644 --- a/R/pkg/tests/fulltests/test_includePackage.R +++ b/R/pkg/tests/fulltests/test_includePackage.R @@ -27,8 +27,8 @@ rdd <- parallelize(sc, nums, 2L) test_that("include inside function", { # Only run the test if plyr is installed. - if ("plyr" %in% rownames(installed.packages())) { - suppressPackageStartupMessages(library(plyr)) + if ("plyr" %in% rownames(installed.packages()) && + suppressPackageStartupMessages(suppressWarnings(library(plyr, logical.return = TRUE)))) { generateData <- function(x) { suppressPackageStartupMessages(library(plyr)) attach(airquality) @@ -44,8 +44,8 @@ test_that("include inside function", { test_that("use include package", { # Only run the test if plyr is installed. 
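The test_includePackage.R change being made here only runs the body when plyr both is installed and actually loads; library(..., logical.return = TRUE) returns FALSE (with a warning) instead of throwing, so a broken installation skips the test rather than failing the suite. A condensed sketch of that guard:

plyr_usable <- "plyr" %in% rownames(installed.packages()) &&
  suppressPackageStartupMessages(suppressWarnings(library(plyr, logical.return = TRUE)))
if (plyr_usable) {
  # plyr-dependent assertions go here
}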
- if ("plyr" %in% rownames(installed.packages())) { - suppressPackageStartupMessages(library(plyr)) + if ("plyr" %in% rownames(installed.packages()) && + suppressPackageStartupMessages(suppressWarnings(library(plyr, logical.return = TRUE)))) { generateData <- function(x) { attach(airquality) result <- transform(Ozone, logOzone = log(Ozone)) diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 2da3a022f8941..2a09c23bc3617 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -34,7 +34,7 @@ test_that("spark.svmLinear", { summary <- summary(model) # test summary coefficients return matrix type - expect_true(class(summary$coefficients) == "matrix") + expect_true(any(class(summary$coefficients) == "matrix")) expect_true(class(summary$coefficients[, 1]) == "numeric") coefs <- summary$coefficients[, "Estimate"] @@ -130,7 +130,7 @@ test_that("spark.logit", { summary <- summary(model) # test summary coefficients return matrix type - expect_true(class(summary$coefficients) == "matrix") + expect_true(any(class(summary$coefficients) == "matrix")) expect_true(class(summary$coefficients[, 1]) == "numeric") versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00) @@ -242,8 +242,8 @@ test_that("spark.logit", { # Test binomial logistic regression against two classes with upperBoundsOnCoefficients # and upperBoundsOnIntercepts u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) - model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, - upperBoundsOnIntercepts = 1.0) + model <- suppressWarnings(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, + upperBoundsOnIntercepts = 1.0)) summary <- summary(model) coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000) coefs <- summary$coefficients[, "Estimate"] @@ -255,8 +255,8 @@ test_that("spark.logit", { # Test binomial logistic regression against two classes with lowerBoundsOnCoefficients # and lowerBoundsOnIntercepts l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4) - model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, - lowerBoundsOnIntercepts = 0.0) + model <- suppressWarnings(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, + lowerBoundsOnIntercepts = 0.0)) summary <- summary(model) coefsR <- c(0, 0, -1, 0, 1.902192) coefs <- summary$coefficients[, "Estimate"] @@ -268,9 +268,9 @@ test_that("spark.logit", { # Test multinomial logistic regression with lowerBoundsOnCoefficients # and lowerBoundsOnIntercepts l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4) - model <- spark.logit(training, Species ~ ., family = "multinomial", - lowerBoundsOnCoefficients = l, - lowerBoundsOnIntercepts = as.array(c(0.0, 0.0))) + model <- suppressWarnings(spark.logit(training, Species ~ ., family = "multinomial", + lowerBoundsOnCoefficients = l, + lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))) summary <- summary(model) versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429) virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021) diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R index 028ad574b8134..f180aeea28150 100644 --- a/R/pkg/tests/fulltests/test_mllib_clustering.R +++ b/R/pkg/tests/fulltests/test_mllib_clustering.R @@ -171,7 +171,7 @@ test_that("spark.kmeans", { expect_equal(sort(collect(distinct(select(cluster, 
"prediction")))$prediction), c(0, 1)) # test summary coefficients return matrix type - expect_true(class(summary.model$coefficients) == "matrix") + expect_true(any(class(summary.model$coefficients) == "matrix")) expect_true(class(summary.model$coefficients[1, ]) == "numeric") # Test model save/load diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R b/R/pkg/tests/fulltests/test_mllib_regression.R index b40c4cb9a9694..929cd27211fdc 100644 --- a/R/pkg/tests/fulltests/test_mllib_regression.R +++ b/R/pkg/tests/fulltests/test_mllib_regression.R @@ -116,7 +116,7 @@ test_that("spark.glm summary", { rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset)) # test summary coefficients return matrix type - expect_true(class(stats$coefficients) == "matrix") + expect_true(any(class(stats$coefficients) == "matrix")) expect_true(class(stats$coefficients[, 1]) == "numeric") coefs <- stats$coefficients diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 3b3768f7e2715..fb0ed7451cdb9 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -106,6 +106,15 @@ if (is_windows()) { Sys.setenv(TZ = "GMT") } +test_that("calling sparkRSQL.init returns existing SQL context", { + sqlContext <- suppressWarnings(sparkRSQL.init(sc)) + expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext) +}) + +test_that("calling sparkRSQL.init returns existing SparkSession", { + expect_equal(suppressWarnings(sparkRSQL.init(sc)), sparkSession) +}) + test_that("calling sparkR.session returns existing SparkSession", { expect_equal(sparkR.session(), sparkSession) }) @@ -656,10 +665,10 @@ test_that("test tableNames and tables", { expect_true("tableName" %in% colnames(tables())) expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables()))) - createOrReplaceTempView(df, "table2") + suppressWarnings(registerTempTable(df, "table2")) tables <- listTables() expect_equal(count(tables), count + 2) - dropTempView("table1") + suppressWarnings(dropTempTable("table1")) expect_true(dropTempView("table2")) tables <- listTables() @@ -1810,7 +1819,8 @@ test_that("string operators", { expect_true(first(select(df, endsWith(df$name, "el")))[[1]]) expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi") expect_equal(first(select(df, substr(df$name, 4, 6)))[[1]], "hae") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + version <- packageVersion("base") + if (as.numeric(version$major) >= 3 && as.numeric(version$minor) >= 3) { expect_true(startsWith("Hello World", "Hello")) expect_false(endsWith("Hello World", "a")) } @@ -2462,7 +2472,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { error_msg <- paste("joinType must be one of the following types:", "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',", "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", - "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.") + "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti', 'left_anti'") expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg) merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE) @@ -2497,8 +2507,8 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { writeLines(mockLines3, jsonPath3) df3 <- read.json(jsonPath3) expect_error(merge(df, df3), - paste("The following column name: name_y occurs more than once in the 'DataFrame'.", - "Please use different suffixes 
for the intersected columns.", sep = "")) + paste0("The following column name: name_y occurs more than once in the 'DataFrame'.", + "Please use different suffixes for the intersected columns.")) unlink(jsonPath2) unlink(jsonPath3) @@ -2541,20 +2551,20 @@ test_that("toJSON() on DataFrame", { test_that("showDF()", { df <- read.json(jsonPath) - expected <- paste("+----+-------+\n", - "| age| name|\n", - "+----+-------+\n", - "|null|Michael|\n", - "| 30| Andy|\n", - "| 19| Justin|\n", - "+----+-------+\n", sep = "") - expected2 <- paste("+---+----+\n", - "|age|name|\n", - "+---+----+\n", - "|nul| Mic|\n", - "| 30| And|\n", - "| 19| Jus|\n", - "+---+----+\n", sep = "") + expected <- paste("+----+-------+", + "| age| name|", + "+----+-------+", + "|null|Michael|", + "| 30| Andy|", + "| 19| Justin|", + "+----+-------+\n", sep = "\n") + expected2 <- paste("+---+----+", + "|age|name|", + "+---+----+", + "|nul| Mic|", + "| 30| And|", + "| 19| Jus|", + "+---+----+\n", sep = "\n") expect_output(showDF(df), expected) expect_output(showDF(df, truncate = 3), expected2) }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R index 97972753a78fa..06743488fdf11 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R +++ b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R @@ -19,7 +19,10 @@ library(testthat) context("SparkSQL Arrow optimization") -sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) +sparkSession <- sparkR.session( + master = sparkRTestMaster, + enableHiveSupport = FALSE, + sparkConfig = list(spark.sql.execution.arrow.sparkr.enabled = "true")) test_that("createDataFrame/collect Arrow optimization", { skip_if_not_installed("arrow") @@ -35,29 +38,13 @@ test_that("createDataFrame/collect Arrow optimization", { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) }) - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - expect_equal(collect(createDataFrame(mtcars)), expected) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + expect_equal(collect(createDataFrame(mtcars)), expected) }) test_that("createDataFrame/collect Arrow optimization - many partitions (partition order test)", { skip_if_not_installed("arrow") - - conf <- callJMethod(sparkSession, "conf") - arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]] - - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - expect_equal(collect(createDataFrame(mtcars, numPartitions = 32)), - collect(createDataFrame(mtcars, numPartitions = 1))) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + expect_equal(collect(createDataFrame(mtcars, numPartitions = 32)), + collect(createDataFrame(mtcars, numPartitions = 1))) }) test_that("createDataFrame/collect Arrow optimization - type specification", { @@ -81,13 +68,7 @@ test_that("createDataFrame/collect Arrow optimization - type specification", { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) }) - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - expect_equal(collect(createDataFrame(rdf)), expected) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + expect_equal(collect(createDataFrame(rdf)), expected) }) test_that("dapply() Arrow optimization", { @@ 
-98,36 +79,30 @@ test_that("dapply() Arrow optimization", { arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]] callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false") - tryCatch({ - ret <- dapply(df, - function(rdf) { - stopifnot(is.data.frame(rdf)) - rdf - }, - schema(df)) - expected <- collect(ret) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) - - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") tryCatch({ ret <- dapply(df, function(rdf) { stopifnot(is.data.frame(rdf)) - # mtcars' hp is more then 50. - stopifnot(all(rdf$hp > 50)) rdf }, schema(df)) - actual <- collect(ret) - expect_equal(actual, expected) - expect_equal(count(ret), nrow(mtcars)) + expected <- collect(ret) }, finally = { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) }) + + ret <- dapply(df, + function(rdf) { + stopifnot(is.data.frame(rdf)) + # mtcars' hp is more then 50. + stopifnot(all(rdf$hp > 50)) + rdf + }, + schema(df)) + actual <- collect(ret) + expect_equal(actual, expected) + expect_equal(count(ret), nrow(mtcars)) }) test_that("dapply() Arrow optimization - type specification", { @@ -154,15 +129,9 @@ test_that("dapply() Arrow optimization - type specification", { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) }) - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - ret <- dapply(df, function(rdf) { rdf }, schema(df)) - actual <- collect(ret) - expect_equal(actual, expected) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + ret <- dapply(df, function(rdf) { rdf }, schema(df)) + actual <- collect(ret) + expect_equal(actual, expected) }) test_that("dapply() Arrow optimization - type specification (date and timestamp)", { @@ -170,18 +139,8 @@ test_that("dapply() Arrow optimization - type specification (date and timestamp) rdf <- data.frame(list(list(a = as.Date("1990-02-24"), b = as.POSIXct("1990-02-24 12:34:56")))) df <- createDataFrame(rdf) - - conf <- callJMethod(sparkSession, "conf") - arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]] - - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - ret <- dapply(df, function(rdf) { rdf }, schema(df)) - expect_equal(collect(ret), rdf) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + ret <- dapply(df, function(rdf) { rdf }, schema(df)) + expect_equal(collect(ret), rdf) }) test_that("gapply() Arrow optimization", { @@ -209,28 +168,22 @@ test_that("gapply() Arrow optimization", { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) }) - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - ret <- gapply(df, - "gear", - function(key, grouped) { - if (length(key) > 0) { - stopifnot(is.numeric(key[[1]])) - } - stopifnot(is.data.frame(grouped)) - stopifnot(length(colnames(grouped)) == 11) - # mtcars' hp is more then 50. 
- stopifnot(all(grouped$hp > 50)) - grouped - }, - schema(df)) - actual <- collect(ret) - expect_equal(actual, expected) - expect_equal(count(ret), nrow(mtcars)) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + ret <- gapply(df, + "gear", + function(key, grouped) { + if (length(key) > 0) { + stopifnot(is.numeric(key[[1]])) + } + stopifnot(is.data.frame(grouped)) + stopifnot(length(colnames(grouped)) == 11) + # mtcars' hp is more then 50. + stopifnot(all(grouped$hp > 50)) + grouped + }, + schema(df)) + actual <- collect(ret) + expect_equal(actual, expected) + expect_equal(count(ret), nrow(mtcars)) }) test_that("gapply() Arrow optimization - type specification", { @@ -250,26 +203,19 @@ test_that("gapply() Arrow optimization - type specification", { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "false") tryCatch({ ret <- gapply(df, - "a", - function(key, grouped) { grouped }, schema(df)) + "a", + function(key, grouped) { grouped }, schema(df)) expected <- collect(ret) }, finally = { callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) }) - - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - ret <- gapply(df, - "a", - function(key, grouped) { grouped }, schema(df)) - actual <- collect(ret) - expect_equal(actual, expected) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + ret <- gapply(df, + "a", + function(key, grouped) { grouped }, schema(df)) + actual <- collect(ret) + expect_equal(actual, expected) }) test_that("gapply() Arrow optimization - type specification (date and timestamp)", { @@ -277,39 +223,30 @@ test_that("gapply() Arrow optimization - type specification (date and timestamp) rdf <- data.frame(list(list(a = as.Date("1990-02-24"), b = as.POSIXct("1990-02-24 12:34:56")))) df <- createDataFrame(rdf) + ret <- gapply(df, + "a", + function(key, grouped) { grouped }, schema(df)) + expect_equal(collect(ret), rdf) +}) - conf <- callJMethod(sparkSession, "conf") - arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]] +test_that("Arrow optimization - unsupported types", { + skip_if_not_installed("arrow") - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - ret <- gapply(df, - "a", - function(key, grouped) { grouped }, schema(df)) - expect_equal(collect(ret), rdf) - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + expect_error(checkSchemaInArrow(structType("a FLOAT")), "not support float type") + expect_error(checkSchemaInArrow(structType("a BINARY")), "not support binary type") + expect_error(checkSchemaInArrow(structType("a ARRAY")), "not support array type") + expect_error(checkSchemaInArrow(structType("a MAP")), "not support map type") + expect_error(checkSchemaInArrow(structType("a STRUCT")), + "not support nested struct type") }) -test_that("Arrow optimization - unsupported types", { +test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", { skip_if_not_installed("arrow") + df <- createDataFrame(list(list(a = 1L, b = "a"))) - conf <- callJMethod(sparkSession, "conf") - arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]] - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true") - tryCatch({ - expect_error(checkSchemaInArrow(structType("a FLOAT")), "not support float type") 
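These test_sparkSQL_arrow.R hunks can drop the per-test callJMethod(conf, "set", ...) and tryCatch(..., finally = ...) toggling because the session created at the top of the file now enables Arrow once via sparkConfig. A sketch of that setup (local master chosen for illustration; it assumes the 'arrow' package is installed):

sparkR.session(
  master = "local[2]",                 # illustrative master
  enableHiveSupport = FALSE,
  sparkConfig = list(spark.sql.execution.arrow.sparkr.enabled = "true"))
collect(createDataFrame(mtcars))       # now exercises the Arrow code path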
- expect_error(checkSchemaInArrow(structType("a BINARY")), "not support binary type") - expect_error(checkSchemaInArrow(structType("a ARRAY")), "not support array type") - expect_error(checkSchemaInArrow(structType("a MAP")), "not support map type") - expect_error(checkSchemaInArrow(structType("a STRUCT")), - "not support nested struct type") - }, - finally = { - callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled) - }) + expect_error( + count(gapply(df, "a", function(key, group) { group }, structType("a int, b int"))), + "expected IntegerType, IntegerType, got IntegerType, StringType") }) sparkR.session.stop() diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index bf02ecdad66ff..0a6a6c4ea3d1d 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -60,22 +60,49 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) { # set random seed for predictable results. mostly for base's sample() in tree and classification set.seed(42) - # TODO (SPARK-30663) To be removed once testthat 1.x is removed from all builds - if (grepl("^1\\..*", packageVersion("testthat"))) { + test_runner <- if (packageVersion("testthat")$major <= 1) { # testthat 1.x - test_runner <- testthat:::run_tests - reporter <- "summary" + function(path, package, reporter, filter) { + testthat:::run_tests( + test_path = path, + package = package, + filter = filter, + reporter = reporter + ) + } + } else if (packageVersion("testthat")$major == 2) { + # testthat >= 2.0.0, < 3.0.0 + function(path, package, reporter, filter) { + testthat:::test_package_dir( + test_path = path, + package = package, + filter = filter, + reporter = reporter + ) + } + } else { + # testthat >= 3.0.0 + testthat::test_dir + } + reporter <- if (packageVersion("testthat")$major <= 1) { + "summary" } else { - # testthat >= 2.0.0 - test_runner <- testthat:::test_package_dir - reporter <- testthat::default_reporter() + dir.create("target/test-reports", showWarnings = FALSE) + MultiReporter$new(list( + SummaryReporter$new(), + JunitReporter$new( + file = file.path(getwd(), "target/test-reports/test-results.xml") + ) + )) } - test_runner("SparkR", - file.path(sparkRDir, "pkg", "tests", "fulltests"), - NULL, - reporter) + test_runner( + path = file.path(sparkRDir, "pkg", "tests", "fulltests"), + package = "SparkR", + reporter = reporter, + filter = NULL + ) } SparkR:::uninstallDownloadedSpark() diff --git a/appveyor.yml b/appveyor.yml index 5d98260265b1a..e0d2396bcbed4 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -41,15 +41,15 @@ cache: install: # Install maven and dependencies - ps: .\dev\appveyor-install-dependencies.ps1 - # Required package for R unit tests - - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')" - - cmd: R -e "install.packages(c('crayon', 'praise', 'R6', 'testthat'), repos='https://cloud.r-project.org/')" + # Required package for R unit tests. xml2 is required to use jUnit reporter in testthat. + - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')" - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival'); packageVersion('arrow')" build_script: # '-Djna.nosys=true' is required to avoid kernel32.dll load failure. # See SPARK-28759. 
- - cmd: mvn -DskipTests -Psparkr -Phive -Djna.nosys=true package + # Ideally we should check the tests related to Hive in SparkR as well (SPARK-31745). + - cmd: mvn -DskipTests -Psparkr -Djna.nosys=true package environment: NOT_CRAN: true diff --git a/assembly/pom.xml b/assembly/pom.xml index 193ad3d671bcf..e265bb948d9d4 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../pom.xml diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index 68fafbb848001..6d74f8328aea2 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -172,6 +172,7 @@ function build { local BASEDOCKERFILE=${BASEDOCKERFILE:-"kubernetes/dockerfiles/spark/Dockerfile"} local PYDOCKERFILE=${PYDOCKERFILE:-false} local RDOCKERFILE=${RDOCKERFILE:-false} + local ARCHS=${ARCHS:-"--platform linux/amd64,linux/arm64"} (cd $(img_ctx_dir base) && docker build $NOCACHEARG "${BUILD_ARGS[@]}" \ -t $(image_ref spark) \ @@ -179,6 +180,11 @@ function build { if [ $? -ne 0 ]; then error "Failed to build Spark JVM Docker image, please refer to Docker build output for details." fi + if [ "${CROSS_BUILD}" != "false" ]; then + (cd $(img_ctx_dir base) && docker buildx build $ARCHS $NOCACHEARG "${BUILD_ARGS[@]}" \ + -t $(image_ref spark) \ + -f "$BASEDOCKERFILE" .) + fi if [ "${PYDOCKERFILE}" != "false" ]; then (cd $(img_ctx_dir pyspark) && docker build $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ @@ -187,6 +193,11 @@ function build { if [ $? -ne 0 ]; then error "Failed to build PySpark Docker image, please refer to Docker build output for details." fi + if [ "${CROSS_BUILD}" != "false" ]; then + (cd $(img_ctx_dir pyspark) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ + -t $(image_ref spark-py) \ + -f "$PYDOCKERFILE" .) + fi fi if [ "${RDOCKERFILE}" != "false" ]; then @@ -196,6 +207,11 @@ function build { if [ $? -ne 0 ]; then error "Failed to build SparkR Docker image, please refer to Docker build output for details." fi + if [ "${CROSS_BUILD}" != "false" ]; then + (cd $(img_ctx_dir sparkr) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" \ + -t $(image_ref spark-r) \ + -f "$RDOCKERFILE" .) + fi fi } @@ -227,6 +243,8 @@ Options: -n Build docker image with --no-cache -u uid UID to use in the USER directive to set the user the main Spark process runs as inside the resulting container + -X Use docker buildx to cross build. Automatically pushes. + See https://docs.docker.com/buildx/working-with-buildx/ for steps to setup buildx. -b arg Build arg to build or push the image. For multiple build args, this option needs to be used separately for each build arg. 
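The xml2 addition in the appveyor.yml hunk above exists because the reworked run-all.R earlier in this patch writes JUnit XML through testthat's JunitReporter, which depends on xml2. The reporter wiring, condensed from the patch:

library(testthat)
dir.create("target/test-reports", showWarnings = FALSE)
reporter <- MultiReporter$new(list(
  SummaryReporter$new(),
  JunitReporter$new(file = file.path(getwd(), "target/test-reports/test-results.xml"))
))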
@@ -248,6 +266,16 @@ Examples: - Build and push image with tag "v2.3.0" to docker.io/myrepo $0 -r docker.io/myrepo -t v2.3.0 build $0 -r docker.io/myrepo -t v2.3.0 push + + - Build and push JDK11-based image with tag "v3.0.0" to docker.io/myrepo + $0 -r docker.io/myrepo -t v3.0.0 -b java_image_tag=11-jre-slim build + $0 -r docker.io/myrepo -t v3.0.0 push + + - Build and push JDK11-based image for multiple archs to docker.io/myrepo + $0 -r docker.io/myrepo -t v3.0.0 -X -b java_image_tag=11-jre-slim build + # Note: buildx, which does cross building, needs to do the push during build + # So there is no seperate push step with -X + EOF } @@ -264,7 +292,8 @@ RDOCKERFILE= NOCACHEARG= BUILD_PARAMS= SPARK_UID= -while getopts f:p:R:mr:t:nb:u: option +CROSS_BUILD="false" +while getopts f:p:R:mr:t:Xnb:u: option do case "${option}" in @@ -275,6 +304,7 @@ do t) TAG=${OPTARG};; n) NOCACHEARG="--no-cache";; b) BUILD_PARAMS=${BUILD_PARAMS}" --build-arg "${OPTARG};; + X) CROSS_BUILD=1;; m) if ! which minikube 1>/dev/null; then error "Cannot find minikube." diff --git a/bin/load-spark-env.cmd b/bin/load-spark-env.cmd index 5f98cc34b6bab..7e2cf43af3c7d 100644 --- a/bin/load-spark-env.cmd +++ b/bin/load-spark-env.cmd @@ -29,10 +29,7 @@ if [%SPARK_ENV_LOADED%] == [] ( set SPARK_CONF_DIR=%~dp0..\conf ) - set SPARK_ENV_CMD=%SPARK_CONF_DIR%\%SPARK_ENV_CMD% - if exist %SPARK_ENV_CMD% ( - call %SPARK_ENV_CMD% - ) + call :LoadSparkEnv ) rem Setting SPARK_SCALA_VERSION if not already set. @@ -60,3 +57,8 @@ rem set SPARK_SCALA_VERSION=%SCALA_VERSION_2% rem ) rem ) exit /b 0 + +:LoadSparkEnv +if exist "%SPARK_CONF_DIR%\spark-env.cmd" ( + call "%SPARK_CONF_DIR%\spark-env.cmd" +) diff --git a/bin/pyspark b/bin/pyspark index 44891aee2e0a3..ad4132fb59eb0 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -50,7 +50,7 @@ export PYSPARK_DRIVER_PYTHON_OPTS # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.8.1-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index 479fd464c7d3e..dc34be1a41706 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.8.1-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index a1c8a8e6582eb..f433764648ccd 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java index b33c53871c32f..431c7e42774e4 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java @@ -163,6 +163,13 @@ public void clear() { } } + /** + * An alias class for the type "{@literal ConcurrentHashMap, Boolean>}", + * which is used as a concurrent hashset for storing natural keys + * and the 
boolean value doesn't matter. + */ + private static class NaturalKeys extends ConcurrentHashMap, Boolean> {} + private static class InstanceList { /** @@ -171,7 +178,7 @@ private static class InstanceList { * iterators. https://bugs.openjdk.java.net/browse/JDK-8078645 */ private static class CountingRemoveIfForEach implements BiConsumer, T> { - private final ConcurrentMap, T> data; + private final InstanceList instanceList; private final Predicate filter; /** @@ -183,17 +190,15 @@ private static class CountingRemoveIfForEach implements BiConsumer, T> data, - Predicate filter) { - this.data = data; + CountingRemoveIfForEach(InstanceList instanceList, Predicate filter) { + this.instanceList = instanceList; this.filter = filter; } @Override public void accept(Comparable key, T value) { if (filter.test(value)) { - if (data.remove(key, value)) { + if (instanceList.delete(key, value)) { count++; } } @@ -205,11 +210,19 @@ public void accept(Comparable key, T value) { private final KVTypeInfo ti; private final KVTypeInfo.Accessor naturalKey; private final ConcurrentMap, T> data; + private final String naturalParentIndexName; + private final Boolean hasNaturalParentIndex; + // A mapping from parent to the natural keys of its children. + // For example, a mapping from a stage ID to all the task IDs in the stage. + private final ConcurrentMap, NaturalKeys> parentToChildrenMap; private InstanceList(Class klass) { this.ti = new KVTypeInfo(klass); this.naturalKey = ti.getAccessor(KVIndex.NATURAL_INDEX_NAME); this.data = new ConcurrentHashMap<>(); + this.naturalParentIndexName = ti.getParentIndexName(KVIndex.NATURAL_INDEX_NAME); + this.parentToChildrenMap = new ConcurrentHashMap<>(); + this.hasNaturalParentIndex = !naturalParentIndexName.isEmpty(); } KVTypeInfo.Accessor getIndexAccessor(String indexName) { @@ -217,11 +230,35 @@ KVTypeInfo.Accessor getIndexAccessor(String indexName) { } int countingRemoveAllByIndexValues(String index, Collection indexValues) { - Predicate filter = getPredicate(ti.getAccessor(index), indexValues); - CountingRemoveIfForEach callback = new CountingRemoveIfForEach<>(data, filter); + int count = 0; + if (KVIndex.NATURAL_INDEX_NAME.equals(index)) { + for (Object naturalKey : indexValues) { + count += delete(asKey(naturalKey)) ? 1 : 0; + } + return count; + } else if (hasNaturalParentIndex && naturalParentIndexName.equals(index)) { + // If there is a parent index for the natural index and `index` happens to be it, + // Spark can use the `parentToChildrenMap` to get the related natural keys, and then + // delete them from `data`. + for (Object indexValue : indexValues) { + Comparable parentKey = asKey(indexValue); + NaturalKeys children = parentToChildrenMap.getOrDefault(parentKey, new NaturalKeys()); + for (Comparable naturalKey : children.keySet()) { + data.remove(naturalKey); + count ++; + } + parentToChildrenMap.remove(parentKey); + } + return count; + } else { + Predicate filter = getPredicate(ti.getAccessor(index), indexValues); + CountingRemoveIfForEach callback = new CountingRemoveIfForEach<>(this, filter); - data.forEach(callback); - return callback.count(); + // Go through all the values in `data` and delete objects that meets the predicate `filter`. + // This can be slow when there is a large number of entries in `data`. 
+ data.forEach(callback); + return callback.count(); + } } public T get(Object key) { @@ -230,10 +267,42 @@ public T get(Object key) { public void put(T value) throws Exception { data.put(asKey(naturalKey.get(value)), value); + if (hasNaturalParentIndex) { + Comparable parentKey = asKey(getIndexAccessor(naturalParentIndexName).get(value)); + NaturalKeys children = + parentToChildrenMap.computeIfAbsent(parentKey, k -> new NaturalKeys()); + children.put(asKey(naturalKey.get(value)), true); + } + } + + public boolean delete(Object key) { + boolean entryExists = data.remove(asKey(key)) != null; + if (entryExists) { + deleteParentIndex(key); + } + return entryExists; } - public void delete(Object key) { - data.remove(asKey(key)); + public boolean delete(Object key, T value) { + boolean entryExists = data.remove(asKey(key), value); + if (entryExists) { + deleteParentIndex(key); + } + return entryExists; + } + + private void deleteParentIndex(Object key) { + if (hasNaturalParentIndex) { + for (NaturalKeys v : parentToChildrenMap.values()) { + if (v.remove(asKey(key)) != null) { + // `v` can be empty after removing the natural key and we can remove it from + // `parentToChildrenMap`. However, `parentToChildrenMap` is a ConcurrentMap and such + // checking and deleting can be slow. + // This method is to delete one object with certain key, let's make it simple here. + break; + } + } + } } public int size() { @@ -241,7 +310,7 @@ public int size() { } public InMemoryView view() { - return new InMemoryView<>(data.values(), ti); + return new InMemoryView<>(data, ti, naturalParentIndexName, parentToChildrenMap); } private static Predicate getPredicate( @@ -271,22 +340,32 @@ private static Object indexValueForEntity(KVTypeInfo.Accessor getter, Object ent private static class InMemoryView extends KVStoreView { private static final InMemoryView EMPTY_VIEW = - new InMemoryView<>(Collections.emptyList(), null); + new InMemoryView<>(new ConcurrentHashMap<>(), null, "", new ConcurrentHashMap<>()); - private final Collection elements; + private final ConcurrentMap, T> data; private final KVTypeInfo ti; private final KVTypeInfo.Accessor natural; - - InMemoryView(Collection elements, KVTypeInfo ti) { - this.elements = elements; + private final ConcurrentMap, NaturalKeys> parentToChildrenMap; + private final String naturalParentIndexName; + private final Boolean hasNaturalParentIndex; + + InMemoryView( + ConcurrentMap, T> data, + KVTypeInfo ti, + String naturalParentIndexName, + ConcurrentMap, NaturalKeys> parentToChildrenMap) { + this.data = data; this.ti = ti; this.natural = ti != null ? ti.getAccessor(KVIndex.NATURAL_INDEX_NAME) : null; + this.naturalParentIndexName = naturalParentIndexName; + this.parentToChildrenMap = parentToChildrenMap; + this.hasNaturalParentIndex = !naturalParentIndexName.isEmpty(); } @Override public Iterator iterator() { - if (elements.isEmpty()) { - return new InMemoryIterator<>(elements.iterator()); + if (data.isEmpty()) { + return new InMemoryIterator<>(Collections.emptyIterator()); } KVTypeInfo.Accessor getter = index != null ? 
ti.getAccessor(index) : null; @@ -322,15 +401,31 @@ public Iterator iterator() { */ private List copyElements() { if (parent != null) { - KVTypeInfo.Accessor parentGetter = ti.getParentAccessor(index); - Preconditions.checkArgument(parentGetter != null, "Parent filter for non-child index."); - Comparable parentKey = asKey(parent); - - return elements.stream() .filter(e -> compare(e, parentGetter, parentKey) == 0) .collect(Collectors.toList()); + Comparable parentKey = asKey(parent); + if (hasNaturalParentIndex && naturalParentIndexName.equals(ti.getParentIndexName(index))) { + // If there is a parent index for the natural index and the parent of `index` happens to + // be it, Spark can use the `parentToChildrenMap` to get the related natural keys, and + // then copy them from `data`. + NaturalKeys children = parentToChildrenMap.getOrDefault(parentKey, new NaturalKeys()); + ArrayList elements = new ArrayList<>(); + for (Comparable naturalKey : children.keySet()) { + data.computeIfPresent(naturalKey, (k, v) -> { + elements.add(v); + return v; + }); + } + return elements; + } else { + // Go through all the values in `data` and collect all the objects that have a certain parent + // value. This can be slow when there is a large number of entries in `data`. + KVTypeInfo.Accessor parentGetter = ti.getParentAccessor(index); + Preconditions.checkArgument(parentGetter != null, "Parent filter for non-child index."); + return data.values().stream() .filter(e -> compare(e, parentGetter, parentKey) == 0) .collect(Collectors.toList()); + } } else { - return new ArrayList<>(elements); + return new ArrayList<>(data.values()); } } diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java index d2a26982d8703..5404d33dba5fb 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java @@ -68,8 +68,6 @@ public KVTypeInfo(Class type) { Preconditions.checkArgument(indices.containsKey(KVIndex.NATURAL_INDEX_NAME), "No natural index defined for type %s.", type.getName()); - Preconditions.checkArgument(indices.get(KVIndex.NATURAL_INDEX_NAME).parent().isEmpty(), - "Natural index of %s cannot have a parent.", type.getName()); for (KVIndex idx : indices.values()) { if (!idx.parent().isEmpty()) { @@ -117,6 +115,11 @@ Accessor getParentAccessor(String indexName) { return index.parent().isEmpty() ? null : getAccessor(index.parent()); } + String getParentIndexName(String indexName) { + KVIndex index = indices.get(indexName); + return index.parent(); + } + /** * Abstracts the difference between invoking a Field and a Method. */ diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java index f4d359234cb9e..d7423537ddfcf 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java @@ -133,12 +133,13 @@ class LevelDBTypeInfo { // First create the parent indices, then the child indices. ti.indices().forEach(idx -> { - if (idx.parent().isEmpty()) { + // In LevelDB, there is no parent index for the NATURAL INDEX.
+ if (idx.parent().isEmpty() || idx.value().equals(KVIndex.NATURAL_INDEX_NAME)) { indices.put(idx.value(), new Index(idx, ti.getAccessor(idx.value()), null)); } }); ti.indices().forEach(idx -> { - if (!idx.parent().isEmpty()) { + if (!idx.parent().isEmpty() && !idx.value().equals(KVIndex.NATURAL_INDEX_NAME)) { indices.put(idx.value(), new Index(idx, ti.getAccessor(idx.value()), indices.get(idx.parent()))); } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType2.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType2.java new file mode 100644 index 0000000000000..3bb66bb3ec700 --- /dev/null +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType2.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util.kvstore; + +public class CustomType2 { + + @KVIndex(parent = "parentId") + public String key; + + @KVIndex("id") + public String id; + + @KVIndex("parentId") + public String parentId; + + @Override + public boolean equals(Object o) { + if (o instanceof CustomType2) { + CustomType2 other = (CustomType2) o; + return id.equals(other.id) && parentId.equals(other.parentId); + } + return false; + } + + @Override + public int hashCode() { + return id.hashCode() ^ parentId.hashCode(); + } + + @Override + public String toString() { + return "CustomType2[key=" + key + ",id=" + id + ",parentId=" + parentId; + } + +} diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java index 9e34225e14e18..35656fb12238a 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java @@ -158,23 +158,29 @@ public void testRemoveAll() throws Exception { assertEquals(9, store.count(ArrayKeyIndexType.class)); + // Try removing non-existing keys + assert(!store.removeAllByIndexValues( + ArrayKeyIndexType.class, + KVIndex.NATURAL_INDEX_NAME, + ImmutableSet.of(new int[] {10, 10, 10}, new int[] { 3, 3, 3 }))); + assertEquals(9, store.count(ArrayKeyIndexType.class)); - store.removeAllByIndexValues( + assert(store.removeAllByIndexValues( ArrayKeyIndexType.class, KVIndex.NATURAL_INDEX_NAME, - ImmutableSet.of(new int[] {0, 0, 0}, new int[] { 2, 2, 2 })); + ImmutableSet.of(new int[] {0, 0, 0}, new int[] { 2, 2, 2 }))); assertEquals(7, store.count(ArrayKeyIndexType.class)); - store.removeAllByIndexValues( + assert(store.removeAllByIndexValues( ArrayKeyIndexType.class, "id", - ImmutableSet.of(new String [] { "things" })); + ImmutableSet.of(new String [] { "things" }))); assertEquals(4, store.count(ArrayKeyIndexType.class)); - 
store.removeAllByIndexValues( + assert(store.removeAllByIndexValues( ArrayKeyIndexType.class, "id", - ImmutableSet.of(new String [] { "more things" })); + ImmutableSet.of(new String [] { "more things" }))); assertEquals(0, store.count(ArrayKeyIndexType.class)); } @@ -204,4 +210,46 @@ public void testBasicIteration() throws Exception { assertFalse(store.view(t1.getClass()).first(t2.id).skip(1).iterator().hasNext()); } + @Test + public void testDeleteParentIndex() throws Exception { + KVStore store = new InMemoryStore(); + + CustomType2 t1 = new CustomType2(); + t1.key = "key1"; + t1.id = "id1"; + t1.parentId = "parentId1"; + store.write(t1); + + CustomType2 t2 = new CustomType2(); + t2.key = "key2"; + t2.id = "id2"; + t2.parentId = "parentId1"; + store.write(t2); + + CustomType2 t3 = new CustomType2(); + t3.key = "key3"; + t3.id = "id1"; + t3.parentId = "parentId2"; + store.write(t3); + + CustomType2 t4 = new CustomType2(); + t4.key = "key4"; + t4.id = "id2"; + t4.parentId = "parentId2"; + store.write(t4); + + assertEquals(4, store.count(CustomType2.class)); + + store.delete(t1.getClass(), t1.key); + assertEquals(3, store.count(CustomType2.class)); + + store.delete(t2.getClass(), t2.key); + assertEquals(2, store.count(CustomType2.class)); + + store.delete(t3.getClass(), t3.key); + assertEquals(1, store.count(CustomType2.class)); + + store.delete(t4.getClass(), t4.key); + assertEquals(0, store.count(CustomType2.class)); + } } diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 163c250054e4d..4f31606fddf94 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java index d99b9bdbce392..a0de9df1986f5 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java +++ b/common/network-common/src/main/java/org/apache/spark/network/TransportContext.java @@ -123,7 +123,7 @@ public TransportContext( if (conf.getModuleName() != null && conf.getModuleName().equalsIgnoreCase("shuffle") && - !isClientOnly) { + !isClientOnly && conf.separateChunkFetchRequest()) { chunkFetchWorkers = NettyUtils.createEventLoop( IOMode.valueOf(conf.ioMode()), conf.chunkFetchHandlerThreads(), @@ -187,8 +187,6 @@ public TransportChannelHandler initializePipeline( RpcHandler channelRpcHandler) { try { TransportChannelHandler channelHandler = createChannelHandler(channel, channelRpcHandler); - ChunkFetchRequestHandler chunkFetchHandler = - createChunkFetchHandler(channelHandler, channelRpcHandler); ChannelPipeline pipeline = channel.pipeline() .addLast("encoder", ENCODER) .addLast(TransportFrameDecoder.HANDLER_NAME, NettyUtils.createFrameDecoder()) @@ -200,6 +198,9 @@ public TransportChannelHandler initializePipeline( .addLast("handler", channelHandler); // Use a separate EventLoopGroup to handle ChunkFetchRequest messages for shuffle rpcs. 
if (chunkFetchWorkers != null) { + ChunkFetchRequestHandler chunkFetchHandler = new ChunkFetchRequestHandler( + channelHandler.getClient(), rpcHandler.getStreamManager(), + conf.maxChunksBeingTransferred(), true /* syncModeEnabled */); pipeline.addLast(chunkFetchWorkers, "chunkFetchHandler", chunkFetchHandler); } return channelHandler; @@ -217,19 +218,17 @@ public TransportChannelHandler initializePipeline( private TransportChannelHandler createChannelHandler(Channel channel, RpcHandler rpcHandler) { TransportResponseHandler responseHandler = new TransportResponseHandler(channel); TransportClient client = new TransportClient(channel, responseHandler); + boolean separateChunkFetchRequest = conf.separateChunkFetchRequest(); + ChunkFetchRequestHandler chunkFetchRequestHandler = null; + if (!separateChunkFetchRequest) { + chunkFetchRequestHandler = new ChunkFetchRequestHandler( + client, rpcHandler.getStreamManager(), + conf.maxChunksBeingTransferred(), false /* syncModeEnabled */); + } TransportRequestHandler requestHandler = new TransportRequestHandler(channel, client, - rpcHandler, conf.maxChunksBeingTransferred()); + rpcHandler, conf.maxChunksBeingTransferred(), chunkFetchRequestHandler); return new TransportChannelHandler(client, responseHandler, requestHandler, - conf.connectionTimeoutMs(), closeIdleConnections, this); - } - - /** - * Creates the dedicated ChannelHandler for ChunkFetchRequest messages. - */ - private ChunkFetchRequestHandler createChunkFetchHandler(TransportChannelHandler channelHandler, - RpcHandler rpcHandler) { - return new ChunkFetchRequestHandler(channelHandler.getClient(), - rpcHandler.getStreamManager(), conf.maxChunksBeingTransferred()); + conf.connectionTimeoutMs(), separateChunkFetchRequest, closeIdleConnections, this); } public TransportConf getConf() { return conf; } diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java index 821cc7a849504..dd31c955350f1 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java @@ -29,12 +29,11 @@ import org.slf4j.LoggerFactory; import org.apache.spark.network.client.RpcResponseCallback; -import org.apache.spark.network.client.StreamCallbackWithID; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.sasl.SecretKeyHolder; import org.apache.spark.network.sasl.SaslRpcHandler; +import org.apache.spark.network.server.AbstractAuthRpcHandler; import org.apache.spark.network.server.RpcHandler; -import org.apache.spark.network.server.StreamManager; import org.apache.spark.network.util.TransportConf; /** @@ -46,7 +45,7 @@ * The delegate will only receive messages if the given connection has been successfully * authenticated. A connection may be authenticated at most once. */ -class AuthRpcHandler extends RpcHandler { +class AuthRpcHandler extends AbstractAuthRpcHandler { private static final Logger LOG = LoggerFactory.getLogger(AuthRpcHandler.class); /** Transport configuration. */ @@ -55,36 +54,31 @@ class AuthRpcHandler extends RpcHandler { /** The client channel. */ private final Channel channel; - /** - * RpcHandler we will delegate to for authenticated connections. When falling back to SASL - * this will be replaced with the SASL RPC handler. 
- */ - @VisibleForTesting - RpcHandler delegate; - /** Class which provides secret keys which are shared by server and client on a per-app basis. */ private final SecretKeyHolder secretKeyHolder; - /** Whether auth is done and future calls should be delegated. */ + /** RPC handler for auth handshake when falling back to SASL auth. */ @VisibleForTesting - boolean doDelegate; + SaslRpcHandler saslHandler; AuthRpcHandler( TransportConf conf, Channel channel, RpcHandler delegate, SecretKeyHolder secretKeyHolder) { + super(delegate); this.conf = conf; this.channel = channel; - this.delegate = delegate; this.secretKeyHolder = secretKeyHolder; } @Override - public void receive(TransportClient client, ByteBuffer message, RpcResponseCallback callback) { - if (doDelegate) { - delegate.receive(client, message, callback); - return; + protected boolean doAuthChallenge( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { + if (saslHandler != null) { + return saslHandler.doAuthChallenge(client, message, callback); } int position = message.position(); @@ -98,18 +92,17 @@ public void receive(TransportClient client, ByteBuffer message, RpcResponseCallb if (conf.saslFallback()) { LOG.warn("Failed to parse new auth challenge, reverting to SASL for client {}.", channel.remoteAddress()); - delegate = new SaslRpcHandler(conf, channel, delegate, secretKeyHolder); + saslHandler = new SaslRpcHandler(conf, channel, null, secretKeyHolder); message.position(position); message.limit(limit); - delegate.receive(client, message, callback); - doDelegate = true; + return saslHandler.doAuthChallenge(client, message, callback); } else { LOG.debug("Unexpected challenge message from client {}, closing channel.", channel.remoteAddress()); callback.onFailure(new IllegalArgumentException("Unknown challenge message.")); channel.close(); } - return; + return false; } // Here we have the client challenge, so perform the new auth protocol and set up the channel. 
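[Aside, not part of the patch] The boolean returned by doAuthChallenge() above is the contract that AbstractAuthRpcHandler (introduced later in this diff) relies on: once a challenge handler returns true, the base class starts forwarding RPCs to the wrapped delegate; while it returns false, every incoming RPC is still treated as an auth challenge. A minimal, hypothetical subclass, shown only to illustrate that contract (the package, class name, and shared-secret check are invented for the example):

package org.apache.spark.network.example; // hypothetical package, illustration only

import java.nio.ByteBuffer;

import org.apache.spark.network.client.RpcResponseCallback;
import org.apache.spark.network.client.TransportClient;
import org.apache.spark.network.server.AbstractAuthRpcHandler;
import org.apache.spark.network.server.RpcHandler;
import org.apache.spark.network.util.JavaUtils;

/**
 * Hypothetical handler that accepts a single shared-secret challenge. Returning true from
 * doAuthChallenge() marks the channel authenticated, after which AbstractAuthRpcHandler
 * forwards receive()/receiveStream() calls to the delegate; returning false keeps treating
 * the next incoming RPC as another challenge.
 */
class SharedSecretRpcHandler extends AbstractAuthRpcHandler {

  private final String secret;

  SharedSecretRpcHandler(RpcHandler delegate, String secret) {
    super(delegate);
    this.secret = secret;
  }

  @Override
  protected boolean doAuthChallenge(
      TransportClient client,
      ByteBuffer message,
      RpcResponseCallback callback) {
    if (secret.equals(JavaUtils.bytesToString(message))) {
      // Acknowledge the challenge; subsequent messages go straight to the delegate.
      callback.onSuccess(ByteBuffer.allocate(0));
      return true;
    }
    callback.onFailure(new SecurityException("Invalid challenge."));
    return false;
  }
}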
@@ -131,7 +124,7 @@ public void receive(TransportClient client, ByteBuffer message, RpcResponseCallb LOG.debug("Authentication failed for client {}, closing channel.", channel.remoteAddress()); callback.onFailure(new IllegalArgumentException("Authentication failed.")); channel.close(); - return; + return false; } finally { if (engine != null) { try { @@ -143,40 +136,6 @@ public void receive(TransportClient client, ByteBuffer message, RpcResponseCallb } LOG.debug("Authorization successful for client {}.", channel.remoteAddress()); - doDelegate = true; - } - - @Override - public void receive(TransportClient client, ByteBuffer message) { - delegate.receive(client, message); - } - - @Override - public StreamCallbackWithID receiveStream( - TransportClient client, - ByteBuffer message, - RpcResponseCallback callback) { - return delegate.receiveStream(client, message, callback); + return true; } - - @Override - public StreamManager getStreamManager() { - return delegate.getStreamManager(); - } - - @Override - public void channelActive(TransportClient client) { - delegate.channelActive(client); - } - - @Override - public void channelInactive(TransportClient client) { - delegate.channelInactive(client); - } - - @Override - public void exceptionCaught(Throwable cause, TransportClient client) { - delegate.exceptionCaught(cause, client); - } - } diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java index 355a3def8cc22..cc9e88fcf98e7 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java @@ -28,10 +28,9 @@ import org.slf4j.LoggerFactory; import org.apache.spark.network.client.RpcResponseCallback; -import org.apache.spark.network.client.StreamCallbackWithID; import org.apache.spark.network.client.TransportClient; +import org.apache.spark.network.server.AbstractAuthRpcHandler; import org.apache.spark.network.server.RpcHandler; -import org.apache.spark.network.server.StreamManager; import org.apache.spark.network.util.JavaUtils; import org.apache.spark.network.util.TransportConf; @@ -43,7 +42,7 @@ * Note that the authentication process consists of multiple challenge-response pairs, each of * which are individual RPCs. */ -public class SaslRpcHandler extends RpcHandler { +public class SaslRpcHandler extends AbstractAuthRpcHandler { private static final Logger logger = LoggerFactory.getLogger(SaslRpcHandler.class); /** Transport configuration. */ @@ -52,37 +51,28 @@ public class SaslRpcHandler extends RpcHandler { /** The client channel. */ private final Channel channel; - /** RpcHandler we will delegate to for authenticated connections. */ - private final RpcHandler delegate; - /** Class which provides secret keys which are shared by server and client on a per-app basis. 
*/ private final SecretKeyHolder secretKeyHolder; private SparkSaslServer saslServer; - private boolean isComplete; - private boolean isAuthenticated; public SaslRpcHandler( TransportConf conf, Channel channel, RpcHandler delegate, SecretKeyHolder secretKeyHolder) { + super(delegate); this.conf = conf; this.channel = channel; - this.delegate = delegate; this.secretKeyHolder = secretKeyHolder; this.saslServer = null; - this.isComplete = false; - this.isAuthenticated = false; } @Override - public void receive(TransportClient client, ByteBuffer message, RpcResponseCallback callback) { - if (isComplete) { - // Authentication complete, delegate to base handler. - delegate.receive(client, message, callback); - return; - } + public boolean doAuthChallenge( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { if (saslServer == null || !saslServer.isComplete()) { ByteBuf nettyBuf = Unpooled.wrappedBuffer(message); SaslMessage saslMessage; @@ -118,43 +108,21 @@ public void receive(TransportClient client, ByteBuffer message, RpcResponseCallb if (!SparkSaslServer.QOP_AUTH_CONF.equals(saslServer.getNegotiatedProperty(Sasl.QOP))) { logger.debug("SASL authentication successful for channel {}", client); complete(true); - return; + return true; } logger.debug("Enabling encryption for channel {}", client); SaslEncryption.addToChannel(channel, saslServer, conf.maxSaslEncryptedBlockSize()); complete(false); - return; + return true; } - } - - @Override - public void receive(TransportClient client, ByteBuffer message) { - delegate.receive(client, message); - } - - @Override - public StreamCallbackWithID receiveStream( - TransportClient client, - ByteBuffer message, - RpcResponseCallback callback) { - return delegate.receiveStream(client, message, callback); - } - - @Override - public StreamManager getStreamManager() { - return delegate.getStreamManager(); - } - - @Override - public void channelActive(TransportClient client) { - delegate.channelActive(client); + return false; } @Override public void channelInactive(TransportClient client) { try { - delegate.channelInactive(client); + super.channelInactive(client); } finally { if (saslServer != null) { saslServer.dispose(); @@ -162,11 +130,6 @@ public void channelInactive(TransportClient client) { } } - @Override - public void exceptionCaught(Throwable cause, TransportClient client) { - delegate.exceptionCaught(cause, client); - } - private void complete(boolean dispose) { if (dispose) { try { @@ -177,7 +140,6 @@ private void complete(boolean dispose) { } saslServer = null; - isComplete = true; } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/AbstractAuthRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/AbstractAuthRpcHandler.java new file mode 100644 index 0000000000000..92eb886283448 --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/server/AbstractAuthRpcHandler.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.server; + +import java.nio.ByteBuffer; + +import org.apache.spark.network.client.RpcResponseCallback; +import org.apache.spark.network.client.StreamCallbackWithID; +import org.apache.spark.network.client.TransportClient; + +/** + * RPC Handler which performs authentication, and when it's successful, delegates further + * calls to another RPC handler. The authentication handshake itself should be implemented + * by subclasses. + */ +public abstract class AbstractAuthRpcHandler extends RpcHandler { + /** RpcHandler we will delegate to for authenticated connections. */ + private final RpcHandler delegate; + + private boolean isAuthenticated; + + protected AbstractAuthRpcHandler(RpcHandler delegate) { + this.delegate = delegate; + } + + /** + * Responds to an authentication challenge. + * + * @return Whether the client is authenticated. + */ + protected abstract boolean doAuthChallenge( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback); + + @Override + public final void receive( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { + if (isAuthenticated) { + delegate.receive(client, message, callback); + } else { + isAuthenticated = doAuthChallenge(client, message, callback); + } + } + + @Override + public final void receive(TransportClient client, ByteBuffer message) { + if (isAuthenticated) { + delegate.receive(client, message); + } else { + throw new SecurityException("Unauthenticated call to receive()."); + } + } + + @Override + public final StreamCallbackWithID receiveStream( + TransportClient client, + ByteBuffer message, + RpcResponseCallback callback) { + if (isAuthenticated) { + return delegate.receiveStream(client, message, callback); + } else { + throw new SecurityException("Unauthenticated call to receiveStream()."); + } + } + + @Override + public StreamManager getStreamManager() { + return delegate.getStreamManager(); + } + + @Override + public void channelActive(TransportClient client) { + delegate.channelActive(client); + } + + @Override + public void channelInactive(TransportClient client) { + delegate.channelInactive(client); + } + + @Override + public void exceptionCaught(Throwable cause, TransportClient client) { + delegate.exceptionCaught(cause, client); + } + + public boolean isAuthenticated() { + return isAuthenticated; + } +} diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java index 94412c4db559f..82810dacdad84 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java @@ -55,14 +55,17 @@ public class ChunkFetchRequestHandler extends SimpleChannelInboundHandler { + ChannelFuture channelFuture; + if (syncModeEnabled) { + channelFuture = channel.writeAndFlush(result).await(); + } else { + channelFuture = channel.writeAndFlush(result); + } + return 
channelFuture.addListener((ChannelFutureListener) future -> { if (future.isSuccess()) { logger.trace("Sent result {} to client {}", result, remoteAddress); } else { diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java index 31371f6970ffe..e53a0c1a0852e 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportChannelHandler.java @@ -58,6 +58,7 @@ public class TransportChannelHandler extends SimpleChannelInboundHandler { /** The max number of chunks being transferred and not finished yet. */ private final long maxChunksBeingTransferred; + /** The dedicated ChannelHandler for ChunkFetchRequest messages. */ + private final ChunkFetchRequestHandler chunkFetchRequestHandler; + public TransportRequestHandler( Channel channel, TransportClient reverseClient, RpcHandler rpcHandler, - Long maxChunksBeingTransferred) { + Long maxChunksBeingTransferred, + ChunkFetchRequestHandler chunkFetchRequestHandler) { this.channel = channel; this.reverseClient = reverseClient; this.rpcHandler = rpcHandler; this.streamManager = rpcHandler.getStreamManager(); this.maxChunksBeingTransferred = maxChunksBeingTransferred; + this.chunkFetchRequestHandler = chunkFetchRequestHandler; } @Override @@ -97,8 +102,10 @@ public void channelInactive() { } @Override - public void handle(RequestMessage request) { - if (request instanceof RpcRequest) { + public void handle(RequestMessage request) throws Exception { + if (request instanceof ChunkFetchRequest) { + chunkFetchRequestHandler.processFetchRequest(channel, (ChunkFetchRequest) request); + } else if (request instanceof RpcRequest) { processRpcRequest((RpcRequest) request); } else if (request instanceof OneWayMessage) { processOneWayMessage((OneWayMessage) request); diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java index cc0f2919568ac..646e4278811f4 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -290,7 +290,7 @@ public boolean sharedByteBufAllocators() { } /** - * If enabled then off-heap byte buffers will be prefered for the shared ByteBuf allocators. + * If enabled then off-heap byte buffers will be preferred for the shared ByteBuf allocators. */ public boolean preferDirectBufsForSharedByteBufAllocators() { return conf.getBoolean("spark.network.io.preferDirectBufs", true); @@ -316,7 +316,8 @@ public long maxChunksBeingTransferred() { /** * Percentage of io.serverThreads used by netty to process ChunkFetchRequest. - * Shuffle server will use a separate EventLoopGroup to process ChunkFetchRequest messages. + * When the config `spark.shuffle.server.chunkFetchHandlerThreadsPercent` is set, + * shuffle server will use a separate EventLoopGroup to process ChunkFetchRequest messages. 
* Although when calling the async writeAndFlush on the underlying channel to send * response back to client, the I/O on the channel is still being handled by * {@link org.apache.spark.network.server.TransportServer}'s default EventLoopGroup @@ -339,12 +340,20 @@ public int chunkFetchHandlerThreads() { return 0; } int chunkFetchHandlerThreadsPercent = - conf.getInt("spark.shuffle.server.chunkFetchHandlerThreadsPercent", 100); + Integer.parseInt(conf.get("spark.shuffle.server.chunkFetchHandlerThreadsPercent")); int threads = this.serverThreads() > 0 ? this.serverThreads() : 2 * NettyRuntime.availableProcessors(); return (int) Math.ceil(threads * (chunkFetchHandlerThreadsPercent / 100.0)); } + /** + * Whether to use a separate EventLoopGroup to process ChunkFetchRequest messages, it is decided + * by the config `spark.shuffle.server.chunkFetchHandlerThreadsPercent` is set or not. + */ + public boolean separateChunkFetchRequest() { + return conf.getInt("spark.shuffle.server.chunkFetchHandlerThreadsPercent", 0) > 0; + } + /** * Whether to use the old protocol while doing the shuffle block fetching. * It is only enabled while we need the compatibility in the scenario of new spark version diff --git a/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java index 7e30ed4048ca8..addb4ff332746 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/ChunkFetchRequestHandlerSuite.java @@ -22,7 +22,6 @@ import java.util.List; import io.netty.channel.Channel; -import org.apache.spark.network.server.ChunkFetchRequestHandler; import org.junit.Assert; import org.junit.Test; @@ -33,6 +32,7 @@ import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.protocol.*; +import org.apache.spark.network.server.ChunkFetchRequestHandler; import org.apache.spark.network.server.NoOpRpcHandler; import org.apache.spark.network.server.OneForOneStreamManager; import org.apache.spark.network.server.RpcHandler; @@ -68,7 +68,7 @@ public void handleChunkFetchRequest() throws Exception { long streamId = streamManager.registerStream("test-app", managedBuffers.iterator(), channel); TransportClient reverseClient = mock(TransportClient.class); ChunkFetchRequestHandler requestHandler = new ChunkFetchRequestHandler(reverseClient, - rpcHandler.getStreamManager(), 2L); + rpcHandler.getStreamManager(), 2L, false); RequestMessage request0 = new ChunkFetchRequest(new StreamChunkId(streamId, 0)); requestHandler.channelRead(context, request0); diff --git a/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java index a43a659048686..0a6447176237a 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/TransportRequestHandlerSuite.java @@ -39,7 +39,7 @@ public class TransportRequestHandlerSuite { @Test - public void handleStreamRequest() { + public void handleStreamRequest() throws Exception { RpcHandler rpcHandler = new NoOpRpcHandler(); OneForOneStreamManager streamManager = (OneForOneStreamManager) (rpcHandler.getStreamManager()); Channel channel = 
mock(Channel.class); @@ -66,7 +66,7 @@ public void handleStreamRequest() { TransportClient reverseClient = mock(TransportClient.class); TransportRequestHandler requestHandler = new TransportRequestHandler(channel, reverseClient, - rpcHandler, 2L); + rpcHandler, 2L, null); RequestMessage request0 = new StreamRequest(String.format("%d_%d", streamId, 0)); requestHandler.handle(request0); diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java index 2f9dd629df1ba..a87a6aae21092 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthIntegrationSuite.java @@ -34,7 +34,6 @@ import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportClientBootstrap; -import org.apache.spark.network.sasl.SaslRpcHandler; import org.apache.spark.network.sasl.SaslServerBootstrap; import org.apache.spark.network.sasl.SecretKeyHolder; import org.apache.spark.network.server.RpcHandler; @@ -65,8 +64,7 @@ public void testNewAuth() throws Exception { ByteBuffer reply = ctx.client.sendRpcSync(JavaUtils.stringToBytes("Ping"), 5000); assertEquals("Pong", JavaUtils.bytesToString(reply)); - assertTrue(ctx.authRpcHandler.doDelegate); - assertFalse(ctx.authRpcHandler.delegate instanceof SaslRpcHandler); + assertNull(ctx.authRpcHandler.saslHandler); } @Test @@ -78,7 +76,7 @@ public void testAuthFailure() throws Exception { ctx.createClient("client"); fail("Should have failed to create client."); } catch (Exception e) { - assertFalse(ctx.authRpcHandler.doDelegate); + assertFalse(ctx.authRpcHandler.isAuthenticated()); assertFalse(ctx.serverChannel.isActive()); } } @@ -91,6 +89,8 @@ public void testSaslServerFallback() throws Exception { ByteBuffer reply = ctx.client.sendRpcSync(JavaUtils.stringToBytes("Ping"), 5000); assertEquals("Pong", JavaUtils.bytesToString(reply)); + assertNotNull(ctx.authRpcHandler.saslHandler); + assertTrue(ctx.authRpcHandler.isAuthenticated()); } @Test @@ -120,7 +120,7 @@ public void testAuthReplay() throws Exception { ctx.client.sendRpcSync(JavaUtils.stringToBytes("Ping"), 5000); fail("Should have failed unencrypted RPC."); } catch (Exception e) { - assertTrue(ctx.authRpcHandler.doDelegate); + assertTrue(ctx.authRpcHandler.isAuthenticated()); } } @@ -151,7 +151,7 @@ public StreamManager getStreamManager() { ctx.client.sendRpcSync(JavaUtils.stringToBytes("Ping"), 5000); fail("Should have failed unencrypted RPC."); } catch (Exception e) { - assertTrue(ctx.authRpcHandler.doDelegate); + assertTrue(ctx.authRpcHandler.isAuthenticated()); assertTrue(e.getMessage() + " is not an expected error", e.getMessage().contains("DDDDD")); // Verify we receive the complete error message int messageStart = e.getMessage().indexOf("DDDDD"); diff --git a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java index cf2d72f71e8de..ecaeec98da182 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java @@ -357,7 +357,8 @@ public void testRpcHandlerDelegate() throws Exception { public void testDelegates() throws 
Exception { Method[] rpcHandlerMethods = RpcHandler.class.getDeclaredMethods(); for (Method m : rpcHandlerMethods) { - SaslRpcHandler.class.getDeclaredMethod(m.getName(), m.getParameterTypes()); + Method delegate = SaslRpcHandler.class.getMethod(m.getName(), m.getParameterTypes()); + assertNotEquals(delegate.getDeclaringClass(), RpcHandler.class); } } diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index a6d99813a8501..a3ac6611a3127 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java index 13f6046dd856b..6549cac011feb 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java @@ -23,11 +23,19 @@ import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.lang3.SystemUtils; import org.apache.spark.network.util.JavaUtils; public class ExecutorDiskUtils { - private static final Pattern MULTIPLE_SEPARATORS = Pattern.compile(File.separator + "{2,}"); + private static final Pattern MULTIPLE_SEPARATORS; + static { + if (SystemUtils.IS_OS_WINDOWS) { + MULTIPLE_SEPARATORS = Pattern.compile("[/\\\\]+"); + } else { + MULTIPLE_SEPARATORS = Pattern.compile("/{2,}"); + } + } /** * Hashes a filename into the corresponding local directory, in a manner consistent with @@ -50,14 +58,18 @@ public static File getFile(String[] localDirs, int subDirsPerLocalDir, String fi * the internal code in java.io.File would normalize it later, creating a new "foo/bar" * String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File * uses, since it is in the package-private class java.io.FileSystem. + * + * On Windows, separator "\" is used instead of "/". + * + * "\\" is a legal character in path name on Unix-like OS, but illegal on Windows. 
*/ @VisibleForTesting static String createNormalizedInternedPathname(String dir1, String dir2, String fname) { String pathname = dir1 + File.separator + dir2 + File.separator + fname; Matcher m = MULTIPLE_SEPARATORS.matcher(pathname); - pathname = m.replaceAll("/"); + pathname = m.replaceAll(Matcher.quoteReplacement(File.separator)); // A single trailing slash needs to be taken care of separately - if (pathname.length() > 1 && pathname.endsWith("/")) { + if (pathname.length() > 1 && pathname.charAt(pathname.length() - 1) == File.separatorChar) { pathname = pathname.substring(0, pathname.length() - 1); } return pathname.intern(); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java index 8c05288fb4111..33865a21ea914 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java @@ -229,8 +229,6 @@ public class ShuffleMetrics implements MetricSet { private final Meter blockTransferRateBytes = new Meter(); // Number of active connections to the shuffle service private Counter activeConnections = new Counter(); - // Number of registered connections to the shuffle service - private Counter registeredConnections = new Counter(); // Number of exceptions caught in connections to the shuffle service private Counter caughtExceptions = new Counter(); @@ -242,7 +240,6 @@ public ShuffleMetrics() { allMetrics.put("registeredExecutorsSize", (Gauge) () -> blockManager.getRegisteredExecutorsSize()); allMetrics.put("numActiveConnections", activeConnections); - allMetrics.put("numRegisteredConnections", registeredConnections); allMetrics.put("numCaughtExceptions", caughtExceptions); } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java index d6185f089d3c0..98247f24a036b 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java @@ -175,8 +175,6 @@ public void onSuccess(ByteBuffer response) { logger.warn("Error trying to remove RDD blocks " + Arrays.toString(blockIds) + " via external shuffle service from executor: " + execId, t); numRemovedBlocksFuture.complete(0); - } finally { - client.close(); } } @@ -185,7 +183,6 @@ public void onFailure(Throwable e) { logger.warn("Error trying to remove RDD blocks " + Arrays.toString(blockIds) + " via external shuffle service from executor: " + execId, e); numRemovedBlocksFuture.complete(0); - client.close(); } }); return numRemovedBlocksFuture; @@ -212,8 +209,6 @@ public void onSuccess(ByteBuffer response) { Arrays.toString(getLocalDirsMessage.execIds) + " via external shuffle service", t.getCause()); hostLocalDirsCompletable.completeExceptionally(t); - } finally { - client.close(); } } @@ -223,7 +218,6 @@ public void onFailure(Throwable t) { Arrays.toString(getLocalDirsMessage.execIds) + " via external shuffle service", t.getCause()); hostLocalDirsCompletable.completeExceptionally(t); - client.close(); } }); } catch (IOException | InterruptedException e) { diff --git 
a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java index 09b31430b1eb9..6515b6ca035f7 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java @@ -25,6 +25,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.io.CharStreams; +import org.apache.commons.lang3.SystemUtils; import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; import org.apache.spark.network.util.MapConfigProvider; import org.apache.spark.network.util.TransportConf; @@ -146,12 +147,19 @@ public void jsonSerializationOfExecutorRegistration() throws IOException { @Test public void testNormalizeAndInternPathname() { - assertPathsMatch("/foo", "bar", "baz", "/foo/bar/baz"); - assertPathsMatch("//foo/", "bar/", "//baz", "/foo/bar/baz"); - assertPathsMatch("foo", "bar", "baz///", "foo/bar/baz"); - assertPathsMatch("/foo/", "/bar//", "/baz", "/foo/bar/baz"); - assertPathsMatch("/", "", "", "/"); - assertPathsMatch("/", "/", "/", "/"); + String sep = File.separator; + String expectedPathname = sep + "foo" + sep + "bar" + sep + "baz"; + assertPathsMatch("/foo", "bar", "baz", expectedPathname); + assertPathsMatch("//foo/", "bar/", "//baz", expectedPathname); + assertPathsMatch("/foo/", "/bar//", "/baz", expectedPathname); + assertPathsMatch("foo", "bar", "baz///", "foo" + sep + "bar" + sep + "baz"); + assertPathsMatch("/", "", "", sep); + assertPathsMatch("/", "/", "/", sep); + if (SystemUtils.IS_OS_WINDOWS) { + assertPathsMatch("/foo\\/", "bar", "baz", expectedPathname); + } else { + assertPathsMatch("/foo\\/", "bar", "baz", sep + "foo\\" + sep + "bar" + sep + "baz"); + } } private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) { @@ -160,6 +168,6 @@ private void assertPathsMatch(String p1, String p2, String p3, String expectedPa assertEquals(expectedPathname, normPathname); File file = new File(normPathname); String returnedPath = file.getPath(); - assertTrue(normPathname == returnedPath); + assertEquals(normPathname, returnedPath); } } diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 76a402bb2bd31..5c5dfbabe862e 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index 815a56d765b6a..c41efbad8ffec 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -188,7 +188,7 @@ protected void serviceInit(Configuration conf) throws Exception { int port = conf.getInt( SPARK_SHUFFLE_SERVICE_PORT_KEY, DEFAULT_SPARK_SHUFFLE_SERVICE_PORT); - transportContext = new TransportContext(transportConf, blockHandler); + transportContext = new TransportContext(transportConf, blockHandler, true); shuffleServer = transportContext.createServer(port, bootstraps); // the port should normally be fixed, but for tests its useful to find an open port port = 
shuffleServer.getPort(); diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 3c3c0d2d96a1c..8041d7a093da9 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 883b73a69c9de..e3a550288d71a 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/ApiSqlRootResource.scala b/common/tags/src/test/java/org/apache/spark/tags/ChromeUITest.java similarity index 72% rename from sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/ApiSqlRootResource.scala rename to common/tags/src/test/java/org/apache/spark/tags/ChromeUITest.java index 5fc7123c9097b..e3fed3d656d20 100644 --- a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/ApiSqlRootResource.scala +++ b/common/tags/src/test/java/org/apache/spark/tags/ChromeUITest.java @@ -15,15 +15,13 @@ * limitations under the License. */ -package org.apache.spark.status.api.v1.sql +package org.apache.spark.tags; -import javax.ws.rs.Path +import java.lang.annotation.*; -import org.apache.spark.status.api.v1.ApiRequestContext +import org.scalatest.TagAnnotation; -@Path("/v1") -private[v1] class ApiSqlRootResource extends ApiRequestContext { - - @Path("applications/{appId}/sql") - def sqlList(): Class[SqlResource] = classOf[SqlResource] -} +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface ChromeUITest { } diff --git a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/api.scala b/common/tags/src/test/java/org/apache/spark/tags/GitHubActionsUnstableTest.java similarity index 63% rename from sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/api.scala rename to common/tags/src/test/java/org/apache/spark/tags/GitHubActionsUnstableTest.java index 7ace66ffb06e1..a602656bb22f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/api.scala +++ b/common/tags/src/test/java/org/apache/spark/tags/GitHubActionsUnstableTest.java @@ -14,20 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.status.api.v1.sql -import java.util.Date +package org.apache.spark.tags; -class ExecutionData private[spark] ( - val id: Long, - val status: String, - val description: String, - val planDescription: String, - val metrics: Seq[Metrics], - val submissionTime: Date, - val duration: Long, - val runningJobIds: Seq[Int], - val successJobIds: Seq[Int], - val failedJobIds: Seq[Int]) +import org.scalatest.TagAnnotation; -case class Metrics private[spark] (metricName: String, metricValue: String) +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface GitHubActionsUnstableTest { } diff --git a/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java b/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java new file mode 100644 index 0000000000000..a7e6f352667d7 --- /dev/null +++ b/common/tags/src/test/java/org/apache/spark/tags/SlowHiveTest.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.tags; + +import org.scalatest.TagAnnotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface SlowHiveTest { } diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 93a4f67fd23f2..09965b573706c 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java index 84a0156ebfb66..0ae238564d591 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java @@ -19,15 +19,9 @@ public class DateTimeConstants { - public static final int YEARS_PER_DECADE = 10; - public static final int YEARS_PER_CENTURY = 100; - public static final int YEARS_PER_MILLENNIUM = 1000; - - public static final byte MONTHS_PER_QUARTER = 3; public static final int MONTHS_PER_YEAR = 12; public static final byte DAYS_PER_WEEK = 7; - public static final long DAYS_PER_MONTH = 30L; public static final long HOURS_PER_DAY = 24L; @@ -47,9 +41,6 @@ public class DateTimeConstants { public static final long MICROS_PER_MINUTE = SECONDS_PER_MINUTE * MICROS_PER_SECOND; public static final long MICROS_PER_HOUR = MINUTES_PER_HOUR * MICROS_PER_MINUTE; public static final long MICROS_PER_DAY = HOURS_PER_DAY * MICROS_PER_HOUR; - public static final long MICROS_PER_MONTH = DAYS_PER_MONTH * MICROS_PER_DAY; - /* 365.25 days per year assumes leap year every four years */ - public static final long MICROS_PER_YEAR = (36525L * MICROS_PER_DAY) / 100; public static final long NANOS_PER_MICROS = 1000L; public static final long NANOS_PER_MILLIS = MICROS_PER_MILLIS * NANOS_PER_MICROS; diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java index 546e8780a6606..d399e66aa2a5f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/UnsafeAlignedOffset.java @@ -28,12 +28,20 @@ public class UnsafeAlignedOffset { private static final int UAO_SIZE = Platform.unaligned() ? 4 : 8; + private static int TEST_UAO_SIZE = 0; + + // used for test only + public static void setUaoSize(int size) { + assert size == 0 || size == 4 || size == 8; + TEST_UAO_SIZE = size; + } + public static int getUaoSize() { - return UAO_SIZE; + return TEST_UAO_SIZE == 0 ? 
UAO_SIZE : TEST_UAO_SIZE; } public static int getSize(Object object, long offset) { - switch (UAO_SIZE) { + switch (getUaoSize()) { case 4: return Platform.getInt(object, offset); case 8: @@ -46,7 +54,7 @@ public static int getSize(Object object, long offset) { } public static void putSize(Object object, long offset, int value) { - switch (UAO_SIZE) { + switch (getUaoSize()) { case 4: Platform.putInt(object, offset, value); break; diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c5384669eb922..b8dda22240042 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -341,8 +341,17 @@ public UTF8String substringSQL(int pos, int length) { // to the -ith element before the end of the sequence. If a start index i is 0, it // refers to the first element. int len = numChars(); + // `len + pos` does not overflow as `len >= 0`. int start = (pos > 0) ? pos -1 : ((pos < 0) ? len + pos : 0); - int end = (length == Integer.MAX_VALUE) ? len : start + length; + + int end; + if ((long) start + length > Integer.MAX_VALUE) { + end = Integer.MAX_VALUE; + } else if ((long) start + length < Integer.MIN_VALUE) { + end = Integer.MIN_VALUE; + } else { + end = start + length; + } return substring(start, end); } @@ -554,7 +563,7 @@ public UTF8String trim() { } /** - * Trims whitespaces (<= ASCII 32) from both ends of this string. + * Trims whitespaces ({@literal <=} ASCII 32) from both ends of this string. * * Note that, this method is the same as java's {@link String#trim}, and different from * {@link UTF8String#trim()} which remove only spaces(= ASCII 32) from both ends. @@ -566,14 +575,14 @@ public UTF8String trim() { public UTF8String trimAll() { int s = 0; // skip all of the whitespaces (<=0x20) in the left side - while (s < this.numBytes && getByte(s) <= ' ') s++; + while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++; if (s == this.numBytes) { // Everything trimmed return EMPTY_UTF8; } // skip all of the whitespaces (<=0x20) in the right side int e = this.numBytes - 1; - while (e > s && getByte(e) <= ' ') e--; + while (e > s && Character.isWhitespace(getByte(e))) e--; if (s == 0 && e == numBytes - 1) { // Nothing trimmed return this; @@ -1105,12 +1114,16 @@ public static class IntWrapper implements Serializable { * @return true if the parsing was successful else false */ public boolean toLong(LongWrapper toLongResult) { + return toLong(toLongResult, true); + } + + private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) { int offset = 0; - while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++; if (offset == this.numBytes) return false; int end = this.numBytes - 1; - while (end > offset && getByte(end) <= ' ') end--; + while (end > offset && Character.isWhitespace(getByte(end))) end--; byte b = getByte(offset); final boolean negative = b == '-'; @@ -1129,7 +1142,7 @@ public boolean toLong(LongWrapper toLongResult) { while (offset <= end) { b = getByte(offset); offset++; - if (b == separator) { + if (b == separator && allowDecimal) { // We allow decimals and will return a truncated integral in that case. // Therefore we won't throw an exception here (checking the fractional // part happens below.) 
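[Aside, not part of the patch] The substringSQL change above computes `end` in 64-bit arithmetic and clamps it back into the int range, which is what keeps `start + length` from wrapping around when `length` is Integer.MAX_VALUE or a large negative value. A standalone sketch of the same clamp (class and method names are invented for the example):

public class ClampedAddExample {

  // Same idea as the new `end` computation in UTF8String.substringSQL: do the addition in
  // long so it cannot overflow, then clamp the result into [Integer.MIN_VALUE, Integer.MAX_VALUE].
  static int clampedAdd(int start, int length) {
    long end = (long) start + length;
    if (end > Integer.MAX_VALUE) return Integer.MAX_VALUE;
    if (end < Integer.MIN_VALUE) return Integer.MIN_VALUE;
    return (int) end;
  }

  public static void main(String[] args) {
    // Without the clamp, adding Integer.MAX_VALUE to a positive start would wrap to a
    // negative end index; with it, substringSQL(2, Integer.MAX_VALUE) on "example" can
    // still return "xample", as asserted in the UTF8StringSuite changes below.
    System.out.println(clampedAdd(1, Integer.MAX_VALUE));   // 2147483647
    System.out.println(clampedAdd(Integer.MIN_VALUE, -5));  // -2147483648
  }
}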
@@ -1198,12 +1211,16 @@ public boolean toLong(LongWrapper toLongResult) { * @return true if the parsing was successful else false */ public boolean toInt(IntWrapper intWrapper) { + return toInt(intWrapper, true); + } + + private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) { int offset = 0; - while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++; if (offset == this.numBytes) return false; int end = this.numBytes - 1; - while (end > offset && getByte(end) <= ' ') end--; + while (end > offset && Character.isWhitespace(getByte(end))) end--; byte b = getByte(offset); final boolean negative = b == '-'; @@ -1222,7 +1239,7 @@ public boolean toInt(IntWrapper intWrapper) { while (offset <= end) { b = getByte(offset); offset++; - if (b == separator) { + if (b == separator && allowDecimal) { // We allow decimals and will return a truncated integral in that case. // Therefore we won't throw an exception here (checking the fractional // part happens below.) @@ -1276,9 +1293,7 @@ public boolean toShort(IntWrapper intWrapper) { if (toInt(intWrapper)) { int intValue = intWrapper.value; short result = (short) intValue; - if (result == intValue) { - return true; - } + return result == intValue; } return false; } @@ -1287,9 +1302,7 @@ public boolean toByte(IntWrapper intWrapper) { if (toInt(intWrapper)) { int intValue = intWrapper.value; byte result = (byte) intValue; - if (result == intValue) { - return true; - } + return result == intValue; } return false; } @@ -1302,7 +1315,7 @@ public boolean toByte(IntWrapper intWrapper) { */ public long toLongExact() { LongWrapper result = new LongWrapper(); - if (toLong(result)) { + if (toLong(result, false)) { return result.value; } throw new NumberFormatException("invalid input syntax for type numeric: " + this); @@ -1316,7 +1329,7 @@ public long toLongExact() { */ public int toIntExact() { IntWrapper result = new IntWrapper(); - if (toInt(result)) { + if (toInt(result, false)) { return result.value; } throw new NumberFormatException("invalid input syntax for type numeric: " + this); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 8f933877f82e6..70e276f7e5a8b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -390,6 +390,10 @@ public void substringSQL() { assertEquals(fromString("example"), e.substringSQL(0, Integer.MAX_VALUE)); assertEquals(fromString("example"), e.substringSQL(1, Integer.MAX_VALUE)); assertEquals(fromString("xample"), e.substringSQL(2, Integer.MAX_VALUE)); + assertEquals(EMPTY_UTF8, e.substringSQL(-100, -100)); + assertEquals(EMPTY_UTF8, e.substringSQL(-1207959552, -1207959552)); + assertEquals(fromString("pl"), e.substringSQL(-3, 2)); + assertEquals(EMPTY_UTF8, e.substringSQL(Integer.MIN_VALUE, 6)); } @Test diff --git a/core/pom.xml b/core/pom.xml index 9d54d21b95ba3..19b5b30051602 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../pom.xml @@ -414,7 +414,7 @@ net.sf.py4j py4j - 0.10.8.1 + 0.10.9 org.apache.spark diff --git a/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java b/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java index 0c0d0df8ae682..1d676ff781c70 100644 --- 
a/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java +++ b/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java @@ -41,7 +41,7 @@ public interface DriverPlugin { * initialization. *

* It's recommended that plugins be careful about what operations are performed in this call, - * preferrably performing expensive operations in a separate thread, or postponing them until + * preferably performing expensive operations in a separate thread, or postponing them until * the application has fully started. * * @param sc The SparkContext loading the plugin. diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index d09282e61a9c7..3eee1e478616d 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -18,6 +18,7 @@ package org.apache.spark.shuffle.sort; import java.nio.channels.Channels; +import java.util.Arrays; import java.util.Optional; import javax.annotation.Nullable; import java.io.*; @@ -274,6 +275,8 @@ private long[] mergeSpills(SpillInfo[] spills) throws IOException { // Here, we don't need to perform any metrics updates because the bytes written to this // output file would have already been counted as shuffle bytes written. partitionLengths = spills[0].partitionLengths; + logger.debug("Merge shuffle spills for mapId {} with length {}", mapId, + partitionLengths.length); maybeSingleFileWriter.get().transferMapSpillFile(spills[0].file, partitionLengths); } else { partitionLengths = mergeSpillsUsingStandardWriter(spills); @@ -360,6 +363,7 @@ private void mergeSpillsWithFileStream( SpillInfo[] spills, ShuffleMapOutputWriter mapWriter, @Nullable CompressionCodec compressionCodec) throws IOException { + logger.debug("Merge shuffle spills with FileStream for mapId {}", mapId); final int numPartitions = partitioner.numPartitions(); final InputStream[] spillInputStreams = new InputStream[spills.length]; @@ -369,6 +373,11 @@ private void mergeSpillsWithFileStream( spillInputStreams[i] = new NioBufferedFileInputStream( spills[i].file, inputBufferSizeInBytes); + // Only convert the partitionLengths when debug level is enabled. + if (logger.isDebugEnabled()) { + logger.debug("Partition lengths for mapId {} in Spill {}: {}", mapId, i, + Arrays.toString(spills[i].partitionLengths)); + } } for (int partition = 0; partition < numPartitions; partition++) { boolean copyThrewException = true; @@ -431,6 +440,7 @@ private void mergeSpillsWithFileStream( private void mergeSpillsWithTransferTo( SpillInfo[] spills, ShuffleMapOutputWriter mapWriter) throws IOException { + logger.debug("Merge shuffle spills with TransferTo for mapId {}", mapId); final int numPartitions = partitioner.numPartitions(); final FileChannel[] spillInputChannels = new FileChannel[spills.length]; final long[] spillInputChannelPositions = new long[spills.length]; @@ -439,6 +449,11 @@ private void mergeSpillsWithTransferTo( try { for (int i = 0; i < spills.length; i++) { spillInputChannels[i] = new FileInputStream(spills[i].file).getChannel(); + // Only convert the partitionLengths when debug level is enabled. 
+ if (logger.isDebugEnabled()) { + logger.debug("Partition lengths for mapId {} in Spill {}: {}", mapId, i, + Arrays.toString(spills[i].partitionLengths)); + } } for (int partition = 0; partition < numPartitions; partition++) { boolean copyThrewException = true; diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java index a6529fd76188a..1c3eb34f9be37 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java @@ -112,6 +112,8 @@ public long[] commitAllPartitions() throws IOException { } cleanUp(); File resolvedTmp = outputTempFile != null && outputTempFile.isFile() ? outputTempFile : null; + log.debug("Writing shuffle index file for mapId {} with length {}", mapId, + partitionLengths.length); blockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, resolvedTmp); return partitionLengths; } @@ -210,14 +212,14 @@ public long getNumBytesWritten() { private class PartitionWriterStream extends OutputStream { private final int partitionId; - private int count = 0; + private long count = 0; private boolean isClosed = false; PartitionWriterStream(int partitionId) { this.partitionId = partitionId; } - public int getCount() { + public long getCount() { return count; } diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 7bdd89488d119..888cc8c3a6cd6 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -54,13 +54,13 @@ * probably be using sorting instead of hashing for better cache locality. * * The key and values under the hood are stored together, in the following format: - * Bytes 0 to 4: len(k) (key length in bytes) + len(v) (value length in bytes) + 4 - * Bytes 4 to 8: len(k) - * Bytes 8 to 8 + len(k): key data - * Bytes 8 + len(k) to 8 + len(k) + len(v): value data - * Bytes 8 + len(k) + len(v) to 8 + len(k) + len(v) + 8: pointer to next pair + * First uaoSize bytes: len(k) (key length in bytes) + len(v) (value length in bytes) + uaoSize + * Next uaoSize bytes: len(k) + * Next len(k) bytes: key data + * Next len(v) bytes: value data + * Last 8 bytes: pointer to next pair * - * This means that the first four bytes store the entire record (key + value) length. This format + * It means first uaoSize bytes store the entire record (key + value + uaoSize) length. This format * is compatible with {@link org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter}, * so we can pass records from this map directly into the sorter to sort records in place. */ @@ -96,8 +96,7 @@ public final class BytesToBytesMap extends MemoryConsumer { * since that's the largest power-of-2 that's less than Integer.MAX_VALUE. We need two long array * entries per key, giving us a maximum capacity of (1 << 29). */ - @VisibleForTesting - static final int MAX_CAPACITY = (1 << 29); + public static final int MAX_CAPACITY = (1 << 29); // This choice of page table size and page size means that we can address up to 500 gigabytes // of memory. 
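A minimal sketch of the record layout described in the BytesToBytesMap comment above, assuming a standalone class with hypothetical names (RecordLayoutSketch is illustration only, not Spark code and not part of this patch); uaoSize is 4 or 8 depending on UnsafeAlignedOffset:

final class RecordLayoutSketch {
  // Per-record footprint in a data page:
  // [uaoSize: total len][uaoSize: key len][key bytes][value bytes][8-byte pointer to next pair]
  static long recordLength(int uaoSize, int keyLen, int valueLen) {
    return (2L * uaoSize) + keyLen + valueLen + 8;
  }

  public static void main(String[] args) {
    int uaoSize = 4;                 // assumption: 4-byte aligned offsets
    int keyLen = 16, valueLen = 24;  // arbitrary example sizes
    // The first uaoSize bytes store len(k) + len(v) + uaoSize, as the comment above notes.
    int firstWord = keyLen + valueLen + uaoSize;
    System.out.println("first word = " + firstWord + ", full record = "
        + recordLength(uaoSize, keyLen, valueLen) + " bytes");
  }
}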
@@ -394,10 +393,12 @@ public void remove() { } private void handleFailedDelete() { - // remove the spill file from disk - File file = spillWriters.removeFirst().getFile(); - if (file != null && file.exists() && !file.delete()) { - logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); + if (spillWriters.size() > 0) { + // remove the spill file from disk + File file = spillWriters.removeFirst().getFile(); + if (file != null && file.exists() && !file.delete()) { + logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); + } } } } @@ -407,17 +408,10 @@ private void handleFailedDelete() { * * For efficiency, all calls to `next()` will return the same {@link Location} object. * - * If any other lookups or operations are performed on this map while iterating over it, including - * `lookup()`, the behavior of the returned iterator is undefined. + * The returned iterator is thread-safe. However if the map is modified while iterating over it, + * the behavior of the returned iterator is undefined. */ public MapIterator iterator() { - return new MapIterator(numValues, loc, false); - } - - /** - * Returns a thread safe iterator that iterates of the entries of this map. - */ - public MapIterator safeIterator() { return new MapIterator(numValues, new Location(), false); } @@ -428,19 +422,20 @@ public MapIterator safeIterator() { * * For efficiency, all calls to `next()` will return the same {@link Location} object. * - * If any other lookups or operations are performed on this map while iterating over it, including - * `lookup()`, the behavior of the returned iterator is undefined. + * The returned iterator is thread-safe. However if the map is modified while iterating over it, + * the behavior of the returned iterator is undefined. */ public MapIterator destructiveIterator() { updatePeakMemoryUsed(); - return new MapIterator(numValues, loc, true); + return new MapIterator(numValues, new Location(), true); } /** * Looks up a key, and return a {@link Location} handle that can be used to test existence * and read/write values. * - * This function always return the same {@link Location} instance to avoid object allocation. + * This function always returns the same {@link Location} instance to avoid object allocation. + * This function is not thread-safe. */ public Location lookup(Object keyBase, long keyOffset, int keyLength) { safeLookup(keyBase, keyOffset, keyLength, loc, @@ -452,7 +447,8 @@ public Location lookup(Object keyBase, long keyOffset, int keyLength) { * Looks up a key, and return a {@link Location} handle that can be used to test existence * and read/write values. * - * This function always return the same {@link Location} instance to avoid object allocation. + * This function always returns the same {@link Location} instance to avoid object allocation. + * This function is not thread-safe. */ public Location lookup(Object keyBase, long keyOffset, int keyLength, int hash) { safeLookup(keyBase, keyOffset, keyLength, loc, hash); @@ -707,7 +703,7 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff // Here, we'll copy the data into our data pages. Because we only store a relative offset from // the key address instead of storing the absolute address of the value, the key and value // must be stored in the same memory page. 
- // (8 byte key length) (key) (value) (8 byte pointer to next value) + // (total length) (key length) (key) (value) (8 byte pointer to next value) int uaoSize = UnsafeAlignedOffset.getUaoSize(); final long recordLength = (2L * uaoSize) + klen + vlen + 8; if (currentPage == null || currentPage.size() - pageCursor < recordLength) { @@ -744,12 +740,21 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff longArray.set(pos * 2 + 1, keyHashcode); isDefined = true; - // We use two array entries per key, so the array size is twice the capacity. - // We should compare the current capacity of the array, instead of its size. - if (numKeys >= growthThreshold && longArray.size() / 2 < MAX_CAPACITY) { - try { - growAndRehash(); - } catch (SparkOutOfMemoryError oom) { + // If the map has reached its growth threshold, try to grow it. + if (numKeys >= growthThreshold) { + // We use two array entries per key, so the array size is twice the capacity. + // We should compare the current capacity of the array, instead of its size. + if (longArray.size() / 2 < MAX_CAPACITY) { + try { + growAndRehash(); + } catch (SparkOutOfMemoryError oom) { + canGrowArray = false; + } + } else { + // The map is already at MAX_CAPACITY and cannot grow. Instead, we prevent it from + // accepting any more new elements to make sure we don't exceed the load factor. If we + // need to spill later, this allows UnsafeKVExternalSorter to reuse the array for + // sorting. canGrowArray = false; } } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 55e4e609c3c7b..e4e369baf9dfa 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -104,11 +104,14 @@ public static UnsafeExternalSorter createWithExistingInMemorySorter( int initialSize, long pageSizeBytes, int numElementsForSpillThreshold, - UnsafeInMemorySorter inMemorySorter) throws IOException { + UnsafeInMemorySorter inMemorySorter, + long existingMemoryConsumption) throws IOException { UnsafeExternalSorter sorter = new UnsafeExternalSorter(taskMemoryManager, blockManager, serializerManager, taskContext, recordComparatorSupplier, prefixComparator, initialSize, pageSizeBytes, numElementsForSpillThreshold, inMemorySorter, false /* ignored */); sorter.spill(Long.MAX_VALUE, sorter); + taskContext.taskMetrics().incMemoryBytesSpilled(existingMemoryConsumption); + sorter.totalSpillBytes += existingMemoryConsumption; // The external sorter will be used to insert records, in-memory sorter is not needed. sorter.inMemSorter = null; return sorter; @@ -203,6 +206,10 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { } if (inMemSorter == null || inMemSorter.numRecords() <= 0) { + // There could still be some memory allocated when there are no records in the in-memory + // sorter. We will not spill it however, to ensure that we can always process at least one + // record before spilling. See the comments in `allocateMemoryForRecordIfNecessary` for why + // this is necessary. 
return 0L; } @@ -224,7 +231,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { // Note that this is more-or-less going to be a multiple of the page size, so wasted space in // pages will currently be counted as memory spilled even though that space isn't actually // written to disk. This also counts the space needed to store the sorter's pointer array. - inMemSorter.reset(); + inMemSorter.freeMemory(); // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the // records. Otherwise, if the task is over allocated memory, then without freeing the memory // pages, we might not be able to get memory for the pointer array. @@ -325,7 +332,7 @@ public void cleanupResources() { deleteSpillFiles(); freeMemory(); if (inMemSorter != null) { - inMemSorter.free(); + inMemSorter.freeMemory(); inMemSorter = null; } } @@ -339,40 +346,53 @@ public void cleanupResources() { private void growPointerArrayIfNecessary() throws IOException { assert(inMemSorter != null); if (!inMemSorter.hasSpaceForAnotherRecord()) { + if (inMemSorter.numRecords() <= 0) { + // Spilling was triggered just before this method was called. The pointer array was freed + // during the spill, so a new pointer array needs to be allocated here. + LongArray array = allocateArray(inMemSorter.getInitialSize()); + inMemSorter.expandPointerArray(array); + return; + } + long used = inMemSorter.getMemoryUsage(); - LongArray array; + LongArray array = null; try { // could trigger spilling array = allocateArray(used / 8 * 2); } catch (TooLargePageException e) { // The pointer array is too big to fix in a single page, spill. spill(); - return; } catch (SparkOutOfMemoryError e) { - // should have trigger spilling - if (!inMemSorter.hasSpaceForAnotherRecord()) { + if (inMemSorter.numRecords() > 0) { logger.error("Unable to grow the pointer array"); throw e; } - return; + // The new array could not be allocated, but that is not an issue as it is longer needed, + // as all records were spilled. } - // check if spilling is triggered or not - if (inMemSorter.hasSpaceForAnotherRecord()) { - freeArray(array); - } else { - inMemSorter.expandPointerArray(array); + + if (inMemSorter.numRecords() <= 0) { + // Spilling was triggered while trying to allocate the new array. + if (array != null) { + // We succeeded in allocating the new array, but, since all records were spilled, a + // smaller array would also suffice. + freeArray(array); + } + // The pointer array was freed during the spill, so a new pointer array needs to be + // allocated here. + array = allocateArray(inMemSorter.getInitialSize()); } + inMemSorter.expandPointerArray(array); } } /** - * Allocates more memory in order to insert an additional record. This will request additional - * memory from the memory manager and spill if the requested memory can not be obtained. + * Allocates an additional page in order to insert an additional record. This will request + * additional memory from the memory manager and spill if the requested memory can not be + * obtained. * * @param required the required space in the data page, in bytes, including space for storing - * the record size. This must be less than or equal to the page size (records - * that exceed the page size are handled via a different code path which uses - * special overflow pages). + * the record size. 
*/ private void acquireNewPageIfNecessary(int required) { if (currentPage == null || @@ -384,6 +404,37 @@ private void acquireNewPageIfNecessary(int required) { } } + /** + * Allocates more memory in order to insert an additional record. This will request additional + * memory from the memory manager and spill if the requested memory can not be obtained. + * + * @param required the required space in the data page, in bytes, including space for storing + * the record size. + */ + private void allocateMemoryForRecordIfNecessary(int required) throws IOException { + // Step 1: + // Ensure that the pointer array has space for another record. This may cause a spill. + growPointerArrayIfNecessary(); + // Step 2: + // Ensure that the last page has space for another record. This may cause a spill. + acquireNewPageIfNecessary(required); + // Step 3: + // The allocation in step 2 could have caused a spill, which would have freed the pointer + // array allocated in step 1. Therefore we need to check again whether we have to allocate + // a new pointer array. + // + // If the allocation in this step causes a spill event then it will not cause the page + // allocated in the previous step to be freed. The function `spill` only frees memory if at + // least one record has been inserted in the in-memory sorter. This will not be the case if + // we have spilled in the previous step. + // + // If we did not spill in the previous step then `growPointerArrayIfNecessary` will be a + // no-op that does not allocate any memory, and therefore can't cause a spill event. + // + // Thus there is no need to call `acquireNewPageIfNecessary` again after this step. + growPointerArrayIfNecessary(); + } + /** * Write a record to the sorter. */ @@ -398,11 +449,10 @@ public void insertRecord( spill(); } - growPointerArrayIfNecessary(); - int uaoSize = UnsafeAlignedOffset.getUaoSize(); + final int uaoSize = UnsafeAlignedOffset.getUaoSize(); // Need 4 or 8 bytes to store the record length. final int required = length + uaoSize; - acquireNewPageIfNecessary(required); + allocateMemoryForRecordIfNecessary(required); final Object base = currentPage.getBaseObject(); final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor); @@ -425,10 +475,9 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen, Object valueBase, long valueOffset, int valueLen, long prefix, boolean prefixIsNull) throws IOException { - growPointerArrayIfNecessary(); - int uaoSize = UnsafeAlignedOffset.getUaoSize(); + final int uaoSize = UnsafeAlignedOffset.getUaoSize(); final int required = keyLen + valueLen + (2 * uaoSize); - acquireNewPageIfNecessary(required); + allocateMemoryForRecordIfNecessary(required); final Object base = currentPage.getBaseObject(); final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor); @@ -450,6 +499,7 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen, */ public void merge(UnsafeExternalSorter other) throws IOException { other.spill(); + totalSpillBytes += other.totalSpillBytes; spillWriters.addAll(other.spillWriters); // remove them from `spillWriters`, or the files will be deleted in `cleanupResources`. 
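A minimal, self-contained sketch of the allocation ordering that allocateMemoryForRecordIfNecessary above relies on: grow the pointer array, acquire a page (which may spill and free that array), then grow the pointer array again. Every name below is a hypothetical stand-in, not a Spark API and not part of this patch:

final class AllocationOrderSketch {
  private boolean pointerArrayAllocated = false;
  private boolean pageAllocated = false;

  private void growPointerArrayIfNecessary() {
    if (!pointerArrayAllocated) {
      pointerArrayAllocated = true;    // in the real sorter this step may itself spill
    }
  }

  private void acquireNewPageIfNecessary(int required) {
    if (!pageAllocated) {
      // Simulate the page allocation triggering a spill that frees the pointer array;
      // this is why the caller must re-check the array afterwards.
      pointerArrayAllocated = false;
      pageAllocated = true;
    }
  }

  void allocateMemoryForRecordIfNecessary(int required) {
    growPointerArrayIfNecessary();        // step 1: may spill
    acquireNewPageIfNecessary(required);  // step 2: may spill and free the array from step 1
    growPointerArrayIfNecessary();        // step 3: re-allocate the array if step 2 freed it
  }

  public static void main(String[] args) {
    AllocationOrderSketch sorter = new AllocationOrderSketch();
    sorter.allocateMemoryForRecordIfNecessary(64);
    System.out.println("pointer array: " + sorter.pointerArrayAllocated
        + ", page: " + sorter.pageAllocated);
  }
}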
other.spillWriters.clear(); @@ -501,11 +551,15 @@ private static void spillIterator(UnsafeSorterIterator inMemIterator, */ class SpillableIterator extends UnsafeSorterIterator { private UnsafeSorterIterator upstream; - private UnsafeSorterIterator nextUpstream = null; private MemoryBlock lastPage = null; private boolean loaded = false; private int numRecords = 0; + private Object currentBaseObject; + private long currentBaseOffset; + private int currentRecordLength; + private long currentKeyPrefix; + SpillableIterator(UnsafeSorterIterator inMemIterator) { this.upstream = inMemIterator; this.numRecords = inMemIterator.getNumRecords(); @@ -516,23 +570,26 @@ public int getNumRecords() { return numRecords; } + @Override + public long getCurrentPageNumber() { + throw new UnsupportedOperationException(); + } + public long spill() throws IOException { synchronized (this) { - if (!(upstream instanceof UnsafeInMemorySorter.SortedIterator && nextUpstream == null - && numRecords > 0)) { + if (inMemSorter == null || numRecords <= 0) { return 0L; } - UnsafeInMemorySorter.SortedIterator inMemIterator = - ((UnsafeInMemorySorter.SortedIterator) upstream).clone(); + long currentPageNumber = upstream.getCurrentPageNumber(); - ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); + ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); // Iterate over the records that have not been returned and spill them. final UnsafeSorterSpillWriter spillWriter = new UnsafeSorterSpillWriter(blockManager, fileBufferSizeBytes, writeMetrics, numRecords); - spillIterator(inMemIterator, spillWriter); + spillIterator(upstream, spillWriter); spillWriters.add(spillWriter); - nextUpstream = spillWriter.getReader(serializerManager); + upstream = spillWriter.getReader(serializerManager); long released = 0L; synchronized (UnsafeExternalSorter.this) { @@ -540,8 +597,7 @@ public long spill() throws IOException { // is accessing the current record. We free this page in that caller's next loadNext() // call. for (MemoryBlock page : allocatedPages) { - if (!loaded || page.pageNumber != - ((UnsafeInMemorySorter.SortedIterator)upstream).getCurrentPageNumber()) { + if (!loaded || page.pageNumber != currentPageNumber) { released += page.size(); freePage(page); } else { @@ -555,7 +611,7 @@ public long spill() throws IOException { assert(inMemSorter != null); released += inMemSorter.getMemoryUsage(); totalSortTimeNanos += inMemSorter.getSortTimeNanos(); - inMemSorter.free(); + inMemSorter.freeMemory(); inMemSorter = null; taskContext.taskMetrics().incMemoryBytesSpilled(released); taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); @@ -575,22 +631,26 @@ public void loadNext() throws IOException { try { synchronized (this) { loaded = true; - if (nextUpstream != null) { - // Just consumed the last record from in memory iterator - if(lastPage != null) { - // Do not free the page here, while we are locking `SpillableIterator`. The `freePage` - // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in - // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and - // `SpillableIterator` in sequence, which may happen in - // `TaskMemoryManager.acquireExecutionMemory`. - pageToFree = lastPage; - lastPage = null; - } - upstream = nextUpstream; - nextUpstream = null; + // Just consumed the last record from in memory iterator + if (lastPage != null) { + // Do not free the page here, while we are locking `SpillableIterator`. 
The `freePage` + // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in + // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and + // `SpillableIterator` in sequence, which may happen in + // `TaskMemoryManager.acquireExecutionMemory`. + pageToFree = lastPage; + lastPage = null; } numRecords--; upstream.loadNext(); + + // Keep track of the current base object, base offset, record length, and key prefix, + // so that the current record can still be read in case a spill is triggered and we + // switch to the spill writer's iterator. + currentBaseObject = upstream.getBaseObject(); + currentBaseOffset = upstream.getBaseOffset(); + currentRecordLength = upstream.getRecordLength(); + currentKeyPrefix = upstream.getKeyPrefix(); } } finally { if (pageToFree != null) { @@ -601,22 +661,22 @@ public void loadNext() throws IOException { @Override public Object getBaseObject() { - return upstream.getBaseObject(); + return currentBaseObject; } @Override public long getBaseOffset() { - return upstream.getBaseOffset(); + return currentBaseOffset; } @Override public int getRecordLength() { - return upstream.getRecordLength(); + return currentRecordLength; } @Override public long getKeyPrefix() { - return upstream.getKeyPrefix(); + return currentKeyPrefix; } } @@ -646,7 +706,7 @@ public UnsafeSorterIterator getIterator(int startIndex) throws IOException { } i += spillWriter.recordsSpilled(); } - if (inMemSorter != null) { + if (inMemSorter != null && inMemSorter.numRecords() > 0) { UnsafeSorterIterator iter = inMemSorter.getSortedIterator(); moveOver(iter, startIndex - i); queue.add(iter); @@ -693,6 +753,11 @@ public int getNumRecords() { return numRecords; } + @Override + public long getCurrentPageNumber() { + return current.getCurrentPageNumber(); + } + @Override public boolean hasNext() { while (!current.hasNext() && !iterators.isEmpty()) { diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index e14964d68119b..33be899b6b438 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -159,32 +159,26 @@ private int getUsableCapacity() { return (int) (array.size() / (radixSortSupport != null ? 2 : 1.5)); } + public long getInitialSize() { + return initialSize; + } + /** * Free the memory used by pointer array. */ - public void free() { + public void freeMemory() { if (consumer != null) { if (array != null) { consumer.freeArray(array); } - array = null; - } - } - public void reset() { - if (consumer != null) { - consumer.freeArray(array); - // the call to consumer.allocateArray may trigger a spill which in turn access this instance - // and eventually re-enter this method and try to free the array again. by setting the array - // to null and its length to 0 we effectively make the spill code-path a no-op. setting the - // array to null also indicates that it has already been de-allocated which prevents a double - // de-allocation in free(). + // Set the array to null instead of allocating a new array. Allocating an array could have + // triggered another spill and this method already is called from UnsafeExternalSorter when + // spilling. 
Attempting to allocate while spilling is dangerous, as we could be holding onto + // a large partially complete allocation, which may prevent other memory from being allocated. + // Instead we will allocate the new array when it is necessary. array = null; usableCapacity = 0; - pos = 0; - nullBoundaryPos = 0; - array = consumer.allocateArray(initialSize); - usableCapacity = getUsableCapacity(); } pos = 0; nullBoundaryPos = 0; @@ -217,25 +211,27 @@ public boolean hasSpaceForAnotherRecord() { } public void expandPointerArray(LongArray newArray) { - if (newArray.size() < array.size()) { - // checkstyle.off: RegexpSinglelineJava - throw new SparkOutOfMemoryError("Not enough memory to grow pointer array"); - // checkstyle.on: RegexpSinglelineJava + if (array != null) { + if (newArray.size() < array.size()) { + // checkstyle.off: RegexpSinglelineJava + throw new SparkOutOfMemoryError("Not enough memory to grow pointer array"); + // checkstyle.on: RegexpSinglelineJava + } + Platform.copyMemory( + array.getBaseObject(), + array.getBaseOffset(), + newArray.getBaseObject(), + newArray.getBaseOffset(), + pos * 8L); + consumer.freeArray(array); } - Platform.copyMemory( - array.getBaseObject(), - array.getBaseOffset(), - newArray.getBaseObject(), - newArray.getBaseOffset(), - pos * 8L); - consumer.freeArray(array); array = newArray; usableCapacity = getUsableCapacity(); } /** * Inserts a record to be sorted. Assumes that the record pointer points to a record length - * stored as a 4-byte integer, followed by the record's bytes. + * stored as a uaoSize(4 or 8) bytes integer, followed by the record's bytes. * * @param recordPointer pointer to a record in a data page, encoded by {@link TaskMemoryManager}. * @param keyPrefix a user-defined key prefix @@ -330,6 +326,7 @@ public void loadNext() { @Override public long getBaseOffset() { return baseOffset; } + @Override public long getCurrentPageNumber() { return currentPageNumber; } @@ -346,6 +343,11 @@ public long getCurrentPageNumber() { * {@code next()} will return the same mutable object. */ public UnsafeSorterIterator getSortedIterator() { + if (numRecords() == 0) { + // `array` might be null, so make sure that it is not accessed by returning early. 
+ return new SortedIterator(0, 0); + } + int offset = 0; long start = System.nanoTime(); if (sortComparator != null) { diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java index 1b3167fcc250c..d9f22311d07c2 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterIterator.java @@ -34,4 +34,6 @@ public abstract class UnsafeSorterIterator { public abstract long getKeyPrefix(); public abstract int getNumRecords(); + + public abstract long getCurrentPageNumber(); } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java index ab800288dcb43..f8603c5799e9b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillMerger.java @@ -70,6 +70,11 @@ public int getNumRecords() { return numRecords; } + @Override + public long getCurrentPageNumber() { + throw new UnsupportedOperationException(); + } + @Override public boolean hasNext() { return !priorityQueue.isEmpty() || (spillReader != null && spillReader.hasNext()); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java index a524c4790407d..db79efd008530 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java @@ -89,6 +89,11 @@ public int getNumRecords() { return numRecords; } + @Override + public long getCurrentPageNumber() { + throw new UnsupportedOperationException(); + } + @Override public boolean hasNext() { return (numRecordsRemaining > 0); diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html index 33eb7bfa671bc..7e9927d40c191 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-template.html @@ -77,12 +77,12 @@ {{#applications}} + {{#attempts}} {{version}} - {{id}} + {{id}} {{name}} - {{#attempts}} {{#hasMultipleAttempts}} - {{attemptId}} + {{attemptId}} {{/hasMultipleAttempts}} {{startTime}} {{#showCompletedColumns}} diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index 4df5f07f077d7..3a4c815029f6d 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -130,7 +130,7 @@ $(document).ready(function() { if (app["attempts"].length > 1) { hasMultipleAttempts = true; } - var num = app["attempts"].length; + for (var j in app["attempts"]) { var attempt = app["attempts"][j]; attempt["startTime"] = formatTimeMillis(attempt["startTimeEpoch"]); @@ -140,7 +140,8 @@ $(document).ready(function() { (attempt.hasOwnProperty("attemptId") ? 
attempt["attemptId"] + "/" : "") + "logs"; attempt["durationMillisec"] = attempt["duration"]; attempt["duration"] = formatDuration(attempt["duration"]); - var app_clone = {"id" : id, "name" : name, "version": version, "num" : num, "attempts" : [attempt]}; + var hasAttemptId = attempt.hasOwnProperty("attemptId"); + var app_clone = {"id" : id, "name" : name, "version": version, "hasAttemptId" : hasAttemptId, "attempts" : [attempt]}; array.push(app_clone); } } diff --git a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js index ecd580e5c64aa..3f98a0379dc3c 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js +++ b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js @@ -99,12 +99,12 @@ sorttable = { 'sorttable_sorted_reverse'); rowlists = this.parentNode.getElementsByTagName("span"); for (var j=0; j < rowlists.length; j++) { - if (rowlists[j].className.search(/\bsorttable_sortfwdind\b/)) { + if (rowlists[j].className.search(/\bsorttable_sortfwdind\b/) != -1) { rowlists[j].parentNode.removeChild(rowlists[j]); } } sortrevind = document.createElement('span'); - sortrevind.class = "sorttable_sortrevind"; + sortrevind.className = "sorttable_sortrevind"; sortrevind.innerHTML = stIsIE ? ' 5' : ' ▾'; this.appendChild(sortrevind); return; @@ -117,12 +117,12 @@ sorttable = { 'sorttable_sorted'); rowlists = this.parentNode.getElementsByTagName("span"); for (var j=0; j < rowlists.length; j++) { - if (rowlists[j].className.search(/\sorttable_sortrevind\b/)) { + if (rowlists[j].className.search(/\bsorttable_sortrevind\b/) != -1) { rowlists[j].parentNode.removeChild(rowlists[j]); } } sortfwdind = document.createElement('span'); - sortfwdind.class = "sorttable_sortfwdind"; + sortfwdind.className = "sorttable_sortfwdind"; sortfwdind.innerHTML = stIsIE ? ' 6' : ' ▴'; this.appendChild(sortfwdind); return; @@ -138,15 +138,15 @@ sorttable = { }); rowlists = this.parentNode.getElementsByTagName("span"); for (var j=0; j < rowlists.length; j++) { - if (rowlists[j].className.search(/\bsorttable_sortfwdind\b/) - || rowlists[j].className.search(/\sorttable_sortrevind\b/) ) { + if (rowlists[j].className.search(/\bsorttable_sortfwdind\b/) != -1 + || rowlists[j].className.search(/\bsorttable_sortrevind\b/) != -1) { rowlists[j].parentNode.removeChild(rowlists[j]); } } this.className += ' sorttable_sorted'; sortfwdind = document.createElement('span'); - sortfwdind.class = "sorttable_sortfwdind"; + sortfwdind.className = "sorttable_sortfwdind"; sortfwdind.innerHTML = stIsIE ? 
' 6' : ' ▴'; this.appendChild(sortfwdind); diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css index 1fbc90b832bc9..e44a724c202f7 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.css @@ -22,6 +22,7 @@ #dag-viz-graph .label { font-weight: normal; text-shadow: none; + color: #333; } #dag-viz-graph svg path { diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 25dec9d3788ba..474c453643365 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -173,9 +173,11 @@ function renderDagViz(forJob) { }); metadataContainer().selectAll(".barrier-rdd").each(function() { - var rddId = d3.select(this).text().trim() - var clusterId = VizConstants.clusterPrefix + rddId - svg.selectAll("g." + clusterId).classed("barrier", true) + var opId = d3.select(this).text().trim(); + var opClusterId = VizConstants.clusterPrefix + opId; + var stageId = $(this).parents(".stage-metadata").attr("stage-id"); + var stageClusterId = VizConstants.graphPrefix + stageId; + svg.selectAll("g[id=" + stageClusterId + "] g." + opClusterId).classed("barrier", true) }); resizeSvg(svg); @@ -216,7 +218,7 @@ function renderDagVizForJob(svgContainer) { var dot = metadata.select(".dot-file").text(); var stageId = metadata.attr("stage-id"); var containerId = VizConstants.graphPrefix + stageId; - var isSkipped = metadata.attr("skipped") == "true"; + var isSkipped = metadata.attr("skipped") === "true"; var container; if (isSkipped) { container = svgContainer @@ -225,11 +227,8 @@ function renderDagVizForJob(svgContainer) { .attr("skipped", "true"); } else { // Link each graph to the corresponding stage page (TODO: handle stage attempts) - // Use the link from the stage table so it also works for the history server var attemptId = 0; - var stageLink = d3.select("#stage-" + stageId + "-" + attemptId) - .select("a.name-link") - .attr("href"); + var stageLink = uiRoot + appBasePath + "/stages/stage/?id=" + stageId + "&attempt=" + attemptId; container = svgContainer .append("a") .attr("xlink:href", stageLink) @@ -282,11 +281,7 @@ function renderDagVizForJob(svgContainer) { /* Render the dot file as an SVG in the given container. */ function renderDot(dot, container, forJob) { - var escaped_dot = dot - .replace(/</g, "<") - .replace(/>/g, ">") - .replace(/"/g, "\""); - var g = graphlibDot.read(escaped_dot); + var g = graphlibDot.read(dot); var renderer = new dagreD3.render(); preprocessGraphLayout(g, forJob); renderer(container, g); @@ -498,18 +493,11 @@ function connectRDDs(fromRDDId, toRDDId, edgesContainer, svgContainer) { edgesContainer.append("path").datum(points).attr("d", line); } -/* - * Replace `/n` with `
<br/>` - */ -function replaceLineBreak(str) { - return str.replace("\\n", "<br/>
"); -} - /* (Job page only) Helper function to add tooltips for RDDs. */ function addTooltipsForRDDs(svgContainer) { svgContainer.selectAll("g.node").each(function() { var node = d3.select(this); - var tooltipText = replaceLineBreak(node.attr("name")); + var tooltipText = node.attr("name"); if (tooltipText) { node.select("circle") .attr("data-toggle", "tooltip") diff --git a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js index ee2b7b353d62e..67d6d741e5537 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js @@ -70,7 +70,7 @@ function stageEndPoint(appId) { return newBaseURI + "/api/v1/applications/" + appId + "/" + appAttemptId + "/stages/" + stageId; } } - return location.origin + "/api/v1/applications/" + appId + "/stages/" + stageId; + return uiRoot + "/api/v1/applications/" + appId + "/stages/" + stageId; } function getColumnNameForTaskMetricSummary(columnKey) { @@ -821,7 +821,8 @@ $(document).ready(function () { }, { data : function (row, type) { - if (row.taskMetrics && row.taskMetrics.shuffleReadMetrics && row.taskMetrics.shuffleReadMetrics.localBytesRead > 0) { + if (row.taskMetrics && row.taskMetrics.shuffleReadMetrics && + (row.taskMetrics.shuffleReadMetrics.localBytesRead > 0 || row.taskMetrics.shuffleReadMetrics.remoteBytesRead > 0)) { var totalBytesRead = parseInt(row.taskMetrics.shuffleReadMetrics.localBytesRead) + parseInt(row.taskMetrics.shuffleReadMetrics.remoteBytesRead); if (type === 'display') { return formatBytes(totalBytesRead, type) + " / " + row.taskMetrics.shuffleReadMetrics.recordsRead; diff --git a/core/src/main/resources/org/apache/spark/ui/static/streaming-page.js b/core/src/main/resources/org/apache/spark/ui/static/streaming-page.js index 5b75bc3011b6d..9a01c3ba83696 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/streaming-page.js +++ b/core/src/main/resources/org/apache/spark/ui/static/streaming-page.js @@ -33,6 +33,8 @@ var yValueFormat = d3.format(",.2f"); var unitLabelYOffset = -10; +var onClickTimeline = function() {}; + // Show a tooltip "text" for "node" function showBootstrapTooltip(node, text) { $(node).tooltip({title: text, trigger: "manual", container: "body"}); @@ -44,6 +46,45 @@ function hideBootstrapTooltip(node) { $(node).tooltip("destroy"); } +// Return the function to scroll to the corresponding +// row on clicking a point of batch in the timeline. +function getOnClickTimelineFunction() { + // If the user click one point in the graphs, jump to the batch row and highlight it. And + // recovery the batch row after 3 seconds if necessary. + // We need to remember the last clicked batch so that we can recovery it. + var lastClickedBatch = null; + var lastTimeout = null; + + return function(d) { + var batchSelector = $("#batch-" + d.x); + // If there is a corresponding batch row, scroll down to it and highlight it. 
+ if (batchSelector.length > 0) { + if (lastTimeout != null) { + window.clearTimeout(lastTimeout); + } + if (lastClickedBatch != null) { + clearBatchRow(lastClickedBatch); + lastClickedBatch = null; + } + lastClickedBatch = d.x; + highlightBatchRow(lastClickedBatch); + lastTimeout = window.setTimeout(function () { + lastTimeout = null; + if (lastClickedBatch != null) { + clearBatchRow(lastClickedBatch); + lastClickedBatch = null; + } + }, 3000); // Clean up after 3 seconds + + var topOffset = batchSelector.offset().top - 15; + if (topOffset < 0) { + topOffset = 0; + } + $('html,body').animate({scrollTop: topOffset}, 200); + } + } +} + // Register a timeline graph. All timeline graphs should be register before calling any // "drawTimeline" so that we can determine the max margin left for all timeline graphs. function registerTimeline(minY, maxY) { @@ -171,7 +212,7 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) { .attr("cy", function(d) { return y(d.y); }) .attr("r", function(d) { return isFailedBatch(d.x) ? "2" : "3";}) .on('mouseover', function(d) { - var tip = formatYValue(d.y) + " " + unitY + " at " + timeFormat[d.x]; + var tip = formatYValue(d.y) + " " + unitY + " at " + timeTipStrings[d.x]; showBootstrapTooltip(d3.select(this).node(), tip); // show the point d3.select(this) @@ -189,31 +230,7 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) { .attr("opacity", function(d) { return isFailedBatch(d.x) ? "1" : "0";}) .attr("r", function(d) { return isFailedBatch(d.x) ? "2" : "3";}); }) - .on("click", function(d) { - if (lastTimeout != null) { - window.clearTimeout(lastTimeout); - } - if (lastClickedBatch != null) { - clearBatchRow(lastClickedBatch); - lastClickedBatch = null; - } - lastClickedBatch = d.x; - highlightBatchRow(lastClickedBatch); - lastTimeout = window.setTimeout(function () { - lastTimeout = null; - if (lastClickedBatch != null) { - clearBatchRow(lastClickedBatch); - lastClickedBatch = null; - } - }, 3000); // Clean up after 3 seconds - - var batchSelector = $("#batch-" + d.x); - var topOffset = batchSelector.offset().top - 15; - if (topOffset < 0) { - topOffset = 0; - } - $('html,body').animate({scrollTop: topOffset}, 200); - }); + .on("click", onClickTimeline); } /** diff --git a/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js b/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js index 70250fdbd2d0c..c92226b408b6c 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js +++ b/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js @@ -106,12 +106,12 @@ function drawAreaStack(id, labels, values, minX, maxX, minY, maxY) { .on('mouseover', function(d) { var tip = ''; var idx = 0; - var _values = timeToValues[d._x] + var _values = formattedTimeToValues[d._x]; _values.forEach(function (k) { tip += labels[idx] + ': ' + k + ' '; idx += 1; }); - tip += " at " + d._x + tip += " at " + formattedTimeTipStrings[d._x]; showBootstrapTooltip(d3.select(this).node(), tip); }) .on('mouseout', function() { diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css index 3f31403eaeef3..c9bf83ca98b4f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css +++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css @@ -238,18 +238,6 @@ tr.corresponding-item-hover > td, 
tr.corresponding-item-hover > th { background-color: #D6FFE4 !important; } -#application-timeline.collapsed { - display: none; -} - -#job-timeline.collapsed { - display: none; -} - -#task-assignment-timeline.collapsed { - display: none; -} - .control-panel { margin-bottom: 5px; } diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js index b2cd616791734..220b76a0f1b27 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js +++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js @@ -26,8 +26,9 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime, offset) { editable: false, align: 'left', showCurrentTime: false, - min: startTime, + start: startTime, zoomable: false, + locale: "en", moment: function (date) { return vis.moment(date).utcOffset(offset); } @@ -41,26 +42,31 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime, offset) { setupZoomable("#application-timeline-zoom-lock", applicationTimeline); setupExecutorEventAction(); + function getIdForJobEntry(baseElem) { + var jobIdText = $($(baseElem).find(".application-timeline-content")[0]).text(); + var jobId = jobIdText.match("\\(Job (\\d+)\\)$")[1]; + return jobId; + } + + function getSelectorForJobEntry(jobId) { + return "#job-" + jobId; + } + function setupJobEventAction() { $(".vis-item.vis-range.job.application-timeline-object").each(function() { - var getSelectorForJobEntry = function(baseElem) { - var jobIdText = $($(baseElem).find(".application-timeline-content")[0]).text(); - var jobId = jobIdText.match("\\(Job (\\d+)\\)$")[1]; - return "#job-" + jobId; - }; - $(this).click(function() { - var jobPagePath = $(getSelectorForJobEntry(this)).find("a.name-link").attr("href") - window.location.href = jobPagePath + var jobId = getIdForJobEntry(this); + var jobPagePath = uiRoot + appBasePath + "/jobs/job/?id=" + jobId; + window.location.href = jobPagePath; }); $(this).hover( function() { - $(getSelectorForJobEntry(this)).addClass("corresponding-item-hover"); + $(getSelectorForJobEntry(getIdForJobEntry(this))).addClass("corresponding-item-hover"); $($(this).find("div.application-timeline-content")[0]).tooltip("show"); }, function() { - $(getSelectorForJobEntry(this)).removeClass("corresponding-item-hover"); + $(getSelectorForJobEntry(getIdForJobEntry(this))).removeClass("corresponding-item-hover"); $($(this).find("div.application-timeline-content")[0]).tooltip("hide"); } ); @@ -75,6 +81,9 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime, offset) { $("#application-timeline").toggleClass('collapsed'); + var visibilityState = status ? "" : "none"; + $("#application-timeline").css("display", visibilityState); + // Switch the class of the arrow from open to closed. 
$(this).find('.expand-application-timeline-arrow').toggleClass('arrow-open'); $(this).find('.expand-application-timeline-arrow').toggleClass('arrow-closed'); @@ -89,6 +98,8 @@ $(function () { // Set it to false so that the click function can revert it window.localStorage.setItem("expand-application-timeline", "false"); $("span.expand-application-timeline").trigger('click'); + } else { + $("#application-timeline").css("display", "none"); } }); @@ -103,8 +114,9 @@ function drawJobTimeline(groupArray, eventObjArray, startTime, offset) { editable: false, align: 'left', showCurrentTime: false, - min: startTime, + start: startTime, zoomable: false, + locale: "en", moment: function (date) { return vis.moment(date).utcOffset(offset); } @@ -118,26 +130,34 @@ function drawJobTimeline(groupArray, eventObjArray, startTime, offset) { setupZoomable("#job-timeline-zoom-lock", jobTimeline); setupExecutorEventAction(); + function getStageIdAndAttemptForStageEntry(baseElem) { + var stageIdText = $($(baseElem).find(".job-timeline-content")[0]).text(); + var stageIdAndAttempt = stageIdText.match("\\(Stage (\\d+\\.\\d+)\\)$")[1].split("."); + return stageIdAndAttempt; + } + + function getSelectorForStageEntry(stageIdAndAttempt) { + return "#stage-" + stageIdAndAttempt[0] + "-" + stageIdAndAttempt[1]; + } + function setupStageEventAction() { $(".vis-item.vis-range.stage.job-timeline-object").each(function() { - var getSelectorForStageEntry = function(baseElem) { - var stageIdText = $($(baseElem).find(".job-timeline-content")[0]).text(); - var stageIdAndAttempt = stageIdText.match("\\(Stage (\\d+\\.\\d+)\\)$")[1].split("."); - return "#stage-" + stageIdAndAttempt[0] + "-" + stageIdAndAttempt[1]; - }; - $(this).click(function() { - var stagePagePath = $(getSelectorForStageEntry(this)).find("a.name-link").attr("href") - window.location.href = stagePagePath + var stageIdAndAttempt = getStageIdAndAttemptForStageEntry(this); + var stagePagePath = uiRoot + appBasePath + + "/stages/stage/?id=" + stageIdAndAttempt[0] + "&attempt=" + stageIdAndAttempt[1]; + window.location.href = stagePagePath; }); $(this).hover( function() { - $(getSelectorForStageEntry(this)).addClass("corresponding-item-hover"); + $(getSelectorForStageEntry(getStageIdAndAttemptForStageEntry(this))) + .addClass("corresponding-item-hover"); $($(this).find("div.job-timeline-content")[0]).tooltip("show"); }, function() { - $(getSelectorForStageEntry(this)).removeClass("corresponding-item-hover"); + $(getSelectorForStageEntry(getStageIdAndAttemptForStageEntry(this))) + .removeClass("corresponding-item-hover"); $($(this).find("div.job-timeline-content")[0]).tooltip("hide"); } ); @@ -152,6 +172,9 @@ function drawJobTimeline(groupArray, eventObjArray, startTime, offset) { $("#job-timeline").toggleClass('collapsed'); + var visibilityState = status ? "" : "none"; + $("#job-timeline").css("display", visibilityState); + // Switch the class of the arrow from open to closed. 
$(this).find('.expand-job-timeline-arrow').toggleClass('arrow-open'); $(this).find('.expand-job-timeline-arrow').toggleClass('arrow-closed'); @@ -166,13 +189,15 @@ $(function () { // Set it to false so that the click function can revert it window.localStorage.setItem("expand-job-timeline", "false"); $("span.expand-job-timeline").trigger('click'); + } else { + $("#job-timeline").css("display", "none"); } }); function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, maxFinishTime, offset) { var groups = new vis.DataSet(groupArray); var items = new vis.DataSet(eventObjArray); - var container = $("#task-assignment-timeline")[0] + var container = $("#task-assignment-timeline")[0]; var options = { groupOrder: function(a, b) { return a.value - b.value @@ -181,15 +206,16 @@ function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, ma align: 'left', selectable: false, showCurrentTime: false, - min: minLaunchTime, - max: maxFinishTime, + start: minLaunchTime, + end: maxFinishTime, zoomable: false, + locale: "en", moment: function (date) { return vis.moment(date).utcOffset(offset); } }; - var taskTimeline = new vis.Timeline(container) + var taskTimeline = new vis.Timeline(container); taskTimeline.setOptions(options); taskTimeline.setGroups(groups); taskTimeline.setItems(items); @@ -220,6 +246,9 @@ function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, ma $("#task-assignment-timeline").toggleClass("collapsed"); + var visibilityState = status ? "" : "none"; + $("#task-assignment-timeline").css("display", visibilityState); + // Switch the class of the arrow from open to closed. $(this).find(".expand-task-assignment-timeline-arrow").toggleClass("arrow-open"); $(this).find(".expand-task-assignment-timeline-arrow").toggleClass("arrow-closed"); @@ -234,6 +263,8 @@ $(function () { // Set it to false so that the click function can revert it window.localStorage.setItem("expand-task-assignment-timeline", "false"); $("span.expand-task-assignment-timeline").trigger('click'); + } else { + $("#task-assignment-timeline").css("display", "none"); } }); diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js index 6fc34a9e1f7ea..d15d0035844de 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/utils.js +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -56,13 +56,17 @@ function formatTimeMillis(timeMillis) { return "-"; } else { var dt = new Date(timeMillis); + return formatDateString(dt); + } +} + +function formatDateString(dt) { return dt.getFullYear() + "-" + padZeroes(dt.getMonth() + 1) + "-" + padZeroes(dt.getDate()) + " " + padZeroes(dt.getHours()) + ":" + padZeroes(dt.getMinutes()) + ":" + padZeroes(dt.getSeconds()); - } } function getTimeZone() { @@ -101,7 +105,7 @@ function getStandAloneAppId(cb) { } // Looks like Web UI is running in standalone mode // Let's get application-id using REST End Point - $.getJSON(location.origin + "/api/v1/applications", function(response, status, jqXHR) { + $.getJSON(uiRoot + "/api/v1/applications", function(response, status, jqXHR) { if (response && response.length > 0) { var appId = response[0].id; cb(appId); @@ -148,7 +152,7 @@ function createTemplateURI(appId, templateName) { var baseURI = words.slice(0, ind).join('/') + '/static/' + templateName + '-template.html'; return baseURI; } - return location.origin + "/static/" + templateName + "-template.html"; + return uiRoot + "/static/" + templateName + 
"-template.html"; } function setDataTableDefaults() { @@ -161,7 +165,10 @@ function setDataTableDefaults() { function formatDate(date) { if (date <= 0) return "-"; - else return date.split(".")[0].replace("T", " "); + else { + var dt = new Date(date.replace("GMT", "Z")) + return formatDateString(dt); + } } function createRESTEndPointForExecutorsPage(appId) { @@ -183,5 +190,5 @@ function createRESTEndPointForExecutorsPage(appId) { return newBaseURI + "/api/v1/applications/" + appId + "/" + attemptId + "/allexecutors"; } } - return location.origin + "/api/v1/applications/" + appId + "/allexecutors"; + return uiRoot + "/api/v1/applications/" + appId + "/allexecutors"; } diff --git a/core/src/main/resources/org/apache/spark/ui/static/vis-timeline-graph2d.min.css b/core/src/main/resources/org/apache/spark/ui/static/vis-timeline-graph2d.min.css new file mode 100644 index 0000000000000..eeacbd802d305 --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/vis-timeline-graph2d.min.css @@ -0,0 +1 @@ +.vis .overlay{position:absolute;top:0;left:0;width:100%;height:100%;z-index:10}.vis-active{box-shadow:0 0 10px #86d5f8}.vis [class*=span]{min-height:0;width:auto}div.vis-configuration{position:relative;display:block;float:left;font-size:12px}div.vis-configuration-wrapper{display:block;width:700px}div.vis-configuration-wrapper::after{clear:both;content:"";display:block}div.vis-configuration.vis-config-option-container{display:block;width:495px;background-color:#fff;border:2px solid #f7f8fa;border-radius:4px;margin-top:20px;left:10px;padding-left:5px}div.vis-configuration.vis-config-button{display:block;width:495px;height:25px;vertical-align:middle;line-height:25px;background-color:#f7f8fa;border:2px solid #ceced0;border-radius:4px;margin-top:20px;left:10px;padding-left:5px;cursor:pointer;margin-bottom:30px}div.vis-configuration.vis-config-button.hover{background-color:#4588e6;border:2px solid #214373;color:#fff}div.vis-configuration.vis-config-item{display:block;float:left;width:495px;height:25px;vertical-align:middle;line-height:25px}div.vis-configuration.vis-config-item.vis-config-s2{left:10px;background-color:#f7f8fa;padding-left:5px;border-radius:3px}div.vis-configuration.vis-config-item.vis-config-s3{left:20px;background-color:#e4e9f0;padding-left:5px;border-radius:3px}div.vis-configuration.vis-config-item.vis-config-s4{left:30px;background-color:#cfd8e6;padding-left:5px;border-radius:3px}div.vis-configuration.vis-config-header{font-size:18px;font-weight:700}div.vis-configuration.vis-config-label{width:120px;height:25px;line-height:25px}div.vis-configuration.vis-config-label.vis-config-s3{width:110px}div.vis-configuration.vis-config-label.vis-config-s4{width:100px}div.vis-configuration.vis-config-colorBlock{top:1px;width:30px;height:19px;border:1px solid #444;border-radius:2px;padding:0;margin:0;cursor:pointer}input.vis-configuration.vis-config-checkbox{left:-5px}input.vis-configuration.vis-config-rangeinput{position:relative;top:-5px;width:60px;padding:1px;margin:0;pointer-events:none}input.vis-configuration.vis-config-range{-webkit-appearance:none;border:0 solid #fff;background-color:rgba(0,0,0,0);width:300px;height:20px}input.vis-configuration.vis-config-range::-webkit-slider-runnable-track{width:300px;height:5px;background:#dedede;background:-moz-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#dedede),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#dedede 0,#c8c8c8 
99%);background:-o-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:linear-gradient(to bottom,#dedede 0,#c8c8c8 99%);border:1px solid #999;box-shadow:#aaa 0 0 3px 0;border-radius:3px}input.vis-configuration.vis-config-range::-webkit-slider-thumb{-webkit-appearance:none;border:1px solid #14334b;height:17px;width:17px;border-radius:50%;background:#3876c2;background:-moz-linear-gradient(top,#3876c2 0,#385380 100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#3876c2),color-stop(100%,#385380));background:-webkit-linear-gradient(top,#3876c2 0,#385380 100%);background:-o-linear-gradient(top,#3876c2 0,#385380 100%);background:-ms-linear-gradient(top,#3876c2 0,#385380 100%);background:linear-gradient(to bottom,#3876c2 0,#385380 100%);box-shadow:#111927 0 0 1px 0;margin-top:-7px}input.vis-configuration.vis-config-range:focus{outline:0}input.vis-configuration.vis-config-range:focus::-webkit-slider-runnable-track{background:#9d9d9d;background:-moz-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#9d9d9d),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-o-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:linear-gradient(to bottom,#9d9d9d 0,#c8c8c8 99%)}input.vis-configuration.vis-config-range::-moz-range-track{width:300px;height:10px;background:#dedede;background:-moz-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#dedede),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-o-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:linear-gradient(to bottom,#dedede 0,#c8c8c8 99%);border:1px solid #999;box-shadow:#aaa 0 0 3px 0;border-radius:3px}input.vis-configuration.vis-config-range::-moz-range-thumb{border:none;height:16px;width:16px;border-radius:50%;background:#385380}input.vis-configuration.vis-config-range:-moz-focusring{outline:1px solid #fff;outline-offset:-1px}input.vis-configuration.vis-config-range::-ms-track{width:300px;height:5px;background:0 0;border-color:transparent;border-width:6px 0;color:transparent}input.vis-configuration.vis-config-range::-ms-fill-lower{background:#777;border-radius:10px}input.vis-configuration.vis-config-range::-ms-fill-upper{background:#ddd;border-radius:10px}input.vis-configuration.vis-config-range::-ms-thumb{border:none;height:16px;width:16px;border-radius:50%;background:#385380}input.vis-configuration.vis-config-range:focus::-ms-fill-lower{background:#888}input.vis-configuration.vis-config-range:focus::-ms-fill-upper{background:#ccc}.vis-configuration-popup{position:absolute;background:rgba(57,76,89,.85);border:2px solid #f2faff;line-height:30px;height:30px;width:150px;text-align:center;color:#fff;font-size:14px;border-radius:4px;-webkit-transition:opacity .3s ease-in-out;-moz-transition:opacity .3s ease-in-out;transition:opacity .3s ease-in-out}.vis-configuration-popup:after,.vis-configuration-popup:before{left:100%;top:50%;border:solid transparent;content:" 
";height:0;width:0;position:absolute;pointer-events:none}.vis-configuration-popup:after{border-color:rgba(136,183,213,0);border-left-color:rgba(57,76,89,.85);border-width:8px;margin-top:-8px}.vis-configuration-popup:before{border-color:rgba(194,225,245,0);border-left-color:#f2faff;border-width:12px;margin-top:-12px}div.vis-tooltip{position:absolute;visibility:hidden;padding:5px;white-space:nowrap;font-family:verdana;font-size:14px;color:#000;background-color:#f5f4ed;-moz-border-radius:3px;-webkit-border-radius:3px;border-radius:3px;border:1px solid #808074;box-shadow:3px 3px 10px rgba(0,0,0,.2);pointer-events:none;z-index:5}.vis-current-time{background-color:#ff7f6e;width:2px;z-index:1;pointer-events:none}.vis-rolling-mode-btn{height:40px;width:40px;position:absolute;top:7px;right:20px;border-radius:50%;font-size:28px;cursor:pointer;opacity:.8;color:#fff;font-weight:700;text-align:center;background:#3876c2}.vis-rolling-mode-btn:before{content:"\26F6"}.vis-rolling-mode-btn:hover{opacity:1}.vis-custom-time{background-color:#6e94ff;width:2px;cursor:move;z-index:1}.vis-panel.vis-background.vis-horizontal .vis-grid.vis-horizontal{position:absolute;width:100%;height:0;border-bottom:1px solid}.vis-panel.vis-background.vis-horizontal .vis-grid.vis-minor{border-color:#e5e5e5}.vis-panel.vis-background.vis-horizontal .vis-grid.vis-major{border-color:#bfbfbf}.vis-data-axis .vis-y-axis.vis-major{width:100%;position:absolute;color:#4d4d4d;white-space:nowrap}.vis-data-axis .vis-y-axis.vis-major.vis-measure{padding:0;margin:0;border:0;visibility:hidden;width:auto}.vis-data-axis .vis-y-axis.vis-minor{position:absolute;width:100%;color:#bebebe;white-space:nowrap}.vis-data-axis .vis-y-axis.vis-minor.vis-measure{padding:0;margin:0;border:0;visibility:hidden;width:auto}.vis-data-axis .vis-y-axis.vis-title{position:absolute;color:#4d4d4d;white-space:nowrap;bottom:20px;text-align:center}.vis-data-axis .vis-y-axis.vis-title.vis-measure{padding:0;margin:0;visibility:hidden;width:auto}.vis-data-axis .vis-y-axis.vis-title.vis-left{bottom:0;-webkit-transform-origin:left top;-moz-transform-origin:left top;-ms-transform-origin:left top;-o-transform-origin:left top;transform-origin:left bottom;-webkit-transform:rotate(-90deg);-moz-transform:rotate(-90deg);-ms-transform:rotate(-90deg);-o-transform:rotate(-90deg);transform:rotate(-90deg)}.vis-data-axis .vis-y-axis.vis-title.vis-right{bottom:0;-webkit-transform-origin:right bottom;-moz-transform-origin:right bottom;-ms-transform-origin:right bottom;-o-transform-origin:right bottom;transform-origin:right bottom;-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.vis-legend{background-color:rgba(247,252,255,.65);padding:5px;border:1px solid #b3b3b3;box-shadow:2px 2px 10px rgba(154,154,154,.55)}.vis-legend-text{white-space:nowrap;display:inline-block}.vis-item{position:absolute;color:#1a1a1a;border-color:#97b0f8;border-width:1px;background-color:#d5ddf6;display:inline-block;z-index:1}.vis-item.vis-selected{border-color:#ffc200;background-color:#fff785;z-index:2}.vis-editable.vis-selected{cursor:move}.vis-item.vis-point.vis-selected{background-color:#fff785}.vis-item.vis-box{text-align:center;border-style:solid;border-radius:2px}.vis-item.vis-point{background:0 
0}.vis-item.vis-dot{position:absolute;padding:0;border-width:4px;border-style:solid;border-radius:4px}.vis-item.vis-range{border-style:solid;border-radius:2px;box-sizing:border-box}.vis-item.vis-background{border:none;background-color:rgba(213,221,246,.4);box-sizing:border-box;padding:0;margin:0}.vis-item .vis-item-overflow{position:relative;width:100%;height:100%;padding:0;margin:0;overflow:hidden}.vis-item-visible-frame{white-space:nowrap}.vis-item.vis-range .vis-item-content{position:relative;display:inline-block}.vis-item.vis-background .vis-item-content{position:absolute;display:inline-block}.vis-item.vis-line{padding:0;position:absolute;width:0;border-left-width:1px;border-left-style:solid}.vis-item .vis-item-content{white-space:nowrap;box-sizing:border-box;padding:5px}.vis-item .vis-onUpdateTime-tooltip{position:absolute;background:#4f81bd;color:#fff;width:200px;text-align:center;white-space:nowrap;padding:5px;border-radius:1px;transition:.4s;-o-transition:.4s;-moz-transition:.4s;-webkit-transition:.4s}.vis-item .vis-delete,.vis-item .vis-delete-rtl{position:absolute;top:0;width:24px;height:24px;box-sizing:border-box;padding:0 5px;cursor:pointer;-webkit-transition:background .2s linear;-moz-transition:background .2s linear;-ms-transition:background .2s linear;-o-transition:background .2s linear;transition:background .2s linear}.vis-item .vis-delete{right:-24px}.vis-item .vis-delete-rtl{left:-24px}.vis-item .vis-delete-rtl:after,.vis-item .vis-delete:after{content:"\00D7";color:red;font-family:arial,sans-serif;font-size:22px;font-weight:700;-webkit-transition:color .2s linear;-moz-transition:color .2s linear;-ms-transition:color .2s linear;-o-transition:color .2s linear;transition:color .2s linear}.vis-item .vis-delete-rtl:hover,.vis-item .vis-delete:hover{background:red}.vis-item .vis-delete-rtl:hover:after,.vis-item .vis-delete:hover:after{color:#fff}.vis-item .vis-drag-center{position:absolute;width:100%;height:100%;top:0;left:0;cursor:move}.vis-item.vis-range .vis-drag-left{position:absolute;width:24px;max-width:20%;min-width:2px;height:100%;top:0;left:-4px;cursor:w-resize}.vis-item.vis-range .vis-drag-right{position:absolute;width:24px;max-width:20%;min-width:2px;height:100%;top:0;right:-4px;cursor:e-resize}.vis-range.vis-item.vis-readonly .vis-drag-left,.vis-range.vis-item.vis-readonly .vis-drag-right{cursor:auto}.vis-itemset{position:relative;padding:0;margin:0;box-sizing:border-box}.vis-itemset .vis-background,.vis-itemset .vis-foreground{position:absolute;width:100%;height:100%;overflow:visible}.vis-axis{position:absolute;width:100%;height:0;left:0;z-index:1}.vis-foreground .vis-group{position:relative;box-sizing:border-box;border-bottom:1px solid #bfbfbf}.vis-foreground .vis-group:last-child{border-bottom:none}.vis-nesting-group{cursor:pointer}.vis-nested-group{background:#f5f5f5}.vis-label.vis-nesting-group.expanded:before{content:"\25BC"}.vis-label.vis-nesting-group.collapsed-rtl:before{content:"\25C0"}.vis-label.vis-nesting-group.collapsed:before{content:"\25B6"}.vis-overlay{position:absolute;top:0;left:0;width:100%;height:100%;z-index:10}.vis-labelset{position:relative;overflow:hidden;box-sizing:border-box}.vis-labelset .vis-label{position:relative;left:0;top:0;width:100%;color:#4d4d4d;box-sizing:border-box}.vis-labelset .vis-label{border-bottom:1px solid #bfbfbf}.vis-labelset .vis-label.draggable{cursor:pointer}.vis-labelset .vis-label:last-child{border-bottom:none}.vis-labelset .vis-label .vis-inner{display:inline-block;padding:5px}.vis-labelset .vis-label 
.vis-inner.vis-hidden{padding:0}.vis-panel{position:absolute;padding:0;margin:0;box-sizing:border-box}.vis-panel.vis-bottom,.vis-panel.vis-center,.vis-panel.vis-left,.vis-panel.vis-right,.vis-panel.vis-top{border:1px #bfbfbf}.vis-panel.vis-center,.vis-panel.vis-left,.vis-panel.vis-right{border-top-style:solid;border-bottom-style:solid;overflow:hidden}.vis-left.vis-panel.vis-vertical-scroll,.vis-right.vis-panel.vis-vertical-scroll{height:100%;overflow-x:hidden;overflow-y:scroll}.vis-left.vis-panel.vis-vertical-scroll{direction:rtl}.vis-left.vis-panel.vis-vertical-scroll .vis-content{direction:ltr}.vis-right.vis-panel.vis-vertical-scroll{direction:ltr}.vis-right.vis-panel.vis-vertical-scroll .vis-content{direction:rtl}.vis-panel.vis-bottom,.vis-panel.vis-center,.vis-panel.vis-top{border-left-style:solid;border-right-style:solid}.vis-background{overflow:hidden}.vis-panel>.vis-content{position:relative}.vis-panel .vis-shadow{position:absolute;width:100%;height:1px;box-shadow:0 0 10px rgba(0,0,0,.8)}.vis-panel .vis-shadow.vis-top{top:-1px;left:0}.vis-panel .vis-shadow.vis-bottom{bottom:-1px;left:0}.vis-graph-group0{fill:#4f81bd;fill-opacity:0;stroke-width:2px;stroke:#4f81bd}.vis-graph-group1{fill:#f79646;fill-opacity:0;stroke-width:2px;stroke:#f79646}.vis-graph-group2{fill:#8c51cf;fill-opacity:0;stroke-width:2px;stroke:#8c51cf}.vis-graph-group3{fill:#75c841;fill-opacity:0;stroke-width:2px;stroke:#75c841}.vis-graph-group4{fill:#ff0100;fill-opacity:0;stroke-width:2px;stroke:#ff0100}.vis-graph-group5{fill:#37d8e6;fill-opacity:0;stroke-width:2px;stroke:#37d8e6}.vis-graph-group6{fill:#042662;fill-opacity:0;stroke-width:2px;stroke:#042662}.vis-graph-group7{fill:#00ff26;fill-opacity:0;stroke-width:2px;stroke:#00ff26}.vis-graph-group8{fill:#f0f;fill-opacity:0;stroke-width:2px;stroke:#f0f}.vis-graph-group9{fill:#8f3938;fill-opacity:0;stroke-width:2px;stroke:#8f3938}.vis-timeline .vis-fill{fill-opacity:.1;stroke:none}.vis-timeline .vis-bar{fill-opacity:.5;stroke-width:1px}.vis-timeline .vis-point{stroke-width:2px;fill-opacity:1}.vis-timeline .vis-legend-background{stroke-width:1px;fill-opacity:.9;fill:#fff;stroke:#c2c2c2}.vis-timeline .vis-outline{stroke-width:1px;fill-opacity:1;fill:#fff;stroke:#e5e5e5}.vis-timeline .vis-icon-fill{fill-opacity:.3;stroke:none}.vis-time-axis{position:relative;overflow:hidden}.vis-time-axis.vis-foreground{top:0;left:0;width:100%}.vis-time-axis.vis-background{position:absolute;top:0;left:0;width:100%;height:100%}.vis-time-axis .vis-text{position:absolute;color:#4d4d4d;padding:3px;overflow:hidden;box-sizing:border-box;white-space:nowrap}.vis-time-axis .vis-text.vis-measure{position:absolute;padding-left:0;padding-right:0;margin-left:0;margin-right:0;visibility:hidden}.vis-time-axis .vis-grid.vis-vertical{position:absolute;border-left:1px solid}.vis-time-axis .vis-grid.vis-vertical-rtl{position:absolute;border-right:1px solid}.vis-time-axis .vis-grid.vis-minor{border-color:#e5e5e5}.vis-time-axis .vis-grid.vis-major{border-color:#bfbfbf}.vis-timeline{position:relative;border:1px solid #bfbfbf;overflow:hidden;padding:0;margin:0;box-sizing:border-box} \ No newline at end of file diff --git a/core/src/main/resources/org/apache/spark/ui/static/vis-timeline-graph2d.min.js b/core/src/main/resources/org/apache/spark/ui/static/vis-timeline-graph2d.min.js new file mode 100644 index 0000000000000..5cfa0da3d4f38 --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/vis-timeline-graph2d.min.js @@ -0,0 +1,30 @@ +/** + * vis.js + * https://github.com/almende/vis + * + * A 
dynamic, browser-based visualization library. + * + * @version 4.21.0 + * @date 2017-10-12 + * + * @license + * Copyright (C) 2011-2017 Almende B.V, http://almende.com + * + * Vis.js is dual licensed under both + * + * * The Apache 2.0 License + * http://www.apache.org/licenses/LICENSE-2.0 + * + * and + * + * * The MIT License + * http://opensource.org/licenses/MIT + * + * Vis.js may be distributed under either license. + */ +"use strict";!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.vis=e():t.vis=e()}(this,function(){return function(t){function e(o){if(i[o])return i[o].exports;var n=i[o]={i:o,l:!1,exports:{}};return t[o].call(n.exports,n,n.exports,e),n.l=!0,n.exports}var i={};return e.m=t,e.c=i,e.d=function(t,i,o){e.o(t,i)||Object.defineProperty(t,i,{configurable:!1,enumerable:!0,get:o})},e.n=function(t){var i=t&&t.__esModule?function(){return t.default}:function(){return t};return e.d(i,"a",i),i},e.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},e.p="",e(e.s=85)}([function(t,e,i){function o(t){return t&&t.__esModule?t:{default:t}}function n(t,e,i,o){var n=!1;!0===o&&(n=null===e[i]&&void 0!==t[i]),n?delete t[i]:t[i]=e[i]}var s=i(86),r=o(s),a=i(43),h=o(a),d=i(4),l=o(d),u=i(1),p=o(u),c=i(3),m=i(126);e.isNumber=function(t){return t instanceof Number||"number"==typeof t},e.recursiveDOMDelete=function(t){if(t)for(;!0===t.hasChildNodes();)e.recursiveDOMDelete(t.firstChild),t.removeChild(t.firstChild)},e.giveRange=function(t,e,i,o){if(e==t)return.5;var n=1/(e-t);return Math.max(0,(o-t)*n)},e.isString=function(t){return t instanceof String||"string"==typeof t},e.isDate=function(t){if(t instanceof Date)return!0;if(e.isString(t)){if(f.exec(t))return!0;if(!isNaN(Date.parse(t)))return!0}return!1},e.randomUUID=function(){return m.v4()},e.assignAllKeys=function(t,e){for(var i in t)t.hasOwnProperty(i)&&"object"!==(0,p.default)(t[i])&&(t[i]=e)},e.fillIfDefined=function(t,i){var o=arguments.length>2&&void 0!==arguments[2]&&arguments[2];for(var s in t)void 0!==i[s]&&(null===i[s]||"object"!==(0,p.default)(i[s])?n(t,i,s,o):"object"===(0,p.default)(t[s])&&e.fillIfDefined(t[s],i[s],o))},e.extend=function(t,e){for(var i=1;i3&&void 0!==arguments[3]&&arguments[3];if(Array.isArray(o))throw new TypeError("Arrays are not supported by deepExtend");for(var r=0;r3&&void 0!==arguments[3]&&arguments[3];if(Array.isArray(o))throw new TypeError("Arrays are not supported by deepExtend");for(var r in o)if(o.hasOwnProperty(r)&&-1===t.indexOf(r))if(o[r]&&o[r].constructor===Object)void 0===i[r]&&(i[r]={}),i[r].constructor===Object?e.deepExtend(i[r],o[r]):n(i,o,r,s);else if(Array.isArray(o[r])){i[r]=[];for(var a=0;a2&&void 0!==arguments[2]&&arguments[2],s=arguments.length>3&&void 0!==arguments[3]&&arguments[3];for(var r in i)if(i.hasOwnProperty(r)||!0===o)if(i[r]&&i[r].constructor===Object)void 0===t[r]&&(t[r]={}),t[r].constructor===Object?e.deepExtend(t[r],i[r],o):n(t,i,r,s);else if(Array.isArray(i[r])){t[r]=[];for(var a=0;a=0&&(e="DOMMouseScroll"),t.addEventListener(e,i,o)):t.attachEvent("on"+e,i)},e.removeEventListener=function(t,e,i,o){t.removeEventListener?(void 0===o&&(o=!1),"mousewheel"===e&&navigator.userAgent.indexOf("Firefox")>=0&&(e="DOMMouseScroll"),t.removeEventListener(e,i,o)):t.detachEvent("on"+e,i)},e.preventDefault=function(t){t||(t=window.event),t.preventDefault?t.preventDefault():t.returnValue=!1},e.getTarget=function(t){t||(t=window.event);var e;return 
t.target?e=t.target:t.srcElement&&(e=t.srcElement),void 0!=e.nodeType&&3==e.nodeType&&(e=e.parentNode),e},e.hasParent=function(t,e){for(var i=t;i;){if(i===e)return!0;i=i.parentNode}return!1},e.option={},e.option.asBoolean=function(t,e){return"function"==typeof t&&(t=t()),null!=t?0!=t:e||null},e.option.asNumber=function(t,e){return"function"==typeof t&&(t=t()),null!=t?Number(t)||e||null:e||null},e.option.asString=function(t,e){return"function"==typeof t&&(t=t()),null!=t?String(t):e||null},e.option.asSize=function(t,i){return"function"==typeof t&&(t=t()),e.isString(t)?t:e.isNumber(t)?t+"px":i||null},e.option.asElement=function(t,e){return"function"==typeof t&&(t=t()),t||e||null},e.hexToRGB=function(t){var e=/^#?([a-f\d])([a-f\d])([a-f\d])$/i;t=t.replace(e,function(t,e,i,o){return e+e+i+i+o+o});var i=/^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(t);return i?{r:parseInt(i[1],16),g:parseInt(i[2],16),b:parseInt(i[3],16)}:null},e.overrideOpacity=function(t,i){var o;return-1!=t.indexOf("rgba")?t:-1!=t.indexOf("rgb")?(o=t.substr(t.indexOf("(")+1).replace(")","").split(","),"rgba("+o[0]+","+o[1]+","+o[2]+","+i+")"):(o=e.hexToRGB(t),null==o?t:"rgba("+o.r+","+o.g+","+o.b+","+i+")")},e.RGBToHex=function(t,e,i){return"#"+((1<<24)+(t<<16)+(e<<8)+i).toString(16).slice(1)},e.parseColor=function(t){var i;if(!0===e.isString(t)){if(!0===e.isValidRGB(t)){var o=t.substr(4).substr(0,t.length-5).split(",").map(function(t){return parseInt(t)});t=e.RGBToHex(o[0],o[1],o[2])}if(!0===e.isValidHex(t)){var n=e.hexToHSV(t),s={h:n.h,s:.8*n.s,v:Math.min(1,1.02*n.v)},r={h:n.h,s:Math.min(1,1.25*n.s),v:.8*n.v},a=e.HSVToHex(r.h,r.s,r.v),h=e.HSVToHex(s.h,s.s,s.v);i={background:t,border:a,highlight:{background:h,border:a},hover:{background:h,border:a}}}else i={background:t,border:t,highlight:{background:t,border:t},hover:{background:t,border:t}}}else i={},i.background=t.background||void 0,i.border=t.border||void 0,e.isString(t.highlight)?i.highlight={border:t.highlight,background:t.highlight}:(i.highlight={},i.highlight.background=t.highlight&&t.highlight.background||void 0,i.highlight.border=t.highlight&&t.highlight.border||void 0),e.isString(t.hover)?i.hover={border:t.hover,background:t.hover}:(i.hover={},i.hover.background=t.hover&&t.hover.background||void 0,i.hover.border=t.hover&&t.hover.border||void 0);return i},e.RGBToHSV=function(t,e,i){t/=255,e/=255,i/=255;var o=Math.min(t,Math.min(e,i)),n=Math.max(t,Math.max(e,i));if(o==n)return{h:0,s:0,v:o};var s=t==o?e-i:i==o?t-e:i-t;return{h:60*((t==o?3:i==o?1:5)-s/(n-o))/360,s:(n-o)/n,v:n}};var g={split:function(t){var e={};return t.split(";").forEach(function(t){if(""!=t.trim()){var i=t.split(":"),o=i[0].trim(),n=i[1].trim();e[o]=n}}),e},join:function(t){return(0,l.default)(t).map(function(e){return e+": "+t[e]}).join("; ")}};e.addCssText=function(t,i){var o=g.split(t.style.cssText),n=g.split(i),s=e.extend(o,n);t.style.cssText=g.join(s)},e.removeCssText=function(t,e){var i=g.split(t.style.cssText),o=g.split(e);for(var n in o)o.hasOwnProperty(n)&&delete i[n];t.style.cssText=g.join(i)},e.HSVToRGB=function(t,e,i){var o,n,s,r=Math.floor(6*t),a=6*t-r,h=i*(1-e),d=i*(1-a*e),l=i*(1-(1-a)*e);switch(r%6){case 0:o=i,n=l,s=h;break;case 1:o=d,n=i,s=h;break;case 2:o=h,n=i,s=l;break;case 3:o=h,n=d,s=i;break;case 4:o=l,n=h,s=i;break;case 5:o=i,n=h,s=d}return{r:Math.floor(255*o),g:Math.floor(255*n),b:Math.floor(255*s)}},e.HSVToHex=function(t,i,o){var n=e.HSVToRGB(t,i,o);return e.RGBToHex(n.r,n.g,n.b)},e.hexToHSV=function(t){var i=e.hexToRGB(t);return 
e.RGBToHSV(i.r,i.g,i.b)},e.isValidHex=function(t){return/(^#[0-9A-F]{6}$)|(^#[0-9A-F]{3}$)/i.test(t)},e.isValidRGB=function(t){return t=t.replace(" ",""),/rgb\((\d{1,3}),(\d{1,3}),(\d{1,3})\)/i.test(t)},e.isValidRGBA=function(t){return t=t.replace(" ",""),/rgba\((\d{1,3}),(\d{1,3}),(\d{1,3}),(.{1,3})\)/i.test(t)},e.selectiveBridgeObject=function(t,i){if(null!==i&&"object"===(void 0===i?"undefined":(0,p.default)(i))){for(var o=(0,h.default)(i),n=0;n0&&e(o,t[n-1])<0;n--)t[n]=t[n-1];t[n]=o}return t},e.mergeOptions=function(t,e,i){var o=arguments.length>3&&void 0!==arguments[3]?arguments[3]:{},n=function(t){return null!==t&&void 0!==t},s=function(t){return null!==t&&"object"===(void 0===t?"undefined":(0,p.default)(t))};if(!s(t))throw new Error("Parameter mergeTarget must be an object");if(!s(e))throw new Error("Parameter options must be an object");if(!n(i))throw new Error("Parameter option must have a value");if(!s(o))throw new Error("Parameter globalOptions must be an object");var r=e[i],a=s(o)&&!function(t){for(var e in t)if(t.hasOwnProperty(e))return!1;return!0}(o),d=a?o[i]:void 0,l=d?d.enabled:void 0;if(void 0!==r){if("boolean"==typeof r)return s(t[i])||(t[i]={}),void(t[i].enabled=r);if(null===r&&!s(t[i])){if(!n(d))return;t[i]=(0,h.default)(d)}if(s(r)){var u=!0;void 0!==r.enabled?u=r.enabled:void 0!==l&&(u=d.enabled),function(t,e,i){s(t[i])||(t[i]={});var o=e[i],n=t[i];for(var r in o)o.hasOwnProperty(r)&&(n[r]=o[r])}(t,e,i),t[i].enabled=u}}},e.binarySearchCustom=function(t,e,i,o){for(var n=0,s=0,r=t.length-1;s<=r&&n<1e4;){var a=Math.floor((s+r)/2),h=t[a],d=void 0===o?h[i]:h[i][o],l=e(d);if(0==l)return a;-1==l?s=a+1:r=a-1,n++}return-1},e.binarySearchValue=function(t,e,i,o,n){var s,r,a,h,d=0,l=0,u=t.length-1;for(n=void 0!=n?n:function(t,e){return t==e?0:t0)return"before"==o?Math.max(0,h-1):h;if(n(r,e)<0&&n(a,e)>0)return"before"==o?h:Math.min(t.length-1,h+1);n(r,e)<0?l=h+1:u=h-1,d++}return-1},e.easingFunctions={linear:function(t){return t},easeInQuad:function(t){return t*t},easeOutQuad:function(t){return t*(2-t)},easeInOutQuad:function(t){return t<.5?2*t*t:(4-2*t)*t-1},easeInCubic:function(t){return t*t*t},easeOutCubic:function(t){return--t*t*t+1},easeInOutCubic:function(t){return t<.5?4*t*t*t:(t-1)*(2*t-2)*(2*t-2)+1},easeInQuart:function(t){return t*t*t*t},easeOutQuart:function(t){return 1- --t*t*t*t},easeInOutQuart:function(t){return t<.5?8*t*t*t*t:1-8*--t*t*t*t},easeInQuint:function(t){return t*t*t*t*t},easeOutQuint:function(t){return 1+--t*t*t*t*t},easeInOutQuint:function(t){return t<.5?16*t*t*t*t*t:1+16*--t*t*t*t*t}},e.getScrollBarWidth=function(){var t=document.createElement("p");t.style.width="100%",t.style.height="200px";var e=document.createElement("div");e.style.position="absolute",e.style.top="0px",e.style.left="0px",e.style.visibility="hidden",e.style.width="200px",e.style.height="150px",e.style.overflow="hidden",e.appendChild(t),document.body.appendChild(e);var i=t.offsetWidth;e.style.overflow="scroll";var o=t.offsetWidth;return i==o&&(o=e.clientWidth),document.body.removeChild(e),i-o},e.topMost=function(t,e){var i=void 0;Array.isArray(e)||(e=[e]);var o=!0,n=!1,s=void 0;try{for(var a,h=(0,r.default)(t);!(o=(a=h.next()).done);o=!0){var d=a.value;if(d){i=d[e[0]];for(var l=1;l0?(o=e[t].redundant[0],e[t].redundant.shift()):(o=document.createElementNS("http://www.w3.org/2000/svg",t),i.appendChild(o)):(o=document.createElementNS("http://www.w3.org/2000/svg",t),e[t]={used:[],redundant:[]},i.appendChild(o)),e[t].used.push(o),o},e.getDOMElement=function(t,e,i,o){var n;return 
e.hasOwnProperty(t)?e[t].redundant.length>0?(n=e[t].redundant[0],e[t].redundant.shift()):(n=document.createElement(t),void 0!==o?i.insertBefore(n,o):i.appendChild(n)):(n=document.createElement(t),e[t]={used:[],redundant:[]},void 0!==o?i.insertBefore(n,o):i.appendChild(n)),e[t].used.push(n),n},e.drawPoint=function(t,i,o,n,s,r){var a;if("circle"==o.style?(a=e.getSVGElement("circle",n,s),a.setAttributeNS(null,"cx",t),a.setAttributeNS(null,"cy",i),a.setAttributeNS(null,"r",.5*o.size)):(a=e.getSVGElement("rect",n,s),a.setAttributeNS(null,"x",t-.5*o.size),a.setAttributeNS(null,"y",i-.5*o.size),a.setAttributeNS(null,"width",o.size),a.setAttributeNS(null,"height",o.size)),void 0!==o.styles&&a.setAttributeNS(null,"style",o.styles),a.setAttributeNS(null,"class",o.className+" vis-point"),r){var h=e.getSVGElement("text",n,s);r.xOffset&&(t+=r.xOffset),r.yOffset&&(i+=r.yOffset),r.content&&(h.textContent=r.content),r.className&&h.setAttributeNS(null,"class",r.className+" vis-label"),h.setAttributeNS(null,"x",t),h.setAttributeNS(null,"y",i)}return a},e.drawBar=function(t,i,o,n,s,r,a,h){if(0!=n){n<0&&(n*=-1,i-=n);var d=e.getSVGElement("rect",r,a);d.setAttributeNS(null,"x",t-.5*o),d.setAttributeNS(null,"y",i),d.setAttributeNS(null,"width",o),d.setAttributeNS(null,"height",n),d.setAttributeNS(null,"class",s),h&&d.setAttributeNS(null,"style",h)}}},function(t,e,i){function o(){var t=function(){};return{on:t,off:t,destroy:t,emit:t,get:function(e){return{set:t}}}}if("undefined"!=typeof window){var n=i(130),s=window.Hammer||i(131);t.exports=n(s,{preventDefault:"mouse"})}else t.exports=function(){return o()}},function(t,e,i){var o=i(91),n=i(34);t.exports=function(t){return o(n(t))}},function(t,e,i){var o=i(8),n=i(25);t.exports=i(9)?function(t,e,i){return o.f(t,e,n(1,i))}:function(t,e,i){return t[e]=i,t}},function(t,e,i){function o(t){return t&&t.__esModule?t:{default:t}}function n(t,e){if(t&&!Array.isArray(t)&&(e=t,t=null),this._options=e||{},this._data={},this.length=0,this._fieldId=this._options.fieldId||"id",this._type={},this._options.type)for(var i=(0,l.default)(this._options.type),o=0,n=i.length;on?1:or)&&(s=h,r=d)}return s},n.prototype.min=function(t){var e,i,o=this._data,n=(0,l.default)(o),s=null,r=null;for(e=0,i=n.length;e=4*a){var p=0,c=s.clone();switch(o[h].repeat){case"daily":d.day()!=l.day()&&(p=1),d.dayOfYear(n.dayOfYear()),d.year(n.year()),d.subtract(7,"days"),l.dayOfYear(n.dayOfYear()),l.year(n.year()),l.subtract(7-p,"days"),c.add(1,"weeks");break;case"weekly":var m=l.diff(d,"days"),f=d.day();d.date(n.date()),d.month(n.month()),d.year(n.year()),l=d.clone(),d.day(f),l.day(f),l.add(m,"days"),d.subtract(1,"weeks"),l.subtract(1,"weeks"),c.add(1,"weeks");break;case"monthly":d.month()!=l.month()&&(p=1),d.month(n.month()),d.year(n.year()),d.subtract(1,"months"),l.month(n.month()),l.year(n.year()),l.subtract(1,"months"),l.add(p,"months"),c.add(1,"months");break;case"yearly":d.year()!=l.year()&&(p=1),d.year(n.year()),d.subtract(1,"years"),l.year(n.year()),l.subtract(1,"years"),l.add(p,"years"),c.add(1,"years");break;default:return void console.log("Wrong repeat format, allowed are: daily, weekly, monthly, yearly. 
Given:",o[h].repeat)}for(;d=e[o].start&&e[n].end<=e[o].end?e[n].remove=!0:e[n].start>=e[o].start&&e[n].start<=e[o].end?(e[o].end=e[n].end,e[n].remove=!0):e[n].end>=e[o].start&&e[n].end<=e[o].end&&(e[o].start=e[n].start,e[n].remove=!0));for(o=0;o=r&&nt.range.end){var h={start:t.range.start,end:i};return i=e.correctTimeForHidden(t.options.moment,t.body.hiddenDates,h,i),n=t.range.conversion(o,r),(i.valueOf()-n.offset)*n.scale}return i=e.correctTimeForHidden(t.options.moment,t.body.hiddenDates,t.range,i),n=t.range.conversion(o,r),(i.valueOf()-n.offset)*n.scale},e.toTime=function(t,i,o){if(0==t.body.hiddenDates.length){var n=t.range.conversion(o);return new Date(i/n.scale+n.offset)}var s=e.getHiddenDurationBetween(t.body.hiddenDates,t.range.start,t.range.end),r=t.range.end-t.range.start-s,a=r*i/o,h=e.getAccumulatedHiddenDuration(t.body.hiddenDates,t.range,a);return new Date(h+a+t.range.start)},e.getHiddenDurationBetween=function(t,e,i){for(var o=0,n=0;n=e&&r=e&&r<=i&&(o+=r-s)}return o},e.correctTimeForHidden=function(t,i,o,n){return n=t(n).toDate().valueOf(),n-=e.getHiddenDurationBefore(t,i,o,n)},e.getHiddenDurationBefore=function(t,e,i,o){var n=0;o=t(o).toDate().valueOf();for(var s=0;s=i.start&&a=a&&(n+=a-r)}return n},e.getAccumulatedHiddenDuration=function(t,e,i){for(var o=0,n=0,s=e.start,r=0;r=e.start&&h=i)break;o+=h-a}}return o},e.snapAwayFromHidden=function(t,i,o,n){var s=e.isHidden(i,t);return 1==s.hidden?o<0?1==n?s.startDate-(s.endDate-i)-1:s.startDate-1:1==n?s.endDate+(i-s.startDate)+1:s.endDate+1:i},e.isHidden=function(t,e){for(var i=0;i=o&&t0){var e=[];if(Array.isArray(this.options.dataAttributes))e=this.options.dataAttributes;else{if("all"!=this.options.dataAttributes)return;e=(0,h.default)(this.data)}for(var i=0;i=.4*g}if(this.options.showMinorLabels&&f){var D=this._repaintMinorText(p,y,t,b);D.style.width=_+"px"}m&&this.options.showMajorLabels?(p>0&&(void 0==w&&(w=p),D=this._repaintMajorText(p,s.getLabelMajor(),t,b)),v=this._repaintMajorLine(p,_,t,b)):f?v=this._repaintMinorLine(p,_,t,b):v&&(v.style.width=parseInt(v.style.width)+_+"px")}if(1e3!==x||u||(console.warn("Something is wrong with the Timeline scale. Limited drawing of grid lines to 1000 lines."),u=!0),this.options.showMajorLabels){var S=this.body.util.toTime(0),k=s.getLabelMajor(S),C=k.length*(this.props.majorCharWidth||10)+10;(void 0==w||Cs.distance?" in "+t.printLocation(n.path,e,"")+"Perhaps it was misplaced? Matching option found at: "+t.printLocation(s.path,s.closestMatch,""):n.distance<=8?'. Did you mean "'+n.closestMatch+'"?'+t.printLocation(n.path,e):". 
Did you mean one of these: "+t.print((0,d.default)(i))+t.printLocation(o,e),console.log('%cUnknown option detected: "'+e+'"'+r,v),f=!0}},{key:"findInOptions",value:function(e,i,o){var n=arguments.length>3&&void 0!==arguments[3]&&arguments[3],s=1e9,r="",a=[],h=e.toLowerCase(),d=void 0;for(var l in i){var u=void 0;if(void 0!==i[l].__type__&&!0===n){var p=t.findInOptions(e,i[l],m.copyAndExtendArray(o,l));s>p.distance&&(r=p.closestMatch,a=p.path,s=p.distance,d=p.indexMatch)}else-1!==l.toLowerCase().indexOf(h)&&(d=l),u=t.levenshteinDistance(e,l),s>u&&(r=l,a=m.copyArray(o),s=u)}return{closestMatch:r,path:a,distance:s,indexMatch:d}}},{key:"printLocation",value:function(t,e){for(var i=arguments.length>2&&void 0!==arguments[2]?arguments[2]:"Problem value found at: \n",o="\n\n"+i+"options = {\n",n=0;ndocument.F=Object<\/script>"),t.close(),h=t.F;o--;)delete h.prototype[s[o]];return h()};t.exports=Object.create||function(t,e){var i;return null!==t?(a.prototype=o(t),i=new a,a.prototype=null,i[r]=t):i=h(),void 0===e?i:n(i,e)}},function(t,e){var i=Math.ceil,o=Math.floor;t.exports=function(t){return isNaN(t=+t)?0:(t>0?o:i)(t)}},function(t,e,i){var o=i(40)("keys"),n=i(27);t.exports=function(t){return o[t]||(o[t]=n(t))}},function(t,e,i){var o=i(6),n=o["__core-js_shared__"]||(o["__core-js_shared__"]={});t.exports=function(t){return n[t]||(n[t]={})}},function(t,e){t.exports="constructor,hasOwnProperty,isPrototypeOf,propertyIsEnumerable,toLocaleString,toString,valueOf".split(",")},function(t,e,i){var o=i(8).f,n=i(10),s=i(7)("toStringTag");t.exports=function(t,e,i){t&&!n(t=i?t:t.prototype,s)&&o(t,s,{configurable:!0,value:e})}},function(t,e,i){t.exports={default:i(105),__esModule:!0}},function(t,e,i){e.f=i(7)},function(t,e,i){var o=i(6),n=i(2),s=i(35),r=i(44),a=i(8).f;t.exports=function(t){var e=n.Symbol||(n.Symbol=s?{}:o.Symbol||{});"_"==t.charAt(0)||t in e||a(e,t,{value:r.f(t)})}},function(t,e){e.f={}.propertyIsEnumerable},function(t,e,i){function o(t){return t&&t.__esModule?t:{default:t}}function n(t,e){var i=c().hours(0).minutes(0).seconds(0).milliseconds(0),o=i.clone().add(-3,"days").valueOf(),n=i.clone().add(3,"days").valueOf();this.millisecondsPerPixelCache=void 0,void 0===e?(this.start=o,this.end=n):(this.start=e.start||o,this.end=e.end||n),this.rolling=!1,this.body=t,this.deltaDifference=0,this.scaleOffset=0,this.startToFront=!1,this.endToFront=!0,this.defaultOptions={rtl:!1,start:null,end:null,moment:c,direction:"horizontal",moveable:!0,zoomable:!0,min:null,max:null,zoomMin:10,zoomMax:31536e10,rollingMode:{follow:!1,offset:.5}},this.options=p.extend({},this.defaultOptions),this.props={touch:{}},this.animationTimer=null,this.body.emitter.on("panstart",this._onDragStart.bind(this)),this.body.emitter.on("panmove",this._onDrag.bind(this)),this.body.emitter.on("panend",this._onDragEnd.bind(this)),this.body.emitter.on("mousewheel",this._onMouseWheel.bind(this)),this.body.emitter.on("touch",this._onTouch.bind(this)),this.body.emitter.on("pinch",this._onPinch.bind(this)),this.body.dom.rollingModeBtn.addEventListener("click",this.startRolling.bind(this)),this.setOptions(e)}function s(t){if("horizontal"!=t&&"vertical"!=t)throw new TypeError('Unknown direction "'+t+'". 
Choose "horizontal" or "vertical".')}var r=i(4),a=o(r),h=i(16),d=o(h),l=i(1),u=o(l),p=i(0),c=i(3),m=i(5),f=i(21);n.prototype=new m,n.prototype.setOptions=function(t){if(t){var e=["animation","direction","min","max","zoomMin","zoomMax","moveable","zoomable","moment","activate","hiddenDates","zoomKey","rtl","showCurrentTime","rollingMode","horizontalScroll"];p.selectiveExtend(e,this.options,t),t.rollingMode&&t.rollingMode.follow&&this.startRolling(),("start"in t||"end"in t)&&this.setRange(t.start,t.end)}},n.prototype.startRolling=function(){function t(){e.stopRolling(),e.rolling=!0;var i=e.end-e.start,o=p.convert(new Date,"Date").valueOf(),n=o-i*e.options.rollingMode.offset,s=o+i*(1-e.options.rollingMode.offset),r={animation:!1};e.setRange(n,s,r),i=1/e.conversion(e.body.domProps.center.width).scale/10,i<30&&(i=30),i>1e3&&(i=1e3),e.body.dom.rollingModeBtn.style.visibility="hidden",e.currentTimeTimer=setTimeout(t,i)}var e=this;t()},n.prototype.stopRolling=function(){void 0!==this.currentTimeTimer&&(clearTimeout(this.currentTimeTimer),this.rolling=!1,this.body.dom.rollingModeBtn.style.visibility="visible")},n.prototype.setRange=function(t,e,i,o,n){i||(i={}),!0!==i.byUser&&(i.byUser=!1);var s=this,r=void 0!=t?p.convert(t,"Date").valueOf():null,h=void 0!=e?p.convert(e,"Date").valueOf():null;if(this._cancelAnimation(),this.millisecondsPerPixelCache=void 0,i.animation){var l=this.start,c=this.end,m="object"===(0,u.default)(i.animation)&&"duration"in i.animation?i.animation.duration:500,g="object"===(0,u.default)(i.animation)&&"easingFunction"in i.animation?i.animation.easingFunction:"easeInOutQuad",v=p.easingFunctions[g];if(!v)throw new Error("Unknown easing function "+(0,d.default)(g)+". Choose from: "+(0,a.default)(p.easingFunctions).join(", "));var y=(new Date).valueOf(),b=!1;return function t(){if(!s.props.touch.dragging){var e=(new Date).valueOf(),a=e-y,d=v(a/m),u=a>m,p=u||null===r?r:l+(r-l)*d,g=u||null===h?h:c+(h-c)*d;_=s._applyRange(p,g),f.updateHiddenDates(s.options.moment,s.body,s.options.hiddenDates),b=b||_;var w={start:new Date(s.start),end:new Date(s.end),byUser:i.byUser,event:i.event};if(n&&n(d,_,u),_&&s.body.emitter.emit("rangechange",w),u){if(b&&(s.body.emitter.emit("rangechanged",w),o))return o()}else s.animationTimer=setTimeout(t,20)}}()}var _=this._applyRange(r,h);if(f.updateHiddenDates(this.options.moment,this.body,this.options.hiddenDates),_){var w={start:new Date(this.start),end:new Date(this.end),byUser:i.byUser,event:i.event};if(this.body.emitter.emit("rangechange",w),clearTimeout(s.timeoutID),s.timeoutID=setTimeout(function(){s.body.emitter.emit("rangechanged",w)},200),o)return o()}},n.prototype.getMillisecondsPerPixel=function(){return void 0===this.millisecondsPerPixelCache&&(this.millisecondsPerPixelCache=(this.end-this.start)/this.body.dom.center.clientWidth),this.millisecondsPerPixelCache},n.prototype._cancelAnimation=function(){this.animationTimer&&(clearTimeout(this.animationTimer),this.animationTimer=null)},n.prototype._applyRange=function(t,e){var i,o=null!=t?p.convert(t,"Date").valueOf():this.start,n=null!=e?p.convert(e,"Date").valueOf():this.end,s=null!=this.options.max?p.convert(this.options.max,"Date").valueOf():null,r=null!=this.options.min?p.convert(this.options.min,"Date").valueOf():null;if(isNaN(o)||null===o)throw new Error('Invalid start "'+t+'"');if(isNaN(n)||null===n)throw new Error('Invalid end 
"'+e+'"');if(ns&&(n=s)),null!==s&&n>s&&(i=n-s,o-=i,n-=i,null!=r&&o=this.start-.5&&n<=this.end?(o=this.start,n=this.end):(i=a-(n-o),o-=i/2,n+=i/2)}}if(null!==this.options.zoomMax){var h=parseFloat(this.options.zoomMax);h<0&&(h=0),n-o>h&&(this.end-this.start===h&&othis.end?(o=this.start,n=this.end):(i=n-o-h,o+=i/2,n-=i/2))}var d=this.start!=o||this.end!=n;return o>=this.start&&o<=this.end||n>=this.start&&n<=this.end||this.start>=o&&this.start<=n||this.end>=o&&this.end<=n||this.body.emitter.emit("checkRangedItems"),this.start=o,this.end=n,d},n.prototype.getRange=function(){return{start:this.start,end:this.end}},n.prototype.conversion=function(t,e){return n.conversion(this.start,this.end,t,e)},n.conversion=function(t,e,i,o){return void 0===o&&(o=0),0!=i&&e-t!=0?{offset:t,scale:i/(e-t-o)}:{offset:0,scale:1}},n.prototype._onDragStart=function(t){this.deltaDifference=0,this.previousDelta=0,this.options.moveable&&this._isInsideRange(t)&&this.props.touch.allowDragging&&(this.stopRolling(),this.props.touch.start=this.start,this.props.touch.end=this.end,this.props.touch.dragging=!0,this.body.dom.root&&(this.body.dom.root.style.cursor="move"))},n.prototype._onDrag=function(t){if(t&&this.props.touch.dragging&&this.options.moveable&&this.props.touch.allowDragging){var e=this.options.direction;s(e);var i="horizontal"==e?t.deltaX:t.deltaY;i-=this.deltaDifference;var o=this.props.touch.end-this.props.touch.start;o-=f.getHiddenDurationBetween(this.body.hiddenDates,this.start,this.end);var n,r="horizontal"==e?this.body.domProps.center.width:this.body.domProps.center.height;n=this.options.rtl?i/r*o:-i/r*o;var a=this.props.touch.start+n,h=this.props.touch.end+n,d=f.snapAwayFromHidden(this.body.hiddenDates,a,this.previousDelta-i,!0),l=f.snapAwayFromHidden(this.body.hiddenDates,h,this.previousDelta-i,!0);if(d!=a||l!=h)return this.deltaDifference+=i,this.props.touch.start=d,this.props.touch.end=l,void this._onDrag(t);this.previousDelta=i,this._applyRange(a,h);var u=new Date(this.start),p=new Date(this.end);this.body.emitter.emit("rangechange",{start:u,end:p,byUser:!0,event:t}),this.body.emitter.emit("panmove")}},n.prototype._onDragEnd=function(t){this.props.touch.dragging&&this.options.moveable&&this.props.touch.allowDragging&&(this.props.touch.dragging=!1,this.body.dom.root&&(this.body.dom.root.style.cursor="auto"),this.body.emitter.emit("rangechanged",{start:new Date(this.start),end:new Date(this.end),byUser:!0,event:t}))},n.prototype._onMouseWheel=function(t){var e=0;if(t.wheelDelta?e=t.wheelDelta/120:t.detail&&(e=-t.detail/3),!(this.options.zoomKey&&!t[this.options.zoomKey]&&this.options.zoomable||!this.options.zoomable&&this.options.moveable)&&this.options.zoomable&&this.options.moveable&&this._isInsideRange(t)&&e){var i;i=e<0?1-e/5:1/(1+e/5);var o;if(this.rolling)o=this.start+(this.end-this.start)*this.options.rollingMode.offset;else{var n=this.getPointer({x:t.clientX,y:t.clientY},this.body.dom.center);o=this._pointerToDate(n)}this.zoom(i,o,e,t),t.preventDefault()}},n.prototype._onTouch=function(t){this.props.touch.start=this.start,this.props.touch.end=this.end,this.props.touch.allowDragging=!0,this.props.touch.center=null,this.scaleOffset=0,this.deltaDifference=0,p.preventDefault(t)},n.prototype._onPinch=function(t){if(this.options.zoomable&&this.options.moveable){p.preventDefault(t),this.props.touch.allowDragging=!1,this.props.touch.center||(this.props.touch.center=this.getPointer(t.center,this.body.dom.center)),this.stopRolling();var 
e=1/(t.scale+this.scaleOffset),i=this._pointerToDate(this.props.touch.center),o=f.getHiddenDurationBetween(this.body.hiddenDates,this.start,this.end),n=f.getHiddenDurationBefore(this.options.moment,this.body.hiddenDates,this,i),s=o-n,r=i-n+(this.props.touch.start-(i-n))*e,a=i+s+(this.props.touch.end-(i+s))*e;this.startToFront=1-e<=0,this.endToFront=e-1<=0;var h=f.snapAwayFromHidden(this.body.hiddenDates,r,1-e,!0),d=f.snapAwayFromHidden(this.body.hiddenDates,a,e-1,!0);h==r&&d==a||(this.props.touch.start=h,this.props.touch.end=d,this.scaleOffset=1-t.scale,r=h,a=d);var l={animation:!1,byUser:!0,event:t};this.setRange(r,a,l),this.startToFront=!1,this.endToFront=!0}},n.prototype._isInsideRange=function(t){var e,i=t.center?t.center.x:t.clientX;e=this.options.rtl?i-p.getAbsoluteLeft(this.body.dom.centerContainer):p.getAbsoluteRight(this.body.dom.centerContainer)-i;var o=this.body.util.toTime(e);return o>=this.start&&o<=this.end},n.prototype._pointerToDate=function(t){var e,i=this.options.direction;if(s(i),"horizontal"==i)return this.body.util.toTime(t.x).valueOf();var o=this.body.domProps.center.height;return e=this.conversion(o),t.y/e.scale+e.offset},n.prototype.getPointer=function(t,e){return this.options.rtl?{x:p.getAbsoluteRight(e)-t.x,y:t.y-p.getAbsoluteTop(e)}:{x:t.x-p.getAbsoluteLeft(e),y:t.y-p.getAbsoluteTop(e)}},n.prototype.zoom=function(t,e,i,o){null==e&&(e=(this.start+this.end)/2);var n=f.getHiddenDurationBetween(this.body.hiddenDates,this.start,this.end),s=f.getHiddenDurationBefore(this.options.moment,this.body.hiddenDates,this,e),r=n-s,a=e-s+(this.start-(e-s))*t,h=e+r+(this.end-(e+r))*t;this.startToFront=!(i>0),this.endToFront=!(-i>0);var d=f.snapAwayFromHidden(this.body.hiddenDates,a,i,!0),l=f.snapAwayFromHidden(this.body.hiddenDates,h,-i,!0);d==a&&l==h||(a=d,h=l);var u={animation:!1,byUser:!0,event:o};this.setRange(a,h,u),this.startToFront=!1,this.endToFront=!0},n.prototype.move=function(t){var e=this.end-this.start,i=this.start+e*t,o=this.end+e*t;this.start=i,this.end=o},n.prototype.moveTo=function(t){var e=(this.start+this.end)/2,i=e-t,o=this.start-i,n=this.end-i,s={animation:!1,byUser:!0,event:null};this.setRange(o,n,s)},t.exports=n},function(t,e,i){function o(t){return t&&t.__esModule?t:{default:t}}function n(){}var s=i(16),r=o(s),a=i(1),h=o(a),d=i(66),l=i(12),u=i(67),p=i(0),c=i(28),m=i(132),f=i(21),g=i(29);d(n.prototype),n.prototype._create=function(t){function e(t){this.isActive()&&this.emit("mousewheel",t);var e=0,i=0;if("detail"in t&&(i=-1*t.detail),"wheelDelta"in t&&(i=t.wheelDelta),"wheelDeltaY"in t&&(i=t.wheelDeltaY),"wheelDeltaX"in t&&(e=-1*t.wheelDeltaX),"axis"in t&&t.axis===t.HORIZONTAL_AXIS&&(e=-1*i,i=0),"deltaY"in t&&(i=-1*t.deltaY),"deltaX"in t&&(e=t.deltaX),this.options.zoomKey&&!t[this.options.zoomKey])if(t.preventDefault(),this.options.verticalScroll&&Math.abs(i)>=Math.abs(e)){var o=this.props.scrollTop,n=o+i;this.isActive()&&(this._setScrollTop(n),this._redraw(),this.emit("scroll",t))}else if(this.options.horizontalScroll){var s=Math.abs(e)>=Math.abs(i)?e:i,r=s/120*(this.range.end-this.range.start)/20,a=this.range.start+r,h=this.range.end+r,d={animation:!1,byUser:!0,event:t};this.range.setRange(a,h,d)}}function i(t){if(s.options.verticalScroll&&(t.preventDefault(),s.isActive())){var e=-t.target.scrollTop;s._setScrollTop(e),s._redraw(),s.emit("scrollSide",t)}}function o(t){if(t.preventDefault&&t.preventDefault(),!(!t.target.className.indexOf("vis")>-1||a))return t.dataTransfer.dropEffect="move",a=!0,!1}function 
n(t){t.preventDefault&&t.preventDefault(),t.stopPropagation&&t.stopPropagation();try{var e=JSON.parse(t.dataTransfer.getData("text"));if(!e||!e.content)return}catch(t){return!1}return a=!1,t.center={x:t.clientX,y:t.clientY},"item"!==e.target?s.itemSet._onAddItem(t):s.itemSet._onDropObjectOnItem(t),s.emit("drop",s.getEventProperties(t)),!1}this.dom={},this.dom.container=t,this.dom.root=document.createElement("div"),this.dom.background=document.createElement("div"),this.dom.backgroundVertical=document.createElement("div"),this.dom.backgroundHorizontal=document.createElement("div"),this.dom.centerContainer=document.createElement("div"),this.dom.leftContainer=document.createElement("div"),this.dom.rightContainer=document.createElement("div"),this.dom.center=document.createElement("div"),this.dom.left=document.createElement("div"),this.dom.right=document.createElement("div"),this.dom.top=document.createElement("div"),this.dom.bottom=document.createElement("div"),this.dom.shadowTop=document.createElement("div"),this.dom.shadowBottom=document.createElement("div"),this.dom.shadowTopLeft=document.createElement("div"),this.dom.shadowBottomLeft=document.createElement("div"),this.dom.shadowTopRight=document.createElement("div"),this.dom.shadowBottomRight=document.createElement("div"),this.dom.rollingModeBtn=document.createElement("div"),this.dom.root.className="vis-timeline",this.dom.background.className="vis-panel vis-background",this.dom.backgroundVertical.className="vis-panel vis-background vis-vertical",this.dom.backgroundHorizontal.className="vis-panel vis-background vis-horizontal",this.dom.centerContainer.className="vis-panel vis-center",this.dom.leftContainer.className="vis-panel vis-left",this.dom.rightContainer.className="vis-panel vis-right",this.dom.top.className="vis-panel vis-top",this.dom.bottom.className="vis-panel vis-bottom",this.dom.left.className="vis-content",this.dom.center.className="vis-content",this.dom.right.className="vis-content",this.dom.shadowTop.className="vis-shadow vis-top",this.dom.shadowBottom.className="vis-shadow vis-bottom",this.dom.shadowTopLeft.className="vis-shadow vis-top",this.dom.shadowBottomLeft.className="vis-shadow vis-bottom",this.dom.shadowTopRight.className="vis-shadow vis-top",this.dom.shadowBottomRight.className="vis-shadow 
vis-bottom",this.dom.rollingModeBtn.className="vis-rolling-mode-btn",this.dom.root.appendChild(this.dom.background),this.dom.root.appendChild(this.dom.backgroundVertical),this.dom.root.appendChild(this.dom.backgroundHorizontal),this.dom.root.appendChild(this.dom.centerContainer),this.dom.root.appendChild(this.dom.leftContainer),this.dom.root.appendChild(this.dom.rightContainer),this.dom.root.appendChild(this.dom.top),this.dom.root.appendChild(this.dom.bottom),this.dom.root.appendChild(this.dom.bottom),this.dom.root.appendChild(this.dom.rollingModeBtn),this.dom.centerContainer.appendChild(this.dom.center),this.dom.leftContainer.appendChild(this.dom.left),this.dom.rightContainer.appendChild(this.dom.right),this.dom.centerContainer.appendChild(this.dom.shadowTop),this.dom.centerContainer.appendChild(this.dom.shadowBottom),this.dom.leftContainer.appendChild(this.dom.shadowTopLeft),this.dom.leftContainer.appendChild(this.dom.shadowBottomLeft),this.dom.rightContainer.appendChild(this.dom.shadowTopRight),this.dom.rightContainer.appendChild(this.dom.shadowBottomRight),this.props={root:{},background:{},centerContainer:{},leftContainer:{},rightContainer:{},center:{},left:{},right:{},top:{},bottom:{},border:{},scrollTop:0,scrollTopMin:0},this.on("rangechange",function(){!0===this.initialDrawDone&&this._redraw()}.bind(this)),this.on("rangechanged",function(){this.initialRangeChangeDone||(this.initialRangeChangeDone=!0)}.bind(this)),this.on("touch",this._onTouch.bind(this)),this.on("panmove",this._onDrag.bind(this));var s=this;this._origRedraw=this._redraw.bind(this),this._redraw=p.throttle(this._origRedraw),this.on("_change",function(t){s.itemSet&&s.itemSet.initialItemSetDrawn&&t&&1==t.queue?s._redraw():s._origRedraw()}),this.hammer=new l(this.dom.root);var r=this.hammer.get("pinch").set({enable:!0});u.disablePreventDefaultVertically(r),this.hammer.get("pan").set({threshold:5,direction:l.DIRECTION_HORIZONTAL}),this.listeners={},["tap","doubletap","press","pinch","pan","panstart","panmove","panend"].forEach(function(t){var e=function(e){s.isActive()&&s.emit(t,e)};s.hammer.on(t,e),s.listeners[t]=e}),u.onTouch(this.hammer,function(t){s.emit("touch",t)}.bind(this)),u.onRelease(this.hammer,function(t){s.emit("release",t)}.bind(this)),this.dom.centerContainer.addEventListener?(this.dom.centerContainer.addEventListener("mousewheel",e.bind(this),!1),this.dom.centerContainer.addEventListener("DOMMouseScroll",e.bind(this),!1)):this.dom.centerContainer.attachEvent("onmousewheel",e.bind(this)),this.dom.left.parentNode.addEventListener("scroll",i.bind(this)),this.dom.right.parentNode.addEventListener("scroll",i.bind(this));var a=!1;if(this.dom.center.addEventListener("dragover",o.bind(this),!1),this.dom.center.addEventListener("drop",n.bind(this),!1),this.customTimes=[],this.touch={},this.redrawCount=0,this.initialDrawDone=!1,this.initialRangeChangeDone=!1,!t)throw new Error("No container provided");t.appendChild(this.dom.root)},n.prototype.setOptions=function(t){if(t){var e=["width","height","minHeight","maxHeight","autoResize","start","end","clickToUse","dataAttributes","hiddenDates","locale","locales","moment","rtl","zoomKey","horizontalScroll","verticalScroll"];if(p.selectiveExtend(e,this.options,t),this.dom.rollingModeBtn.style.visibility="hidden",this.options.rtl&&(this.dom.container.style.direction="rtl",this.dom.backgroundVertical.className="vis-panel vis-background vis-vertical-rtl"),this.options.verticalScroll&&(this.options.rtl?this.dom.rightContainer.className="vis-panel vis-right 
vis-vertical-scroll":this.dom.leftContainer.className="vis-panel vis-left vis-vertical-scroll"),"object"!==(0,h.default)(this.options.orientation)&&(this.options.orientation={item:void 0,axis:void 0}),"orientation"in t&&("string"==typeof t.orientation?this.options.orientation={item:t.orientation,axis:t.orientation}:"object"===(0,h.default)(t.orientation)&&("item"in t.orientation&&(this.options.orientation.item=t.orientation.item),"axis"in t.orientation&&(this.options.orientation.axis=t.orientation.axis))),"both"===this.options.orientation.axis){if(!this.timeAxis2){var i=this.timeAxis2=new c(this.body);i.setOptions=function(t){var e=t?p.extend({},t):{};e.orientation="top",c.prototype.setOptions.call(i,e)},this.components.push(i)}}else if(this.timeAxis2){var o=this.components.indexOf(this.timeAxis2);-1!==o&&this.components.splice(o,1),this.timeAxis2.destroy(),this.timeAxis2=null}if("function"==typeof t.drawPoints&&(t.drawPoints={onRender:t.drawPoints}),"hiddenDates"in this.options&&f.convertHiddenOptions(this.options.moment,this.body,this.options.hiddenDates),"clickToUse"in t&&(t.clickToUse?this.activator||(this.activator=new m(this.dom.root)):this.activator&&(this.activator.destroy(),delete this.activator)),"showCustomTime"in t)throw new Error("Option `showCustomTime` is deprecated. Create a custom time bar via timeline.addCustomTime(time [, id])");this._initAutoResize()}if(this.components.forEach(function(e){return e.setOptions(t)}),"configure"in t){this.configurator||(this.configurator=this._createConfigurator()),this.configurator.setOptions(t.configure);var n=p.deepExtend({},this.options);this.components.forEach(function(t){p.deepExtend(n,t.options)}),this.configurator.setModuleOptions({global:n})}this._redraw()},n.prototype.isActive=function(){return!this.activator||this.activator.active},n.prototype.destroy=function(){this.setItems(null),this.setGroups(null),this.off(),this._stopAutoResize(),this.dom.root.parentNode&&this.dom.root.parentNode.removeChild(this.dom.root),this.dom=null,this.activator&&(this.activator.destroy(),delete this.activator);for(var t in this.listeners)this.listeners.hasOwnProperty(t)&&delete this.listeners[t];this.listeners=null,this.hammer=null,this.components.forEach(function(t){return t.destroy()}),this.body=null},n.prototype.setCustomTime=function(t,e){var i=this.customTimes.filter(function(t){return e===t.options.id});if(0===i.length)throw new Error("No custom time bar found with id "+(0,r.default)(e));i.length>0&&i[0].setCustomTime(t)},n.prototype.getCustomTime=function(t){var e=this.customTimes.filter(function(e){return e.options.id===t});if(0===e.length)throw new Error("No custom time bar found with id "+(0,r.default)(t));return e[0].getCustomTime()},n.prototype.setCustomTimeTitle=function(t,e){var i=this.customTimes.filter(function(t){return t.options.id===e});if(0===i.length)throw new Error("No custom time bar found with id "+(0,r.default)(e));if(i.length>0)return i[0].setCustomTitle(t)},n.prototype.getEventProperties=function(t){return{event:t}},n.prototype.addCustomTime=function(t,e){var i=void 0!==t?p.convert(t,"Date").valueOf():new Date;if(this.customTimes.some(function(t){return t.options.id===e}))throw new Error("A custom time with id "+(0,r.default)(e)+" already exists");var o=new g(this.body,p.extend({},this.options,{time:i,id:e}));return this.customTimes.push(o),this.components.push(o),this._redraw(),e},n.prototype.removeCustomTime=function(t){var e=this.customTimes.filter(function(e){return e.options.id===t});if(0===e.length)throw new Error("No 
r=Math.max(this.props.majorCharWidth,this.props.minorCharWidth);this.maxLabelSize6&&void 0!==arguments[6]&&arguments[6],a=arguments.length>7&&void 0!==arguments[7]&&arguments[7];if(this.majorSteps=[1,2,5,10],this.minorSteps=[.25,.5,1,2],this.customLines=null,this.containerHeight=n,this.majorCharHeight=s,this._start=t,this._end=e,this.scale=1,this.minorStepIdx=-1,this.magnitudefactor=1,this.determineScale(),this.zeroAlign=r,this.autoScaleStart=i,this.autoScaleEnd=o,this.formattingFunction=a,i||o){var h=this,d=function(t){var e=t-t%(h.magnitudefactor*h.minorSteps[h.minorStepIdx]);return t%(h.magnitudefactor*h.minorSteps[h.minorStepIdx])>h.magnitudefactor*h.minorSteps[h.minorStepIdx]*.5?e+h.magnitudefactor*h.minorSteps[h.minorStepIdx]:e};i&&(this._start-=2*this.magnitudefactor*this.minorSteps[this.minorStepIdx],this._start=d(this._start)),o&&(this._end+=this.magnitudefactor*this.minorSteps[this.minorStepIdx],this._end=d(this._end)),this.determineScale()}}o.prototype.setCharHeight=function(t){this.majorCharHeight=t},o.prototype.setHeight=function(t){this.containerHeight=t},o.prototype.determineScale=function(){var t=this._end-this._start;this.scale=this.containerHeight/t;var e=this.majorCharHeight/this.scale,i=t>0?Math.round(Math.log(t)/Math.LN10):0;this.minorStepIdx=-1,this.magnitudefactor=Math.pow(10,i);var o=0;i<0&&(o=i);for(var n=!1,s=o;Math.abs(s)<=Math.abs(i);s++){this.magnitudefactor=Math.pow(10,s);for(var r=0;r=e){n=!0,this.minorStepIdx=r;break}}if(!0===n)break}},o.prototype.is_major=function(t){return t%(this.magnitudefactor*this.majorSteps[this.minorStepIdx])==0},o.prototype.getStep=function(){return this.magnitudefactor*this.minorSteps[this.minorStepIdx]},o.prototype.getFirstMajor=function(){var t=this.magnitudefactor*this.majorSteps[this.minorStepIdx];return this.convertValue(this._start+(t-this._start%t)%t)},o.prototype.formatValue=function(t){var e=t.toPrecision(5);return"function"==typeof this.formattingFunction&&(e=this.formattingFunction(t)),"number"==typeof e?""+e:"string"==typeof e?e:t.toPrecision(5)},o.prototype.getLines=function(){for(var t=[],e=this.getStep(),i=(e-this._start%e)%e,o=this._start+i;this._end-o>1e-5;o+=e)o!=this._start&&t.push({major:this.is_major(o),y:this.convertValue(o),val:this.formatValue(o)});return t},o.prototype.followScale=function(t){var e=this.minorStepIdx,i=this._start,o=this._end,n=this,s=function(){n.magnitudefactor*=2},r=function(){n.magnitudefactor/=2};t.minorStepIdx<=1&&this.minorStepIdx<=1||t.minorStepIdx>1&&this.minorStepIdx>1||(t.minorStepIdxo+1e-5)r(),d=!1;else{if(!this.autoScaleStart&&this._start=0)){r(),d=!1;continue}console.warn("Can't adhere to given 'min' range, due to zeroalign")}this.autoScaleStart&&this.autoScaleEnd&&ue.x?1:-1})):this.itemsData=[]},o.prototype.getItems=function(){return this.itemsData},o.prototype.setZeroPosition=function(t){this.zeroPosition=t},o.prototype.setOptions=function(t){if(void 0!==t){var e=["sampling","style","sort","yAxisOrientation","barChart","zIndex","excludeFromStacking","excludeFromLegend"];r.selectiveDeepExtend(e,this.options,t),"function"==typeof 
t.drawPoints&&(t.drawPoints={onRender:t.drawPoints}),r.mergeOptions(this.options,t,"interpolation"),r.mergeOptions(this.options,t,"drawPoints"),r.mergeOptions(this.options,t,"shaded"),t.interpolation&&"object"==(0,s.default)(t.interpolation)&&t.interpolation.parametrization&&("uniform"==t.interpolation.parametrization?this.options.interpolation.alpha=0:"chordal"==t.interpolation.parametrization?this.options.interpolation.alpha=1:(this.options.interpolation.parametrization="centripetal",this.options.interpolation.alpha=.5))}},o.prototype.update=function(t){this.group=t,this.content=t.content||"graph",this.className=t.className||this.className||"vis-graph-group"+this.groupsUsingDefaultStyles[0]%10,this.visible=void 0===t.visible||t.visible,this.style=t.style,this.setOptions(t.options)},o.prototype.getLegend=function(t,e,i,o,n){if(void 0==i||null==i){i={svg:document.createElementNS("http://www.w3.org/2000/svg","svg"),svgElements:{},options:this.options,groups:[this]}}switch(void 0!=o&&null!=o||(o=0),void 0!=n&&null!=n||(n=.5*e),this.options.style){case"line":h.drawIcon(this,o,n,t,e,i);break;case"points":case"point":d.drawIcon(this,o,n,t,e,i);break;case"bar":a.drawIcon(this,o,n,t,e,i)}return{icon:i.svg,label:this.content,orientation:this.options.yAxisOrientation}},o.prototype.getYRange=function(t){for(var e=t[0].y,i=t[0].y,o=0;ot[o].y?t[o].y:e,i=i0&&(i=Math.min(i,Math.abs(e[o-1].screen_x-e[o].screen_x))),0===i&&(void 0===t[e[o].screen_x]&&(t[e[o].screen_x]={amount:0,resolved:0,accumulatedPositive:0,accumulatedNegative:0}),t[e[o].screen_x].amount+=1)},o._getSafeDrawData=function(t,e,i){var o,n;return t0?(o=t0){t.sort(function(t,e){return t.screen_x===e.screen_x?t.groupIde[s].screen_y?e[s].screen_y:o,n=nt[r].accumulatedNegative?t[r].accumulatedNegative:o,o=o>t[r].accumulatedPositive?t[r].accumulatedPositive:o,n=n0){return 1==e.options.interpolation.enabled?o._catmullRom(t,e):o._linear(t)}},o.drawIcon=function(t,e,i,o,s,r){var a,h,d=.5*s,l=n.getSVGElement("rect",r.svgElements,r.svg);if(l.setAttributeNS(null,"x",e),l.setAttributeNS(null,"y",i-d),l.setAttributeNS(null,"width",o),l.setAttributeNS(null,"height",2*d),l.setAttributeNS(null,"class","vis-outline"),a=n.getSVGElement("path",r.svgElements,r.svg),a.setAttributeNS(null,"class",t.className),void 0!==t.style&&a.setAttributeNS(null,"style",t.style),a.setAttributeNS(null,"d","M"+e+","+i+" L"+(e+o)+","+i),1==t.options.shaded.enabled&&(h=n.getSVGElement("path",r.svgElements,r.svg),"top"==t.options.shaded.orientation?h.setAttributeNS(null,"d","M"+e+", "+(i-d)+"L"+e+","+i+" L"+(e+o)+","+i+" L"+(e+o)+","+(i-d)):h.setAttributeNS(null,"d","M"+e+","+i+" L"+e+","+(i+d)+" L"+(e+o)+","+(i+d)+"L"+(e+o)+","+i),h.setAttributeNS(null,"class",t.className+" vis-icon-fill"),void 0!==t.options.shaded.style&&""!==t.options.shaded.style&&h.setAttributeNS(null,"style",t.options.shaded.style)),1==t.options.drawPoints.enabled){var u={style:t.options.drawPoints.style,styles:t.options.drawPoints.styles,size:t.options.drawPoints.size,className:t.className};n.drawPoint(e+.5*o,i,u,r.svgElements,r.svg)}},o.drawShading=function(t,e,i,o){if(1==e.options.shaded.enabled){var s=Number(o.svg.style.height.replace("px","")),r=n.getSVGElement("path",o.svgElements,o.svg),a="L";1==e.options.interpolation.enabled&&(a="C");var h,d=0;d="top"==e.options.shaded.orientation?0:"bottom"==e.options.shaded.orientation?s:Math.min(Math.max(0,e.zeroPosition),s),h="group"==e.options.shaded.orientation&&null!=i&&void 0!=i?"M"+t[0][0]+","+t[0][1]+" "+this.serializePath(t,a,!1)+" 
L"+i[i.length-1][0]+","+i[i.length-1][1]+" "+this.serializePath(i,a,!0)+i[0][0]+","+i[0][1]+" Z":"M"+t[0][0]+","+t[0][1]+" "+this.serializePath(t,a,!1)+" V"+d+" H"+t[0][0]+" Z",r.setAttributeNS(null,"class",e.className+" vis-fill"),void 0!==e.options.shaded.style&&r.setAttributeNS(null,"style",e.options.shaded.style),r.setAttributeNS(null,"d",h)}},o.draw=function(t,e,i){if(null!=t&&void 0!=t){var o=n.getSVGElement("path",i.svgElements,i.svg);o.setAttributeNS(null,"class",e.className),void 0!==e.style&&o.setAttributeNS(null,"style",e.style);var s="L";1==e.options.interpolation.enabled&&(s="C"),o.setAttributeNS(null,"d","M"+t[0][0]+","+t[0][1]+" "+this.serializePath(t,s,!1))}},o.serializePath=function(t,e,i){if(t.length<2)return"";var o,n=e;if(i)for(o=t.length-2;o>0;o--)n+=t[o][0]+","+t[o][1]+" ";else for(o=1;o0&&(m=1/m),f=3*g*(g+v),f>0&&(f=1/f),a={screen_x:(-b*o.screen_x+p*n.screen_x+_*s.screen_x)*m,screen_y:(-b*o.screen_y+p*n.screen_y+_*s.screen_y)*m},h={screen_x:(y*n.screen_x+c*s.screen_x-b*r.screen_x)*f,screen_y:(y*n.screen_y+c*s.screen_y-b*r.screen_y)*f},0==a.screen_x&&0==a.screen_y&&(a=n),0==h.screen_x&&0==h.screen_y&&(h=s),x.push([a.screen_x,a.screen_y]),x.push([h.screen_x,h.screen_y]),x.push([s.screen_x,s.screen_y]);return x},o._linear=function(t){for(var e=[],i=0;i");this.dom.textArea.innerHTML=r,this.dom.textArea.style.lineHeight=.75*this.options.iconSize+this.options.iconSpacing+"px"}},o.prototype.drawLegendIcons=function(){if(this.dom.frame.parentNode){var t=(0,s.default)(this.groups);t.sort(function(t,e){return t=t.length?(this._t=void 0,n(1)):"keys"==e?n(0,i):"values"==e?n(0,t[i]):n(0,[i,t[i]])},"values"),s.Arguments=s.Array,o("keys"),o("values"),o("entries")},function(t,e){t.exports=function(){}},function(t,e){t.exports=function(t,e){return{value:e,done:!!t}}},function(t,e,i){var o=i(33);t.exports=Object("z").propertyIsEnumerable(0)?Object:function(t){return"String"==o(t)?t.split(""):Object(t)}},function(t,e,i){var o=i(93);t.exports=function(t,e,i){if(o(t),void 0===e)return t;switch(i){case 1:return function(i){return t.call(e,i)};case 2:return function(i,o){return t.call(e,i,o)};case 3:return function(i,o,n){return t.call(e,i,o,n)}}return function(){return t.apply(e,arguments)}}},function(t,e){t.exports=function(t){if("function"!=typeof t)throw TypeError(t+" is not a function!");return t}},function(t,e,i){var o=i(37),n=i(25),s=i(42),r={};i(14)(r,i(7)("iterator"),function(){return this}),t.exports=function(t,e,i){t.prototype=o(r,{next:n(1,i)}),s(t,e+" Iterator")}},function(t,e,i){var o=i(8),n=i(18),s=i(26);t.exports=i(9)?Object.defineProperties:function(t,e){n(t);for(var i,r=s(e),a=r.length,h=0;a>h;)o.f(t,i=r[h++],e[i]);return t}},function(t,e,i){var o=i(13),n=i(97),s=i(98);t.exports=function(t){return function(e,i,r){var a,h=o(e),d=n(h.length),l=s(r,d);if(t&&i!=i){for(;d>l;)if((a=h[l++])!=a)return!0}else for(;d>l;l++)if((t||l in h)&&h[l]===i)return t||l||0;return!t&&-1}}},function(t,e,i){var o=i(38),n=Math.min;t.exports=function(t){return t>0?n(o(t),9007199254740991):0}},function(t,e,i){var o=i(38),n=Math.max,s=Math.min;t.exports=function(t,e){return t=o(t),t<0?n(t+e,0):s(t,e)}},function(t,e,i){var o=i(6).document;t.exports=o&&o.documentElement},function(t,e,i){var o=i(10),n=i(61),s=i(39)("IE_PROTO"),r=Object.prototype;t.exports=Object.getPrototypeOf||function(t){return t=n(t),o(t,s)?t[s]:"function"==typeof t.constructor&&t instanceof t.constructor?t.constructor.prototype:t instanceof Object?r:null}},function(t,e,i){var o=i(38),n=i(34);t.exports=function(t){return 
function(e,i){var s,r,a=String(n(e)),h=o(i),d=a.length;return h<0||h>=d?t?"":void 0:(s=a.charCodeAt(h),s<55296||s>56319||h+1===d||(r=a.charCodeAt(h+1))<56320||r>57343?t?a.charAt(h):s:t?a.slice(h,h+2):r-56320+(s-55296<<10)+65536)}}},function(t,e,i){var o=i(18),n=i(103);t.exports=i(2).getIterator=function(t){var e=n(t);if("function"!=typeof e)throw TypeError(t+" is not iterable!");return o(e.call(t))}},function(t,e,i){var o=i(104),n=i(7)("iterator"),s=i(23);t.exports=i(2).getIteratorMethod=function(t){if(void 0!=t)return t[n]||t["@@iterator"]||s[o(t)]}},function(t,e,i){var o=i(33),n=i(7)("toStringTag"),s="Arguments"==o(function(){return arguments}()),r=function(t,e){try{return t[e]}catch(t){}};t.exports=function(t){var e,i,a;return void 0===t?"Undefined":null===t?"Null":"string"==typeof(i=r(e=Object(t),n))?i:s?o(e):"Object"==(a=o(e))&&"function"==typeof e.callee?"Arguments":a}},function(t,e,i){i(106);var o=i(2).Object;t.exports=function(t,e){return o.create(t,e)}},function(t,e,i){var o=i(17);o(o.S,"Object",{create:i(37)})},function(t,e,i){i(108),t.exports=i(2).Object.keys},function(t,e,i){var o=i(61),n=i(26);i(109)("keys",function(){return function(t){return n(o(t))}})},function(t,e,i){var o=i(17),n=i(2),s=i(19);t.exports=function(t,e){var i=(n.Object||{})[t]||Object[t],r={};r[t]=e(i),o(o.S+o.F*s(function(){i(1)}),"Object",r)}},function(t,e,i){t.exports={default:i(111),__esModule:!0}},function(t,e,i){i(62),i(55),t.exports=i(44).f("iterator")},function(t,e,i){t.exports={default:i(113),__esModule:!0}},function(t,e,i){i(114),i(120),i(121),i(122),t.exports=i(2).Symbol},function(t,e,i){var o=i(6),n=i(10),s=i(9),r=i(17),a=i(59),h=i(115).KEY,d=i(19),l=i(40),u=i(42),p=i(27),c=i(7),m=i(44),f=i(45),g=i(116),v=i(117),y=i(18),b=i(13),_=i(36),w=i(25),x=i(37),D=i(118),S=i(119),k=i(8),C=i(26),T=S.f,M=k.f,O=D.f,E=o.Symbol,P=o.JSON,I=P&&P.stringify,N=c("_hidden"),A=c("toPrimitive"),R={}.propertyIsEnumerable,L=l("symbol-registry"),F=l("symbols"),H=l("op-symbols"),j=Object.prototype,Y="function"==typeof E,G=o.QObject,z=!G||!G.prototype||!G.prototype.findChild,W=s&&d(function(){return 7!=x(M({},"a",{get:function(){return M(this,"a",{value:7}).a}})).a})?function(t,e,i){var o=T(j,e);o&&delete j[e],M(t,e,i),o&&t!==j&&M(j,e,o)}:M,V=function(t){var e=F[t]=x(E.prototype);return e._k=t,e},B=Y&&"symbol"==typeof E.iterator?function(t){return"symbol"==typeof t}:function(t){return t instanceof E},U=function(t,e,i){return t===j&&U(H,e,i),y(t),e=_(e,!0),y(i),n(F,e)?(i.enumerable?(n(t,N)&&t[N][e]&&(t[N][e]=!1),i=x(i,{enumerable:w(0,!1)})):(n(t,N)||M(t,N,w(1,{})),t[N][e]=!0),W(t,e,i)):M(t,e,i)},q=function(t,e){y(t);for(var i,o=g(e=b(e)),n=0,s=o.length;s>n;)U(t,i=o[n++],e[i]);return t},X=function(t,e){return void 0===e?x(t):q(x(t),e)},Z=function(t){var e=R.call(this,t=_(t,!0));return!(this===j&&n(F,t)&&!n(H,t))&&(!(e||!n(this,t)||!n(F,t)||n(this,N)&&this[N][t])||e)},K=function(t,e){if(t=b(t),e=_(e,!0),t!==j||!n(F,e)||n(H,e)){var i=T(t,e);return!i||!n(F,e)||n(t,N)&&t[N][e]||(i.enumerable=!0),i}},J=function(t){for(var e,i=O(b(t)),o=[],s=0;i.length>s;)n(F,e=i[s++])||e==N||e==h||o.push(e);return o},$=function(t){for(var e,i=t===j,o=O(i?H:b(t)),s=[],r=0;o.length>r;)!n(F,e=o[r++])||i&&!n(j,e)||s.push(F[e]);return s};Y||(E=function(){if(this instanceof E)throw TypeError("Symbol is not a constructor!");var t=p(arguments.length>0?arguments[0]:void 0),e=function(i){this===j&&e.call(H,i),n(this,N)&&n(this[N],t)&&(this[N][t]=!1),W(this,t,w(1,i))};return s&&z&&W(j,t,{configurable:!0,set:e}),V(t)},a(E.prototype,"toString",function(){return 
this._k}),S.f=K,k.f=U,i(64).f=D.f=J,i(46).f=Z,i(63).f=$,s&&!i(35)&&a(j,"propertyIsEnumerable",Z,!0),m.f=function(t){return V(c(t))}),r(r.G+r.W+r.F*!Y,{Symbol:E});for(var Q="hasInstance,isConcatSpreadable,iterator,match,replace,search,species,split,toPrimitive,toStringTag,unscopables".split(","),tt=0;Q.length>tt;)c(Q[tt++]);for(var et=C(c.store),it=0;et.length>it;)f(et[it++]);r(r.S+r.F*!Y,"Symbol",{for:function(t){return n(L,t+="")?L[t]:L[t]=E(t)},keyFor:function(t){if(!B(t))throw TypeError(t+" is not a symbol!");for(var e in L)if(L[e]===t)return e},useSetter:function(){z=!0},useSimple:function(){z=!1}}),r(r.S+r.F*!Y,"Object",{create:X,defineProperty:U,defineProperties:q,getOwnPropertyDescriptor:K,getOwnPropertyNames:J,getOwnPropertySymbols:$}),P&&r(r.S+r.F*(!Y||d(function(){var t=E();return"[null]"!=I([t])||"{}"!=I({a:t})||"{}"!=I(Object(t))})),"JSON",{stringify:function(t){if(void 0!==t&&!B(t)){for(var e,i,o=[t],n=1;arguments.length>n;)o.push(arguments[n++]);return e=o[1],"function"==typeof e&&(i=e),!i&&v(e)||(e=function(t,e){if(i&&(e=i.call(this,t,e)),!B(e))return e}),o[1]=e,I.apply(P,o)}}}),E.prototype[A]||i(14)(E.prototype,A,E.prototype.valueOf),u(E,"Symbol"),u(Math,"Math",!0),u(o.JSON,"JSON",!0)},function(t,e,i){var o=i(27)("meta"),n=i(24),s=i(10),r=i(8).f,a=0,h=Object.isExtensible||function(){return!0},d=!i(19)(function(){return h(Object.preventExtensions({}))}),l=function(t){r(t,o,{value:{i:"O"+ ++a,w:{}}})},u=function(t,e){if(!n(t))return"symbol"==typeof t?t:("string"==typeof t?"S":"P")+t;if(!s(t,o)){if(!h(t))return"F";if(!e)return"E";l(t)}return t[o].i},p=function(t,e){if(!s(t,o)){if(!h(t))return!0;if(!e)return!1;l(t)}return t[o].w},c=function(t){return d&&m.NEED&&h(t)&&!s(t,o)&&l(t),t},m=t.exports={KEY:o,NEED:!1,fastKey:u,getWeak:p,onFreeze:c}},function(t,e,i){var o=i(26),n=i(63),s=i(46);t.exports=function(t){var e=o(t),i=n.f;if(i)for(var r,a=i(t),h=s.f,d=0;a.length>d;)h.call(t,r=a[d++])&&e.push(r);return e}},function(t,e,i){var o=i(33);t.exports=Array.isArray||function(t){return"Array"==o(t)}},function(t,e,i){var o=i(13),n=i(64).f,s={}.toString,r="object"==typeof window&&window&&Object.getOwnPropertyNames?Object.getOwnPropertyNames(window):[],a=function(t){try{return n(t)}catch(t){return r.slice()}};t.exports.f=function(t){return r&&"[object Window]"==s.call(t)?a(t):n(o(t))}},function(t,e,i){var o=i(46),n=i(25),s=i(13),r=i(36),a=i(10),h=i(57),d=Object.getOwnPropertyDescriptor;e.f=i(9)?d:function(t,e){if(t=s(t),e=r(e,!0),h)try{return d(t,e)}catch(t){}if(a(t,e))return n(!o.f.call(t,e),t[e])}},function(t,e){},function(t,e,i){i(45)("asyncIterator")},function(t,e,i){i(45)("observable")},function(t,e,i){(function(t){!function(e,i){t.exports=i()}(0,function(){function e(){return To.apply(null,arguments)}function i(t){return t instanceof Array||"[object Array]"===Object.prototype.toString.call(t)}function o(t){return null!=t&&"[object Object]"===Object.prototype.toString.call(t)}function n(t){if(Object.getOwnPropertyNames)return 0===Object.getOwnPropertyNames(t).length;var e;for(e in t)if(t.hasOwnProperty(e))return!1;return!0}function s(t){return void 0===t}function r(t){return"number"==typeof t||"[object Number]"===Object.prototype.toString.call(t)}function a(t){return t instanceof Date||"[object Date]"===Object.prototype.toString.call(t)}function h(t,e){var i,o=[];for(i=0;i0)for(i=0;i0?"future":"past"];return k(i)?i(e):i.replace(/%s/i,e)}function R(t,e){var i=t.toLowerCase();Fo[i]=Fo[i+"s"]=Fo[e]=t}function L(t){return"string"==typeof t?Fo[t]||Fo[t.toLowerCase()]:void 0}function 
F(t){var e,i,o={};for(i in t)d(t,i)&&(e=L(i))&&(o[e]=t[i]);return o}function H(t,e){Ho[t]=e}function j(t){var e=[];for(var i in t)e.push({unit:i,priority:Ho[i]});return e.sort(function(t,e){return t.priority-e.priority}),e}function Y(t,e,i){var o=""+Math.abs(t),n=e-o.length;return(t>=0?i?"+":"":"-")+Math.pow(10,Math.max(0,n)).toString().substr(1)+o}function G(t,e,i,o){var n=o;"string"==typeof o&&(n=function(){return this[o]()}),t&&(zo[t]=n),e&&(zo[e[0]]=function(){return Y(n.apply(this,arguments),e[1],e[2])}),i&&(zo[i]=function(){return this.localeData().ordinal(n.apply(this,arguments),t)})}function z(t){return t.match(/\[[\s\S]/)?t.replace(/^\[|\]$/g,""):t.replace(/\\/g,"")}function W(t){var e,i,o=t.match(jo);for(e=0,i=o.length;e=0&&Yo.test(t);)t=t.replace(Yo,i),Yo.lastIndex=0,o-=1;return t}function U(t,e,i){an[t]=k(e)?e:function(t,o){return t&&i?i:e}}function q(t,e){return d(an,t)?an[t](e._strict,e._locale):new RegExp(X(t))}function X(t){return Z(t.replace("\\","").replace(/\\(\[)|\\(\])|\[([^\]\[]*)\]|\\(.)/g,function(t,e,i,o,n){return e||i||o||n}))}function Z(t){return t.replace(/[-\/\\^$*+?.()|[\]{}]/g,"\\$&")}function K(t,e){var i,o=e;for("string"==typeof t&&(t=[t]),r(e)&&(o=function(t,i){i[e]=_(t)}),i=0;i=0&&isFinite(a.getFullYear())&&a.setFullYear(t),a}function _t(t){var e=new Date(Date.UTC.apply(null,arguments));return t<100&&t>=0&&isFinite(e.getUTCFullYear())&&e.setUTCFullYear(t),e}function wt(t,e,i){var o=7+e-i;return-(7+_t(t,0,o).getUTCDay()-e)%7+o-1}function xt(t,e,i,o,n){var s,r,a=(7+i-o)%7,h=wt(t,o,n),d=1+7*(e-1)+a+h;return d<=0?(s=t-1,r=Q(s)+d):d>Q(t)?(s=t+1,r=d-Q(t)):(s=t,r=d),{year:s,dayOfYear:r}}function Dt(t,e,i){var o,n,s=wt(t.year(),e,i),r=Math.floor((t.dayOfYear()-s-1)/7)+1;return r<1?(n=t.year()-1,o=r+St(n,e,i)):r>St(t.year(),e,i)?(o=r-St(t.year(),e,i),n=t.year()+1):(n=t.year(),o=r),{week:o,year:n}}function St(t,e,i){var o=wt(t,e,i),n=wt(t+1,e,i);return(Q(t)-o+n)/7}function kt(t){return Dt(t,this._week.dow,this._week.doy).week}function Ct(){return this._week.dow}function Tt(){return this._week.doy}function Mt(t){var e=this.localeData().week(this);return null==t?e:this.add(7*(t-e),"d")}function Ot(t){var e=Dt(this,1,4).week;return null==t?e:this.add(7*(t-e),"d")}function Et(t,e){return"string"!=typeof t?t:isNaN(t)?(t=e.weekdaysParse(t),"number"==typeof t?t:null):parseInt(t,10)}function Pt(t,e){return"string"==typeof t?e.weekdaysParse(t)%7||7:isNaN(t)?null:t}function It(t,e){return t?i(this._weekdays)?this._weekdays[t.day()]:this._weekdays[this._weekdays.isFormat.test(e)?"format":"standalone"][t.day()]:i(this._weekdays)?this._weekdays:this._weekdays.standalone}function Nt(t){return t?this._weekdaysShort[t.day()]:this._weekdaysShort}function At(t){return t?this._weekdaysMin[t.day()]:this._weekdaysMin}function Rt(t,e,i){var o,n,s,r=t.toLocaleLowerCase();if(!this._weekdaysParse)for(this._weekdaysParse=[],this._shortWeekdaysParse=[],this._minWeekdaysParse=[],o=0;o<7;++o)s=u([2e3,1]).day(o),this._minWeekdaysParse[o]=this.weekdaysMin(s,"").toLocaleLowerCase(),this._shortWeekdaysParse[o]=this.weekdaysShort(s,"").toLocaleLowerCase(),this._weekdaysParse[o]=this.weekdays(s,"").toLocaleLowerCase();return 
i?"dddd"===e?(n=yn.call(this._weekdaysParse,r),-1!==n?n:null):"ddd"===e?(n=yn.call(this._shortWeekdaysParse,r),-1!==n?n:null):(n=yn.call(this._minWeekdaysParse,r),-1!==n?n:null):"dddd"===e?-1!==(n=yn.call(this._weekdaysParse,r))?n:-1!==(n=yn.call(this._shortWeekdaysParse,r))?n:(n=yn.call(this._minWeekdaysParse,r),-1!==n?n:null):"ddd"===e?-1!==(n=yn.call(this._shortWeekdaysParse,r))?n:-1!==(n=yn.call(this._weekdaysParse,r))?n:(n=yn.call(this._minWeekdaysParse,r),-1!==n?n:null):-1!==(n=yn.call(this._minWeekdaysParse,r))?n:-1!==(n=yn.call(this._weekdaysParse,r))?n:(n=yn.call(this._shortWeekdaysParse,r),-1!==n?n:null)}function Lt(t,e,i){var o,n,s;if(this._weekdaysParseExact)return Rt.call(this,t,e,i);for(this._weekdaysParse||(this._weekdaysParse=[],this._minWeekdaysParse=[],this._shortWeekdaysParse=[],this._fullWeekdaysParse=[]),o=0;o<7;o++){if(n=u([2e3,1]).day(o),i&&!this._fullWeekdaysParse[o]&&(this._fullWeekdaysParse[o]=new RegExp("^"+this.weekdays(n,"").replace(".",".?")+"$","i"),this._shortWeekdaysParse[o]=new RegExp("^"+this.weekdaysShort(n,"").replace(".",".?")+"$","i"),this._minWeekdaysParse[o]=new RegExp("^"+this.weekdaysMin(n,"").replace(".",".?")+"$","i")),this._weekdaysParse[o]||(s="^"+this.weekdays(n,"")+"|^"+this.weekdaysShort(n,"")+"|^"+this.weekdaysMin(n,""),this._weekdaysParse[o]=new RegExp(s.replace(".",""),"i")),i&&"dddd"===e&&this._fullWeekdaysParse[o].test(t))return o;if(i&&"ddd"===e&&this._shortWeekdaysParse[o].test(t))return o;if(i&&"dd"===e&&this._minWeekdaysParse[o].test(t))return o;if(!i&&this._weekdaysParse[o].test(t))return o}}function Ft(t){if(!this.isValid())return null!=t?this:NaN;var e=this._isUTC?this._d.getUTCDay():this._d.getDay();return null!=t?(t=Et(t,this.localeData()),this.add(t-e,"d")):e}function Ht(t){if(!this.isValid())return null!=t?this:NaN;var e=(this.day()+7-this.localeData()._week.dow)%7;return null==t?e:this.add(t-e,"d")}function jt(t){if(!this.isValid())return null!=t?this:NaN;if(null!=t){var e=Pt(t,this.localeData());return this.day(this.day()%7?e:e-7)}return this.day()||7}function Yt(t){return this._weekdaysParseExact?(d(this,"_weekdaysRegex")||Wt.call(this),t?this._weekdaysStrictRegex:this._weekdaysRegex):(d(this,"_weekdaysRegex")||(this._weekdaysRegex=On),this._weekdaysStrictRegex&&t?this._weekdaysStrictRegex:this._weekdaysRegex)}function Gt(t){return this._weekdaysParseExact?(d(this,"_weekdaysRegex")||Wt.call(this),t?this._weekdaysShortStrictRegex:this._weekdaysShortRegex):(d(this,"_weekdaysShortRegex")||(this._weekdaysShortRegex=En),this._weekdaysShortStrictRegex&&t?this._weekdaysShortStrictRegex:this._weekdaysShortRegex)}function zt(t){return this._weekdaysParseExact?(d(this,"_weekdaysRegex")||Wt.call(this),t?this._weekdaysMinStrictRegex:this._weekdaysMinRegex):(d(this,"_weekdaysMinRegex")||(this._weekdaysMinRegex=Pn),this._weekdaysMinStrictRegex&&t?this._weekdaysMinStrictRegex:this._weekdaysMinRegex)}function Wt(){function t(t,e){return e.length-t.length}var e,i,o,n,s,r=[],a=[],h=[],d=[];for(e=0;e<7;e++)i=u([2e3,1]).day(e),o=this.weekdaysMin(i,""),n=this.weekdaysShort(i,""),s=this.weekdays(i,""),r.push(o),a.push(n),h.push(s),d.push(o),d.push(n),d.push(s);for(r.sort(t),a.sort(t),h.sort(t),d.sort(t),e=0;e<7;e++)a[e]=Z(a[e]),h[e]=Z(h[e]),d[e]=Z(d[e]);this._weekdaysRegex=new RegExp("^("+d.join("|")+")","i"),this._weekdaysShortRegex=this._weekdaysRegex,this._weekdaysMinRegex=this._weekdaysRegex,this._weekdaysStrictRegex=new RegExp("^("+h.join("|")+")","i"),this._weekdaysShortStrictRegex=new 
RegExp("^("+a.join("|")+")","i"),this._weekdaysMinStrictRegex=new RegExp("^("+r.join("|")+")","i")}function Vt(){return this.hours()%12||12}function Bt(){return this.hours()||24}function Ut(t,e){G(t,0,0,function(){return this.localeData().meridiem(this.hours(),this.minutes(),e)})}function qt(t,e){return e._meridiemParse}function Xt(t){return"p"===(t+"").toLowerCase().charAt(0)}function Zt(t,e,i){return t>11?i?"pm":"PM":i?"am":"AM"}function Kt(t){return t?t.toLowerCase().replace("_","-"):t}function Jt(t){for(var e,i,o,n,s=0;s0;){if(o=$t(n.slice(0,e).join("-")))return o;if(i&&i.length>=e&&w(n,i,!0)>=e-1)break;e--}s++}return null}function $t(e){var i=null;if(!Ln[e]&&void 0!==t&&t&&t.exports)try{i=In._abbr;!function(){var t=new Error('Cannot find module "./locale"');throw t.code="MODULE_NOT_FOUND",t}(),Qt(i)}catch(t){}return Ln[e]}function Qt(t,e){var i;return t&&(i=s(e)?ie(t):te(t,e))&&(In=i),In._abbr}function te(t,e){if(null!==e){var i=Rn;if(e.abbr=t,null!=Ln[t])S("defineLocaleOverride","use moment.updateLocale(localeName, config) to change an existing locale. moment.defineLocale(localeName, config) should only be used for creating a new locale See http://momentjs.com/guides/#/warnings/define-locale/ for more info."),i=Ln[t]._config;else if(null!=e.parentLocale){if(null==Ln[e.parentLocale])return Fn[e.parentLocale]||(Fn[e.parentLocale]=[]),Fn[e.parentLocale].push({name:t,config:e}),null;i=Ln[e.parentLocale]._config}return Ln[t]=new M(T(i,e)),Fn[t]&&Fn[t].forEach(function(t){te(t.name,t.config)}),Qt(t),Ln[t]}return delete Ln[t],null}function ee(t,e){if(null!=e){var i,o=Rn;null!=Ln[t]&&(o=Ln[t]._config),e=T(o,e),i=new M(e),i.parentLocale=Ln[t],Ln[t]=i,Qt(t)}else null!=Ln[t]&&(null!=Ln[t].parentLocale?Ln[t]=Ln[t].parentLocale:null!=Ln[t]&&delete Ln[t]);return Ln[t]}function ie(t){var e;if(t&&t._locale&&t._locale._abbr&&(t=t._locale._abbr),!t)return In;if(!i(t)){if(e=$t(t))return e;t=[t]}return Jt(t)}function oe(){return Io(Ln)}function ne(t){var e,i=t._a;return i&&-2===c(t).overflow&&(e=i[ln]<0||i[ln]>11?ln:i[un]<1||i[un]>ht(i[dn],i[ln])?un:i[pn]<0||i[pn]>24||24===i[pn]&&(0!==i[cn]||0!==i[mn]||0!==i[fn])?pn:i[cn]<0||i[cn]>59?cn:i[mn]<0||i[mn]>59?mn:i[fn]<0||i[fn]>999?fn:-1,c(t)._overflowDayOfYear&&(eun)&&(e=un),c(t)._overflowWeeks&&-1===e&&(e=gn),c(t)._overflowWeekday&&-1===e&&(e=vn),c(t).overflow=e),t}function se(t,e,i){return null!=t?t:null!=e?e:i}function re(t){var i=new Date(e.now());return t._useUTC?[i.getUTCFullYear(),i.getUTCMonth(),i.getUTCDate()]:[i.getFullYear(),i.getMonth(),i.getDate()]}function ae(t){var e,i,o,n,s=[];if(!t._d){for(o=re(t),t._w&&null==t._a[un]&&null==t._a[ln]&&he(t),null!=t._dayOfYear&&(n=se(t._a[dn],o[dn]),(t._dayOfYear>Q(n)||0===t._dayOfYear)&&(c(t)._overflowDayOfYear=!0),i=_t(n,0,t._dayOfYear),t._a[ln]=i.getUTCMonth(),t._a[un]=i.getUTCDate()),e=0;e<3&&null==t._a[e];++e)t._a[e]=s[e]=o[e];for(;e<7;e++)t._a[e]=s[e]=null==t._a[e]?2===e?1:0:t._a[e];24===t._a[pn]&&0===t._a[cn]&&0===t._a[mn]&&0===t._a[fn]&&(t._nextDay=!0,t._a[pn]=0),t._d=(t._useUTC?_t:bt).apply(null,s),null!=t._tzm&&t._d.setUTCMinutes(t._d.getUTCMinutes()-t._tzm),t._nextDay&&(t._a[pn]=24),t._w&&void 0!==t._w.d&&t._w.d!==t._d.getDay()&&(c(t).weekdayMismatch=!0)}}function he(t){var e,i,o,n,s,r,a,h;if(e=t._w,null!=e.GG||null!=e.W||null!=e.E)s=1,r=4,i=se(e.GG,t._a[dn],Dt(ke(),1,4).year),o=se(e.W,1),((n=se(e.E,1))<1||n>7)&&(h=!0);else{s=t._locale._week.dow,r=t._locale._week.doy;var 
d=Dt(ke(),s,r);i=se(e.gg,t._a[dn],d.year),o=se(e.w,d.week),null!=e.d?((n=e.d)<0||n>6)&&(h=!0):null!=e.e?(n=e.e+s,(e.e<0||e.e>6)&&(h=!0)):n=s}o<1||o>St(i,s,r)?c(t)._overflowWeeks=!0:null!=h?c(t)._overflowWeekday=!0:(a=xt(i,o,n,s,r),t._a[dn]=a.year,t._dayOfYear=a.dayOfYear)}function de(t){var e,i,o,n,s,r,a=t._i,h=Hn.exec(a)||jn.exec(a);if(h){for(c(t).iso=!0,e=0,i=Gn.length;e0&&c(t).unusedInput.push(r),a=a.slice(a.indexOf(o)+o.length),d+=o.length),zo[s]?(o?c(t).empty=!1:c(t).unusedTokens.push(s),$(s,o,t)):t._strict&&!o&&c(t).unusedTokens.push(s);c(t).charsLeftOver=h-d,a.length>0&&c(t).unusedInput.push(a),t._a[pn]<=12&&!0===c(t).bigHour&&t._a[pn]>0&&(c(t).bigHour=void 0),c(t).parsedDateParts=t._a.slice(0),c(t).meridiem=t._meridiem,t._a[pn]=ye(t._locale,t._a[pn],t._meridiem),ae(t),ne(t)}function ye(t,e,i){var o;return null==i?e:null!=t.meridiemHour?t.meridiemHour(e,i):null!=t.isPM?(o=t.isPM(i),o&&e<12&&(e+=12),o||12!==e||(e=0),e):e}function be(t){var e,i,o,n,s;if(0===t._f.length)return c(t).invalidFormat=!0,void(t._d=new Date(NaN));for(n=0;nthis.clone().month(0).utcOffset()||this.utcOffset()>this.clone().month(5).utcOffset()}function Ue(){if(!s(this._isDSTShifted))return this._isDSTShifted;var t={};if(g(t,this),t=xe(t),t._a){var e=t._isUTC?u(t._a):ke(t._a);this._isDSTShifted=this.isValid()&&w(t._a,e.toArray())>0}else this._isDSTShifted=!1;return this._isDSTShifted}function qe(){return!!this.isValid()&&!this._isUTC}function Xe(){return!!this.isValid()&&this._isUTC}function Ze(){return!!this.isValid()&&(this._isUTC&&0===this._offset)}function Ke(t,e){var i,o,n,s=t,a=null;return Ne(t)?s={ms:t._milliseconds,d:t._days,M:t._months}:r(t)?(s={},e?s[e]=t:s.milliseconds=t):(a=Jn.exec(t))?(i="-"===a[1]?-1:1,s={y:0,d:_(a[un])*i,h:_(a[pn])*i,m:_(a[cn])*i,s:_(a[mn])*i,ms:_(Ae(1e3*a[fn]))*i}):(a=$n.exec(t))?(i="-"===a[1]?-1:(a[1],1),s={y:Je(a[2],i),M:Je(a[3],i),w:Je(a[4],i),d:Je(a[5],i),h:Je(a[6],i),m:Je(a[7],i),s:Je(a[8],i)}):null==s?s={}:"object"==typeof s&&("from"in s||"to"in s)&&(n=Qe(ke(s.from),ke(s.to)),s={},s.ms=n.milliseconds,s.M=n.months),o=new Ie(s),Ne(t)&&d(t,"_locale")&&(o._locale=t._locale),o}function Je(t,e){var i=t&&parseFloat(t.replace(",","."));return(isNaN(i)?0:i)*e}function $e(t,e){var i={milliseconds:0,months:0};return i.months=e.month()-t.month()+12*(e.year()-t.year()),t.clone().add(i.months,"M").isAfter(e)&&--i.months,i.milliseconds=+e-+t.clone().add(i.months,"M"),i}function Qe(t,e){var i;return t.isValid()&&e.isValid()?(e=Fe(e,t),t.isBefore(e)?i=$e(t,e):(i=$e(e,t),i.milliseconds=-i.milliseconds,i.months=-i.months),i):{milliseconds:0,months:0}}function ti(t,e){return function(i,o){var n,s;return null===o||isNaN(+o)||(S(e,"moment()."+e+"(period, number) is deprecated. Please use moment()."+e+"(number, period). 
See http://momentjs.com/guides/#/warnings/add-inverted-param/ for more info."),s=i,i=o,o=s),i="string"==typeof i?+i:i,n=Ke(i,o),ei(this,n,t),this}}function ei(t,i,o,n){var s=i._milliseconds,r=Ae(i._days),a=Ae(i._months);t.isValid()&&(n=null==n||n,a&&ct(t,ot(t,"Month")+a*o),r&&nt(t,"Date",ot(t,"Date")+r*o),s&&t._d.setTime(t._d.valueOf()+s*o),n&&e.updateOffset(t,r||a))}function ii(t,e){var i=t.diff(e,"days",!0);return i<-6?"sameElse":i<-1?"lastWeek":i<0?"lastDay":i<1?"sameDay":i<2?"nextDay":i<7?"nextWeek":"sameElse"}function oi(t,i){var o=t||ke(),n=Fe(o,this).startOf("day"),s=e.calendarFormat(this,n)||"sameElse",r=i&&(k(i[s])?i[s].call(this,o):i[s]);return this.format(r||this.localeData().calendar(s,this,ke(o)))}function ni(){return new v(this)}function si(t,e){var i=y(t)?t:ke(t);return!(!this.isValid()||!i.isValid())&&(e=L(s(e)?"millisecond":e),"millisecond"===e?this.valueOf()>i.valueOf():i.valueOf()9999?V(t,"YYYYYY-MM-DD[T]HH:mm:ss.SSS[Z]"):k(Date.prototype.toISOString)?this.toDate().toISOString():V(t,"YYYY-MM-DD[T]HH:mm:ss.SSS[Z]")}function fi(){if(!this.isValid())return"moment.invalid(/* "+this._i+" */)";var t="moment",e="";this.isLocal()||(t=0===this.utcOffset()?"moment.utc":"moment.parseZone",e="Z");var i="["+t+'("]',o=0<=this.year()&&this.year()<=9999?"YYYY":"YYYYYY",n=e+'[")]';return this.format(i+o+"-MM-DD[T]HH:mm:ss.SSS"+n)}function gi(t){t||(t=this.isUtc()?e.defaultFormatUtc:e.defaultFormat);var i=V(this,t);return this.localeData().postformat(i)}function vi(t,e){return this.isValid()&&(y(t)&&t.isValid()||ke(t).isValid())?Ke({to:this,from:t}).locale(this.locale()).humanize(!e):this.localeData().invalidDate()}function yi(t){return this.from(ke(),t)}function bi(t,e){return this.isValid()&&(y(t)&&t.isValid()||ke(t).isValid())?Ke({from:this,to:t}).locale(this.locale()).humanize(!e):this.localeData().invalidDate()}function _i(t){return this.to(ke(),t)}function wi(t){var e;return void 0===t?this._locale._abbr:(e=ie(t),null!=e&&(this._locale=e),this)}function xi(){return this._locale}function Di(t){switch(t=L(t)){case"year":this.month(0);case"quarter":case"month":this.date(1);case"week":case"isoWeek":case"day":case"date":this.hours(0);case"hour":this.minutes(0);case"minute":this.seconds(0);case"second":this.milliseconds(0)}return"week"===t&&this.weekday(0),"isoWeek"===t&&this.isoWeekday(1),"quarter"===t&&this.month(3*Math.floor(this.month()/3)),this}function Si(t){return void 0===(t=L(t))||"millisecond"===t?this:("date"===t&&(t="day"),this.startOf(t).add(1,"isoWeek"===t?"week":t).subtract(1,"ms"))}function ki(){return this._d.valueOf()-6e4*(this._offset||0)}function Ci(){return Math.floor(this.valueOf()/1e3)}function Ti(){return new Date(this.valueOf())}function Mi(){var t=this;return[t.year(),t.month(),t.date(),t.hour(),t.minute(),t.second(),t.millisecond()]}function Oi(){var t=this;return{years:t.year(),months:t.month(),date:t.date(),hours:t.hours(),minutes:t.minutes(),seconds:t.seconds(),milliseconds:t.milliseconds()}}function Ei(){return this.isValid()?this.toISOString():null}function Pi(){return m(this)}function Ii(){return l({},c(this))}function Ni(){return c(this).overflow}function Ai(){return{input:this._i,format:this._f,locale:this._locale,isUTC:this._isUTC,strict:this._strict}}function Ri(t,e){G(0,[t,t.length],0,e)}function Li(t){return Yi.call(this,t,this.week(),this.weekday(),this.localeData()._week.dow,this.localeData()._week.doy)}function Fi(t){return Yi.call(this,t,this.isoWeek(),this.isoWeekday(),1,4)}function Hi(){return St(this.year(),1,4)}function ji(){var 
t=this.localeData()._week;return St(this.year(),t.dow,t.doy)}function Yi(t,e,i,o,n){var s;return null==t?Dt(this,o,n).year:(s=St(t,o,n),e>s&&(e=s),Gi.call(this,t,e,i,o,n))}function Gi(t,e,i,o,n){var s=xt(t,e,i,o,n),r=_t(s.year,0,s.dayOfYear);return this.year(r.getUTCFullYear()),this.month(r.getUTCMonth()),this.date(r.getUTCDate()),this}function zi(t){return null==t?Math.ceil((this.month()+1)/3):this.month(3*(t-1)+this.month()%3)}function Wi(t){var e=Math.round((this.clone().startOf("day")-this.clone().startOf("year"))/864e5)+1;return null==t?e:this.add(t-e,"d")}function Vi(t,e){e[fn]=_(1e3*("0."+t))}function Bi(){return this._isUTC?"UTC":""}function Ui(){return this._isUTC?"Coordinated Universal Time":""}function qi(t){return ke(1e3*t)}function Xi(){return ke.apply(null,arguments).parseZone()}function Zi(t){return t}function Ki(t,e,i,o){var n=ie(),s=u().set(o,e);return n[i](s,t)}function Ji(t,e,i){if(r(t)&&(e=t,t=void 0),t=t||"",null!=e)return Ki(t,e,i,"month");var o,n=[];for(o=0;o<12;o++)n[o]=Ki(t,o,i,"month");return n}function $i(t,e,i,o){"boolean"==typeof t?(r(e)&&(i=e,e=void 0),e=e||""):(e=t,i=e,t=!1,r(e)&&(i=e,e=void 0),e=e||"");var n=ie(),s=t?n._week.dow:0;if(null!=i)return Ki(e,(i+s)%7,o,"day");var a,h=[];for(a=0;a<7;a++)h[a]=Ki(e,(a+s)%7,o,"day");return h}function Qi(t,e){return Ji(t,e,"months")}function to(t,e){return Ji(t,e,"monthsShort")}function eo(t,e,i){return $i(t,e,i,"weekdays")}function io(t,e,i){return $i(t,e,i,"weekdaysShort")}function oo(t,e,i){return $i(t,e,i,"weekdaysMin")}function no(){var t=this._data;return this._milliseconds=ds(this._milliseconds),this._days=ds(this._days),this._months=ds(this._months),t.milliseconds=ds(t.milliseconds),t.seconds=ds(t.seconds),t.minutes=ds(t.minutes),t.hours=ds(t.hours),t.months=ds(t.months),t.years=ds(t.years),this}function so(t,e,i,o){var n=Ke(e,i);return t._milliseconds+=o*n._milliseconds,t._days+=o*n._days,t._months+=o*n._months,t._bubble()}function ro(t,e){return so(this,t,e,1)}function ao(t,e){return so(this,t,e,-1)}function ho(t){return t<0?Math.floor(t):Math.ceil(t)}function lo(){var t,e,i,o,n,s=this._milliseconds,r=this._days,a=this._months,h=this._data;return s>=0&&r>=0&&a>=0||s<=0&&r<=0&&a<=0||(s+=864e5*ho(po(a)+r),r=0,a=0),h.milliseconds=s%1e3,t=b(s/1e3),h.seconds=t%60,e=b(t/60),h.minutes=e%60,i=b(e/60),h.hours=i%24,r+=b(i/24),n=b(uo(r)),a+=n,r-=ho(po(n)),o=b(a/12),a%=12,h.days=r,h.months=a,h.years=o,this}function uo(t){return 4800*t/146097}function po(t){return 146097*t/4800}function co(t){if(!this.isValid())return NaN;var e,i,o=this._milliseconds;if("month"===(t=L(t))||"year"===t)return e=this._days+o/864e5,i=this._months+uo(e),"month"===t?i:i/12;switch(e=this._days+Math.round(po(this._months)),t){case"week":return e/7+o/6048e5;case"day":return e+o/864e5;case"hour":return 24*e+o/36e5;case"minute":return 1440*e+o/6e4;case"second":return 86400*e+o/1e3;case"millisecond":return Math.floor(864e5*e)+o;default:throw new Error("Unknown unit "+t)}}function mo(){return this.isValid()?this._milliseconds+864e5*this._days+this._months%12*2592e6+31536e6*_(this._months/12):NaN}function fo(t){return function(){return this.as(t)}}function go(){return Ke(this)}function vo(t){return t=L(t),this.isValid()?this[t+"s"]():NaN}function yo(t){return function(){return this.isValid()?this._data[t]:NaN}}function bo(){return b(this.days()/7)}function _o(t,e,i,o,n){return n.relativeTime(e||1,!!i,t,o)}function wo(t,e,i){var 
o=Ke(t).abs(),n=ks(o.as("s")),s=ks(o.as("m")),r=ks(o.as("h")),a=ks(o.as("d")),h=ks(o.as("M")),d=ks(o.as("y")),l=n<=Cs.ss&&["s",n]||n0,l[4]=i,_o.apply(null,l)}function xo(t){return void 0===t?ks:"function"==typeof t&&(ks=t,!0)}function Do(t,e){return void 0!==Cs[t]&&(void 0===e?Cs[t]:(Cs[t]=e,"s"===t&&(Cs.ss=e-1),!0))}function So(t){if(!this.isValid())return this.localeData().invalidDate();var e=this.localeData(),i=wo(this,!t,e);return t&&(i=e.pastFuture(+this,i)),e.postformat(i)}function ko(t){return(t>0)-(t<0)||+t}function Co(){if(!this.isValid())return this.localeData().invalidDate();var t,e,i,o=Ts(this._milliseconds)/1e3,n=Ts(this._days),s=Ts(this._months);t=b(o/60),e=b(t/60),o%=60,t%=60,i=b(s/12),s%=12;var r=i,a=s,h=n,d=e,l=t,u=o?o.toFixed(3).replace(/\.?0+$/,""):"",p=this.asSeconds();if(!p)return"P0D";var c=p<0?"-":"",m=ko(this._months)!==ko(p)?"-":"",f=ko(this._days)!==ko(p)?"-":"",g=ko(this._milliseconds)!==ko(p)?"-":"";return c+"P"+(r?m+r+"Y":"")+(a?m+a+"M":"")+(h?f+h+"D":"")+(d||l||u?"T":"")+(d?g+d+"H":"")+(l?g+l+"M":"")+(u?g+u+"S":"")}var To,Mo;Mo=Array.prototype.some?Array.prototype.some:function(t){for(var e=Object(this),i=e.length>>>0,o=0;o68?1900:2e3)};var yn,bn=it("FullYear",!0);yn=Array.prototype.indexOf?Array.prototype.indexOf:function(t){var e;for(e=0;ethis?this:t:f()}),Xn=function(){return Date.now?Date.now():+new Date},Zn=["year","quarter","month","week","day","hour","minute","second","millisecond"];Re("Z",":"),Re("ZZ",""),U("Z",nn),U("ZZ",nn),K(["Z","ZZ"],function(t,e,i){i._useUTC=!0,i._tzm=Le(nn,t)});var Kn=/([\+\-]|\d\d)/gi;e.updateOffset=function(){};var Jn=/^(\-|\+)?(?:(\d*)[. ])?(\d+)\:(\d+)(?:\:(\d+)(\.\d*)?)?$/,$n=/^(-|\+)?P(?:([-+]?[0-9,.]*)Y)?(?:([-+]?[0-9,.]*)M)?(?:([-+]?[0-9,.]*)W)?(?:([-+]?[0-9,.]*)D)?(?:T(?:([-+]?[0-9,.]*)H)?(?:([-+]?[0-9,.]*)M)?(?:([-+]?[0-9,.]*)S)?)?$/;Ke.fn=Ie.prototype,Ke.invalid=Pe;var Qn=ti(1,"add"),ts=ti(-1,"subtract");e.defaultFormat="YYYY-MM-DDTHH:mm:ssZ",e.defaultFormatUtc="YYYY-MM-DDTHH:mm:ss[Z]";var es=D("moment().lang() is deprecated. Instead, use moment().localeData() to get the language configuration. 
Use moment().locale() to change languages.",function(t){return void 0===t?this.localeData():this.locale(t)});G(0,["gg",2],0,function(){return this.weekYear()%100}),G(0,["GG",2],0,function(){return this.isoWeekYear()%100}),Ri("gggg","weekYear"),Ri("ggggg","weekYear"),Ri("GGGG","isoWeekYear"),Ri("GGGGG","isoWeekYear"),R("weekYear","gg"),R("isoWeekYear","GG"),H("weekYear",1),H("isoWeekYear",1),U("G",en),U("g",en),U("GG",Xo,Vo),U("gg",Xo,Vo),U("GGGG",$o,Uo),U("gggg",$o,Uo),U("GGGGG",Qo,qo),U("ggggg",Qo,qo),J(["gggg","ggggg","GGGG","GGGGG"],function(t,e,i,o){e[o.substr(0,2)]=_(t)}),J(["gg","GG"],function(t,i,o,n){i[n]=e.parseTwoDigitYear(t)}),G("Q",0,"Qo","quarter"),R("quarter","Q"),H("quarter",7),U("Q",Wo),K("Q",function(t,e){e[ln]=3*(_(t)-1)}),G("D",["DD",2],"Do","date"),R("date","D"),H("date",9),U("D",Xo),U("DD",Xo,Vo),U("Do",function(t,e){return t?e._dayOfMonthOrdinalParse||e._ordinalParse:e._dayOfMonthOrdinalParseLenient}),K(["D","DD"],un),K("Do",function(t,e){e[un]=_(t.match(Xo)[0],10)});var is=it("Date",!0);G("DDD",["DDDD",3],"DDDo","dayOfYear"),R("dayOfYear","DDD"),H("dayOfYear",4),U("DDD",Jo),U("DDDD",Bo),K(["DDD","DDDD"],function(t,e,i){i._dayOfYear=_(t)}),G("m",["mm",2],0,"minute"),R("minute","m"),H("minute",14),U("m",Xo),U("mm",Xo,Vo),K(["m","mm"],cn);var os=it("Minutes",!1);G("s",["ss",2],0,"second"),R("second","s"),H("second",15),U("s",Xo),U("ss",Xo,Vo),K(["s","ss"],mn);var ns=it("Seconds",!1);G("S",0,0,function(){return~~(this.millisecond()/100)}),G(0,["SS",2],0,function(){return~~(this.millisecond()/10)}),G(0,["SSS",3],0,"millisecond"),G(0,["SSSS",4],0,function(){return 10*this.millisecond()}),G(0,["SSSSS",5],0,function(){return 100*this.millisecond()}),G(0,["SSSSSS",6],0,function(){return 1e3*this.millisecond()}),G(0,["SSSSSSS",7],0,function(){return 1e4*this.millisecond()}),G(0,["SSSSSSSS",8],0,function(){return 1e5*this.millisecond()}),G(0,["SSSSSSSSS",9],0,function(){return 1e6*this.millisecond()}),R("millisecond","ms"),H("millisecond",16),U("S",Jo,Wo),U("SS",Jo,Vo),U("SSS",Jo,Bo);var ss;for(ss="SSSS";ss.length<=9;ss+="S")U(ss,tn);for(ss="S";ss.length<=9;ss+="S")K(ss,Vi);var rs=it("Milliseconds",!1);G("z",0,0,"zoneAbbr"),G("zz",0,0,"zoneName");var as=v.prototype;as.add=Qn,as.calendar=oi,as.clone=ni,as.diff=ui,as.endOf=Si,as.format=gi,as.from=vi,as.fromNow=yi,as.to=bi,as.toNow=_i,as.get=st,as.invalidAt=Ni,as.isAfter=si,as.isBefore=ri,as.isBetween=ai,as.isSame=hi,as.isSameOrAfter=di,as.isSameOrBefore=li,as.isValid=Pi,as.lang=es,as.locale=wi,as.localeData=xi,as.max=qn,as.min=Un,as.parsingFlags=Ii,as.set=rt,as.startOf=Di,as.subtract=ts,as.toArray=Mi,as.toObject=Oi,as.toDate=Ti,as.toISOString=mi,as.inspect=fi,as.toJSON=Ei,as.toString=ci,as.unix=Ci,as.valueOf=ki,as.creationData=Ai,as.year=bn,as.isLeapYear=et,as.weekYear=Li,as.isoWeekYear=Fi,as.quarter=as.quarters=zi,as.month=mt,as.daysInMonth=ft,as.week=as.weeks=Mt,as.isoWeek=as.isoWeeks=Ot,as.weeksInYear=ji,as.isoWeeksInYear=Hi,as.date=is,as.day=as.days=Ft,as.weekday=Ht,as.isoWeekday=jt,as.dayOfYear=Wi,as.hour=as.hours=An,as.minute=as.minutes=os,as.second=as.seconds=ns,as.millisecond=as.milliseconds=rs,as.utcOffset=je,as.utc=Ge,as.local=ze,as.parseZone=We,as.hasAlignedHourOffset=Ve,as.isDST=Be,as.isLocal=qe,as.isUtcOffset=Xe,as.isUtc=Ze,as.isUTC=Ze,as.zoneAbbr=Bi,as.zoneName=Ui,as.dates=D("dates accessor is deprecated. Use date instead.",is),as.months=D("months accessor is deprecated. Use month instead",mt),as.years=D("years accessor is deprecated. 
Use year instead",bn),as.zone=D("moment().zone is deprecated, use moment().utcOffset instead. http://momentjs.com/guides/#/warnings/zone/",Ye),as.isDSTShifted=D("isDSTShifted is deprecated. See http://momentjs.com/guides/#/warnings/dst-shifted/ for more information",Ue);var hs=M.prototype;hs.calendar=O,hs.longDateFormat=E,hs.invalidDate=P,hs.ordinal=I,hs.preparse=Zi,hs.postformat=Zi,hs.relativeTime=N,hs.pastFuture=A,hs.set=C,hs.months=dt,hs.monthsShort=lt,hs.monthsParse=pt,hs.monthsRegex=vt,hs.monthsShortRegex=gt,hs.week=kt,hs.firstDayOfYear=Tt,hs.firstDayOfWeek=Ct,hs.weekdays=It,hs.weekdaysMin=At,hs.weekdaysShort=Nt,hs.weekdaysParse=Lt,hs.weekdaysRegex=Yt,hs.weekdaysShortRegex=Gt,hs.weekdaysMinRegex=zt,hs.isPM=Xt,hs.meridiem=Zt,Qt("en",{dayOfMonthOrdinalParse:/\d{1,2}(th|st|nd|rd)/,ordinal:function(t){var e=t%10;return t+(1===_(t%100/10)?"th":1===e?"st":2===e?"nd":3===e?"rd":"th")}}),e.lang=D("moment.lang is deprecated. Use moment.locale instead.",Qt),e.langData=D("moment.langData is deprecated. Use moment.localeData instead.",ie);var ds=Math.abs,ls=fo("ms"),us=fo("s"),ps=fo("m"),cs=fo("h"),ms=fo("d"),fs=fo("w"),gs=fo("M"),vs=fo("y"),ys=yo("milliseconds"),bs=yo("seconds"),_s=yo("minutes"),ws=yo("hours"),xs=yo("days"),Ds=yo("months"),Ss=yo("years"),ks=Math.round,Cs={ss:44,s:45,m:45,h:22,d:26,M:11},Ts=Math.abs,Ms=Ie.prototype;return Ms.isValid=Ee,Ms.abs=no,Ms.add=ro,Ms.subtract=ao,Ms.as=co,Ms.asMilliseconds=ls,Ms.asSeconds=us,Ms.asMinutes=ps,Ms.asHours=cs,Ms.asDays=ms,Ms.asWeeks=fs,Ms.asMonths=gs,Ms.asYears=vs,Ms.valueOf=mo,Ms._bubble=lo,Ms.clone=go,Ms.get=vo,Ms.milliseconds=ys,Ms.seconds=bs,Ms.minutes=_s,Ms.hours=ws,Ms.days=xs,Ms.weeks=bo,Ms.months=Ds,Ms.years=Ss,Ms.humanize=So,Ms.toISOString=Co,Ms.toString=Co,Ms.toJSON=Co,Ms.locale=wi,Ms.localeData=xi,Ms.toIsoString=D("toIsoString() is deprecated. 
Please use toISOString() instead (notice the capitals)",Co),Ms.lang=es,G("X",0,0,"unix"),G("x",0,0,"valueOf"),U("x",en),U("X",sn),K("X",function(t,e,i){i._d=new Date(1e3*parseFloat(t,10))}),K("x",function(t,e,i){i._d=new Date(_(t))}),e.version="2.19.1",function(t){To=t}(ke),e.fn=as,e.min=Te,e.max=Me,e.now=Xn,e.utc=u,e.unix=qi,e.months=Qi,e.isDate=a,e.locale=Qt,e.invalid=f,e.duration=Ke,e.isMoment=y,e.weekdays=eo,e.parseZone=Xi,e.localeData=ie,e.isDuration=Ne,e.monthsShort=to,e.weekdaysMin=oo,e.defineLocale=te,e.updateLocale=ee,e.locales=oe,e.weekdaysShort=io,e.normalizeUnits=L,e.relativeTimeRounding=xo,e.relativeTimeThreshold=Do,e.calendarFormat=ii,e.prototype=as,e})}).call(e,i(124)(t))},function(t,e){t.exports=function(t){return t.webpackPolyfill||(t.deprecate=function(){},t.paths=[],t.children||(t.children=[]),Object.defineProperty(t,"loaded",{enumerable:!0,get:function(){return t.l}}),Object.defineProperty(t,"id",{enumerable:!0,get:function(){return t.i}}),t.webpackPolyfill=1),t}},function(t,e){function i(t){throw new Error("Cannot find module '"+t+"'.")}i.keys=function(){return[]},i.resolve=i,t.exports=i,i.id=125},function(t,e,i){(function(e){function i(t,e,i){var o=e&&i||0,n=0;for(e=e||[],t.toLowerCase().replace(/[0-9a-f]{2}/g,function(t){n<16&&(e[o+n++]=u[t])});n<16;)e[o+n++]=0;return e}function o(t,e){var i=e||0,o=l;return o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]}function n(t,e,i){var n=e&&i||0,s=e||[];t=t||{};var r=void 0!==t.clockseq?t.clockseq:f,a=void 0!==t.msecs?t.msecs:(new Date).getTime(),h=void 0!==t.nsecs?t.nsecs:v+1,d=a-g+(h-v)/1e4;if(d<0&&void 0===t.clockseq&&(r=r+1&16383),(d<0||a>g)&&void 0===t.nsecs&&(h=0),h>=1e4)throw new Error("uuid.v1(): Can't create more than 10M uuids/sec");g=a,v=h,f=r,a+=122192928e5;var l=(1e4*(268435455&a)+h)%4294967296;s[n++]=l>>>24&255,s[n++]=l>>>16&255,s[n++]=l>>>8&255,s[n++]=255&l;var u=a/4294967296*1e4&268435455;s[n++]=u>>>8&255,s[n++]=255&u,s[n++]=u>>>24&15|16,s[n++]=u>>>16&255,s[n++]=r>>>8|128,s[n++]=255&r;for(var p=t.node||m,c=0;c<6;c++)s[n+c]=p[c];return e||o(s)}function s(t,e,i){var n=e&&i||0;"string"==typeof t&&(e="binary"==t?new Array(16):null,t=null),t=t||{};var s=t.random||(t.rng||r)();if(s[6]=15&s[6]|64,s[8]=63&s[8]|128,e)for(var a=0;a<16;a++)e[n+a]=s[a];return e||o(s)}var r,a="undefined"!=typeof window?window:void 0!==e?e:null;if(a&&a.crypto&&crypto.getRandomValues){var h=new Uint8Array(16);r=function(){return crypto.getRandomValues(h),h}}if(!r){var d=new Array(16);r=function(){for(var t,e=0;e<16;e++)0==(3&e)&&(t=4294967296*Math.random()),d[e]=t>>>((3&e)<<3)&255;return d}}for(var l=[],u={},p=0;p<256;p++)l[p]=(p+256).toString(16).substr(1),u[l[p]]=p;var c=r(),m=[1|c[0],c[1],c[2],c[3],c[4],c[5]],f=16383&(c[6]<<8|c[7]),g=0,v=0,y=s;y.v1=n,y.v4=s,y.parse=i,y.unparse=o,t.exports=y}).call(e,i(127))},function(t,e){var i;i=function(){return this}();try{i=i||Function("return this")()||(0,eval)("this")}catch(t){"object"==typeof window&&(i=window)}t.exports=i},function(t,e,i){var o=i(2),n=o.JSON||(o.JSON={stringify:JSON.stringify});t.exports=function(t){return n.stringify.apply(n,arguments)}},function(t,e,i){function o(t,e,i,n){if(!(this instanceof o))throw new SyntaxError("Constructor must be called with the new operator");if(!(Array.isArray(i)||i instanceof d||i instanceof l)&&i instanceof Object){var s=n;n=i,i=s}n&&n.throttleRedraw&&console.warn('Timeline option "throttleRedraw" is DEPRICATED and no longer 
supported. It will be removed in the next MAJOR release.');var r=this;if(this.defaultOptions={start:null,end:null,autoResize:!0,orientation:{axis:"bottom",item:"bottom"},moment:a,width:null,height:null,maxHeight:null,minHeight:null},this.options=h.deepExtend({},this.defaultOptions),this._create(t),!n||n&&void 0===n.rtl){this.dom.root.style.visibility="hidden";for(var p,f=this.dom.root;!p&&f;)p=window.getComputedStyle(f,null).direction,f=f.parentElement;this.options.rtl=p&&"rtl"==p.toLowerCase()}else this.options.rtl=n.rtl;this.options.rollingMode=n&&n.rollingMode,this.options.onInitialDrawComplete=n&&n.onInitialDrawComplete,this.components=[],this.body={dom:this.dom,domProps:this.props,emitter:{on:this.on.bind(this),off:this.off.bind(this),emit:this.emit.bind(this)},hiddenDates:[],util:{getScale:function(){return r.timeAxis.step.scale},getStep:function(){return r.timeAxis.step.step},toScreen:r._toScreen.bind(r),toGlobalScreen:r._toGlobalScreen.bind(r),toTime:r._toTime.bind(r),toGlobalTime:r._toGlobalTime.bind(r)}},this.range=new u(this.body,this.options),this.components.push(this.range),this.body.range=this.range,this.timeAxis=new c(this.body,this.options),this.timeAxis2=null,this.components.push(this.timeAxis),this.currentTime=new m(this.body,this.options),this.components.push(this.currentTime),this.itemSet=new g(this.body,this.options),this.components.push(this.itemSet),this.itemsData=null,this.groupsData=null,this.dom.root.onclick=function(t){r.emit("click",r.getEventProperties(t))},this.dom.root.ondblclick=function(t){r.emit("doubleClick",r.getEventProperties(t))},this.dom.root.oncontextmenu=function(t){r.emit("contextmenu",r.getEventProperties(t))},this.dom.root.onmouseover=function(t){r.emit("mouseOver",r.getEventProperties(t))},window.PointerEvent?(this.dom.root.onpointerdown=function(t){r.emit("mouseDown",r.getEventProperties(t))},this.dom.root.onpointermove=function(t){r.emit("mouseMove",r.getEventProperties(t))},this.dom.root.onpointerup=function(t){r.emit("mouseUp",r.getEventProperties(t))}):(this.dom.root.onmousemove=function(t){r.emit("mouseMove",r.getEventProperties(t))},this.dom.root.onmousedown=function(t){r.emit("mouseDown",r.getEventProperties(t))},this.dom.root.onmouseup=function(t){r.emit("mouseUp",r.getEventProperties(t))}),this.initialFitDone=!1,this.on("changed",function(){if(null!=this.itemsData&&!this.options.rollingMode){if(!r.initialFitDone)if(r.initialFitDone=!0,void 0!=r.options.start||void 0!=r.options.end){if(void 0==r.options.start||void 0==r.options.end)var t=r.getItemRange();var e=void 0!=r.options.start?r.options.start:t.min,i=void 0!=r.options.end?r.options.end:t.max;r.setWindow(e,i,{animation:!1})}else r.fit({animation:!1});!r.initialDrawDone&&r.initialRangeChangeDone&&(r.initialDrawDone=!0,r.dom.root.style.visibility="visible",r.options.onInitialDrawComplete&&setTimeout(function(){return r.options.onInitialDrawComplete()},0))}}),n&&this.setOptions(n),i&&this.setGroups(i),e&&this.setItems(e),this._redraw()}function n(t){return h.convert(t.data.start,"Date").valueOf()}function s(t){var e=void 0!=t.data.end?t.data.end:t.data.start;return h.convert(e,"Date").valueOf()}function r(t,e){var i=t.props.leftContainer.height,o=t.props.left.height,n=e.parent,s=n.top,r=!0,a=t.timeAxis.options.orientation.axis,h=function(){return"bottom"==a?n.height-e.top-e.height:e.top},d=-1*t._getScrollTop(),l=s+h(),u=e.height;return ld+i?s+=h()+u-i+t.itemSet.options.margin.item.vertical:r=!1,s=Math.min(s,o-i),{shouldScroll:r,scrollOffset:s,itemTop:l}}var 
a=i(3),h=i(0),d=i(15),l=i(20),u=i(47),p=i(48),c=i(28),m=i(50),f=i(29),g=i(70),v=i(32).printStyle,y=i(75).allOptions,b=i(75).configureOptions,_=i(76).default,w=i(32).default;o.prototype=new p,o.prototype._createConfigurator=function(){return new _(this,this.dom.container,b)},o.prototype.redraw=function(){this.itemSet&&this.itemSet.markDirty({refreshItems:!0}),this._redraw()},o.prototype.setOptions=function(t){if(!0===w.validate(t,y)&&console.log("%cErrors have been found in the supplied options object.",v),p.prototype.setOptions.call(this,t),"type"in t&&t.type!==this.options.type){this.options.type=t.type;var e=this.itemsData;if(e){var i=this.getSelection();this.setItems(null),this.setItems(e),this.setSelection(i)}}},o.prototype.setItems=function(t){var e;e=t?t instanceof d||t instanceof l?t:new d(t,{type:{start:"Date",end:"Date"}}):null,this.itemsData=e,this.itemSet&&this.itemSet.setItems(e)},o.prototype.setGroups=function(t){var e;if(t){var i=function(t){return!1!==t.visible};e=t instanceof d||t instanceof l?new l(t,{filter:i}):new d(t.filter(i))}else e=null;this.groupsData=e,this.itemSet.setGroups(e)},o.prototype.setData=function(t){t&&t.groups&&this.setGroups(t.groups),t&&t.items&&this.setItems(t.items)},o.prototype.setSelection=function(t,e){this.itemSet&&this.itemSet.setSelection(t),e&&e.focus&&this.focus(t,e)},o.prototype.getSelection=function(){return this.itemSet&&this.itemSet.getSelection()||[]},o.prototype.focus=function(t,e){if(this.itemsData&&void 0!=t){var i=Array.isArray(t)?t:[t],o=this.itemsData.getDataSet().get(i,{type:{start:"Date",end:"Date"}}),n=null,s=null;if(o.forEach(function(t){var e=t.start.valueOf(),i="end"in t?t.end.valueOf():t.start.valueOf();(null===n||es)&&(s=i)}),null!==n&&null!==s){var a=this,h=this.itemSet.items[i[0]],d=-1*this._getScrollTop(),l=null,u=function(t,e,i){var o=r(a,h);if(l||(l=o),l.itemTop!=o.itemTop||l.shouldScroll){l.itemTop!=o.itemTop&&o.shouldScroll&&(l=o,d=-1*a._getScrollTop());var n=d,s=l.scrollOffset,u=i?s:n+(s-n)*t;a._setScrollTop(-u),e||a._redraw()}},p=function(){var t=r(a,h);t.shouldScroll&&t.itemTop!=l.itemTop&&(a._setScrollTop(-t.scrollOffset),a._redraw())},c=function(){p(),setTimeout(p,100)},m=(n+s)/2,f=Math.max(this.range.end-this.range.start,1.1*(s-n)),g=!e||void 0===e.animation||e.animation;g||(l={shouldScroll:!1,scrollOffset:-1,itemTop:-1}),this.range.setRange(m-f/2,m+f/2,{animation:g},c,u)}}},o.prototype.fit=function(t,e){var i,o=!t||void 0===t.animation||t.animation,n=this.itemsData&&this.itemsData.getDataSet();1===n.length&&void 0===n.get()[0].end?(i=this.getDataRange(),this.moveTo(i.min.valueOf(),{animation:o},e)):(i=this.getItemRange(),this.range.setRange(i.min,i.max,{animation:o},e))},o.prototype.getItemRange=function(){var t=this.getDataRange(),e=null!==t.min?t.min.valueOf():null,i=null!==t.max?t.max.valueOf():null,o=null,r=null;if(null!=e&&null!=i){var a=i-e;a<=0&&(a=10);var d=a/this.props.center.width,l={},u=0;h.forEach(this.itemSet.items,function(t,e){if(t.groupShowing){l[e]=t.redraw(!0),u=l[e].length}});if(u>0)for(var p=0;pi&&(i=h,r=t)}.bind(this)),o&&r){var c=o.getWidthLeft()+10,m=r.getWidthRight()+10,f=this.props.center.width-c-m;f>0&&(this.options.rtl?(e=n(o)-m*a/f,i=s(r)+c*a/f):(e=n(o)-c*a/f,i=s(r)+m*a/f))}}return{min:null!=e?new Date(e):null,max:null!=i?new Date(i):null}},o.prototype.getDataRange=function(){var t=null,e=null,i=this.itemsData&&this.itemsData.getDataSet();return i&&i.forEach(function(i){var o=h.convert(i.start,"Date").valueOf(),n=h.convert(void 
0!=i.end?i.end:i.start,"Date").valueOf();(null===t||oe)&&(e=n)}),{min:null!=t?new Date(t):null,max:null!=e?new Date(e):null}},o.prototype.getEventProperties=function(t){var e,i=t.center?t.center.x:t.clientX,o=t.center?t.center.y:t.clientY;e=this.options.rtl?h.getAbsoluteRight(this.dom.centerContainer)-i:i-h.getAbsoluteLeft(this.dom.centerContainer);var n=o-h.getAbsoluteTop(this.dom.centerContainer),s=this.itemSet.itemFromTarget(t),r=this.itemSet.groupFromTarget(t),a=f.customTimeFromTarget(t),d=this.itemSet.options.snap||null,l=this.body.util.getScale(),u=this.body.util.getStep(),p=this._toTime(e),c=d?d(p,l,u):p,m=h.getTarget(t),g=null;return null!=s?g="item":null!=a?g="custom-time":h.hasParent(m,this.timeAxis.dom.foreground)?g="axis":this.timeAxis2&&h.hasParent(m,this.timeAxis2.dom.foreground)?g="axis":h.hasParent(m,this.itemSet.dom.labelSet)?g="group-label":h.hasParent(m,this.currentTime.bar)?g="current-time":h.hasParent(m,this.dom.center)&&(g="background"),{event:t,item:s?s.id:null,group:r?r.groupId:null,what:g,pageX:t.srcEvent?t.srcEvent.pageX:t.pageX,pageY:t.srcEvent?t.srcEvent.pageY:t.pageY,x:e,y:n,time:p,snappedTime:c}},o.prototype.toggleRollingMode=function(){this.range.rolling?this.range.stopRolling():(void 0==this.options.rollingMode&&this.setOptions(this.options),this.range.startRolling())},t.exports=o},function(t,e,i){var o,n,s;!function(i){n=[],o=i,void 0!==(s="function"==typeof o?o.apply(e,n):o)&&(t.exports=s)}(function(){var t=null;return function e(i,o){function n(t){return t.match(/[^ ]+/g)}function s(e){if("hammer.input"!==e.type){if(e.srcEvent._handled||(e.srcEvent._handled={}),e.srcEvent._handled[e.type])return;e.srcEvent._handled[e.type]=!0}var i=!1;e.stopPropagation=function(){i=!0};var o=e.srcEvent.stopPropagation.bind(e.srcEvent);"function"==typeof o&&(e.srcEvent.stopPropagation=function(){o(),e.stopPropagation()}),e.firstTarget=t;for(var n=t;n&&!i;){var s=n.hammer;if(s)for(var r,a=0;a0?d._handlers[t]=o:(i.off(t,s),delete d._handlers[t]))}),d},d.emit=function(e,o){t=o.target,i.emit(e,o)},d.destroy=function(){var t=i.element.hammer,e=t.indexOf(d);-1!==e&&t.splice(e,1),t.length||delete i.element.hammer,d._handlers={},i.destroy()},d}})},function(t,e,i){var o;/*! 
Hammer.JS - v2.0.7 - 2016-04-22 + * http://hammerjs.github.io/ + * + * Copyright (c) 2016 Jorik Tangelder; + * Licensed under the MIT license */ +!function(n,s,r,a){function h(t,e,i){return setTimeout(c(t,i),e)}function d(t,e,i){return!!Array.isArray(t)&&(l(t,i[e],i),!0)}function l(t,e,i){var o;if(t)if(t.forEach)t.forEach(e,i);else if(t.length!==a)for(o=0;o\s*\(/gm,"{anonymous}()@"):"Unknown Stack Trace",s=n.console&&(n.console.warn||n.console.log);return s&&s.call(n.console,o,i),t.apply(this,arguments)}}function p(t,e,i){var o,n=e.prototype;o=t.prototype=Object.create(n),o.constructor=t,o._super=n,i&&mt(o,i)}function c(t,e){return function(){return t.apply(e,arguments)}}function m(t,e){return typeof t==vt?t.apply(e?e[0]||a:a,e):t}function f(t,e){return t===a?e:t}function g(t,e,i){l(_(e),function(e){t.addEventListener(e,i,!1)})}function v(t,e,i){l(_(e),function(e){t.removeEventListener(e,i,!1)})}function y(t,e){for(;t;){if(t==e)return!0;t=t.parentNode}return!1}function b(t,e){return t.indexOf(e)>-1}function _(t){return t.trim().split(/\s+/g)}function w(t,e,i){if(t.indexOf&&!i)return t.indexOf(e);for(var o=0;oi[e]}):o.sort()),o}function S(t,e){for(var i,o,n=e[0].toUpperCase()+e.slice(1),s=0;s1&&!i.firstMultiple?i.firstMultiple=N(e):1===n&&(i.firstMultiple=!1);var s=i.firstInput,r=i.firstMultiple,a=r?r.center:s.center,h=e.center=A(o);e.timeStamp=_t(),e.deltaTime=e.timeStamp-s.timeStamp,e.angle=H(a,h),e.distance=F(a,h),P(i,e),e.offsetDirection=L(e.deltaX,e.deltaY);var d=R(e.deltaTime,e.deltaX,e.deltaY);e.overallVelocityX=d.x,e.overallVelocityY=d.y,e.overallVelocity=bt(d.x)>bt(d.y)?d.x:d.y,e.scale=r?Y(r.pointers,o):1,e.rotation=r?j(r.pointers,o):0,e.maxPointers=i.prevInput?e.pointers.length>i.prevInput.maxPointers?e.pointers.length:i.prevInput.maxPointers:e.pointers.length,I(i,e);var l=t.element;y(e.srcEvent.target,l)&&(l=e.srcEvent.target),e.target=l}function P(t,e){var i=e.center,o=t.offsetDelta||{},n=t.prevDelta||{},s=t.prevInput||{};e.eventType!==Ot&&s.eventType!==Pt||(n=t.prevDelta={x:s.deltaX||0,y:s.deltaY||0},o=t.offsetDelta={x:i.x,y:i.y}),e.deltaX=n.x+(i.x-o.x),e.deltaY=n.y+(i.y-o.y)}function I(t,e){var i,o,n,s,r=t.lastInterval||e,h=e.timeStamp-r.timeStamp;if(e.eventType!=It&&(h>Mt||r.velocity===a)){var d=e.deltaX-r.deltaX,l=e.deltaY-r.deltaY,u=R(h,d,l);o=u.x,n=u.y,i=bt(u.x)>bt(u.y)?u.x:u.y,s=L(d,l),t.lastInterval=e}else i=r.velocity,o=r.velocityX,n=r.velocityY,s=r.direction;e.velocity=i,e.velocityX=o,e.velocityY=n,e.direction=s}function N(t){for(var e=[],i=0;i=bt(e)?t<0?At:Rt:e<0?Lt:Ft}function F(t,e,i){i||(i=Gt);var o=e[i[0]]-t[i[0]],n=e[i[1]]-t[i[1]];return Math.sqrt(o*o+n*n)}function H(t,e,i){i||(i=Gt);var o=e[i[0]]-t[i[0]],n=e[i[1]]-t[i[1]];return 180*Math.atan2(n,o)/Math.PI}function j(t,e){return H(e[1],e[0],zt)+H(t[1],t[0],zt)}function Y(t,e){return F(e[0],e[1],zt)/F(t[0],t[1],zt)}function G(){this.evEl=Vt,this.evWin=Bt,this.pressed=!1,T.apply(this,arguments)}function z(){this.evEl=Xt,this.evWin=Zt,T.apply(this,arguments),this.store=this.manager.session.pointerEvents=[]}function W(){this.evTarget=Jt,this.evWin=$t,this.started=!1,T.apply(this,arguments)}function V(t,e){var i=x(t.touches),o=x(t.changedTouches);return e&(Pt|It)&&(i=D(i.concat(o),"identifier",!0)),[i,o]}function B(){this.evTarget=te,this.targetIds={},T.apply(this,arguments)}function U(t,e){var i=x(t.touches),o=this.targetIds;if(e&(Ot|Et)&&1===i.length)return o[i[0].identifier]=!0,[i,i];var n,s,r=x(t.changedTouches),a=[],h=this.target;if(s=i.filter(function(t){return 
y(t.target,h)}),e===Ot)for(n=0;n-1&&o.splice(t,1)};setTimeout(n,ee)}}function K(t){for(var e=t.srcEvent.clientX,i=t.srcEvent.clientY,o=0;o-1&&this.requireFail.splice(e,1),this},hasRequireFailures:function(){return this.requireFail.length>0},canRecognizeWith:function(t){return!!this.simultaneous[t.id]},emit:function(t){function e(e){i.manager.emit(e,t)}var i=this,o=this.state;o=me&&e(i.options.event+tt(o))},tryEmit:function(t){if(this.canEmit())return this.emit(t);this.state=32},canEmit:function(){for(var t=0;te.threshold&&n&e.direction},attrTest:function(t){return ot.prototype.attrTest.call(this,t)&&(this.state&pe||!(this.state&pe)&&this.directionTest(t))},emit:function(t){this.pX=t.deltaX,this.pY=t.deltaY;var e=et(t.direction);e&&(t.additionalEvent=this.options.event+e),this._super.emit.call(this,t)}}),p(st,ot,{defaults:{event:"pinch",threshold:0,pointers:2},getTouchAction:function(){return[ae]},attrTest:function(t){return this._super.attrTest.call(this,t)&&(Math.abs(t.scale-1)>this.options.threshold||this.state&pe)},emit:function(t){if(1!==t.scale){var e=t.scale<1?"in":"out";t.additionalEvent=this.options.event+e}this._super.emit.call(this,t)}}),p(rt,Q,{defaults:{event:"press",pointers:1,time:251,threshold:9},getTouchAction:function(){return[se]},process:function(t){var e=this.options,i=t.pointers.length===e.pointers,o=t.distancee.time;if(this._input=t,!o||!i||t.eventType&(Pt|It)&&!n)this.reset();else if(t.eventType&Ot)this.reset(),this._timer=h(function(){this.state=fe,this.tryEmit()},e.time,this);else if(t.eventType&Pt)return fe;return 32},reset:function(){clearTimeout(this._timer)},emit:function(t){this.state===fe&&(t&&t.eventType&Pt?this.manager.emit(this.options.event+"up",t):(this._input.timeStamp=_t(),this.manager.emit(this.options.event,this._input)))}}),p(at,ot,{defaults:{event:"rotate",threshold:0,pointers:2},getTouchAction:function(){return[ae]},attrTest:function(t){return this._super.attrTest.call(this,t)&&(Math.abs(t.rotation)>this.options.threshold||this.state&pe)}}),p(ht,ot,{defaults:{event:"swipe",threshold:10,velocity:.3,direction:Ht|jt,pointers:1},getTouchAction:function(){return nt.prototype.getTouchAction.call(this)},attrTest:function(t){var e,i=this.options.direction;return i&(Ht|jt)?e=t.overallVelocity:i&Ht?e=t.overallVelocityX:i&jt&&(e=t.overallVelocityY),this._super.attrTest.call(this,t)&&i&t.offsetDirection&&t.distance>this.options.threshold&&t.maxPointers==this.options.pointers&&bt(e)>this.options.velocity&&t.eventType&Pt},emit:function(t){var e=et(t.offsetDirection);e&&this.manager.emit(this.options.event+e,t),this.manager.emit(this.options.event,t)}}),p(dt,Q,{defaults:{event:"tap",pointers:1,taps:1,interval:300,time:250,threshold:9,posThreshold:10},getTouchAction:function(){return[re]},process:function(t){var e=this.options,i=t.pointers.length===e.pointers,o=t.distancen-this.padding&&(a=!0),s=a?this.x-i:this.x,r=h?this.y-e:this.y}else r=this.y-e,r+e+this.padding>o&&(r=o-e-this.padding),rn&&(s=n-i-this.padding),s0&&void 0!==arguments[0]?arguments[0]:1;(0,a.default)(this,t),this.pixelRatio=e,this.generated=!1,this.centerCoordinates={x:144.5,y:144.5},this.r=289*.49,this.color={r:255,g:255,b:255,a:1},this.hueCircle=void 0,this.initialColor={r:255,g:255,b:255,a:1},this.previousColor=void 0,this.applied=!1,this.updateCallback=function(){},this.closeCallback=function(){},this._create()}return(0,d.default)(t,[{key:"insertTo",value:function(t){void 0!==this.hammer&&(this.hammer.destroy(),this.hammer=void 
0),this.container=t,this.container.appendChild(this.frame),this._bindHammer(),this._setSize()}},{key:"setUpdateCallback",value:function(t){if("function"!=typeof t)throw new Error("Function attempted to set as colorPicker update callback is not a function.");this.updateCallback=t}},{key:"setCloseCallback",value:function(t){if("function"!=typeof t)throw new Error("Function attempted to set as colorPicker closing callback is not a function.");this.closeCallback=t}},{key:"_isColorString",value:function(t){var e={black:"#000000",navy:"#000080",darkblue:"#00008B",mediumblue:"#0000CD",blue:"#0000FF",darkgreen:"#006400",green:"#008000",teal:"#008080",darkcyan:"#008B8B",deepskyblue:"#00BFFF",darkturquoise:"#00CED1",mediumspringgreen:"#00FA9A",lime:"#00FF00",springgreen:"#00FF7F",aqua:"#00FFFF",cyan:"#00FFFF",midnightblue:"#191970",dodgerblue:"#1E90FF",lightseagreen:"#20B2AA",forestgreen:"#228B22",seagreen:"#2E8B57",darkslategray:"#2F4F4F",limegreen:"#32CD32",mediumseagreen:"#3CB371",turquoise:"#40E0D0",royalblue:"#4169E1",steelblue:"#4682B4",darkslateblue:"#483D8B",mediumturquoise:"#48D1CC",indigo:"#4B0082",darkolivegreen:"#556B2F",cadetblue:"#5F9EA0",cornflowerblue:"#6495ED",mediumaquamarine:"#66CDAA",dimgray:"#696969",slateblue:"#6A5ACD",olivedrab:"#6B8E23",slategray:"#708090",lightslategray:"#778899",mediumslateblue:"#7B68EE",lawngreen:"#7CFC00",chartreuse:"#7FFF00",aquamarine:"#7FFFD4",maroon:"#800000",purple:"#800080",olive:"#808000",gray:"#808080",skyblue:"#87CEEB",lightskyblue:"#87CEFA",blueviolet:"#8A2BE2",darkred:"#8B0000",darkmagenta:"#8B008B",saddlebrown:"#8B4513",darkseagreen:"#8FBC8F",lightgreen:"#90EE90",mediumpurple:"#9370D8",darkviolet:"#9400D3",palegreen:"#98FB98",darkorchid:"#9932CC",yellowgreen:"#9ACD32",sienna:"#A0522D",brown:"#A52A2A",darkgray:"#A9A9A9",lightblue:"#ADD8E6",greenyellow:"#ADFF2F",paleturquoise:"#AFEEEE",lightsteelblue:"#B0C4DE",powderblue:"#B0E0E6",firebrick:"#B22222",darkgoldenrod:"#B8860B",mediumorchid:"#BA55D3",rosybrown:"#BC8F8F",darkkhaki:"#BDB76B",silver:"#C0C0C0",mediumvioletred:"#C71585",indianred:"#CD5C5C",peru:"#CD853F",chocolate:"#D2691E",tan:"#D2B48C",lightgrey:"#D3D3D3",palevioletred:"#D87093",thistle:"#D8BFD8",orchid:"#DA70D6",goldenrod:"#DAA520",crimson:"#DC143C",gainsboro:"#DCDCDC",plum:"#DDA0DD",burlywood:"#DEB887",lightcyan:"#E0FFFF",lavender:"#E6E6FA",darksalmon:"#E9967A",violet:"#EE82EE",palegoldenrod:"#EEE8AA",lightcoral:"#F08080",khaki:"#F0E68C",aliceblue:"#F0F8FF",honeydew:"#F0FFF0",azure:"#F0FFFF",sandybrown:"#F4A460",wheat:"#F5DEB3",beige:"#F5F5DC",whitesmoke:"#F5F5F5",mintcream:"#F5FFFA",ghostwhite:"#F8F8FF",salmon:"#FA8072",antiquewhite:"#FAEBD7",linen:"#FAF0E6",lightgoldenrodyellow:"#FAFAD2",oldlace:"#FDF5E6",red:"#FF0000",fuchsia:"#FF00FF",magenta:"#FF00FF",deeppink:"#FF1493",orangered:"#FF4500",tomato:"#FF6347",hotpink:"#FF69B4",coral:"#FF7F50",darkorange:"#FF8C00",lightsalmon:"#FFA07A",orange:"#FFA500",lightpink:"#FFB6C1",pink:"#FFC0CB",gold:"#FFD700",peachpuff:"#FFDAB9",navajowhite:"#FFDEAD",moccasin:"#FFE4B5",bisque:"#FFE4C4",mistyrose:"#FFE4E1",blanchedalmond:"#FFEBCD",papayawhip:"#FFEFD5",lavenderblush:"#FFF0F5",seashell:"#FFF5EE",cornsilk:"#FFF8DC",lemonchiffon:"#FFFACD",floralwhite:"#FFFAF0",snow:"#FFFAFA",yellow:"#FFFF00",lightyellow:"#FFFFE0",ivory:"#FFFFF0",white:"#FFFFFF"};if("string"==typeof t)return e[t]}},{key:"setColor",value:function(t){var e=!(arguments.length>1&&void 0!==arguments[1])||arguments[1];if("none"!==t){var i=void 0,o=this._isColorString(t);if(void 
0!==o&&(t=o),!0===p.isString(t)){if(!0===p.isValidRGB(t)){var n=t.substr(4).substr(0,t.length-5).split(",");i={r:n[0],g:n[1],b:n[2],a:1}}else if(!0===p.isValidRGBA(t)){var r=t.substr(5).substr(0,t.length-6).split(",");i={r:r[0],g:r[1],b:r[2],a:r[3]}}else if(!0===p.isValidHex(t)){var a=p.hexToRGB(t);i={r:a.r,g:a.g,b:a.b,a:1}}}else if(t instanceof Object&&void 0!==t.r&&void 0!==t.g&&void 0!==t.b){var h=void 0!==t.a?t.a:"1.0";i={r:t.r,g:t.g,b:t.b,a:h}}if(void 0===i)throw new Error("Unknown color passed to the colorPicker. Supported are strings: rgb, hex, rgba. Object: rgb ({r:r,g:g,b:b,[a:a]}). Supplied: "+(0,s.default)(t));this._setColor(i,e)}}},{key:"show",value:function(){void 0!==this.closeCallback&&(this.closeCallback(),this.closeCallback=void 0),this.applied=!1,this.frame.style.display="block",this._generateHueCircle()}},{key:"_hide",value:function(){var t=this;!0===(!(arguments.length>0&&void 0!==arguments[0])||arguments[0])&&(this.previousColor=p.extend({},this.color)),!0===this.applied&&this.updateCallback(this.initialColor),this.frame.style.display="none",setTimeout(function(){void 0!==t.closeCallback&&(t.closeCallback(),t.closeCallback=void 0)},0)}},{key:"_save",value:function(){this.updateCallback(this.color),this.applied=!1,this._hide()}},{key:"_apply",value:function(){this.applied=!0,this.updateCallback(this.color),this._updatePicker(this.color)}},{key:"_loadLast",value:function(){void 0!==this.previousColor?this.setColor(this.previousColor,!1):alert("There is no last color to load...")}},{key:"_setColor",value:function(t){!0===(!(arguments.length>1&&void 0!==arguments[1])||arguments[1])&&(this.initialColor=p.extend({},t)),this.color=t;var e=p.RGBToHSV(t.r,t.g,t.b),i=2*Math.PI,o=this.r*e.s,n=this.centerCoordinates.x+o*Math.sin(i*e.h),s=this.centerCoordinates.y+o*Math.cos(i*e.h);this.colorPickerSelector.style.left=n-.5*this.colorPickerSelector.clientWidth+"px",this.colorPickerSelector.style.top=s-.5*this.colorPickerSelector.clientHeight+"px",this._updatePicker(t)}},{key:"_setOpacity",value:function(t){this.color.a=t/100,this._updatePicker(this.color)}},{key:"_setBrightness",value:function(t){var e=p.RGBToHSV(this.color.r,this.color.g,this.color.b);e.v=t/100;var i=p.HSVToRGB(e.h,e.s,e.v);i.a=this.color.a,this.color=i,this._updatePicker()}},{key:"_updatePicker",value:function(){var t=arguments.length>0&&void 0!==arguments[0]?arguments[0]:this.color,e=p.RGBToHSV(t.r,t.g,t.b),i=this.colorPickerCanvas.getContext("2d");void 0===this.pixelRation&&(this.pixelRatio=(window.devicePixelRatio||1)/(i.webkitBackingStorePixelRatio||i.mozBackingStorePixelRatio||i.msBackingStorePixelRatio||i.oBackingStorePixelRatio||i.backingStorePixelRatio||1)),i.setTransform(this.pixelRatio,0,0,this.pixelRatio,0,0);var 
o=this.colorPickerCanvas.clientWidth,n=this.colorPickerCanvas.clientHeight;i.clearRect(0,0,o,n),i.putImageData(this.hueCircle,0,0),i.fillStyle="rgba(0,0,0,"+(1-e.v)+")",i.circle(this.centerCoordinates.x,this.centerCoordinates.y,this.r),i.fill(),this.brightnessRange.value=100*e.v,this.opacityRange.value=100*t.a,this.initialColorDiv.style.backgroundColor="rgba("+this.initialColor.r+","+this.initialColor.g+","+this.initialColor.b+","+this.initialColor.a+")",this.newColorDiv.style.backgroundColor="rgba("+this.color.r+","+this.color.g+","+this.color.b+","+this.color.a+")"}},{key:"_setSize",value:function(){this.colorPickerCanvas.style.width="100%",this.colorPickerCanvas.style.height="100%",this.colorPickerCanvas.width=289*this.pixelRatio,this.colorPickerCanvas.height=289*this.pixelRatio}},{key:"_create",value:function(){if(this.frame=document.createElement("div"),this.frame.className="vis-color-picker",this.colorPickerDiv=document.createElement("div"),this.colorPickerSelector=document.createElement("div"),this.colorPickerSelector.className="vis-selector",this.colorPickerDiv.appendChild(this.colorPickerSelector),this.colorPickerCanvas=document.createElement("canvas"),this.colorPickerDiv.appendChild(this.colorPickerCanvas),this.colorPickerCanvas.getContext){var t=this.colorPickerCanvas.getContext("2d");this.pixelRatio=(window.devicePixelRatio||1)/(t.webkitBackingStorePixelRatio||t.mozBackingStorePixelRatio||t.msBackingStorePixelRatio||t.oBackingStorePixelRatio||t.backingStorePixelRatio||1),this.colorPickerCanvas.getContext("2d").setTransform(this.pixelRatio,0,0,this.pixelRatio,0,0)}else{var e=document.createElement("DIV");e.style.color="red",e.style.fontWeight="bold",e.style.padding="10px",e.innerHTML="Error: your browser does not support HTML canvas",this.colorPickerCanvas.appendChild(e)}this.colorPickerDiv.className="vis-color",this.opacityDiv=document.createElement("div"),this.opacityDiv.className="vis-opacity",this.brightnessDiv=document.createElement("div"),this.brightnessDiv.className="vis-brightness",this.arrowDiv=document.createElement("div"),this.arrowDiv.className="vis-arrow",this.opacityRange=document.createElement("input");try{this.opacityRange.type="range",this.opacityRange.min="0",this.opacityRange.max="100"}catch(t){}this.opacityRange.value="100",this.opacityRange.className="vis-range",this.brightnessRange=document.createElement("input");try{this.brightnessRange.type="range",this.brightnessRange.min="0",this.brightnessRange.max="100"}catch(t){}this.brightnessRange.value="100",this.brightnessRange.className="vis-range",this.opacityDiv.appendChild(this.opacityRange),this.brightnessDiv.appendChild(this.brightnessRange);var i=this;this.opacityRange.onchange=function(){i._setOpacity(this.value)},this.opacityRange.oninput=function(){i._setOpacity(this.value)},this.brightnessRange.onchange=function(){i._setBrightness(this.value)},this.brightnessRange.oninput=function(){i._setBrightness(this.value)},this.brightnessLabel=document.createElement("div"),this.brightnessLabel.className="vis-label vis-brightness",this.brightnessLabel.innerHTML="brightness:",this.opacityLabel=document.createElement("div"),this.opacityLabel.className="vis-label 
vis-opacity",this.opacityLabel.innerHTML="opacity:",this.newColorDiv=document.createElement("div"),this.newColorDiv.className="vis-new-color",this.newColorDiv.innerHTML="new",this.initialColorDiv=document.createElement("div"),this.initialColorDiv.className="vis-initial-color",this.initialColorDiv.innerHTML="initial",this.cancelButton=document.createElement("div"),this.cancelButton.className="vis-button vis-cancel",this.cancelButton.innerHTML="cancel",this.cancelButton.onclick=this._hide.bind(this,!1),this.applyButton=document.createElement("div"),this.applyButton.className="vis-button vis-apply",this.applyButton.innerHTML="apply",this.applyButton.onclick=this._apply.bind(this),this.saveButton=document.createElement("div"),this.saveButton.className="vis-button vis-save",this.saveButton.innerHTML="save",this.saveButton.onclick=this._save.bind(this),this.loadButton=document.createElement("div"),this.loadButton.className="vis-button vis-load",this.loadButton.innerHTML="load last",this.loadButton.onclick=this._loadLast.bind(this),this.frame.appendChild(this.colorPickerDiv),this.frame.appendChild(this.arrowDiv),this.frame.appendChild(this.brightnessLabel),this.frame.appendChild(this.brightnessDiv),this.frame.appendChild(this.opacityLabel),this.frame.appendChild(this.opacityDiv),this.frame.appendChild(this.newColorDiv),this.frame.appendChild(this.initialColorDiv),this.frame.appendChild(this.cancelButton),this.frame.appendChild(this.applyButton),this.frame.appendChild(this.saveButton),this.frame.appendChild(this.loadButton)}},{key:"_bindHammer",value:function(){var t=this;this.drag={},this.pinch={},this.hammer=new l(this.colorPickerCanvas),this.hammer.get("pinch").set({enable:!0}),u.onTouch(this.hammer,function(e){t._moveSelector(e)}),this.hammer.on("tap",function(e){t._moveSelector(e)}),this.hammer.on("panstart",function(e){t._moveSelector(e)}),this.hammer.on("panmove",function(e){t._moveSelector(e)}),this.hammer.on("panend",function(e){t._moveSelector(e)})}},{key:"_generateHueCircle",value:function(){if(!1===this.generated){var t=this.colorPickerCanvas.getContext("2d");void 0===this.pixelRation&&(this.pixelRatio=(window.devicePixelRatio||1)/(t.webkitBackingStorePixelRatio||t.mozBackingStorePixelRatio||t.msBackingStorePixelRatio||t.oBackingStorePixelRatio||t.backingStorePixelRatio||1)),t.setTransform(this.pixelRatio,0,0,this.pixelRatio,0,0);var e=this.colorPickerCanvas.clientWidth,i=this.colorPickerCanvas.clientHeight;t.clearRect(0,0,e,i);var o=void 0,n=void 0,s=void 0,r=void 0;this.centerCoordinates={x:.5*e,y:.5*i},this.r=.49*e;var a=2*Math.PI/360,h=1/this.r,d=void 0;for(s=0;s<360;s++)for(r=0;rr?r:t,e=null==e?r:e0&&l.push(u.screenToValue(n)),!c.hidden&&this.itemsData.length>0&&l.push(c.screenToValue(n)),{event:t,what:d,pageX:t.srcEvent?t.srcEvent.pageX:t.pageX,pageY:t.srcEvent?t.srcEvent.pageY:t.pageY,x:o,y:n,time:r,value:l}},o.prototype._createConfigurator=function(){return new v(this,this.dom.container,g)},t.exports=o}])}); \ No newline at end of file diff --git a/core/src/main/resources/org/apache/spark/ui/static/vis.min.css b/core/src/main/resources/org/apache/spark/ui/static/vis.min.css deleted file mode 100644 index 40d182cfde231..0000000000000 --- a/core/src/main/resources/org/apache/spark/ui/static/vis.min.css +++ /dev/null @@ -1 +0,0 @@ -.vis-background,.vis-labelset,.vis-timeline{overflow:hidden}.vis .overlay{position:absolute;top:0;left:0;width:100%;height:100%;z-index:10}.vis-active{box-shadow:0 0 10px #86d5f8}.vis 
[class*=span]{min-height:0;width:auto}div.vis-configuration{position:relative;display:block;float:left;font-size:12px}div.vis-configuration-wrapper{display:block;width:700px}div.vis-configuration-wrapper::after{clear:both;content:"";display:block}div.vis-configuration.vis-config-option-container{display:block;width:495px;background-color:#fff;border:2px solid #f7f8fa;border-radius:4px;margin-top:20px;left:10px;padding-left:5px}div.vis-configuration.vis-config-button{display:block;width:495px;height:25px;vertical-align:middle;line-height:25px;background-color:#f7f8fa;border:2px solid #ceced0;border-radius:4px;margin-top:20px;left:10px;padding-left:5px;cursor:pointer;margin-bottom:30px}div.vis-configuration.vis-config-button.hover{background-color:#4588e6;border:2px solid #214373;color:#fff}div.vis-configuration.vis-config-item{display:block;float:left;width:495px;height:25px;vertical-align:middle;line-height:25px}div.vis-configuration.vis-config-item.vis-config-s2{left:10px;background-color:#f7f8fa;padding-left:5px;border-radius:3px}div.vis-configuration.vis-config-item.vis-config-s3{left:20px;background-color:#e4e9f0;padding-left:5px;border-radius:3px}div.vis-configuration.vis-config-item.vis-config-s4{left:30px;background-color:#cfd8e6;padding-left:5px;border-radius:3px}div.vis-configuration.vis-config-header{font-size:18px;font-weight:700}div.vis-configuration.vis-config-label{width:120px;height:25px;line-height:25px}div.vis-configuration.vis-config-label.vis-config-s3{width:110px}div.vis-configuration.vis-config-label.vis-config-s4{width:100px}div.vis-configuration.vis-config-colorBlock{top:1px;width:30px;height:19px;border:1px solid #444;border-radius:2px;padding:0;margin:0;cursor:pointer}input.vis-configuration.vis-config-checkbox{left:-5px}input.vis-configuration.vis-config-rangeinput{position:relative;top:-5px;width:60px;padding:1px;margin:0;pointer-events:none}.vis-panel,.vis-timeline{padding:0;box-sizing:border-box}input.vis-configuration.vis-config-range{-webkit-appearance:none;border:0 solid #fff;background-color:rgba(0,0,0,0);width:300px;height:20px}input.vis-configuration.vis-config-range::-webkit-slider-runnable-track{width:300px;height:5px;background:#dedede;background:-moz-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#dedede),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-o-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:linear-gradient(to bottom,#dedede 0,#c8c8c8 99%);filter:progid:DXImageTransform.Microsoft.gradient( startColorstr='#dedede', endColorstr='#c8c8c8', GradientType=0 );border:1px solid #999;box-shadow:#aaa 0 0 3px 0;border-radius:3px}input.vis-configuration.vis-config-range::-webkit-slider-thumb{-webkit-appearance:none;border:1px solid #14334b;height:17px;width:17px;border-radius:50%;background:#3876c2;background:-moz-linear-gradient(top,#3876c2 0,#385380 100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#3876c2),color-stop(100%,#385380));background:-webkit-linear-gradient(top,#3876c2 0,#385380 100%);background:-o-linear-gradient(top,#3876c2 0,#385380 100%);background:-ms-linear-gradient(top,#3876c2 0,#385380 100%);background:linear-gradient(to bottom,#3876c2 0,#385380 100%);filter:progid:DXImageTransform.Microsoft.gradient( startColorstr='#3876c2', endColorstr='#385380', GradientType=0 );box-shadow:#111927 0 0 1px 
0;margin-top:-7px}input.vis-configuration.vis-config-range:focus{outline:0}input.vis-configuration.vis-config-range:focus::-webkit-slider-runnable-track{background:#9d9d9d;background:-moz-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#9d9d9d),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-o-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:linear-gradient(to bottom,#9d9d9d 0,#c8c8c8 99%);filter:progid:DXImageTransform.Microsoft.gradient( startColorstr='#9d9d9d', endColorstr='#c8c8c8', GradientType=0 )}input.vis-configuration.vis-config-range::-moz-range-track{width:300px;height:10px;background:#dedede;background:-moz-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#dedede),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-o-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:linear-gradient(to bottom,#dedede 0,#c8c8c8 99%);filter:progid:DXImageTransform.Microsoft.gradient( startColorstr='#dedede', endColorstr='#c8c8c8', GradientType=0 );border:1px solid #999;box-shadow:#aaa 0 0 3px 0;border-radius:3px}input.vis-configuration.vis-config-range::-moz-range-thumb{border:none;height:16px;width:16px;border-radius:50%;background:#385380}input.vis-configuration.vis-config-range:-moz-focusring{outline:#fff solid 1px;outline-offset:-1px}input.vis-configuration.vis-config-range::-ms-track{width:300px;height:5px;background:0 0;border-color:transparent;border-width:6px 0;color:transparent}input.vis-configuration.vis-config-range::-ms-fill-lower{background:#777;border-radius:10px}input.vis-configuration.vis-config-range::-ms-fill-upper{background:#ddd;border-radius:10px}input.vis-configuration.vis-config-range::-ms-thumb{border:none;height:16px;width:16px;border-radius:50%;background:#385380}input.vis-configuration.vis-config-range:focus::-ms-fill-lower{background:#888}input.vis-configuration.vis-config-range:focus::-ms-fill-upper{background:#ccc}.vis-configuration-popup{position:absolute;background:rgba(57,76,89,.85);border:2px solid #f2faff;line-height:30px;height:30px;width:150px;text-align:center;color:#fff;font-size:14px;border-radius:4px;-webkit-transition:opacity .3s ease-in-out;-moz-transition:opacity .3s ease-in-out;transition:opacity .3s ease-in-out}.vis-configuration-popup:after,.vis-configuration-popup:before{left:100%;top:50%;border:solid transparent;content:" ";height:0;width:0;position:absolute;pointer-events:none}.vis-configuration-popup:after{border-color:rgba(136,183,213,0);border-left-color:rgba(57,76,89,.85);border-width:8px;margin-top:-8px}.vis-configuration-popup:before{border-color:rgba(194,225,245,0);border-left-color:#f2faff;border-width:12px;margin-top:-12px}.vis-timeline{position:relative;border:1px solid #bfbfbf;margin:0}.vis-panel{position:absolute;margin:0}.vis-panel.vis-bottom,.vis-panel.vis-center,.vis-panel.vis-left,.vis-panel.vis-right,.vis-panel.vis-top{border:1px #bfbfbf}.vis-panel.vis-center,.vis-panel.vis-left,.vis-panel.vis-right{border-top-style:solid;border-bottom-style:solid;overflow:hidden}.vis-panel.vis-bottom,.vis-panel.vis-center,.vis-panel.vis-top{border-left-style:solid;border-right-style:solid}.vis-panel>.vis-content{position:relative}.vis-panel .vis-shadow{position:absolute;width:100%;height:1px;box-shadow:0 
0 10px rgba(0,0,0,.8)}.vis-itemset,.vis-labelset,.vis-labelset .vis-label{position:relative;box-sizing:border-box}.vis-panel .vis-shadow.vis-top{top:-1px;left:0}.vis-panel .vis-shadow.vis-bottom{bottom:-1px;left:0}.vis-labelset .vis-label{left:0;top:0;width:100%;color:#4d4d4d;border-bottom:1px solid #bfbfbf}.vis-labelset .vis-label.draggable{cursor:pointer}.vis-labelset .vis-label:last-child{border-bottom:none}.vis-labelset .vis-label .vis-inner{display:inline-block;padding:5px}.vis-labelset .vis-label .vis-inner.vis-hidden{padding:0}.vis-itemset{padding:0;margin:0}.vis-itemset .vis-background,.vis-itemset .vis-foreground{position:absolute;width:100%;height:100%;overflow:visible}.vis-axis{position:absolute;width:100%;height:0;left:0;z-index:1}.vis-foreground .vis-group{position:relative;box-sizing:border-box;border-bottom:1px solid #bfbfbf}.vis-foreground .vis-group:last-child{border-bottom:none}.vis-overlay{position:absolute;top:0;left:0;width:100%;height:100%;z-index:10}.vis-item{position:absolute;color:#1A1A1A;border-color:#97B0F8;border-width:1px;background-color:#D5DDF6;display:inline-block}.vis-item.vis-point.vis-selected,.vis-item.vis-selected{background-color:#FFF785}.vis-item.vis-selected{border-color:#FFC200;z-index:2}.vis-editable.vis-selected{cursor:move}.vis-item.vis-box{text-align:center;border-style:solid;border-radius:2px}.vis-item.vis-point{background:0 0}.vis-item.vis-dot{position:absolute;padding:0;border-width:4px;border-style:solid;border-radius:4px}.vis-item.vis-range{border-style:solid;border-radius:2px;box-sizing:border-box}.vis-item.vis-background{border:none;background-color:rgba(213,221,246,.4);box-sizing:border-box;padding:0;margin:0}.vis-item .vis-item-overflow{position:relative;width:100%;height:100%;padding:0;margin:0;overflow:hidden}.vis-item .vis-delete,.vis-item .vis-delete-rtl{background:url(img/timeline/delete.png) center no-repeat;height:24px;top:-4px;cursor:pointer}.vis-item.vis-range .vis-item-content{position:relative;display:inline-block}.vis-item.vis-background .vis-item-content{position:absolute;display:inline-block}.vis-item.vis-line{padding:0;position:absolute;width:0;border-left-width:1px;border-left-style:solid}.vis-item .vis-item-content{white-space:nowrap;box-sizing:border-box;padding:5px}.vis-item .vis-delete{position:absolute;width:24px;right:-24px}.vis-item .vis-delete-rtl{position:absolute;width:24px;left:-24px}.vis-item.vis-range .vis-drag-left{position:absolute;width:24px;max-width:20%;min-width:2px;height:100%;top:0;left:-4px;cursor:w-resize}.vis-item.vis-range .vis-drag-right{position:absolute;width:24px;max-width:20%;min-width:2px;height:100%;top:0;right:-4px;cursor:e-resize}.vis-range.vis-item.vis-readonly .vis-drag-left,.vis-range.vis-item.vis-readonly .vis-drag-right{cursor:auto}.vis-time-axis{position:relative;overflow:hidden}.vis-time-axis.vis-foreground{top:0;left:0;width:100%}.vis-time-axis.vis-background{position:absolute;top:0;left:0;width:100%;height:100%}.vis-time-axis .vis-text{position:absolute;color:#4d4d4d;padding:3px;overflow:hidden;box-sizing:border-box;white-space:nowrap}.vis-time-axis .vis-text.vis-measure{position:absolute;padding-left:0;padding-right:0;margin-left:0;margin-right:0;visibility:hidden}.vis-time-axis .vis-grid.vis-vertical{position:absolute;border-left:1px solid}.vis-time-axis .vis-grid.vis-vertical-rtl{position:absolute;border-right:1px solid}.vis-time-axis .vis-grid.vis-minor{border-color:#e5e5e5}.vis-time-axis 
.vis-grid.vis-major{border-color:#bfbfbf}.vis-current-time{background-color:#FF7F6E;width:2px;z-index:1}.vis-custom-time{background-color:#6E94FF;width:2px;cursor:move;z-index:1}div.vis-network div.vis-close,div.vis-network div.vis-edit-mode div.vis-button,div.vis-network div.vis-manipulation div.vis-button{cursor:pointer;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;-webkit-touch-callout:none;-khtml-user-select:none}.vis-panel.vis-background.vis-horizontal .vis-grid.vis-horizontal{position:absolute;width:100%;height:0;border-bottom:1px solid}.vis-panel.vis-background.vis-horizontal .vis-grid.vis-minor{border-color:#e5e5e5}.vis-panel.vis-background.vis-horizontal .vis-grid.vis-major{border-color:#bfbfbf}.vis-data-axis .vis-y-axis.vis-major{width:100%;position:absolute;color:#4d4d4d;white-space:nowrap}.vis-data-axis .vis-y-axis.vis-major.vis-measure{padding:0;margin:0;border:0;visibility:hidden;width:auto}.vis-data-axis .vis-y-axis.vis-minor{position:absolute;width:100%;color:#bebebe;white-space:nowrap}.vis-data-axis .vis-y-axis.vis-minor.vis-measure{padding:0;margin:0;border:0;visibility:hidden;width:auto}.vis-data-axis .vis-y-axis.vis-title{position:absolute;color:#4d4d4d;white-space:nowrap;bottom:20px;text-align:center}.vis-data-axis .vis-y-axis.vis-title.vis-measure{padding:0;margin:0;visibility:hidden;width:auto}.vis-data-axis .vis-y-axis.vis-title.vis-left{bottom:0;-webkit-transform-origin:left top;-moz-transform-origin:left top;-ms-transform-origin:left top;-o-transform-origin:left top;transform-origin:left bottom;-webkit-transform:rotate(-90deg);-moz-transform:rotate(-90deg);-ms-transform:rotate(-90deg);-o-transform:rotate(-90deg);transform:rotate(-90deg)}.vis-data-axis .vis-y-axis.vis-title.vis-right{bottom:0;-webkit-transform-origin:right bottom;-moz-transform-origin:right bottom;-ms-transform-origin:right bottom;-o-transform-origin:right bottom;transform-origin:right bottom;-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.vis-legend{background-color:rgba(247,252,255,.65);padding:5px;border:1px solid #b3b3b3;box-shadow:2px 2px 10px rgba(154,154,154,.55)}.vis-legend-text{white-space:nowrap;display:inline-block}.vis-graph-group0{fill:#4f81bd;fill-opacity:0;stroke-width:2px;stroke:#4f81bd}.vis-graph-group1{fill:#f79646;fill-opacity:0;stroke-width:2px;stroke:#f79646}.vis-graph-group2{fill:#8c51cf;fill-opacity:0;stroke-width:2px;stroke:#8c51cf}.vis-graph-group3{fill:#75c841;fill-opacity:0;stroke-width:2px;stroke:#75c841}.vis-graph-group4{fill:#ff0100;fill-opacity:0;stroke-width:2px;stroke:#ff0100}.vis-graph-group5{fill:#37d8e6;fill-opacity:0;stroke-width:2px;stroke:#37d8e6}.vis-graph-group6{fill:#042662;fill-opacity:0;stroke-width:2px;stroke:#042662}.vis-graph-group7{fill:#00ff26;fill-opacity:0;stroke-width:2px;stroke:#00ff26}.vis-graph-group8{fill:#f0f;fill-opacity:0;stroke-width:2px;stroke:#f0f}.vis-graph-group9{fill:#8f3938;fill-opacity:0;stroke-width:2px;stroke:#8f3938}.vis-timeline .vis-fill{fill-opacity:.1;stroke:none}.vis-timeline .vis-bar{fill-opacity:.5;stroke-width:1px}.vis-timeline .vis-point{stroke-width:2px;fill-opacity:1}.vis-timeline .vis-legend-background{stroke-width:1px;fill-opacity:.9;fill:#fff;stroke:#c2c2c2}.vis-timeline .vis-outline{stroke-width:1px;fill-opacity:1;fill:#fff;stroke:#e5e5e5}.vis-timeline .vis-icon-fill{fill-opacity:.3;stroke:none}div.vis-network 
div.vis-manipulation{border-width:0;border-bottom:1px;border-style:solid;border-color:#d6d9d8;background:#fff;background:-moz-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#fff),color-stop(48%,#fcfcfc),color-stop(50%,#fafafa),color-stop(100%,#fcfcfc));background:-webkit-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);background:-o-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);background:-ms-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);background:linear-gradient(to bottom,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);filter:progid:DXImageTransform.Microsoft.gradient( startColorstr='#ffffff', endColorstr='#fcfcfc', GradientType=0 );padding-top:4px;position:absolute;left:0;top:0;width:100%;height:28px}div.vis-network div.vis-edit-mode{position:absolute;left:0;top:5px;height:30px}div.vis-network div.vis-close{position:absolute;right:0;top:0;width:30px;height:30px;background-position:20px 3px;background-repeat:no-repeat;background-image:url(img/network/cross.png);user-select:none}div.vis-network div.vis-close:hover{opacity:.6}div.vis-network div.vis-edit-mode div.vis-button,div.vis-network div.vis-manipulation div.vis-button{float:left;font-family:verdana;font-size:12px;-moz-border-radius:15px;border-radius:15px;display:inline-block;background-position:0 0;background-repeat:no-repeat;height:24px;margin-left:10px;padding:0 8px;user-select:none}div.vis-network div.vis-manipulation div.vis-button:hover{box-shadow:1px 1px 8px rgba(0,0,0,.2)}div.vis-network div.vis-manipulation div.vis-button:active{box-shadow:1px 1px 8px rgba(0,0,0,.5)}div.vis-network div.vis-manipulation div.vis-button.vis-back{background-image:url(img/network/backIcon.png)}div.vis-network div.vis-manipulation div.vis-button.vis-none:hover{box-shadow:1px 1px 8px transparent;cursor:default}div.vis-network div.vis-manipulation div.vis-button.vis-none:active{box-shadow:1px 1px 8px transparent}div.vis-network div.vis-manipulation div.vis-button.vis-none{padding:0}div.vis-network div.vis-manipulation div.notification{margin:2px;font-weight:700}div.vis-network div.vis-manipulation div.vis-button.vis-add{background-image:url(img/network/addNodeIcon.png)}div.vis-network div.vis-edit-mode div.vis-button.vis-edit,div.vis-network div.vis-manipulation div.vis-button.vis-edit{background-image:url(img/network/editIcon.png)}div.vis-network div.vis-edit-mode div.vis-button.vis-edit.vis-edit-mode{background-color:#fcfcfc;border:1px solid #ccc}div.vis-network div.vis-manipulation div.vis-button.vis-connect{background-image:url(img/network/connectIcon.png)}div.vis-network div.vis-manipulation div.vis-button.vis-delete{background-image:url(img/network/deleteIcon.png)}div.vis-network div.vis-edit-mode div.vis-label,div.vis-network div.vis-manipulation div.vis-label{margin:0 0 0 23px;line-height:25px}div.vis-network div.vis-manipulation div.vis-separator-line{float:left;display:inline-block;width:1px;height:21px;background-color:#bdbdbd;margin:0 7px 0 15px}div.vis-network-tooltip{position:absolute;visibility:hidden;padding:5px;white-space:nowrap;font-family:verdana;font-size:14px;color:#000;background-color:#f5f4ed;-moz-border-radius:3px;-webkit-border-radius:3px;border-radius:3px;border:1px solid #808074;box-shadow:3px 3px 10px rgba(0,0,0,.2);pointer-events:none}div.vis-network div.vis-navigation 
div.vis-button{width:34px;height:34px;-moz-border-radius:17px;border-radius:17px;position:absolute;display:inline-block;background-position:2px 2px;background-repeat:no-repeat;cursor:pointer;-webkit-touch-callout:none;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}div.vis-network div.vis-navigation div.vis-button:hover{box-shadow:0 0 3px 3px rgba(56,207,21,.3)}div.vis-network div.vis-navigation div.vis-button:active{box-shadow:0 0 1px 3px rgba(56,207,21,.95)}div.vis-network div.vis-navigation div.vis-button.vis-up{background-image:url(img/network/upArrow.png);bottom:50px;left:55px}div.vis-network div.vis-navigation div.vis-button.vis-down{background-image:url(img/network/downArrow.png);bottom:10px;left:55px}div.vis-network div.vis-navigation div.vis-button.vis-left{background-image:url(img/network/leftArrow.png);bottom:10px;left:15px}div.vis-network div.vis-navigation div.vis-button.vis-right{background-image:url(img/network/rightArrow.png);bottom:10px;left:95px}div.vis-network div.vis-navigation div.vis-button.vis-zoomIn{background-image:url(img/network/plus.png);bottom:10px;right:15px}div.vis-network div.vis-navigation div.vis-button.vis-zoomOut{background-image:url(img/network/minus.png);bottom:10px;right:55px}div.vis-network div.vis-navigation div.vis-button.vis-zoomExtends{background-image:url(img/network/zoomExtends.png);bottom:50px;right:15px}div.vis-color-picker{position:absolute;top:0;left:30px;margin-top:-140px;margin-left:30px;width:310px;height:444px;z-index:1;padding:10px;border-radius:15px;background-color:#fff;display:none;box-shadow:rgba(0,0,0,.5) 0 0 10px 0}div.vis-color-picker div.vis-arrow{position:absolute;top:147px;left:5px}div.vis-color-picker div.vis-arrow::after,div.vis-color-picker div.vis-arrow::before{right:100%;top:50%;border:solid transparent;content:" ";height:0;width:0;position:absolute;pointer-events:none}div.vis-color-picker div.vis-arrow:after{border-color:rgba(255,255,255,0);border-right-color:#fff;border-width:30px;margin-top:-30px}div.vis-color-picker div.vis-color{position:absolute;width:289px;height:289px;cursor:pointer}div.vis-color-picker div.vis-brightness{position:absolute;top:313px}div.vis-color-picker div.vis-opacity{position:absolute;top:350px}div.vis-color-picker div.vis-selector{position:absolute;top:137px;left:137px;width:15px;height:15px;border-radius:15px;border:1px solid #fff;background:#4c4c4c;background:-moz-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#4c4c4c),color-stop(12%,#595959),color-stop(25%,#666),color-stop(39%,#474747),color-stop(50%,#2c2c2c),color-stop(51%,#000),color-stop(60%,#111),color-stop(76%,#2b2b2b),color-stop(91%,#1c1c1c),color-stop(100%,#131313));background:-webkit-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);background:-o-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);background:-ms-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);background:linear-gradient(to bottom,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);filter:progid:DXImageTransform.Microsoft.gradient( startColorstr='#4c4c4c', 
endColorstr='#131313', GradientType=0 )}div.vis-color-picker div.vis-initial-color,div.vis-color-picker div.vis-new-color{width:140px;height:20px;top:380px;font-size:10px;color:rgba(0,0,0,.4);line-height:20px;position:absolute;vertical-align:middle}div.vis-color-picker div.vis-new-color{border:1px solid rgba(0,0,0,.1);border-radius:5px;left:159px;text-align:right;padding-right:2px}div.vis-color-picker div.vis-initial-color{border:1px solid rgba(0,0,0,.1);border-radius:5px;left:10px;text-align:left;padding-left:2px}div.vis-color-picker div.vis-label{position:absolute;width:300px;left:10px}div.vis-color-picker div.vis-label.vis-brightness{top:300px}div.vis-color-picker div.vis-label.vis-opacity{top:338px}div.vis-color-picker div.vis-button{position:absolute;width:68px;height:25px;border-radius:10px;vertical-align:middle;text-align:center;line-height:25px;top:410px;border:2px solid #d9d9d9;background-color:#f7f7f7;cursor:pointer}div.vis-color-picker div.vis-button.vis-cancel{left:5px}div.vis-color-picker div.vis-button.vis-load{left:82px}div.vis-color-picker div.vis-button.vis-apply{left:159px}div.vis-color-picker div.vis-button.vis-save{left:236px}div.vis-color-picker input.vis-range{width:290px;height:20px} \ No newline at end of file diff --git a/core/src/main/resources/org/apache/spark/ui/static/vis.min.js b/core/src/main/resources/org/apache/spark/ui/static/vis.min.js deleted file mode 100644 index 92b8ed75d85fc..0000000000000 --- a/core/src/main/resources/org/apache/spark/ui/static/vis.min.js +++ /dev/null @@ -1,45 +0,0 @@ -/** - * vis.js - * https://github.com/almende/vis - * - * A dynamic, browser-based visualization library. - * - * @version 4.16.1 - * @date 2016-04-18 - * - * @license - * Copyright (C) 2011-2016 Almende B.V, http://almende.com - * - * Vis.js is dual licensed under both - * - * * The Apache 2.0 License - * http://www.apache.org/licenses/LICENSE-2.0 - * - * and - * - * * The MIT License - * http://opensource.org/licenses/MIT - * - * Vis.js may be distributed under either license. 
- */ -"use strict";!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.vis=e():t.vis=e()}(this,function(){return function(t){function e(o){if(i[o])return i[o].exports;var n=i[o]={exports:{},id:o,loaded:!1};return t[o].call(n.exports,n,n.exports,e),n.loaded=!0,n.exports}var i={};return e.m=t,e.c=i,e.p="",e(0)}([function(t,e,i){var o=i(1);o.extend(e,i(7)),o.extend(e,i(24)),o.extend(e,i(60))},function(t,e,i){var o="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},n=i(2),s=i(6);e.isNumber=function(t){return t instanceof Number||"number"==typeof t},e.recursiveDOMDelete=function(t){if(t)for(;t.hasChildNodes()===!0;)e.recursiveDOMDelete(t.firstChild),t.removeChild(t.firstChild)},e.giveRange=function(t,e,i,o){if(e==t)return.5;var n=1/(e-t);return Math.max(0,(o-t)*n)},e.isString=function(t){return t instanceof String||"string"==typeof t},e.isDate=function(t){if(t instanceof Date)return!0;if(e.isString(t)){var i=r.exec(t);if(i)return!0;if(!isNaN(Date.parse(t)))return!0}return!1},e.randomUUID=function(){return s.v4()},e.assignAllKeys=function(t,e){for(var i in t)t.hasOwnProperty(i)&&"object"!==o(t[i])&&(t[i]=e)},e.fillIfDefined=function(t,i){var n=arguments.length<=2||void 0===arguments[2]?!1:arguments[2];for(var s in t)void 0!==i[s]&&("object"!==o(i[s])?void 0!==i[s]&&null!==i[s]||void 0===t[s]||n!==!0?t[s]=i[s]:delete t[s]:"object"===o(t[s])&&e.fillIfDefined(t[s],i[s],n))},e.protoExtend=function(t,e){for(var i=1;ii;i++)if(t[i]!=e[i])return!1;return!0},e.convert=function(t,i){var o;if(void 0!==t){if(null===t)return null;if(!i)return t;if("string"!=typeof i&&!(i instanceof String))throw new Error("Type must be a string");switch(i){case"boolean":case"Boolean":return Boolean(t);case"number":case"Number":return Number(t.valueOf());case"string":case"String":return String(t);case"Date":if(e.isNumber(t))return new Date(t);if(t instanceof Date)return new Date(t.valueOf());if(n.isMoment(t))return new Date(t.valueOf());if(e.isString(t))return o=r.exec(t),o?new Date(Number(o[1])):n(t).toDate();throw new Error("Cannot convert object of type "+e.getType(t)+" to type Date");case"Moment":if(e.isNumber(t))return n(t);if(t instanceof Date)return n(t.valueOf());if(n.isMoment(t))return n(t);if(e.isString(t))return o=r.exec(t),n(o?Number(o[1]):t);throw new Error("Cannot convert object of type "+e.getType(t)+" to type Date");case"ISODate":if(e.isNumber(t))return new Date(t);if(t instanceof Date)return t.toISOString();if(n.isMoment(t))return t.toDate().toISOString();if(e.isString(t))return o=r.exec(t),o?new Date(Number(o[1])).toISOString():new Date(t).toISOString();throw new Error("Cannot convert object of type "+e.getType(t)+" to type ISODate");case"ASPDate":if(e.isNumber(t))return"/Date("+t+")/";if(t instanceof Date)return"/Date("+t.valueOf()+")/";if(e.isString(t)){o=r.exec(t);var s;return s=o?new Date(Number(o[1])).valueOf():new Date(t).valueOf(),"/Date("+s+")/"}throw new Error("Cannot convert object of type "+e.getType(t)+" to type ASPDate");default:throw new Error('Unknown type "'+i+'"')}}};var r=/^\/?Date\((\-?\d+)/i;e.getType=function(t){var e="undefined"==typeof t?"undefined":o(t);return"object"==e?null===t?"null":t instanceof Boolean?"Boolean":t instanceof Number?"Number":t instanceof String?"String":Array.isArray(t)?"Array":t instanceof 
Date?"Date":"Object":"number"==e?"Number":"boolean"==e?"Boolean":"string"==e?"String":void 0===e?"undefined":e},e.copyAndExtendArray=function(t,e){for(var i=[],o=0;oi;i++)e(t[i],i,t);else for(i in t)t.hasOwnProperty(i)&&e(t[i],i,t)},e.toArray=function(t){var e=[];for(var i in t)t.hasOwnProperty(i)&&e.push(t[i]);return e},e.updateProperty=function(t,e,i){return t[e]!==i?(t[e]=i,!0):!1},e.throttle=function(t,e){var i=null,o=!1;return function n(){i?o=!0:(o=!1,t(),i=setTimeout(function(){i=null,o&&n()},e))}},e.addEventListener=function(t,e,i,o){t.addEventListener?(void 0===o&&(o=!1),"mousewheel"===e&&navigator.userAgent.indexOf("Firefox")>=0&&(e="DOMMouseScroll"),t.addEventListener(e,i,o)):t.attachEvent("on"+e,i)},e.removeEventListener=function(t,e,i,o){t.removeEventListener?(void 0===o&&(o=!1),"mousewheel"===e&&navigator.userAgent.indexOf("Firefox")>=0&&(e="DOMMouseScroll"),t.removeEventListener(e,i,o)):t.detachEvent("on"+e,i)},e.preventDefault=function(t){t||(t=window.event),t.preventDefault?t.preventDefault():t.returnValue=!1},e.getTarget=function(t){t||(t=window.event);var e;return t.target?e=t.target:t.srcElement&&(e=t.srcElement),void 0!=e.nodeType&&3==e.nodeType&&(e=e.parentNode),e},e.hasParent=function(t,e){for(var i=t;i;){if(i===e)return!0;i=i.parentNode}return!1},e.option={},e.option.asBoolean=function(t,e){return"function"==typeof t&&(t=t()),null!=t?0!=t:e||null},e.option.asNumber=function(t,e){return"function"==typeof t&&(t=t()),null!=t?Number(t)||e||null:e||null},e.option.asString=function(t,e){return"function"==typeof t&&(t=t()),null!=t?String(t):e||null},e.option.asSize=function(t,i){return"function"==typeof t&&(t=t()),e.isString(t)?t:e.isNumber(t)?t+"px":i||null},e.option.asElement=function(t,e){return"function"==typeof t&&(t=t()),t||e||null},e.hexToRGB=function(t){var e=/^#?([a-f\d])([a-f\d])([a-f\d])$/i;t=t.replace(e,function(t,e,i,o){return e+e+i+i+o+o});var i=/^#?([a-f\d]{2})([a-f\d]{2})([a-f\d]{2})$/i.exec(t);return i?{r:parseInt(i[1],16),g:parseInt(i[2],16),b:parseInt(i[3],16)}:null},e.overrideOpacity=function(t,i){if(-1!=t.indexOf("rgba"))return t;if(-1!=t.indexOf("rgb")){var o=t.substr(t.indexOf("(")+1).replace(")","").split(",");return"rgba("+o[0]+","+o[1]+","+o[2]+","+i+")"}var o=e.hexToRGB(t);return null==o?t:"rgba("+o.r+","+o.g+","+o.b+","+i+")"},e.RGBToHex=function(t,e,i){return"#"+((1<<24)+(t<<16)+(e<<8)+i).toString(16).slice(1)},e.parseColor=function(t){var i;if(e.isString(t)===!0){if(e.isValidRGB(t)===!0){var o=t.substr(4).substr(0,t.length-5).split(",").map(function(t){return parseInt(t)});t=e.RGBToHex(o[0],o[1],o[2])}if(e.isValidHex(t)===!0){var n=e.hexToHSV(t),s={h:n.h,s:.8*n.s,v:Math.min(1,1.02*n.v)},r={h:n.h,s:Math.min(1,1.25*n.s),v:.8*n.v},a=e.HSVToHex(r.h,r.s,r.v),h=e.HSVToHex(s.h,s.s,s.v);i={background:t,border:a,highlight:{background:h,border:a},hover:{background:h,border:a}}}else i={background:t,border:t,highlight:{background:t,border:t},hover:{background:t,border:t}}}else i={},i.background=t.background||void 0,i.border=t.border||void 0,e.isString(t.highlight)?i.highlight={border:t.highlight,background:t.highlight}:(i.highlight={},i.highlight.background=t.highlight&&t.highlight.background||void 0,i.highlight.border=t.highlight&&t.highlight.border||void 0),e.isString(t.hover)?i.hover={border:t.hover,background:t.hover}:(i.hover={},i.hover.background=t.hover&&t.hover.background||void 0,i.hover.border=t.hover&&t.hover.border||void 0);return i},e.RGBToHSV=function(t,e,i){t/=255,e/=255,i/=255;var 
o=Math.min(t,Math.min(e,i)),n=Math.max(t,Math.max(e,i));if(o==n)return{h:0,s:0,v:o};var s=t==o?e-i:i==o?t-e:i-t,r=t==o?3:i==o?1:5,a=60*(r-s/(n-o))/360,h=(n-o)/n,d=n;return{h:a,s:h,v:d}};var a={split:function(t){var e={};return t.split(";").forEach(function(t){if(""!=t.trim()){var i=t.split(":"),o=i[0].trim(),n=i[1].trim();e[o]=n}}),e},join:function(t){return Object.keys(t).map(function(e){return e+": "+t[e]}).join("; ")}};e.addCssText=function(t,i){var o=a.split(t.style.cssText),n=a.split(i),s=e.extend(o,n);t.style.cssText=a.join(s)},e.removeCssText=function(t,e){var i=a.split(t.style.cssText),o=a.split(e);for(var n in o)o.hasOwnProperty(n)&&delete i[n];t.style.cssText=a.join(i)},e.HSVToRGB=function(t,e,i){var o,n,s,r=Math.floor(6*t),a=6*t-r,h=i*(1-e),d=i*(1-a*e),l=i*(1-(1-a)*e);switch(r%6){case 0:o=i,n=l,s=h;break;case 1:o=d,n=i,s=h;break;case 2:o=h,n=i,s=l;break;case 3:o=h,n=d,s=i;break;case 4:o=l,n=h,s=i;break;case 5:o=i,n=h,s=d}return{r:Math.floor(255*o),g:Math.floor(255*n),b:Math.floor(255*s)}},e.HSVToHex=function(t,i,o){var n=e.HSVToRGB(t,i,o);return e.RGBToHex(n.r,n.g,n.b)},e.hexToHSV=function(t){var i=e.hexToRGB(t);return e.RGBToHSV(i.r,i.g,i.b)},e.isValidHex=function(t){var e=/(^#[0-9A-F]{6}$)|(^#[0-9A-F]{3}$)/i.test(t);return e},e.isValidRGB=function(t){t=t.replace(" ","");var e=/rgb\((\d{1,3}),(\d{1,3}),(\d{1,3})\)/i.test(t);return e},e.isValidRGBA=function(t){t=t.replace(" ","");var e=/rgba\((\d{1,3}),(\d{1,3}),(\d{1,3}),(.{1,3})\)/i.test(t);return e},e.selectiveBridgeObject=function(t,i){if("object"==("undefined"==typeof i?"undefined":o(i))){for(var n=Object.create(i),s=0;s0&&e(o,t[n-1])<0;n--)t[n]=t[n-1];t[n]=o}return t},e.mergeOptions=function(t,e,i){var o=(arguments.length<=3||void 0===arguments[3]?!1:arguments[3],arguments.length<=4||void 0===arguments[4]?{}:arguments[4]);if(null===e[i])t[i]=Object.create(o[i]);else if(void 0!==e[i])if("boolean"==typeof e[i])t[i].enabled=e[i];else{void 0===e[i].enabled&&(t[i].enabled=!0);for(var n in e[i])e[i].hasOwnProperty(n)&&(t[i][n]=e[i][n])}},e.binarySearchCustom=function(t,e,i,o){for(var n=1e4,s=0,r=0,a=t.length-1;a>=r&&n>s;){var h=Math.floor((r+a)/2),d=t[h],l=void 0===o?d[i]:d[i][o],c=e(l);if(0==c)return h;-1==c?r=h+1:a=h-1,s++}return-1},e.binarySearchValue=function(t,e,i,o,n){for(var s,r,a,h,d=1e4,l=0,c=0,u=t.length-1,n=void 0!=n?n:function(t,e){return t==e?0:e>t?-1:1};u>=c&&d>l;){if(h=Math.floor(.5*(u+c)),s=t[Math.max(0,h-1)][i],r=t[h][i],a=t[Math.min(t.length-1,h+1)][i],0==n(r,e))return h;if(n(s,e)<0&&n(r,e)>0)return"before"==o?Math.max(0,h-1):h;if(n(r,e)<0&&n(a,e)>0)return"before"==o?h:Math.min(t.length-1,h+1);n(r,e)<0?c=h+1:u=h-1,l++}return-1},e.easingFunctions={linear:function(t){return t},easeInQuad:function(t){return t*t},easeOutQuad:function(t){return t*(2-t)},easeInOutQuad:function(t){return.5>t?2*t*t:-1+(4-2*t)*t},easeInCubic:function(t){return t*t*t},easeOutCubic:function(t){return--t*t*t+1},easeInOutCubic:function(t){return.5>t?4*t*t*t:(t-1)*(2*t-2)*(2*t-2)+1},easeInQuart:function(t){return t*t*t*t},easeOutQuart:function(t){return 1- --t*t*t*t},easeInOutQuart:function(t){return.5>t?8*t*t*t*t:1-8*--t*t*t*t},easeInQuint:function(t){return t*t*t*t*t},easeOutQuint:function(t){return 1+--t*t*t*t*t},easeInOutQuint:function(t){return.5>t?16*t*t*t*t*t:1+16*--t*t*t*t*t}}},function(t,e,i){t.exports="undefined"!=typeof window&&window.moment||i(3)},function(t,e,i){(function(t){!function(e,i){t.exports=i()}(this,function(){function e(){return ro.apply(null,arguments)}function i(t){ro=t}function o(t){return t instanceof 
Array||"[object Array]"===Object.prototype.toString.call(t)}function n(t){return t instanceof Date||"[object Date]"===Object.prototype.toString.call(t)}function s(t,e){var i,o=[];for(i=0;i0)for(i in ho)o=ho[i],n=e[o],p(n)||(t[o]=n);return t}function m(t){f(this,t),this._d=new Date(null!=t._d?t._d.getTime():NaN),lo===!1&&(lo=!0,e.updateOffset(this),lo=!1)}function v(t){return t instanceof m||null!=t&&null!=t._isAMomentObject}function g(t){return 0>t?Math.ceil(t):Math.floor(t)}function y(t){var e=+t,i=0;return 0!==e&&isFinite(e)&&(i=g(e)),i}function b(t,e,i){var o,n=Math.min(t.length,e.length),s=Math.abs(t.length-e.length),r=0;for(o=0;n>o;o++)(i&&t[o]!==e[o]||!i&&y(t[o])!==y(e[o]))&&r++;return r+s}function w(t){e.suppressDeprecationWarnings===!1&&"undefined"!=typeof console&&console.warn&&console.warn("Deprecation warning: "+t)}function _(t,i){var o=!0;return a(function(){return null!=e.deprecationHandler&&e.deprecationHandler(null,t),o&&(w(t+"\nArguments: "+Array.prototype.slice.call(arguments).join(", ")+"\n"+(new Error).stack),o=!1),i.apply(this,arguments)},i)}function x(t,i){null!=e.deprecationHandler&&e.deprecationHandler(t,i),co[t]||(w(i),co[t]=!0)}function k(t){return t instanceof Function||"[object Function]"===Object.prototype.toString.call(t)}function O(t){return"[object Object]"===Object.prototype.toString.call(t)}function M(t){var e,i;for(i in t)e=t[i],k(e)?this[i]=e:this["_"+i]=e;this._config=t,this._ordinalParseLenient=new RegExp(this._ordinalParse.source+"|"+/\d{1,2}/.source)}function D(t,e){var i,o=a({},t);for(i in e)r(e,i)&&(O(t[i])&&O(e[i])?(o[i]={},a(o[i],t[i]),a(o[i],e[i])):null!=e[i]?o[i]=e[i]:delete o[i]);return o}function S(t){null!=t&&this.set(t)}function C(t){return t?t.toLowerCase().replace("_","-"):t}function T(t){for(var e,i,o,n,s=0;s0;){if(o=E(n.slice(0,e).join("-")))return o;if(i&&i.length>=e&&b(n,i,!0)>=e-1)break;e--}s++}return null}function E(e){var i=null;if(!mo[e]&&"undefined"!=typeof t&&t&&t.exports)try{i=po._abbr,!function(){var t=new Error('Cannot find module "./locale"');throw t.code="MODULE_NOT_FOUND",t}(),P(i)}catch(o){}return mo[e]}function P(t,e){var i;return t&&(i=p(e)?R(t):I(t,e),i&&(po=i)),po._abbr}function I(t,e){return null!==e?(e.abbr=t,null!=mo[t]?(x("defineLocaleOverride","use moment.updateLocale(localeName, config) to change an existing locale. 
moment.defineLocale(localeName, config) should only be used for creating a new locale"),e=D(mo[t]._config,e)):null!=e.parentLocale&&(null!=mo[e.parentLocale]?e=D(mo[e.parentLocale]._config,e):x("parentLocaleUndefined","specified parentLocale is not defined yet")),mo[t]=new S(e),P(t),mo[t]):(delete mo[t],null)}function N(t,e){if(null!=e){var i;null!=mo[t]&&(e=D(mo[t]._config,e)),i=new S(e),i.parentLocale=mo[t],mo[t]=i,P(t)}else null!=mo[t]&&(null!=mo[t].parentLocale?mo[t]=mo[t].parentLocale:null!=mo[t]&&delete mo[t]);return mo[t]}function R(t){var e;if(t&&t._locale&&t._locale._abbr&&(t=t._locale._abbr),!t)return po;if(!o(t)){if(e=E(t))return e;t=[t]}return T(t)}function z(){return uo(mo)}function L(t,e){var i=t.toLowerCase();vo[i]=vo[i+"s"]=vo[e]=t}function A(t){return"string"==typeof t?vo[t]||vo[t.toLowerCase()]:void 0}function B(t){var e,i,o={};for(i in t)r(t,i)&&(e=A(i),e&&(o[e]=t[i]));return o}function F(t,i){return function(o){return null!=o?(H(this,t,o),e.updateOffset(this,i),this):j(this,t)}}function j(t,e){return t.isValid()?t._d["get"+(t._isUTC?"UTC":"")+e]():NaN}function H(t,e,i){t.isValid()&&t._d["set"+(t._isUTC?"UTC":"")+e](i)}function W(t,e){var i;if("object"==typeof t)for(i in t)this.set(i,t[i]);else if(t=A(t),k(this[t]))return this[t](e);return this}function Y(t,e,i){var o=""+Math.abs(t),n=e-o.length,s=t>=0;return(s?i?"+":"":"-")+Math.pow(10,Math.max(0,n)).toString().substr(1)+o}function G(t,e,i,o){var n=o;"string"==typeof o&&(n=function(){return this[o]()}),t&&(wo[t]=n),e&&(wo[e[0]]=function(){return Y(n.apply(this,arguments),e[1],e[2])}),i&&(wo[i]=function(){return this.localeData().ordinal(n.apply(this,arguments),t)})}function V(t){return t.match(/\[[\s\S]/)?t.replace(/^\[|\]$/g,""):t.replace(/\\/g,"")}function U(t){var e,i,o=t.match(go);for(e=0,i=o.length;i>e;e++)wo[o[e]]?o[e]=wo[o[e]]:o[e]=V(o[e]);return function(e){var n,s="";for(n=0;i>n;n++)s+=o[n]instanceof Function?o[n].call(e,t):o[n];return s}}function q(t,e){return t.isValid()?(e=X(e,t.localeData()),bo[e]=bo[e]||U(e),bo[e](t)):t.localeData().invalidDate()}function X(t,e){function i(t){return e.longDateFormat(t)||t}var o=5;for(yo.lastIndex=0;o>=0&&yo.test(t);)t=t.replace(yo,i),yo.lastIndex=0,o-=1;return t}function Z(t,e,i){Bo[t]=k(e)?e:function(t,o){return t&&i?i:e}}function K(t,e){return r(Bo,t)?Bo[t](e._strict,e._locale):new RegExp(J(t))}function J(t){return Q(t.replace("\\","").replace(/\\(\[)|\\(\])|\[([^\]\[]*)\]|\\(.)/g,function(t,e,i,o,n){return e||i||o||n}))}function Q(t){return t.replace(/[-\/\\^$*+?.()|[\]{}]/g,"\\$&")}function $(t,e){var i,o=e;for("string"==typeof t&&(t=[t]),"number"==typeof e&&(o=function(t,i){i[e]=y(t)}),i=0;io;++o)s=h([2e3,o]),this._shortMonthsParse[o]=this.monthsShort(s,"").toLocaleLowerCase(),this._longMonthsParse[o]=this.months(s,"").toLocaleLowerCase();return i?"MMM"===e?(n=fo.call(this._shortMonthsParse,r),-1!==n?n:null):(n=fo.call(this._longMonthsParse,r),-1!==n?n:null):"MMM"===e?(n=fo.call(this._shortMonthsParse,r),-1!==n?n:(n=fo.call(this._longMonthsParse,r),-1!==n?n:null)):(n=fo.call(this._longMonthsParse,r),-1!==n?n:(n=fo.call(this._shortMonthsParse,r),-1!==n?n:null))}function rt(t,e,i){var o,n,s;if(this._monthsParseExact)return st.call(this,t,e,i);for(this._monthsParse||(this._monthsParse=[],this._longMonthsParse=[],this._shortMonthsParse=[]),o=0;12>o;o++){if(n=h([2e3,o]),i&&!this._longMonthsParse[o]&&(this._longMonthsParse[o]=new RegExp("^"+this.months(n,"").replace(".","")+"$","i"),this._shortMonthsParse[o]=new 
RegExp("^"+this.monthsShort(n,"").replace(".","")+"$","i")),i||this._monthsParse[o]||(s="^"+this.months(n,"")+"|^"+this.monthsShort(n,""),this._monthsParse[o]=new RegExp(s.replace(".",""),"i")),i&&"MMMM"===e&&this._longMonthsParse[o].test(t))return o;if(i&&"MMM"===e&&this._shortMonthsParse[o].test(t))return o;if(!i&&this._monthsParse[o].test(t))return o}}function at(t,e){var i;if(!t.isValid())return t;if("string"==typeof e)if(/^\d+$/.test(e))e=y(e);else if(e=t.localeData().monthsParse(e),"number"!=typeof e)return t;return i=Math.min(t.date(),it(t.year(),e)),t._d["set"+(t._isUTC?"UTC":"")+"Month"](e,i),t}function ht(t){return null!=t?(at(this,t),e.updateOffset(this,!0),this):j(this,"Month")}function dt(){return it(this.year(),this.month())}function lt(t){return this._monthsParseExact?(r(this,"_monthsRegex")||ut.call(this),t?this._monthsShortStrictRegex:this._monthsShortRegex):this._monthsShortStrictRegex&&t?this._monthsShortStrictRegex:this._monthsShortRegex}function ct(t){return this._monthsParseExact?(r(this,"_monthsRegex")||ut.call(this),t?this._monthsStrictRegex:this._monthsRegex):this._monthsStrictRegex&&t?this._monthsStrictRegex:this._monthsRegex}function ut(){function t(t,e){return e.length-t.length}var e,i,o=[],n=[],s=[];for(e=0;12>e;e++)i=h([2e3,e]),o.push(this.monthsShort(i,"")),n.push(this.months(i,"")),s.push(this.months(i,"")),s.push(this.monthsShort(i,""));for(o.sort(t),n.sort(t),s.sort(t),e=0;12>e;e++)o[e]=Q(o[e]),n[e]=Q(n[e]),s[e]=Q(s[e]);this._monthsRegex=new RegExp("^("+s.join("|")+")","i"),this._monthsShortRegex=this._monthsRegex,this._monthsStrictRegex=new RegExp("^("+n.join("|")+")","i"),this._monthsShortStrictRegex=new RegExp("^("+o.join("|")+")","i")}function pt(t){var e,i=t._a;return i&&-2===l(t).overflow&&(e=i[Ho]<0||i[Ho]>11?Ho:i[Wo]<1||i[Wo]>it(i[jo],i[Ho])?Wo:i[Yo]<0||i[Yo]>24||24===i[Yo]&&(0!==i[Go]||0!==i[Vo]||0!==i[Uo])?Yo:i[Go]<0||i[Go]>59?Go:i[Vo]<0||i[Vo]>59?Vo:i[Uo]<0||i[Uo]>999?Uo:-1,l(t)._overflowDayOfYear&&(jo>e||e>Wo)&&(e=Wo),l(t)._overflowWeeks&&-1===e&&(e=qo),l(t)._overflowWeekday&&-1===e&&(e=Xo),l(t).overflow=e),t}function ft(t){var e,i,o,n,s,r,a=t._i,h=tn.exec(a)||en.exec(a);if(h){for(l(t).iso=!0,e=0,i=nn.length;i>e;e++)if(nn[e][1].exec(h[1])){n=nn[e][0],o=nn[e][2]!==!1;break}if(null==n)return void(t._isValid=!1);if(h[3]){for(e=0,i=sn.length;i>e;e++)if(sn[e][1].exec(h[3])){s=(h[2]||" ")+sn[e][0];break}if(null==s)return void(t._isValid=!1)}if(!o&&null!=s)return void(t._isValid=!1);if(h[4]){if(!on.exec(h[4]))return void(t._isValid=!1);r="Z"}t._f=n+(s||"")+(r||""),Tt(t)}else t._isValid=!1}function mt(t){var i=rn.exec(t._i);return null!==i?void(t._d=new Date(+i[1])):(ft(t),void(t._isValid===!1&&(delete t._isValid,e.createFromInputFallback(t))))}function vt(t,e,i,o,n,s,r){var a=new Date(t,e,i,o,n,s,r);return 100>t&&t>=0&&isFinite(a.getFullYear())&&a.setFullYear(t),a}function gt(t){var e=new Date(Date.UTC.apply(null,arguments));return 100>t&&t>=0&&isFinite(e.getUTCFullYear())&&e.setUTCFullYear(t),e}function yt(t){return bt(t)?366:365}function bt(t){return t%4===0&&t%100!==0||t%400===0}function wt(){return bt(this.year())}function _t(t,e,i){var o=7+e-i,n=(7+gt(t,0,o).getUTCDay()-e)%7;return-n+o-1}function xt(t,e,i,o,n){var s,r,a=(7+i-o)%7,h=_t(t,o,n),d=1+7*(e-1)+a+h;return 0>=d?(s=t-1,r=yt(s)+d):d>yt(t)?(s=t+1,r=d-yt(t)):(s=t,r=d),{year:s,dayOfYear:r}}function kt(t,e,i){var o,n,s=_t(t.year(),e,i),r=Math.floor((t.dayOfYear()-s-1)/7)+1;return 
1>r?(n=t.year()-1,o=r+Ot(n,e,i)):r>Ot(t.year(),e,i)?(o=r-Ot(t.year(),e,i),n=t.year()+1):(n=t.year(),o=r),{week:o,year:n}}function Ot(t,e,i){var o=_t(t,e,i),n=_t(t+1,e,i);return(yt(t)-o+n)/7}function Mt(t,e,i){return null!=t?t:null!=e?e:i}function Dt(t){var i=new Date(e.now());return t._useUTC?[i.getUTCFullYear(),i.getUTCMonth(),i.getUTCDate()]:[i.getFullYear(),i.getMonth(),i.getDate()]}function St(t){var e,i,o,n,s=[];if(!t._d){for(o=Dt(t),t._w&&null==t._a[Wo]&&null==t._a[Ho]&&Ct(t),t._dayOfYear&&(n=Mt(t._a[jo],o[jo]),t._dayOfYear>yt(n)&&(l(t)._overflowDayOfYear=!0),i=gt(n,0,t._dayOfYear),t._a[Ho]=i.getUTCMonth(),t._a[Wo]=i.getUTCDate()),e=0;3>e&&null==t._a[e];++e)t._a[e]=s[e]=o[e];for(;7>e;e++)t._a[e]=s[e]=null==t._a[e]?2===e?1:0:t._a[e];24===t._a[Yo]&&0===t._a[Go]&&0===t._a[Vo]&&0===t._a[Uo]&&(t._nextDay=!0,t._a[Yo]=0),t._d=(t._useUTC?gt:vt).apply(null,s),null!=t._tzm&&t._d.setUTCMinutes(t._d.getUTCMinutes()-t._tzm),t._nextDay&&(t._a[Yo]=24)}}function Ct(t){var e,i,o,n,s,r,a,h;e=t._w,null!=e.GG||null!=e.W||null!=e.E?(s=1,r=4,i=Mt(e.GG,t._a[jo],kt(At(),1,4).year),o=Mt(e.W,1),n=Mt(e.E,1),(1>n||n>7)&&(h=!0)):(s=t._locale._week.dow,r=t._locale._week.doy,i=Mt(e.gg,t._a[jo],kt(At(),s,r).year),o=Mt(e.w,1),null!=e.d?(n=e.d,(0>n||n>6)&&(h=!0)):null!=e.e?(n=e.e+s,(e.e<0||e.e>6)&&(h=!0)):n=s),1>o||o>Ot(i,s,r)?l(t)._overflowWeeks=!0:null!=h?l(t)._overflowWeekday=!0:(a=xt(i,o,n,s,r),t._a[jo]=a.year,t._dayOfYear=a.dayOfYear)}function Tt(t){if(t._f===e.ISO_8601)return void ft(t);t._a=[],l(t).empty=!0;var i,o,n,s,r,a=""+t._i,h=a.length,d=0;for(n=X(t._f,t._locale).match(go)||[],i=0;i0&&l(t).unusedInput.push(r),a=a.slice(a.indexOf(o)+o.length),d+=o.length),wo[s]?(o?l(t).empty=!1:l(t).unusedTokens.push(s),et(s,o,t)):t._strict&&!o&&l(t).unusedTokens.push(s);l(t).charsLeftOver=h-d,a.length>0&&l(t).unusedInput.push(a),l(t).bigHour===!0&&t._a[Yo]<=12&&t._a[Yo]>0&&(l(t).bigHour=void 0),l(t).parsedDateParts=t._a.slice(0),l(t).meridiem=t._meridiem,t._a[Yo]=Et(t._locale,t._a[Yo],t._meridiem),St(t),pt(t)}function Et(t,e,i){var o;return null==i?e:null!=t.meridiemHour?t.meridiemHour(e,i):null!=t.isPM?(o=t.isPM(i),o&&12>e&&(e+=12),o||12!==e||(e=0),e):e}function Pt(t){var e,i,o,n,s;if(0===t._f.length)return l(t).invalidFormat=!0,void(t._d=new Date(NaN));for(n=0;ns)&&(o=s,i=e));a(t,i||e)}function It(t){if(!t._d){var e=B(t._i);t._a=s([e.year,e.month,e.day||e.date,e.hour,e.minute,e.second,e.millisecond],function(t){return t&&parseInt(t,10)}),St(t)}}function Nt(t){var e=new m(pt(Rt(t)));return e._nextDay&&(e.add(1,"d"),e._nextDay=void 0),e}function Rt(t){var e=t._i,i=t._f;return t._locale=t._locale||R(t._l),null===e||void 0===i&&""===e?u({nullInput:!0}):("string"==typeof e&&(t._i=e=t._locale.preparse(e)),v(e)?new m(pt(e)):(o(i)?Pt(t):i?Tt(t):n(e)?t._d=e:zt(t),c(t)||(t._d=null),t))}function zt(t){var i=t._i;void 0===i?t._d=new Date(e.now()):n(i)?t._d=new Date(i.valueOf()):"string"==typeof i?mt(t):o(i)?(t._a=s(i.slice(0),function(t){return parseInt(t,10)}),St(t)):"object"==typeof i?It(t):"number"==typeof i?t._d=new Date(i):e.createFromInputFallback(t)}function Lt(t,e,i,o,n){var s={};return"boolean"==typeof i&&(o=i,i=void 0),s._isAMomentObject=!0,s._useUTC=s._isUTC=n,s._l=i,s._i=t,s._f=e,s._strict=o,Nt(s)}function At(t,e,i,o){return Lt(t,e,i,o,!1)}function Bt(t,e){var i,n;if(1===e.length&&o(e[0])&&(e=e[0]),!e.length)return At();for(i=e[0],n=1;nt&&(t=-t,i="-"),i+Y(~~(t/60),2)+e+Y(~~t%60,2)})}function Gt(t,e){var 
i=(e||"").match(t)||[],o=i[i.length-1]||[],n=(o+"").match(cn)||["-",0,0],s=+(60*n[1])+y(n[2]);return"+"===n[0]?s:-s}function Vt(t,i){var o,s;return i._isUTC?(o=i.clone(),s=(v(t)||n(t)?t.valueOf():At(t).valueOf())-o.valueOf(),o._d.setTime(o._d.valueOf()+s),e.updateOffset(o,!1),o):At(t).local()}function Ut(t){return 15*-Math.round(t._d.getTimezoneOffset()/15)}function qt(t,i){var o,n=this._offset||0;return this.isValid()?null!=t?("string"==typeof t?t=Gt(zo,t):Math.abs(t)<16&&(t=60*t),!this._isUTC&&i&&(o=Ut(this)),this._offset=t,this._isUTC=!0,null!=o&&this.add(o,"m"),n!==t&&(!i||this._changeInProgress?le(this,ne(t-n,"m"),1,!1):this._changeInProgress||(this._changeInProgress=!0,e.updateOffset(this,!0),this._changeInProgress=null)),this):this._isUTC?n:Ut(this):null!=t?this:NaN}function Xt(t,e){return null!=t?("string"!=typeof t&&(t=-t),this.utcOffset(t,e),this):-this.utcOffset()}function Zt(t){return this.utcOffset(0,t)}function Kt(t){return this._isUTC&&(this.utcOffset(0,t),this._isUTC=!1,t&&this.subtract(Ut(this),"m")),this}function Jt(){return this._tzm?this.utcOffset(this._tzm):"string"==typeof this._i&&this.utcOffset(Gt(Ro,this._i)),this}function Qt(t){return this.isValid()?(t=t?At(t).utcOffset():0,(this.utcOffset()-t)%60===0):!1}function $t(){return this.utcOffset()>this.clone().month(0).utcOffset()||this.utcOffset()>this.clone().month(5).utcOffset()}function te(){if(!p(this._isDSTShifted))return this._isDSTShifted;var t={};if(f(t,this),t=Rt(t),t._a){var e=t._isUTC?h(t._a):At(t._a);this._isDSTShifted=this.isValid()&&b(t._a,e.toArray())>0}else this._isDSTShifted=!1;return this._isDSTShifted}function ee(){return this.isValid()?!this._isUTC:!1}function ie(){return this.isValid()?this._isUTC:!1}function oe(){return this.isValid()?this._isUTC&&0===this._offset:!1}function ne(t,e){var i,o,n,s=t,a=null;return Wt(t)?s={ms:t._milliseconds,d:t._days,M:t._months}:"number"==typeof t?(s={},e?s[e]=t:s.milliseconds=t):(a=un.exec(t))?(i="-"===a[1]?-1:1,s={y:0,d:y(a[Wo])*i,h:y(a[Yo])*i,m:y(a[Go])*i,s:y(a[Vo])*i,ms:y(a[Uo])*i}):(a=pn.exec(t))?(i="-"===a[1]?-1:1,s={y:se(a[2],i),M:se(a[3],i),w:se(a[4],i),d:se(a[5],i),h:se(a[6],i),m:se(a[7],i),s:se(a[8],i)}):null==s?s={}:"object"==typeof s&&("from"in s||"to"in s)&&(n=ae(At(s.from),At(s.to)),s={},s.ms=n.milliseconds,s.M=n.months),o=new Ht(s),Wt(t)&&r(t,"_locale")&&(o._locale=t._locale),o}function se(t,e){var i=t&&parseFloat(t.replace(",","."));return(isNaN(i)?0:i)*e}function re(t,e){var i={milliseconds:0,months:0};return i.months=e.month()-t.month()+12*(e.year()-t.year()),t.clone().add(i.months,"M").isAfter(e)&&--i.months, -i.milliseconds=+e-+t.clone().add(i.months,"M"),i}function ae(t,e){var i;return t.isValid()&&e.isValid()?(e=Vt(e,t),t.isBefore(e)?i=re(t,e):(i=re(e,t),i.milliseconds=-i.milliseconds,i.months=-i.months),i):{milliseconds:0,months:0}}function he(t){return 0>t?-1*Math.round(-1*t):Math.round(t)}function de(t,e){return function(i,o){var n,s;return null===o||isNaN(+o)||(x(e,"moment()."+e+"(period, number) is deprecated. 
Please use moment()."+e+"(number, period)."),s=i,i=o,o=s),i="string"==typeof i?+i:i,n=ne(i,o),le(this,n,t),this}}function le(t,i,o,n){var s=i._milliseconds,r=he(i._days),a=he(i._months);t.isValid()&&(n=null==n?!0:n,s&&t._d.setTime(t._d.valueOf()+s*o),r&&H(t,"Date",j(t,"Date")+r*o),a&&at(t,j(t,"Month")+a*o),n&&e.updateOffset(t,r||a))}function ce(t,e){var i=t||At(),o=Vt(i,this).startOf("day"),n=this.diff(o,"days",!0),s=-6>n?"sameElse":-1>n?"lastWeek":0>n?"lastDay":1>n?"sameDay":2>n?"nextDay":7>n?"nextWeek":"sameElse",r=e&&(k(e[s])?e[s]():e[s]);return this.format(r||this.localeData().calendar(s,this,At(i)))}function ue(){return new m(this)}function pe(t,e){var i=v(t)?t:At(t);return this.isValid()&&i.isValid()?(e=A(p(e)?"millisecond":e),"millisecond"===e?this.valueOf()>i.valueOf():i.valueOf()e-s?(i=t.clone().add(n-1,"months"),o=(e-s)/(s-i)):(i=t.clone().add(n+1,"months"),o=(e-s)/(i-s)),-(n+o)||0}function _e(){return this.clone().locale("en").format("ddd MMM DD YYYY HH:mm:ss [GMT]ZZ")}function xe(){var t=this.clone().utc();return 0s&&(e=s),Xe.call(this,t,e,i,o,n))}function Xe(t,e,i,o,n){var s=xt(t,e,i,o,n),r=gt(s.year,0,s.dayOfYear);return this.year(r.getUTCFullYear()),this.month(r.getUTCMonth()),this.date(r.getUTCDate()),this}function Ze(t){return null==t?Math.ceil((this.month()+1)/3):this.month(3*(t-1)+this.month()%3)}function Ke(t){return kt(t,this._week.dow,this._week.doy).week}function Je(){return this._week.dow}function Qe(){return this._week.doy}function $e(t){var e=this.localeData().week(this);return null==t?e:this.add(7*(t-e),"d")}function ti(t){var e=kt(this,1,4).week;return null==t?e:this.add(7*(t-e),"d")}function ei(t,e){return"string"!=typeof t?t:isNaN(t)?(t=e.weekdaysParse(t),"number"==typeof t?t:null):parseInt(t,10)}function ii(t,e){return o(this._weekdays)?this._weekdays[t.day()]:this._weekdays[this._weekdays.isFormat.test(e)?"format":"standalone"][t.day()]}function oi(t){return this._weekdaysShort[t.day()]}function ni(t){return this._weekdaysMin[t.day()]}function si(t,e,i){var o,n,s,r=t.toLocaleLowerCase();if(!this._weekdaysParse)for(this._weekdaysParse=[],this._shortWeekdaysParse=[],this._minWeekdaysParse=[],o=0;7>o;++o)s=h([2e3,1]).day(o),this._minWeekdaysParse[o]=this.weekdaysMin(s,"").toLocaleLowerCase(),this._shortWeekdaysParse[o]=this.weekdaysShort(s,"").toLocaleLowerCase(),this._weekdaysParse[o]=this.weekdays(s,"").toLocaleLowerCase();return i?"dddd"===e?(n=fo.call(this._weekdaysParse,r),-1!==n?n:null):"ddd"===e?(n=fo.call(this._shortWeekdaysParse,r),-1!==n?n:null):(n=fo.call(this._minWeekdaysParse,r),-1!==n?n:null):"dddd"===e?(n=fo.call(this._weekdaysParse,r),-1!==n?n:(n=fo.call(this._shortWeekdaysParse,r),-1!==n?n:(n=fo.call(this._minWeekdaysParse,r),-1!==n?n:null))):"ddd"===e?(n=fo.call(this._shortWeekdaysParse,r),-1!==n?n:(n=fo.call(this._weekdaysParse,r),-1!==n?n:(n=fo.call(this._minWeekdaysParse,r),-1!==n?n:null))):(n=fo.call(this._minWeekdaysParse,r),-1!==n?n:(n=fo.call(this._weekdaysParse,r),-1!==n?n:(n=fo.call(this._shortWeekdaysParse,r),-1!==n?n:null)))}function ri(t,e,i){var o,n,s;if(this._weekdaysParseExact)return si.call(this,t,e,i);for(this._weekdaysParse||(this._weekdaysParse=[],this._minWeekdaysParse=[],this._shortWeekdaysParse=[],this._fullWeekdaysParse=[]),o=0;7>o;o++){if(n=h([2e3,1]).day(o),i&&!this._fullWeekdaysParse[o]&&(this._fullWeekdaysParse[o]=new RegExp("^"+this.weekdays(n,"").replace(".",".?")+"$","i"),this._shortWeekdaysParse[o]=new RegExp("^"+this.weekdaysShort(n,"").replace(".",".?")+"$","i"),this._minWeekdaysParse[o]=new 
RegExp("^"+this.weekdaysMin(n,"").replace(".",".?")+"$","i")),this._weekdaysParse[o]||(s="^"+this.weekdays(n,"")+"|^"+this.weekdaysShort(n,"")+"|^"+this.weekdaysMin(n,""),this._weekdaysParse[o]=new RegExp(s.replace(".",""),"i")),i&&"dddd"===e&&this._fullWeekdaysParse[o].test(t))return o;if(i&&"ddd"===e&&this._shortWeekdaysParse[o].test(t))return o;if(i&&"dd"===e&&this._minWeekdaysParse[o].test(t))return o;if(!i&&this._weekdaysParse[o].test(t))return o}}function ai(t){if(!this.isValid())return null!=t?this:NaN;var e=this._isUTC?this._d.getUTCDay():this._d.getDay();return null!=t?(t=ei(t,this.localeData()),this.add(t-e,"d")):e}function hi(t){if(!this.isValid())return null!=t?this:NaN;var e=(this.day()+7-this.localeData()._week.dow)%7;return null==t?e:this.add(t-e,"d")}function di(t){return this.isValid()?null==t?this.day()||7:this.day(this.day()%7?t:t-7):null!=t?this:NaN}function li(t){return this._weekdaysParseExact?(r(this,"_weekdaysRegex")||pi.call(this),t?this._weekdaysStrictRegex:this._weekdaysRegex):this._weekdaysStrictRegex&&t?this._weekdaysStrictRegex:this._weekdaysRegex}function ci(t){return this._weekdaysParseExact?(r(this,"_weekdaysRegex")||pi.call(this),t?this._weekdaysShortStrictRegex:this._weekdaysShortRegex):this._weekdaysShortStrictRegex&&t?this._weekdaysShortStrictRegex:this._weekdaysShortRegex}function ui(t){return this._weekdaysParseExact?(r(this,"_weekdaysRegex")||pi.call(this),t?this._weekdaysMinStrictRegex:this._weekdaysMinRegex):this._weekdaysMinStrictRegex&&t?this._weekdaysMinStrictRegex:this._weekdaysMinRegex}function pi(){function t(t,e){return e.length-t.length}var e,i,o,n,s,r=[],a=[],d=[],l=[];for(e=0;7>e;e++)i=h([2e3,1]).day(e),o=this.weekdaysMin(i,""),n=this.weekdaysShort(i,""),s=this.weekdays(i,""),r.push(o),a.push(n),d.push(s),l.push(o),l.push(n),l.push(s);for(r.sort(t),a.sort(t),d.sort(t),l.sort(t),e=0;7>e;e++)a[e]=Q(a[e]),d[e]=Q(d[e]),l[e]=Q(l[e]);this._weekdaysRegex=new RegExp("^("+l.join("|")+")","i"),this._weekdaysShortRegex=this._weekdaysRegex,this._weekdaysMinRegex=this._weekdaysRegex,this._weekdaysStrictRegex=new RegExp("^("+d.join("|")+")","i"),this._weekdaysShortStrictRegex=new RegExp("^("+a.join("|")+")","i"),this._weekdaysMinStrictRegex=new RegExp("^("+r.join("|")+")","i")}function fi(t){var e=Math.round((this.clone().startOf("day")-this.clone().startOf("year"))/864e5)+1;return null==t?e:this.add(t-e,"d")}function mi(){return this.hours()%12||12}function vi(){return this.hours()||24}function gi(t,e){G(t,0,0,function(){return this.localeData().meridiem(this.hours(),this.minutes(),e)})}function yi(t,e){return e._meridiemParse}function bi(t){return"p"===(t+"").toLowerCase().charAt(0)}function wi(t,e,i){return t>11?i?"pm":"PM":i?"am":"AM"}function _i(t,e){e[Uo]=y(1e3*("0."+t))}function xi(){return this._isUTC?"UTC":""}function ki(){return this._isUTC?"Coordinated Universal Time":""}function Oi(t){return At(1e3*t)}function Mi(){return At.apply(null,arguments).parseZone()}function Di(t,e,i){var o=this._calendar[t];return k(o)?o.call(e,i):o}function Si(t){var e=this._longDateFormat[t],i=this._longDateFormat[t.toUpperCase()];return e||!i?e:(this._longDateFormat[t]=i.replace(/MMMM|MM|DD|dddd/g,function(t){return t.slice(1)}),this._longDateFormat[t])}function Ci(){return this._invalidDate}function Ti(t){return this._ordinal.replace("%d",t)}function Ei(t){return t}function Pi(t,e,i,o){var n=this._relativeTime[i];return k(n)?n(t,e,i,o):n.replace(/%d/i,t)}function Ii(t,e){var i=this._relativeTime[t>0?"future":"past"];return k(i)?i(e):i.replace(/%s/i,e)}function 
Ni(t,e,i,o){var n=R(),s=h().set(o,e);return n[i](s,t)}function Ri(t,e,i){if("number"==typeof t&&(e=t,t=void 0),t=t||"",null!=e)return Ni(t,e,i,"month");var o,n=[];for(o=0;12>o;o++)n[o]=Ni(t,o,i,"month");return n}function zi(t,e,i,o){"boolean"==typeof t?("number"==typeof e&&(i=e,e=void 0),e=e||""):(e=t,i=e,t=!1,"number"==typeof e&&(i=e,e=void 0),e=e||"");var n=R(),s=t?n._week.dow:0;if(null!=i)return Ni(e,(i+s)%7,o,"day");var r,a=[];for(r=0;7>r;r++)a[r]=Ni(e,(r+s)%7,o,"day");return a}function Li(t,e){return Ri(t,e,"months")}function Ai(t,e){return Ri(t,e,"monthsShort")}function Bi(t,e,i){return zi(t,e,i,"weekdays")}function Fi(t,e,i){return zi(t,e,i,"weekdaysShort")}function ji(t,e,i){return zi(t,e,i,"weekdaysMin")}function Hi(){var t=this._data;return this._milliseconds=jn(this._milliseconds),this._days=jn(this._days),this._months=jn(this._months),t.milliseconds=jn(t.milliseconds),t.seconds=jn(t.seconds),t.minutes=jn(t.minutes),t.hours=jn(t.hours),t.months=jn(t.months),t.years=jn(t.years),this}function Wi(t,e,i,o){var n=ne(e,i);return t._milliseconds+=o*n._milliseconds,t._days+=o*n._days,t._months+=o*n._months,t._bubble()}function Yi(t,e){return Wi(this,t,e,1)}function Gi(t,e){return Wi(this,t,e,-1)}function Vi(t){return 0>t?Math.floor(t):Math.ceil(t)}function Ui(){var t,e,i,o,n,s=this._milliseconds,r=this._days,a=this._months,h=this._data;return s>=0&&r>=0&&a>=0||0>=s&&0>=r&&0>=a||(s+=864e5*Vi(Xi(a)+r),r=0,a=0),h.milliseconds=s%1e3,t=g(s/1e3),h.seconds=t%60,e=g(t/60),h.minutes=e%60,i=g(e/60),h.hours=i%24,r+=g(i/24),n=g(qi(r)),a+=n,r-=Vi(Xi(n)),o=g(a/12),a%=12,h.days=r,h.months=a,h.years=o,this}function qi(t){return 4800*t/146097}function Xi(t){return 146097*t/4800}function Zi(t){var e,i,o=this._milliseconds;if(t=A(t),"month"===t||"year"===t)return e=this._days+o/864e5,i=this._months+qi(e),"month"===t?i:i/12;switch(e=this._days+Math.round(Xi(this._months)),t){case"week":return e/7+o/6048e5;case"day":return e+o/864e5;case"hour":return 24*e+o/36e5;case"minute":return 1440*e+o/6e4;case"second":return 86400*e+o/1e3;case"millisecond":return Math.floor(864e5*e)+o;default:throw new Error("Unknown unit "+t)}}function Ki(){return this._milliseconds+864e5*this._days+this._months%12*2592e6+31536e6*y(this._months/12)}function Ji(t){return function(){return this.as(t)}}function Qi(t){return t=A(t),this[t+"s"]()}function $i(t){return function(){return this._data[t]}}function to(){return g(this.days()/7)}function eo(t,e,i,o,n){return n.relativeTime(e||1,!!i,t,o)}function io(t,e,i){var o=ne(t).abs(),n=is(o.as("s")),s=is(o.as("m")),r=is(o.as("h")),a=is(o.as("d")),h=is(o.as("M")),d=is(o.as("y")),l=n=s&&["m"]||s=r&&["h"]||r=a&&["d"]||a=h&&["M"]||h=d&&["y"]||["yy",d];return l[2]=e,l[3]=+t>0,l[4]=i,eo.apply(null,l)}function oo(t,e){return void 0===os[t]?!1:void 0===e?os[t]:(os[t]=e,!0)}function no(t){var e=this.localeData(),i=io(this,!t,e);return t&&(i=e.pastFuture(+this,i)),e.postformat(i)}function so(){var t,e,i,o=ns(this._milliseconds)/1e3,n=ns(this._days),s=ns(this._months);t=g(o/60),e=g(t/60),o%=60,t%=60,i=g(s/12),s%=12;var r=i,a=s,h=n,d=e,l=t,c=o,u=this.asSeconds();return u?(0>u?"-":"")+"P"+(r?r+"Y":"")+(a?a+"M":"")+(h?h+"D":"")+(d||l||c?"T":"")+(d?d+"H":"")+(l?l+"M":"")+(c?c+"S":""):"P0D"}var ro,ao;ao=Array.prototype.some?Array.prototype.some:function(t){for(var e=Object(this),i=e.length>>>0,o=0;i>o;o++)if(o in e&&t.call(this,e[o],o,e))return!0;return!1};var ho=e.momentProperties=[],lo=!1,co={};e.suppressDeprecationWarnings=!1,e.deprecationHandler=null;var uo;uo=Object.keys?Object.keys:function(t){var 
e,i=[];for(e in t)r(t,e)&&i.push(e);return i};var po,fo,mo={},vo={},go=/(\[[^\[]*\])|(\\)?([Hh]mm(ss)?|Mo|MM?M?M?|Do|DDDo|DD?D?D?|ddd?d?|do?|w[o|w]?|W[o|W]?|Qo?|YYYYYY|YYYYY|YYYY|YY|gg(ggg?)?|GG(GGG?)?|e|E|a|A|hh?|HH?|kk?|mm?|ss?|S{1,9}|x|X|zz?|ZZ?|.)/g,yo=/(\[[^\[]*\])|(\\)?(LTS|LT|LL?L?L?|l{1,4})/g,bo={},wo={},_o=/\d/,xo=/\d\d/,ko=/\d{3}/,Oo=/\d{4}/,Mo=/[+-]?\d{6}/,Do=/\d\d?/,So=/\d\d\d\d?/,Co=/\d\d\d\d\d\d?/,To=/\d{1,3}/,Eo=/\d{1,4}/,Po=/[+-]?\d{1,6}/,Io=/\d+/,No=/[+-]?\d+/,Ro=/Z|[+-]\d\d:?\d\d/gi,zo=/Z|[+-]\d\d(?::?\d\d)?/gi,Lo=/[+-]?\d+(\.\d{1,3})?/,Ao=/[0-9]*['a-z\u00A0-\u05FF\u0700-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF]+|[\u0600-\u06FF\/]+(\s*?[\u0600-\u06FF]+){1,2}/i,Bo={},Fo={},jo=0,Ho=1,Wo=2,Yo=3,Go=4,Vo=5,Uo=6,qo=7,Xo=8;fo=Array.prototype.indexOf?Array.prototype.indexOf:function(t){var e;for(e=0;e=t?""+t:"+"+t}),G(0,["YY",2],0,function(){return this.year()%100}),G(0,["YYYY",4],0,"year"),G(0,["YYYYY",5],0,"year"),G(0,["YYYYYY",6,!0],0,"year"),L("year","y"),Z("Y",No),Z("YY",Do,xo),Z("YYYY",Eo,Oo),Z("YYYYY",Po,Mo),Z("YYYYYY",Po,Mo),$(["YYYYY","YYYYYY"],jo),$("YYYY",function(t,i){i[jo]=2===t.length?e.parseTwoDigitYear(t):y(t)}),$("YY",function(t,i){i[jo]=e.parseTwoDigitYear(t)}),$("Y",function(t,e){e[jo]=parseInt(t,10)}),e.parseTwoDigitYear=function(t){return y(t)+(y(t)>68?1900:2e3)};var an=F("FullYear",!0);e.ISO_8601=function(){};var hn=_("moment().min is deprecated, use moment.max instead. https://github.com/moment/moment/issues/1548",function(){var t=At.apply(null,arguments);return this.isValid()&&t.isValid()?this>t?this:t:u()}),dn=_("moment().max is deprecated, use moment.min instead. https://github.com/moment/moment/issues/1548",function(){var t=At.apply(null,arguments);return this.isValid()&&t.isValid()?t>this?this:t:u()}),ln=function(){return Date.now?Date.now():+new Date};Yt("Z",":"),Yt("ZZ",""),Z("Z",zo),Z("ZZ",zo),$(["Z","ZZ"],function(t,e,i){i._useUTC=!0,i._tzm=Gt(zo,t)});var cn=/([\+\-]|\d\d)/gi;e.updateOffset=function(){};var un=/^(\-)?(?:(\d*)[. ])?(\d+)\:(\d+)(?:\:(\d+)\.?(\d{3})?\d*)?$/,pn=/^(-)?P(?:(-?[0-9,.]*)Y)?(?:(-?[0-9,.]*)M)?(?:(-?[0-9,.]*)W)?(?:(-?[0-9,.]*)D)?(?:T(?:(-?[0-9,.]*)H)?(?:(-?[0-9,.]*)M)?(?:(-?[0-9,.]*)S)?)?$/;ne.fn=Ht.prototype;var fn=de(1,"add"),mn=de(-1,"subtract");e.defaultFormat="YYYY-MM-DDTHH:mm:ssZ",e.defaultFormatUtc="YYYY-MM-DDTHH:mm:ss[Z]";var vn=_("moment().lang() is deprecated. Instead, use moment().localeData() to get the language configuration. 
Use moment().locale() to change languages.",function(t){return void 0===t?this.localeData():this.locale(t)});G(0,["gg",2],0,function(){return this.weekYear()%100}),G(0,["GG",2],0,function(){return this.isoWeekYear()%100}),We("gggg","weekYear"),We("ggggg","weekYear"),We("GGGG","isoWeekYear"),We("GGGGG","isoWeekYear"),L("weekYear","gg"),L("isoWeekYear","GG"),Z("G",No),Z("g",No),Z("GG",Do,xo),Z("gg",Do,xo),Z("GGGG",Eo,Oo),Z("gggg",Eo,Oo),Z("GGGGG",Po,Mo),Z("ggggg",Po,Mo),tt(["gggg","ggggg","GGGG","GGGGG"],function(t,e,i,o){e[o.substr(0,2)]=y(t)}),tt(["gg","GG"],function(t,i,o,n){i[n]=e.parseTwoDigitYear(t)}),G("Q",0,"Qo","quarter"),L("quarter","Q"),Z("Q",_o),$("Q",function(t,e){e[Ho]=3*(y(t)-1)}),G("w",["ww",2],"wo","week"),G("W",["WW",2],"Wo","isoWeek"),L("week","w"),L("isoWeek","W"),Z("w",Do),Z("ww",Do,xo),Z("W",Do),Z("WW",Do,xo),tt(["w","ww","W","WW"],function(t,e,i,o){e[o.substr(0,1)]=y(t)});var gn={dow:0,doy:6};G("D",["DD",2],"Do","date"),L("date","D"),Z("D",Do),Z("DD",Do,xo),Z("Do",function(t,e){return t?e._ordinalParse:e._ordinalParseLenient}),$(["D","DD"],Wo),$("Do",function(t,e){e[Wo]=y(t.match(Do)[0],10)});var yn=F("Date",!0);G("d",0,"do","day"),G("dd",0,0,function(t){return this.localeData().weekdaysMin(this,t)}),G("ddd",0,0,function(t){return this.localeData().weekdaysShort(this,t)}),G("dddd",0,0,function(t){return this.localeData().weekdays(this,t)}),G("e",0,0,"weekday"),G("E",0,0,"isoWeekday"),L("day","d"),L("weekday","e"),L("isoWeekday","E"),Z("d",Do),Z("e",Do),Z("E",Do),Z("dd",function(t,e){return e.weekdaysMinRegex(t)}),Z("ddd",function(t,e){return e.weekdaysShortRegex(t)}),Z("dddd",function(t,e){return e.weekdaysRegex(t)}),tt(["dd","ddd","dddd"],function(t,e,i,o){var n=i._locale.weekdaysParse(t,o,i._strict);null!=n?e.d=n:l(i).invalidWeekday=t}),tt(["d","e","E"],function(t,e,i,o){e[o]=y(t)});var bn="Sunday_Monday_Tuesday_Wednesday_Thursday_Friday_Saturday".split("_"),wn="Sun_Mon_Tue_Wed_Thu_Fri_Sat".split("_"),_n="Su_Mo_Tu_We_Th_Fr_Sa".split("_"),xn=Ao,kn=Ao,On=Ao;G("DDD",["DDDD",3],"DDDo","dayOfYear"),L("dayOfYear","DDD"),Z("DDD",To),Z("DDDD",ko),$(["DDD","DDDD"],function(t,e,i){i._dayOfYear=y(t)}),G("H",["HH",2],0,"hour"),G("h",["hh",2],0,mi),G("k",["kk",2],0,vi),G("hmm",0,0,function(){return""+mi.apply(this)+Y(this.minutes(),2)}),G("hmmss",0,0,function(){return""+mi.apply(this)+Y(this.minutes(),2)+Y(this.seconds(),2)}),G("Hmm",0,0,function(){return""+this.hours()+Y(this.minutes(),2)}),G("Hmmss",0,0,function(){return""+this.hours()+Y(this.minutes(),2)+Y(this.seconds(),2)}),gi("a",!0),gi("A",!1),L("hour","h"),Z("a",yi),Z("A",yi),Z("H",Do),Z("h",Do),Z("HH",Do,xo),Z("hh",Do,xo),Z("hmm",So),Z("hmmss",Co),Z("Hmm",So),Z("Hmmss",Co),$(["H","HH"],Yo),$(["a","A"],function(t,e,i){i._isPm=i._locale.isPM(t),i._meridiem=t}),$(["h","hh"],function(t,e,i){e[Yo]=y(t),l(i).bigHour=!0}),$("hmm",function(t,e,i){var o=t.length-2;e[Yo]=y(t.substr(0,o)),e[Go]=y(t.substr(o)),l(i).bigHour=!0}),$("hmmss",function(t,e,i){var o=t.length-4,n=t.length-2;e[Yo]=y(t.substr(0,o)),e[Go]=y(t.substr(o,2)),e[Vo]=y(t.substr(n)),l(i).bigHour=!0}),$("Hmm",function(t,e,i){var o=t.length-2;e[Yo]=y(t.substr(0,o)),e[Go]=y(t.substr(o))}),$("Hmmss",function(t,e,i){var o=t.length-4,n=t.length-2;e[Yo]=y(t.substr(0,o)),e[Go]=y(t.substr(o,2)),e[Vo]=y(t.substr(n))});var Mn=/[ap]\.?m?\.?/i,Dn=F("Hours",!0);G("m",["mm",2],0,"minute"),L("minute","m"),Z("m",Do),Z("mm",Do,xo),$(["m","mm"],Go);var Sn=F("Minutes",!1);G("s",["ss",2],0,"second"),L("second","s"),Z("s",Do),Z("ss",Do,xo),$(["s","ss"],Vo);var 
Cn=F("Seconds",!1);G("S",0,0,function(){return~~(this.millisecond()/100)}),G(0,["SS",2],0,function(){return~~(this.millisecond()/10)}),G(0,["SSS",3],0,"millisecond"),G(0,["SSSS",4],0,function(){return 10*this.millisecond()}),G(0,["SSSSS",5],0,function(){return 100*this.millisecond()}),G(0,["SSSSSS",6],0,function(){return 1e3*this.millisecond()}),G(0,["SSSSSSS",7],0,function(){return 1e4*this.millisecond()}),G(0,["SSSSSSSS",8],0,function(){return 1e5*this.millisecond()}),G(0,["SSSSSSSSS",9],0,function(){return 1e6*this.millisecond()}),L("millisecond","ms"),Z("S",To,_o),Z("SS",To,xo),Z("SSS",To,ko);var Tn;for(Tn="SSSS";Tn.length<=9;Tn+="S")Z(Tn,Io);for(Tn="S";Tn.length<=9;Tn+="S")$(Tn,_i);var En=F("Milliseconds",!1);G("z",0,0,"zoneAbbr"),G("zz",0,0,"zoneName");var Pn=m.prototype;Pn.add=fn,Pn.calendar=ce,Pn.clone=ue,Pn.diff=be,Pn.endOf=Pe,Pn.format=ke,Pn.from=Oe,Pn.fromNow=Me,Pn.to=De,Pn.toNow=Se,Pn.get=W,Pn.invalidAt=je,Pn.isAfter=pe,Pn.isBefore=fe,Pn.isBetween=me,Pn.isSame=ve,Pn.isSameOrAfter=ge,Pn.isSameOrBefore=ye,Pn.isValid=Be,Pn.lang=vn,Pn.locale=Ce,Pn.localeData=Te,Pn.max=dn,Pn.min=hn,Pn.parsingFlags=Fe,Pn.set=W,Pn.startOf=Ee,Pn.subtract=mn,Pn.toArray=ze,Pn.toObject=Le,Pn.toDate=Re,Pn.toISOString=xe,Pn.toJSON=Ae,Pn.toString=_e,Pn.unix=Ne,Pn.valueOf=Ie,Pn.creationData=He,Pn.year=an,Pn.isLeapYear=wt,Pn.weekYear=Ye,Pn.isoWeekYear=Ge,Pn.quarter=Pn.quarters=Ze,Pn.month=ht,Pn.daysInMonth=dt,Pn.week=Pn.weeks=$e,Pn.isoWeek=Pn.isoWeeks=ti,Pn.weeksInYear=Ue,Pn.isoWeeksInYear=Ve,Pn.date=yn,Pn.day=Pn.days=ai,Pn.weekday=hi,Pn.isoWeekday=di,Pn.dayOfYear=fi,Pn.hour=Pn.hours=Dn,Pn.minute=Pn.minutes=Sn,Pn.second=Pn.seconds=Cn,Pn.millisecond=Pn.milliseconds=En,Pn.utcOffset=qt,Pn.utc=Zt,Pn.local=Kt,Pn.parseZone=Jt,Pn.hasAlignedHourOffset=Qt,Pn.isDST=$t,Pn.isDSTShifted=te,Pn.isLocal=ee,Pn.isUtcOffset=ie,Pn.isUtc=oe,Pn.isUTC=oe,Pn.zoneAbbr=xi,Pn.zoneName=ki,Pn.dates=_("dates accessor is deprecated. Use date instead.",yn),Pn.months=_("months accessor is deprecated. Use month instead",ht),Pn.years=_("years accessor is deprecated. Use year instead",an),Pn.zone=_("moment().zone is deprecated, use moment().utcOffset instead. 
https://github.com/moment/moment/issues/1779",Xt);var In=Pn,Nn={sameDay:"[Today at] LT",nextDay:"[Tomorrow at] LT",nextWeek:"dddd [at] LT",lastDay:"[Yesterday at] LT",lastWeek:"[Last] dddd [at] LT",sameElse:"L"},Rn={LTS:"h:mm:ss A",LT:"h:mm A",L:"MM/DD/YYYY",LL:"MMMM D, YYYY",LLL:"MMMM D, YYYY h:mm A",LLLL:"dddd, MMMM D, YYYY h:mm A"},zn="Invalid date",Ln="%d",An=/\d{1,2}/,Bn={future:"in %s",past:"%s ago",s:"a few seconds",m:"a minute",mm:"%d minutes",h:"an hour",hh:"%d hours",d:"a day",dd:"%d days",M:"a month",MM:"%d months",y:"a year",yy:"%d years"},Fn=S.prototype;Fn._calendar=Nn,Fn.calendar=Di,Fn._longDateFormat=Rn,Fn.longDateFormat=Si,Fn._invalidDate=zn,Fn.invalidDate=Ci,Fn._ordinal=Ln,Fn.ordinal=Ti,Fn._ordinalParse=An,Fn.preparse=Ei,Fn.postformat=Ei,Fn._relativeTime=Bn,Fn.relativeTime=Pi,Fn.pastFuture=Ii,Fn.set=M,Fn.months=ot,Fn._months=Ko,Fn.monthsShort=nt,Fn._monthsShort=Jo,Fn.monthsParse=rt,Fn._monthsRegex=$o,Fn.monthsRegex=ct,Fn._monthsShortRegex=Qo,Fn.monthsShortRegex=lt,Fn.week=Ke,Fn._week=gn,Fn.firstDayOfYear=Qe,Fn.firstDayOfWeek=Je,Fn.weekdays=ii,Fn._weekdays=bn,Fn.weekdaysMin=ni,Fn._weekdaysMin=_n,Fn.weekdaysShort=oi,Fn._weekdaysShort=wn,Fn.weekdaysParse=ri,Fn._weekdaysRegex=xn,Fn.weekdaysRegex=li,Fn._weekdaysShortRegex=kn,Fn.weekdaysShortRegex=ci,Fn._weekdaysMinRegex=On,Fn.weekdaysMinRegex=ui,Fn.isPM=bi,Fn._meridiemParse=Mn,Fn.meridiem=wi,P("en",{ordinalParse:/\d{1,2}(th|st|nd|rd)/,ordinal:function(t){var e=t%10,i=1===y(t%100/10)?"th":1===e?"st":2===e?"nd":3===e?"rd":"th";return t+i}}),e.lang=_("moment.lang is deprecated. Use moment.locale instead.",P),e.langData=_("moment.langData is deprecated. Use moment.localeData instead.",R);var jn=Math.abs,Hn=Ji("ms"),Wn=Ji("s"),Yn=Ji("m"),Gn=Ji("h"),Vn=Ji("d"),Un=Ji("w"),qn=Ji("M"),Xn=Ji("y"),Zn=$i("milliseconds"),Kn=$i("seconds"),Jn=$i("minutes"),Qn=$i("hours"),$n=$i("days"),ts=$i("months"),es=$i("years"),is=Math.round,os={s:45,m:45,h:22,d:26,M:11},ns=Math.abs,ss=Ht.prototype;ss.abs=Hi,ss.add=Yi,ss.subtract=Gi,ss.as=Zi,ss.asMilliseconds=Hn,ss.asSeconds=Wn,ss.asMinutes=Yn,ss.asHours=Gn,ss.asDays=Vn,ss.asWeeks=Un,ss.asMonths=qn,ss.asYears=Xn,ss.valueOf=Ki,ss._bubble=Ui,ss.get=Qi,ss.milliseconds=Zn,ss.seconds=Kn,ss.minutes=Jn,ss.hours=Qn,ss.days=$n,ss.weeks=to,ss.months=ts,ss.years=es,ss.humanize=no,ss.toISOString=so,ss.toString=so,ss.toJSON=so,ss.locale=Ce,ss.localeData=Te,ss.toIsoString=_("toIsoString() is deprecated. 
Please use toISOString() instead (notice the capitals)",so),ss.lang=vn,G("X",0,0,"unix"),G("x",0,0,"valueOf"),Z("x",No),Z("X",Lo),$("X",function(t,e,i){i._d=new Date(1e3*parseFloat(t,10))}),$("x",function(t,e,i){i._d=new Date(y(t))}),e.version="2.13.0",i(At),e.fn=In,e.min=Ft,e.max=jt,e.now=ln,e.utc=h,e.unix=Oi,e.months=Li,e.isDate=n,e.locale=P,e.invalid=u,e.duration=ne,e.isMoment=v,e.weekdays=Bi,e.parseZone=Mi,e.localeData=R,e.isDuration=Wt,e.monthsShort=Ai,e.weekdaysMin=ji,e.defineLocale=I,e.updateLocale=N,e.locales=z,e.weekdaysShort=Fi,e.normalizeUnits=A,e.relativeTimeThreshold=oo,e.prototype=In;var rs=e;return rs})}).call(e,i(4)(t))},function(t,e){t.exports=function(t){return t.webpackPolyfill||(t.deprecate=function(){},t.paths=[],t.children=[],t.webpackPolyfill=1),t}},function(t,e){function i(t){throw new Error("Cannot find module '"+t+"'.")}i.keys=function(){return[]},i.resolve=i,t.exports=i,i.id=5},function(t,e){(function(e){function i(t,e,i){var o=e&&i||0,n=0;for(e=e||[],t.toLowerCase().replace(/[0-9a-f]{2}/g,function(t){16>n&&(e[o+n++]=c[t])});16>n;)e[o+n++]=0;return e}function o(t,e){var i=e||0,o=l;return o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+"-"+o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]+o[t[i++]]}function n(t,e,i){var n=e&&i||0,s=e||[];t=t||{};var r=void 0!==t.clockseq?t.clockseq:m,a=void 0!==t.msecs?t.msecs:(new Date).getTime(),h=void 0!==t.nsecs?t.nsecs:g+1,d=a-v+(h-g)/1e4;if(0>d&&void 0===t.clockseq&&(r=r+1&16383),(0>d||a>v)&&void 0===t.nsecs&&(h=0),h>=1e4)throw new Error("uuid.v1(): Can't create more than 10M uuids/sec");v=a,g=h,m=r,a+=122192928e5;var l=(1e4*(268435455&a)+h)%4294967296;s[n++]=l>>>24&255,s[n++]=l>>>16&255,s[n++]=l>>>8&255,s[n++]=255&l;var c=a/4294967296*1e4&268435455;s[n++]=c>>>8&255,s[n++]=255&c,s[n++]=c>>>24&15|16,s[n++]=c>>>16&255,s[n++]=r>>>8|128,s[n++]=255&r;for(var u=t.node||f,p=0;6>p;p++)s[n+p]=u[p];return e?e:o(s)}function s(t,e,i){var n=e&&i||0;"string"==typeof t&&(e="binary"==t?new Array(16):null,t=null),t=t||{};var s=t.random||(t.rng||r)();if(s[6]=15&s[6]|64,s[8]=63&s[8]|128,e)for(var a=0;16>a;a++)e[n+a]=s[a];return e||o(s)}var r,a="undefined"!=typeof window?window:"undefined"!=typeof e?e:null;if(a&&a.crypto&&crypto.getRandomValues){var h=new Uint8Array(16);r=function(){return crypto.getRandomValues(h),h}}if(!r){var d=new Array(16);r=function(){for(var t,e=0;16>e;e++)0===(3&e)&&(t=4294967296*Math.random()),d[e]=t>>>((3&e)<<3)&255;return d}}for(var l=[],c={},u=0;256>u;u++)l[u]=(u+256).toString(16).substr(1),c[l[u]]=u;var p=r(),f=[1|p[0],p[1],p[2],p[3],p[4],p[5]],m=16383&(p[6]<<8|p[7]),v=0,g=0,y=s;y.v1=n,y.v4=s,y.parse=i,y.unparse=o,t.exports=y}).call(e,function(){return this}())},function(t,e,i){e.util=i(1),e.DOMutil=i(8),e.DataSet=i(9),e.DataView=i(11),e.Queue=i(10),e.Graph3d=i(12),e.graph3d={Camera:i(16),Filter:i(17),Point2d:i(15),Point3d:i(14),Slider:i(18),StepNumber:i(19)},e.moment=i(2),e.Hammer=i(20),e.keycharm=i(23)},function(t,e){e.prepareElements=function(t){for(var e in t)t.hasOwnProperty(e)&&(t[e].redundant=t[e].used,t[e].used=[])},e.cleanupElements=function(t){for(var e in t)if(t.hasOwnProperty(e)&&t[e].redundant){for(var i=0;i0?(o=e[t].redundant[0],e[t].redundant.shift()):(o=document.createElementNS("http://www.w3.org/2000/svg",t),i.appendChild(o)):(o=document.createElementNS("http://www.w3.org/2000/svg",t),e[t]={used:[],redundant:[]},i.appendChild(o)),e[t].used.push(o),o},e.getDOMElement=function(t,e,i,o){var n;return 
e.hasOwnProperty(t)?e[t].redundant.length>0?(n=e[t].redundant[0],e[t].redundant.shift()):(n=document.createElement(t),void 0!==o?i.insertBefore(n,o):i.appendChild(n)):(n=document.createElement(t),e[t]={used:[],redundant:[]},void 0!==o?i.insertBefore(n,o):i.appendChild(n)),e[t].used.push(n),n},e.drawPoint=function(t,i,o,n,s,r){var a;if("circle"==o.style?(a=e.getSVGElement("circle",n,s),a.setAttributeNS(null,"cx",t),a.setAttributeNS(null,"cy",i),a.setAttributeNS(null,"r",.5*o.size)):(a=e.getSVGElement("rect",n,s),a.setAttributeNS(null,"x",t-.5*o.size),a.setAttributeNS(null,"y",i-.5*o.size),a.setAttributeNS(null,"width",o.size),a.setAttributeNS(null,"height",o.size)),void 0!==o.styles&&a.setAttributeNS(null,"style",o.styles),a.setAttributeNS(null,"class",o.className+" vis-point"),r){var h=e.getSVGElement("text",n,s); -r.xOffset&&(t+=r.xOffset),r.yOffset&&(i+=r.yOffset),r.content&&(h.textContent=r.content),r.className&&h.setAttributeNS(null,"class",r.className+" vis-label"),h.setAttributeNS(null,"x",t),h.setAttributeNS(null,"y",i)}return a},e.drawBar=function(t,i,o,n,s,r,a,h){if(0!=n){0>n&&(n*=-1,i-=n);var d=e.getSVGElement("rect",r,a);d.setAttributeNS(null,"x",t-.5*o),d.setAttributeNS(null,"y",i),d.setAttributeNS(null,"width",o),d.setAttributeNS(null,"height",n),d.setAttributeNS(null,"class",s),h&&d.setAttributeNS(null,"style",h)}}},function(t,e,i){function o(t,e){if(t&&!Array.isArray(t)&&(e=t,t=null),this._options=e||{},this._data={},this.length=0,this._fieldId=this._options.fieldId||"id",this._type={},this._options.type)for(var i=Object.keys(this._options.type),o=0,n=i.length;n>o;o++){var s=i[o],r=this._options.type[s];"Date"==r||"ISODate"==r||"ASPDate"==r?this._type[s]="Date":this._type[s]=r}if(this._options.convert)throw new Error('Option "convert" is deprecated. Use "type" instead.');this._subscribers={},t&&this.add(t),this.setOptions(e)}var n="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},s=i(1),r=i(10);o.prototype.setOptions=function(t){t&&void 0!==t.queue&&(t.queue===!1?this._queue&&(this._queue.destroy(),delete this._queue):(this._queue||(this._queue=r.extend(this,{replace:["add","update","remove"]})),"object"===n(t.queue)&&this._queue.setOptions(t.queue)))},o.prototype.on=function(t,e){var i=this._subscribers[t];i||(i=[],this._subscribers[t]=i),i.push({callback:e})},o.prototype.subscribe=function(){throw new Error("DataSet.subscribe is deprecated. Use DataSet.on instead.")},o.prototype.off=function(t,e){var i=this._subscribers[t];i&&(this._subscribers[t]=i.filter(function(t){return t.callback!=e}))},o.prototype.unsubscribe=function(){throw new Error("DataSet.unsubscribe is deprecated. 
Use DataSet.off instead.")},o.prototype._trigger=function(t,e,i){if("*"==t)throw new Error("Cannot trigger event *");var o=[];t in this._subscribers&&(o=o.concat(this._subscribers[t])),"*"in this._subscribers&&(o=o.concat(this._subscribers["*"]));for(var n=0,s=o.length;s>n;n++){var r=o[n];r.callback&&r.callback(t,e,i||null)}},o.prototype.add=function(t,e){var i,o=[],n=this;if(Array.isArray(t))for(var s=0,r=t.length;r>s;s++)i=n._addItem(t[s]),o.push(i);else{if(!(t instanceof Object))throw new Error("Unknown dataType");i=n._addItem(t),o.push(i)}return o.length&&this._trigger("add",{items:o},e),o},o.prototype.update=function(t,e){var i=[],o=[],n=[],r=[],a=this,h=a._fieldId,d=function(t){var e=t[h];if(a._data[e]){var d=s.extend({},a._data[e]);e=a._updateItem(t),o.push(e),r.push(t),n.push(d)}else e=a._addItem(t),i.push(e)};if(Array.isArray(t))for(var l=0,c=t.length;c>l;l++)t[l]instanceof Object?d(t[l]):console.warn("Ignoring input item, which is not an object at index "+l);else{if(!(t instanceof Object))throw new Error("Unknown dataType");d(t)}if(i.length&&this._trigger("add",{items:i},e),o.length){var u={items:o,oldData:n,data:r};this._trigger("update",u,e)}return i.concat(o)},o.prototype.get=function(t){var e,i,o,n=this,r=s.getType(arguments[0]);"String"==r||"Number"==r?(e=arguments[0],o=arguments[1]):"Array"==r?(i=arguments[0],o=arguments[1]):o=arguments[0];var a;if(o&&o.returnType){var h=["Array","Object"];a=-1==h.indexOf(o.returnType)?"Array":o.returnType}else a="Array";var d,l,c,u,p,f=o&&o.type||this._options.type,m=o&&o.filter,v=[];if(void 0!=e)d=n._getItem(e,f),d&&m&&!m(d)&&(d=null);else if(void 0!=i)for(u=0,p=i.length;p>u;u++)d=n._getItem(i[u],f),m&&!m(d)||v.push(d);else for(l=Object.keys(this._data),u=0,p=l.length;p>u;u++)c=l[u],d=n._getItem(c,f),m&&!m(d)||v.push(d);if(o&&o.order&&void 0==e&&this._sort(v,o.order),o&&o.fields){var g=o.fields;if(void 0!=e)d=this._filterFields(d,g);else for(u=0,p=v.length;p>u;u++)v[u]=this._filterFields(v[u],g)}if("Object"==a){var y,b={};for(u=0,p=v.length;p>u;u++)y=v[u],b[y.id]=y;return b}return void 0!=e?d:v},o.prototype.getIds=function(t){var e,i,o,n,s,r=this._data,a=t&&t.filter,h=t&&t.order,d=t&&t.type||this._options.type,l=Object.keys(r),c=[];if(a)if(h){for(s=[],e=0,i=l.length;i>e;e++)o=l[e],n=this._getItem(o,d),a(n)&&s.push(n);for(this._sort(s,h),e=0,i=s.length;i>e;e++)c.push(s[e][this._fieldId])}else for(e=0,i=l.length;i>e;e++)o=l[e],n=this._getItem(o,d),a(n)&&c.push(n[this._fieldId]);else if(h){for(s=[],e=0,i=l.length;i>e;e++)o=l[e],s.push(r[o]);for(this._sort(s,h),e=0,i=s.length;i>e;e++)c.push(s[e][this._fieldId])}else for(e=0,i=l.length;i>e;e++)o=l[e],n=r[o],c.push(n[this._fieldId]);return c},o.prototype.getDataSet=function(){return this},o.prototype.forEach=function(t,e){var i,o,n,s,r=e&&e.filter,a=e&&e.type||this._options.type,h=this._data,d=Object.keys(h);if(e&&e.order){var l=this.get(e);for(i=0,o=l.length;o>i;i++)n=l[i],s=n[this._fieldId],t(n,s)}else for(i=0,o=d.length;o>i;i++)s=d[i],n=this._getItem(s,a),r&&!r(n)||t(n,s)},o.prototype.map=function(t,e){var i,o,n,s,r=e&&e.filter,a=e&&e.type||this._options.type,h=[],d=this._data,l=Object.keys(d);for(i=0,o=l.length;o>i;i++)n=l[i],s=this._getItem(n,a),r&&!r(s)||h.push(t(s,n));return e&&e.order&&this._sort(h,e.order),h},o.prototype._filterFields=function(t,e){if(!t)return t;var i,o,n={},s=Object.keys(t),r=s.length;if(Array.isArray(e))for(i=0;r>i;i++)o=s[i],-1!=e.indexOf(o)&&(n[o]=t[o]);else for(i=0;r>i;i++)o=s[i],e.hasOwnProperty(o)&&(n[e[o]]=t[o]);return 
n},o.prototype._sort=function(t,e){if(s.isString(e)){var i=e;t.sort(function(t,e){var o=t[i],n=e[i];return o>n?1:n>o?-1:0})}else{if("function"!=typeof e)throw new TypeError("Order must be a function or a string");t.sort(e)}},o.prototype.remove=function(t,e){var i,o,n,s=[];if(Array.isArray(t))for(i=0,o=t.length;o>i;i++)n=this._remove(t[i]),null!=n&&s.push(n);else n=this._remove(t),null!=n&&s.push(n);return s.length&&this._trigger("remove",{items:s},e),s},o.prototype._remove=function(t){if(s.isNumber(t)||s.isString(t)){if(this._data[t])return delete this._data[t],this.length--,t}else if(t instanceof Object){var e=t[this._fieldId];if(void 0!==e&&this._data[e])return delete this._data[e],this.length--,e}return null},o.prototype.clear=function(t){var e=Object.keys(this._data);return this._data={},this.length=0,this._trigger("remove",{items:e},t),e},o.prototype.max=function(t){var e,i,o=this._data,n=Object.keys(o),s=null,r=null;for(e=0,i=n.length;i>e;e++){var a=n[e],h=o[a],d=h[t];null!=d&&(!s||d>r)&&(s=h,r=d)}return s},o.prototype.min=function(t){var e,i,o=this._data,n=Object.keys(o),s=null,r=null;for(e=0,i=n.length;i>e;e++){var a=n[e],h=o[a],d=h[t];null!=d&&(!s||r>d)&&(s=h,r=d)}return s},o.prototype.distinct=function(t){var e,i,o,n=this._data,r=Object.keys(n),a=[],h=this._options.type&&this._options.type[t]||null,d=0;for(e=0,o=r.length;o>e;e++){var l=r[e],c=n[l],u=c[t],p=!1;for(i=0;d>i;i++)if(a[i]==u){p=!0;break}p||void 0===u||(a[d]=u,d++)}if(h)for(e=0,o=a.length;o>e;e++)a[e]=s.convert(a[e],h);return a},o.prototype._addItem=function(t){var e=t[this._fieldId];if(void 0!=e){if(this._data[e])throw new Error("Cannot add item: item with id "+e+" already exists")}else e=s.randomUUID(),t[this._fieldId]=e;var i,o,n={},r=Object.keys(t);for(i=0,o=r.length;o>i;i++){var a=r[i],h=this._type[a];n[a]=s.convert(t[a],h)}return this._data[e]=n,this.length++,e},o.prototype._getItem=function(t,e){var i,o,n,r,a=this._data[t];if(!a)return null;var h={},d=Object.keys(a);if(e)for(n=0,r=d.length;r>n;n++)i=d[n],o=a[i],h[i]=s.convert(o,e[i]);else for(n=0,r=d.length;r>n;n++)i=d[n],o=a[i],h[i]=o;return h},o.prototype._updateItem=function(t){var e=t[this._fieldId];if(void 0==e)throw new Error("Cannot update item: item has no id (item: "+JSON.stringify(t)+")");var i=this._data[e];if(!i)throw new Error("Cannot update item: no item with id "+e+" found");for(var o=Object.keys(t),n=0,r=o.length;r>n;n++){var a=o[n],h=this._type[a];i[a]=s.convert(t[a],h)}return e},t.exports=o},function(t,e){function i(t){this.delay=null,this.max=1/0,this._queue=[],this._timeout=null,this._extended=null,this.setOptions(t)}i.prototype.setOptions=function(t){t&&"undefined"!=typeof t.delay&&(this.delay=t.delay),t&&"undefined"!=typeof t.max&&(this.max=t.max),this._flushIfNeeded()},i.extend=function(t,e){var o=new i(e);if(void 0!==t.flush)throw new Error("Target object already has a property flush");t.flush=function(){o.flush()};var n=[{name:"flush",original:void 0}];if(e&&e.replace)for(var s=0;sthis.max&&this.flush(),clearTimeout(this._timeout),this.queue.length>0&&"number"==typeof this.delay){var t=this;this._timeout=setTimeout(function(){t.flush()},this.delay)}},i.prototype.flush=function(){for(;this._queue.length>0;){var t=this._queue.shift();t.fn.apply(t.context||t.fn,t.args||[])}},t.exports=i},function(t,e,i){function o(t,e){this._data=null,this._ids={},this.length=0,this._options=e||{},this._fieldId="id",this._subscribers={};var i=this;this.listener=function(){i._onEvent.apply(i,arguments)},this.setData(t)}var 
n=i(1),s=i(9);o.prototype.setData=function(t){var e,i,o,n;if(this._data&&(this._data.off&&this._data.off("*",this.listener),e=Object.keys(this._ids),this._ids={},this.length=0,this._trigger("remove",{items:e})),this._data=t,this._data){for(this._fieldId=this._options.fieldId||this._data&&this._data.options&&this._data.options.fieldId||"id",e=this._data.getIds({filter:this._options&&this._options.filter}),o=0,n=e.length;n>o;o++)i=e[o],this._ids[i]=!0;this.length=e.length,this._trigger("add",{items:e}),this._data.on&&this._data.on("*",this.listener)}},o.prototype.refresh=function(){var t,e,i,o=this._data.getIds({filter:this._options&&this._options.filter}),n=Object.keys(this._ids),s={},r=[],a=[];for(e=0,i=o.length;i>e;e++)t=o[e],s[t]=!0,this._ids[t]||(r.push(t),this._ids[t]=!0);for(e=0,i=n.length;i>e;e++)t=n[e],s[t]||(a.push(t),delete this._ids[t]);this.length+=r.length-a.length,r.length&&this._trigger("add",{items:r}),a.length&&this._trigger("remove",{items:a})},o.prototype.get=function(t){var e,i,o,s=this,r=n.getType(arguments[0]);"String"==r||"Number"==r||"Array"==r?(e=arguments[0],i=arguments[1],o=arguments[2]):(i=arguments[0],o=arguments[1]);var a=n.extend({},this._options,i);this._options.filter&&i&&i.filter&&(a.filter=function(t){return s._options.filter(t)&&i.filter(t)});var h=[];return void 0!=e&&h.push(e),h.push(a),h.push(o),this._data&&this._data.get.apply(this._data,h)},o.prototype.getIds=function(t){var e;if(this._data){var i,o=this._options.filter;i=t&&t.filter?o?function(e){return o(e)&&t.filter(e)}:t.filter:o,e=this._data.getIds({filter:i,order:t&&t.order})}else e=[];return e},o.prototype.map=function(t,e){var i=[];if(this._data){var o,n=this._options.filter;o=e&&e.filter?n?function(t){return n(t)&&e.filter(t)}:e.filter:n,i=this._data.map(t,{filter:o,order:e&&e.order})}else i=[];return i},o.prototype.getDataSet=function(){for(var t=this;t instanceof o;)t=t._data;return t||null},o.prototype._onEvent=function(t,e,i){var o,n,s,r,a=e&&e.items,h=this._data,d=[],l=[],c=[],u=[];if(a&&h){switch(t){case"add":for(o=0,n=a.length;n>o;o++)s=a[o],r=this.get(s),r&&(this._ids[s]=!0,l.push(s));break;case"update":for(o=0,n=a.length;n>o;o++)s=a[o],r=this.get(s),r?this._ids[s]?(c.push(s),d.push(e.data[o])):(this._ids[s]=!0,l.push(s)):this._ids[s]&&(delete this._ids[s],u.push(s));break;case"remove":for(o=0,n=a.length;n>o;o++)s=a[o],this._ids[s]&&(delete this._ids[s],u.push(s))}this.length+=l.length-u.length,l.length&&this._trigger("add",{items:l},i),c.length&&this._trigger("update",{items:c,data:d},i),u.length&&this._trigger("remove",{items:u},i)}},o.prototype.on=s.prototype.on,o.prototype.off=s.prototype.off,o.prototype._trigger=s.prototype._trigger,o.prototype.subscribe=o.prototype.on,o.prototype.unsubscribe=o.prototype.off,t.exports=o},function(t,e,i){function o(t,e,i){if(!(this instanceof o))throw new SyntaxError("Constructor must be called with the new operator");this.containerElement=t,this.width="400px",this.height="400px",this.margin=10,this.defaultXCenter="55%",this.defaultYCenter="50%",this.xLabel="x",this.yLabel="y",this.zLabel="z";var n=function(t){return t};this.xValueLabel=n,this.yValueLabel=n,this.zValueLabel=n,this.filterLabel="time",this.legendLabel="value",this.style=o.STYLE.DOT,this.showPerspective=!0,this.showGrid=!0,this.keepAspectRatio=!0,this.showShadow=!1,this.showGrayBottom=!1,this.showTooltip=!1,this.verticalRatio=.5,this.animationInterval=1e3,this.animationPreload=!1,this.camera=new p,this.camera.setArmRotation(1,.5),this.camera.setArmLength(1.7),this.eye=new 
c(0,0,-1),this.dataTable=null,this.dataPoints=null,this.colX=void 0,this.colY=void 0,this.colZ=void 0,this.colValue=void 0,this.colFilter=void 0,this.xMin=0,this.xStep=void 0,this.xMax=1,this.yMin=0,this.yStep=void 0,this.yMax=1,this.zMin=0,this.zStep=void 0,this.zMax=1,this.valueMin=0,this.valueMax=1,this.xBarWidth=1,this.yBarWidth=1,this.axisColor="#4D4D4D",this.gridColor="#D3D3D3",this.dataColor={fill:"#7DC1FF",stroke:"#3267D2",strokeWidth:1},this.dotSizeRatio=.02,this.create(),this.setOptions(i),e&&this.setData(e)}function n(t){return"clientX"in t?t.clientX:t.targetTouches[0]&&t.targetTouches[0].clientX||0}function s(t){return"clientY"in t?t.clientY:t.targetTouches[0]&&t.targetTouches[0].clientY||0}var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},a=i(13),h=i(9),d=i(11),l=i(1),c=i(14),u=i(15),p=i(16),f=i(17),m=i(18),v=i(19);a(o.prototype),o.prototype._setScale=function(){this.scale=new c(1/(this.xMax-this.xMin),1/(this.yMax-this.yMin),1/(this.zMax-this.zMin)),this.keepAspectRatio&&(this.scale.x3&&(this.colFilter=3);else{if(this.style!==o.STYLE.DOTCOLOR&&this.style!==o.STYLE.DOTSIZE&&this.style!==o.STYLE.BARCOLOR&&this.style!==o.STYLE.BARSIZE)throw'Unknown style "'+this.style+'"';this.colX=0,this.colY=1,this.colZ=2,this.colValue=3,t.getNumberOfColumns()>4&&(this.colFilter=4)}},o.prototype.getNumberOfRows=function(t){return t.length},o.prototype.getNumberOfColumns=function(t){var e=0;for(var i in t[0])t[0].hasOwnProperty(i)&&e++;return e},o.prototype.getDistinctValues=function(t,e){for(var i=[],o=0;ot[o][e]&&(i.min=t[o][e]),i.maxt;t++){var f=(t-u)/(p-u),m=240*f,g=this._hsv2rgb(m,1,1);c.strokeStyle=g,c.beginPath(),c.moveTo(h,r+t),c.lineTo(a,r+t),c.stroke()}c.strokeStyle=this.axisColor,c.strokeRect(h,r,i,s)}if(this.style===o.STYLE.DOTSIZE&&(c.strokeStyle=this.axisColor,c.fillStyle=this.dataColor.fill,c.beginPath(),c.moveTo(h,r),c.lineTo(a,r),c.lineTo(a-i+e,d),c.lineTo(h,d),c.closePath(),c.fill(),c.stroke()),this.style===o.STYLE.DOTCOLOR||this.style===o.STYLE.DOTSIZE){var y=5,b=new v(this.valueMin,this.valueMax,(this.valueMax-this.valueMin)/5,!0);for(b.start(),b.getCurrent()0?this.yMin:this.yMax,n=this._convert3Dto2D(new c(_,r,this.zMin)),Math.cos(2*w)>0?(m.textAlign="center",m.textBaseline="top",n.y+=b):Math.sin(2*w)<0?(m.textAlign="right",m.textBaseline="middle"):(m.textAlign="left",m.textBaseline="middle"),m.fillStyle=this.axisColor,m.fillText(" "+this.xValueLabel(i.getCurrent())+" ",n.x,n.y),i.next()}for(m.lineWidth=1,o=void 0===this.defaultYStep,i=new v(this.yMin,this.yMax,this.yStep,o),i.start(),i.getCurrent()0?this.xMin:this.xMax,n=this._convert3Dto2D(new c(s,i.getCurrent(),this.zMin)),Math.cos(2*w)<0?(m.textAlign="center",m.textBaseline="top",n.y+=b):Math.sin(2*w)>0?(m.textAlign="right",m.textBaseline="middle"):(m.textAlign="left",m.textBaseline="middle"),m.fillStyle=this.axisColor,m.fillText(" "+this.yValueLabel(i.getCurrent())+" ",n.x,n.y),i.next();for(m.lineWidth=1,o=void 0===this.defaultZStep,i=new v(this.zMin,this.zMax,this.zStep,o),i.start(),i.getCurrent()0?this.xMin:this.xMax,r=Math.sin(w)<0?this.yMin:this.yMax;!i.end();)t=this._convert3Dto2D(new c(s,r,i.getCurrent())),m.strokeStyle=this.axisColor,m.beginPath(),m.moveTo(t.x,t.y),m.lineTo(t.x-b,t.y),m.stroke(),m.textAlign="right",m.textBaseline="middle",m.fillStyle=this.axisColor,m.fillText(this.zValueLabel(i.getCurrent())+" ",t.x-5,t.y),i.next();m.lineWidth=1,t=this._convert3Dto2D(new 
c(s,r,this.zMin)),e=this._convert3Dto2D(new c(s,r,this.zMax)),m.strokeStyle=this.axisColor,m.beginPath(),m.moveTo(t.x,t.y),m.lineTo(e.x,e.y),m.stroke(),m.lineWidth=1,u=this._convert3Dto2D(new c(this.xMin,this.yMin,this.zMin)),p=this._convert3Dto2D(new c(this.xMax,this.yMin,this.zMin)),m.strokeStyle=this.axisColor,m.beginPath(),m.moveTo(u.x,u.y),m.lineTo(p.x,p.y),m.stroke(),u=this._convert3Dto2D(new c(this.xMin,this.yMax,this.zMin)),p=this._convert3Dto2D(new c(this.xMax,this.yMax,this.zMin)),m.strokeStyle=this.axisColor,m.beginPath(),m.moveTo(u.x,u.y),m.lineTo(p.x,p.y),m.stroke(),m.lineWidth=1,t=this._convert3Dto2D(new c(this.xMin,this.yMin,this.zMin)),e=this._convert3Dto2D(new c(this.xMin,this.yMax,this.zMin)),m.strokeStyle=this.axisColor,m.beginPath(),m.moveTo(t.x,t.y),m.lineTo(e.x,e.y),m.stroke(),t=this._convert3Dto2D(new c(this.xMax,this.yMin,this.zMin)),e=this._convert3Dto2D(new c(this.xMax,this.yMax,this.zMin)),m.strokeStyle=this.axisColor,m.beginPath(),m.moveTo(t.x,t.y),m.lineTo(e.x,e.y),m.stroke();var x=this.xLabel;x.length>0&&(l=.1/this.scale.y,s=(this.xMin+this.xMax)/2,r=Math.cos(w)>0?this.yMin-l:this.yMax+l,n=this._convert3Dto2D(new c(s,r,this.zMin)),Math.cos(2*w)>0?(m.textAlign="center",m.textBaseline="top"):Math.sin(2*w)<0?(m.textAlign="right",m.textBaseline="middle"):(m.textAlign="left",m.textBaseline="middle"),m.fillStyle=this.axisColor,m.fillText(x,n.x,n.y));var k=this.yLabel;k.length>0&&(d=.1/this.scale.x,s=Math.sin(w)>0?this.xMin-d:this.xMax+d,r=(this.yMin+this.yMax)/2,n=this._convert3Dto2D(new c(s,r,this.zMin)),Math.cos(2*w)<0?(m.textAlign="center",m.textBaseline="top"):Math.sin(2*w)>0?(m.textAlign="right",m.textBaseline="middle"):(m.textAlign="left",m.textBaseline="middle"),m.fillStyle=this.axisColor,m.fillText(k,n.x,n.y));var O=this.zLabel;O.length>0&&(h=30,s=Math.cos(w)>0?this.xMin:this.xMax,r=Math.sin(w)<0?this.yMin:this.yMax,a=(this.zMin+this.zMax)/2,n=this._convert3Dto2D(new c(s,r,a)),m.textAlign="right",m.textBaseline="middle",m.fillStyle=this.axisColor,m.fillText(O,n.x-h,n.y))},o.prototype._hsv2rgb=function(t,e,i){var o,n,s,r,a,h;switch(r=i*e,a=Math.floor(t/60),h=r*(1-Math.abs(t/60%2-1)),a){case 0:o=r,n=h,s=0;break;case 1:o=h,n=r,s=0;break;case 2:o=0,n=r,s=h;break;case 3:o=0,n=h,s=r;break;case 4:o=h,n=0,s=r;break;case 5:o=r,n=0,s=h;break;default:o=0,n=0,s=0}return"RGB("+parseInt(255*o)+","+parseInt(255*n)+","+parseInt(255*s)+")"},o.prototype._redrawDataGrid=function(){var t,e,i,n,s,r,a,h,d,l,u,p,f=this.frame.canvas,m=f.getContext("2d");if(m.lineJoin="round",m.lineCap="round",!(void 0===this.dataPoints||this.dataPoints.length<=0)){for(s=0;s0}else r=!0;r?(p=(t.point.z+e.point.z+i.point.z+n.point.z)/4,d=240*(1-(p-this.zMin)*this.scale.z/this.verticalRatio),l=1,this.showShadow?(u=Math.min(1+x.x/k/2,1),a=this._hsv2rgb(d,l,u),h=a):(u=1,a=this._hsv2rgb(d,l,u),h=this.axisColor)):(a="gray",h=this.axisColor),m.lineWidth=this._getStrokeWidth(t),m.fillStyle=a,m.strokeStyle=h,m.beginPath(),m.moveTo(t.screen.x,t.screen.y),m.lineTo(e.screen.x,e.screen.y),m.lineTo(n.screen.x,n.screen.y),m.lineTo(i.screen.x,i.screen.y),m.closePath(),m.fill(),m.stroke()}}else for(s=0;su&&(u=0);var 
p,f,m;this.style===o.STYLE.DOTCOLOR?(p=240*(1-(d.point.value-this.valueMin)*this.scale.value),f=this._hsv2rgb(p,1,1),m=this._hsv2rgb(p,1,.8)):this.style===o.STYLE.DOTSIZE?(f=this.dataColor.fill,m=this.dataColor.stroke):(p=240*(1-(d.point.z-this.zMin)*this.scale.z/this.verticalRatio),f=this._hsv2rgb(p,1,1),m=this._hsv2rgb(p,1,.8)),i.lineWidth=this._getStrokeWidth(d),i.strokeStyle=m,i.fillStyle=f,i.beginPath(),i.arc(d.screen.x,d.screen.y,u,0,2*Math.PI,!0),i.fill(),i.stroke()}}},o.prototype._redrawDataBar=function(){var t,e,i,n,s=this.frame.canvas,r=s.getContext("2d");if(!(void 0===this.dataPoints||this.dataPoints.length<=0)){for(t=0;t0){for(t=this.dataPoints[0],o.lineWidth=this._getStrokeWidth(t),o.lineJoin="round",o.lineCap="round",o.strokeStyle=this.dataColor.stroke,o.beginPath(),o.moveTo(t.screen.x,t.screen.y),e=1;e0?1:0>t?-1:0}var o=e[0],n=e[1],s=e[2],r=i((n.x-o.x)*(t.y-o.y)-(n.y-o.y)*(t.x-o.x)),a=i((s.x-n.x)*(t.y-n.y)-(s.y-n.y)*(t.x-n.x)),h=i((o.x-s.x)*(t.y-s.y)-(o.y-s.y)*(t.x-s.x));return!(0!=r&&0!=a&&r!=a||0!=a&&0!=h&&a!=h||0!=r&&0!=h&&r!=h)},o.prototype._dataPointFromXY=function(t,e){var i,n=100,s=null,r=null,a=null,h=new u(t,e);if(this.style===o.STYLE.BAR||this.style===o.STYLE.BARCOLOR||this.style===o.STYLE.BARSIZE)for(i=this.dataPoints.length-1;i>=0;i--){s=this.dataPoints[i];var d=s.surfaces;if(d)for(var l=d.length-1;l>=0;l--){var c=d[l],p=c.corners,f=[p[0].screen,p[1].screen,p[2].screen],m=[p[2].screen,p[3].screen,p[0].screen];if(this._insideTriangle(h,f)||this._insideTriangle(h,m))return s}}else for(i=0;ib)&&n>b&&(a=b,r=s)}}return r},o.prototype._showTooltip=function(t){var e,i,o;this.tooltip?(e=this.tooltip.dom.content,i=this.tooltip.dom.line,o=this.tooltip.dom.dot):(e=document.createElement("div"),e.style.position="absolute",e.style.padding="10px",e.style.border="1px solid #4d4d4d",e.style.color="#1a1a1a",e.style.background="rgba(255,255,255,0.7)",e.style.borderRadius="2px",e.style.boxShadow="5px 5px 10px rgba(128,128,128,0.5)",i=document.createElement("div"),i.style.position="absolute",i.style.height="40px",i.style.width="0",i.style.borderLeft="1px solid #4d4d4d",o=document.createElement("div"),o.style.position="absolute",o.style.height="0",o.style.width="0",o.style.border="5px solid #4d4d4d",o.style.borderRadius="5px",this.tooltip={dataPoint:null,dom:{content:e,line:i,dot:o}}),this._hideTooltip(),this.tooltip.dataPoint=t,"function"==typeof this.showTooltip?e.innerHTML=this.showTooltip(t.point):e.innerHTML="
"+this.xLabel+":"+t.point.x+"
"+this.yLabel+":"+t.point.y+"
"+this.zLabel+":"+t.point.z+"
",e.style.left="0",e.style.top="0",this.frame.appendChild(e),this.frame.appendChild(i),this.frame.appendChild(o);var n=e.offsetWidth,s=e.offsetHeight,r=i.offsetHeight,a=o.offsetWidth,h=o.offsetHeight,d=t.screen.x-n/2;d=Math.min(Math.max(d,10),this.frame.clientWidth-10-n),i.style.left=t.screen.x+"px",i.style.top=t.screen.y-r+"px",e.style.left=d+"px",e.style.top=t.screen.y-r-s+"px",o.style.left=t.screen.x-a/2+"px",o.style.top=t.screen.y-h/2+"px"},o.prototype._hideTooltip=function(){if(this.tooltip){this.tooltip.dataPoint=null;for(var t in this.tooltip.dom)if(this.tooltip.dom.hasOwnProperty(t)){var e=this.tooltip.dom[t];e&&e.parentNode&&e.parentNode.removeChild(e)}}},t.exports=o},function(t,e){function i(t){return t?o(t):void 0}function o(t){for(var e in i.prototype)t[e]=i.prototype[e];return t}t.exports=i,i.prototype.on=i.prototype.addEventListener=function(t,e){return this._callbacks=this._callbacks||{},(this._callbacks[t]=this._callbacks[t]||[]).push(e),this},i.prototype.once=function(t,e){function i(){o.off(t,i),e.apply(this,arguments)}var o=this;return this._callbacks=this._callbacks||{},i.fn=e,this.on(t,i),this},i.prototype.off=i.prototype.removeListener=i.prototype.removeAllListeners=i.prototype.removeEventListener=function(t,e){if(this._callbacks=this._callbacks||{},0==arguments.length)return this._callbacks={},this;var i=this._callbacks[t];if(!i)return this;if(1==arguments.length)return delete this._callbacks[t],this;for(var o,n=0;no;++o)i[o].apply(this,e)}return this},i.prototype.listeners=function(t){return this._callbacks=this._callbacks||{},this._callbacks[t]||[]},i.prototype.hasListeners=function(t){return!!this.listeners(t).length}},function(t,e){function i(t,e,i){this.x=void 0!==t?t:0,this.y=void 0!==e?e:0,this.z=void 0!==i?i:0}i.subtract=function(t,e){var o=new i;return o.x=t.x-e.x,o.y=t.y-e.y,o.z=t.z-e.z,o},i.add=function(t,e){var o=new i;return o.x=t.x+e.x,o.y=t.y+e.y,o.z=t.z+e.z,o},i.avg=function(t,e){return new i((t.x+e.x)/2,(t.y+e.y)/2,(t.z+e.z)/2)},i.crossProduct=function(t,e){var o=new i;return o.x=t.y*e.z-t.z*e.y,o.y=t.z*e.x-t.x*e.z,o.z=t.x*e.y-t.y*e.x,o},i.prototype.length=function(){return Math.sqrt(this.x*this.x+this.y*this.y+this.z*this.z)},t.exports=i},function(t,e){function i(t,e){this.x=void 0!==t?t:0,this.y=void 0!==e?e:0}t.exports=i},function(t,e,i){function o(){this.armLocation=new n,this.armRotation={},this.armRotation.horizontal=0,this.armRotation.vertical=0,this.armLength=1.7,this.cameraLocation=new n,this.cameraRotation=new n(.5*Math.PI,0,0),this.calculateCameraOrientation()}var n=i(14);o.prototype.setArmLocation=function(t,e,i){this.armLocation.x=t,this.armLocation.y=e,this.armLocation.z=i,this.calculateCameraOrientation()},o.prototype.setArmRotation=function(t,e){void 0!==t&&(this.armRotation.horizontal=t),void 0!==e&&(this.armRotation.vertical=e,this.armRotation.vertical<0&&(this.armRotation.vertical=0),this.armRotation.vertical>.5*Math.PI&&(this.armRotation.vertical=.5*Math.PI)),void 0===t&&void 0===e||this.calculateCameraOrientation()},o.prototype.getArmRotation=function(){var t={};return t.horizontal=this.armRotation.horizontal,t.vertical=this.armRotation.vertical,t},o.prototype.setArmLength=function(t){void 0!==t&&(this.armLength=t,this.armLength<.71&&(this.armLength=.71),this.armLength>5&&(this.armLength=5),this.calculateCameraOrientation())},o.prototype.getArmLength=function(){return this.armLength},o.prototype.getCameraLocation=function(){return this.cameraLocation},o.prototype.getCameraRotation=function(){return 
this.cameraRotation},o.prototype.calculateCameraOrientation=function(){this.cameraLocation.x=this.armLocation.x-this.armLength*Math.sin(this.armRotation.horizontal)*Math.cos(this.armRotation.vertical),this.cameraLocation.y=this.armLocation.y-this.armLength*Math.cos(this.armRotation.horizontal)*Math.cos(this.armRotation.vertical),this.cameraLocation.z=this.armLocation.z+this.armLength*Math.sin(this.armRotation.vertical),this.cameraRotation.x=Math.PI/2-this.armRotation.vertical,this.cameraRotation.y=0,this.cameraRotation.z=-this.armRotation.horizontal},t.exports=o},function(t,e,i){function o(t,e,i){this.data=t,this.column=e,this.graph=i,this.index=void 0,this.value=void 0,this.values=i.getDistinctValues(t.get(),this.column),this.values.sort(function(t,e){return t>e?1:e>t?-1:0}),this.values.length>0&&this.selectValue(0),this.dataPoints=[],this.loaded=!1,this.onLoadCallback=void 0,i.animationPreload?(this.loaded=!1,this.loadInBackground()):this.loaded=!0}var n=i(11);o.prototype.isLoaded=function(){return this.loaded},o.prototype.getLoadedProgress=function(){for(var t=this.values.length,e=0;this.dataPoints[e];)e++;return Math.round(e/t*100)},o.prototype.getLabel=function(){return this.graph.filterLabel},o.prototype.getColumn=function(){return this.column},o.prototype.getSelectedValue=function(){return void 0!==this.index?this.values[this.index]:void 0},o.prototype.getValues=function(){return this.values},o.prototype.getValue=function(t){if(t>=this.values.length)throw"Error: index out of range";return this.values[t]},o.prototype._getDataPoints=function(t){if(void 0===t&&(t=this.index),void 0===t)return[];var e;if(this.dataPoints[t])e=this.dataPoints[t];else{var i={};i.column=this.column,i.value=this.values[t];var o=new n(this.data,{filter:function(t){return t[i.column]==i.value}}).get();e=this.graph._getDataPoints(o),this.dataPoints[t]=e}return e},o.prototype.setOnLoadCallback=function(t){this.onLoadCallback=t},o.prototype.selectValue=function(t){if(t>=this.values.length)throw"Error: index out of range";this.index=t,this.value=this.values[t]},o.prototype.loadInBackground=function(t){void 0===t&&(t=0);var e=this.graph.frame;if(t0&&(t--,this.setIndex(t))},o.prototype.next=function(){var t=this.getIndex();t0?this.setIndex(0):this.index=void 0},o.prototype.setIndex=function(t){if(!(to&&(o=0),o>this.values.length-1&&(o=this.values.length-1),o},o.prototype.indexToLeft=function(t){var e=parseFloat(this.frame.bar.style.width)-this.frame.slide.clientWidth-10,i=t/(this.values.length-1)*e,o=i+3;return o},o.prototype._onMouseMove=function(t){var e=t.clientX-this.startClientX,i=this.startSlideX+e,o=this.leftToIndex(i);this.setIndex(o),n.preventDefault()},o.prototype._onMouseUp=function(t){this.frame.style.cursor="auto",n.removeEventListener(document,"mousemove",this.onmousemove),n.removeEventListener(document,"mouseup",this.onmouseup),n.preventDefault()},t.exports=o},function(t,e){function i(t,e,i,o){this._start=0,this._end=0,this._step=1,this.prettyStep=!0,this.precision=5,this._current=0,this.setRange(t,e,i,o)}i.prototype.setRange=function(t,e,i,o){this._start=t?t:0,this._end=e?e:0,this.setStep(i,o)},i.prototype.setStep=function(t,e){void 0===t||0>=t||(void 0!==e&&(this.prettyStep=e),this.prettyStep===!0?this._step=i.calculatePrettyStep(t):this._step=t)},i.calculatePrettyStep=function(t){var e=function(t){return Math.log(t)/Math.LN10},i=Math.pow(10,Math.round(e(t))),o=2*Math.pow(10,Math.round(e(t/2))),n=5*Math.pow(10,Math.round(e(t/5))),s=i;return 
Math.abs(o-t)<=Math.abs(s-t)&&(s=o),Math.abs(n-t)<=Math.abs(s-t)&&(s=n),0>=s&&(s=1),s},i.prototype.getCurrent=function(){return parseFloat(this._current.toPrecision(this.precision))},i.prototype.getStep=function(){return this._step},i.prototype.start=function(){this._current=this._start-this._start%this._step},i.prototype.next=function(){this._current+=this._step},i.prototype.end=function(){return this._current>this._end},t.exports=i},function(t,e,i){if("undefined"!=typeof window){var o=i(21),n=window.Hammer||i(22);t.exports=o(n,{preventDefault:"mouse"})}else t.exports=function(){throw Error("hammer.js is only available in a browser, not in node.js.")}},function(t,e,i){var o,n,s;!function(i){n=[],o=i,s="function"==typeof o?o.apply(e,n):o,!(void 0!==s&&(t.exports=s))}(function(){var t=null;return function e(i,o){function n(t){return t.match(/[^ ]+/g)}function s(e){if("hammer.input"!==e.type){if(e.srcEvent._handled||(e.srcEvent._handled={}),e.srcEvent._handled[e.type])return;e.srcEvent._handled[e.type]=!0}var i=!1;e.stopPropagation=function(){i=!0};var o=e.srcEvent.stopPropagation.bind(e.srcEvent);"function"==typeof o&&(e.srcEvent.stopPropagation=function(){o(),e.stopPropagation()}),e.firstTarget=t;for(var n=t;n&&!i;){var s=n.hammer;if(s)for(var r,a=0;a0?d._handlers[t]=o:(i.off(t,s),delete d._handlers[t]))}),d},d.emit=function(e,o){t=o.target,i.emit(e,o)},d.destroy=function(){var t=i.element.hammer,e=t.indexOf(d);-1!==e&&t.splice(e,1),t.length||delete i.element.hammer,d._handlers={},i.destroy()},d}})},function(t,e,i){var o;!function(n,s,r,a){function h(t,e,i){return setTimeout(p(t,i),e)}function d(t,e,i){return Array.isArray(t)?(l(t,i[e],i),!0):!1}function l(t,e,i){var o;if(t)if(t.forEach)t.forEach(e,i);else if(t.length!==a)for(o=0;o\s*\(/gm,"{anonymous}()@"):"Unknown Stack Trace",s=n.console&&(n.console.warn||n.console.log);return s&&s.call(n.console,o,i),t.apply(this,arguments)}}function u(t,e,i){var o,n=e.prototype;o=t.prototype=Object.create(n),o.constructor=t,o._super=n,i&&ct(o,i)}function p(t,e){return function(){return t.apply(e,arguments)}}function f(t,e){return typeof t==ft?t.apply(e?e[0]||a:a,e):t}function m(t,e){return t===a?e:t}function v(t,e,i){l(w(e),function(e){t.addEventListener(e,i,!1)})}function g(t,e,i){l(w(e),function(e){t.removeEventListener(e,i,!1)})}function y(t,e){for(;t;){if(t==e)return!0;t=t.parentNode}return!1}function b(t,e){return t.indexOf(e)>-1}function w(t){return t.trim().split(/\s+/g)}function _(t,e,i){if(t.indexOf&&!i)return t.indexOf(e);for(var o=0;oi[e]}):o.sort()),o}function O(t,e){for(var i,o,n=e[0].toUpperCase()+e.slice(1),s=0;s1&&!i.firstMultiple?i.firstMultiple=N(e):1===n&&(i.firstMultiple=!1);var s=i.firstInput,r=i.firstMultiple,a=r?r.center:s.center,h=e.center=R(o);e.timeStamp=gt(),e.deltaTime=e.timeStamp-s.timeStamp,e.angle=B(a,h),e.distance=A(a,h),P(i,e),e.offsetDirection=L(e.deltaX,e.deltaY);var d=z(e.deltaTime,e.deltaX,e.deltaY);e.overallVelocityX=d.x,e.overallVelocityY=d.y,e.overallVelocity=vt(d.x)>vt(d.y)?d.x:d.y,e.scale=r?j(r.pointers,o):1,e.rotation=r?F(r.pointers,o):0,e.maxPointers=i.prevInput?e.pointers.length>i.prevInput.maxPointers?e.pointers.length:i.prevInput.maxPointers:e.pointers.length,I(i,e);var l=t.element;y(e.srcEvent.target,l)&&(l=e.srcEvent.target),e.target=l}function P(t,e){var i=e.center,o=t.offsetDelta||{},n=t.prevDelta||{},s=t.prevInput||{};e.eventType!==Et&&s.eventType!==It||(n=t.prevDelta={x:s.deltaX||0,y:s.deltaY||0},o=t.offsetDelta={x:i.x,y:i.y}),e.deltaX=n.x+(i.x-o.x),e.deltaY=n.y+(i.y-o.y)}function I(t,e){var 
i,o,n,s,r=t.lastInterval||e,h=e.timeStamp-r.timeStamp;if(e.eventType!=Nt&&(h>Tt||r.velocity===a)){var d=e.deltaX-r.deltaX,l=e.deltaY-r.deltaY,c=z(h,d,l);o=c.x,n=c.y,i=vt(c.x)>vt(c.y)?c.x:c.y,s=L(d,l),t.lastInterval=e}else i=r.velocity,o=r.velocityX,n=r.velocityY,s=r.direction;e.velocity=i,e.velocityX=o,e.velocityY=n,e.direction=s}function N(t){for(var e=[],i=0;in;)i+=t[n].clientX,o+=t[n].clientY,n++;return{x:mt(i/e),y:mt(o/e)}}function z(t,e,i){return{x:e/t||0,y:i/t||0}}function L(t,e){return t===e?Rt:vt(t)>=vt(e)?0>t?zt:Lt:0>e?At:Bt}function A(t,e,i){i||(i=Wt);var o=e[i[0]]-t[i[0]],n=e[i[1]]-t[i[1]];return Math.sqrt(o*o+n*n)}function B(t,e,i){i||(i=Wt);var o=e[i[0]]-t[i[0]],n=e[i[1]]-t[i[1]];return 180*Math.atan2(n,o)/Math.PI}function F(t,e){return B(e[1],e[0],Yt)+B(t[1],t[0],Yt)}function j(t,e){return A(e[0],e[1],Yt)/A(t[0],t[1],Yt)}function H(){this.evEl=Vt,this.evWin=Ut,this.allow=!0,this.pressed=!1,S.apply(this,arguments)}function W(){this.evEl=Zt,this.evWin=Kt,S.apply(this,arguments),this.store=this.manager.session.pointerEvents=[]}function Y(){this.evTarget=Qt,this.evWin=$t,this.started=!1,S.apply(this,arguments)}function G(t,e){var i=x(t.touches),o=x(t.changedTouches);return e&(It|Nt)&&(i=k(i.concat(o),"identifier",!0)),[i,o]}function V(){this.evTarget=ee,this.targetIds={},S.apply(this,arguments)}function U(t,e){var i=x(t.touches),o=this.targetIds;if(e&(Et|Pt)&&1===i.length)return o[i[0].identifier]=!0,[i,i];var n,s,r=x(t.changedTouches),a=[],h=this.target;if(s=i.filter(function(t){return y(t.target,h)}),e===Et)for(n=0;na&&(e.push(t),a=e.length-1):n&(It|Nt)&&(i=!0),0>a||(e[a]=t,this.callback(this.manager,n,{pointers:e,changedPointers:[t],pointerType:s,srcEvent:t}),i&&e.splice(a,1))}});var Jt={touchstart:Et,touchmove:Pt,touchend:It,touchcancel:Nt},Qt="touchstart",$t="touchstart touchmove touchend touchcancel";u(Y,S,{handler:function(t){var e=Jt[t.type];if(e===Et&&(this.started=!0),this.started){var i=G.call(this,t,e);e&(It|Nt)&&i[0].length-i[1].length===0&&(this.started=!1),this.callback(this.manager,e,{pointers:i[0],changedPointers:i[1],pointerType:Mt,srcEvent:t})}}});var te={touchstart:Et,touchmove:Pt,touchend:It,touchcancel:Nt},ee="touchstart touchmove touchend touchcancel";u(V,S,{handler:function(t){var e=te[t.type],i=U.call(this,t,e);i&&this.callback(this.manager,e,{pointers:i[0],changedPointers:i[1],pointerType:Mt,srcEvent:t})}}),u(q,S,{handler:function(t,e,i){var o=i.pointerType==Mt,n=i.pointerType==St;if(o)this.mouse.allow=!1;else if(n&&!this.mouse.allow)return;e&(It|Nt)&&(this.mouse.allow=!0),this.callback(t,e,i)},destroy:function(){this.touch.destroy(),this.mouse.destroy()}});var ie=O(pt.style,"touchAction"),oe=ie!==a,ne="compute",se="auto",re="manipulation",ae="none",he="pan-x",de="pan-y";X.prototype={set:function(t){t==ne&&(t=this.compute()),oe&&this.manager.element.style&&(this.manager.element.style[ie]=t),this.actions=t.toLowerCase().trim()},update:function(){this.set(this.manager.options.touchAction)},compute:function(){var t=[];return l(this.manager.recognizers,function(e){f(e.options.enable,[e])&&(t=t.concat(e.getTouchAction()))}),Z(t.join(" "))},preventDefaults:function(t){if(!oe){var e=t.srcEvent,i=t.offsetDirection;if(this.manager.session.prevented)return void e.preventDefault();var o=this.actions,n=b(o,ae),s=b(o,de),r=b(o,he);if(n){var a=1===t.pointers.length,h=t.distance<2,d=t.deltaTime<250;if(a&&h&&d)return}if(!r||!s)return n||s&&i&Ft||r&&i&jt?this.preventSrc(e):void 0}},preventSrc:function(t){this.manager.session.prevented=!0,t.preventDefault()}};var 
le=1,ce=2,ue=4,pe=8,fe=pe,me=16,ve=32;K.prototype={defaults:{},set:function(t){return ct(this.options,t),this.manager&&this.manager.touchAction.update(),this},recognizeWith:function(t){if(d(t,"recognizeWith",this))return this;var e=this.simultaneous;return t=$(t,this),e[t.id]||(e[t.id]=t,t.recognizeWith(this)),this},dropRecognizeWith:function(t){return d(t,"dropRecognizeWith",this)?this:(t=$(t,this),delete this.simultaneous[t.id],this)},requireFailure:function(t){if(d(t,"requireFailure",this))return this;var e=this.requireFail;return t=$(t,this),-1===_(e,t)&&(e.push(t),t.requireFailure(this)),this},dropRequireFailure:function(t){if(d(t,"dropRequireFailure",this))return this;t=$(t,this);var e=_(this.requireFail,t);return e>-1&&this.requireFail.splice(e,1),this},hasRequireFailures:function(){return this.requireFail.length>0},canRecognizeWith:function(t){return!!this.simultaneous[t.id]},emit:function(t){function e(e){i.manager.emit(e,t)}var i=this,o=this.state;pe>o&&e(i.options.event+J(o)),e(i.options.event),t.additionalEvent&&e(t.additionalEvent),o>=pe&&e(i.options.event+J(o))},tryEmit:function(t){return this.canEmit()?this.emit(t):void(this.state=ve)},canEmit:function(){for(var t=0;ts?zt:Lt,i=s!=this.pX,o=Math.abs(t.deltaX)):(n=0===r?Rt:0>r?At:Bt,i=r!=this.pY,o=Math.abs(t.deltaY))),t.direction=n,i&&o>e.threshold&&n&e.direction},attrTest:function(t){return tt.prototype.attrTest.call(this,t)&&(this.state&ce||!(this.state&ce)&&this.directionTest(t))},emit:function(t){this.pX=t.deltaX,this.pY=t.deltaY;var e=Q(t.direction);e&&(t.additionalEvent=this.options.event+e),this._super.emit.call(this,t)}}),u(it,tt,{defaults:{event:"pinch",threshold:0,pointers:2},getTouchAction:function(){return[ae]},attrTest:function(t){return this._super.attrTest.call(this,t)&&(Math.abs(t.scale-1)>this.options.threshold||this.state&ce)},emit:function(t){if(1!==t.scale){var e=t.scale<1?"in":"out";t.additionalEvent=this.options.event+e}this._super.emit.call(this,t)}}),u(ot,K,{defaults:{event:"press",pointers:1,time:251,threshold:9},getTouchAction:function(){return[se]},process:function(t){var e=this.options,i=t.pointers.length===e.pointers,o=t.distancee.time;if(this._input=t,!o||!i||t.eventType&(It|Nt)&&!n)this.reset();else if(t.eventType&Et)this.reset(),this._timer=h(function(){this.state=fe,this.tryEmit()},e.time,this);else if(t.eventType&It)return fe;return ve},reset:function(){clearTimeout(this._timer)},emit:function(t){this.state===fe&&(t&&t.eventType&It?this.manager.emit(this.options.event+"up",t):(this._input.timeStamp=gt(),this.manager.emit(this.options.event,this._input)))}}),u(nt,tt,{defaults:{event:"rotate",threshold:0,pointers:2},getTouchAction:function(){return[ae]},attrTest:function(t){return this._super.attrTest.call(this,t)&&(Math.abs(t.rotation)>this.options.threshold||this.state&ce)}}),u(st,tt,{defaults:{event:"swipe",threshold:10,velocity:.3,direction:Ft|jt,pointers:1},getTouchAction:function(){return et.prototype.getTouchAction.call(this)},attrTest:function(t){var e,i=this.options.direction;return i&(Ft|jt)?e=t.overallVelocity:i&Ft?e=t.overallVelocityX:i&jt&&(e=t.overallVelocityY),this._super.attrTest.call(this,t)&&i&t.offsetDirection&&t.distance>this.options.threshold&&t.maxPointers==this.options.pointers&&vt(e)>this.options.velocity&&t.eventType&It},emit:function(t){var 
e=Q(t.offsetDirection);e&&this.manager.emit(this.options.event+e,t),this.manager.emit(this.options.event,t)}}),u(rt,K,{defaults:{event:"tap",pointers:1,taps:1,interval:300,time:250,threshold:9,posThreshold:10},getTouchAction:function(){return[re]},process:function(t){var e=this.options,i=t.pointers.length===e.pointers,o=t.distance=e;e++)r[String.fromCharCode(e)]={code:65+(e-97),shift:!1};for(e=65;90>=e;e++)r[String.fromCharCode(e)]={code:e,shift:!0};for(e=0;9>=e;e++)r[""+e]={code:48+e,shift:!1};for(e=1;12>=e;e++)r["F"+e]={code:111+e,shift:!1};for(e=0;9>=e;e++)r["num"+e]={code:96+e,shift:!1};r["num*"]={code:106,shift:!1},r["num+"]={code:107,shift:!1},r["num-"]={code:109,shift:!1},r["num/"]={code:111,shift:!1},r["num."]={code:110,shift:!1},r.left={code:37,shift:!1},r.up={code:38,shift:!1},r.right={code:39,shift:!1},r.down={code:40,shift:!1},r.space={code:32,shift:!1},r.enter={code:13,shift:!1},r.shift={code:16,shift:void 0},r.esc={code:27,shift:!1},r.backspace={code:8,shift:!1},r.tab={code:9,shift:!1},r.ctrl={code:17,shift:!1},r.alt={code:18,shift:!1},r["delete"]={code:46,shift:!1},r.pageup={code:33,shift:!1},r.pagedown={code:34,shift:!1},r["="]={code:187,shift:!1},r["-"]={code:189,shift:!1},r["]"]={code:221,shift:!1},r["["]={code:219,shift:!1};var a=function(t){d(t,"keydown")},h=function(t){d(t,"keyup")},d=function(t,e){if(void 0!==s[e][t.keyCode]){for(var o=s[e][t.keyCode],n=0;ne)&&(n=e),(null===s||i>s)&&(s=i)}),null!==n&&null!==s){var r=(n+s)/2,a=Math.max(this.range.end-this.range.start,1.1*(s-n)),h=e&&void 0!==e.animation?e.animation:!0;this.range.setRange(r-a/2,r+a/2,h)}}},n.prototype.fit=function(t){var e,i=t&&void 0!==t.animation?t.animation:!0,o=this.itemsData&&this.itemsData.getDataSet();1===o.length&&void 0===o.get()[0].end?(e=this.getDataRange(),this.moveTo(e.min.valueOf(),{animation:i})):(e=this.getItemRange(),this.range.setRange(e.min,e.max,i))},n.prototype.getItemRange=function(){var t=this,e=this.getDataRange(),i=null!==e.min?e.min.valueOf():null,o=null!==e.max?e.max.valueOf():null,n=null,s=null;if(null!=i&&null!=o){var r,a,h,d,c;!function(){var e=function(t){return l.convert(t.data.start,"Date").valueOf()},u=function(t){var e=void 0!=t.data.end?t.data.end:t.data.start;return l.convert(e,"Date").valueOf()};r=o-i,0>=r&&(r=10),a=r/t.props.center.width,l.forEach(t.itemSet.items,function(t){t.show(),t.repositionX();var r=e(t),h=u(t);if(this.options.rtl)var d=r-(t.getWidthRight()+10)*a,l=h+(t.getWidthLeft()+10)*a;else var d=r-(t.getWidthLeft()+10)*a,l=h+(t.getWidthRight()+10)*a;i>d&&(i=d,n=t),l>o&&(o=l,s=t)}.bind(t)),n&&s&&(h=n.getWidthLeft()+10,d=s.getWidthRight()+10,c=t.props.center.width-h-d,c>0&&(t.options.rtl?(i=e(n)-d*r/c,o=u(s)+h*r/c):(i=e(n)-h*r/c,o=u(s)+d*r/c)))}()}return{min:null!=i?new Date(i):null,max:null!=o?new Date(o):null}},n.prototype.getDataRange=function(){var t=null,e=null,i=this.itemsData&&this.itemsData.getDataSet();return i&&i.forEach(function(i){var o=l.convert(i.start,"Date").valueOf(),n=l.convert(void 0!=i.end?i.end:i.start,"Date").valueOf();(null===t||t>o)&&(t=o),(null===e||n>e)&&(e=n)}),{min:null!=t?new Date(t):null,max:null!=e?new Date(e):null}},n.prototype.getEventProperties=function(t){var e=t.center?t.center.x:t.clientX,i=t.center?t.center.y:t.clientY;if(this.options.rtl)var o=l.getAbsoluteRight(this.dom.centerContainer)-e;else var o=e-l.getAbsoluteLeft(this.dom.centerContainer);var 
n=i-l.getAbsoluteTop(this.dom.centerContainer),s=this.itemSet.itemFromTarget(t),r=this.itemSet.groupFromTarget(t),a=g.customTimeFromTarget(t),h=this.itemSet.options.snap||null,d=this.body.util.getScale(),c=this.body.util.getStep(),u=this._toTime(o),p=h?h(u,d,c):u,f=l.getTarget(t),m=null;return null!=s?m="item":null!=a?m="custom-time":l.hasParent(f,this.timeAxis.dom.foreground)?m="axis":this.timeAxis2&&l.hasParent(f,this.timeAxis2.dom.foreground)?m="axis":l.hasParent(f,this.itemSet.dom.labelSet)?m="group-label":l.hasParent(f,this.currentTime.bar)?m="current-time":l.hasParent(f,this.dom.center)&&(m="background"),{event:t,item:s?s.id:null,group:r?r.groupId:null,what:m,pageX:t.srcEvent?t.srcEvent.pageX:t.pageX,pageY:t.srcEvent?t.srcEvent.pageY:t.pageY,x:o,y:n,time:u,snappedTime:p}},t.exports=n},function(t,e,i){function o(t){return t&&t.__esModule?t:{"default":t}}function n(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}Object.defineProperty(e,"__esModule",{value:!0});var s="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},r=function(){function t(t,e){for(var i=0;i0&&this._makeItem([]),this._makeHeader(n),this._handleObject(this.configureOptions[n],[n])),i++);this.options.showButton===!0&&!function(){var e=document.createElement("div");e.className="vis-configuration vis-config-button",e.innerHTML="generate options",e.onclick=function(){t._printOptions()},e.onmouseover=function(){e.className="vis-configuration vis-config-button hover"},e.onmouseout=function(){e.className="vis-configuration vis-config-button"},t.optionsContainer=document.createElement("div"),t.optionsContainer.className="vis-configuration vis-config-option-container",t.domElements.push(t.optionsContainer),t.domElements.push(e)}(),this._push()}},{key:"_push",value:function(){this.wrapper=document.createElement("div"),this.wrapper.className="vis-configuration-wrapper",this.container.appendChild(this.wrapper);for(var t=0;t1?o-1:0),r=1;o>r;r++)n[r-1]=e[r];return n.forEach(function(t){s.appendChild(t)}),i.domElements.push(s),{v:i.domElements.length}}();if("object"===("undefined"==typeof a?"undefined":s(a)))return a.v}return 0}},{key:"_makeHeader",value:function(t){var e=document.createElement("div");e.className="vis-configuration vis-config-header",e.innerHTML=t,this._makeItem([],e)}},{key:"_makeLabel",value:function(t,e){var i=arguments.length<=2||void 0===arguments[2]?!1:arguments[2],o=document.createElement("div");return o.className="vis-configuration vis-config-label vis-config-s"+e.length,i===!0?o.innerHTML=""+t+":":o.innerHTML=t+":",o}},{key:"_makeDropdown",value:function(t,e,i){var o=document.createElement("select");o.className="vis-configuration vis-config-select";var n=0;void 0!==e&&-1!==t.indexOf(e)&&(n=t.indexOf(e));for(var s=0;se&&n>e*c?(a.min=Math.ceil(e*c),l=a.min,d="range increased"):n>e/c&&(a.min=Math.ceil(e/c),l=a.min,d="range increased"),e*c>s&&1!==s&&(a.max=Math.ceil(e*c),l=a.max,d="range increased"),a.value=e}else a.value=o;var u=document.createElement("input");u.className="vis-configuration vis-config-rangeinput",u.value=a.value;var p=this;a.onchange=function(){u.value=this.value,p._update(Number(this.value),i)},a.oninput=function(){u.value=this.value};var f=this._makeLabel(i[i.length-1],i),m=this._makeItem(i,f,a,u);""!==d&&this.popupHistory[m]!==l&&(this.popupHistory[m]=l,this._setupPopup(d,m))}},{key:"_setupPopup",value:function(t,e){var 
i=this;if(this.initialized===!0&&this.allowCreation===!0&&this.popupCountervar options = "+JSON.stringify(t,null,2)+""}},{key:"getOptions",value:function(){for(var t={},e=0;es;s++)for(r=0;rp?p+1:p;var f=l/this.r,m=a.RGBToHSV(this.color.r,this.color.g,this.color.b);m.h=p,m.s=f;var v=a.HSVToRGB(m.h,m.s,m.v);v.a=this.color.a,this.color=v,this.initialColorDiv.style.backgroundColor="rgba("+this.initialColor.r+","+this.initialColor.g+","+this.initialColor.b+","+this.initialColor.a+")",this.newColorDiv.style.backgroundColor="rgba("+this.color.r+","+this.color.g+","+this.color.b+","+this.color.a+")"}}]),t}();e["default"]=h},function(t,e,i){i(20);e.onTouch=function(t,e){e.inputHandler=function(t){t.isFirst&&e(t)},t.on("hammer.input",e.inputHandler)},e.onRelease=function(t,e){return e.inputHandler=function(t){t.isFinal&&e(t)},t.on("hammer.input",e.inputHandler)},e.offTouch=function(t,e){t.off("hammer.input",e.inputHandler)},e.offRelease=e.offTouch,e.disablePreventDefaultVertically=function(t){var e="pan-y";return t.getTouchAction=function(){return[e]},t}},function(t,e,i){function o(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}Object.defineProperty(e,"__esModule",{value:!0});var n="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},s=function(){function t(t,e){for(var i=0;is.distance?console.log('%cUnknown option detected: "'+e+'" in '+t.printLocation(n.path,e,"")+"Perhaps it was misplaced? Matching option found at: "+t.printLocation(s.path,s.closestMatch,""),d):n.distance<=r?console.log('%cUnknown option detected: "'+e+'". Did you mean "'+n.closestMatch+'"?'+t.printLocation(n.path,e),d):console.log('%cUnknown option detected: "'+e+'". 
Did you mean one of these: '+t.print(Object.keys(i))+t.printLocation(o,e),d),a=!0}},{key:"findInOptions",value:function(e,i,o){var n=arguments.length<=3||void 0===arguments[3]?!1:arguments[3],s=1e9,a="",h=[],d=e.toLowerCase(),l=void 0;for(var c in i){var u=void 0;if(void 0!==i[c].__type__&&n===!0){var p=t.findInOptions(e,i[c],r.copyAndExtendArray(o,c));s>p.distance&&(a=p.closestMatch,h=p.path,s=p.distance,l=p.indexMatch)}else-1!==c.toLowerCase().indexOf(d)&&(l=c),u=t.levenshteinDistance(e,c),s>u&&(a=c,h=r.copyArray(o),s=u)}return{closestMatch:a,path:h,distance:s,indexMatch:l}}},{key:"printLocation",value:function(t,e){for(var i=arguments.length<=2||void 0===arguments[2]?"Problem value found at: \n":arguments[2],o="\n\n"+i+"options = {\n",n=0;ns;s++)o+=" ";o+=t[n]+": {\n"}for(var r=0;ru,r=s||null===n?n:l+(n-l)*i,p=s||null===a?a:c+(a-c)*i;y=h._applyRange(r,p),d.updateHiddenDates(h.options.moment,h.body,h.options.hiddenDates),v=v||y,y&&h.body.emitter.emit("rangechange",{start:new Date(h.start),end:new Date(h.end),byUser:o}),s?v&&h.body.emitter.emit("rangechanged",{start:new Date(h.start),end:new Date(h.end),byUser:o}):h.animationTimer=setTimeout(w,20)}};return g()}var y=this._applyRange(n,a);if(d.updateHiddenDates(this.options.moment,this.body,this.options.hiddenDates),y){var b={start:new Date(this.start),end:new Date(this.end),byUser:o};this.body.emitter.emit("rangechange",b),this.body.emitter.emit("rangechanged",b)}},o.prototype._cancelAnimation=function(){this.animationTimer&&(clearTimeout(this.animationTimer),this.animationTimer=null)},o.prototype._applyRange=function(t,e){var i,o=null!=t?r.convert(t,"Date").valueOf():this.start,n=null!=e?r.convert(e,"Date").valueOf():this.end,s=null!=this.options.max?r.convert(this.options.max,"Date").valueOf():null,a=null!=this.options.min?r.convert(this.options.min,"Date").valueOf():null;if(isNaN(o)||null===o)throw new Error('Invalid start "'+t+'"');if(isNaN(n)||null===n)throw new Error('Invalid end "'+e+'"');if(o>n&&(n=o),null!==a&&a>o&&(i=a-o,o+=i,n+=i,null!=s&&n>s&&(n=s)),null!==s&&n>s&&(i=n-s,o-=i,n-=i,null!=a&&a>o&&(o=a)),null!==this.options.zoomMin){var h=parseFloat(this.options.zoomMin);0>h&&(h=0),h>n-o&&(this.end-this.start===h&&o>this.start&&nd&&(d=0),n-o>d&&(this.end-this.start===d&&othis.end?(o=this.start,n=this.end):(i=n-o-d,o+=i/2,n-=i/2))}var l=this.start!=o||this.end!=n;return o>=this.start&&o<=this.end||n>=this.start&&n<=this.end||this.start>=o&&this.start<=n||this.end>=o&&this.end<=n||this.body.emitter.emit("checkRangedItems"),this.start=o,this.end=n,l},o.prototype.getRange=function(){return{start:this.start,end:this.end}},o.prototype.conversion=function(t,e){return o.conversion(this.start,this.end,t,e)},o.conversion=function(t,e,i,o){return void 0===o&&(o=0),0!=i&&e-t!=0?{offset:t,scale:i/(e-t-o)}:{offset:0,scale:1}},o.prototype._onDragStart=function(t){this.deltaDifference=0,this.previousDelta=0,this.options.moveable&&this._isInsideRange(t)&&this.props.touch.allowDragging&&(this.props.touch.start=this.start,this.props.touch.end=this.end,this.props.touch.dragging=!0,this.body.dom.root&&(this.body.dom.root.style.cursor="move"))},o.prototype._onDrag=function(t){if(this.props.touch.dragging&&this.options.moveable&&this.props.touch.allowDragging){var e=this.options.direction;n(e);var i="horizontal"==e?t.deltaX:t.deltaY;i-=this.deltaDifference;var o=this.props.touch.end-this.props.touch.start,s=d.getHiddenDurationBetween(this.body.hiddenDates,this.start,this.end);o-=s;var 
r="horizontal"==e?this.body.domProps.center.width:this.body.domProps.center.height;if(this.options.rtl)var a=i/r*o;else var a=-i/r*o;var h=this.props.touch.start+a,l=this.props.touch.end+a,c=d.snapAwayFromHidden(this.body.hiddenDates,h,this.previousDelta-i,!0),u=d.snapAwayFromHidden(this.body.hiddenDates,l,this.previousDelta-i,!0);if(c!=h||u!=l)return this.deltaDifference+=i,this.props.touch.start=c,this.props.touch.end=u,void this._onDrag(t);this.previousDelta=i,this._applyRange(h,l);var p=new Date(this.start),f=new Date(this.end);this.body.emitter.emit("rangechange",{start:p,end:f,byUser:!0})}},o.prototype._onDragEnd=function(t){this.props.touch.dragging&&this.options.moveable&&this.props.touch.allowDragging&&(this.props.touch.dragging=!1,this.body.dom.root&&(this.body.dom.root.style.cursor="auto"),this.body.emitter.emit("rangechanged",{start:new Date(this.start),end:new Date(this.end),byUser:!0}))},o.prototype._onMouseWheel=function(t){if(this.options.zoomable&&this.options.moveable&&this._isInsideRange(t)&&(!this.options.zoomKey||t[this.options.zoomKey])){var e=0;if(t.wheelDelta?e=t.wheelDelta/120:t.detail&&(e=-t.detail/3),e){var i;i=0>e?1-e/5:1/(1+e/5);var o=this.getPointer({x:t.clientX,y:t.clientY},this.body.dom.center),n=this._pointerToDate(o);this.zoom(i,n,e)}t.preventDefault()}},o.prototype._onTouch=function(t){this.props.touch.start=this.start,this.props.touch.end=this.end,this.props.touch.allowDragging=!0,this.props.touch.center=null,this.scaleOffset=0,this.deltaDifference=0},o.prototype._onPinch=function(t){if(this.options.zoomable&&this.options.moveable){this.props.touch.allowDragging=!1,this.props.touch.center||(this.props.touch.center=this.getPointer(t.center,this.body.dom.center));var e=1/(t.scale+this.scaleOffset),i=this._pointerToDate(this.props.touch.center),o=d.getHiddenDurationBetween(this.body.hiddenDates,this.start,this.end),n=d.getHiddenDurationBefore(this.options.moment,this.body.hiddenDates,this,i),s=o-n,r=i-n+(this.props.touch.start-(i-n))*e,a=i+s+(this.props.touch.end-(i+s))*e; -this.startToFront=0>=1-e,this.endToFront=0>=e-1;var h=d.snapAwayFromHidden(this.body.hiddenDates,r,1-e,!0),l=d.snapAwayFromHidden(this.body.hiddenDates,a,e-1,!0);h==r&&l==a||(this.props.touch.start=h,this.props.touch.end=l,this.scaleOffset=1-t.scale,r=h,a=l),this.setRange(r,a,!1,!0),this.startToFront=!1,this.endToFront=!0}},o.prototype._isInsideRange=function(t){var e=t.center?t.center.x:t.clientX;if(this.options.rtl)var i=e-r.getAbsoluteLeft(this.body.dom.centerContainer);else var i=r.getAbsoluteRight(this.body.dom.centerContainer)-e;var o=this.body.util.toTime(i);return o>=this.start&&o<=this.end},o.prototype._pointerToDate=function(t){var e,i=this.options.direction;if(n(i),"horizontal"==i)return this.body.util.toTime(t.x).valueOf();var o=this.body.domProps.center.height;return e=this.conversion(o),t.y/e.scale+e.offset},o.prototype.getPointer=function(t,e){return this.options.rtl?{x:r.getAbsoluteRight(e)-t.x,y:t.y-r.getAbsoluteTop(e)}:{x:t.x-r.getAbsoluteLeft(e),y:t.y-r.getAbsoluteTop(e)}},o.prototype.zoom=function(t,e,i){null==e&&(e=(this.start+this.end)/2);var o=d.getHiddenDurationBetween(this.body.hiddenDates,this.start,this.end),n=d.getHiddenDurationBefore(this.options.moment,this.body.hiddenDates,this,e),s=o-n,r=e-n+(this.start-(e-n))*t,a=e+s+(this.end-(e+s))*t;this.startToFront=!(i>0),this.endToFront=!(-i>0);var 
h=d.snapAwayFromHidden(this.body.hiddenDates,r,i,!0),l=d.snapAwayFromHidden(this.body.hiddenDates,a,-i,!0);h==r&&l==a||(r=h,a=l),this.setRange(r,a,!1,!0),this.startToFront=!1,this.endToFront=!0},o.prototype.move=function(t){var e=this.end-this.start,i=this.start+e*t,o=this.end+e*t;this.start=i,this.end=o},o.prototype.moveTo=function(t){var e=(this.start+this.end)/2,i=e-t,o=this.start-i,n=this.end-i;this.setRange(o,n)},t.exports=o},function(t,e){function i(t,e){this.options=null,this.props=null}i.prototype.setOptions=function(t){t&&util.extend(this.options,t)},i.prototype.redraw=function(){return!1},i.prototype.destroy=function(){},i.prototype._isResized=function(){var t=this.props._previousWidth!==this.props.width||this.props._previousHeight!==this.props.height;return this.props._previousWidth=this.props.width,this.props._previousHeight=this.props.height,t},t.exports=i},function(t,e){e.convertHiddenOptions=function(t,i,o){if(o&&!Array.isArray(o))return e.convertHiddenOptions(t,i,[o]);if(i.hiddenDates=[],o&&1==Array.isArray(o)){for(var n=0;n=4*a){var u=0,p=s.clone();switch(o[h].repeat){case"daily":d.day()!=l.day()&&(u=1),d.dayOfYear(n.dayOfYear()),d.year(n.year()),d.subtract(7,"days"),l.dayOfYear(n.dayOfYear()),l.year(n.year()),l.subtract(7-u,"days"),p.add(1,"weeks");break;case"weekly":var f=l.diff(d,"days"),m=d.day();d.date(n.date()),d.month(n.month()),d.year(n.year()),l=d.clone(),d.day(m),l.day(m),l.add(f,"days"),d.subtract(1,"weeks"),l.subtract(1,"weeks"),p.add(1,"weeks");break;case"monthly":d.month()!=l.month()&&(u=1),d.month(n.month()),d.year(n.year()),d.subtract(1,"months"),l.month(n.month()),l.year(n.year()),l.subtract(1,"months"),l.add(u,"months"),p.add(1,"months");break;case"yearly":d.year()!=l.year()&&(u=1),d.year(n.year()),d.subtract(1,"years"),l.year(n.year()),l.subtract(1,"years"),l.add(u,"years"),p.add(1,"years");break;default:return void console.log("Wrong repeat format, allowed are: daily, weekly, monthly, yearly. Given:",o[h].repeat)}for(;p>d;)switch(i.hiddenDates.push({start:d.valueOf(),end:l.valueOf()}),o[h].repeat){case"daily":d.add(1,"days"),l.add(1,"days");break;case"weekly":d.add(1,"weeks"),l.add(1,"weeks");break;case"monthly":d.add(1,"months"),l.add(1,"months");break;case"yearly":d.add(1,"y"),l.add(1,"y");break;default:return void console.log("Wrong repeat format, allowed are: daily, weekly, monthly, yearly. 
Given:",o[h].repeat)}i.hiddenDates.push({start:d.valueOf(),end:l.valueOf()})}}e.removeDuplicates(i);var v=e.isHidden(i.range.start,i.hiddenDates),g=e.isHidden(i.range.end,i.hiddenDates),y=i.range.start,b=i.range.end;1==v.hidden&&(y=1==i.range.startToFront?v.startDate-1:v.endDate+1),1==g.hidden&&(b=1==i.range.endToFront?g.startDate-1:g.endDate+1),1!=v.hidden&&1!=g.hidden||i.range._applyRange(y,b)}},e.removeDuplicates=function(t){for(var e=t.hiddenDates,i=[],o=0;o=e[o].start&&e[n].end<=e[o].end?e[n].remove=!0:e[n].start>=e[o].start&&e[n].start<=e[o].end?(e[o].end=e[n].end,e[n].remove=!0):e[n].end>=e[o].start&&e[n].end<=e[o].end&&(e[o].start=e[n].start,e[n].remove=!0));for(var o=0;o=r&&a>n){o=!0;break}}if(1==o&&n=e&&i>r&&(o+=r-s)}return o},e.correctTimeForHidden=function(t,i,o,n){return n=t(n).toDate().valueOf(),n-=e.getHiddenDurationBefore(t,i,o,n)},e.getHiddenDurationBefore=function(t,e,i,o){var n=0;o=t(o).toDate().valueOf();for(var s=0;s=i.start&&a=a&&(n+=a-r)}return n},e.getAccumulatedHiddenDuration=function(t,e,i){for(var o=0,n=0,s=e.start,r=0;r=e.start&&h=i)break;o+=h-a}}return o},e.snapAwayFromHidden=function(t,i,o,n){var s=e.isHidden(i,t);return 1==s.hidden?0>o?1==n?s.startDate-(s.endDate-i)-1:s.startDate-1:1==n?s.endDate+(i-s.startDate)+1:s.endDate+1:i},e.isHidden=function(t,e){for(var i=0;i=o&&n>t)return{hidden:!0,startDate:o,endDate:n}}return{hidden:!1,startDate:o,endDate:n}}},function(t,e,i){function o(){}var n="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},s=i(13),r=i(20),a=i(28),h=i(1),d=(i(9),i(11),i(30),i(34),i(44)),l=i(45),c=i(32),u=i(46);s(o.prototype),o.prototype._create=function(t){function e(t){i.isActive()&&i.emit("mousewheel",t)}this.dom={},this.dom.container=t,this.dom.root=document.createElement("div"),this.dom.background=document.createElement("div"),this.dom.backgroundVertical=document.createElement("div"),this.dom.backgroundHorizontal=document.createElement("div"),this.dom.centerContainer=document.createElement("div"),this.dom.leftContainer=document.createElement("div"),this.dom.rightContainer=document.createElement("div"),this.dom.center=document.createElement("div"),this.dom.left=document.createElement("div"),this.dom.right=document.createElement("div"),this.dom.top=document.createElement("div"),this.dom.bottom=document.createElement("div"),this.dom.shadowTop=document.createElement("div"),this.dom.shadowBottom=document.createElement("div"),this.dom.shadowTopLeft=document.createElement("div"),this.dom.shadowBottomLeft=document.createElement("div"),this.dom.shadowTopRight=document.createElement("div"),this.dom.shadowBottomRight=document.createElement("div"),this.dom.root.className="vis-timeline",this.dom.background.className="vis-panel vis-background",this.dom.backgroundVertical.className="vis-panel vis-background vis-vertical",this.dom.backgroundHorizontal.className="vis-panel vis-background vis-horizontal",this.dom.centerContainer.className="vis-panel vis-center",this.dom.leftContainer.className="vis-panel vis-left",this.dom.rightContainer.className="vis-panel vis-right",this.dom.top.className="vis-panel vis-top",this.dom.bottom.className="vis-panel vis-bottom",this.dom.left.className="vis-content",this.dom.center.className="vis-content",this.dom.right.className="vis-content",this.dom.shadowTop.className="vis-shadow vis-top",this.dom.shadowBottom.className="vis-shadow vis-bottom",this.dom.shadowTopLeft.className="vis-shadow 
vis-top",this.dom.shadowBottomLeft.className="vis-shadow vis-bottom",this.dom.shadowTopRight.className="vis-shadow vis-top",this.dom.shadowBottomRight.className="vis-shadow vis-bottom",this.dom.root.appendChild(this.dom.background),this.dom.root.appendChild(this.dom.backgroundVertical),this.dom.root.appendChild(this.dom.backgroundHorizontal),this.dom.root.appendChild(this.dom.centerContainer),this.dom.root.appendChild(this.dom.leftContainer),this.dom.root.appendChild(this.dom.rightContainer),this.dom.root.appendChild(this.dom.top),this.dom.root.appendChild(this.dom.bottom),this.dom.centerContainer.appendChild(this.dom.center),this.dom.leftContainer.appendChild(this.dom.left),this.dom.rightContainer.appendChild(this.dom.right),this.dom.centerContainer.appendChild(this.dom.shadowTop),this.dom.centerContainer.appendChild(this.dom.shadowBottom),this.dom.leftContainer.appendChild(this.dom.shadowTopLeft),this.dom.leftContainer.appendChild(this.dom.shadowBottomLeft),this.dom.rightContainer.appendChild(this.dom.shadowTopRight),this.dom.rightContainer.appendChild(this.dom.shadowBottomRight),this.on("rangechange",function(){this.initialDrawDone===!0&&this._redraw()}.bind(this)),this.on("touch",this._onTouch.bind(this)),this.on("pan",this._onDrag.bind(this));var i=this;this.on("_change",function(t){t&&1==t.queue?i._redrawTimer||(i._redrawTimer=setTimeout(function(){i._redrawTimer=null,i._redraw()},0)):i._redraw()}),this.hammer=new r(this.dom.root);var o=this.hammer.get("pinch").set({enable:!0});a.disablePreventDefaultVertically(o),this.hammer.get("pan").set({threshold:5,direction:r.DIRECTION_HORIZONTAL}),this.listeners={};var n=["tap","doubletap","press","pinch","pan","panstart","panmove","panend"];if(n.forEach(function(t){var e=function(e){i.isActive()&&i.emit(t,e)};i.hammer.on(t,e),i.listeners[t]=e}),a.onTouch(this.hammer,function(t){i.emit("touch",t)}.bind(this)),a.onRelease(this.hammer,function(t){i.emit("release",t)}.bind(this)),this.dom.root.addEventListener("mousewheel",e),this.dom.root.addEventListener("DOMMouseScroll",e),this.props={root:{},background:{},centerContainer:{},leftContainer:{},rightContainer:{},center:{},left:{},right:{},top:{},bottom:{},border:{},scrollTop:0,scrollTopMin:0},this.customTimes=[],this.touch={},this.redrawCount=0,this.initialDrawDone=!1,!t)throw new Error("No container provided");t.appendChild(this.dom.root)},o.prototype.setOptions=function(t){if(t){var e=["width","height","minHeight","maxHeight","autoResize","start","end","clickToUse","dataAttributes","hiddenDates","locale","locales","moment","rtl","throttleRedraw"];if(h.selectiveExtend(e,this.options,t),this.options.rtl){var i=this.dom.leftContainer;this.dom.leftContainer=this.dom.rightContainer,this.dom.rightContainer=i,this.dom.container.style.direction="rtl",this.dom.backgroundVertical.className="vis-panel vis-background vis-vertical-rtl"}if(this.options.orientation={item:void 0,axis:void 0},"orientation"in t&&("string"==typeof t.orientation?this.options.orientation={item:t.orientation,axis:t.orientation}:"object"===n(t.orientation)&&("item"in t.orientation&&(this.options.orientation.item=t.orientation.item),"axis"in t.orientation&&(this.options.orientation.axis=t.orientation.axis))),"both"===this.options.orientation.axis){if(!this.timeAxis2){var o=this.timeAxis2=new d(this.body);o.setOptions=function(t){var e=t?h.extend({},t):{};e.orientation="top",d.prototype.setOptions.call(o,e)},this.components.push(o)}}else if(this.timeAxis2){var 
s=this.components.indexOf(this.timeAxis2);-1!==s&&this.components.splice(s,1),this.timeAxis2.destroy(),this.timeAxis2=null}if("function"==typeof t.drawPoints&&(t.drawPoints={onRender:t.drawPoints}),"hiddenDates"in this.options&&c.convertHiddenOptions(this.options.moment,this.body,this.options.hiddenDates),"clickToUse"in t&&(t.clickToUse?this.activator||(this.activator=new l(this.dom.root)):this.activator&&(this.activator.destroy(),delete this.activator)),"showCustomTime"in t)throw new Error("Option `showCustomTime` is deprecated. Create a custom time bar via timeline.addCustomTime(time [, id])");this._initAutoResize()}if(this.components.forEach(function(e){return e.setOptions(t)}),"configure"in t){this.configurator||(this.configurator=this._createConfigurator()),this.configurator.setOptions(t.configure);var r=h.deepExtend({},this.options);this.components.forEach(function(t){h.deepExtend(r,t.options)}),this.configurator.setModuleOptions({global:r})}this._origRedraw?this._redraw():(this._origRedraw=this._redraw.bind(this),this._redraw=h.throttle(this._origRedraw,this.options.throttleRedraw))},o.prototype.isActive=function(){return!this.activator||this.activator.active},o.prototype.destroy=function(){this.setItems(null),this.setGroups(null),this.off(),this._stopAutoResize(),this.dom.root.parentNode&&this.dom.root.parentNode.removeChild(this.dom.root),this.dom=null,this.activator&&(this.activator.destroy(),delete this.activator);for(var t in this.listeners)this.listeners.hasOwnProperty(t)&&delete this.listeners[t];this.listeners=null,this.hammer=null,this.components.forEach(function(t){return t.destroy()}),this.body=null},o.prototype.setCustomTime=function(t,e){var i=this.customTimes.filter(function(t){return e===t.options.id});if(0===i.length)throw new Error("No custom time bar found with id "+JSON.stringify(e));i.length>0&&i[0].setCustomTime(t)},o.prototype.getCustomTime=function(t){var e=this.customTimes.filter(function(e){return e.options.id===t});if(0===e.length)throw new Error("No custom time bar found with id "+JSON.stringify(t));return e[0].getCustomTime()},o.prototype.setCustomTimeTitle=function(t,e){var i=this.customTimes.filter(function(t){return t.options.id===e});if(0===i.length)throw new Error("No custom time bar found with id "+JSON.stringify(e));return i.length>0?i[0].setCustomTitle(t):void 0},o.prototype.getEventProperties=function(t){return{event:t}},o.prototype.addCustomTime=function(t,e){var i=void 0!==t?h.convert(t,"Date").valueOf():new Date,o=this.customTimes.some(function(t){return t.options.id===e});if(o)throw new Error("A custom time with id "+JSON.stringify(e)+" already exists");var n=new u(this.body,h.extend({},this.options,{time:i,id:e}));return this.customTimes.push(n),this.components.push(n),this._redraw(),e},o.prototype.removeCustomTime=function(t){var e=this.customTimes.filter(function(e){return e.options.id===t});if(0===e.length)throw new Error("No custom time bar found with id "+JSON.stringify(t));e.forEach(function(t){this.customTimes.splice(this.customTimes.indexOf(t),1),this.components.splice(this.components.indexOf(t),1),t.destroy()}.bind(this))},o.prototype.getVisibleItems=function(){return this.itemSet&&this.itemSet.getVisibleItems()||[]},o.prototype.fit=function(t){var e=this.getDataRange();if(null!==e.min||null!==e.max){var i=e.max-e.min,o=new Date(e.min.valueOf()-.01*i),n=new Date(e.max.valueOf()+.01*i),s=t&&void 0!==t.animation?t.animation:!0;this.range.setRange(o,n,s)}},o.prototype.getDataRange=function(){throw new Error("Cannot invoke abstract 
method getDataRange")},o.prototype.setWindow=function(t,e,i){var o;if(1==arguments.length){var n=arguments[0];o=void 0!==n.animation?n.animation:!0,this.range.setRange(n.start,n.end,o)}else o=i&&void 0!==i.animation?i.animation:!0,this.range.setRange(t,e,o)},o.prototype.moveTo=function(t,e){var i=this.range.end-this.range.start,o=h.convert(t,"Date").valueOf(),n=o-i/2,s=o+i/2,r=e&&void 0!==e.animation?e.animation:!0;this.range.setRange(n,s,r)},o.prototype.getWindow=function(){var t=this.range.getRange();return{start:new Date(t.start),end:new Date(t.end)}},o.prototype.redraw=function(){this._redraw()},o.prototype._redraw=function(){this.redrawCount++;var t=!1,e=this.options,i=this.props,o=this.dom;if(o&&o.container&&0!=o.root.offsetWidth){c.updateHiddenDates(this.options.moment,this.body,this.options.hiddenDates),"top"==e.orientation?(h.addClassName(o.root,"vis-top"),h.removeClassName(o.root,"vis-bottom")):(h.removeClassName(o.root,"vis-top"),h.addClassName(o.root,"vis-bottom")),o.root.style.maxHeight=h.option.asSize(e.maxHeight,""),o.root.style.minHeight=h.option.asSize(e.minHeight,""),o.root.style.width=h.option.asSize(e.width,""),i.border.left=(o.centerContainer.offsetWidth-o.centerContainer.clientWidth)/2,i.border.right=i.border.left,i.border.top=(o.centerContainer.offsetHeight-o.centerContainer.clientHeight)/2,i.border.bottom=i.border.top;var n=o.root.offsetHeight-o.root.clientHeight,s=o.root.offsetWidth-o.root.clientWidth;0===o.centerContainer.clientHeight&&(i.border.left=i.border.top,i.border.right=i.border.left),0===o.root.clientHeight&&(s=n),i.center.height=o.center.offsetHeight,i.left.height=o.left.offsetHeight,i.right.height=o.right.offsetHeight,i.top.height=o.top.clientHeight||-i.border.top,i.bottom.height=o.bottom.clientHeight||-i.border.bottom;var a=Math.max(i.left.height,i.center.height,i.right.height),d=i.top.height+a+i.bottom.height+n+i.border.top+i.border.bottom;o.root.style.height=h.option.asSize(e.height,d+"px"),i.root.height=o.root.offsetHeight,i.background.height=i.root.height-n;var l=i.root.height-i.top.height-i.bottom.height-n;i.centerContainer.height=l,i.leftContainer.height=l,i.rightContainer.height=i.leftContainer.height,i.root.width=o.root.offsetWidth,i.background.width=i.root.width-s,i.left.width=o.leftContainer.clientWidth||-i.border.left,i.leftContainer.width=i.left.width,i.right.width=o.rightContainer.clientWidth||-i.border.right,i.rightContainer.width=i.right.width;var 
u=i.root.width-i.left.width-i.right.width-s;i.center.width=u,i.centerContainer.width=u,i.top.width=u,i.bottom.width=u,o.background.style.height=i.background.height+"px",o.backgroundVertical.style.height=i.background.height+"px",o.backgroundHorizontal.style.height=i.centerContainer.height+"px",o.centerContainer.style.height=i.centerContainer.height+"px",o.leftContainer.style.height=i.leftContainer.height+"px",o.rightContainer.style.height=i.rightContainer.height+"px",o.background.style.width=i.background.width+"px",o.backgroundVertical.style.width=i.centerContainer.width+"px",o.backgroundHorizontal.style.width=i.background.width+"px",o.centerContainer.style.width=i.center.width+"px",o.top.style.width=i.top.width+"px",o.bottom.style.width=i.bottom.width+"px",o.background.style.left="0",o.background.style.top="0",o.backgroundVertical.style.left=i.left.width+i.border.left+"px",o.backgroundVertical.style.top="0",o.backgroundHorizontal.style.left="0",o.backgroundHorizontal.style.top=i.top.height+"px",o.centerContainer.style.left=i.left.width+"px",o.centerContainer.style.top=i.top.height+"px",o.leftContainer.style.left="0",o.leftContainer.style.top=i.top.height+"px",o.rightContainer.style.left=i.left.width+i.center.width+"px",o.rightContainer.style.top=i.top.height+"px",o.top.style.left=i.left.width+"px",o.top.style.top="0",o.bottom.style.left=i.left.width+"px",o.bottom.style.top=i.top.height+i.centerContainer.height+"px",this._updateScrollTop();var p=this.props.scrollTop;"top"!=e.orientation.item&&(p+=Math.max(this.props.centerContainer.height-this.props.center.height-this.props.border.top-this.props.border.bottom,0)),o.center.style.left="0",o.center.style.top=p+"px",o.left.style.left="0",o.left.style.top=p+"px",o.right.style.left="0",o.right.style.top=p+"px";var f=0==this.props.scrollTop?"hidden":"",m=this.props.scrollTop==this.props.scrollTopMin?"hidden":"";o.shadowTop.style.visibility=f,o.shadowBottom.style.visibility=m,o.shadowTopLeft.style.visibility=f,o.shadowBottomLeft.style.visibility=m,o.shadowTopRight.style.visibility=f,o.shadowBottomRight.style.visibility=m;var v=this.props.center.height>this.props.centerContainer.height;this.hammer.get("pan").set({direction:v?r.DIRECTION_ALL:r.DIRECTION_HORIZONTAL}),this.components.forEach(function(e){t=e.redraw()||t});var g=5;if(t){if(this.redrawCount0&&(this.props.scrollTop=0),this.props.scrollTope;e++)o=this.selection[e],n=this.items[o],n&&n.unselect();for(this.selection=[],e=0,i=t.length;i>e;e++)o=t[e],n=this.items[o],n&&(this.selection.push(o),n.select())},o.prototype.getSelection=function(){return this.selection.concat([])},o.prototype.getVisibleItems=function(){var t=this.body.range.getRange();if(this.options.rtl)var e=this.body.util.toScreen(t.start),i=this.body.util.toScreen(t.end);else var i=this.body.util.toScreen(t.start),e=this.body.util.toScreen(t.end);var o=[];for(var n in this.groups)if(this.groups.hasOwnProperty(n))for(var s=this.groups[n],r=s.visibleItems,a=0;ae&&o.push(h.id):h.lefti&&o.push(h.id)}return o},o.prototype._deselect=function(t){for(var e=this.selection,i=0,o=e.length;o>i;i++)if(e[i]==t){e.splice(i,1);break}},o.prototype.redraw=function(){var 
t=this.options.margin,e=this.body.range,i=r.option.asSize,o=this.options,n=o.orientation.item,s=!1,a=this.dom.frame;this.props.top=this.body.domProps.top.height+this.body.domProps.border.top,this.options.rtl?this.props.right=this.body.domProps.right.width+this.body.domProps.border.right:this.props.left=this.body.domProps.left.width+this.body.domProps.border.left,a.className="vis-itemset",s=this._orderGroups()||s;var h=e.end-e.start,d=h!=this.lastVisibleInterval||this.props.width!=this.props.lastWidth;d&&(this.stackDirty=!0), -this.lastVisibleInterval=h,this.props.lastWidth=this.props.width;var l=this.stackDirty,c=this._firstGroup(),u={item:t.item,axis:t.axis},p={item:t.item,axis:t.item.vertical/2},f=0,m=t.axis+t.item.vertical;return this.groups[y].redraw(e,p,l),r.forEach(this.groups,function(t){var i=t==c?u:p,o=t.redraw(e,i,l);s=o||s,f+=t.height}),f=Math.max(f,m),this.stackDirty=!1,a.style.height=i(f),this.props.width=a.offsetWidth,this.props.height=f,this.dom.axis.style.top=i("top"==n?this.body.domProps.top.height+this.body.domProps.border.top:this.body.domProps.top.height+this.body.domProps.centerContainer.height),this.options.rtl?this.dom.axis.style.right="0":this.dom.axis.style.left="0",s=this._isResized()||s},o.prototype._firstGroup=function(){var t="top"==this.options.orientation.item?0:this.groupIds.length-1,e=this.groupIds[t],i=this.groups[e]||this.groups[g];return i||null},o.prototype._updateUngrouped=function(){var t,e,i=this.groups[g];this.groups[y];if(this.groupsData){if(i){i.hide(),delete this.groups[g];for(e in this.items)if(this.items.hasOwnProperty(e)){t=this.items[e],t.parent&&t.parent.remove(t);var o=this._getGroupId(t.data),n=this.groups[o];n&&n.add(t)||t.hide()}}}else if(!i){var s=null,r=null;i=new c(s,r,this),this.groups[g]=i;for(e in this.items)this.items.hasOwnProperty(e)&&(t=this.items[e],i.add(t));i.show()}},o.prototype.getLabelSet=function(){return this.dom.labelSet},o.prototype.setItems=function(t){var e,i=this,o=this.itemsData;if(t){if(!(t instanceof a||t instanceof h))throw new TypeError("Data must be an instance of DataSet or DataView");this.itemsData=t}else this.itemsData=null;if(o&&(r.forEach(this.itemListeners,function(t,e){o.off(e,t)}),e=o.getIds(),this._onRemove(e)),this.itemsData){var n=this.id;r.forEach(this.itemListeners,function(t,e){i.itemsData.on(e,t,n)}),e=this.itemsData.getIds(),this._onAdd(e),this._updateUngrouped()}this.body.emitter.emit("_change",{queue:!0})},o.prototype.getItems=function(){return this.itemsData},o.prototype.setGroups=function(t){var e,i=this;if(this.groupsData&&(r.forEach(this.groupListeners,function(t,e){i.groupsData.off(e,t)}),e=this.groupsData.getIds(),this.groupsData=null,this._onRemoveGroups(e)),t){if(!(t instanceof a||t instanceof h))throw new TypeError("Data must be an instance of DataSet or DataView");this.groupsData=t}else this.groupsData=null;if(this.groupsData){var o=this.id;r.forEach(this.groupListeners,function(t,e){i.groupsData.on(e,t,o)}),e=this.groupsData.getIds(),this._onAddGroups(e)}this._updateUngrouped(),this._order(),this.body.emitter.emit("_change",{queue:!0})},o.prototype.getGroups=function(){return this.groupsData},o.prototype.removeItem=function(t){var e=this.itemsData.get(t),i=this.itemsData.getDataSet();e&&this.options.onRemove(e,function(e){e&&i.remove(t)})},o.prototype._getType=function(t){return t.type||this.options.type||(t.end?"range":"box")},o.prototype._getGroupId=function(t){var e=this._getType(t);return"background"==e&&void 
0==t.group?y:this.groupsData?t.group:g},o.prototype._onUpdate=function(t){var e=this;t.forEach(function(t){var i,n=e.itemsData.get(t,e.itemOptions),s=e.items[t],r=e._getType(n),a=o.types[r];if(s&&(a&&s instanceof a?e._updateItem(s,n):(i=s.selected,e._removeItem(s),s=null)),!s){if(!a)throw"rangeoverflow"==r?new TypeError('Item type "rangeoverflow" is deprecated. Use css styling instead: .vis-item.vis-range .vis-item-content {overflow: visible;}'):new TypeError('Unknown item type "'+r+'"');s=new a(n,e.conversion,e.options),s.id=t,e._addItem(s),i&&(this.selection.push(t),s.select())}}.bind(this)),this._order(),this.stackDirty=!0,this.body.emitter.emit("_change",{queue:!0})},o.prototype._onAdd=o.prototype._onUpdate,o.prototype._onRemove=function(t){var e=0,i=this;t.forEach(function(t){var o=i.items[t];o&&(e++,i._removeItem(o))}),e&&(this._order(),this.stackDirty=!0,this.body.emitter.emit("_change",{queue:!0}))},o.prototype._order=function(){r.forEach(this.groups,function(t){t.order()})},o.prototype._onUpdateGroups=function(t){this._onAddGroups(t)},o.prototype._onAddGroups=function(t){var e=this;t.forEach(function(t){var i=e.groupsData.get(t),o=e.groups[t];if(o)o.setData(i);else{if(t==g||t==y)throw new Error("Illegal group id. "+t+" is a reserved id.");var n=Object.create(e.options);r.extend(n,{height:null}),o=new c(t,i,e),e.groups[t]=o;for(var s in e.items)if(e.items.hasOwnProperty(s)){var a=e.items[s];a.data.group==t&&o.add(a)}o.order(),o.show()}}),this.body.emitter.emit("_change",{queue:!0})},o.prototype._onRemoveGroups=function(t){var e=this.groups;t.forEach(function(t){var i=e[t];i&&(i.hide(),delete e[t])}),this.markDirty(),this.body.emitter.emit("_change",{queue:!0})},o.prototype._orderGroups=function(){if(this.groupsData){var t=this.groupsData.getIds({order:this.options.groupOrder}),e=!r.equalArray(t,this.groupIds);if(e){var i=this.groups;t.forEach(function(t){i[t].hide()}),t.forEach(function(t){i[t].show()}),this.groupIds=t}return e}return!1},o.prototype._addItem=function(t){this.items[t.id]=t;var e=this._getGroupId(t.data),i=this.groups[e];i&&i.add(t)},o.prototype._updateItem=function(t,e){var i=t.data.group,o=t.data.subgroup;if(t.setData(e),i!=t.data.group||o!=t.data.subgroup){var n=this.groups[i];n&&n.remove(t);var s=this._getGroupId(t.data),r=this.groups[s];r&&r.add(t)}},o.prototype._removeItem=function(t){t.hide(),delete this.items[t.id];var e=this.selection.indexOf(t.id);-1!=e&&this.selection.splice(e,1),t.parent&&t.parent.remove(t)},o.prototype._constructByEndArray=function(t){for(var e=[],i=0;in+s)return}else{var a=e.height;if(n+a-s>o)return}}if(e&&e!=this.groupTouchParams.group){var h=this.groupsData,d=h.get(e.groupId),l=h.get(this.groupTouchParams.group.groupId);l&&d&&(this.options.groupOrderSwap(l,d,this.groupsData),this.groupsData.update(l),this.groupsData.update(d));var c=this.groupsData.getIds({order:this.options.groupOrder});if(!r.equalArray(c,this.groupTouchParams.originalOrder))for(var h=this.groupsData,u=this.groupTouchParams.originalOrder,p=this.groupTouchParams.group.groupId,f=Math.min(u.length,c.length),m=0,v=0,g=0;f>m;){for(;f>m+v&&f>m+g&&c[m+v]==u[m+g];)m++;if(m+v>=f)break;if(c[m+v]!=p)if(u[m+g]!=p){var y=c.indexOf(u[m+g]),b=h.get(c[m+v]),w=h.get(u[m+g]);this.options.groupOrderSwap(b,w,h),h.update(b),h.update(w);var _=c[m+v];c[m+v]=u[m+g],c[y]=_,m++}else g=1;else v=1}}}},o.prototype._onGroupDragEnd=function(t){if(this.options.groupEditable.order&&this.groupTouchParams.group){t.stopPropagation();var 
e=this,i=e.groupTouchParams.group.groupId,o=e.groupsData.getDataSet(),n=r.extend({},o.get(i));e.options.onMoveGroup(n,function(t){if(t)t[o._fieldId]=i,o.update(t);else{var n=o.getIds({order:e.options.groupOrder});if(!r.equalArray(n,e.groupTouchParams.originalOrder))for(var s=e.groupTouchParams.originalOrder,a=Math.min(s.length,n.length),h=0;a>h;){for(;a>h&&n[h]==s[h];)h++;if(h>=a)break;var d=n.indexOf(s[h]),l=o.get(n[h]),c=o.get(s[h]);e.options.groupOrderSwap(l,c,o),groupsData.update(l),groupsData.update(c);var u=n[h];n[h]=s[h],n[d]=u,h++}}}),e.body.emitter.emit("groupDragged",{groupId:i})}},o.prototype._onSelectItem=function(t){if(this.options.selectable){var e=t.srcEvent&&(t.srcEvent.ctrlKey||t.srcEvent.metaKey),i=t.srcEvent&&t.srcEvent.shiftKey;if(e||i)return void this._onMultiSelectItem(t);var o=this.getSelection(),n=this.itemFromTarget(t),s=n?[n.id]:[];this.setSelection(s);var r=this.getSelection();(r.length>0||o.length>0)&&this.body.emitter.emit("select",{items:r,event:t})}},o.prototype._onAddItem=function(t){if(this.options.selectable&&this.options.editable.add){var e=this,i=this.options.snap||null,o=this.itemFromTarget(t);if(o){var n=e.itemsData.get(o.id);this.options.onUpdate(n,function(t){t&&e.itemsData.getDataSet().update(t)})}else{if(this.options.rtl)var s=r.getAbsoluteRight(this.dom.frame),a=s-t.center.x;else var s=r.getAbsoluteLeft(this.dom.frame),a=t.center.x-s;var h=this.body.util.toTime(a),d=this.body.util.getScale(),l=this.body.util.getStep(),c={start:i?i(h,d,l):h,content:"new item"};if("range"===this.options.type){var u=this.body.util.toTime(a+this.props.width/5);c.end=i?i(u,d,l):u}c[this.itemsData._fieldId]=r.randomUUID();var p=this.groupFromTarget(t);p&&(c.group=p.groupId),c=this._cloneItemData(c),this.options.onAdd(c,function(t){t&&e.itemsData.getDataSet().add(t)})}}},o.prototype._onMultiSelectItem=function(t){if(this.options.selectable){var e=this.itemFromTarget(t);if(e){var i=this.options.multiselect?this.getSelection():[],n=t.srcEvent&&t.srcEvent.shiftKey||!1;if(n&&this.options.multiselect){var s=this.itemsData.get(e.id).group,r=void 0;this.options.multiselectPerGroup&&i.length>0&&(r=this.itemsData.get(i[0]).group),this.options.multiselectPerGroup&&void 0!=r&&r!=s||i.push(e.id);var a=o._getItemRange(this.itemsData.get(i,this.itemOptions));if(!this.options.multiselectPerGroup||r==s){i=[];for(var h in this.items)if(this.items.hasOwnProperty(h)){var d=this.items[h],l=d.data.start,c=void 0!==d.data.end?d.data.end:l;!(l>=a.min&&c<=a.max)||this.options.multiselectPerGroup&&r!=this.itemsData.get(d.id).group||d instanceof v||i.push(d.id)}}}else{var u=i.indexOf(e.id);-1==u?i.push(e.id):i.splice(u,1)}this.setSelection(i),this.body.emitter.emit("select",{items:this.getSelection(),event:t})}}},o._getItemRange=function(t){var e=null,i=null;return t.forEach(function(t){(null==i||t.starte)&&(e=t.end):(null==e||t.start>e)&&(e=t.start)}),{min:i,max:e}},o.prototype.itemFromTarget=function(t){for(var e=t.target;e;){if(e.hasOwnProperty("timeline-item"))return e["timeline-item"];e=e.parentNode}return null},o.prototype.groupFromTarget=function(t){for(var e=t.center?t.center.y:t.clientY,i=0;ia&&ea)return n}else if(0===i&&e0?t.step:1,this.autoScale=!1)},o.prototype.setAutoScale=function(t){this.autoScale=t},o.prototype.setMinimumStep=function(t){if(void 0!=t){var 
e=31104e6,i=2592e6,o=864e5,n=36e5,s=6e4,r=1e3,a=1;1e3*e>t&&(this.scale="year",this.step=1e3),500*e>t&&(this.scale="year",this.step=500),100*e>t&&(this.scale="year",this.step=100),50*e>t&&(this.scale="year",this.step=50),10*e>t&&(this.scale="year",this.step=10),5*e>t&&(this.scale="year",this.step=5),e>t&&(this.scale="year",this.step=1),3*i>t&&(this.scale="month",this.step=3),i>t&&(this.scale="month",this.step=1),5*o>t&&(this.scale="day",this.step=5),2*o>t&&(this.scale="day",this.step=2),o>t&&(this.scale="day",this.step=1),o/2>t&&(this.scale="weekday",this.step=1),4*n>t&&(this.scale="hour",this.step=4),n>t&&(this.scale="hour",this.step=1),15*s>t&&(this.scale="minute",this.step=15),10*s>t&&(this.scale="minute",this.step=10),5*s>t&&(this.scale="minute",this.step=5),s>t&&(this.scale="minute",this.step=1),15*r>t&&(this.scale="second",this.step=15),10*r>t&&(this.scale="second",this.step=10),5*r>t&&(this.scale="second",this.step=5),r>t&&(this.scale="second",this.step=1),200*a>t&&(this.scale="millisecond",this.step=200),100*a>t&&(this.scale="millisecond",this.step=100),50*a>t&&(this.scale="millisecond",this.step=50),10*a>t&&(this.scale="millisecond",this.step=10),5*a>t&&(this.scale="millisecond",this.step=5),a>t&&(this.scale="millisecond",this.step=1)}},o.snap=function(t,e,i){var o=n(t);if("year"==e){var s=o.year()+Math.round(o.month()/12);o.year(Math.round(s/i)*i),o.month(0),o.date(0),o.hours(0),o.minutes(0),o.seconds(0),o.milliseconds(0)}else if("month"==e)o.date()>15?(o.date(1),o.add(1,"month")):o.date(1),o.hours(0),o.minutes(0),o.seconds(0),o.milliseconds(0);else if("day"==e){switch(i){case 5:case 2:o.hours(24*Math.round(o.hours()/24));break;default:o.hours(12*Math.round(o.hours()/12))}o.minutes(0),o.seconds(0),o.milliseconds(0)}else if("weekday"==e){switch(i){case 5:case 2:o.hours(12*Math.round(o.hours()/12));break;default:o.hours(6*Math.round(o.hours()/6))}o.minutes(0),o.seconds(0),o.milliseconds(0)}else if("hour"==e){switch(i){case 4:o.minutes(60*Math.round(o.minutes()/60));break;default:o.minutes(30*Math.round(o.minutes()/30))}o.seconds(0),o.milliseconds(0)}else if("minute"==e){switch(i){case 15:case 10:o.minutes(5*Math.round(o.minutes()/5)),o.seconds(0);break;case 5:o.seconds(60*Math.round(o.seconds()/60));break;default:o.seconds(30*Math.round(o.seconds()/30))}o.milliseconds(0)}else if("second"==e)switch(i){case 15:case 10:o.seconds(5*Math.round(o.seconds()/5)),o.milliseconds(0);break;case 5:o.milliseconds(1e3*Math.round(o.milliseconds()/1e3));break;default:o.milliseconds(500*Math.round(o.milliseconds()/500))}else if("millisecond"==e){var r=i>5?i/2:1;o.milliseconds(Math.round(o.milliseconds()/r)*r)}return o},o.prototype.isMajor=function(){if(1==this.switchedYear)switch(this.switchedYear=!1,this.scale){case"year":case"month":case"weekday":case"day":case"hour":case"minute":case"second":case"millisecond":return!0;default:return!1}else if(1==this.switchedMonth)switch(this.switchedMonth=!1,this.scale){case"weekday":case"day":case"hour":case"minute":case"second":case"millisecond":return!0;default:return!1}else if(1==this.switchedDay)switch(this.switchedDay=!1,this.scale){case"millisecond":case"second":case"minute":case"hour":return!0;default:return!1}var t=this.moment(this.current);switch(this.scale){case"millisecond":return 0==t.milliseconds();case"second":return 0==t.seconds();case"minute":return 0==t.hours()&&0==t.minutes();case"hour":return 0==t.hours();case"weekday":case"day":return 1==t.date();case"month":return 
0==t.month();case"year":return!1;default:return!1}},o.prototype.getLabelMinor=function(t){void 0==t&&(t=this.current);var e=this.format.minorLabels[this.scale];return e&&e.length>0?this.moment(t).format(e):""},o.prototype.getLabelMajor=function(t){void 0==t&&(t=this.current);var e=this.format.majorLabels[this.scale];return e&&e.length>0?this.moment(t).format(e):""},o.prototype.getClassName=function(){function t(t){return t/h%2==0?" vis-even":" vis-odd"}function e(t){return t.isSame(new Date,"day")?" vis-today":t.isSame(s().add(1,"day"),"day")?" vis-tomorrow":t.isSame(s().add(-1,"day"),"day")?" vis-yesterday":""}function i(t){return t.isSame(new Date,"week")?" vis-current-week":""}function o(t){return t.isSame(new Date,"month")?" vis-current-month":""}function n(t){return t.isSame(new Date,"year")?" vis-current-year":""}var s=this.moment,r=this.moment(this.current),a=r.locale?r.locale("en"):r.lang("en"),h=this.step;switch(this.scale){case"millisecond":return t(a.milliseconds()).trim();case"second":return t(a.seconds()).trim();case"minute":return t(a.minutes()).trim();case"hour":var d=a.hours();return 4==this.step&&(d=d+"-h"+(d+4)),"vis-h"+d+e(a)+t(a.hours());case"weekday":return"vis-"+a.format("dddd").toLowerCase()+e(a)+i(a)+t(a.date());case"day":var l=a.date(),c=a.format("MMMM").toLowerCase();return"vis-day"+l+" vis-"+c+o(a)+t(l-1);case"month":return"vis-"+a.format("MMMM").toLowerCase()+o(a)+t(a.month());case"year":var u=a.year();return"vis-year"+u+n(a)+t(u);default:return""}},t.exports=o},function(t,e,i){function o(t,e,i){this.groupId=t,this.subgroups={},this.subgroupIndex=0,this.subgroupOrderer=e&&e.subgroupOrder,this.itemSet=i,this.dom={},this.props={label:{width:0,height:0}},this.className=null,this.items={},this.visibleItems=[],this.orderedItems={byStart:[],byEnd:[]},this.checkRangedItems=!1;var o=this;this.itemSet.body.emitter.on("checkRangedItems",function(){o.checkRangedItems=!0}),this._create(),this.setData(e)}var n=i(1),s=i(37);i(38);o.prototype._create=function(){var t=document.createElement("div");this.itemSet.options.groupEditable.order?t.className="vis-label draggable":t.className="vis-label",this.dom.label=t;var e=document.createElement("div");e.className="vis-inner",t.appendChild(e),this.dom.inner=e;var i=document.createElement("div");i.className="vis-group",i["timeline-group"]=this,this.dom.foreground=i,this.dom.background=document.createElement("div"),this.dom.background.className="vis-group",this.dom.axis=document.createElement("div"),this.dom.axis.className="vis-group",this.dom.marker=document.createElement("div"),this.dom.marker.style.visibility="hidden",this.dom.marker.innerHTML="?",this.dom.background.appendChild(this.dom.marker)},o.prototype.setData=function(t){var e;if(e=this.itemSet.options&&this.itemSet.options.groupTemplate?this.itemSet.options.groupTemplate(t):t&&t.content,e instanceof Element){for(this.dom.inner.appendChild(e);this.dom.inner.firstChild;)this.dom.inner.removeChild(this.dom.inner.firstChild);this.dom.inner.appendChild(e)}else void 0!==e&&null!==e?this.dom.inner.innerHTML=e:this.dom.inner.innerHTML=this.groupId||"";this.dom.label.title=t&&t.title||"",this.dom.inner.firstChild?n.removeClassName(this.dom.inner,"vis-hidden"):n.addClassName(this.dom.inner,"vis-hidden");var 
i=t&&t.className||null;i!=this.className&&(this.className&&(n.removeClassName(this.dom.label,this.className),n.removeClassName(this.dom.foreground,this.className),n.removeClassName(this.dom.background,this.className),n.removeClassName(this.dom.axis,this.className)),n.addClassName(this.dom.label,i),n.addClassName(this.dom.foreground,i),n.addClassName(this.dom.background,i),n.addClassName(this.dom.axis,i),this.className=i),this.style&&(n.removeCssText(this.dom.label,this.style),this.style=null),t&&t.style&&(n.addCssText(this.dom.label,t.style),this.style=t.style)},o.prototype.getLabelWidth=function(){return this.props.label.width},o.prototype.redraw=function(t,e,i){var o=!1,r=this.dom.marker.clientHeight;if(r!=this.lastMarkerHeight&&(this.lastMarkerHeight=r,n.forEach(this.items,function(t){t.dirty=!0,t.displayed&&t.redraw()}),i=!0),this._calculateSubGroupHeights(),"function"==typeof this.itemSet.options.order){if(i){var a=this,h=!1;n.forEach(this.items,function(t){t.displayed||(t.redraw(),a.visibleItems.push(t)),t.repositionX(h)});var d=this.orderedItems.byStart.slice().sort(function(t,e){return a.itemSet.options.order(t.data,e.data)});s.stack(d,e,!0)}this.visibleItems=this._updateVisibleItems(this.orderedItems,this.visibleItems,t)}else this.visibleItems=this._updateVisibleItems(this.orderedItems,this.visibleItems,t),this.itemSet.options.stack?s.stack(this.visibleItems,e,i):s.nostack(this.visibleItems,e,this.subgroups);var l=this._calculateHeight(e),c=this.dom.foreground;this.top=c.offsetTop,this.right=c.offsetLeft,this.width=c.offsetWidth,o=n.updateProperty(this,"height",l)||o,o=n.updateProperty(this.props.label,"width",this.dom.inner.clientWidth)||o,o=n.updateProperty(this.props.label,"height",this.dom.inner.clientHeight)||o,this.dom.background.style.height=l+"px",this.dom.foreground.style.height=l+"px",this.dom.label.style.height=l+"px";for(var u=0,p=this.visibleItems.length;p>u;u++){var f=this.visibleItems[u];f.repositionY(e)}return o},o.prototype._calculateSubGroupHeights=function(){if(Object.keys(this.subgroups).length>0){var t=this;this.resetSubgroups(),n.forEach(this.visibleItems,function(e){void 0!==e.data.subgroup&&(t.subgroups[e.data.subgroup].height=Math.max(t.subgroups[e.data.subgroup].height,e.height),t.subgroups[e.data.subgroup].visible=!0)})}},o.prototype._calculateHeight=function(t){var e,i=this.visibleItems;if(i.length>0){var o=i[0].top,s=i[0].top+i[0].height;if(n.forEach(i,function(t){o=Math.min(o,t.top),s=Math.max(s,t.top+t.height)}),o>t.axis){var r=o-t.axis;s-=r,n.forEach(i,function(t){t.top-=r})}e=s+t.item.vertical/2}else e=0;return e=Math.max(e,this.props.label.height)},o.prototype.show=function(){this.dom.label.parentNode||this.itemSet.dom.labelSet.appendChild(this.dom.label),this.dom.foreground.parentNode||this.itemSet.dom.foreground.appendChild(this.dom.foreground),this.dom.background.parentNode||this.itemSet.dom.background.appendChild(this.dom.background),this.dom.axis.parentNode||this.itemSet.dom.axis.appendChild(this.dom.axis)},o.prototype.hide=function(){var t=this.dom.label;t.parentNode&&t.parentNode.removeChild(t);var e=this.dom.foreground;e.parentNode&&e.parentNode.removeChild(e);var i=this.dom.background;i.parentNode&&i.parentNode.removeChild(i);var o=this.dom.axis;o.parentNode&&o.parentNode.removeChild(o)},o.prototype.add=function(t){if(this.items[t.id]=t,t.setParent(this),void 0!==t.data.subgroup&&(void 
0===this.subgroups[t.data.subgroup]&&(this.subgroups[t.data.subgroup]={height:0,visible:!1,index:this.subgroupIndex,items:[]},this.subgroupIndex++),this.subgroups[t.data.subgroup].items.push(t)),this.orderSubgroups(),-1==this.visibleItems.indexOf(t)){var e=this.itemSet.body.range;this._checkIfVisible(t,this.visibleItems,e)}},o.prototype.orderSubgroups=function(){if(void 0!==this.subgroupOrderer){var t=[];if("string"==typeof this.subgroupOrderer){for(var e in this.subgroups)t.push({subgroup:e,sortField:this.subgroups[e].items[0].data[this.subgroupOrderer]});t.sort(function(t,e){return t.sortField-e.sortField})}else if("function"==typeof this.subgroupOrderer){for(var e in this.subgroups)t.push(this.subgroups[e].items[0].data);t.sort(this.subgroupOrderer)}if(t.length>0)for(var i=0;it?-1:l>=t?0:1};if(e.length>0)for(s=0;sl}),1==this.checkRangedItems)for(this.checkRangedItems=!1,s=0;sl})}for(s=0;s=0&&(s=e[r],!n(s));r--)void 0===o[s.id]&&(o[s.id]=!0,i.push(s));for(r=t+1;rn;n++)t[n].top=null;for(n=0,s=t.length;s>n;n++){var r=t[n];if(r.stack&&null===r.top){r.top=i.axis;do{for(var a=null,h=0,d=t.length;d>h;h++){var l=t[h];if(null!==l.top&&l!==r&&l.stack&&e.collision(r,l,i.item,l.options.rtl)){a=l;break}}null!=a&&(r.top=a.top+a.height+i.item.vertical)}while(a)}}},e.nostack=function(t,e,i){var o,n,s;for(o=0,n=t.length;n>o;o++)if(void 0!==t[o].data.subgroup){s=e.axis;for(var r in i)i.hasOwnProperty(r)&&1==i[r].visible&&i[r].indexe.right&&t.top-o.vertical+ie.top:t.left-o.horizontal+ie.left&&t.top-o.vertical+ie.top}},function(t,e,i){function o(t,e,i){if(this.props={content:{width:0}},this.overflow=!1,this.options=i,t){if(void 0==t.start)throw new Error('Property "start" missing in item '+t.id);if(void 0==t.end)throw new Error('Property "end" missing in item '+t.id)}n.call(this,t,e,i)}var n=(i(20),i(39));o.prototype=new n(null,null,null),o.prototype.baseClassName="vis-item vis-range",o.prototype.isVisible=function(t){return this.data.startt.start},o.prototype.redraw=function(){var t=this.dom;if(t||(this.dom={},t=this.dom,t.box=document.createElement("div"),t.frame=document.createElement("div"),t.frame.className="vis-item-overflow",t.box.appendChild(t.frame),t.content=document.createElement("div"),t.content.className="vis-item-content",t.frame.appendChild(t.content),t.box["timeline-item"]=this,this.dirty=!0),!this.parent)throw new Error("Cannot redraw item: no parent attached");if(!t.box.parentNode){var e=this.parent.dom.foreground;if(!e)throw new Error("Cannot redraw item: parent has no foreground container element");e.appendChild(t.box)}if(this.displayed=!0,this.dirty){this._updateContents(this.dom.content),this._updateTitle(this.dom.box),this._updateDataAttributes(this.dom.box),this._updateStyle(this.dom.box);var i=(this.options.editable.updateTime||this.options.editable.updateGroup||this.editable===!0)&&this.editable!==!1,o=(this.data.className?" "+this.data.className:"")+(this.selected?" vis-selected":"")+(i?" 
vis-editable":" vis-readonly");t.box.className=this.baseClassName+o,this.overflow="hidden"!==window.getComputedStyle(t.frame).overflow,this.dom.content.style.maxWidth="none",this.props.content.width=this.dom.content.offsetWidth,this.height=this.dom.box.offsetHeight,this.dom.content.style.maxWidth="",this.dirty=!1}this._repaintDeleteButton(t.box),this._repaintDragLeft(),this._repaintDragRight()},o.prototype.show=function(){this.displayed||this.redraw()},o.prototype.hide=function(){if(this.displayed){var t=this.dom.box;t.parentNode&&t.parentNode.removeChild(t),this.displayed=!1}},o.prototype.repositionX=function(t){var e,i,o=this.parent.width,n=this.conversion.toScreen(this.data.start),s=this.conversion.toScreen(this.data.end);void 0!==t&&t!==!0||(-o>n&&(n=-o),s>2*o&&(s=2*o));var r=Math.max(s-n,1);switch(this.overflow?(this.options.rtl?this.right=n:this.left=n,this.width=r+this.props.content.width,i=this.props.content.width):(this.options.rtl?this.right=n:this.left=n,this.width=r,i=Math.min(s-n,this.props.content.width)),this.options.rtl?this.dom.box.style.right=this.right+"px":this.dom.box.style.left=this.left+"px",this.dom.box.style.width=r+"px",this.options.align){case"left":this.options.rtl?this.dom.content.style.right="0":this.dom.content.style.left="0";break;case"right":this.options.rtl?this.dom.content.style.right=Math.max(r-i,0)+"px":this.dom.content.style.left=Math.max(r-i,0)+"px";break;case"center":this.options.rtl?this.dom.content.style.right=Math.max((r-i)/2,0)+"px":this.dom.content.style.left=Math.max((r-i)/2,0)+"px";break;default:e=this.overflow?s>0?Math.max(-n,0):-i:0>n?-n:0,this.options.rtl?this.dom.content.style.right=e+"px":this.dom.content.style.left=e+"px"}},o.prototype.repositionY=function(){var t=this.options.orientation.item,e=this.dom.box;"top"==t?e.style.top=this.top+"px":e.style.top=this.parent.height-this.top-this.height+"px"},o.prototype._repaintDragLeft=function(){if(this.selected&&this.options.editable.updateTime&&!this.dom.dragLeft){var t=document.createElement("div");t.className="vis-drag-left",t.dragLeftItem=this,this.dom.box.appendChild(t),this.dom.dragLeft=t}else!this.selected&&this.dom.dragLeft&&(this.dom.dragLeft.parentNode&&this.dom.dragLeft.parentNode.removeChild(this.dom.dragLeft),this.dom.dragLeft=null)},o.prototype._repaintDragRight=function(){if(this.selected&&this.options.editable.updateTime&&!this.dom.dragRight){var t=document.createElement("div");t.className="vis-drag-right",t.dragRightItem=this,this.dom.box.appendChild(t),this.dom.dragRight=t}else!this.selected&&this.dom.dragRight&&(this.dom.dragRight.parentNode&&this.dom.dragRight.parentNode.removeChild(this.dom.dragRight),this.dom.dragRight=null)},t.exports=o},function(t,e,i){function o(t,e,i){this.id=null,this.parent=null,this.data=t,this.dom=null,this.conversion=e||{},this.options=i||{},this.selected=!1,this.displayed=!1,this.dirty=!0,this.top=null,this.right=null,this.left=null,this.width=null,this.height=null,this.editable=null,this.data&&this.data.hasOwnProperty("editable")&&"boolean"==typeof this.data.editable&&(this.editable=t.editable)}var n=i(20),s=i(1);o.prototype.stack=!0,o.prototype.select=function(){this.selected=!0,this.dirty=!0,this.displayed&&this.redraw()},o.prototype.unselect=function(){this.selected=!1,this.dirty=!0,this.displayed&&this.redraw()},o.prototype.setData=function(t){var e=void 0!=t.group&&this.data.group!=t.group;e&&this.parent.itemSet._moveToGroup(this,t.group),t.hasOwnProperty("editable")&&"boolean"==typeof 
t.editable&&(this.editable=t.editable),this.data=t,this.dirty=!0,this.displayed&&this.redraw()},o.prototype.setParent=function(t){this.displayed?(this.hide(),this.parent=t,this.parent&&this.show()):this.parent=t},o.prototype.isVisible=function(t){return!1},o.prototype.show=function(){return!1},o.prototype.hide=function(){return!1},o.prototype.redraw=function(){},o.prototype.repositionX=function(){},o.prototype.repositionY=function(){},o.prototype._repaintDeleteButton=function(t){var e=(this.options.editable.remove||this.data.editable===!0)&&this.data.editable!==!1;if(this.selected&&e&&!this.dom.deleteButton){var i=this,o=document.createElement("div");this.options.rtl?o.className="vis-delete-rtl":o.className="vis-delete",o.title="Delete this item",new n(o).on("tap",function(t){t.stopPropagation(),i.parent.removeFromDataSet(i)}),t.appendChild(o),this.dom.deleteButton=o}else!this.selected&&this.dom.deleteButton&&(this.dom.deleteButton.parentNode&&this.dom.deleteButton.parentNode.removeChild(this.dom.deleteButton),this.dom.deleteButton=null)},o.prototype._updateContents=function(t){var e;if(this.options.template){var i=this.parent.itemSet.itemsData.get(this.id);e=this.options.template(i)}else e=this.data.content;var o=this._contentToString(this.content)!==this._contentToString(e);if(o){if(e instanceof Element)t.innerHTML="",t.appendChild(e);else if(void 0!=e)t.innerHTML=e;else if("background"!=this.data.type||void 0!==this.data.content)throw new Error('Property "content" missing in item '+this.id);this.content=e}},o.prototype._updateTitle=function(t){null!=this.data.title?t.title=this.data.title||"":t.removeAttribute("vis-title")},o.prototype._updateDataAttributes=function(t){if(this.options.dataAttributes&&this.options.dataAttributes.length>0){var e=[];if(Array.isArray(this.options.dataAttributes))e=this.options.dataAttributes;else{if("all"!=this.options.dataAttributes)return;e=Object.keys(this.data)}for(var i=0;in;n++){var r=this.visibleItems[n];r.repositionY(e)}return o},o.prototype.show=function(){this.dom.background.parentNode||this.itemSet.dom.background.appendChild(this.dom.background)},t.exports=o},function(t,e,i){function o(t,e,i){if(this.props={dot:{width:0,height:0},line:{width:0,height:0}},this.options=i,t&&void 0==t.start)throw new Error('Property "start" missing in item '+t);n.call(this,t,e,i)}var n=i(39);i(1);o.prototype=new n(null,null,null),o.prototype.isVisible=function(t){var e=(t.end-t.start)/4;return this.data.start>t.start-e&&this.data.startt.start-e&&this.data.startt.start},o.prototype.redraw=function(){var t=this.dom;if(t||(this.dom={},t=this.dom,t.box=document.createElement("div"),t.frame=document.createElement("div"),t.frame.className="vis-item-overflow",t.box.appendChild(t.frame),t.content=document.createElement("div"),t.content.className="vis-item-content",t.frame.appendChild(t.content),this.dirty=!0),!this.parent)throw new Error("Cannot redraw item: no parent attached");if(!t.box.parentNode){var e=this.parent.dom.background;if(!e)throw new Error("Cannot redraw item: parent has no background container element");e.appendChild(t.box)}if(this.displayed=!0,this.dirty){this._updateContents(this.dom.content),this._updateTitle(this.dom.content),this._updateDataAttributes(this.dom.content),this._updateStyle(this.dom.box);var i=(this.data.className?" "+this.data.className:"")+(this.selected?" 
vis-selected":"");t.box.className=this.baseClassName+i,this.overflow="hidden"!==window.getComputedStyle(t.content).overflow,this.props.content.width=this.dom.content.offsetWidth,this.height=0,this.dirty=!1}},o.prototype.show=r.prototype.show,o.prototype.hide=r.prototype.hide,o.prototype.repositionX=r.prototype.repositionX,o.prototype.repositionY=function(t){var e="top"===this.options.orientation.item;this.dom.content.style.top=e?"":"0",this.dom.content.style.bottom=e?"0":"";var i;if(void 0!==this.data.subgroup){var o=this.data.subgroup,n=this.parent.subgroups,r=n[o].index;if(1==e){i=this.parent.subgroups[o].height+t.item.vertical,i+=0==r?t.axis-.5*t.item.vertical:0;var a=this.parent.top;for(var h in n)n.hasOwnProperty(h)&&1==n[h].visible&&n[h].indexr&&(a+=l)}i=this.parent.subgroups[o].height+t.item.vertical,this.dom.box.style.top=this.parent.height-d+a+"px",this.dom.box.style.bottom=""}}else this.parent instanceof s?(i=Math.max(this.parent.height,this.parent.itemSet.body.domProps.center.height,this.parent.itemSet.body.domProps.centerContainer.height),this.dom.box.style.top=e?"0":"",this.dom.box.style.bottom=e?"":"0"):(i=this.parent.height,this.dom.box.style.top=this.parent.top+"px",this.dom.box.style.bottom="");this.dom.box.style.height=i+"px"},t.exports=o},function(t,e,i){function o(t,e){this.dom={foreground:null,lines:[],majorTexts:[],minorTexts:[],redundant:{lines:[],majorTexts:[],minorTexts:[]}},this.props={range:{start:0,end:0,minimumStep:0},lineTop:0},this.defaultOptions={orientation:{axis:"bottom"},showMinorLabels:!0,showMajorLabels:!0,maxMinorChars:7,format:a.FORMAT,moment:d,timeAxis:null},this.options=s.extend({},this.defaultOptions),this.body=t,this._create(),this.setOptions(e)}var n="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol?"symbol":typeof t},s=i(1),r=i(31),a=i(35),h=i(32),d=i(2);o.prototype=new r,o.prototype.setOptions=function(t){t&&(s.selectiveExtend(["showMinorLabels","showMajorLabels","maxMinorChars","hiddenDates","timeAxis","moment","rtl"],this.options,t),s.selectiveDeepExtend(["format"],this.options,t),"orientation"in t&&("string"==typeof t.orientation?this.options.orientation.axis=t.orientation:"object"===n(t.orientation)&&"axis"in t.orientation&&(this.options.orientation.axis=t.orientation.axis)),"locale"in t&&("function"==typeof d.locale?d.locale(t.locale):d.lang(t.locale)))},o.prototype._create=function(){this.dom.foreground=document.createElement("div"),this.dom.background=document.createElement("div"),this.dom.foreground.className="vis-time-axis vis-foreground",this.dom.background.className="vis-time-axis vis-background"},o.prototype.destroy=function(){this.dom.foreground.parentNode&&this.dom.foreground.parentNode.removeChild(this.dom.foreground),this.dom.background.parentNode&&this.dom.background.parentNode.removeChild(this.dom.background),this.body=null},o.prototype.redraw=function(){var t=this.props,e=this.dom.foreground,i=this.dom.background,o="top"==this.options.orientation.axis?this.body.dom.top:this.body.dom.bottom,n=e.parentNode!==o;this._calculateCharSize();var 
s=this.options.showMinorLabels&&"none"!==this.options.orientation.axis,r=this.options.showMajorLabels&&"none"!==this.options.orientation.axis;t.minorLabelHeight=s?t.minorCharHeight:0,t.majorLabelHeight=r?t.majorCharHeight:0,t.height=t.minorLabelHeight+t.majorLabelHeight,t.width=e.offsetWidth,t.minorLineHeight=this.body.domProps.root.height-t.majorLabelHeight-("top"==this.options.orientation.axis?this.body.domProps.bottom.height:this.body.domProps.top.height),t.minorLineWidth=1,t.majorLineHeight=t.minorLineHeight+t.majorLabelHeight,t.majorLineWidth=1;var a=e.nextSibling,h=i.nextSibling;return e.parentNode&&e.parentNode.removeChild(e),i.parentNode&&i.parentNode.removeChild(i),e.style.height=this.props.height+"px",this._repaintLabels(),a?o.insertBefore(e,a):o.appendChild(e),h?this.body.dom.backgroundVertical.insertBefore(i,h):this.body.dom.backgroundVertical.appendChild(i),this._isResized()||n},o.prototype._repaintLabels=function(){var t=this.options.orientation.axis,e=s.convert(this.body.range.start,"Number"),i=s.convert(this.body.range.end,"Number"),o=this.body.util.toTime((this.props.minorCharWidth||10)*this.options.maxMinorChars).valueOf(),n=o-h.getHiddenDurationBefore(this.options.moment,this.body.hiddenDates,this.body.range,o);n-=this.body.util.toTime(0).valueOf();var r=new a(new Date(e),new Date(i),n,this.body.hiddenDates);r.setMoment(this.options.moment),this.options.format&&r.setFormat(this.options.format),this.options.timeAxis&&r.setScale(this.options.timeAxis),this.step=r;var d=this.dom;d.redundant.lines=d.lines,d.redundant.majorTexts=d.majorTexts,d.redundant.minorTexts=d.minorTexts,d.lines=[],d.majorTexts=[],d.minorTexts=[];var c,u,p,f,m,v,g,y,b,w,_=0,x=void 0,k=0,O=1e3;for(r.start(),u=r.getCurrent(),f=this.body.util.toScreen(u);r.hasNext()&&O>k;){k++,m=r.isMajor(),w=r.getClassName(),b=r.getLabelMinor(),c=u,p=f,r.next(),u=r.getCurrent(),v=r.isMajor(),f=this.body.util.toScreen(u),g=_,_=f-p;var M=_>=.4*g;if(this.options.showMinorLabels&&M){var D=this._repaintMinorText(p,b,t,w);D.style.width=_+"px"}m&&this.options.showMajorLabels?(p>0&&(void 0==x&&(x=p),D=this._repaintMajorText(p,r.getLabelMajor(),t,w)),y=this._repaintMajorLine(p,_,t,w)):M?y=this._repaintMinorLine(p,_,t,w):y&&(y.style.width=parseInt(y.style.width)+_+"px")}if(k!==O||l||(console.warn("Something is wrong with the Timeline scale. 
Limited drawing of grid lines to "+O+" lines."),l=!0),this.options.showMajorLabels){var S=this.body.util.toTime(0),C=r.getLabelMajor(S),T=C.length*(this.props.majorCharWidth||10)+10;(void 0==x||x>T)&&this._repaintMajorText(0,C,t,w)}s.forEach(this.dom.redundant,function(t){for(;t.length;){var e=t.pop();e&&e.parentNode&&e.parentNode.removeChild(e)}})},o.prototype._repaintMinorText=function(t,e,i,o){var n=this.dom.redundant.minorTexts.shift();if(!n){var s=document.createTextNode("");n=document.createElement("div"),n.appendChild(s),this.dom.foreground.appendChild(n)}return this.dom.minorTexts.push(n),n.childNodes[0].nodeValue=e,n.style.top="top"==i?this.props.majorLabelHeight+"px":"0",this.options.rtl?(n.style.left="",n.style.right=t+"px"):n.style.left=t+"px",n.className="vis-text vis-minor "+o,n},o.prototype._repaintMajorText=function(t,e,i,o){var n=this.dom.redundant.majorTexts.shift();if(!n){var s=document.createTextNode(e);n=document.createElement("div"),n.appendChild(s),this.dom.foreground.appendChild(n)}return this.dom.majorTexts.push(n),n.childNodes[0].nodeValue=e,n.className="vis-text vis-major "+o,n.style.top="top"==i?"0":this.props.minorLabelHeight+"px",this.options.rtl?(n.style.left="",n.style.right=t+"px"):n.style.left=t+"px",n},o.prototype._repaintMinorLine=function(t,e,i,o){var n=this.dom.redundant.lines.shift();n||(n=document.createElement("div"),this.dom.background.appendChild(n)),this.dom.lines.push(n);var s=this.props;return"top"==i?n.style.top=s.majorLabelHeight+"px":n.style.top=this.body.domProps.top.height+"px",n.style.height=s.minorLineHeight+"px",this.options.rtl?(n.style.left="",n.style.right=t-s.minorLineWidth/2+"px",n.className="vis-grid vis-vertical-rtl vis-minor "+o):(n.style.left=t-s.minorLineWidth/2+"px",n.className="vis-grid vis-vertical vis-minor "+o),n.style.width=e+"px",n},o.prototype._repaintMajorLine=function(t,e,i,o){var n=this.dom.redundant.lines.shift();n||(n=document.createElement("div"),this.dom.background.appendChild(n)),this.dom.lines.push(n);var s=this.props;return"top"==i?n.style.top="0":n.style.top=this.body.domProps.top.height+"px",this.options.rtl?(n.style.left="",n.style.right=t-s.majorLineWidth/2+"px",n.className="vis-grid vis-vertical-rtl vis-major "+o):(n.style.left=t-s.majorLineWidth/2+"px",n.className="vis-grid vis-vertical vis-major "+o),n.style.height=s.majorLineHeight+"px",n.style.width=e+"px",n},o.prototype._calculateCharSize=function(){this.dom.measureCharMinor||(this.dom.measureCharMinor=document.createElement("DIV"),this.dom.measureCharMinor.className="vis-text vis-minor vis-measure",this.dom.measureCharMinor.style.position="absolute",this.dom.measureCharMinor.appendChild(document.createTextNode("0")),this.dom.foreground.appendChild(this.dom.measureCharMinor)),this.props.minorCharHeight=this.dom.measureCharMinor.clientHeight,this.props.minorCharWidth=this.dom.measureCharMinor.clientWidth,this.dom.measureCharMajor||(this.dom.measureCharMajor=document.createElement("DIV"),this.dom.measureCharMajor.className="vis-text vis-major vis-measure",this.dom.measureCharMajor.style.position="absolute",this.dom.measureCharMajor.appendChild(document.createTextNode("0")),this.dom.foreground.appendChild(this.dom.measureCharMajor)),this.props.majorCharHeight=this.dom.measureCharMajor.clientHeight,this.props.majorCharWidth=this.dom.measureCharMajor.clientWidth};var l=!1;t.exports=o},function(t,e,i){function 
o(t){this.active=!1,this.dom={container:t},this.dom.overlay=document.createElement("div"),this.dom.overlay.className="vis-overlay",this.dom.container.appendChild(this.dom.overlay),this.hammer=a(this.dom.overlay),this.hammer.on("tap",this._onTapOverlay.bind(this));var e=this,i=["tap","doubletap","press","pinch","pan","panstart","panmove","panend"];i.forEach(function(t){e.hammer.on(t,function(t){t.stopPropagation()})}),document&&document.body&&(this.onClick=function(i){n(i.target,t)||e.deactivate()},document.body.addEventListener("click",this.onClick)),void 0!==this.keycharm&&this.keycharm.destroy(),this.keycharm=s(),this.escListener=this.deactivate.bind(this)}function n(t,e){for(;t;){if(t===e)return!0;t=t.parentNode}return!1}var s=i(23),r=i(13),a=i(20),h=i(1);r(o.prototype),o.current=null,o.prototype.destroy=function(){this.deactivate(),this.dom.overlay.parentNode.removeChild(this.dom.overlay),this.onClick&&document.body.removeEventListener("click",this.onClick),this.hammer.destroy(),this.hammer=null},o.prototype.activate=function(){o.current&&o.current.deactivate(),o.current=this,this.active=!0,this.dom.overlay.style.display="none",h.addClassName(this.dom.container,"vis-active"),this.emit("change"),this.emit("activate"),this.keycharm.bind("esc",this.escListener)},o.prototype.deactivate=function(){this.active=!1,this.dom.overlay.style.display="",h.removeClassName(this.dom.container,"vis-active"),this.keycharm.unbind("esc",this.escListener),this.emit("change"),this.emit("deactivate")},o.prototype._onTapOverlay=function(t){this.activate(),t.stopPropagation()},t.exports=o},function(t,e,i){function o(t,e){this.body=t,this.defaultOptions={moment:a,locales:h,locale:"en",id:void 0,title:void 0},this.options=s.extend({},this.defaultOptions),e&&e.time?this.customTime=e.time:this.customTime=new Date,this.eventParams={},this.setOptions(e),this._create()}var n=i(20),s=i(1),r=i(31),a=i(2),h=i(47);o.prototype=new r,o.prototype.setOptions=function(t){t&&s.selectiveExtend(["moment","locale","locales","id"],this.options,t)},o.prototype._create=function(){var t=document.createElement("div");t["custom-time"]=this,t.className="vis-custom-time "+(this.options.id||""),t.style.position="absolute",t.style.top="0px",t.style.height="100%",this.bar=t;var e=document.createElement("div");e.style.position="relative",e.style.top="0px",e.style.left="-10px",e.style.height="100%",e.style.width="20px",t.appendChild(e),this.hammer=new n(e),this.hammer.on("panstart",this._onDragStart.bind(this)),this.hammer.on("panmove",this._onDrag.bind(this)),this.hammer.on("panend",this._onDragEnd.bind(this)),this.hammer.get("pan").set({threshold:5,direction:n.DIRECTION_HORIZONTAL})},o.prototype.destroy=function(){this.hide(),this.hammer.destroy(),this.hammer=null,this.body=null},o.prototype.redraw=function(){var t=this.body.dom.backgroundVertical;this.bar.parentNode!=t&&(this.bar.parentNode&&this.bar.parentNode.removeChild(this.bar),t.appendChild(this.bar));var e=this.body.util.toScreen(this.customTime),i=this.options.locales[this.options.locale];i||(this.warned||(console.log("WARNING: options.locales['"+this.options.locale+"'] not found. 
See http://visjs.org/docs/timeline.html#Localization"),this.warned=!0),i=this.options.locales.en);var o=this.options.title;return void 0===o&&(o=i.time+": "+this.options.moment(this.customTime).format("dddd, MMMM Do YYYY, H:mm:ss"),o=o.charAt(0).toUpperCase()+o.substring(1)),this.bar.style.left=e+"px",this.bar.title=o,!1},o.prototype.hide=function(){this.bar.parentNode&&this.bar.parentNode.removeChild(this.bar)},o.prototype.setCustomTime=function(t){this.customTime=s.convert(t,"Date"),this.redraw()},o.prototype.getCustomTime=function(){return new Date(this.customTime.valueOf())},o.prototype.setCustomTitle=function(t){this.options.title=t},o.prototype._onDragStart=function(t){this.eventParams.dragging=!0,this.eventParams.customTime=this.customTime,t.stopPropagation()},o.prototype._onDrag=function(t){if(this.eventParams.dragging){var e=this.body.util.toScreen(this.eventParams.customTime)+t.deltaX,i=this.body.util.toTime(e);this.setCustomTime(i),this.body.emitter.emit("timechange",{id:this.options.id,time:new Date(this.customTime.valueOf()) -}),t.stopPropagation()}},o.prototype._onDragEnd=function(t){this.eventParams.dragging&&(this.body.emitter.emit("timechanged",{id:this.options.id,time:new Date(this.customTime.valueOf())}),t.stopPropagation())},o.customTimeFromTarget=function(t){for(var e=t.target;e;){if(e.hasOwnProperty("custom-time"))return e["custom-time"];e=e.parentNode}return null},t.exports=o},function(t,e){e.en={current:"current",time:"time"},e.en_EN=e.en,e.en_US=e.en,e.nl={current:"huidige",time:"tijd"},e.nl_NL=e.nl,e.nl_BE=e.nl},function(t,e,i){function o(t,e){this.body=t,this.defaultOptions={rtl:!1,showCurrentTime:!0,moment:r,locales:a,locale:"en"},this.options=n.extend({},this.defaultOptions),this.offset=0,this._create(),this.setOptions(e)}var n=i(1),s=i(31),r=i(2),a=i(47);o.prototype=new s,o.prototype._create=function(){var t=document.createElement("div");t.className="vis-current-time",t.style.position="absolute",t.style.top="0px",t.style.height="100%",this.bar=t},o.prototype.destroy=function(){this.options.showCurrentTime=!1,this.redraw(),this.body=null},o.prototype.setOptions=function(t){t&&n.selectiveExtend(["rtl","showCurrentTime","moment","locale","locales"],this.options,t)},o.prototype.redraw=function(){if(this.options.showCurrentTime){var t=this.body.dom.backgroundVertical;this.bar.parentNode!=t&&(this.bar.parentNode&&this.bar.parentNode.removeChild(this.bar),t.appendChild(this.bar),this.start());var e=this.options.moment((new Date).valueOf()+this.offset),i=this.body.util.toScreen(e),o=this.options.locales[this.options.locale];o||(this.warned||(console.log("WARNING: options.locales['"+this.options.locale+"'] not found. 
See http://visjs.org/docs/timeline/#Localization"),this.warned=!0),o=this.options.locales.en);var n=o.current+" "+o.time+": "+e.format("dddd, MMMM Do YYYY, H:mm:ss");n=n.charAt(0).toUpperCase()+n.substring(1),this.options.rtl?this.bar.style.right=i+"px":this.bar.style.left=i+"px",this.bar.title=n}else this.bar.parentNode&&this.bar.parentNode.removeChild(this.bar),this.stop();return!1},o.prototype.start=function(){function t(){e.stop();var i=e.body.range.conversion(e.body.domProps.center.width).scale,o=1/i/10;30>o&&(o=30),o>1e3&&(o=1e3),e.redraw(),e.body.emitter.emit("currentTimeTick"),e.currentTimeTimer=setTimeout(t,o)}var e=this;t()},o.prototype.stop=function(){void 0!==this.currentTimeTimer&&(clearTimeout(this.currentTimeTimer),delete this.currentTimeTimer)},o.prototype.setCurrentTime=function(t){var e=n.convert(t,"Date").valueOf(),i=(new Date).valueOf();this.offset=e-i,this.redraw()},o.prototype.getCurrentTime=function(){return new Date((new Date).valueOf()+this.offset)},t.exports=o},function(t,e){Object.defineProperty(e,"__esModule",{value:!0});var i="string",o="boolean",n="number",s="array",r="date",a="object",h="dom",d="moment",l="any",c={configure:{enabled:{"boolean":o},filter:{"boolean":o,"function":"function"},container:{dom:h},__type__:{object:a,"boolean":o,"function":"function"}},align:{string:i},rtl:{"boolean":o,undefined:"undefined"},autoResize:{"boolean":o},throttleRedraw:{number:n},clickToUse:{"boolean":o},dataAttributes:{string:i,array:s},editable:{add:{"boolean":o,undefined:"undefined"},remove:{"boolean":o,undefined:"undefined"},updateGroup:{"boolean":o,undefined:"undefined"},updateTime:{"boolean":o,undefined:"undefined"},__type__:{"boolean":o,object:a}},end:{number:n,date:r,string:i,moment:d},format:{minorLabels:{millisecond:{string:i,undefined:"undefined"},second:{string:i,undefined:"undefined"},minute:{string:i,undefined:"undefined"},hour:{string:i,undefined:"undefined"},weekday:{string:i,undefined:"undefined"},day:{string:i,undefined:"undefined"},month:{string:i,undefined:"undefined"},year:{string:i,undefined:"undefined"},__type__:{object:a}},majorLabels:{millisecond:{string:i,undefined:"undefined"},second:{string:i,undefined:"undefined"},minute:{string:i,undefined:"undefined"},hour:{string:i,undefined:"undefined"},weekday:{string:i,undefined:"undefined"},day:{string:i,undefined:"undefined"},month:{string:i,undefined:"undefined"},year:{string:i,undefined:"undefined"},__type__:{object:a}},__type__:{object:a}},moment:{"function":"function"},groupOrder:{string:i,"function":"function"},groupEditable:{add:{"boolean":o,undefined:"undefined"},remove:{"boolean":o,undefined:"undefined"},order:{"boolean":o,undefined:"undefined"},__type__:{"boolean":o,object:a}},groupOrderSwap:{"function":"function"},height:{string:i,number:n},hiddenDates:{start:{date:r,number:n,string:i,moment:d},end:{date:r,number:n,string:i,moment:d},repeat:{string:i},__type__:{object:a,array:s}},itemsAlwaysDraggable:{"boolean":o},locale:{string:i},locales:{__any__:{any:l},__type__:{object:a}},margin:{axis:{number:n},item:{horizontal:{number:n,undefined:"undefined"},vertical:{number:n,undefined:"undefined"},__type__:{object:a,number:n}},__type__:{object:a,number:n}},max:{date:r,number:n,string:i,moment:d},maxHeight:{number:n,string:i},maxMinorChars:{number:n},min:{date:r,number:n,string:i,moment:d},minHeight:{number:n,string:i},moveable:{"boolean":o},multiselect:{"boolean":o},multiselectPerGroup:{"boolean":o},onAdd:{"function":"function"},onUpdate:{"function":"function"},onMove:{"function":"function"},onMoving:
{"function":"function"},onRemove:{"function":"function"},onAddGroup:{"function":"function"},onMoveGroup:{"function":"function"},onRemoveGroup:{"function":"function"},order:{"function":"function"},orientation:{axis:{string:i,undefined:"undefined"},item:{string:i,undefined:"undefined"},__type__:{string:i,object:a}},selectable:{"boolean":o},showCurrentTime:{"boolean":o},showMajorLabels:{"boolean":o},showMinorLabels:{"boolean":o},stack:{"boolean":o},snap:{"function":"function","null":"null"},start:{date:r,number:n,string:i,moment:d},template:{"function":"function"},groupTemplate:{"function":"function"},timeAxis:{scale:{string:i,undefined:"undefined"},step:{number:n,undefined:"undefined"},__type__:{object:a}},type:{string:i},width:{string:i,number:n},zoomable:{"boolean":o},zoomKey:{string:["ctrlKey","altKey","metaKey",""]},zoomMax:{number:n},zoomMin:{number:n},__type__:{object:a}},u={global:{align:["center","left","right"],direction:!1,autoResize:!0,throttleRedraw:[10,0,1e3,10],clickToUse:!1,editable:{add:!1,remove:!1,updateGroup:!1,updateTime:!1},end:"",format:{minorLabels:{millisecond:"SSS",second:"s",minute:"HH:mm",hour:"HH:mm",weekday:"ddd D",day:"D",month:"MMM",year:"YYYY"},majorLabels:{millisecond:"HH:mm:ss",second:"D MMMM HH:mm",minute:"ddd D MMMM",hour:"ddd D MMMM",weekday:"MMMM YYYY",day:"MMMM YYYY",month:"YYYY",year:""}},groupsDraggable:!1,height:"",locale:"",margin:{axis:[20,0,100,1],item:{horizontal:[10,0,100,1],vertical:[10,0,100,1]}},max:"",maxHeight:"",maxMinorChars:[7,0,20,1],min:"",minHeight:"",moveable:!1,multiselect:!1,multiselectPerGroup:!1,orientation:{axis:["both","bottom","top"],item:["bottom","top"]},selectable:!0,showCurrentTime:!1,showMajorLabels:!0,showMinorLabels:!0,stack:!0,start:"",type:["box","point","range","background"],width:"100%",zoomable:!0,zoomKey:["ctrlKey","altKey","metaKey",""],zoomMax:[31536e10,10,31536e10,1],zoomMin:[10,10,31536e10,1]}};e.allOptions=c,e.configureOptions=u},function(t,e,i){function o(t){return t&&t.__esModule?t:{"default":t}}function n(t,e,i,o){if(!(Array.isArray(i)||i instanceof c||i instanceof u)&&i instanceof Object){var n=o;o=i,i=n}var s=this;this.defaultOptions={start:null,end:null,autoResize:!0,orientation:{axis:"bottom",item:"bottom"},moment:d,width:null,height:null,maxHeight:null,minHeight:null},this.options=l.deepExtend({},this.defaultOptions),this._create(t),this.components=[],this.body={dom:this.dom,domProps:this.props,emitter:{on:this.on.bind(this),off:this.off.bind(this),emit:this.emit.bind(this)},hiddenDates:[],util:{toScreen:s._toScreen.bind(s),toGlobalScreen:s._toGlobalScreen.bind(s),toTime:s._toTime.bind(s),toGlobalTime:s._toGlobalTime.bind(s)}},this.range=new p(this.body),this.components.push(this.range),this.body.range=this.range,this.timeAxis=new m(this.body),this.components.push(this.timeAxis),this.currentTime=new v(this.body),this.components.push(this.currentTime),this.linegraph=new y(this.body),this.components.push(this.linegraph),this.itemsData=null,this.groupsData=null,this.on("tap",function(t){s.emit("click",s.getEventProperties(t))}),this.on("doubletap",function(t){s.emit("doubleClick",s.getEventProperties(t))}),this.dom.root.oncontextmenu=function(t){s.emit("contextmenu",s.getEventProperties(t))},o&&this.setOptions(o),i&&this.setGroups(i),e&&this.setItems(e),this._redraw()}var s=i(26),r=o(s),a=i(29),h=o(a),d=(i(13),i(20),i(2)),l=i(1),c=i(9),u=i(11),p=i(30),f=i(33),m=i(44),v=i(48),g=i(46),y=i(51),b=i(29).printStyle,w=i(59).allOptions,_=i(59).configureOptions;n.prototype=new 
[Minified vis.js bundle content: Graph2d/Timeline line graph, data axis and data scale, graph groups, line/bar/point renderers, legend, network node shapes and labels, edge and Bezier edge types, Barnes-Hut and spring physics solvers, canvas renderer, canvas, and view modules.]
cannot be found.")}},{key:"moveTo",value:function(t){return void 0===t?void(t={}):(void 0===t.offset&&(t.offset={x:0,y:0}),void 0===t.offset.x&&(t.offset.x=0),void 0===t.offset.y&&(t.offset.y=0),void 0===t.scale&&(t.scale=this.body.view.scale),void 0===t.position&&(t.position=this.getViewPosition()),void 0===t.animation&&(t.animation={duration:0}),t.animation===!1&&(t.animation={duration:0}),t.animation===!0&&(t.animation={}),void 0===t.animation.duration&&(t.animation.duration=1e3),void 0===t.animation.easingFunction&&(t.animation.easingFunction="easeInOutQuad"),void this.animateView(t))}},{key:"animateView",value:function(t){if(void 0!==t){this.animationEasingFunction=t.animation.easingFunction,this.releaseNode(),t.locked===!0&&(this.lockedOnNodeId=t.lockedOnNode,this.lockedOnNodeOffset=t.offset),0!=this.easingTime&&this._transitionRedraw(!0),this.sourceScale=this.body.view.scale,this.sourceTranslation=this.body.view.translation,this.targetScale=t.scale,this.body.view.scale=this.targetScale;var e=this.canvas.DOMtoCanvas({x:.5*this.canvas.frame.canvas.clientWidth,y:.5*this.canvas.frame.canvas.clientHeight}),i={x:e.x-t.position.x,y:e.y-t.position.y};this.targetTranslation={x:this.sourceTranslation.x+i.x*this.targetScale+t.offset.x,y:this.sourceTranslation.y+i.y*this.targetScale+t.offset.y},0===t.animation.duration?void 0!=this.lockedOnNodeId?(this.viewFunction=this._lockedRedraw.bind(this),this.body.emitter.on("initRedraw",this.viewFunction)):(this.body.view.scale=this.targetScale,this.body.view.translation=this.targetTranslation,this.body.emitter.emit("_requestRedraw")):(this.animationSpeed=1/(60*t.animation.duration*.001)||1/60,this.animationEasingFunction=t.animation.easingFunction,this.viewFunction=this._transitionRedraw.bind(this),this.body.emitter.on("initRedraw",this.viewFunction),this.body.emitter.emit("_startRendering"))}}},{key:"_lockedRedraw",value:function(){var t={x:this.body.nodes[this.lockedOnNodeId].x,y:this.body.nodes[this.lockedOnNodeId].y},e=this.canvas.DOMtoCanvas({x:.5*this.canvas.frame.canvas.clientWidth,y:.5*this.canvas.frame.canvas.clientHeight}),i={x:e.x-t.x,y:e.y-t.y},o=this.body.view.translation,n={x:o.x+i.x*this.body.view.scale+this.lockedOnNodeOffset.x,y:o.y+i.y*this.body.view.scale+this.lockedOnNodeOffset.y};this.body.view.translation=n}},{key:"releaseNode",value:function(){void 0!==this.lockedOnNodeId&&void 0!==this.viewFunction&&(this.body.emitter.off("initRedraw",this.viewFunction),this.lockedOnNodeId=void 0,this.lockedOnNodeOffset=void 0)}},{key:"_transitionRedraw",value:function(){var t=arguments.length<=0||void 0===arguments[0]?!1:arguments[0];this.easingTime+=this.animationSpeed,this.easingTime=t===!0?1:this.easingTime;var e=h.easingFunctions[this.animationEasingFunction](this.easingTime);this.body.view.scale=this.sourceScale+(this.targetScale-this.sourceScale)*e,this.body.view.translation={x:this.sourceTranslation.x+(this.targetTranslation.x-this.sourceTranslation.x)*e,y:this.sourceTranslation.y+(this.targetTranslation.y-this.sourceTranslation.y)*e},this.easingTime>=1&&(this.body.emitter.off("initRedraw",this.viewFunction),this.easingTime=0,void 0!=this.lockedOnNodeId&&(this.viewFunction=this._lockedRedraw.bind(this),this.body.emitter.on("initRedraw",this.viewFunction)),this.body.emitter.emit("animationFinished"))}},{key:"getScale",value:function(){return this.body.view.scale}},{key:"getViewPosition",value:function(){return 
this.canvas.DOMtoCanvas({x:.5*this.canvas.frame.canvas.clientWidth,y:.5*this.canvas.frame.canvas.clientHeight})}}]),t}();e["default"]=d},function(t,e,i){function o(t){return t&&t.__esModule?t:{"default":t}}function n(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}Object.defineProperty(e,"__esModule",{value:!0});var s=function(){function t(t,e){for(var i=0;i50&&(this.drag.pointer=this.getPointer(t.center),this.drag.pinched=!1,this.pinch.scale=this.body.view.scale,this.touchTime=(new Date).valueOf())}},{key:"onTap",value:function(t){var e=this.getPointer(t.center),i=this.selectionHandler.options.multiselect&&(t.changedPointers[0].ctrlKey||t.changedPointers[0].metaKey);this.checkSelectionChanges(e,t,i),this.selectionHandler._generateClickEvent("click",t,e)}},{key:"onDoubleTap",value:function(t){var e=this.getPointer(t.center);this.selectionHandler._generateClickEvent("doubleClick",t,e)}},{key:"onHold",value:function(t){var e=this.getPointer(t.center),i=this.selectionHandler.options.multiselect;this.checkSelectionChanges(e,t,i),this.selectionHandler._generateClickEvent("click",t,e),this.selectionHandler._generateClickEvent("hold",t,e)}},{key:"onRelease",value:function(t){if((new Date).valueOf()-this.touchTime>10){var e=this.getPointer(t.center);this.selectionHandler._generateClickEvent("release",t,e),this.touchTime=(new Date).valueOf()}}},{key:"onContext",value:function(t){var e=this.getPointer({x:t.clientX,y:t.clientY});this.selectionHandler._generateClickEvent("oncontext",t,e)}},{key:"checkSelectionChanges",value:function(t,e){var i=arguments.length<=2||void 0===arguments[2]?!1:arguments[2],o=this.selectionHandler._getSelectedEdgeCount(),n=this.selectionHandler._getSelectedNodeCount(),s=this.selectionHandler.getSelection(),r=void 0;r=i===!0?this.selectionHandler.selectAdditionalOnPoint(t):this.selectionHandler.selectOnPoint(t);var a=this.selectionHandler._getSelectedEdgeCount(),h=this.selectionHandler._getSelectedNodeCount(),d=this.selectionHandler.getSelection(),l=this._determineIfDifferent(s,d),c=l.nodesChanged,u=l.edgesChanged,p=!1;h-n>0?(this.selectionHandler._generateClickEvent("selectNode",e,t),r=!0,p=!0):c===!0&&h>0?(this.selectionHandler._generateClickEvent("deselectNode",e,t,s),this.selectionHandler._generateClickEvent("selectNode",e,t),p=!0,r=!0):0>h-n&&(this.selectionHandler._generateClickEvent("deselectNode",e,t,s),r=!0),a-o>0&&p===!1?(this.selectionHandler._generateClickEvent("selectEdge",e,t),r=!0):a>0&&u===!0?(this.selectionHandler._generateClickEvent("deselectEdge",e,t,s),this.selectionHandler._generateClickEvent("selectEdge",e,t),r=!0):0>a-o&&(this.selectionHandler._generateClickEvent("deselectEdge",e,t,s),r=!0),r===!0&&this.selectionHandler._generateClickEvent("select",e,t)}},{key:"_determineIfDifferent",value:function(t,e){for(var i=!1,o=!1,n=0;nt&&(t=1e-5),t>10&&(t=10);var o=void 0;void 0!==this.drag&&this.drag.dragging===!0&&(o=this.canvas.DOMtoCanvas(this.drag.pointer));var n=this.body.view.translation,s=t/i,r=(1-s)*e.x+n.x*s,a=(1-s)*e.y+n.y*s;if(this.body.view.scale=t,this.body.view.translation={x:r,y:a},void 0!=o){var h=this.canvas.canvasToDOM(o);this.drag.pointer.x=h.x,this.drag.pointer.y=h.y}this.body.emitter.emit("_requestRedraw"),t>i?this.body.emitter.emit("zoom",{direction:"+",scale:this.body.view.scale}):this.body.emitter.emit("zoom",{direction:"-",scale:this.body.view.scale})}}},{key:"onMouseWheel",value:function(t){if(this.options.zoomView===!0){var e=0;if(t.wheelDelta?e=t.wheelDelta/120:t.detail&&(e=-t.detail/3),0!==e){var 
i=this.body.view.scale,o=e/10;0>e&&(o/=1-o),i*=1+o;var n=this.getPointer({x:t.clientX,y:t.clientY});this.zoom(i,n)}t.preventDefault()}}},{key:"onMouseMove",value:function(t){var e=this,i=this.getPointer({x:t.clientX,y:t.clientY}),o=!1;if(void 0!==this.popup&&(this.popup.hidden===!1&&this._checkHidePopup(i),this.popup.hidden===!1&&(o=!0,this.popup.setPosition(i.x+3,i.y-5),this.popup.show())),this.options.keyboard.bindToWindow===!1&&this.options.keyboard.enabled===!0&&this.canvas.frame.focus(),o===!1&&(void 0!==this.popupTimer&&(clearInterval(this.popupTimer),this.popupTimer=void 0),this.drag.dragging||(this.popupTimer=setTimeout(function(){return e._checkShowPopup(i)},this.options.tooltipDelay))),this.options.hover===!0){var n=this.selectionHandler.getNodeAt(i);void 0===n&&(n=this.selectionHandler.getEdgeAt(i)),this.selectionHandler.hoverObject(n)}}},{key:"_checkShowPopup",value:function(t){var e=this.canvas._XconvertDOMtoCanvas(t.x),i=this.canvas._YconvertDOMtoCanvas(t.y),o={left:e,top:i,right:e,bottom:i},n=void 0===this.popupObj?void 0:this.popupObj.id,s=!1,r="node";if(void 0===this.popupObj){for(var a=this.body.nodeIndices,h=this.body.nodes,l=void 0,c=[],u=0;u0&&(this.popupObj=h[c[c.length-1]],s=!0)}if(void 0===this.popupObj&&s===!1){for(var p=this.body.edgeIndices,f=this.body.edges,m=void 0,v=[],g=0;g0&&(this.popupObj=f[v[v.length-1]],r="edge")}void 0!==this.popupObj?this.popupObj.id!==n&&(void 0===this.popup&&(this.popup=new d["default"](this.canvas.frame)),this.popup.popupTargetType=r,this.popup.popupTargetId=this.popupObj.id,this.popup.setPosition(t.x+3,t.y-5),this.popup.setText(this.popupObj.getTitle()),this.popup.show(),this.body.emitter.emit("showPopup",this.popupObj.id)):void 0!==this.popup&&(this.popup.hide(),this.body.emitter.emit("hidePopup"))}},{key:"_checkHidePopup",value:function(t){var e=this.selectionHandler._pointerToPositionObject(t),i=!1;if("node"===this.popup.popupTargetType){if(void 0!==this.body.nodes[this.popup.popupTargetId]&&(i=this.body.nodes[this.popup.popupTargetId].isOverlappingWith(e),i===!0)){var o=this.selectionHandler.getNodeAt(t);i=o.id===this.popup.popupTargetId}}else void 0===this.selectionHandler.getNodeAt(t)&&void 0!==this.body.edges[this.popup.popupTargetId]&&(i=this.body.edges[this.popup.popupTargetId].isOverlappingWith(e));i===!1&&(this.popupObj=void 0,this.popup.hide(),this.body.emitter.emit("hidePopup"))}}]),t}();e["default"]=c},function(t,e,i){function o(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}Object.defineProperty(e,"__esModule",{value:!0});var n=function(){function t(t,e){for(var i=0;i700&&(this.body.emitter.emit("fit",{duration:700}),this.touchTime=(new Date).valueOf())}},{key:"_stopMovement",value:function(){for(var t in 
this.boundFunctions)this.boundFunctions.hasOwnProperty(t)&&(this.body.emitter.off("initRedraw",this.boundFunctions[t]),this.body.emitter.emit("_stopRendering"));this.boundFunctions={}}},{key:"_moveUp",value:function(){this.body.view.translation.y+=this.options.keyboard.speed.y}},{key:"_moveDown",value:function(){this.body.view.translation.y-=this.options.keyboard.speed.y}},{key:"_moveLeft",value:function(){this.body.view.translation.x+=this.options.keyboard.speed.x}},{key:"_moveRight",value:function(){this.body.view.translation.x-=this.options.keyboard.speed.x}},{key:"_zoomIn",value:function(){this.body.view.scale*=1+this.options.keyboard.speed.zoom,this.body.emitter.emit("zoom",{direction:"+",scale:this.body.view.scale})}},{key:"_zoomOut",value:function(){this.body.view.scale/=1+this.options.keyboard.speed.zoom,this.body.emitter.emit("zoom",{direction:"-",scale:this.body.view.scale})}},{key:"configureKeyboardBindings",value:function(){var t=this;void 0!==this.keycharm&&this.keycharm.destroy(),this.options.keyboard.enabled===!0&&(this.options.keyboard.bindToWindow===!0?this.keycharm=a({container:window,preventDefault:!0}):this.keycharm=a({container:this.canvas.frame,preventDefault:!0}),this.keycharm.reset(),this.activated===!0&&(this.keycharm.bind("up",function(){t.bindToRedraw("_moveUp")},"keydown"),this.keycharm.bind("down",function(){t.bindToRedraw("_moveDown")},"keydown"),this.keycharm.bind("left",function(){t.bindToRedraw("_moveLeft")},"keydown"),this.keycharm.bind("right",function(){t.bindToRedraw("_moveRight")},"keydown"),this.keycharm.bind("=",function(){t.bindToRedraw("_zoomIn")},"keydown"),this.keycharm.bind("num+",function(){t.bindToRedraw("_zoomIn")},"keydown"),this.keycharm.bind("num-",function(){t.bindToRedraw("_zoomOut")},"keydown"),this.keycharm.bind("-",function(){t.bindToRedraw("_zoomOut")},"keydown"),this.keycharm.bind("[",function(){t.bindToRedraw("_zoomOut")},"keydown"),this.keycharm.bind("]",function(){t.bindToRedraw("_zoomIn")},"keydown"),this.keycharm.bind("pageup",function(){t.bindToRedraw("_zoomIn")},"keydown"),this.keycharm.bind("pagedown",function(){t.bindToRedraw("_zoomOut")},"keydown"),this.keycharm.bind("up",function(){t.unbindFromRedraw("_moveUp")},"keyup"),this.keycharm.bind("down",function(){t.unbindFromRedraw("_moveDown")},"keyup"),this.keycharm.bind("left",function(){t.unbindFromRedraw("_moveLeft")},"keyup"),this.keycharm.bind("right",function(){t.unbindFromRedraw("_moveRight")},"keyup"),this.keycharm.bind("=",function(){t.unbindFromRedraw("_zoomIn")},"keyup"),this.keycharm.bind("num+",function(){t.unbindFromRedraw("_zoomIn")},"keyup"),this.keycharm.bind("num-",function(){t.unbindFromRedraw("_zoomOut")},"keyup"),this.keycharm.bind("-",function(){t.unbindFromRedraw("_zoomOut")},"keyup"),this.keycharm.bind("[",function(){t.unbindFromRedraw("_zoomOut")},"keyup"),this.keycharm.bind("]",function(){t.unbindFromRedraw("_zoomIn")},"keyup"),this.keycharm.bind("pageup",function(){t.unbindFromRedraw("_zoomIn")},"keyup"),this.keycharm.bind("pagedown",function(){t.unbindFromRedraw("_zoomOut")},"keyup")))}}]),t}();e["default"]=h},function(t,e){function i(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}Object.defineProperty(e,"__esModule",{value:!0});var o=function(){function t(t,e){for(var i=0;io&&(s=o-e-this.padding),sn&&(r=n-i-this.padding),r0?e===!0?this.body.nodes[o[o.length-1]]:o[o.length-1]:void 0}},{key:"_getEdgesOverlappingWith",value:function(t,e){for(var 
i=this.body.edges,o=0;o0?e===!0?this.body.edges[o[o.length-1]]:o[o.length-1]:void 0}},{key:"_addToSelection",value:function(t){t instanceof a["default"]?this.selectionObj.nodes[t.id]=t:this.selectionObj.edges[t.id]=t}},{key:"_addToHover",value:function(t){t instanceof a["default"]?this.hoverObj.nodes[t.id]=t:this.hoverObj.edges[t.id]=t}},{key:"_removeFromSelection",value:function(t){t instanceof a["default"]?(delete this.selectionObj.nodes[t.id],this._unselectConnectedEdges(t)):delete this.selectionObj.edges[t.id]}},{key:"unselectAll",value:function(){for(var t in this.selectionObj.nodes)this.selectionObj.nodes.hasOwnProperty(t)&&this.selectionObj.nodes[t].unselect();for(var e in this.selectionObj.edges)this.selectionObj.edges.hasOwnProperty(e)&&this.selectionObj.edges[e].unselect();this.selectionObj={nodes:{},edges:{}}}},{key:"_getSelectedNodeCount",value:function(){var t=0;for(var e in this.selectionObj.nodes)this.selectionObj.nodes.hasOwnProperty(e)&&(t+=1);return t}},{key:"_getSelectedNode",value:function(){for(var t in this.selectionObj.nodes)if(this.selectionObj.nodes.hasOwnProperty(t))return this.selectionObj.nodes[t]}},{key:"_getSelectedEdge",value:function(){for(var t in this.selectionObj.edges)if(this.selectionObj.edges.hasOwnProperty(t))return this.selectionObj.edges[t]}},{key:"_getSelectedEdgeCount",value:function(){var t=0;for(var e in this.selectionObj.edges)this.selectionObj.edges.hasOwnProperty(e)&&(t+=1);return t}},{key:"_getSelectedObjectCount",value:function(){var t=0;for(var e in this.selectionObj.nodes)this.selectionObj.nodes.hasOwnProperty(e)&&(t+=1);for(var i in this.selectionObj.edges)this.selectionObj.edges.hasOwnProperty(i)&&(t+=1);return t}},{key:"_selectionIsEmpty",value:function(){for(var t in this.selectionObj.nodes)if(this.selectionObj.nodes.hasOwnProperty(t))return!1;for(var e in this.selectionObj.edges)if(this.selectionObj.edges.hasOwnProperty(e))return!1;return!0}},{key:"_clusterInSelection",value:function(){for(var t in this.selectionObj.nodes)if(this.selectionObj.nodes.hasOwnProperty(t)&&this.selectionObj.nodes[t].clusterSize>1)return!0;return!1}},{key:"_selectConnectedEdges",value:function(t){for(var e=0;e0&&(this.options.hierarchical.levelSeparation*=-1):this.options.hierarchical.levelSeparation<0&&(this.options.hierarchical.levelSeparation*=-1),this.body.emitter.emit("_resetHierarchicalLayout"),this.adaptAllOptionsForHierarchicalLayout(e);if(i===!0)return this.body.emitter.emit("refresh"),l.deepExtend(e,this.optionsBackup)}return e}},{key:"adaptAllOptionsForHierarchicalLayout",value:function(t){if(this.options.hierarchical.enabled===!0){void 0===t.physics||t.physics===!0?(t.physics={enabled:void 0===this.optionsBackup.physics.enabled?!0:this.optionsBackup.physics.enabled,solver:"hierarchicalRepulsion"},this.optionsBackup.physics.enabled=void 0===this.optionsBackup.physics.enabled?!0:this.optionsBackup.physics.enabled,this.optionsBackup.physics.solver=this.optionsBackup.physics.solver||"barnesHut"):"object"===r(t.physics)?(this.optionsBackup.physics.enabled=void 0===t.physics.enabled?!0:t.physics.enabled,this.optionsBackup.physics.solver=t.physics.solver||"barnesHut",t.physics.solver="hierarchicalRepulsion"):t.physics!==!1&&(this.optionsBackup.physics.solver="barnesHut",t.physics={solver:"hierarchicalRepulsion"});var e="horizontal";"RL"!==this.options.hierarchical.direction&&"LR"!==this.options.hierarchical.direction||(e="vertical"),void 0===t.edges?(this.optionsBackup.edges={smooth:{enabled:!0,type:"dynamic"}},t.edges={smooth:!1}):void 
0===t.edges.smooth?(this.optionsBackup.edges={smooth:{enabled:!0,type:"dynamic"}},t.edges.smooth=!1):"boolean"==typeof t.edges.smooth?(this.optionsBackup.edges={smooth:t.edges.smooth},t.edges.smooth={enabled:t.edges.smooth,type:e}):(void 0!==t.edges.smooth.type&&"dynamic"!==t.edges.smooth.type&&(e=t.edges.smooth.type),this.optionsBackup.edges={smooth:void 0===t.edges.smooth.enabled?!0:t.edges.smooth.enabled,type:void 0===t.edges.smooth.type?"dynamic":t.edges.smooth.type,roundness:void 0===t.edges.smooth.roundness?.5:t.edges.smooth.roundness,forceDirection:void 0===t.edges.smooth.forceDirection?!1:t.edges.smooth.forceDirection},t.edges.smooth={enabled:void 0===t.edges.smooth.enabled?!0:t.edges.smooth.enabled,type:e,roundness:void 0===t.edges.smooth.roundness?.5:t.edges.smooth.roundness,forceDirection:void 0===t.edges.smooth.forceDirection?!1:t.edges.smooth.forceDirection}),this.body.emitter.emit("_forceDisableDynamicCurves",e)}return t}},{key:"seededRandom",value:function(){var t=1e4*Math.sin(this.randomSeed++);return t-Math.floor(t)}},{key:"positionInitially",value:function(t){if(this.options.hierarchical.enabled!==!0){this.randomSeed=this.initialRandomSeed;for(var e=0;es){for(var r=this.body.nodeIndices.length;this.body.nodeIndices.length>s;){n+=1;var a=this.body.nodeIndices.length;n%3===0?this.body.modules.clustering.clusterBridges():this.body.modules.clustering.clusterOutliers();var h=this.body.nodeIndices.length;if(a==h&&n%3!==0||n>o)return this._declusterAll(),this.body.emitter.emit("_layoutFailed"),void console.info("This network could not be positioned by this version of the improved layout algorithm. Please disable improvedLayout for better performance.")}this.body.modules.kamadaKawai.setOptions({springLength:Math.max(150,2*r)})}this.body.modules.kamadaKawai.solve(this.body.nodeIndices,this.body.edgeIndices,!0),this._shiftToCenter();for(var d=70,l=0;l0){var t=void 0,e=void 0,i=!1,o=!0,n=!1;this.hierarchicalLevels={},this.lastNodeOnLevel={},this.hierarchicalChildrenReference={},this.hierarchicalParentReference={},this.hierarchicalTrees={},this.treeIndex=-1,this.distributionOrdering={},this.distributionIndex={},this.distributionOrderingPresence={};for(e in this.body.nodes)this.body.nodes.hasOwnProperty(e)&&(t=this.body.nodes[e],void 0===t.options.x&&void 0===t.options.y&&(o=!1),void 0!==t.options.level?(i=!0,this.hierarchicalLevels[e]=t.options.level):n=!0);if(n===!0&&i===!0)throw new Error("To use the hierarchical layout, nodes require either no predefined levels or levels have to be defined for all nodes.");n===!0&&("hubsize"===this.options.hierarchical.sortMethod?this._determineLevelsByHubsize():"directed"===this.options.hierarchical.sortMethod?this._determineLevelsDirected():"custom"===this.options.hierarchical.sortMethod&&this._determineLevelsCustomCallback());for(var s in this.body.nodes)this.body.nodes.hasOwnProperty(s)&&void 0===this.hierarchicalLevels[s]&&(this.hierarchicalLevels[s]=0);var r=this._getDistribution();this._generateMap(),this._placeNodesByHierarchy(r),this._condenseHierarchy(),this._shiftToCenter()}}},{key:"_condenseHierarchy",value:function(){var t=this,e=!1,i={},o=function(){for(var e=a(),i=0;i0)for(var n=0;n=l&&(r=Math.min(c,r),a=Math.max(c,a))}return[r,a,o,n]},l=function _(e){var i=t.hierarchicalLevels[e];if(t.hierarchicalChildrenReference[e]){var o=t.hierarchicalChildrenReference[e];if(o.length>0)for(var n=0;n1)for(var a=0;at.options.hierarchical.nodeSpacing){var u={};u[i.id]=!0;var p={};p[o.id]=!0,h(i,u),h(o,p);var 
f=c(i,o),m=d(u,f),v=s(m,4),g=(v[0],v[1]),y=(v[2],v[3],d(p,f)),b=s(y,4),w=b[0],_=(b[1],b[2]),x=(b[3],Math.abs(g-w));if(x>t.options.hierarchical.nodeSpacing){var k=g-w+t.options.hierarchical.nodeSpacing;k<-_+t.options.hierarchical.nodeSpacing&&(k=-_+t.options.hierarchical.nodeSpacing),0>k&&(t._shiftBlock(o.id,k),e=!0,n===!0&&t._centerParent(o))}}},m=function(o,n){for(var r=n.id,a=n.edges,l=t.hierarchicalLevels[n.id],c=t.options.hierarchical.levelSeparation*t.options.hierarchical.levelSeparation,u={},p=[],f=0;fr;r++){var a=g(o,i),h=y(o,i),d=40,l=Math.max(-d,Math.min(d,Math.round(a/h)));if(o-=l,void 0!==s[o])break;s[o]=r}return o},w=function(o){var r=t._getPositionForHierarchy(n);if(void 0===i[n.id]){var a={};a[n.id]=!0,h(n,a),i[n.id]=a}var l=d(i[n.id]),c=s(l,4),u=(c[0],c[1],c[2]),p=c[3],f=o-r,m=0;f>0?m=Math.min(f,p-t.options.hierarchical.nodeSpacing):0>f&&(m=-Math.min(-f,u-t.options.hierarchical.nodeSpacing)),0!=m&&(t._shiftBlock(n.id,m),e=!0)},_=function(i){var o=t._getPositionForHierarchy(n),r=t._getSpaceAroundNode(n),a=s(r,2),h=a[0],d=a[1],l=i-o,c=o;l>0?c=Math.min(o+(d-t.options.hierarchical.nodeSpacing),i):0>l&&(c=Math.max(o-(h-t.options.hierarchical.nodeSpacing),i)),c!==o&&(t._setPositionForHierarchy(n,c,void 0,!0),e=!0)},x=b(o,p);w(x),x=b(o,a),_(x)},v=function(i){var o=Object.keys(t.distributionOrdering);o=o.reverse();for(var n=0;i>n;n++){e=!1;for(var s=0;sn&&(e=!1,p(f,o,!0),e===!0);n++);},y=function(){for(var e in t.body.nodes)t.body.nodes.hasOwnProperty(e)&&t._centerParent(t.body.nodes[e])},b=function(){var e=Object.keys(t.distributionOrdering);e=e.reverse();for(var i=0;i0)for(var d=0;dg&&Math.abs(g)0&&Math.abs(g)0&&(r=this._getPositionForHierarchy(i[n-1])+this.options.hierarchical.nodeSpacing),this._setPositionForHierarchy(s,r,e),this._validataPositionAndContinue(s,e,r),o++}}}}},{key:"_placeBranchNodes",value:function(t,e){if(void 0!==this.hierarchicalChildrenReference[t]){for(var i=[],o=0;oe&&void 0===this.positionedNodes[s.id]))return;var a=void 0;a=0===n?this._getPositionForHierarchy(this.body.nodes[t]):this._getPositionForHierarchy(i[n-1])+this.options.hierarchical.nodeSpacing,this._setPositionForHierarchy(s,a,r),this._validataPositionAndContinue(s,r,a)}for(var h=1e9,d=-1e9,l=0;l0&&(e=this._getHubSize(),0!==e);)for(var o in this.body.nodes)if(this.body.nodes.hasOwnProperty(o)){var n=this.body.nodes[o];n.edges.length===e&&this._crawlNetwork(i,o)}}},{key:"_determineLevelsCustomCallback",value:function(){var t=this,e=1e5,i=function(t,e,i){},o=function(o,n,s){var r=t.hierarchicalLevels[o.id];void 0===r&&(t.hierarchicalLevels[o.id]=e);var a=i(d["default"].cloneOptions(o,"node"),d["default"].cloneOptions(n,"node"),d["default"].cloneOptions(s,"edge"));t.hierarchicalLevels[n.id]=t.hierarchicalLevels[o.id]+a};this._crawlNetwork(o),this._setMinLevelToZero()}},{key:"_determineLevelsDirected",value:function(){var t=this,e=1e4,i=function(i,o,n){var s=t.hierarchicalLevels[i.id];void 0===s&&(t.hierarchicalLevels[i.id]=e),n.toId==o.id?t.hierarchicalLevels[o.id]=t.hierarchicalLevels[i.id]+1:t.hierarchicalLevels[o.id]=t.hierarchicalLevels[i.id]-1};this._crawlNetwork(i),this._setMinLevelToZero()}},{key:"_setMinLevelToZero",value:function(){var t=1e9;for(var e in this.body.nodes)this.body.nodes.hasOwnProperty(e)&&void 0!==this.hierarchicalLevels[e]&&(t=Math.min(this.hierarchicalLevels[e],t));for(var i in this.body.nodes)this.body.nodes.hasOwnProperty(i)&&void 0!==this.hierarchicalLevels[i]&&(this.hierarchicalLevels[i]-=t)}},{key:"_generateMap",value:function(){var 
t=this,e=function(e,i){if(t.hierarchicalLevels[i.id]>t.hierarchicalLevels[e.id]){var o=e.id,n=i.id;void 0===t.hierarchicalChildrenReference[o]&&(t.hierarchicalChildrenReference[o]=[]),t.hierarchicalChildrenReference[o].push(n),void 0===t.hierarchicalParentReference[n]&&(t.hierarchicalParentReference[n]=[]),t.hierarchicalParentReference[n].push(o)}};this._crawlNetwork(e)}},{key:"_crawlNetwork",value:function(){var t=this,e=arguments.length<=0||void 0===arguments[0]?function(){}:arguments[0],i=arguments[1],o={},n=0,s=function d(i,n){if(void 0===o[i.id]){void 0===t.hierarchicalTrees[i.id]&&(t.hierarchicalTrees[i.id]=n,t.treeIndex=Math.max(n,t.treeIndex)),o[i.id]=!0;for(var s=void 0,r=0;r1&&("UD"===this.options.hierarchical.direction||"DU"===this.options.hierarchical.direction?t.sort(function(t,e){return t.x-e.x}):t.sort(function(t,e){return t.y-e.y}))}}]),t}();e["default"]=c},function(t,e,i){function o(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}Object.defineProperty(e,"__esModule",{value:!0});var n=function(){function t(t,e){for(var i=0;i0&&this.options.deleteNode!==!1?(n===!0&&this._createSeperator(4),this._createDeleteButton(o)):0===t&&this.options.deleteEdge!==!1&&(n===!0&&this._createSeperator(4),this._createDeleteButton(o))),this._bindHammerToDiv(this.closeDiv,this.toggleEditMode.bind(this)),this._temporaryBindEvent("select",this.showManipulatorToolbar.bind(this))}this.body.emitter.emit("_redraw")}},{key:"addNodeMode",value:function(){if(this.editMode!==!0&&this.enableEditMode(),this._clean(),this.inMode="addNode",this.guiEnabled===!0){var t=this.options.locales[this.options.locale];this.manipulationDOM={},this._createBackButton(t),this._createSeperator(),this._createDescription(t.addDescription||this.options.locales.en.addDescription),this._bindHammerToDiv(this.closeDiv,this.toggleEditMode.bind(this))}this._temporaryBindEvent("click",this._performAddNode.bind(this))}},{key:"editNode",value:function(){var t=this;this.editMode!==!0&&this.enableEditMode(),this._clean();var e=this.selectionHandler._getSelectedNode();if(void 0!==e){if(this.inMode="editNode","function"!=typeof this.options.editNode)throw new Error("No function has been configured to handle the editing of nodes.");if(e.isCluster!==!0){var i=s.deepExtend({},e.options,!1); -if(i.x=e.x,i.y=e.y,2!==this.options.editNode.length)throw new Error("The function for edit does not support two arguments (data, callback)");this.options.editNode(i,function(e){null!==e&&void 0!==e&&"editNode"===t.inMode&&t.body.data.nodes.getDataSet().update(e),t.showManipulatorToolbar()})}else alert(this.options.locales[this.options.locale].editClusterError||this.options.locales.en.editClusterError)}else this.showManipulatorToolbar()}},{key:"addEdgeMode",value:function(){if(this.editMode!==!0&&this.enableEditMode(),this._clean(),this.inMode="addEdge",this.guiEnabled===!0){var 
t=this.options.locales[this.options.locale];this.manipulationDOM={},this._createBackButton(t),this._createSeperator(),this._createDescription(t.edgeDescription||this.options.locales.en.edgeDescription),this._bindHammerToDiv(this.closeDiv,this.toggleEditMode.bind(this))}this._temporaryBindUI("onTouch",this._handleConnect.bind(this)),this._temporaryBindUI("onDragEnd",this._finishConnect.bind(this)),this._temporaryBindUI("onDrag",this._dragControlNode.bind(this)),this._temporaryBindUI("onRelease",this._finishConnect.bind(this)),this._temporaryBindUI("onDragStart",function(){}),this._temporaryBindUI("onHold",function(){})}},{key:"editEdgeMode",value:function(){var t=this;if(this.editMode!==!0&&this.enableEditMode(),this._clean(),this.inMode="editEdge",this.guiEnabled===!0){var e=this.options.locales[this.options.locale];this.manipulationDOM={},this._createBackButton(e),this._createSeperator(),this._createDescription(e.editEdgeDescription||this.options.locales.en.editEdgeDescription),this._bindHammerToDiv(this.closeDiv,this.toggleEditMode.bind(this))}this.edgeBeingEditedId=this.selectionHandler.getSelectedEdges()[0],void 0!==this.edgeBeingEditedId?!function(){var e=t.body.edges[t.edgeBeingEditedId],i=t._getNewTargetNode(e.from.x,e.from.y),o=t._getNewTargetNode(e.to.x,e.to.y);t.temporaryIds.nodes.push(i.id),t.temporaryIds.nodes.push(o.id),t.body.nodes[i.id]=i,t.body.nodeIndices.push(i.id),t.body.nodes[o.id]=o,t.body.nodeIndices.push(o.id),t._temporaryBindUI("onTouch",t._controlNodeTouch.bind(t)),t._temporaryBindUI("onTap",function(){}),t._temporaryBindUI("onHold",function(){}),t._temporaryBindUI("onDragStart",t._controlNodeDragStart.bind(t)),t._temporaryBindUI("onDrag",t._controlNodeDrag.bind(t)),t._temporaryBindUI("onDragEnd",t._controlNodeDragEnd.bind(t)),t._temporaryBindUI("onMouseMove",function(){}),t._temporaryBindEvent("beforeDrawing",function(t){var n=e.edgeType.findBorderPositions(t);i.selected===!1&&(i.x=n.from.x,i.y=n.from.y),o.selected===!1&&(o.x=n.to.x,o.y=n.to.y)}),t.body.emitter.emit("_redraw")}():this.showManipulatorToolbar()}},{key:"deleteSelected",value:function(){var t=this;this.editMode!==!0&&this.enableEditMode(),this._clean(),this.inMode="delete";var e=this.selectionHandler.getSelectedNodes(),i=this.selectionHandler.getSelectedEdges(),o=void 0;if(e.length>0){for(var n=0;n0&&"function"==typeof this.options.deleteEdge&&(o=this.options.deleteEdge);if("function"==typeof o){var s={nodes:e,edges:i};if(2!==o.length)throw new Error("The function for delete does not support two arguments (data, callback)");o(s,function(e){null!==e&&void 0!==e&&"delete"===t.inMode?(t.body.data.edges.getDataSet().remove(e.edges),t.body.data.nodes.getDataSet().remove(e.nodes),t.body.emitter.emit("startSimulation"),t.showManipulatorToolbar()):(t.body.emitter.emit("startSimulation"),t.showManipulatorToolbar())})}else this.body.data.edges.getDataSet().remove(i),this.body.data.nodes.getDataSet().remove(e),this.body.emitter.emit("startSimulation"),this.showManipulatorToolbar()}},{key:"_setup",value:function(){this.options.enabled===!0?(this.guiEnabled=!0,this._createWrappers(),this.editMode===!1?this._createEditButton():this.showManipulatorToolbar()):(this._removeManipulationDOM(),this.guiEnabled=!1)}},{key:"_createWrappers",value:function(){void 
0===this.manipulationDiv&&(this.manipulationDiv=document.createElement("div"),this.manipulationDiv.className="vis-manipulation",this.editMode===!0?this.manipulationDiv.style.display="block":this.manipulationDiv.style.display="none",this.canvas.frame.appendChild(this.manipulationDiv)),void 0===this.editModeDiv&&(this.editModeDiv=document.createElement("div"),this.editModeDiv.className="vis-edit-mode",this.editMode===!0?this.editModeDiv.style.display="none":this.editModeDiv.style.display="block",this.canvas.frame.appendChild(this.editModeDiv)),void 0===this.closeDiv&&(this.closeDiv=document.createElement("div"),this.closeDiv.className="vis-close",this.closeDiv.style.display=this.manipulationDiv.style.display,this.canvas.frame.appendChild(this.closeDiv))}},{key:"_getNewTargetNode",value:function(t,e){var i=s.deepExtend({},this.options.controlNodeStyle);i.id="targetNode"+s.randomUUID(),i.hidden=!1,i.physics=!1,i.x=t,i.y=e;var o=this.body.functions.createNode(i);return o.shape.boundingBox={left:t,right:t,top:e,bottom:e},o}},{key:"_createEditButton",value:function(){this._clean(),this.manipulationDOM={},s.recursiveDOMDelete(this.editModeDiv);var t=this.options.locales[this.options.locale],e=this._createButton("editMode","vis-button vis-edit vis-edit-mode",t.edit||this.options.locales.en.edit);this.editModeDiv.appendChild(e),this._bindHammerToDiv(e,this.toggleEditMode.bind(this))}},{key:"_clean",value:function(){this.inMode=!1,this.guiEnabled===!0&&(s.recursiveDOMDelete(this.editModeDiv),s.recursiveDOMDelete(this.manipulationDiv),this._cleanManipulatorHammers()),this._cleanupTemporaryNodesAndEdges(),this._unbindTemporaryUIs(),this._unbindTemporaryEvents(),this.body.emitter.emit("restorePhysics")}},{key:"_cleanManipulatorHammers",value:function(){if(0!=this.manipulationHammers.length){for(var t=0;t=0;r--)if(n[r]!==this.selectedControlNode.id){s=this.body.nodes[n[r]];break}if(void 0!==s&&void 0!==this.selectedControlNode)if(s.isCluster===!0)alert(this.options.locales[this.options.locale].createEdgeError||this.options.locales.en.createEdgeError);else{var a=this.body.nodes[this.temporaryIds.nodes[0]];this.selectedControlNode.id===a.id?this._performEditEdge(s.id,o.to.id):this._performEditEdge(o.from.id,s.id)}else o.updateEdgeType(),this.body.emitter.emit("restorePhysics");this.body.emitter.emit("_redraw")}}},{key:"_handleConnect",value:function(t){if((new Date).valueOf()-this.touchTime>100){this.lastTouch=this.body.functions.getPointer(t.center),this.lastTouch.translation=s.extend({},this.body.view.translation);var e=this.lastTouch,i=this.selectionHandler.getNodeAt(e);if(void 0!==i)if(i.isCluster===!0)alert(this.options.locales[this.options.locale].createEdgeError||this.options.locales.en.createEdgeError);else{var o=this._getNewTargetNode(i.x,i.y);this.body.nodes[o.id]=o,this.body.nodeIndices.push(o.id);var n=this.body.functions.createEdge({id:"connectionEdge"+s.randomUUID(),from:i.id,to:o.id,physics:!1,smooth:{enabled:!0,type:"continuous",roundness:.5}});this.body.edges[n.id]=n,this.body.edgeIndices.push(n.id),this.temporaryIds.nodes.push(o.id),this.temporaryIds.edges.push(n.id)}this.touchTime=(new Date).valueOf()}}},{key:"_dragControlNode",value:function(t){var e=this.body.functions.getPointer(t.center);if(void 0!==this.temporaryIds.nodes[0]){var i=this.body.nodes[this.temporaryIds.nodes[0]];i.x=this.canvas._XconvertDOMtoCanvas(e.x),i.y=this.canvas._YconvertDOMtoCanvas(e.y),this.body.emitter.emit("_redraw")}else{var 
o=e.x-this.lastTouch.x,n=e.y-this.lastTouch.y;this.body.view.translation={x:this.lastTouch.translation.x+o,y:this.lastTouch.translation.y+n}}}},{key:"_finishConnect",value:function(t){var e=this.body.functions.getPointer(t.center),i=this.selectionHandler._pointerToPositionObject(e),o=void 0;void 0!==this.temporaryIds.edges[0]&&(o=this.body.edges[this.temporaryIds.edges[0]].fromId);for(var n=this.selectionHandler._getAllNodesOverlappingWith(i),s=void 0,r=n.length-1;r>=0;r--)if(-1===this.temporaryIds.nodes.indexOf(n[r])){s=this.body.nodes[n[r]];break}this._cleanupTemporaryNodesAndEdges(),void 0!==s&&(s.isCluster===!0?alert(this.options.locales[this.options.locale].createEdgeError||this.options.locales.en.createEdgeError):void 0!==this.body.nodes[o]&&void 0!==this.body.nodes[s.id]&&this._performAddEdge(o,s.id)),this.body.emitter.emit("_redraw")}},{key:"_performAddNode",value:function(t){var e=this,i={id:s.randomUUID(),x:t.pointer.canvas.x,y:t.pointer.canvas.y,label:"new"};if("function"==typeof this.options.addNode){if(2!==this.options.addNode.length)throw new Error("The function for add does not support two arguments (data,callback)");this.options.addNode(i,function(t){null!==t&&void 0!==t&&"addNode"===e.inMode&&(e.body.data.nodes.getDataSet().add(t),e.showManipulatorToolbar())})}else this.body.data.nodes.getDataSet().add(i),this.showManipulatorToolbar()}},{key:"_performAddEdge",value:function(t,e){var i=this,o={from:t,to:e};if("function"==typeof this.options.addEdge){if(2!==this.options.addEdge.length)throw new Error("The function for connect does not support two arguments (data,callback)");this.options.addEdge(o,function(t){null!==t&&void 0!==t&&"addEdge"===i.inMode&&(i.body.data.edges.getDataSet().add(t),i.selectionHandler.unselectAll(),i.showManipulatorToolbar())})}else this.body.data.edges.getDataSet().add(o),this.selectionHandler.unselectAll(),this.showManipulatorToolbar()}},{key:"_performEditEdge",value:function(t,e){var i=this,o={id:this.edgeBeingEditedId,from:t,to:e};if("function"==typeof this.options.editEdge){if(2!==this.options.editEdge.length)throw new Error("The function for edit does not support two arguments (data, callback)");this.options.editEdge(o,function(t){null===t||void 0===t||"editEdge"!==i.inMode?(i.body.edges[o.id].updateEdgeType(),i.body.emitter.emit("_redraw")):(i.body.data.edges.getDataSet().update(t),i.selectionHandler.unselectAll(),i.showManipulatorToolbar())})}else this.body.data.edges.getDataSet().update(o),this.selectionHandler.unselectAll(),this.showManipulatorToolbar()}}]),t}();e["default"]=h},function(t,e){Object.defineProperty(e,"__esModule",{value:!0});var 
i="string",o="boolean",n="number",s="array",r="object",a="dom",h="any",d={configure:{enabled:{"boolean":o},filter:{"boolean":o,string:i,array:s,"function":"function"},container:{dom:a},showButton:{"boolean":o},__type__:{object:r,"boolean":o,string:i,array:s,"function":"function"}},edges:{arrows:{to:{enabled:{"boolean":o},scaleFactor:{number:n},__type__:{object:r,"boolean":o}},middle:{enabled:{"boolean":o},scaleFactor:{number:n},__type__:{object:r,"boolean":o}},from:{enabled:{"boolean":o},scaleFactor:{number:n},__type__:{object:r,"boolean":o}},__type__:{string:["from","to","middle"],object:r}},arrowStrikethrough:{"boolean":o},color:{color:{string:i},highlight:{string:i},hover:{string:i},inherit:{string:["from","to","both"],"boolean":o},opacity:{number:n},__type__:{object:r,string:i}},dashes:{"boolean":o,array:s},font:{color:{string:i},size:{number:n},face:{string:i},background:{string:i},strokeWidth:{number:n},strokeColor:{string:i},align:{string:["horizontal","top","middle","bottom"]},__type__:{object:r,string:i}},hidden:{"boolean":o},hoverWidth:{"function":"function",number:n},label:{string:i,undefined:"undefined"},labelHighlightBold:{"boolean":o},length:{number:n,undefined:"undefined"},physics:{"boolean":o},scaling:{min:{number:n},max:{number:n},label:{enabled:{"boolean":o},min:{number:n},max:{number:n},maxVisible:{number:n},drawThreshold:{number:n},__type__:{object:r,"boolean":o}},customScalingFunction:{"function":"function"},__type__:{object:r}},selectionWidth:{"function":"function",number:n},selfReferenceSize:{number:n},shadow:{enabled:{"boolean":o},color:{string:i},size:{number:n},x:{number:n},y:{number:n},__type__:{object:r,"boolean":o}},smooth:{enabled:{"boolean":o},type:{string:["dynamic","continuous","discrete","diagonalCross","straightCross","horizontal","vertical","curvedCW","curvedCCW","cubicBezier"]},roundness:{number:n},forceDirection:{string:["horizontal","vertical","none"],"boolean":o},__type__:{object:r,"boolean":o}},title:{string:i,undefined:"undefined"},width:{number:n},value:{number:n,undefined:"undefined"},__type__:{object:r}},groups:{useDefaultGroups:{"boolean":o},__any__:"get from nodes, will be overwritten below",__type__:{object:r}},interaction:{dragNodes:{"boolean":o},dragView:{"boolean":o},hideEdgesOnDrag:{"boolean":o},hideNodesOnDrag:{"boolean":o},hover:{"boolean":o},keyboard:{enabled:{"boolean":o},speed:{x:{number:n},y:{number:n},zoom:{number:n},__type__:{object:r}},bindToWindow:{"boolean":o},__type__:{object:r,"boolean":o}},multiselect:{"boolean":o},navigationButtons:{"boolean":o},selectable:{"boolean":o},selectConnectedEdges:{"boolean":o},hoverConnectedEdges:{"boolean":o},tooltipDelay:{number:n},zoomView:{"boolean":o},__type__:{object:r}},layout:{randomSeed:{undefined:"undefined",number:n},improvedLayout:{"boolean":o},hierarchical:{enabled:{"boolean":o},levelSeparation:{number:n},nodeSpacing:{number:n},treeSpacing:{number:n},blockShifting:{"boolean":o},edgeMinimization:{"boolean":o},parentCentralization:{"boolean":o},direction:{string:["UD","DU","LR","RL"]},sortMethod:{string:["hubsize","directed"]},__type__:{object:r,"boolean":o}},__type__:{object:r}},manipulation:{enabled:{"boolean":o},initiallyActive:{"boolean":o},addNode:{"boolean":o,"function":"function"},addEdge:{"boolean":o,"function":"function"},editNode:{"function":"function"},editEdge:{"boolean":o,"function":"function"},deleteNode:{"boolean":o,"function":"function"},deleteEdge:{"boolean":o,"function":"function"},controlNodeStyle:"get from nodes, will be overwritten 
below",__type__:{object:r,"boolean":o}},nodes:{borderWidth:{number:n},borderWidthSelected:{number:n,undefined:"undefined"},brokenImage:{string:i,undefined:"undefined"},color:{border:{string:i},background:{string:i},highlight:{border:{string:i},background:{string:i},__type__:{object:r,string:i}},hover:{border:{string:i},background:{string:i},__type__:{object:r,string:i}},__type__:{object:r,string:i}},fixed:{x:{"boolean":o},y:{"boolean":o},__type__:{object:r,"boolean":o}},font:{align:{string:i},color:{string:i},size:{number:n},face:{string:i},background:{string:i},strokeWidth:{number:n},strokeColor:{string:i},__type__:{object:r,string:i}},group:{string:i,number:n,undefined:"undefined"},hidden:{"boolean":o},icon:{face:{string:i},code:{string:i},size:{number:n},color:{string:i},__type__:{object:r}},id:{string:i,number:n},image:{string:i,undefined:"undefined"},label:{string:i,undefined:"undefined"},labelHighlightBold:{"boolean":o},level:{number:n,undefined:"undefined"},mass:{number:n},physics:{"boolean":o},scaling:{min:{number:n},max:{number:n},label:{enabled:{"boolean":o},min:{number:n},max:{number:n},maxVisible:{number:n},drawThreshold:{number:n},__type__:{object:r,"boolean":o}},customScalingFunction:{"function":"function"},__type__:{object:r}},shadow:{enabled:{"boolean":o},color:{string:i},size:{number:n},x:{number:n},y:{number:n},__type__:{object:r,"boolean":o}},shape:{string:["ellipse","circle","database","box","text","image","circularImage","diamond","dot","star","triangle","triangleDown","square","icon"]},shapeProperties:{borderDashes:{"boolean":o,array:s},borderRadius:{number:n},interpolation:{"boolean":o},useImageSize:{"boolean":o},useBorderWithImage:{"boolean":o},__type__:{object:r}},size:{number:n},title:{string:i,undefined:"undefined"},value:{number:n,undefined:"undefined"},x:{number:n},y:{number:n},__type__:{object:r}},physics:{enabled:{"boolean":o},barnesHut:{gravitationalConstant:{number:n},centralGravity:{number:n},springLength:{number:n},springConstant:{number:n},damping:{number:n},avoidOverlap:{number:n},__type__:{object:r}},forceAtlas2Based:{gravitationalConstant:{number:n},centralGravity:{number:n},springLength:{number:n},springConstant:{number:n},damping:{number:n},avoidOverlap:{number:n},__type__:{object:r}},repulsion:{centralGravity:{number:n},springLength:{number:n},springConstant:{number:n},nodeDistance:{number:n},damping:{number:n},__type__:{object:r}},hierarchicalRepulsion:{centralGravity:{number:n},springLength:{number:n},springConstant:{number:n},nodeDistance:{number:n},damping:{number:n},__type__:{object:r}},maxVelocity:{number:n},minVelocity:{number:n},solver:{string:["barnesHut","repulsion","hierarchicalRepulsion","forceAtlas2Based"]},stabilization:{enabled:{"boolean":o},iterations:{number:n},updateInterval:{number:n},onlyDynamicEdges:{"boolean":o},fit:{"boolean":o},__type__:{object:r,"boolean":o}},timestep:{number:n},adaptiveTimestep:{"boolean":o},__type__:{object:r,"boolean":o}},autoResize:{"boolean":o},clickToUse:{"boolean":o},locale:{string:i},locales:{__any__:{any:h},__type__:{object:r}},height:{string:i},width:{string:i},__type__:{object:r}};d.groups.__any__=d.nodes,d.manipulation.controlNodeStyle=d.nodes;var 
v(t){var e=null;if("subgraph"===R&&(e={},e.type="subgraph",u(),z===T.IDENTIFIER&&(e.id=R,u())),"{"===R){if(u(),e||(e={}),e.parent=t,e.node=t.node,e.edge=t.edge,e.graph=t.graph,f(e),"}"!=R)throw _("Angle bracket } expected");u(),delete e.node,delete e.edge,delete e.graph,delete e.parent,t.subgraphs||(t.subgraphs=[]),t.subgraphs.push(e)}return e}function g(t){return"node"===R?(u(),t.node=w(),"node"):"edge"===R?(u(),t.edge=w(),"edge"):"graph"===R?(u(),t.graph=w(),"graph"):null}function y(t,e){var i={id:e},o=w();o&&(i.attr=o),d(t,i),b(t,e)}function b(t,e){for(;"->"===R||"--"===R;){var i,o=R;u();var n=v(t);if(n)i=n;else{if(z!=T.IDENTIFIER)throw _("Identifier or subgraph expected");i=R,d(t,{id:i}),u()}var s=w(),r=c(t,e,i,o,s);l(t,r),e=i}}function w(){for(var t=null;"["===R;){for(u(),t={};""!==R&&"]"!=R;){if(z!=T.IDENTIFIER)throw _("Attribute name expected");var e=R;if(u(),"="!=R)throw _("Equal sign = expected");if(u(),z!=T.IDENTIFIER)throw _("Attribute value expected");var i=R;h(t,e,i),u(),","==R&&u()}if("]"!=R)throw _("Bracket ] expected");u()}return t}function _(t){return new SyntaxError(t+', got "'+x(R,30)+'" (char '+I+")")}function x(t,e){return t.length<=e?t:t.substr(0,27)+"..."}function k(t,e,i){Array.isArray(t)?t.forEach(function(t){Array.isArray(e)?e.forEach(function(e){i(t,e)}):i(t,e)}):Array.isArray(e)?e.forEach(function(e){i(t,e)}):i(t,e)}function O(t,e,i){for(var o=e.split("."),n=o.pop(),s=t,r=0;r":!0,"--":!0},P="",I=0,N="",R="",z=T.NULL,L=/[a-zA-Z_0-9.:#]/;e.parseDOT=i,e.DOTToGraph=D},function(t,e){function i(t,e){var i=[],o=[],n={edges:{inheritColor:!1},nodes:{fixed:!1,parseColor:!1}};void 0!==e&&(void 0!==e.fixed&&(n.nodes.fixed=e.fixed),void 0!==e.parseColor&&(n.nodes.parseColor=e.parseColor),void 0!==e.inheritColor&&(n.edges.inheritColor=e.inheritColor));for(var s=t.edges,r=t.nodes,a=0;a 1) { + val error = new SparkException(s"Different barrier sync types found for the " + + s"sync $barrierId: ${requestMethods.mkString(", ")}. Please use the " + + s"same barrier sync type within a single sync.") + (requesters :+ requester).foreach(_.sendFailure(error)) + clear() + return + } // Require the number of tasks is correctly set from the BarrierTaskContext. require(request.numTasks == numTasks, s"Number of tasks of $barrierId is " + @@ -153,33 +172,23 @@ private[spark] class BarrierCoordinator( } // Add the requester to array of RPCCallContexts pending for reply. requesters += requester + messages(request.partitionId) = request.message logInfo(s"Barrier sync epoch $barrierEpoch from $barrierId received update from Task " + s"$taskId, current progress: ${requesters.size}/$numTasks.") - if (maybeFinishAllRequesters(requesters, numTasks)) { + if (requesters.size == numTasks) { + requesters.foreach(_.reply(messages)) // Finished current barrier() call successfully, clean up ContextBarrierState and // increase the barrier epoch. logInfo(s"Barrier sync epoch $barrierEpoch from $barrierId received all updates from " + s"tasks, finished successfully.") barrierEpoch += 1 requesters.clear() + requestMethods.clear() cancelTimerTask() } } } - // Finish all the blocking barrier sync requests from a stage attempt successfully if we - // have received all the sync requests. - private def maybeFinishAllRequesters( - requesters: ArrayBuffer[RpcCallContext], - numTasks: Int): Boolean = { - if (requesters.size == numTasks) { - requesters.foreach(_.reply(())) - true - } else { - false - } - } - // Cleanup the internal state of a barrier stage attempt. 
def clear(): Unit = synchronized { // The global sync fails so the stage is expected to retry another attempt, all sync @@ -199,7 +208,7 @@ private[spark] class BarrierCoordinator( } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case request @ RequestToSync(numTasks, stageId, stageAttemptId, _, _) => + case request @ RequestToSync(numTasks, stageId, stageAttemptId, _, _, _, _, _) => // Get or init the ContextBarrierState correspond to the stage attempt. val barrierId = ContextBarrierId(stageId, stageAttemptId) states.computeIfAbsent(barrierId, @@ -217,18 +226,28 @@ private[spark] class BarrierCoordinator( private[spark] sealed trait BarrierCoordinatorMessage extends Serializable /** - * A global sync request message from BarrierTaskContext, by `barrier()` call. Each request is + * A global sync request message from BarrierTaskContext. Each request is * identified by stageId + stageAttemptId + barrierEpoch. * * @param numTasks The number of global sync requests the BarrierCoordinator shall receive * @param stageId ID of current stage * @param stageAttemptId ID of current stage attempt * @param taskAttemptId Unique ID of current task - * @param barrierEpoch ID of the `barrier()` call, a task may consist multiple `barrier()` calls. + * @param barrierEpoch ID of a runBarrier() call, a task may consist of multiple runBarrier() calls + * @param partitionId ID of the current partition the task is assigned to + * @param message Message sent from the BarrierTaskContext + * @param requestMethod The BarrierTaskContext method that was called to trigger the BarrierCoordinator */ private[spark] case class RequestToSync( - numTasks: Int, - stageId: Int, - stageAttemptId: Int, - taskAttemptId: Long, - barrierEpoch: Int) extends BarrierCoordinatorMessage + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + taskAttemptId: Long, + barrierEpoch: Int, + partitionId: Int, + message: String, + requestMethod: RequestMethod.Value) extends BarrierCoordinatorMessage + +private[spark] object RequestMethod extends Enumeration { + val BARRIER, ALL_GATHER = Value +} diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 3d369802f3023..4d765481eb836 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -20,8 +20,9 @@ package org.apache.spark import java.util.{Properties, Timer, TimerTask} import scala.collection.JavaConverters._ -import scala.concurrent.TimeoutException import scala.concurrent.duration._ +import scala.language.postfixOps +import scala.util.{Failure, Success => ScalaSuccess, Try} import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.executor.TaskMetrics @@ -59,49 +60,7 @@ class BarrierTaskContext private[spark] ( // from different tasks within the same barrier stage attempt to succeed. private lazy val numTasks = getTaskInfos().size - /** - * :: Experimental :: - * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to - * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same - * stage have reached this routine. - * - * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all - * possible code branches. Otherwise, you may get the job hanging or a SparkException after - * timeout. Some examples of '''misuses''' are listed below: - * 1. 
Only call barrier() function on a subset of all the tasks in the same barrier stage, it - * shall lead to timeout of the function call. - * {{{ - * rdd.barrier().mapPartitions { iter => - * val context = BarrierTaskContext.get() - * if (context.partitionId() == 0) { - * // Do nothing. - * } else { - * context.barrier() - * } - * iter - * } - * }}} - * - * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the - * second function call. - * {{{ - * rdd.barrier().mapPartitions { iter => - * val context = BarrierTaskContext.get() - * try { - * // Do something that might throw an Exception. - * doSomething() - * context.barrier() - * } catch { - * case e: Exception => logWarning("...", e) - * } - * context.barrier() - * iter - * } - * }}} - */ - @Experimental - @Since("2.4.0") - def barrier(): Unit = { + private def runBarrier(message: String, requestMethod: RequestMethod.Value): Array[String] = { logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt $stageAttemptNumber) has entered " + s"the global sync, current barrier epoch is $barrierEpoch.") logTrace("Current callSite: " + Utils.getCallSite()) @@ -119,9 +78,9 @@ class BarrierTaskContext private[spark] ( timer.schedule(timerTask, 60000, 60000) try { - val abortableRpcFuture = barrierCoordinator.askAbortable[Unit]( + val abortableRpcFuture = barrierCoordinator.askAbortable[Array[String]]( message = RequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, - barrierEpoch), + barrierEpoch, partitionId, message, requestMethod), // Set a fixed timeout for RPC here, so users shall get a SparkException thrown by // BarrierCoordinator on timeout, instead of RPCTimeoutException from the RPC framework. timeout = new RpcTimeout(365.days, "barrierTimeout")) @@ -129,29 +88,30 @@ class BarrierTaskContext private[spark] ( // Wait the RPC future to be completed, but every 1 second it will jump out waiting // and check whether current spark task is killed. If killed, then throw // a `TaskKilledException`, otherwise continue wait RPC until it completes. - try { - while (!abortableRpcFuture.toFuture.isCompleted) { + + while (!abortableRpcFuture.future.isCompleted) { + try { // wait RPC future for at most 1 second - try { - ThreadUtils.awaitResult(abortableRpcFuture.toFuture, 1.second) - } catch { - case _: TimeoutException | _: InterruptedException => - // If `TimeoutException` thrown, waiting RPC future reach 1 second. - // If `InterruptedException` thrown, it is possible this task is killed. - // So in this two cases, we should check whether task is killed and then - // throw `TaskKilledException` - taskContext.killTaskIfInterrupted() + Thread.sleep(1000) + } catch { + case _: InterruptedException => // task is killed by driver + } finally { + Try(taskContext.killTaskIfInterrupted()) match { + case ScalaSuccess(_) => // task is still running healthily + case Failure(e) => abortableRpcFuture.abort(e) + } + } - } finally { - abortableRpcFuture.abort(taskContext.getKillReason().getOrElse("Unknown reason.")) } + // The messages consist of all barrier tasks' messages. The future will return the + // desired messages if it completes successfully; otherwise, an exception will be thrown. 
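The array read back just below is what the user-facing allGather (added further down in this file) returns to every task; barrier() goes through the same path with an empty message and discards the result. A usage sketch, assuming a running SparkContext `sc`:
{{{
import org.apache.spark.BarrierTaskContext

val gathered = sc.parallelize(1 to 4, 2).barrier().mapPartitions { iter =>
  val context = BarrierTaskContext.get()
  // Every task contributes one message and receives all of them, indexed by
  // partition id on the coordinator side (messages(request.partitionId) = request.message).
  val all: Array[String] = context.allGather(s"partition-${context.partitionId()}")
  Iterator.single(all.mkString(","))
}
gathered.collect()  // every element is "partition-0,partition-1"
}}}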
+ val messages = abortableRpcFuture.future.value.get.get barrierEpoch += 1 logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt $stageAttemptNumber) finished " + "global sync successfully, waited for " + s"${MILLISECONDS.toSeconds(System.currentTimeMillis() - startTime)} seconds, " + s"current barrier epoch is $barrierEpoch.") + messages } catch { case e: SparkException => logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt $stageAttemptNumber) failed " + @@ -165,6 +125,64 @@ class BarrierTaskContext private[spark] ( } } + /** + * :: Experimental :: + * Sets a global barrier and waits until all tasks in this stage hit this barrier. Similar to + * MPI_Barrier function in MPI, the barrier() function call blocks until all tasks in the same + * stage have reached this routine. + * + * CAUTION! In a barrier stage, each task must have the same number of barrier() calls, in all + * possible code branches. Otherwise, you may get the job hanging or a SparkException after + * timeout. Some examples of '''misuses''' are listed below: + * 1. Only call barrier() function on a subset of all the tasks in the same barrier stage, it + * shall lead to timeout of the function call. + * {{{ + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() + * if (context.partitionId() == 0) { + * // Do nothing. + * } else { + * context.barrier() + * } + * iter + * } + * }}} + * + * 2. Include barrier() function in a try-catch code block, this may lead to timeout of the + * second function call. + * {{{ + * rdd.barrier().mapPartitions { iter => + * val context = BarrierTaskContext.get() + * try { + * // Do something that might throw an Exception. + * doSomething() + * context.barrier() + * } catch { + * case e: Exception => logWarning("...", e) + * } + * context.barrier() + * iter + * } + * }}} + */ + @Experimental + @Since("2.4.0") + def barrier(): Unit = runBarrier("", RequestMethod.BARRIER) + + /** + * :: Experimental :: + * Blocks until all tasks in the same stage have reached this routine. Each task passes in + * a message and returns with a list of all the messages passed in by each of those tasks. + * + * CAUTION! The allGather method requires the same precautions as the barrier method. + * + * The message is of type String rather than Array[Byte] because it is more convenient for + * the user at the cost of worse performance. + */ + @Experimental + @Since("3.0.0") + def allGather(message: String): Array[String] = runBarrier(message, RequestMethod.ALL_GATHER) + /** * :: Experimental :: * Returns [[BarrierTaskInfo]] for all tasks in this barrier stage, ordered by partition ID. diff --git a/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala new file mode 100644 index 0000000000000..c4d0dd8aceab0 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/ContextAwareIterator.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import org.apache.spark.annotation.DeveloperApi + +/** + * :: DeveloperApi :: + * A TaskContext aware iterator. + * + * As the Python evaluation consumes the parent iterator in a separate thread, + * it could consume more data from the parent even after the task ends and the parent is closed. + * If an off-heap access exists in the parent iterator, it could cause segmentation fault + * which crashes the executor. + * Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends. + */ +@DeveloperApi +class ContextAwareIterator[+T](val context: TaskContext, val delegate: Iterator[T]) + extends Iterator[T] { + + override def hasNext: Boolean = + !context.isCompleted() && !context.isInterrupted() && delegate.hasNext + + override def next(): T = delegate.next() +} diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 9506c36bf9c8c..7c3d6d98a33ed 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -281,7 +281,7 @@ private object ContextCleaner { } /** - * Listener class used for testing when any item has been cleaned by the Cleaner class. + * Listener class used when any item has been cleaned by the Cleaner class. */ private[spark] trait CleanerListener { def rddCleaned(rddId: Int): Unit diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 677386cc7a572..fffa7fd498b96 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -45,8 +45,8 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} * executors that could run all current running and pending tasks at once. * * Increasing the target number of executors happens in response to backlogged tasks waiting to be - * scheduled. If the scheduler queue is not drained in N seconds, then new executors are added. If - * the queue persists for another M seconds, then more executors are added and so on. The number + * scheduled. If the scheduler queue is not drained in M seconds, then new executors are added. If + * the queue persists for another N seconds, then more executors are added and so on. The number * added in each round increases exponentially from the previous round until an upper bound has been * reached. The upper bound is based both on a configured property and on the current number of * running and pending tasks, as described above. 
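The exponential ramp-up described in the ExecutorAllocationManager scaladoc above is driven by two backlog timeouts. A small configuration sketch; the keys are the standard dynamic-allocation properties, and the one-second values are purely illustrative:
{{{
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.dynamicAllocation.enabled", "true")
  // M in the comment above: how long the queue may stay backlogged before the first request
  .set("spark.dynamicAllocation.schedulerBacklogTimeout", "1s")
  // N in the comment above: the interval between each subsequent, exponentially larger request
  .set("spark.dynamicAllocation.sustainedSchedulerBacklogTimeout", "1s")
}}}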
@@ -196,7 +196,7 @@ private[spark] class ExecutorAllocationManager( s"s${DYN_ALLOCATION_SUSTAINED_SCHEDULER_BACKLOG_TIMEOUT.key} must be > 0!") } if (!conf.get(config.SHUFFLE_SERVICE_ENABLED)) { - if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING)) { + if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)) { logWarning("Dynamic allocation without a shuffle service is an experimental feature.") } else if (!testing) { throw new SparkException("Dynamic allocation of executors requires the external " + @@ -618,7 +618,11 @@ private[spark] class ExecutorAllocationManager( if (taskEnd.taskInfo.speculative) { stageAttemptToSpeculativeTaskIndices.get(stageAttempt).foreach {_.remove{taskIndex}} - stageAttemptToNumSpeculativeTasks(stageAttempt) -= 1 + // If the previous task attempt succeeded first and it was the last task in a stage, + // the stage may have been removed before handling this speculative TaskEnd event. + if (stageAttemptToNumSpeculativeTasks.contains(stageAttempt)) { + stageAttemptToNumSpeculativeTasks(stageAttempt) -= 1 + } } taskEnd.reason match { diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index 2ac72e66d6f32..be630722308fa 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -80,7 +80,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) // executor ID -> timestamp of when the last heartbeat from this executor was received private val executorLastSeen = new HashMap[String, Long] - private val executorTimeoutMs = sc.conf.get(config.STORAGE_BLOCKMANAGER_SLAVE_TIMEOUT) + private val executorTimeoutMs = sc.conf.get( + config.STORAGE_BLOCKMANAGER_SLAVE_TIMEOUT + ).getOrElse(Utils.timeStringAsMs(s"${sc.conf.get(Network.NETWORK_TIMEOUT)}s")) private val checkTimeoutIntervalMs = sc.conf.get(Network.NETWORK_TIMEOUT_INTERVAL) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index f229061a6d0f6..ec8621bc55cf3 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -696,7 +696,7 @@ private[spark] class MapOutputTrackerMaster( * * @param dep shuffle dependency object * @param startMapIndex the start map index - * @param endMapIndex the end map index + * @param endMapIndex the end map index (exclusive) + * @return a sequence of locations where task runs. 
*/ def getMapLocation( @@ -707,7 +707,8 @@ private[spark] class MapOutputTrackerMaster( val shuffleStatus = shuffleStatuses.get(dep.shuffleId).orNull if (shuffleStatus != null) { shuffleStatus.withMapStatuses { statuses => - if (startMapIndex < endMapIndex && (startMapIndex >= 0 && endMapIndex < statuses.length)) { + if (startMapIndex < endMapIndex && + (startMapIndex >= 0 && endMapIndex <= statuses.length)) { val statusesPicked = statuses.slice(startMapIndex, endMapIndex).filter(_ != null) statusesPicked.map(_.location.host).toSeq } else { diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 0e0291d2407d1..802100e05b728 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -577,7 +577,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria // If spark.executor.heartbeatInterval bigger than spark.network.timeout, // it will almost always cause ExecutorLostFailure. See SPARK-22754. require(executorTimeoutThresholdMs > executorHeartbeatIntervalMs, "The value of " + - s"${networkTimeout}=${executorTimeoutThresholdMs}ms must be no less than the value of " + + s"${networkTimeout}=${executorTimeoutThresholdMs}ms must be greater than the value of " + s"${EXECUTOR_HEARTBEAT_INTERVAL.key}=${executorHeartbeatIntervalMs}ms.") } @@ -684,7 +684,8 @@ private[spark] object SparkConf extends Logging { "spark.yarn.jars" -> Seq( AlternateConfig("spark.yarn.jar", "2.0")), MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM.key -> Seq( - AlternateConfig("spark.reducer.maxReqSizeShuffleToMem", "2.3")), + AlternateConfig("spark.reducer.maxReqSizeShuffleToMem", "2.3"), + AlternateConfig("spark.maxRemoteBlockSizeFetchToMem", "3.0")), LISTENER_BUS_EVENT_QUEUE_CAPACITY.key -> Seq( AlternateConfig("spark.scheduler.listenerbus.eventqueue.size", "2.3")), DRIVER_MEMORY_OVERHEAD.key -> Seq( diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 91188d58f4201..3ccbea5136f4a 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -41,7 +41,6 @@ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFor import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} -import org.apache.spark.deploy.StandaloneResourceUtils._ import org.apache.spark.executor.{ExecutorMetrics, ExecutorMetricsSource} import org.apache.spark.input.{FixedLengthBinaryInputFormat, PortableDataStream, StreamInputFormat, WholeTextFileInputFormat} import org.apache.spark.internal.Logging @@ -83,6 +82,11 @@ class SparkContext(config: SparkConf) extends Logging { // The call site where this SparkContext was constructed. private val creationSite: CallSite = Utils.getCallSite() + if (!config.get(EXECUTOR_ALLOW_SPARK_CONTEXT)) { + // In order to prevent SparkContext from being created in executors. + SparkContext.assertOnDriver() + } + // In order to prevent multiple SparkContexts from being active at the same time, mark this // context as having started construction. // NOTE: this must be placed at the beginning of the SparkContext constructor. 
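A sketch of what the new guard above changes for user code, assuming a running SparkContext `sc`; the opt-out is the EXECUTOR_ALLOW_SPARK_CONTEXT config referenced in the added lines:
{{{
import org.apache.spark.{SparkConf, SparkContext}

// Constructing a SparkContext inside a task now trips assertOnDriver(), because
// TaskContext.get() is non-null during task execution.
sc.parallelize(1 to 2, 2).foreach { _ =>
  new SparkContext(new SparkConf().setAppName("nested").setMaster("local"))
  // expected: IllegalStateException from the driver-only assertion
}
}}}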
@@ -250,15 +254,6 @@ class SparkContext(config: SparkConf) extends Logging { def isLocal: Boolean = Utils.isLocalMaster(_conf) - private def isClientStandalone: Boolean = { - val isSparkCluster = master match { - case SparkMasterRegex.SPARK_REGEX(_) => true - case SparkMasterRegex.LOCAL_CLUSTER_REGEX(_, _, _) => true - case _ => false - } - deployMode == "client" && isSparkCluster - } - /** * @return true if context is stopped or in the midst of stopping. */ @@ -385,6 +380,7 @@ class SparkContext(config: SparkConf) extends Logging { try { _conf = config.clone() _conf.validateSettings() + _conf.set("spark.app.startTime", startTime.toString) if (!_conf.contains("spark.master")) { throw new SparkException("A master URL must be set in your configuration") @@ -396,17 +392,7 @@ class SparkContext(config: SparkConf) extends Logging { _driverLogger = DriverLogger(_conf) val resourcesFileOpt = conf.get(DRIVER_RESOURCES_FILE) - val allResources = getOrDiscoverAllResources(_conf, SPARK_DRIVER_PREFIX, resourcesFileOpt) - _resources = { - // driver submitted in client mode under Standalone may have conflicting resources with - // other drivers/workers on this host. We should sync driver's resources info into - // SPARK_RESOURCES/SPARK_RESOURCES_COORDINATE_DIR/ to avoid collision. - if (isClientStandalone) { - acquireResources(_conf, SPARK_DRIVER_PREFIX, allResources, Utils.getProcessId) - } else { - allResources - } - } + _resources = getOrDiscoverAllResources(_conf, SPARK_DRIVER_PREFIX, resourcesFileOpt) logResourceInfo(SPARK_DRIVER_PREFIX, _resources) // log out spark.app.name in the Spark driver logs @@ -502,11 +488,17 @@ class SparkContext(config: SparkConf) extends Logging { // Add each JAR given through the constructor if (jars != null) { - jars.foreach(addJar) + jars.foreach(jar => addJar(jar, true)) + if (addedJars.nonEmpty) { + _conf.set("spark.app.initial.jar.urls", addedJars.keys.toSeq.mkString(",")) + } } if (files != null) { - files.foreach(addFile) + files.foreach(file => addFile(file, false, true)) + if (addedFiles.nonEmpty) { + _conf.set("spark.app.initial.file.urls", addedFiles.keys.toSeq.mkString(",")) + } } _executorMemory = _conf.getOption(EXECUTOR_MEMORY.key) @@ -1510,7 +1502,7 @@ class SparkContext(config: SparkConf) extends Logging { * @note A path can be added only once. Subsequent additions of the same path are ignored. */ def addFile(path: String): Unit = { - addFile(path, false) + addFile(path, false, false) } /** @@ -1532,6 +1524,10 @@ class SparkContext(config: SparkConf) extends Logging { * @note A path can be added only once. Subsequent additions of the same path are ignored. */ def addFile(path: String, recursive: Boolean): Unit = { + addFile(path, recursive, false) + } + + private def addFile(path: String, recursive: Boolean, addedOnSubmit: Boolean): Unit = { val uri = new Path(path).toUri val schemeCorrectedURI = uri.getScheme match { case null => new File(path).getCanonicalFile.toURI @@ -1569,7 +1565,7 @@ class SparkContext(config: SparkConf) extends Logging { path } } - val timestamp = System.currentTimeMillis + val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis if (addedFiles.putIfAbsent(key, timestamp).isEmpty) { logInfo(s"Added file $path at $key with timestamp $timestamp") // Fetch the file locally so that closures which are run on the driver can still use the @@ -1579,7 +1575,7 @@ class SparkContext(config: SparkConf) extends Logging { postEnvironmentUpdate() } else { logWarning(s"The path $path has been added already. 
Overwriting of added paths " + - "is not supported in the current version.") + "is not supported in the current version.") } } @@ -1612,7 +1608,8 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Get the max number of tasks that can be concurrent launched currently. + * Get the max number of tasks that can be launched concurrently, based on the resources that + * could be used, even if some of them are being used at the moment. * Note that please don't cache the value returned by this method, because the number can change * due to add/remove executors. * @@ -1841,6 +1838,10 @@ class SparkContext(config: SparkConf) extends Logging { * @note A path can be added only once. Subsequent additions of the same path are ignored. */ def addJar(path: String): Unit = { + addJar(path, false) + } + + private def addJar(path: String, addedOnSubmit: Boolean): Unit = { def addLocalJarFile(file: File): String = { try { if (!file.exists()) { @@ -1867,7 +1868,7 @@ class SparkContext(config: SparkConf) extends Logging { if (!fs.exists(hadoopPath)) { throw new FileNotFoundException(s"Jar ${path} not found") } - if (fs.isDirectory(hadoopPath)) { + if (fs.getFileStatus(hadoopPath).isDirectory) { throw new IllegalArgumentException( s"Directory ${path} is not allowed for addJar") } @@ -1905,7 +1906,7 @@ class SparkContext(config: SparkConf) extends Logging { } } if (key != null) { - val timestamp = System.currentTimeMillis + val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis if (addedJars.putIfAbsent(key, timestamp).isEmpty) { logInfo(s"Added JAR $path at $key with timestamp $timestamp") postEnvironmentUpdate() @@ -2019,9 +2020,6 @@ class SparkContext(config: SparkConf) extends Logging { Utils.tryLogNonFatalError { _progressBar.foreach(_.stop()) } - if (isClientStandalone) { - releaseResources(_conf, SPARK_DRIVER_PREFIX, _resources, Utils.getProcessId) - } _taskScheduler = null // TODO: Cache.stop()? if (_env != null) { @@ -2562,6 +2560,19 @@ object SparkContext extends Logging { } } + /** + * Called to ensure that SparkContext is created or accessed only on the Driver. + * + * Throws an exception if a SparkContext is about to be created in executors. + */ + private def assertOnDriver(): Unit = { + if (TaskContext.get != null) { + // we're accessing it during task execution, fail. + throw new IllegalStateException( + "SparkContext should only be created and accessed on the driver.") + } + } + /** * This function may be used to get or instantiate a SparkContext and register it as a * singleton object. Because we can only have one active SparkContext per JVM, @@ -2781,8 +2792,9 @@ object SparkContext extends Logging { } // some cluster managers don't set the EXECUTOR_CORES config by default (standalone // and mesos coarse grained), so we can't rely on that config for those. - val shouldCheckExecCores = executorCores.isDefined || sc.conf.contains(EXECUTOR_CORES) || + var shouldCheckExecCores = executorCores.isDefined || sc.conf.contains(EXECUTOR_CORES) || (master.equalsIgnoreCase("yarn") || master.startsWith("k8s")) + shouldCheckExecCores &= !sc.conf.get(SKIP_VALIDATE_CORES_TESTING) // Number of cores per executor must meet at least one task requirement. 
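The core-count validation referenced just above compares executor cores with task CPUs; a minimal illustration with made-up values:
{{{
import org.apache.spark.SparkConf

// 4 cores per executor and 2 CPUs per task leave 2 task slots per executor, so the
// check passes; configuring spark.task.cpus larger than spark.executor.cores would fail it.
val conf = new SparkConf()
  .set("spark.executor.cores", "4")
  .set("spark.task.cpus", "2")
}}}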
if (shouldCheckExecCores && execCores < taskCores) { @@ -2838,7 +2850,7 @@ object SparkContext extends Logging { limitingResourceName = taskReq.resourceName } } - if(!shouldCheckExecCores && Utils.isDynamicAllocationEnabled(sc.conf)) { + if(!shouldCheckExecCores) { // if we can't rely on the executor cores config throw a warning for user logWarning("Please ensure that the number of slots available on your " + "executors is limited by the number of cores to task cpus and not another " + @@ -2862,7 +2874,7 @@ object SparkContext extends Logging { s"result in wasted resources due to resource ${limitingResourceName} limiting the " + s"number of runnable tasks per executor to: ${numSlots}. Please adjust " + s"your configuration." - if (Utils.isTesting) { + if (sc.conf.get(RESOURCES_WARNING_TESTING)) { throw new SparkException(message) } else { logWarning(message) diff --git a/core/src/main/scala/org/apache/spark/SparkException.scala b/core/src/main/scala/org/apache/spark/SparkException.scala index 4ad9a0cc4b103..41382133bd84c 100644 --- a/core/src/main/scala/org/apache/spark/SparkException.scala +++ b/core/src/main/scala/org/apache/spark/SparkException.scala @@ -43,3 +43,10 @@ private[spark] case class SparkUserAppException(exitCode: Int) */ private[spark] case class ExecutorDeadException(message: String) extends SparkException(message) + +/** + * Exception thrown when Spark returns different result after upgrading to a new version. + */ +private[spark] class SparkUpgradeException(version: String, message: String, cause: Throwable) + extends RuntimeException("You may get a different result due to the upgrading of Spark" + + s" $version: $message", cause) diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index b13028f868072..6606d317e7b86 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -90,7 +90,8 @@ case class FetchFailed( extends TaskFailedReason { override def toErrorString: String = { val bmAddressString = if (bmAddress == null) "null" else bmAddress.toString - s"FetchFailed($bmAddressString, shuffleId=$shuffleId, mapIndex=$mapIndex, " + + val mapIndexString = if (mapIndex == Int.MinValue) "Unknown" else mapIndex.toString + s"FetchFailed($bmAddressString, shuffleId=$shuffleId, mapIndex=$mapIndexString, " + s"mapId=$mapId, reduceId=$reduceId, message=\n$message\n)" } diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index d459627930f4c..054e7b07d1da6 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -179,11 +179,20 @@ private[spark] object TestUtils { destDir: File, toStringValue: String = "", baseClass: String = null, - classpathUrls: Seq[URL] = Seq.empty): File = { + classpathUrls: Seq[URL] = Seq.empty, + implementsClasses: Seq[String] = Seq.empty, + extraCodeBody: String = ""): File = { val extendsText = Option(baseClass).map { c => s" extends ${c}" }.getOrElse("") + val implementsText = + "implements " + (implementsClasses :+ "java.io.Serializable").mkString(", ") val sourceFile = new JavaSourceFromString(className, - "public class " + className + extendsText + " implements java.io.Serializable {" + - " @Override public String toString() { return \"" + toStringValue + "\"; }}") + s""" + |public class $className $extendsText $implementsText { + | @Override public String 
toString() { return "$toStringValue"; } + | + | $extraCodeBody + |} + """.stripMargin) createCompiledClass(className, destDir, sourceFile, classpathUrls) } @@ -240,6 +249,19 @@ private[spark] object TestUtils { attempt.isSuccess && attempt.get == 0 } + def isPythonVersionAtLeast38(): Boolean = { + val attempt = if (Utils.isWindows) { + Try(Process(Seq("cmd.exe", "/C", "python3 --version")) + .run(ProcessLogger(s => s.startsWith("Python 3.8") || s.startsWith("Python 3.9"))) + .exitValue()) + } else { + Try(Process(Seq("sh", "-c", "python3 --version")) + .run(ProcessLogger(s => s.startsWith("Python 3.8") || s.startsWith("Python 3.9"))) + .exitValue()) + } + attempt.isSuccess && attempt.get == 0 + } + /** * Returns the response code from an HTTP(S) URL. */ diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 6dc1721f56adf..a577194a48006 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -168,6 +168,21 @@ private[spark] object PythonRDD extends Logging { serveIterator(rdd.collect().iterator, s"serve RDD ${rdd.id}") } + /** + * A helper function to collect an RDD as an iterator, then serve it via socket. + * This method is similar to `PythonRDD.collectAndServe`, but the user can specify the job group id, + * job description, and interruptOnCancel option. + */ + def collectAndServeWithJobGroup[T]( + rdd: RDD[T], + groupId: String, + description: String, + interruptOnCancel: Boolean): Array[Any] = { + val sc = rdd.sparkContext + sc.setJobGroup(groupId, description, interruptOnCancel) + serveIterator(rdd.collect().iterator, s"serve RDD ${rdd.id}") + } + /** + * A helper function to create a local RDD iterator and serve it via socket. Partitions are + * are collected as separate jobs, by order of index. Partition data is first requested by a diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 658e0d593a167..820cb798050f0 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -24,8 +24,13 @@ import java.nio.charset.StandardCharsets.UTF_8 import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES} @@ -225,7 +230,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( /* backlog */ 1, InetAddress.getByName("localhost"))) // A call to accept() for ServerSocket shall block infinitely. - serverSocket.map(_.setSoTimeout(0)) + serverSocket.foreach(_.setSoTimeout(0)) new Thread("accept-connections") { setDaemon(true) @@ -238,13 +243,18 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( sock.setSoTimeout(10000) authHelper.authClient(sock) val input = new DataInputStream(sock.getInputStream()) - input.readInt() match { + val requestMethod = input.readInt() + // The BarrierTaskContext function may wait infinitely, socket shall not timeout + // before the function finishes. 
+ sock.setSoTimeout(0) + requestMethod match { case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => - // The barrier() function may wait infinitely, socket shall not timeout - // before the function finishes. - sock.setSoTimeout(0) - barrierAndServe(sock) - + barrierAndServe(requestMethod, sock) + case BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION => + val length = input.readInt() + val message = new Array[Byte](length) + input.readFully(message) + barrierAndServe(requestMethod, sock, new String(message, UTF_8)) case _ => val out = new DataOutputStream(new BufferedOutputStream( sock.getOutputStream)) @@ -395,15 +405,24 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( } /** - * Gateway to call BarrierTaskContext.barrier(). + * Gateway to call BarrierTaskContext methods. */ - def barrierAndServe(sock: Socket): Unit = { - require(serverSocket.isDefined, "No available ServerSocket to redirect the barrier() call.") - + def barrierAndServe(requestMethod: Int, sock: Socket, message: String = ""): Unit = { + require( + serverSocket.isDefined, + "No available ServerSocket to redirect the BarrierTaskContext method call." + ) val out = new DataOutputStream(new BufferedOutputStream(sock.getOutputStream)) try { - context.asInstanceOf[BarrierTaskContext].barrier() - writeUTF(BarrierTaskContextMessageProtocol.BARRIER_RESULT_SUCCESS, out) + val messages = requestMethod match { + case BarrierTaskContextMessageProtocol.BARRIER_FUNCTION => + context.asInstanceOf[BarrierTaskContext].barrier() + Array(BarrierTaskContextMessageProtocol.BARRIER_RESULT_SUCCESS) + case BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION => + context.asInstanceOf[BarrierTaskContext].allGather(message) + } + out.writeInt(messages.length) + messages.foreach(writeUTF(_, out)) } catch { case e: SparkException => writeUTF(e.getMessage, out) @@ -638,6 +657,7 @@ private[spark] object SpecialLengths { private[spark] object BarrierTaskContextMessageProtocol { val BARRIER_FUNCTION = 1 + val ALL_GATHER_FUNCTION = 2 val BARRIER_RESULT_SUCCESS = "success" val ERROR_UNRECOGNIZED_FUNCTION = "Not recognized function call from python side." 
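The accept loop above now reads an int request method and, for ALL_GATHER_FUNCTION, a length-prefixed UTF-8 message; the constants at the end of this hunk give the method codes. A hypothetical client-side sketch of that request framing (in practice the PySpark worker sends this):
{{{
import java.io.DataOutputStream
import java.net.Socket
import java.nio.charset.StandardCharsets.UTF_8

// Mirrors the server-side reads shown above: writeInt(requestMethod), and for
// allGather an int byte length followed by the UTF-8 encoded message.
def sendAllGatherRequest(sock: Socket, message: String): Unit = {
  val out = new DataOutputStream(sock.getOutputStream)
  val bytes = message.getBytes(UTF_8)
  out.writeInt(2)            // BarrierTaskContextMessageProtocol.ALL_GATHER_FUNCTION
  out.writeInt(bytes.length) // message length in bytes
  out.write(bytes)
  out.flush()
}
}}}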
} diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 62d60475985b3..78eb6907c3898 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -27,7 +27,7 @@ import org.apache.spark.SparkContext import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} private[spark] object PythonUtils { - val PY4J_ZIP_NAME = "py4j-0.10.8.1-src.zip" + val PY4J_ZIP_NAME = "py4j-0.10.9-src.zip" /** Get the PYTHONPATH for PySpark, either from SPARK_HOME, if it is set, or from our JAR */ def sparkPythonPath: String = { @@ -85,4 +85,8 @@ private[spark] object PythonUtils { def getBroadcastThreshold(sc: JavaSparkContext): Long = { sc.conf.get(org.apache.spark.internal.config.BROADCAST_FOR_UDF_COMPRESSION_THRESHOLD) } + + def getSparkBufferSize(sc: JavaSparkContext): Int = { + sc.conf.get(org.apache.spark.internal.config.BUFFER_SIZE) + } } diff --git a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala index 311fade127839..784a57e7b98a8 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RUtils.scala @@ -43,9 +43,9 @@ private[spark] object RUtils { * Check if SparkR is installed before running tests that use SparkR. */ def isSparkRInstalled: Boolean = { - localSparkRPackagePath.filter { pkgDir => + localSparkRPackagePath.exists { pkgDir => new File(Seq(pkgDir, "SparkR").mkString(File.separator)).exists - }.isDefined + } } /** diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index 77fbbc08c2103..1024d9b5060bc 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -133,22 +133,30 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) if (!blockManager.putSingle(broadcastId, value, MEMORY_AND_DISK, tellMaster = false)) { throw new SparkException(s"Failed to store $broadcastId in BlockManager") } - val blocks = - TorrentBroadcast.blockifyObject(value, blockSize, SparkEnv.get.serializer, compressionCodec) - if (checksumEnabled) { - checksums = new Array[Int](blocks.length) - } - blocks.zipWithIndex.foreach { case (block, i) => + try { + val blocks = + TorrentBroadcast.blockifyObject(value, blockSize, SparkEnv.get.serializer, compressionCodec) if (checksumEnabled) { - checksums(i) = calcChecksum(block) + checksums = new Array[Int](blocks.length) } - val pieceId = BroadcastBlockId(id, "piece" + i) - val bytes = new ChunkedByteBuffer(block.duplicate()) - if (!blockManager.putBytes(pieceId, bytes, MEMORY_AND_DISK_SER, tellMaster = true)) { - throw new SparkException(s"Failed to store $pieceId of $broadcastId in local BlockManager") + blocks.zipWithIndex.foreach { case (block, i) => + if (checksumEnabled) { + checksums(i) = calcChecksum(block) + } + val pieceId = BroadcastBlockId(id, "piece" + i) + val bytes = new ChunkedByteBuffer(block.duplicate()) + if (!blockManager.putBytes(pieceId, bytes, MEMORY_AND_DISK_SER, tellMaster = true)) { + throw new SparkException(s"Failed to store $pieceId of $broadcastId " + + s"in local BlockManager") + } } + blocks.length + } catch { + case t: Throwable => + logError(s"Store broadcast $broadcastId fail, remove all pieces of the broadcast") 
+ blockManager.removeBroadcast(id, tellMaster = true) + throw t } - blocks.length } /** Fetch torrent blocks from the driver and/or other executors. */ diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 1180501e8c738..6f799a542bc1e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -462,6 +462,9 @@ private[spark] object SparkHadoopUtil { for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) { hadoopConf.set(key.substring("spark.hadoop.".length), value) } + if (conf.getOption("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version").isEmpty) { + hadoopConf.set("mapreduce.fileoutputcommitter.algorithm.version", "1") + } } private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 3f7cfea778ac6..3090a3b10a97c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -513,7 +513,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | directory of each executor. File paths of these files | in executors can be accessed via SparkFiles.get(fileName). | - | --conf PROP=VALUE Arbitrary Spark configuration property. + | --conf, -c PROP=VALUE Arbitrary Spark configuration property. | --properties-file FILE Path to a file from which to load extra properties. If not | specified, this will look for conf/spark-defaults.conf. | diff --git a/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala b/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala index 65bf4351ebfd9..c7c31a85b0636 100644 --- a/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala @@ -17,30 +17,21 @@ package org.apache.spark.deploy -import java.io.{File, RandomAccessFile} -import java.nio.channels.{FileLock, OverlappingFileLockException} +import java.io.File import java.nio.file.Files import scala.collection.mutable -import scala.util.Random import scala.util.control.NonFatal import org.json4s.{DefaultFormats, Extraction} -import org.json4s.jackson.JsonMethods.{compact, parse, render} +import org.json4s.jackson.JsonMethods.{compact, render} -import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.SparkException import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{SPARK_RESOURCES_COORDINATE, SPARK_RESOURCES_DIR} import org.apache.spark.resource.{ResourceAllocation, ResourceID, ResourceInformation, ResourceRequirement} -import org.apache.spark.resource.ResourceUtils.{parseResourceRequirements, withResourcesJson} import org.apache.spark.util.Utils private[spark] object StandaloneResourceUtils extends Logging { - // These directory/files are used to coordinate the resources between - // the drivers/workers on the host in Spark Standalone. - val SPARK_RESOURCES_COORDINATE_DIR = "spark-resources" - val ALLOCATED_RESOURCES_FILE = "__allocated_resources__.json" - val RESOURCES_LOCK_FILE = "__allocated_resources__.lock" /** * A mutable resource information which provides more efficient modification on addresses. 
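The SparkHadoopUtil change above pins mapreduce.fileoutputcommitter.algorithm.version to 1 only when the user has not set it; because the guard checks the spark.hadoop.* option first, an explicit override still reaches the Hadoop configuration. A small sketch:
{{{
import org.apache.spark.SparkConf

// The v1 default applies only when this option is absent; setting it explicitly
// via the spark.hadoop.* passthrough keeps the user's choice.
val conf = new SparkConf()
  .set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
}}}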
@@ -86,249 +77,6 @@ private[spark] object StandaloneResourceUtils extends Logging { } } - /** - * Assigns (if coordinate needed) resources to workers/drivers from the same host to avoid - * address conflict. - * - * This function works in three steps. First, acquiring the lock on RESOURCES_LOCK_FILE - * to achieve synchronization among workers and drivers. Second, getting all allocated - * resources from ALLOCATED_RESOURCES_FILE and assigning isolated resources to the worker - * or driver after differentiating available resources in discovered resources from - * allocated resources. If available resources don't meet worker's or driver's requirement, - * try to update allocated resources by excluding the resource allocation if its related - * process has already terminated and do the assignment again. If still don't meet requirement, - * exception should be thrown. Third, updating ALLOCATED_RESOURCES_FILE with new allocated - * resources along with pid for the worker or driver. Then, return allocated resources - * information after releasing the lock. - * - * @param conf SparkConf - * @param componentName spark.driver / spark.worker - * @param resources the resources found by worker/driver on the host - * @param pid the process id of worker/driver to acquire resources. - * @return allocated resources for the worker/driver or throws exception if can't - * meet worker/driver's requirement - */ - def acquireResources( - conf: SparkConf, - componentName: String, - resources: Map[String, ResourceInformation], - pid: Int) - : Map[String, ResourceInformation] = { - if (!needCoordinate(conf)) { - return resources - } - val resourceRequirements = parseResourceRequirements(conf, componentName) - if (resourceRequirements.isEmpty) { - return Map.empty - } - val lock = acquireLock(conf) - try { - val resourcesFile = new File(getOrCreateResourcesDir(conf), ALLOCATED_RESOURCES_FILE) - // all allocated resources in ALLOCATED_RESOURCES_FILE, can be updated if any allocations' - // related processes detected to be terminated while checking pids below. - var origAllocation = Seq.empty[StandaloneResourceAllocation] - // Map[pid -> Map[resourceName -> Addresses[]]] - var allocated = { - if (resourcesFile.exists()) { - origAllocation = allocatedStandaloneResources(resourcesFile.getPath) - val allocations = origAllocation.map { resource => - val resourceMap = { - resource.allocations.map { allocation => - allocation.id.resourceName -> allocation.addresses.toArray - }.toMap - } - resource.pid -> resourceMap - }.toMap - allocations - } else { - Map.empty[Int, Map[String, Array[String]]] - } - } - - // new allocated resources for worker or driver, - // map from resource name to its allocated addresses. - var newAssignments: Map[String, Array[String]] = null - // Whether we've checked process status and we'll only do the check at most once. - // Do the check iff the available resources can't meet the requirements at the first time. - var checked = false - // Whether we need to keep allocating for the worker/driver and we'll only go through - // the loop at most twice. - var keepAllocating = true - while (keepAllocating) { - keepAllocating = false - // store the pid whose related allocated resources conflict with - // discovered resources passed in. 
- val pidsToCheck = mutable.Set[Int]() - newAssignments = resourceRequirements.map { req => - val rName = req.resourceName - val amount = req.amount - // initially, we must have available.length >= amount as we've done pre-check previously - var available = resources(rName).addresses - // gets available resource addresses by excluding all - // allocated resource addresses from discovered resources - allocated.foreach { a => - val thePid = a._1 - val resourceMap = a._2 - val assigned = resourceMap.getOrElse(rName, Array.empty) - val retained = available.diff(assigned) - // if len(retained) < len(available) after differ to assigned, then, there must be - // some conflicting resources addresses between available and assigned. So, we should - // store its pid here to check whether it's alive in case we don't find enough - // resources after traversal all allocated resources. - if (retained.length < available.length && !checked) { - pidsToCheck += thePid - } - if (retained.length >= amount) { - available = retained - } else if (checked) { - keepAllocating = false - throw new SparkException(s"No more resources available since they've already" + - s" assigned to other workers/drivers.") - } else { - keepAllocating = true - } - } - val assigned = { - if (keepAllocating) { // can't meet the requirement - // excludes the allocation whose related process has already been terminated. - val (invalid, valid) = allocated.partition { a => - pidsToCheck(a._1) && !(Utils.isTesting || Utils.isProcessRunning(a._1))} - allocated = valid - origAllocation = origAllocation.filter( - allocation => !invalid.contains(allocation.pid)) - checked = true - // note this is a meaningless return value, just to avoid creating any new object - available - } else { - available.take(amount) - } - } - rName -> assigned - }.toMap - } - val newAllocation = { - val allocations = newAssignments.map { case (rName, addresses) => - ResourceAllocation(new ResourceID(componentName, rName), addresses) - }.toSeq - StandaloneResourceAllocation(pid, allocations) - } - writeResourceAllocationJson( - componentName, origAllocation ++ Seq(newAllocation), resourcesFile) - newAllocation.toResourceInformationMap - } finally { - releaseLock(lock) - } - } - - /** - * Frees (if coordinate needed) all the resources a worker/driver (pid) has in one shot - * to make those resources be available for other workers/drivers on the same host. - * @param conf SparkConf - * @param componentName spark.driver / spark.worker - * @param toRelease the resources expected to release - * @param pid the process id of worker/driver to release resources. 
- */ - def releaseResources( - conf: SparkConf, - componentName: String, - toRelease: Map[String, ResourceInformation], - pid: Int) - : Unit = { - if (!needCoordinate(conf)) { - return - } - if (toRelease != null && toRelease.nonEmpty) { - val lock = acquireLock(conf) - try { - val resourcesFile = new File(getOrCreateResourcesDir(conf), ALLOCATED_RESOURCES_FILE) - if (resourcesFile.exists()) { - val (target, others) = - allocatedStandaloneResources(resourcesFile.getPath).partition(_.pid == pid) - if (target.nonEmpty) { - if (others.isEmpty) { - if (!resourcesFile.delete()) { - logError(s"Failed to delete $ALLOCATED_RESOURCES_FILE.") - } - } else { - writeResourceAllocationJson(componentName, others, resourcesFile) - } - logDebug(s"$componentName(pid=$pid) released resources: ${toRelease.mkString("\n")}") - } else { - logWarning(s"$componentName(pid=$pid) has already released its resources.") - } - } - } finally { - releaseLock(lock) - } - } - } - - private def acquireLock(conf: SparkConf): FileLock = { - val resourcesDir = getOrCreateResourcesDir(conf) - val lockFile = new File(resourcesDir, RESOURCES_LOCK_FILE) - val lockFileChannel = new RandomAccessFile(lockFile, "rw").getChannel - var keepTry = true - var lock: FileLock = null - while (keepTry) { - try { - lock = lockFileChannel.lock() - logInfo(s"Acquired lock on $RESOURCES_LOCK_FILE.") - keepTry = false - } catch { - case e: OverlappingFileLockException => - // This exception throws when we're in LocalSparkCluster mode. FileLock is designed - // to be used across JVMs, but our LocalSparkCluster is designed to launch multiple - // workers in the same JVM. As a result, when an worker in LocalSparkCluster try to - // acquire the lock on `resources.lock` which already locked by other worker, we'll - // hit this exception. So, we should manually control it. - keepTry = true - // there may be multiple workers race for the lock, - // so, sleep for a random time to avoid possible conflict - val duration = Random.nextInt(1000) + 1000 - Thread.sleep(duration) - } - } - assert(lock != null, s"Acquired null lock on $RESOURCES_LOCK_FILE.") - lock - } - - private def releaseLock(lock: FileLock): Unit = { - try { - lock.release() - lock.channel().close() - logInfo(s"Released lock on $RESOURCES_LOCK_FILE.") - } catch { - case e: Exception => - logError(s"Error while releasing lock on $RESOURCES_LOCK_FILE.", e) - } - } - - private def getOrCreateResourcesDir(conf: SparkConf): File = { - val coordinateDir = new File(conf.get(SPARK_RESOURCES_DIR).getOrElse { - val sparkHome = if (Utils.isTesting) { - assert(sys.props.contains("spark.test.home") || - sys.env.contains("SPARK_HOME"), "spark.test.home or SPARK_HOME is not set.") - sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) - } else { - sys.env.getOrElse("SPARK_HOME", ".") - } - sparkHome - }) - val resourceDir = new File(coordinateDir, SPARK_RESOURCES_COORDINATE_DIR) - if (!resourceDir.exists()) { - Utils.createDirectory(resourceDir) - } - resourceDir - } - - private def allocatedStandaloneResources(resourcesFile: String) - : Seq[StandaloneResourceAllocation] = { - withResourcesJson[StandaloneResourceAllocation](resourcesFile) { json => - implicit val formats = DefaultFormats - parse(json).extract[Seq[StandaloneResourceAllocation]] - } - } - /** * Save the allocated resources of driver(cluster only) or executor into a JSON formatted * resources file. Used in Standalone only. 
@@ -372,11 +120,6 @@ private[spark] object StandaloneResourceUtils extends Logging { Files.write(jsonFile.toPath, compact(render(allocationJson)).getBytes()) } - /** Whether needs to coordinate resources among workers and drivers for user */ - def needCoordinate(conf: SparkConf): Boolean = { - conf.get(SPARK_RESOURCES_COORDINATE) - } - def toMutable(immutableResources: Map[String, ResourceInformation]) : Map[String, MutableResourceInfo] = { immutableResources.map { case (rName, rInfo) => @@ -406,11 +149,11 @@ private[spark] object StandaloneResourceUtils extends Logging { // used for UI def formatResourcesUsed( - resourcesTotal: Map[String, ResourceInformation], - resourcesUsed: Map[String, ResourceInformation]): String = { - resourcesTotal.map { case (rName, rInfo) => - val used = resourcesUsed(rName).addresses.length - val total = rInfo.addresses.length + resourcesTotal: Map[String, Int], + resourcesUsed: Map[String, Int]): String = { + resourcesTotal.map { case (rName, totalSize) => + val used = resourcesUsed(rName) + val total = totalSize s"$used / $total $rName" }.mkString(", ") } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala index 9f63a6441a838..b4771c80a175f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala @@ -116,7 +116,7 @@ object EventLogFileReader { def apply(fs: FileSystem, status: FileStatus): Option[EventLogFileReader] = { if (isSingleEventLog(status)) { - Some(new SingleFileEventLogFileReader(fs, status.getPath)) + Some(new SingleFileEventLogFileReader(fs, status.getPath, Option(status))) } else if (isRollingEventLogs(status)) { Some(new RollingEventLogFilesFileReader(fs, status.getPath)) } else { @@ -164,10 +164,13 @@ object EventLogFileReader { * FileNotFoundException could occur if the log file is renamed before getting the * status of log file. */ -class SingleFileEventLogFileReader( +private[history] class SingleFileEventLogFileReader( fs: FileSystem, - path: Path) extends EventLogFileReader(fs, path) { - private lazy val status = fileSystem.getFileStatus(rootPath) + path: Path, + maybeStatus: Option[FileStatus]) extends EventLogFileReader(fs, path) { + private lazy val status = maybeStatus.getOrElse(fileSystem.getFileStatus(rootPath)) + + def this(fs: FileSystem, path: Path) = this(fs, path, None) override def lastIndex: Option[Long] = None @@ -203,7 +206,7 @@ class SingleFileEventLogFileReader( * This reader lists the files only once; if caller would like to play with updated list, * it needs to create another reader instance. 
*/ -class RollingEventLogFilesFileReader( +private[history] class RollingEventLogFilesFileReader( fs: FileSystem, path: Path) extends EventLogFileReader(fs, path) { import RollingEventLogFilesWriter._ diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala index 1d58d054b7825..7d44cbd9f64f9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala @@ -166,7 +166,8 @@ object EventLogFileWriter { val IN_PROGRESS = ".inprogress" val COMPACTED = ".compact" - val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort) + val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("660", 8).toShort) + val LOG_FOLDER_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort) def apply( appId: String, @@ -317,7 +318,8 @@ class RollingEventLogFilesWriter( throw new IOException(s"Target log directory already exists ($logDirForAppPath)") } - fileSystem.mkdirs(logDirForAppPath, EventLogFileWriter.LOG_FILE_PERMISSIONS) + // SPARK-30860: use the class method to avoid the umask causing permission issues + FileSystem.mkdirs(fileSystem, logDirForAppPath, EventLogFileWriter.LOG_FOLDER_PERMISSIONS) createAppStatusFile(inProgress = true) rollEventLogFile() } @@ -361,7 +363,9 @@ class RollingEventLogFilesWriter( private def createAppStatusFile(inProgress: Boolean): Unit = { val appStatusPath = getAppStatusFilePath(logDirForAppPath, appId, appAttemptId, inProgress) - val outputStream = fileSystem.create(appStatusPath) + // SPARK-30860: use the class method to avoid the umask causing permission issues + val outputStream = FileSystem.create(fileSystem, appStatusPath, + EventLogFileWriter.LOG_FILE_PERMISSIONS) // we intentionally create zero-byte file to minimize the cost outputStream.close() } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 99d3eceb1121a..b31333fc48218 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -27,6 +27,7 @@ import java.util.zip.ZipOutputStream import scala.collection.JavaConverters._ import scala.collection.mutable import scala.io.Source +import scala.util.control.NonFatal import scala.xml.Node import com.fasterxml.jackson.annotation.JsonIgnore @@ -459,9 +460,21 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val newLastScanTime = clock.getTimeMillis() logDebug(s"Scanning $logDir with lastScanTime==$lastScanTime") + // Mark entries that are processing as not stale. Such entries do not have a chance to be + // updated with the new 'lastProcessed' time and thus any entity that completes processing + // right after this check and before the check for stale entities will be identified as stale + // and will be deleted from the UI until the next 'checkForLogs' run. 
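// Illustration only, not part of the patch: condensed, the bookkeeping added below remembers
// which log paths were still being processed during this scan and later skips exactly those
// paths when stale listing entries are deleted. A simplified, self-contained sketch of that
// two-pass flow (all paths and values here are made up):
import scala.collection.mutable

object NotStaleSketch {
  def main(args: Array[String]): Unit = {
    val allLogs = Seq("/logs/app-1", "/logs/app-2", "/logs/app-3")
    val processing: Set[String] = Set("/logs/app-2") // logs still being parsed by replay threads
    val staleCandidates = Seq("/logs/app-2", "/logs/app-3")

    // Pass 1: skip in-flight logs but remember them, mirroring the `notStale` set added below.
    val notStale = mutable.HashSet[String]()
    val toUpdate = allLogs.filter { path =>
      if (processing(path)) { notStale.add(path); false } else true
    }

    // Pass 2: never delete an entry that was seen as in-flight during this scan.
    val toDelete = staleCandidates.filterNot(processing).filterNot(notStale.contains)

    println(s"update: $toUpdate, delete: $toDelete") // the in-flight log survives both passes
  }
}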
+ val notStale = mutable.HashSet[String]() val updated = Option(fs.listStatus(new Path(logDir))).map(_.toSeq).getOrElse(Nil) .filter { entry => !isBlacklisted(entry.getPath) } - .filter { entry => !isProcessing(entry.getPath) } + .filter { entry => + if (isProcessing(entry.getPath)) { + notStale.add(entry.getPath.toString()) + false + } else { + true + } + } .flatMap { entry => EventLogFileReader(fs, entry) } .filter { reader => try { @@ -519,10 +532,21 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // If the file is currently not being tracked by the SHS, add an entry for it and try // to parse it. This will allow the cleaner code to detect the file as stale later on // if it was not possible to parse it. - listing.write(LogInfo(reader.rootPath.toString(), newLastScanTime, LogType.EventLogs, - None, None, reader.fileSizeForLastIndex, reader.lastIndex, None, - reader.completed)) - reader.fileSizeForLastIndex > 0 + try { + listing.write(LogInfo(reader.rootPath.toString(), newLastScanTime, + LogType.EventLogs, None, None, reader.fileSizeForLastIndex, reader.lastIndex, + None, reader.completed)) + reader.fileSizeForLastIndex > 0 + } catch { + case _: FileNotFoundException => false + case NonFatal(e) => + logWarning(s"Error while reading new log ${reader.rootPath}", e) + false + } + + case NonFatal(e) => + logWarning(s"Error while filtering log ${reader.rootPath}", e) + false } } .sortWith { case (entry1, entry2) => @@ -550,12 +574,14 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .last(newLastScanTime - 1) .asScala .toList - stale.filterNot(isProcessing).foreach { log => - log.appId.foreach { appId => - cleanAppData(appId, log.attemptId, log.logPath) - listing.delete(classOf[LogInfo], log.logPath) + stale.filterNot(isProcessing) + .filterNot(info => notStale.contains(info.logPath)) + .foreach { log => + log.appId.foreach { appId => + cleanAppData(appId, log.attemptId, log.logPath) + listing.delete(classOf[LogInfo], log.logPath) + } } - } lastScanTime.set(newLastScanTime) } catch { diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 62cac261ae014..7e0d311311f30 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -69,6 +69,9 @@ class HistoryServer( private val loaderServlet = new HttpServlet { protected override def doGet(req: HttpServletRequest, res: HttpServletResponse): Unit = { + + res.setContentType("text/html;charset=utf-8") + // Parse the URI created by getAttemptURI(). It contains an app ID and an optional // attempt ID (separated by a slash). val parts = Option(req.getPathInfo()).getOrElse("").split("/") diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala index 0a1f33395ad62..fe18b03a7d603 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala @@ -75,14 +75,29 @@ private class HistoryServerDiskManager( // Go through the recorded store directories and remove any that may have been removed by // external code. 
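// Illustration only, not part of the patch: the cleanup that follows splits the recorded store
// directories into ones still on disk and orphans, and rewrites any recorded size that no longer
// matches the directory's real size (for example after LevelDB compaction), so the tracked usage
// stays equal to the sum of the recorded sizes. A condensed, hypothetical sketch of that idea:
object DiskUsageRefreshSketch {
  final case class StoreInfo(path: String, size: Long)

  // `actualSize` stands in for measuring a directory on disk; None means the directory is gone.
  def refresh(recorded: Seq[StoreInfo], actualSize: String => Option[Long]): Seq[StoreInfo] = {
    recorded.flatMap { info =>
      actualSize(info.path) match {
        case None => None                                                     // orphaned: removed externally, drop it
        case Some(size) if size != info.size => Some(info.copy(size = size))  // refresh a stale recorded size
        case Some(_) => Some(info)                                            // unchanged
      }
    }
  }
}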
- val orphans = listing.view(classOf[ApplicationStoreInfo]).asScala.filter { info => - !new File(info.path).exists() - }.toSeq + val (existences, orphans) = listing + .view(classOf[ApplicationStoreInfo]) + .asScala + .toSeq + .partition { info => + new File(info.path).exists() + } orphans.foreach { info => listing.delete(info.getClass(), info.path) } + // Reading level db would trigger table file compaction, then it may cause size of level db + // directory changed. When service restarts, "currentUsage" is calculated from real directory + // size. Update "ApplicationStoreInfo.size" to ensure "currentUsage" equals + // sum of "ApplicationStoreInfo.size". + existences.foreach { info => + val fileSize = sizeOf(new File(info.path)) + if (fileSize != info.size) { + listing.write(info.copy(size = fileSize)) + } + } + logInfo("Initialized disk manager: " + s"current usage = ${Utils.bytesToString(currentUsage.get())}, " + s"max usage = ${Utils.bytesToString(maxUsage)}") @@ -233,7 +248,7 @@ private class HistoryServerDiskManager( } } - private def appStorePath(appId: String, attemptId: Option[String]): File = { + private[history] def appStorePath(appId: String, attemptId: Option[String]): File = { val fileName = appId + attemptId.map("_" + _).getOrElse("") + ".ldb" new File(appStoreDir, fileName) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/DriverInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/DriverInfo.scala index bf68ba8e15af4..252e7048ba105 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/DriverInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/DriverInfo.scala @@ -37,7 +37,7 @@ private[deploy] class DriverInfo( @transient var worker: Option[WorkerInfo] = None // resources(e.f. gpu/fpga) allocated to this driver // map from resource name to ResourceInformation - private var _resources: Map[String, ResourceInformation] = _ + private var _resources: Map[String, ResourceInformation] = Map.empty init() diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 8d3795cae707a..ab6cb304b59eb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -143,7 +143,7 @@ private[deploy] class Master( logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") webUi = new MasterWebUI(this, webUiPort) webUi.bind() - masterWebUiUrl = s"${webUi.scheme}$masterPublicAddress:${webUi.boundPort}" + masterWebUiUrl = webUi.webUrl if (reverseProxy) { masterWebUiUrl = conf.get(UI_REVERSE_PROXY_URL).orElse(Some(masterWebUiUrl)).get webUi.addProxy() @@ -704,7 +704,9 @@ private[deploy] class Master( val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) .filter(canLaunchExecutor(_, app.desc)) .sortBy(_.coresFree).reverse - if (waitingApps.length == 1 && usableWorkers.isEmpty) { + val appMayHang = waitingApps.length == 1 && + waitingApps.head.executors.isEmpty && usableWorkers.isEmpty + if (appMayHang) { logWarning(s"App ${app.id} requires more resource than any of Workers could have.") } val assignedCores = scheduleExecutorsOnWorkers(app, usableWorkers, spreadOutApps) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index f64b449851d86..fcbeba9eef1a9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ 
b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -76,19 +76,17 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { private def formatMasterResourcesInUse(aliveWorkers: Array[WorkerInfo]): String = { val totalInfo = aliveWorkers.map(_.resourcesInfo) - .map(resources => toMutable(resources)) .flatMap(_.toIterator) .groupBy(_._1) // group by resource name .map { case (rName, rInfoArr) => - rName -> rInfoArr.map(_._2).reduce(_ + _) - }.map { case (k, v) => (k, v.toResourceInformation) } + rName -> rInfoArr.map(_._2.addresses.size).sum + } val usedInfo = aliveWorkers.map(_.resourcesInfoUsed) - .map (resources => toMutable(resources)) .flatMap(_.toIterator) .groupBy(_._1) // group by resource name .map { case (rName, rInfoArr) => - rName -> rInfoArr.map(_._2).reduce(_ + _) - }.map { case (k, v) => (k, v.toResourceInformation) } + rName -> rInfoArr.map(_._2.addresses.size).sum + } formatResourcesUsed(totalInfo, usedInfo) } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala index 3168c763df4df..6ce195b6c7a34 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala @@ -178,7 +178,7 @@ private[spark] class HadoopDelegationTokenManager( private def scheduleRenewal(delay: Long): Unit = { val _delay = math.max(0, delay) - logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(delay)}.") + logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(_delay)}.") val renewalTask = new Runnable() { override def run(): Unit = { @@ -230,6 +230,8 @@ private[spark] class HadoopDelegationTokenManager( val now = System.currentTimeMillis val ratio = sparkConf.get(CREDENTIALS_RENEWAL_INTERVAL_RATIO) val delay = (ratio * (nextRenewal - now)).toLong + logInfo(s"Calculated delay on renewal is $delay, based on next renewal $nextRenewal " + + s"and the ratio $ratio, and current time $now") scheduleRenewal(delay) creds } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala index 4e91e72361488..cd9516b849565 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala @@ -63,7 +63,8 @@ private[deploy] class HadoopFSDelegationTokenProvider val identifier = token .decodeIdentifier() .asInstanceOf[AbstractDelegationTokenIdentifier] - identifier.getIssueDate + interval + val tokenKind = token.getKind.toString + getIssueDate(tokenKind, identifier) + interval } if (nextRenewalDates.isEmpty) None else Some(nextRenewalDates.min) } @@ -126,13 +127,33 @@ private[deploy] class HadoopFSDelegationTokenProvider Try { val newExpiration = token.renew(hadoopConf) val identifier = token.decodeIdentifier().asInstanceOf[AbstractDelegationTokenIdentifier] - val interval = newExpiration - identifier.getIssueDate - logInfo(s"Renewal interval is $interval for token ${token.getKind.toString}") + val tokenKind = token.getKind.toString + val interval = newExpiration - getIssueDate(tokenKind, identifier) + logInfo(s"Renewal interval is $interval for token $tokenKind") interval }.toOption } if (renewIntervals.isEmpty) None else 
Some(renewIntervals.min) } + + private def getIssueDate(kind: String, identifier: AbstractDelegationTokenIdentifier): Long = { + val now = System.currentTimeMillis() + val issueDate = identifier.getIssueDate + if (issueDate > now) { + logWarning(s"Token $kind has set up issue date later than current time. (provided: " + + s"$issueDate / current timestamp: $now) Please make sure clocks are in sync between " + + "machines. If the issue is not a clock mismatch, consult token implementor to check " + + "whether issue date is valid.") + issueDate + } else if (issueDate > 0L) { + issueDate + } else { + logWarning(s"Token $kind has not set up issue date properly. (provided: $issueDate) " + + s"Using current timestamp ($now) as issue date instead. Consult token implementor to fix " + + "the behavior.") + now + } + } } private[deploy] object HadoopFSDelegationTokenProvider { diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index 53ec7b3a88f35..2caee1846595e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -33,9 +33,11 @@ import org.apache.spark.deploy.master.DriverState import org.apache.spark.deploy.master.DriverState.DriverState import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{DRIVER_RESOURCES_FILE, SPARK_DRIVER_PREFIX} +import org.apache.spark.internal.config.UI.UI_REVERSE_PROXY import org.apache.spark.internal.config.Worker.WORKER_DRIVER_TERMINATE_TIMEOUT import org.apache.spark.resource.ResourceInformation import org.apache.spark.rpc.RpcEndpointRef +import org.apache.spark.ui.UIUtils import org.apache.spark.util.{Clock, ShutdownHookManager, SystemClock, Utils} /** @@ -50,6 +52,7 @@ private[deploy] class DriverRunner( val driverDesc: DriverDescription, val worker: RpcEndpointRef, val workerUrl: String, + val workerWebUiUrl: String, val securityManager: SecurityManager, val resources: Map[String, ResourceInformation] = Map.empty) extends Logging { @@ -190,6 +193,14 @@ private[deploy] class DriverRunner( val builder = CommandUtils.buildProcessBuilder(driverDesc.command.copy(javaOpts = javaOpts), securityManager, driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables) + // add WebUI driver log url to environment + val reverseProxy = conf.get(UI_REVERSE_PROXY) + val workerUrlRef = UIUtils.makeHref(reverseProxy, driverId, workerWebUiUrl) + builder.environment.put("SPARK_DRIVER_LOG_URL_STDOUT", + s"$workerUrlRef/logPage?driverId=$driverId&logType=stdout") + builder.environment.put("SPARK_DRIVER_LOG_URL_STDERR", + s"$workerUrlRef/logPage?driverId=$driverId&logType=stderr") + runDriver(builder, driverDir, driverDesc.supervise) } diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 4be495ac4f13f..08fdd0ab61aa3 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -34,7 +34,7 @@ import org.apache.spark.deploy.{Command, ExecutorDescription, ExecutorState} import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.ExternalShuffleService import org.apache.spark.deploy.StandaloneResourceUtils._ -import org.apache.spark.deploy.master.{DriverState, Master, WorkerResourceInfo} +import org.apache.spark.deploy.master.{DriverState, Master} import 
org.apache.spark.deploy.worker.ui.WorkerWebUI import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config.Tests.IS_TESTING @@ -57,8 +57,7 @@ private[deploy] class Worker( val conf: SparkConf, val securityMgr: SecurityManager, resourceFileOpt: Option[String] = None, - externalShuffleServiceSupplier: Supplier[ExternalShuffleService] = null, - pid: Int = Utils.getProcessId) + externalShuffleServiceSupplier: Supplier[ExternalShuffleService] = null) extends ThreadSafeRpcEndpoint with Logging { private val host = rpcEnv.address.host @@ -205,7 +204,6 @@ private[deploy] class Worker( logInfo("Spark home: " + sparkHome) createWorkDir() startExternalShuffleService() - releaseResourcesOnInterrupt() setupWorkerResources() webUi = new WorkerWebUI(this, workDir, webUiPort) webUi.bind() @@ -219,26 +217,13 @@ private[deploy] class Worker( metricsSystem.getServletHandlers.foreach(webUi.attachHandler) } - /** - * Used to catch the TERM signal from sbin/stop-slave.sh and - * release resources before Worker exits - */ - private def releaseResourcesOnInterrupt(): Unit = { - SignalUtils.register("TERM") { - releaseResources(conf, SPARK_WORKER_PREFIX, resources, pid) - false - } - } - private def setupWorkerResources(): Unit = { try { - val allResources = getOrDiscoverAllResources(conf, SPARK_WORKER_PREFIX, resourceFileOpt) - resources = acquireResources(conf, SPARK_WORKER_PREFIX, allResources, pid) + resources = getOrDiscoverAllResources(conf, SPARK_WORKER_PREFIX, resourceFileOpt) logResourceInfo(SPARK_WORKER_PREFIX, resources) } catch { case e: Exception => logError("Failed to setup worker resources: ", e) - releaseResources(conf, SPARK_WORKER_PREFIX, resources, pid) if (!Utils.isTesting) { System.exit(1) } @@ -373,7 +358,6 @@ private[deploy] class Worker( TimeUnit.SECONDS)) } } else { - releaseResources(conf, SPARK_WORKER_PREFIX, resources, pid) logError("All masters are unresponsive! 
Giving up.") System.exit(1) } @@ -472,7 +456,6 @@ private[deploy] class Worker( case RegisterWorkerFailed(message) => if (!registered) { logError("Worker registration failed: " + message) - releaseResources(conf, SPARK_WORKER_PREFIX, resources, pid) System.exit(1) } @@ -645,6 +628,7 @@ private[deploy] class Worker( driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)), self, workerUri, + workerWebUiUrl, securityMgr, resources_) drivers(driverId) = driver @@ -738,7 +722,6 @@ private[deploy] class Worker( } override def onStop(): Unit = { - releaseResources(conf, SPARK_WORKER_PREFIX, resources, pid) cleanupThreadExecutor.shutdownNow() metricsSystem.report() cancelLastRegistrationRetry() @@ -875,9 +858,8 @@ private[deploy] object Worker extends Logging { val securityMgr = new SecurityManager(conf) val rpcEnv = RpcEnv.create(systemName, host, port, conf, securityMgr) val masterAddresses = masterUrls.map(RpcAddress.fromSparkURL) - val pid = if (Utils.isTesting) workerNumber.get else Utils.getProcessId rpcEnv.setupEndpoint(ENDPOINT_NAME, new Worker(rpcEnv, webUiPort, cores, memory, - masterAddresses, ENDPOINT_NAME, workDir, conf, securityMgr, resourceFileOpt, pid = pid)) + masterAddresses, ENDPOINT_NAME, workDir, conf, securityMgr, resourceFileOpt)) rpcEnv } diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 8aeb16fe5d8c8..baf9e461eb8ff 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -153,11 +153,6 @@ private[spark] class Executor( // for fetching remote cached RDD blocks, so need to make sure it uses the right classloader too. env.serializerManager.setDefaultClassLoader(replClassLoader) - // Plugins need to load using a class loader that includes the executor's user classpath - private val plugins: Option[PluginContainer] = Utils.withContextClassLoader(replClassLoader) { - PluginContainer(env, resources.asJava) - } - // Max size of direct result. If task result is bigger than this, we use the block manager // to send the result back. private val maxDirectResultSize = Math.min( @@ -218,6 +213,27 @@ private[spark] class Executor( heartbeater.start() + private val appStartTime = conf.getLong("spark.app.startTime", 0) + + // To allow users to distribute plugins and their required files + // specified by --jars and --files on application submission, those jars/files should be + // downloaded and added to the class loader via updateDependencies. + // This should be done before plugin initialization below + // because executors search plugins from the class loader and initialize them. + private val Seq(initialUserJars, initialUserFiles) = Seq("jar", "file").map { key => + conf.getOption(s"spark.app.initial.$key.urls").map { urls => + Map(urls.split(",").map(url => (url, appStartTime)): _*) + }.getOrElse(Map.empty) + } + updateDependencies(initialUserFiles, initialUserJars) + + // Plugins need to load using a class loader that includes the executor's user classpath. + // Plugins also needs to be initialized after the heartbeater started + // to avoid blocking to send heartbeat (see SPARK-32175). 
+ private val plugins: Option[PluginContainer] = Utils.withContextClassLoader(replClassLoader) { + PluginContainer(env, resources.asJava) + } + metricsPoller.start() private[executor] def numRunningTasks: Int = runningTasks.size() @@ -365,7 +381,9 @@ private[spark] class Executor( // Report executor runtime and JVM gc time Option(task).foreach(t => { t.metrics.setExecutorRunTime(TimeUnit.NANOSECONDS.toMillis( - System.nanoTime() - taskStartTimeNs)) + // SPARK-32898: it's possible that a task is killed when taskStartTimeNs has the initial + // value(=0) still. In this case, the executorRunTime should be considered as 0. + if (taskStartTimeNs > 0) System.nanoTime() - taskStartTimeNs else 0)) t.metrics.setJvmGCTime(computeTotalGcTime() - startGCTime) }) diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 2e4846bec2db4..0c1d9635b6535 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -117,7 +117,7 @@ trait Logging { } // For testing - def initializeForcefully(isInterpreter: Boolean, silent: Boolean): Unit = { + private[spark] def initializeForcefully(isInterpreter: Boolean, silent: Boolean): Unit = { initializeLogging(isInterpreter, silent) } diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index 68e1994f0f94f..8d5959a0c8b7f 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -129,7 +129,7 @@ private[spark] class TypedConfigBuilder[T]( def createOptional: OptionalConfigEntry[T] = { val entry = new OptionalConfigEntry[T](parent.key, parent._prependedKey, parent._prependSeparator, parent._alternatives, converter, stringConverter, parent._doc, - parent._public) + parent._public, parent._version) parent._onCreate.foreach(_(entry)) entry } @@ -144,7 +144,7 @@ private[spark] class TypedConfigBuilder[T]( val transformedDefault = converter(stringConverter(default)) val entry = new ConfigEntryWithDefault[T](parent.key, parent._prependedKey, parent._prependSeparator, parent._alternatives, transformedDefault, converter, - stringConverter, parent._doc, parent._public) + stringConverter, parent._doc, parent._public, parent._version) parent._onCreate.foreach(_(entry)) entry } @@ -154,7 +154,7 @@ private[spark] class TypedConfigBuilder[T]( def createWithDefaultFunction(defaultFunc: () => T): ConfigEntry[T] = { val entry = new ConfigEntryWithDefaultFunction[T](parent.key, parent._prependedKey, parent._prependSeparator, parent._alternatives, defaultFunc, converter, stringConverter, - parent._doc, parent._public) + parent._doc, parent._public, parent._version) parent._onCreate.foreach(_ (entry)) entry } @@ -166,7 +166,7 @@ private[spark] class TypedConfigBuilder[T]( def createWithDefaultString(default: String): ConfigEntry[T] = { val entry = new ConfigEntryWithDefaultString[T](parent.key, parent._prependedKey, parent._prependSeparator, parent._alternatives, default, converter, stringConverter, - parent._doc, parent._public) + parent._doc, parent._public, parent._version) parent._onCreate.foreach(_(entry)) entry } @@ -186,6 +186,7 @@ private[spark] case class ConfigBuilder(key: String) { private[config] var _prependSeparator: String = "" private[config] var _public = true private[config] var _doc = "" + 
private[config] var _version = "" private[config] var _onCreate: Option[ConfigEntry[_] => Unit] = None private[config] var _alternatives = List.empty[String] @@ -199,6 +200,11 @@ private[spark] case class ConfigBuilder(key: String) { this } + def version(v: String): ConfigBuilder = { + _version = v + this + } + /** * Registers a callback for when the config entry is finally instantiated. Currently used by * SQLConf to keep track of SQL configuration entries. @@ -255,7 +261,7 @@ private[spark] case class ConfigBuilder(key: String) { def fallbackConf[T](fallback: ConfigEntry[T]): ConfigEntry[T] = { val entry = new FallbackConfigEntry(key, _prependedKey, _prependSeparator, _alternatives, _doc, - _public, fallback) + _public, _version, fallback) _onCreate.foreach(_(entry)) entry } diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala index c5df4c8820098..8c0b11d46312e 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala @@ -39,6 +39,7 @@ package org.apache.spark.internal.config * @param doc the documentation for the configuration * @param isPublic if this configuration is public to the user. If it's `false`, this * configuration is only used internally and we should not expose it to users. + * @param version the spark version when the configuration was released. * @tparam T the value type */ private[spark] abstract class ConfigEntry[T] ( @@ -49,7 +50,8 @@ private[spark] abstract class ConfigEntry[T] ( val valueConverter: String => T, val stringConverter: T => String, val doc: String, - val isPublic: Boolean) { + val isPublic: Boolean, + val version: String) { import ConfigEntry._ @@ -74,7 +76,8 @@ private[spark] abstract class ConfigEntry[T] ( def defaultValue: Option[T] = None override def toString: String = { - s"ConfigEntry(key=$key, defaultValue=$defaultValueString, doc=$doc, public=$isPublic)" + s"ConfigEntry(key=$key, defaultValue=$defaultValueString, doc=$doc, " + + s"public=$isPublic, version=$version)" } } @@ -87,7 +90,8 @@ private class ConfigEntryWithDefault[T] ( valueConverter: String => T, stringConverter: T => String, doc: String, - isPublic: Boolean) + isPublic: Boolean, + version: String) extends ConfigEntry( key, prependedKey, @@ -96,7 +100,8 @@ private class ConfigEntryWithDefault[T] ( valueConverter, stringConverter, doc, - isPublic + isPublic, + version ) { override def defaultValue: Option[T] = Some(_defaultValue) @@ -117,7 +122,8 @@ private class ConfigEntryWithDefaultFunction[T] ( valueConverter: String => T, stringConverter: T => String, doc: String, - isPublic: Boolean) + isPublic: Boolean, + version: String) extends ConfigEntry( key, prependedKey, @@ -126,7 +132,8 @@ private class ConfigEntryWithDefaultFunction[T] ( valueConverter, stringConverter, doc, - isPublic + isPublic, + version ) { override def defaultValue: Option[T] = Some(_defaultFunction()) @@ -147,7 +154,8 @@ private class ConfigEntryWithDefaultString[T] ( valueConverter: String => T, stringConverter: T => String, doc: String, - isPublic: Boolean) + isPublic: Boolean, + version: String) extends ConfigEntry( key, prependedKey, @@ -156,7 +164,8 @@ private class ConfigEntryWithDefaultString[T] ( valueConverter, stringConverter, doc, - isPublic + isPublic, + version ) { override def defaultValue: Option[T] = Some(valueConverter(_defaultValue)) @@ -181,7 +190,8 @@ private[spark] class 
OptionalConfigEntry[T]( val rawValueConverter: String => T, val rawStringConverter: T => String, doc: String, - isPublic: Boolean) + isPublic: Boolean, + version: String) extends ConfigEntry[Option[T]]( key, prependedKey, @@ -190,7 +200,8 @@ private[spark] class OptionalConfigEntry[T]( s => Some(rawValueConverter(s)), v => v.map(rawStringConverter).orNull, doc, - isPublic + isPublic, + version ) { override def defaultValueString: String = ConfigEntry.UNDEFINED @@ -210,6 +221,7 @@ private[spark] class FallbackConfigEntry[T] ( alternatives: List[String], doc: String, isPublic: Boolean, + version: String, val fallback: ConfigEntry[T]) extends ConfigEntry[T]( key, @@ -219,7 +231,8 @@ private[spark] class FallbackConfigEntry[T] ( fallback.valueConverter, fallback.stringConverter, doc, - isPublic + isPublic, + version ) { override def defaultValueString: String = s"" diff --git a/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala b/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala index ceab957b36634..d494c5ec019c7 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Deploy.scala @@ -19,48 +19,59 @@ package org.apache.spark.internal.config private[spark] object Deploy { val RECOVERY_MODE = ConfigBuilder("spark.deploy.recoveryMode") + .version("0.8.1") .stringConf .createWithDefault("NONE") val RECOVERY_MODE_FACTORY = ConfigBuilder("spark.deploy.recoveryMode.factory") + .version("1.2.0") .stringConf .createWithDefault("") val RECOVERY_DIRECTORY = ConfigBuilder("spark.deploy.recoveryDirectory") + .version("0.8.1") .stringConf .createWithDefault("") val ZOOKEEPER_URL = ConfigBuilder("spark.deploy.zookeeper.url") .doc(s"When `${RECOVERY_MODE.key}` is set to ZOOKEEPER, this " + "configuration is used to set the zookeeper URL to connect to.") + .version("0.8.1") .stringConf .createOptional val ZOOKEEPER_DIRECTORY = ConfigBuilder("spark.deploy.zookeeper.dir") + .version("0.8.1") .stringConf .createOptional val RETAINED_APPLICATIONS = ConfigBuilder("spark.deploy.retainedApplications") + .version("0.8.0") .intConf .createWithDefault(200) val RETAINED_DRIVERS = ConfigBuilder("spark.deploy.retainedDrivers") + .version("1.1.0") .intConf .createWithDefault(200) val REAPER_ITERATIONS = ConfigBuilder("spark.dead.worker.persistence") + .version("0.8.0") .intConf .createWithDefault(15) val MAX_EXECUTOR_RETRIES = ConfigBuilder("spark.deploy.maxExecutorRetries") + .version("1.6.3") .intConf .createWithDefault(10) val SPREAD_OUT_APPS = ConfigBuilder("spark.deploy.spreadOut") + .version("0.6.1") .booleanConf .createWithDefault(true) val DEFAULT_CORES = ConfigBuilder("spark.deploy.defaultCores") + .version("0.9.0") .intConf .createWithDefault(Int.MaxValue) diff --git a/core/src/main/scala/org/apache/spark/internal/config/History.scala b/core/src/main/scala/org/apache/spark/internal/config/History.scala index 14fb5ff075472..581777de366ef 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/History.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/History.scala @@ -26,46 +26,56 @@ private[spark] object History { val DEFAULT_LOG_DIR = "file:/tmp/spark-events" val HISTORY_LOG_DIR = ConfigBuilder("spark.history.fs.logDirectory") + .version("1.1.0") .stringConf .createWithDefault(DEFAULT_LOG_DIR) val SAFEMODE_CHECK_INTERVAL_S = ConfigBuilder("spark.history.fs.safemodeCheck.interval") + .version("1.6.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("5s") val 
UPDATE_INTERVAL_S = ConfigBuilder("spark.history.fs.update.interval") + .version("1.4.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("10s") val CLEANER_ENABLED = ConfigBuilder("spark.history.fs.cleaner.enabled") + .version("1.4.0") .booleanConf .createWithDefault(false) val CLEANER_INTERVAL_S = ConfigBuilder("spark.history.fs.cleaner.interval") + .version("1.4.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("1d") val MAX_LOG_AGE_S = ConfigBuilder("spark.history.fs.cleaner.maxAge") + .version("1.4.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("7d") val MAX_LOG_NUM = ConfigBuilder("spark.history.fs.cleaner.maxNum") .doc("The maximum number of log files in the event log directory.") + .version("3.0.0") .intConf .createWithDefault(Int.MaxValue) val LOCAL_STORE_DIR = ConfigBuilder("spark.history.store.path") .doc("Local directory where to cache application history information. By default this is " + "not set, meaning all history information will be kept in memory.") + .version("2.3.0") .stringConf .createOptional val MAX_LOCAL_DISK_USAGE = ConfigBuilder("spark.history.store.maxDiskUsage") + .version("2.3.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("10g") val HISTORY_SERVER_UI_PORT = ConfigBuilder("spark.history.ui.port") .doc("Web UI port to bind Spark History Server") + .version("1.0.0") .intConf .createWithDefault(18080) @@ -73,6 +83,7 @@ private[spark] object History { ConfigBuilder("spark.history.fs.inProgressOptimization.enabled") .doc("Enable optimized handling of in-progress logs. This option may leave finished " + "applications that fail to rename their event logs listed as in-progress.") + .version("2.4.0") .booleanConf .createWithDefault(true) @@ -81,6 +92,7 @@ private[spark] object History { .doc("How many bytes to parse at the end of log files looking for the end event. " + "This is used to speed up generation of application listings by skipping unnecessary " + "parts of event log files. It can be disabled by setting this config to 0.") + .version("2.4.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("1m") @@ -90,6 +102,7 @@ private[spark] object History { "By default, all event log files will be retained. Please set the configuration " + s"and ${EVENT_LOG_ROLLING_MAX_FILE_SIZE.key} accordingly if you want to control " + "the overall size of event log files.") + .version("3.0.0") .intConf .checkValue(_ > 0, "Max event log files to retain should be higher than 0.") .createWithDefault(Integer.MAX_VALUE) @@ -99,54 +112,67 @@ private[spark] object History { .doc("The threshold score to determine whether it's good to do the compaction or not. " + "The compaction score is calculated in analyzing, and being compared to this value. 
" + "Compaction will proceed only when the score is higher than the threshold value.") + .version("3.0.0") .internal() .doubleConf .createWithDefault(0.7d) val DRIVER_LOG_CLEANER_ENABLED = ConfigBuilder("spark.history.fs.driverlog.cleaner.enabled") + .version("3.0.0") .fallbackConf(CLEANER_ENABLED) val DRIVER_LOG_CLEANER_INTERVAL = ConfigBuilder("spark.history.fs.driverlog.cleaner.interval") + .version("3.0.0") .fallbackConf(CLEANER_INTERVAL_S) val MAX_DRIVER_LOG_AGE_S = ConfigBuilder("spark.history.fs.driverlog.cleaner.maxAge") + .version("3.0.0") .fallbackConf(MAX_LOG_AGE_S) val HISTORY_SERVER_UI_ACLS_ENABLE = ConfigBuilder("spark.history.ui.acls.enable") + .version("1.0.1") .booleanConf .createWithDefault(false) val HISTORY_SERVER_UI_ADMIN_ACLS = ConfigBuilder("spark.history.ui.admin.acls") + .version("2.1.1") .stringConf .toSequence .createWithDefault(Nil) val HISTORY_SERVER_UI_ADMIN_ACLS_GROUPS = ConfigBuilder("spark.history.ui.admin.acls.groups") + .version("2.1.1") .stringConf .toSequence .createWithDefault(Nil) val NUM_REPLAY_THREADS = ConfigBuilder("spark.history.fs.numReplayThreads") + .version("2.0.0") .intConf .createWithDefaultFunction(() => Math.ceil(Runtime.getRuntime.availableProcessors() / 4f).toInt) val RETAINED_APPLICATIONS = ConfigBuilder("spark.history.retainedApplications") + .version("1.0.0") .intConf .createWithDefault(50) val PROVIDER = ConfigBuilder("spark.history.provider") + .version("1.1.0") .stringConf .createOptional val KERBEROS_ENABLED = ConfigBuilder("spark.history.kerberos.enabled") + .version("1.0.1") .booleanConf .createWithDefault(false) val KERBEROS_PRINCIPAL = ConfigBuilder("spark.history.kerberos.principal") + .version("1.0.1") .stringConf .createOptional val KERBEROS_KEYTAB = ConfigBuilder("spark.history.kerberos.keytab") + .version("1.0.1") .stringConf .createOptional @@ -156,15 +182,17 @@ private[spark] object History { "some path variables via patterns which can vary on cluster manager. Please check the " + "documentation for your cluster manager to see which patterns are supported, if any. " + "This configuration has no effect on a live application, it only affects the history server.") + .version("3.0.0") .stringConf .createOptional val APPLY_CUSTOM_EXECUTOR_LOG_URL_TO_INCOMPLETE_APP = ConfigBuilder("spark.history.custom.executor.log.url.applyIncompleteApplication") .doc("Whether to apply custom executor log url, as specified by " + - "`spark.history.custom.executor.log.url`, to incomplete application as well. " + + s"${CUSTOM_EXECUTOR_LOG_URL.key}, to incomplete application as well. 
" + "Even if this is true, this still only affects the behavior of the history server, " + "not running spark applications.") + .version("3.0.0") .booleanConf .createWithDefault(true) } diff --git a/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala b/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala index 717a09914a2f5..90c59b079461c 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala @@ -22,35 +22,44 @@ import org.apache.spark.network.util.ByteUnit private[spark] object Kryo { val KRYO_REGISTRATION_REQUIRED = ConfigBuilder("spark.kryo.registrationRequired") + .version("1.1.0") .booleanConf .createWithDefault(false) val KRYO_USER_REGISTRATORS = ConfigBuilder("spark.kryo.registrator") + .version("0.5.0") .stringConf - .createOptional + .toSequence + .createWithDefault(Nil) val KRYO_CLASSES_TO_REGISTER = ConfigBuilder("spark.kryo.classesToRegister") + .version("1.2.0") .stringConf .toSequence .createWithDefault(Nil) val KRYO_USE_UNSAFE = ConfigBuilder("spark.kryo.unsafe") + .version("2.1.0") .booleanConf .createWithDefault(false) val KRYO_USE_POOL = ConfigBuilder("spark.kryo.pool") + .version("3.0.0") .booleanConf .createWithDefault(true) val KRYO_REFERENCE_TRACKING = ConfigBuilder("spark.kryo.referenceTracking") + .version("0.8.0") .booleanConf .createWithDefault(true) val KRYO_SERIALIZER_BUFFER_SIZE = ConfigBuilder("spark.kryoserializer.buffer") + .version("1.4.0") .bytesConf(ByteUnit.KiB) .createWithDefaultString("64k") val KRYO_SERIALIZER_MAX_BUFFER_SIZE = ConfigBuilder("spark.kryoserializer.buffer.max") + .version("1.4.0") .bytesConf(ByteUnit.MiB) .createWithDefaultString("64m") diff --git a/core/src/main/scala/org/apache/spark/internal/config/Network.scala b/core/src/main/scala/org/apache/spark/internal/config/Network.scala index 129e31a82979f..0961d062cc04f 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Network.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Network.scala @@ -23,71 +23,85 @@ private[spark] object Network { private[spark] val NETWORK_CRYPTO_SASL_FALLBACK = ConfigBuilder("spark.network.crypto.saslFallback") + .version("2.2.0") .booleanConf .createWithDefault(true) private[spark] val NETWORK_CRYPTO_ENABLED = ConfigBuilder("spark.network.crypto.enabled") + .version("2.2.0") .booleanConf .createWithDefault(false) private[spark] val NETWORK_REMOTE_READ_NIO_BUFFER_CONVERSION = ConfigBuilder("spark.network.remoteReadNioBufferConversion") + .version("2.4.0") .booleanConf .createWithDefault(false) private[spark] val NETWORK_TIMEOUT = ConfigBuilder("spark.network.timeout") + .version("1.3.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("120s") private[spark] val NETWORK_TIMEOUT_INTERVAL = ConfigBuilder("spark.network.timeoutInterval") + .version("1.3.2") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString(STORAGE_BLOCKMANAGER_TIMEOUTINTERVAL.defaultValueString) private[spark] val RPC_ASK_TIMEOUT = ConfigBuilder("spark.rpc.askTimeout") + .version("1.4.0") .stringConf .createOptional private[spark] val RPC_CONNECT_THREADS = ConfigBuilder("spark.rpc.connect.threads") + .version("1.6.0") .intConf .createWithDefault(64) private[spark] val RPC_IO_NUM_CONNECTIONS_PER_PEER = ConfigBuilder("spark.rpc.io.numConnectionsPerPeer") + .version("1.6.0") .intConf .createWithDefault(1) private[spark] val RPC_IO_THREADS = ConfigBuilder("spark.rpc.io.threads") + .version("1.6.0") .intConf .createOptional private[spark] val 
RPC_LOOKUP_TIMEOUT = ConfigBuilder("spark.rpc.lookupTimeout") + .version("1.4.0") .stringConf .createOptional private[spark] val RPC_MESSAGE_MAX_SIZE = ConfigBuilder("spark.rpc.message.maxSize") + .version("2.0.0") .intConf .createWithDefault(128) private[spark] val RPC_NETTY_DISPATCHER_NUM_THREADS = ConfigBuilder("spark.rpc.netty.dispatcher.numThreads") + .version("1.6.0") .intConf .createOptional private[spark] val RPC_NUM_RETRIES = ConfigBuilder("spark.rpc.numRetries") + .version("1.4.0") .intConf .createWithDefault(3) private[spark] val RPC_RETRY_WAIT = ConfigBuilder("spark.rpc.retry.wait") + .version("1.4.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("3s") } diff --git a/core/src/main/scala/org/apache/spark/internal/config/Python.scala b/core/src/main/scala/org/apache/spark/internal/config/Python.scala index 26a0598f49411..188d884319644 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Python.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Python.scala @@ -22,26 +22,32 @@ import org.apache.spark.network.util.ByteUnit private[spark] object Python { val PYTHON_WORKER_REUSE = ConfigBuilder("spark.python.worker.reuse") + .version("1.2.0") .booleanConf .createWithDefault(true) val PYTHON_TASK_KILL_TIMEOUT = ConfigBuilder("spark.python.task.killTimeout") + .version("2.2.2") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("2s") val PYTHON_USE_DAEMON = ConfigBuilder("spark.python.use.daemon") + .version("2.3.0") .booleanConf .createWithDefault(true) val PYTHON_DAEMON_MODULE = ConfigBuilder("spark.python.daemon.module") + .version("2.4.0") .stringConf .createOptional val PYTHON_WORKER_MODULE = ConfigBuilder("spark.python.worker.module") + .version("2.4.0") .stringConf .createOptional val PYSPARK_EXECUTOR_MEMORY = ConfigBuilder("spark.executor.pyspark.memory") + .version("2.4.0") .bytesConf(ByteUnit.MiB) .createOptional } diff --git a/core/src/main/scala/org/apache/spark/internal/config/R.scala b/core/src/main/scala/org/apache/spark/internal/config/R.scala index 26e06a5231c42..46fc198cd4cf5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/R.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/R.scala @@ -19,22 +19,27 @@ package org.apache.spark.internal.config private[spark] object R { val R_BACKEND_CONNECTION_TIMEOUT = ConfigBuilder("spark.r.backendConnectionTimeout") + .version("2.1.0") .intConf .createWithDefault(6000) val R_NUM_BACKEND_THREADS = ConfigBuilder("spark.r.numRBackendThreads") + .version("1.4.0") .intConf .createWithDefault(2) val R_HEARTBEAT_INTERVAL = ConfigBuilder("spark.r.heartBeatInterval") + .version("2.1.0") .intConf .createWithDefault(100) val SPARKR_COMMAND = ConfigBuilder("spark.sparkr.r.command") + .version("1.5.3") .stringConf .createWithDefault("Rscript") val R_COMMAND = ConfigBuilder("spark.r.command") + .version("1.5.3") .stringConf .createOptional } diff --git a/core/src/main/scala/org/apache/spark/internal/config/Status.scala b/core/src/main/scala/org/apache/spark/internal/config/Status.scala index 3cc00a6f094cf..669fa07053cad 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Status.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Status.scala @@ -22,36 +22,44 @@ import java.util.concurrent.TimeUnit private[spark] object Status { val ASYNC_TRACKING_ENABLED = ConfigBuilder("spark.appStateStore.asyncTracking.enable") + .version("2.3.0") .booleanConf .createWithDefault(true) val LIVE_ENTITY_UPDATE_PERIOD = 
ConfigBuilder("spark.ui.liveUpdate.period") + .version("2.3.0") .timeConf(TimeUnit.NANOSECONDS) .createWithDefaultString("100ms") val LIVE_ENTITY_UPDATE_MIN_FLUSH_PERIOD = ConfigBuilder("spark.ui.liveUpdate.minFlushPeriod") .doc("Minimum time elapsed before stale UI data is flushed. This avoids UI staleness when " + "incoming task events are not fired frequently.") + .version("2.4.2") .timeConf(TimeUnit.NANOSECONDS) .createWithDefaultString("1s") val MAX_RETAINED_JOBS = ConfigBuilder("spark.ui.retainedJobs") + .version("1.2.0") .intConf .createWithDefault(1000) val MAX_RETAINED_STAGES = ConfigBuilder("spark.ui.retainedStages") + .version("0.9.0") .intConf .createWithDefault(1000) val MAX_RETAINED_TASKS_PER_STAGE = ConfigBuilder("spark.ui.retainedTasks") + .version("2.0.1") .intConf .createWithDefault(100000) val MAX_RETAINED_DEAD_EXECUTORS = ConfigBuilder("spark.ui.retainedDeadExecutors") + .version("2.0.0") .intConf .createWithDefault(100) val MAX_RETAINED_ROOT_NODES = ConfigBuilder("spark.ui.dagGraph.retainedRootRDDs") + .version("2.1.0") .intConf .createWithDefault(Int.MaxValue) @@ -59,6 +67,7 @@ private[spark] object Status { ConfigBuilder("spark.metrics.appStatusSource.enabled") .doc("Whether Dropwizard/Codahale metrics " + "will be reported for the status of the running spark app.") + .version("3.0.0") .booleanConf .createWithDefault(false) } diff --git a/core/src/main/scala/org/apache/spark/internal/config/Streaming.scala b/core/src/main/scala/org/apache/spark/internal/config/Streaming.scala index 6e58c090e8126..44a467b278614 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Streaming.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Streaming.scala @@ -23,16 +23,19 @@ private[spark] object Streaming { private[spark] val STREAMING_DYN_ALLOCATION_ENABLED = ConfigBuilder("spark.streaming.dynamicAllocation.enabled") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val STREAMING_DYN_ALLOCATION_TESTING = ConfigBuilder("spark.streaming.dynamicAllocation.testing") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val STREAMING_DYN_ALLOCATION_MIN_EXECUTORS = ConfigBuilder("spark.streaming.dynamicAllocation.minExecutors") + .version("3.0.0") .intConf .checkValue(_ > 0, "The min executor number of streaming dynamic " + "allocation must be positive.") @@ -40,6 +43,7 @@ private[spark] object Streaming { private[spark] val STREAMING_DYN_ALLOCATION_MAX_EXECUTORS = ConfigBuilder("spark.streaming.dynamicAllocation.maxExecutors") + .version("3.0.0") .intConf .checkValue(_ > 0, "The max executor number of streaming dynamic " + "allocation must be positive.") @@ -47,6 +51,7 @@ private[spark] object Streaming { private[spark] val STREAMING_DYN_ALLOCATION_SCALING_INTERVAL = ConfigBuilder("spark.streaming.dynamicAllocation.scalingInterval") + .version("3.0.0") .timeConf(TimeUnit.SECONDS) .checkValue(_ > 0, "The scaling interval of streaming dynamic " + "allocation must be positive.") @@ -54,6 +59,7 @@ private[spark] object Streaming { private[spark] val STREAMING_DYN_ALLOCATION_SCALING_UP_RATIO = ConfigBuilder("spark.streaming.dynamicAllocation.scalingUpRatio") + .version("3.0.0") .doubleConf .checkValue(_ > 0, "The scaling up ratio of streaming dynamic " + "allocation must be positive.") @@ -61,6 +67,7 @@ private[spark] object Streaming { private[spark] val STREAMING_DYN_ALLOCATION_SCALING_DOWN_RATIO = ConfigBuilder("spark.streaming.dynamicAllocation.scalingDownRatio") + .version("3.0.0") .doubleConf .checkValue(_ > 0, "The 
scaling down ratio of streaming dynamic " + "allocation must be positive.") diff --git a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala index 21660ab3a9512..e328ed026bfb0 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala @@ -22,35 +22,58 @@ private[spark] object Tests { val TEST_USE_COMPRESSED_OOPS_KEY = "spark.test.useCompressedOops" val TEST_MEMORY = ConfigBuilder("spark.testing.memory") + .version("1.6.0") .longConf .createWithDefault(Runtime.getRuntime.maxMemory) val TEST_SCHEDULE_INTERVAL = ConfigBuilder("spark.testing.dynamicAllocation.scheduleInterval") + .version("2.3.0") .longConf .createWithDefault(100) val IS_TESTING = ConfigBuilder("spark.testing") + .version("1.0.1") .booleanConf .createOptional val TEST_NO_STAGE_RETRY = ConfigBuilder("spark.test.noStageRetry") + .version("1.2.0") .booleanConf .createWithDefault(false) val TEST_RESERVED_MEMORY = ConfigBuilder("spark.testing.reservedMemory") + .version("1.6.0") .longConf .createOptional val TEST_N_HOSTS = ConfigBuilder("spark.testing.nHosts") + .version("3.0.0") .intConf .createWithDefault(5) val TEST_N_EXECUTORS_HOST = ConfigBuilder("spark.testing.nExecutorsPerHost") + .version("3.0.0") .intConf .createWithDefault(4) val TEST_N_CORES_EXECUTOR = ConfigBuilder("spark.testing.nCoresPerExecutor") + .version("3.0.0") .intConf .createWithDefault(2) + + val RESOURCES_WARNING_TESTING = ConfigBuilder("spark.resources.warnings.testing") + .version("3.0.1") + .booleanConf + .createWithDefault(false) + + // This configuration is used for unit tests to allow skipping the task cpus to cores validation + // to allow emulating standalone mode behavior while running in local mode. Standalone mode + // by default doesn't specify a number of executor cores, it just uses all the ones available + // on the host. 
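// Hypothetical example, not a real Spark configuration: with the version() step introduced to
// ConfigBuilder in this patch, an entry defined next to the ones in this file now records the
// release it first appeared in as part of the builder chain. The key, doc and default below are
// made up purely for illustration.
val SOME_MADE_UP_TESTING_FLAG = ConfigBuilder("spark.testing.someMadeUpFlag")
  .doc("Illustrative entry only; SKIP_VALIDATE_CORES_TESTING just below is a real one.")
  .version("3.0.1")
  .booleanConf
  .createWithDefault(false)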
+ val SKIP_VALIDATE_CORES_TESTING = + ConfigBuilder("spark.testing.skipValidateCores") + .version("3.0.1") + .booleanConf + .createWithDefault(false) } diff --git a/core/src/main/scala/org/apache/spark/internal/config/UI.scala b/core/src/main/scala/org/apache/spark/internal/config/UI.scala index 60d985713d30e..231eecf086bbe 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/UI.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/UI.scala @@ -25,31 +25,37 @@ private[spark] object UI { val UI_SHOW_CONSOLE_PROGRESS = ConfigBuilder("spark.ui.showConsoleProgress") .doc("When true, show the progress bar in the console.") + .version("1.2.1") .booleanConf .createWithDefault(false) val UI_CONSOLE_PROGRESS_UPDATE_INTERVAL = ConfigBuilder("spark.ui.consoleProgress.update.interval") + .version("2.1.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(200) val UI_ENABLED = ConfigBuilder("spark.ui.enabled") .doc("Whether to run the web UI for the Spark application.") + .version("1.1.1") .booleanConf .createWithDefault(true) val UI_PORT = ConfigBuilder("spark.ui.port") .doc("Port for your application's dashboard, which shows memory and workload data.") + .version("0.7.0") .intConf .createWithDefault(4040) val UI_FILTERS = ConfigBuilder("spark.ui.filters") .doc("Comma separated list of filter class names to apply to the Spark Web UI.") + .version("1.0.0") .stringConf .toSequence .createWithDefault(Nil) val UI_ALLOW_FRAMING_FROM = ConfigBuilder("spark.ui.allowFramingFrom") + .version("1.6.0") .stringConf .createOptional @@ -61,6 +67,7 @@ private[spark] object UI { "through spark master/proxy public URL. This setting affects all the workers and " + "application UIs running in the cluster and must be set on all the workers, drivers " + " and masters.") + .version("2.1.0") .booleanConf .createWithDefault(false) @@ -69,15 +76,18 @@ private[spark] object UI { "in front of Spark Master. This is useful when running proxy for authentication e.g. " + "OAuth proxy. Make sure this is a complete URL including scheme (http/https) and port to " + "reach your proxy.") + .version("2.1.0") .stringConf .createOptional val UI_KILL_ENABLED = ConfigBuilder("spark.ui.killEnabled") .doc("Allows jobs and stages to be killed from the web UI.") + .version("1.0.0") .booleanConf .createWithDefault(true) val UI_THREAD_DUMPS_ENABLED = ConfigBuilder("spark.ui.threadDumpsEnabled") + .version("1.2.0") .booleanConf .createWithDefault(true) @@ -85,73 +95,88 @@ private[spark] object UI { .internal() .doc("Expose executor metrics at /metrics/executors/prometheus. 
" + "For master/worker/driver metrics, you need to configure `conf/metrics.properties`.") + .version("3.0.0") .booleanConf .createWithDefault(false) val UI_X_XSS_PROTECTION = ConfigBuilder("spark.ui.xXssProtection") .doc("Value for HTTP X-XSS-Protection response header") + .version("2.3.0") .stringConf .createWithDefaultString("1; mode=block") val UI_X_CONTENT_TYPE_OPTIONS = ConfigBuilder("spark.ui.xContentTypeOptions.enabled") .doc("Set to 'true' for setting X-Content-Type-Options HTTP response header to 'nosniff'") + .version("2.3.0") .booleanConf .createWithDefault(true) val UI_STRICT_TRANSPORT_SECURITY = ConfigBuilder("spark.ui.strictTransportSecurity") .doc("Value for HTTP Strict Transport Security Response Header") + .version("2.3.0") .stringConf .createOptional val UI_REQUEST_HEADER_SIZE = ConfigBuilder("spark.ui.requestHeaderSize") .doc("Value for HTTP request header size in bytes.") + .version("2.2.3") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("8k") val UI_TIMELINE_TASKS_MAXIMUM = ConfigBuilder("spark.ui.timeline.tasks.maximum") + .version("1.4.0") .intConf .createWithDefault(1000) val ACLS_ENABLE = ConfigBuilder("spark.acls.enable") + .version("1.1.0") .booleanConf .createWithDefault(false) val UI_VIEW_ACLS = ConfigBuilder("spark.ui.view.acls") + .version("1.0.0") .stringConf .toSequence .createWithDefault(Nil) val UI_VIEW_ACLS_GROUPS = ConfigBuilder("spark.ui.view.acls.groups") + .version("2.0.0") .stringConf .toSequence .createWithDefault(Nil) val ADMIN_ACLS = ConfigBuilder("spark.admin.acls") + .version("1.1.0") .stringConf .toSequence .createWithDefault(Nil) val ADMIN_ACLS_GROUPS = ConfigBuilder("spark.admin.acls.groups") + .version("2.0.0") .stringConf .toSequence .createWithDefault(Nil) val MODIFY_ACLS = ConfigBuilder("spark.modify.acls") + .version("1.1.0") .stringConf .toSequence .createWithDefault(Nil) val MODIFY_ACLS_GROUPS = ConfigBuilder("spark.modify.acls.groups") + .version("2.0.0") .stringConf .toSequence .createWithDefault(Nil) val USER_GROUPS_MAPPING = ConfigBuilder("spark.user.groups.mapping") + .version("2.0.0") .stringConf .createWithDefault("org.apache.spark.security.ShellBasedGroupsMappingProvider") val PROXY_REDIRECT_URI = ConfigBuilder("spark.ui.proxyRedirectUri") .doc("Proxy address to use when responding with HTTP redirects.") + .version("3.0.0") .stringConf .createOptional @@ -163,6 +188,7 @@ private[spark] object UI { "This configuration replaces original log urls in event log, which will be also effective " + "when accessing the application on history server. The new log urls must be permanent, " + "otherwise you might have dead link for executor log urls.") + .version("3.0.0") .stringConf .createOptional } diff --git a/core/src/main/scala/org/apache/spark/internal/config/Worker.scala b/core/src/main/scala/org/apache/spark/internal/config/Worker.scala index f1eaae29f18df..a8072712c46ce 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Worker.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Worker.scala @@ -28,47 +28,58 @@ private[spark] object Worker { .doc("Path to a file containing the resources allocated to the worker. " + "The file should be formatted as a JSON array of ResourceAllocation objects. 
" + "Only used internally in standalone mode.") + .version("3.0.0") .stringConf .createOptional val WORKER_TIMEOUT = ConfigBuilder("spark.worker.timeout") + .version("0.6.2") .longConf .createWithDefault(60) val WORKER_DRIVER_TERMINATE_TIMEOUT = ConfigBuilder("spark.worker.driverTerminateTimeout") + .version("2.1.2") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("10s") val WORKER_CLEANUP_ENABLED = ConfigBuilder("spark.worker.cleanup.enabled") + .version("1.0.0") .booleanConf .createWithDefault(false) val WORKER_CLEANUP_INTERVAL = ConfigBuilder("spark.worker.cleanup.interval") + .version("1.0.0") .longConf .createWithDefault(60 * 30) val APP_DATA_RETENTION = ConfigBuilder("spark.worker.cleanup.appDataTtl") + .version("1.0.0") .longConf .createWithDefault(7 * 24 * 3600) val PREFER_CONFIGURED_MASTER_ADDRESS = ConfigBuilder("spark.worker.preferConfiguredMasterAddress") + .version("2.2.1") .booleanConf .createWithDefault(false) val WORKER_UI_PORT = ConfigBuilder("spark.worker.ui.port") + .version("1.1.0") .intConf .createOptional val WORKER_UI_RETAINED_EXECUTORS = ConfigBuilder("spark.worker.ui.retainedExecutors") + .version("1.5.0") .intConf .createWithDefault(1000) val WORKER_UI_RETAINED_DRIVERS = ConfigBuilder("spark.worker.ui.retainedDrivers") + .version("1.5.0") .intConf .createWithDefault(1000) val UNCOMPRESSED_LOG_FILE_LENGTH_CACHE_SIZE_CONF = ConfigBuilder("spark.worker.ui.compressedLogFileLengthCacheSize") - .intConf - .createWithDefault(100) + .version("2.0.2") + .intConf + .createWithDefault(100) } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index f91f31be2f1ad..0afbb5278b1da 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -37,31 +37,15 @@ package object config { private[spark] val SPARK_TASK_PREFIX = "spark.task" private[spark] val LISTENER_BUS_EVENT_QUEUE_PREFIX = "spark.scheduler.listenerbus.eventqueue" - private[spark] val SPARK_RESOURCES_COORDINATE = - ConfigBuilder("spark.resources.coordinate.enable") - .doc("Whether to coordinate resources automatically among workers/drivers(client only) " + - "in Standalone. If false, the user is responsible for configuring different resources " + - "for workers/drivers that run on the same host.") - .booleanConf - .createWithDefault(true) - - private[spark] val SPARK_RESOURCES_DIR = - ConfigBuilder("spark.resources.dir") - .doc("Directory used to coordinate resources among workers/drivers(client only) in " + - "Standalone. Default is SPARK_HOME. Make sure to use the same directory for worker " + - "and drivers in client mode that run on the same host. Don't clean up this directory " + - "while workers/drivers are still alive to avoid the most likely resources conflict. ") - .stringConf - .createOptional - private[spark] val RESOURCES_DISCOVERY_PLUGIN = - ConfigBuilder("spark.resources.discovery.plugin") + ConfigBuilder("spark.resources.discoveryPlugin") .doc("Comma-separated list of class names implementing" + "org.apache.spark.api.resource.ResourceDiscoveryPlugin to load into the application." + "This is for advanced users to replace the resource discovery class with a " + "custom implementation. Spark will try each class specified until one of them " + "returns the resource information for that resource. 
It tries the discovery " + "script last if none of the plugins return information for that resource.") + .version("3.0.0") .stringConf .toSequence .createWithDefault(Nil) @@ -72,96 +56,121 @@ package object config { .doc("Path to a file containing the resources allocated to the driver. " + "The file should be formatted as a JSON array of ResourceAllocation objects. " + "Only used internally in standalone mode.") + .version("3.0.0") .stringConf .createOptional private[spark] val DRIVER_CLASS_PATH = - ConfigBuilder(SparkLauncher.DRIVER_EXTRA_CLASSPATH).stringConf.createOptional + ConfigBuilder(SparkLauncher.DRIVER_EXTRA_CLASSPATH) + .version("1.0.0") + .stringConf + .createOptional private[spark] val DRIVER_JAVA_OPTIONS = ConfigBuilder(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS) .withPrepended(SparkLauncher.DRIVER_DEFAULT_JAVA_OPTIONS) + .version("1.0.0") .stringConf .createOptional private[spark] val DRIVER_LIBRARY_PATH = - ConfigBuilder(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH).stringConf.createOptional + ConfigBuilder(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH) + .version("1.0.0") + .stringConf + .createOptional private[spark] val DRIVER_USER_CLASS_PATH_FIRST = - ConfigBuilder("spark.driver.userClassPathFirst").booleanConf.createWithDefault(false) + ConfigBuilder("spark.driver.userClassPathFirst") + .version("1.3.0") + .booleanConf + .createWithDefault(false) private[spark] val DRIVER_CORES = ConfigBuilder("spark.driver.cores") .doc("Number of cores to use for the driver process, only in cluster mode.") + .version("1.3.0") .intConf .createWithDefault(1) private[spark] val DRIVER_MEMORY = ConfigBuilder(SparkLauncher.DRIVER_MEMORY) .doc("Amount of memory to use for the driver process, in MiB unless otherwise specified.") + .version("1.1.1") .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.driver.memoryOverhead") .doc("The amount of non-heap memory to be allocated per driver in cluster mode, " + "in MiB unless otherwise specified.") + .version("2.3.0") .bytesConf(ByteUnit.MiB) .createOptional private[spark] val DRIVER_LOG_DFS_DIR = - ConfigBuilder("spark.driver.log.dfsDir").stringConf.createOptional + ConfigBuilder("spark.driver.log.dfsDir").version("3.0.0").stringConf.createOptional private[spark] val DRIVER_LOG_LAYOUT = ConfigBuilder("spark.driver.log.layout") + .version("3.0.0") .stringConf .createOptional private[spark] val DRIVER_LOG_PERSISTTODFS = ConfigBuilder("spark.driver.log.persistToDfs.enabled") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val DRIVER_LOG_ALLOW_EC = ConfigBuilder("spark.driver.log.allowErasureCoding") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_ENABLED = ConfigBuilder("spark.eventLog.enabled") + .version("1.0.0") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_DIR = ConfigBuilder("spark.eventLog.dir") + .version("1.0.0") .stringConf .createWithDefault(EventLoggingListener.DEFAULT_LOG_DIR) private[spark] val EVENT_LOG_COMPRESS = ConfigBuilder("spark.eventLog.compress") + .version("1.0.0") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_BLOCK_UPDATES = ConfigBuilder("spark.eventLog.logBlockUpdates.enabled") + .version("2.3.0") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_ALLOW_EC = ConfigBuilder("spark.eventLog.erasureCoding.enabled") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_TESTING = 
ConfigBuilder("spark.eventLog.testing") .internal() + .version("1.0.1") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.eventLog.buffer.kb") .doc("Buffer size to use when writing to output streams, in KiB unless otherwise specified.") + .version("1.0.0") .bytesConf(ByteUnit.KiB) .createWithDefaultString("100k") private[spark] val EVENT_LOG_STAGE_EXECUTOR_METRICS = - ConfigBuilder("spark.eventLog.logStageExecutorMetrics.enabled") + ConfigBuilder("spark.eventLog.logStageExecutorMetrics") .doc("Whether to write per-stage peaks of executor metrics (for each executor) " + "to the event log.") + .version("3.0.0") .booleanConf .createWithDefault(false) @@ -170,6 +179,7 @@ package object config { .doc("Names of supported young generation garbage collector. A name usually is " + " the return of GarbageCollectorMXBean.getName. The built-in young generation garbage " + s"collectors are ${GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS}") + .version("3.0.0") .stringConf .toSequence .createWithDefault(GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS) @@ -179,55 +189,75 @@ package object config { .doc("Names of supported old generation garbage collector. A name usually is " + "the return of GarbageCollectorMXBean.getName. The built-in old generation garbage " + s"collectors are ${GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS}") + .version("3.0.0") .stringConf .toSequence .createWithDefault(GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS) private[spark] val EVENT_LOG_OVERWRITE = - ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false) + ConfigBuilder("spark.eventLog.overwrite") + .version("1.0.0") + .booleanConf + .createWithDefault(false) private[spark] val EVENT_LOG_CALLSITE_LONG_FORM = - ConfigBuilder("spark.eventLog.longForm.enabled").booleanConf.createWithDefault(false) + ConfigBuilder("spark.eventLog.longForm.enabled") + .version("2.4.0") + .booleanConf + .createWithDefault(false) private[spark] val EVENT_LOG_ENABLE_ROLLING = ConfigBuilder("spark.eventLog.rolling.enabled") - .doc("Whether rolling over event log files is enabled. If set to true, it cuts down " + + .doc("Whether rolling over event log files is enabled. 
If set to true, it cuts down " + "each event log file to the configured size.") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val EVENT_LOG_ROLLING_MAX_FILE_SIZE = ConfigBuilder("spark.eventLog.rolling.maxFileSize") - .doc("The max size of event log file to be rolled over.") + .doc(s"When ${EVENT_LOG_ENABLE_ROLLING.key}=true, specifies the max size of event log file" + + " to be rolled over.") + .version("3.0.0") .bytesConf(ByteUnit.BYTE) .checkValue(_ >= ByteUnit.MiB.toBytes(10), "Max file size of event log should be " + "configured to be at least 10 MiB.") .createWithDefaultString("128m") private[spark] val EXECUTOR_ID = - ConfigBuilder("spark.executor.id").stringConf.createOptional + ConfigBuilder("spark.executor.id").version("1.2.0").stringConf.createOptional private[spark] val EXECUTOR_CLASS_PATH = - ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_CLASSPATH).stringConf.createOptional + ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_CLASSPATH) + .version("1.0.0") + .stringConf + .createOptional private[spark] val EXECUTOR_HEARTBEAT_DROP_ZERO_ACCUMULATOR_UPDATES = ConfigBuilder("spark.executor.heartbeat.dropZeroAccumulatorUpdates") .internal() + .version("3.0.0") .booleanConf .createWithDefault(true) private[spark] val EXECUTOR_HEARTBEAT_INTERVAL = ConfigBuilder("spark.executor.heartbeatInterval") + .version("1.1.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("10s") private[spark] val EXECUTOR_HEARTBEAT_MAX_FAILURES = - ConfigBuilder("spark.executor.heartbeat.maxFailures").internal().intConf.createWithDefault(60) + ConfigBuilder("spark.executor.heartbeat.maxFailures") + .internal() + .version("1.6.2") + .intConf + .createWithDefault(60) private[spark] val EXECUTOR_PROCESS_TREE_METRICS_ENABLED = ConfigBuilder("spark.executor.processTreeMetrics.enabled") .doc("Whether to collect process tree metrics (from the /proc filesystem) when collecting " + "executor metrics.") + .version("3.0.0") .booleanConf .createWithDefault(false) @@ -236,33 +266,44 @@ package object config { .doc("How often to collect executor metrics (in milliseconds). " + "If 0, the polling is done on executor heartbeats. 
" + "If positive, the polling is done at this interval.") + .version("3.0.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("0") private[spark] val EXECUTOR_JAVA_OPTIONS = ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS) .withPrepended(SparkLauncher.EXECUTOR_DEFAULT_JAVA_OPTIONS) + .version("1.0.0") .stringConf .createOptional private[spark] val EXECUTOR_LIBRARY_PATH = - ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_LIBRARY_PATH).stringConf.createOptional + ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_LIBRARY_PATH) + .version("1.0.0") + .stringConf + .createOptional private[spark] val EXECUTOR_USER_CLASS_PATH_FIRST = - ConfigBuilder("spark.executor.userClassPathFirst").booleanConf.createWithDefault(false) + ConfigBuilder("spark.executor.userClassPathFirst") + .version("1.3.0") + .booleanConf + .createWithDefault(false) private[spark] val EXECUTOR_CORES = ConfigBuilder(SparkLauncher.EXECUTOR_CORES) + .version("1.0.0") .intConf .createWithDefault(1) private[spark] val EXECUTOR_MEMORY = ConfigBuilder(SparkLauncher.EXECUTOR_MEMORY) .doc("Amount of memory to use per executor process, in MiB unless otherwise specified.") + .version("0.7.0") .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.executor.memoryOverhead") - .doc("The amount of non-heap memory to be allocated per executor in cluster mode, " + - "in MiB unless otherwise specified.") + .doc("The amount of non-heap memory to be allocated per executor, in MiB unless otherwise" + + " specified.") + .version("2.3.0") .bytesConf(ByteUnit.MiB) .createOptional @@ -272,12 +313,14 @@ package object config { "the cluster (not from each machine). If not set, the default will be " + "`spark.deploy.defaultCores` on Spark's standalone cluster manager, or infinite " + "(all available cores) on Mesos.") + .version("0.6.0") .intConf .createOptional private[spark] val MEMORY_OFFHEAP_ENABLED = ConfigBuilder("spark.memory.offHeap.enabled") .doc("If true, Spark will attempt to use off-heap memory for certain operations. " + "If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.") + .version("1.6.0") .withAlternative("spark.unsafe.offHeap") .booleanConf .createWithDefault(false) @@ -288,6 +331,7 @@ package object config { "This setting has no impact on heap memory usage, so if your executors' total memory " + "consumption must fit within some hard limit then be sure to shrink your JVM heap size " + "accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true.") + .version("1.6.0") .bytesConf(ByteUnit.BYTE) .checkValue(_ >= 0, "The off-heap memory size must not be negative") .createWithDefault(0) @@ -297,6 +341,7 @@ package object config { "size of the region set aside by spark.memory.fraction. The higher this is, the " + "less working memory may be available to execution and tasks may spill to disk more " + "often. Leaving this at the default value is recommended. ") + .version("1.6.0") .doubleConf .checkValue(v => v >= 0.0 && v < 1.0, "Storage fraction must be in [0,1)") .createWithDefault(0.5) @@ -307,16 +352,19 @@ package object config { "The purpose of this config is to set aside memory for internal metadata, " + "user data structures, and imprecise size estimation in the case of sparse, " + "unusually large records. Leaving this at the default value is recommended. 
") + .version("1.6.0") .doubleConf .createWithDefault(0.6) private[spark] val STORAGE_SAFETY_FRACTION = ConfigBuilder("spark.storage.safetyFraction") + .version("1.1.0") .doubleConf .createWithDefault(0.9) private[spark] val STORAGE_UNROLL_MEMORY_THRESHOLD = ConfigBuilder("spark.storage.unrollMemoryThreshold") .doc("Initial memory to request before unrolling any block") + .version("1.1.0") .longConf .createWithDefault(1024 * 1024) @@ -326,6 +374,7 @@ package object config { "Cached RDD block replicas lost due to executor failures are replenished " + "if there are any existing available replicas. This tries to " + "get the replication level of the block to the initial number") + .version("2.2.0") .booleanConf .createWithDefault(false) @@ -336,48 +385,62 @@ package object config { "This prevents Spark from memory mapping very small blocks. " + "In general, memory mapping has high overhead for blocks close to or below " + "the page size of the operating system.") + .version("0.9.2") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("2m") private[spark] val STORAGE_REPLICATION_POLICY = ConfigBuilder("spark.storage.replication.policy") + .version("2.1.0") .stringConf .createWithDefaultString(classOf[RandomBlockReplicationPolicy].getName) private[spark] val STORAGE_REPLICATION_TOPOLOGY_MAPPER = ConfigBuilder("spark.storage.replication.topologyMapper") + .version("2.1.0") .stringConf .createWithDefaultString(classOf[DefaultTopologyMapper].getName) private[spark] val STORAGE_CACHED_PEERS_TTL = ConfigBuilder("spark.storage.cachedPeersTtl") - .intConf.createWithDefault(60 * 1000) + .version("1.1.1") + .intConf + .createWithDefault(60 * 1000) private[spark] val STORAGE_MAX_REPLICATION_FAILURE = ConfigBuilder("spark.storage.maxReplicationFailures") - .intConf.createWithDefault(1) + .version("1.1.1") + .intConf + .createWithDefault(1) private[spark] val STORAGE_REPLICATION_TOPOLOGY_FILE = - ConfigBuilder("spark.storage.replication.topologyFile").stringConf.createOptional + ConfigBuilder("spark.storage.replication.topologyFile") + .version("2.1.0") + .stringConf + .createOptional private[spark] val STORAGE_EXCEPTION_PIN_LEAK = ConfigBuilder("spark.storage.exceptionOnPinLeak") + .version("1.6.2") .booleanConf .createWithDefault(false) private[spark] val STORAGE_BLOCKMANAGER_TIMEOUTINTERVAL = ConfigBuilder("spark.storage.blockManagerTimeoutIntervalMs") + .version("0.7.3") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("60s") private[spark] val STORAGE_BLOCKMANAGER_SLAVE_TIMEOUT = ConfigBuilder("spark.storage.blockManagerSlaveTimeoutMs") + .version("0.7.0") .timeConf(TimeUnit.MILLISECONDS) - .createWithDefaultString(Network.NETWORK_TIMEOUT.defaultValueString) + .createOptional private[spark] val STORAGE_CLEANUP_FILES_AFTER_EXECUTOR_EXIT = ConfigBuilder("spark.storage.cleanupFilesAfterExecutorExit") .doc("Whether or not cleanup the files not served by the external shuffle service " + "on executor exits.") + .version("2.4.0") .booleanConf .createWithDefault(true) @@ -385,6 +448,7 @@ package object config { ConfigBuilder("spark.diskStore.subDirectories") .doc("Number of subdirectories inside each path listed in spark.local.dir for " + "hashing Block files into.") + .version("0.6.0") .intConf .checkValue(_ > 0, "The number of subdirectories must be positive.") .createWithDefault(64) @@ -393,71 +457,102 @@ package object config { ConfigBuilder("spark.block.failures.beforeLocationRefresh") .doc("Max number of failures before this block manager refreshes " + "the block locations from the driver.") + 
.version("2.0.0") .intConf .createWithDefault(5) - private[spark] val IS_PYTHON_APP = ConfigBuilder("spark.yarn.isPython").internal() - .booleanConf.createWithDefault(false) + private[spark] val IS_PYTHON_APP = + ConfigBuilder("spark.yarn.isPython") + .internal() + .version("1.5.0") + .booleanConf + .createWithDefault(false) - private[spark] val CPUS_PER_TASK = ConfigBuilder("spark.task.cpus").intConf.createWithDefault(1) + private[spark] val CPUS_PER_TASK = + ConfigBuilder("spark.task.cpus").version("0.5.0").intConf.createWithDefault(1) private[spark] val DYN_ALLOCATION_ENABLED = - ConfigBuilder("spark.dynamicAllocation.enabled").booleanConf.createWithDefault(false) + ConfigBuilder("spark.dynamicAllocation.enabled") + .version("1.2.0") + .booleanConf + .createWithDefault(false) private[spark] val DYN_ALLOCATION_TESTING = - ConfigBuilder("spark.dynamicAllocation.testing").booleanConf.createWithDefault(false) + ConfigBuilder("spark.dynamicAllocation.testing") + .version("1.2.0") + .booleanConf + .createWithDefault(false) private[spark] val DYN_ALLOCATION_MIN_EXECUTORS = - ConfigBuilder("spark.dynamicAllocation.minExecutors").intConf.createWithDefault(0) + ConfigBuilder("spark.dynamicAllocation.minExecutors") + .version("1.2.0") + .intConf + .createWithDefault(0) private[spark] val DYN_ALLOCATION_INITIAL_EXECUTORS = ConfigBuilder("spark.dynamicAllocation.initialExecutors") + .version("1.3.0") .fallbackConf(DYN_ALLOCATION_MIN_EXECUTORS) private[spark] val DYN_ALLOCATION_MAX_EXECUTORS = - ConfigBuilder("spark.dynamicAllocation.maxExecutors").intConf.createWithDefault(Int.MaxValue) + ConfigBuilder("spark.dynamicAllocation.maxExecutors") + .version("1.2.0") + .intConf + .createWithDefault(Int.MaxValue) private[spark] val DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO = ConfigBuilder("spark.dynamicAllocation.executorAllocationRatio") - .doubleConf.createWithDefault(1.0) + .version("2.4.0") + .doubleConf + .createWithDefault(1.0) private[spark] val DYN_ALLOCATION_CACHED_EXECUTOR_IDLE_TIMEOUT = ConfigBuilder("spark.dynamicAllocation.cachedExecutorIdleTimeout") + .version("1.4.0") .timeConf(TimeUnit.SECONDS) .checkValue(_ >= 0L, "Timeout must be >= 0.") .createWithDefault(Integer.MAX_VALUE) private[spark] val DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT = ConfigBuilder("spark.dynamicAllocation.executorIdleTimeout") + .version("1.2.0") .timeConf(TimeUnit.SECONDS) .checkValue(_ >= 0L, "Timeout must be >= 0.") .createWithDefault(60) - private[spark] val DYN_ALLOCATION_SHUFFLE_TRACKING = + private[spark] val DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED = ConfigBuilder("spark.dynamicAllocation.shuffleTracking.enabled") + .version("3.0.0") .booleanConf .createWithDefault(false) - private[spark] val DYN_ALLOCATION_SHUFFLE_TIMEOUT = - ConfigBuilder("spark.dynamicAllocation.shuffleTimeout") + private[spark] val DYN_ALLOCATION_SHUFFLE_TRACKING_TIMEOUT = + ConfigBuilder("spark.dynamicAllocation.shuffleTracking.timeout") + .version("3.0.0") .timeConf(TimeUnit.MILLISECONDS) .checkValue(_ >= 0L, "Timeout must be >= 0.") .createWithDefault(Long.MaxValue) private[spark] val DYN_ALLOCATION_SCHEDULER_BACKLOG_TIMEOUT = ConfigBuilder("spark.dynamicAllocation.schedulerBacklogTimeout") + .version("1.2.0") .timeConf(TimeUnit.SECONDS).createWithDefault(1) private[spark] val DYN_ALLOCATION_SUSTAINED_SCHEDULER_BACKLOG_TIMEOUT = ConfigBuilder("spark.dynamicAllocation.sustainedSchedulerBacklogTimeout") + .version("1.2.0") .fallbackConf(DYN_ALLOCATION_SCHEDULER_BACKLOG_TIMEOUT) private[spark] val LOCALITY_WAIT = 
ConfigBuilder("spark.locality.wait") + .version("0.5.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("3s") private[spark] val SHUFFLE_SERVICE_ENABLED = - ConfigBuilder("spark.shuffle.service.enabled").booleanConf.createWithDefault(false) + ConfigBuilder("spark.shuffle.service.enabled") + .version("1.2.0") + .booleanConf + .createWithDefault(false) private[spark] val SHUFFLE_SERVICE_FETCH_RDD_ENABLED = ConfigBuilder(Constants.SHUFFLE_SERVICE_FETCH_RDD_ENABLED) @@ -465,6 +560,7 @@ package object config { "In case of dynamic allocation if this feature is enabled executors having only disk " + "persisted blocks are considered idle after " + "'spark.dynamicAllocation.executorIdleTimeout' and will be released accordingly.") + .version("3.0.0") .booleanConf .createWithDefault(false) @@ -472,21 +568,26 @@ package object config { ConfigBuilder("spark.shuffle.service.db.enabled") .doc("Whether to use db in ExternalShuffleService. Note that this only affects " + "standalone mode.") + .version("3.0.0") .booleanConf .createWithDefault(true) private[spark] val SHUFFLE_SERVICE_PORT = - ConfigBuilder("spark.shuffle.service.port").intConf.createWithDefault(7337) + ConfigBuilder("spark.shuffle.service.port").version("1.2.0").intConf.createWithDefault(7337) private[spark] val KEYTAB = ConfigBuilder("spark.kerberos.keytab") .doc("Location of user's keytab.") + .version("3.0.0") .stringConf.createOptional private[spark] val PRINCIPAL = ConfigBuilder("spark.kerberos.principal") .doc("Name of the Kerberos principal.") - .stringConf.createOptional + .version("3.0.0") + .stringConf + .createOptional private[spark] val KERBEROS_RELOGIN_PERIOD = ConfigBuilder("spark.kerberos.relogin.period") + .version("3.0.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("1m") @@ -496,6 +597,7 @@ package object config { "Which credentials to use when renewing delegation tokens for executors. Can be either " + "'keytab', the default, which requires a keytab to be provided, or 'ccache', which uses " + "the local credentials cache.") + .version("3.0.0") .stringConf .checkValues(Set("keytab", "ccache")) .createWithDefault("keytab") @@ -504,104 +606,124 @@ package object config { ConfigBuilder("spark.kerberos.access.hadoopFileSystems") .doc("Extra Hadoop filesystem URLs for which to request delegation tokens. 
The filesystem " + "that hosts fs.defaultFS does not need to be listed here.") + .version("3.0.0") .stringConf .toSequence .createWithDefault(Nil) private[spark] val EXECUTOR_INSTANCES = ConfigBuilder("spark.executor.instances") + .version("1.0.0") .intConf .createOptional private[spark] val PY_FILES = ConfigBuilder("spark.yarn.dist.pyFiles") .internal() + .version("2.2.1") .stringConf .toSequence .createWithDefault(Nil) private[spark] val TASK_MAX_DIRECT_RESULT_SIZE = ConfigBuilder("spark.task.maxDirectResultSize") + .version("2.0.0") .bytesConf(ByteUnit.BYTE) .createWithDefault(1L << 20) private[spark] val TASK_MAX_FAILURES = ConfigBuilder("spark.task.maxFailures") + .version("0.8.0") .intConf .createWithDefault(4) private[spark] val TASK_REAPER_ENABLED = ConfigBuilder("spark.task.reaper.enabled") + .version("2.0.3") .booleanConf .createWithDefault(false) private[spark] val TASK_REAPER_KILL_TIMEOUT = ConfigBuilder("spark.task.reaper.killTimeout") + .version("2.0.3") .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(-1) private[spark] val TASK_REAPER_POLLING_INTERVAL = ConfigBuilder("spark.task.reaper.pollingInterval") + .version("2.0.3") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("10s") private[spark] val TASK_REAPER_THREAD_DUMP = ConfigBuilder("spark.task.reaper.threadDump") + .version("2.0.3") .booleanConf .createWithDefault(true) // Blacklist confs private[spark] val BLACKLIST_ENABLED = ConfigBuilder("spark.blacklist.enabled") + .version("2.1.0") .booleanConf .createOptional private[spark] val MAX_TASK_ATTEMPTS_PER_EXECUTOR = ConfigBuilder("spark.blacklist.task.maxTaskAttemptsPerExecutor") + .version("2.1.0") .intConf .createWithDefault(1) private[spark] val MAX_TASK_ATTEMPTS_PER_NODE = ConfigBuilder("spark.blacklist.task.maxTaskAttemptsPerNode") + .version("2.1.0") .intConf .createWithDefault(2) private[spark] val MAX_FAILURES_PER_EXEC = ConfigBuilder("spark.blacklist.application.maxFailedTasksPerExecutor") + .version("2.2.0") .intConf .createWithDefault(2) private[spark] val MAX_FAILURES_PER_EXEC_STAGE = ConfigBuilder("spark.blacklist.stage.maxFailedTasksPerExecutor") + .version("2.1.0") .intConf .createWithDefault(2) private[spark] val MAX_FAILED_EXEC_PER_NODE = ConfigBuilder("spark.blacklist.application.maxFailedExecutorsPerNode") + .version("2.2.0") .intConf .createWithDefault(2) private[spark] val MAX_FAILED_EXEC_PER_NODE_STAGE = ConfigBuilder("spark.blacklist.stage.maxFailedExecutorsPerNode") + .version("2.1.0") .intConf .createWithDefault(2) private[spark] val BLACKLIST_TIMEOUT_CONF = ConfigBuilder("spark.blacklist.timeout") + .version("2.1.0") .timeConf(TimeUnit.MILLISECONDS) .createOptional private[spark] val BLACKLIST_KILL_ENABLED = ConfigBuilder("spark.blacklist.killBlacklistedExecutors") + .version("2.2.0") .booleanConf .createWithDefault(false) private[spark] val BLACKLIST_LEGACY_TIMEOUT_CONF = ConfigBuilder("spark.scheduler.executorTaskBlacklistTime") .internal() + .version("1.0.0") .timeConf(TimeUnit.MILLISECONDS) .createOptional private[spark] val BLACKLIST_FETCH_FAILURE_ENABLED = ConfigBuilder("spark.blacklist.application.fetchFailure.enabled") + .version("2.3.0") .booleanConf .createWithDefault(false) // End blacklist confs @@ -611,6 +733,7 @@ package object config { .doc("Whether to un-register all the outputs on the host in condition that we receive " + " a FetchFailure. 
This is set default to false, which means, we only un-register the " + " outputs related to the exact executor(instead of the host) on a FetchFailure.") + .version("2.3.0") .booleanConf .createWithDefault(false) @@ -620,6 +743,7 @@ package object config { "an event queue using capacity specified by `spark.scheduler.listenerbus" + ".eventqueue.queueName.capacity` first. If it's not configured, Spark will " + "use the default capacity specified by this config.") + .version("2.3.0") .intConf .checkValue(_ > 0, "The capacity of listener bus event queue must be positive") .createWithDefault(10000) @@ -627,15 +751,17 @@ package object config { private[spark] val LISTENER_BUS_METRICS_MAX_LISTENER_CLASSES_TIMED = ConfigBuilder("spark.scheduler.listenerbus.metrics.maxListenerClassesTimed") .internal() + .version("2.3.0") .intConf .createWithDefault(128) private[spark] val LISTENER_BUS_LOG_SLOW_EVENT_ENABLED = - ConfigBuilder("spark.scheduler.listenerbus.logSlowEvent.enabled") + ConfigBuilder("spark.scheduler.listenerbus.logSlowEvent") .internal() .doc("When enabled, log the event that takes too much time to process. This helps us " + "discover the event types that cause performance bottlenecks. The time threshold is " + "controlled by spark.scheduler.listenerbus.logSlowEvent.threshold.") + .version("3.0.0") .booleanConf .createWithDefault(true) @@ -643,53 +769,66 @@ package object config { ConfigBuilder("spark.scheduler.listenerbus.logSlowEvent.threshold") .internal() .doc("The time threshold of whether a event is considered to be taking too much time to " + - "process. Log the event if spark.scheduler.listenerbus.logSlowEvent.enabled is true.") + s"process. Log the event if ${LISTENER_BUS_LOG_SLOW_EVENT_ENABLED.key} is true.") + .version("3.0.0") .timeConf(TimeUnit.NANOSECONDS) .createWithDefaultString("1s") // This property sets the root namespace for metrics reporting private[spark] val METRICS_NAMESPACE = ConfigBuilder("spark.metrics.namespace") + .version("2.1.0") .stringConf .createOptional private[spark] val METRICS_CONF = ConfigBuilder("spark.metrics.conf") + .version("0.8.0") .stringConf .createOptional private[spark] val METRICS_EXECUTORMETRICS_SOURCE_ENABLED = ConfigBuilder("spark.metrics.executorMetricsSource.enabled") .doc("Whether to register the ExecutorMetrics source with the metrics system.") + .version("3.0.0") .booleanConf .createWithDefault(true) private[spark] val METRICS_STATIC_SOURCES_ENABLED = ConfigBuilder("spark.metrics.staticSources.enabled") .doc("Whether to register static sources with the metrics system.") + .version("3.0.0") .booleanConf .createWithDefault(true) private[spark] val PYSPARK_DRIVER_PYTHON = ConfigBuilder("spark.pyspark.driver.python") + .version("2.1.0") .stringConf .createOptional private[spark] val PYSPARK_PYTHON = ConfigBuilder("spark.pyspark.python") + .version("2.1.0") .stringConf .createOptional // To limit how many applications are shown in the History Server summary ui private[spark] val HISTORY_UI_MAX_APPS = - ConfigBuilder("spark.history.ui.maxApplications").intConf.createWithDefault(Integer.MAX_VALUE) + ConfigBuilder("spark.history.ui.maxApplications") + .version("2.0.1") + .intConf + .createWithDefault(Integer.MAX_VALUE) private[spark] val IO_ENCRYPTION_ENABLED = ConfigBuilder("spark.io.encryption.enabled") + .version("2.1.0") .booleanConf .createWithDefault(false) private[spark] val IO_ENCRYPTION_KEYGEN_ALGORITHM = ConfigBuilder("spark.io.encryption.keygen.algorithm") + .version("2.1.0") .stringConf .createWithDefault("HmacSHA1") 
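Every hunk in this change applies the same mechanical pattern: a ConfigEntry definition gains a .version("x.y.z") call recording the Spark release that introduced the key, and any single-line builder chains touched along the way are reflowed to one call per line. The sketch below illustrates that pattern, using only the internal ConfigBuilder API already visible in this diff; the object name, entry name, and config key are hypothetical and purely illustrative (ConfigBuilder is private[spark], so code like this only compiles inside the org.apache.spark source tree):

    package org.apache.spark.internal.config

    import java.util.concurrent.TimeUnit

    private[spark] object ExampleConfig {
      // Hypothetical entry mirroring the style used throughout this patch:
      // key, doc, version, typed conversion, validation, then the default.
      val EXAMPLE_POLL_INTERVAL =
        ConfigBuilder("spark.example.pollInterval")   // hypothetical key
          .doc("How often the example component polls, in milliseconds.")
          .version("3.0.0")                           // release that first shipped the key
          .timeConf(TimeUnit.MILLISECONDS)
          .checkValue(_ >= 0L, "Poll interval must be non-negative.")
          .createWithDefaultString("10s")
    }

Callers would then read the typed value through the entry rather than a raw string key (for example conf.get(ExampleConfig.EXAMPLE_POLL_INTERVAL) on a SparkConf, an internal accessor), and the .version() metadata presumably feeds the generated configuration documentation; both points are inferred from the surrounding code rather than stated in this patch.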
private[spark] val IO_ENCRYPTION_KEY_SIZE_BITS = ConfigBuilder("spark.io.encryption.keySizeBits") + .version("2.1.0") .intConf .checkValues(Set(128, 192, 256)) .createWithDefault(128) @@ -697,57 +836,68 @@ package object config { private[spark] val IO_CRYPTO_CIPHER_TRANSFORMATION = ConfigBuilder("spark.io.crypto.cipher.transformation") .internal() + .version("2.1.0") .stringConf .createWithDefaultString("AES/CTR/NoPadding") private[spark] val DRIVER_HOST_ADDRESS = ConfigBuilder("spark.driver.host") .doc("Address of driver endpoints.") + .version("0.7.0") .stringConf .createWithDefault(Utils.localCanonicalHostName()) private[spark] val DRIVER_PORT = ConfigBuilder("spark.driver.port") .doc("Port of driver endpoints.") + .version("0.7.0") .intConf .createWithDefault(0) private[spark] val DRIVER_SUPERVISE = ConfigBuilder("spark.driver.supervise") .doc("If true, restarts the driver automatically if it fails with a non-zero exit status. " + "Only has effect in Spark standalone mode or Mesos cluster deploy mode.") + .version("1.3.0") .booleanConf .createWithDefault(false) private[spark] val DRIVER_BIND_ADDRESS = ConfigBuilder("spark.driver.bindAddress") .doc("Address where to bind network listen sockets on the driver.") + .version("2.1.0") .fallbackConf(DRIVER_HOST_ADDRESS) private[spark] val BLOCK_MANAGER_PORT = ConfigBuilder("spark.blockManager.port") .doc("Port to use for the block manager when a more specific setting is not provided.") + .version("1.1.0") .intConf .createWithDefault(0) private[spark] val DRIVER_BLOCK_MANAGER_PORT = ConfigBuilder("spark.driver.blockManager.port") .doc("Port to use for the block manager on the driver.") + .version("2.1.0") .fallbackConf(BLOCK_MANAGER_PORT) private[spark] val IGNORE_CORRUPT_FILES = ConfigBuilder("spark.files.ignoreCorruptFiles") .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " + "encountering corrupted or non-existing files and contents that have been read will still " + "be returned.") + .version("2.1.0") .booleanConf .createWithDefault(false) private[spark] val IGNORE_MISSING_FILES = ConfigBuilder("spark.files.ignoreMissingFiles") .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " + "encountering missing files and the contents that have been read will still be returned.") + .version("2.4.0") .booleanConf .createWithDefault(false) private[spark] val APP_CALLER_CONTEXT = ConfigBuilder("spark.log.callerContext") + .version("2.2.0") .stringConf .createOptional private[spark] val FILES_MAX_PARTITION_BYTES = ConfigBuilder("spark.files.maxPartitionBytes") .doc("The maximum number of bytes to pack into a single partition when reading files.") + .version("2.1.0") .bytesConf(ByteUnit.BYTE) .createWithDefault(128 * 1024 * 1024) @@ -756,6 +906,7 @@ package object config { " the same time. This is used when putting multiple files into a partition. It's better to" + " over estimate, then the partitions with small files will be faster than partitions with" + " bigger files.") + .version("2.1.0") .bytesConf(ByteUnit.BYTE) .createWithDefault(4 * 1024 * 1024) @@ -763,6 +914,7 @@ package object config { ConfigBuilder("spark.hadoopRDD.ignoreEmptySplits") .internal() .doc("When true, HadoopRDD/NewHadoopRDD will not create partitions for empty input splits.") + .version("2.3.0") .booleanConf .createWithDefault(false) @@ -772,6 +924,7 @@ package object config { "driver and executor environments contain sensitive information. 
When this regex matches " + "a property key or value, the value is redacted from the environment UI and various logs " + "like YARN and event logs.") + .version("2.1.2") .regexConf .createWithDefault("(?i)secret|password|token".r) @@ -780,26 +933,31 @@ package object config { .doc("Regex to decide which parts of strings produced by Spark contain sensitive " + "information. When this regex matches a string part, that string part is replaced by a " + "dummy value. This is currently used to redact the output of SQL explain commands.") + .version("2.2.0") .regexConf .createOptional private[spark] val AUTH_SECRET = ConfigBuilder("spark.authenticate.secret") + .version("1.0.0") .stringConf .createOptional private[spark] val AUTH_SECRET_BIT_LENGTH = ConfigBuilder("spark.authenticate.secretBitLength") + .version("1.6.0") .intConf .createWithDefault(256) private[spark] val NETWORK_AUTH_ENABLED = ConfigBuilder("spark.authenticate") + .version("1.0.0") .booleanConf .createWithDefault(false) private[spark] val SASL_ENCRYPTION_ENABLED = ConfigBuilder("spark.authenticate.enableSaslEncryption") + .version("1.4.0") .booleanConf .createWithDefault(false) @@ -809,6 +967,7 @@ package object config { "loaded from this path on both the driver and the executors if overrides are not set for " + "either entity (see below). File-based secret keys are only allowed when using " + "Kubernetes.") + .version("3.0.0") .stringConf .createOptional @@ -821,6 +980,7 @@ package object config { "be specified for the executors. The fallback configuration allows the same path to be " + "used for both the driver and the executors when running in cluster mode. File-based " + "secret keys are only allowed when using Kubernetes.") + .version("3.0.0") .fallbackConf(AUTH_SECRET_FILE) private[spark] val AUTH_SECRET_FILE_EXECUTOR = @@ -832,12 +992,14 @@ package object config { "specified for the executors. The fallback configuration allows the same path to be " + "used for both the driver and the executors when running in cluster mode. File-based " + "secret keys are only allowed when using Kubernetes.") + .version("3.0.0") .fallbackConf(AUTH_SECRET_FILE) private[spark] val BUFFER_WRITE_CHUNK_SIZE = ConfigBuilder("spark.buffer.write.chunkSize") .internal() .doc("The chunk size in bytes during writing out the bytes of ChunkedByteBuffer.") + .version("2.3.0") .bytesConf(ByteUnit.BYTE) .checkValue(_ <= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH, "The chunk size during writing out the bytes of ChunkedByteBuffer should" + @@ -848,6 +1010,7 @@ package object config { ConfigBuilder("spark.checkpoint.compress") .doc("Whether to compress RDD checkpoints. Generally a good idea. Compression will use " + "spark.io.compression.codec.") + .version("2.2.0") .booleanConf .createWithDefault(false) @@ -858,6 +1021,7 @@ package object config { "Caching preferred locations can relieve query loading to DFS and save the query " + "time. The drawback is that the cached locations can be possibly outdated and " + "lose data locality. If this config is not specified, it will not cache.") + .version("3.0.0") .timeConf(TimeUnit.MINUTES) .checkValue(_ > 0, "The expire time for caching preferred locations cannot be non-positive.") .createOptional @@ -867,12 +1031,14 @@ package object config { .doc("Threshold in bytes above which the size of shuffle blocks in " + "HighlyCompressedMapStatus is accurately recorded. 
This helps to prevent OOM " + "by avoiding underestimating shuffle block size when fetch shuffle blocks.") + .version("2.2.1") .bytesConf(ByteUnit.BYTE) .createWithDefault(100 * 1024 * 1024) private[spark] val SHUFFLE_REGISTRATION_TIMEOUT = ConfigBuilder("spark.shuffle.registration.timeout") .doc("Timeout in milliseconds for registration to the external shuffle service.") + .version("2.3.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(5000) @@ -880,6 +1046,7 @@ package object config { ConfigBuilder("spark.shuffle.registration.maxAttempts") .doc("When we fail to register to the external shuffle service, we will " + "retry for maxAttempts times.") + .version("2.3.0") .intConf .createWithDefault(3) @@ -890,17 +1057,19 @@ package object config { "address in a single fetch or simultaneously, this could crash the serving executor or " + "Node Manager. This is especially useful to reduce the load on the Node Manager when " + "external shuffle is enabled. You can mitigate the issue by setting it to a lower value.") + .version("2.2.1") .intConf .checkValue(_ > 0, "The max no. of blocks in flight cannot be non-positive.") .createWithDefault(Int.MaxValue) private[spark] val MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM = - ConfigBuilder("spark.maxRemoteBlockSizeFetchToMem") + ConfigBuilder("spark.network.maxRemoteBlockSizeFetchToMem") .doc("Remote block will be fetched to disk when size of the block is above this threshold " + "in bytes. This is to avoid a giant request takes too much memory. Note this " + "configuration will affect both shuffle fetch and block manager remote block fetch. " + "For users who enabled external shuffle service, this feature can only work when " + "external shuffle service is at least 2.3.0.") + .version("3.0.0") .bytesConf(ByteUnit.BYTE) // fetch-to-mem is guaranteed to fail if the message is bigger than 2 GB, so we might // as well use fetch-to-disk in that case. The message includes some metadata in addition @@ -916,12 +1085,14 @@ package object config { .doc("Enable tracking of updatedBlockStatuses in the TaskMetrics. Off by default since " + "tracking the block statuses can use a lot of memory and its not used anywhere within " + "spark.") + .version("2.3.0") .booleanConf .createWithDefault(false) private[spark] val SHUFFLE_IO_PLUGIN_CLASS = ConfigBuilder("spark.shuffle.sort.io.plugin.class") .doc("Name of the class to use for shuffle IO.") + .version("3.0.0") .stringConf .createWithDefault(classOf[LocalDiskShuffleDataIO].getName) @@ -930,6 +1101,7 @@ package object config { .doc("Size of the in-memory buffer for each shuffle file output stream, in KiB unless " + "otherwise specified. These buffers reduce the number of disk seeks and system calls " + "made in creating intermediate shuffle files.") + .version("1.4.0") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 1024, s"The file buffer size must be positive and less than or equal to" + @@ -940,6 +1112,7 @@ package object config { ConfigBuilder("spark.shuffle.unsafe.file.output.buffer") .doc("The file system for this buffer size after each partition " + "is written in unsafe shuffle writer. 
In KiB unless otherwise specified.") + .version("2.3.0") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH / 1024, s"The buffer size must be positive and less than or equal to" + @@ -949,6 +1122,7 @@ package object config { private[spark] val SHUFFLE_DISK_WRITE_BUFFER_SIZE = ConfigBuilder("spark.shuffle.spill.diskWriteBufferSize") .doc("The buffer size, in bytes, to use when writing the sorted records to an on-disk file.") + .version("2.3.0") .bytesConf(ByteUnit.BYTE) .checkValue(v => v > 12 && v <= ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH, s"The buffer size must be greater than 12 and less than or equal to " + @@ -960,6 +1134,7 @@ package object config { .internal() .doc("The memory check period is used to determine how often we should check whether " + "there is a need to request more memory when we try to unroll the given block in memory.") + .version("2.3.0") .longConf .createWithDefault(16) @@ -967,6 +1142,7 @@ package object config { ConfigBuilder("spark.storage.unrollMemoryGrowthFactor") .internal() .doc("Memory to request as a multiple of the size that used to unroll the block.") + .version("2.3.0") .doubleConf .createWithDefault(1.5) @@ -977,12 +1153,14 @@ package object config { "where the YARN service does not support schemes that are supported by Spark, like http, " + "https and ftp, or jars required to be in the local YARN client's classpath. Wildcard " + "'*' is denoted to download resources for all the schemes.") + .version("2.3.0") .stringConf .toSequence .createWithDefault(Nil) private[spark] val EXTRA_LISTENERS = ConfigBuilder("spark.extraListeners") .doc("Class names of listeners to add to SparkContext during initialization.") + .version("1.3.0") .stringConf .toSequence .createOptional @@ -994,6 +1172,7 @@ package object config { "By default it's Integer.MAX_VALUE, which means we never force the sorter to spill, " + "until we reach some limitations, like the max page size limitation for the pointer " + "array in the sorter.") + .version("1.6.0") .intConf .createWithDefault(Integer.MAX_VALUE) @@ -1003,30 +1182,35 @@ package object config { .doc("Multi-thread is used when the number of mappers * shuffle partitions is greater than " + "or equal to this threshold. 
Note that the actual parallelism is calculated by number of " + "mappers * shuffle partitions / this threshold + 1, so this threshold should be positive.") + .version("2.3.0") .intConf .checkValue(v => v > 0, "The threshold should be positive.") .createWithDefault(10000000) private[spark] val MAX_RESULT_SIZE = ConfigBuilder("spark.driver.maxResultSize") .doc("Size limit for results.") + .version("1.2.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("1g") private[spark] val CREDENTIALS_RENEWAL_INTERVAL_RATIO = ConfigBuilder("spark.security.credentials.renewalRatio") .doc("Ratio of the credential's expiration time when Spark should fetch new credentials.") + .version("2.4.0") .doubleConf .createWithDefault(0.75d) private[spark] val CREDENTIALS_RENEWAL_RETRY_WAIT = ConfigBuilder("spark.security.credentials.retryWait") .doc("How long to wait before retrying to fetch new credentials after a failure.") + .version("2.4.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("1h") private[spark] val SHUFFLE_SORT_INIT_BUFFER_SIZE = ConfigBuilder("spark.shuffle.sort.initialBufferSize") .internal() + .version("2.1.0") .bytesConf(ByteUnit.BYTE) .checkValue(v => v > 0 && v <= Int.MaxValue, s"The buffer size must be greater than 0 and less than or equal to ${Int.MaxValue}.") @@ -1036,6 +1220,7 @@ package object config { ConfigBuilder("spark.shuffle.compress") .doc("Whether to compress shuffle output. Compression will use " + "spark.io.compression.codec.") + .version("0.6.0") .booleanConf .createWithDefault(true) @@ -1043,6 +1228,7 @@ package object config { ConfigBuilder("spark.shuffle.spill.compress") .doc("Whether to compress data spilled during shuffles. Compression will use " + "spark.io.compression.codec.") + .version("0.9.0") .booleanConf .createWithDefault(true) @@ -1052,6 +1238,7 @@ package object config { .doc("The codec used to compress MapStatus, which is generated by ShuffleMapTask. " + "By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. 
You can also " + "use fully qualified class names to specify the codec.") + .version("3.0.0") .stringConf .createWithDefault("zstd") @@ -1060,6 +1247,7 @@ package object config { .internal() .doc("Initial threshold for the size of a collection before we start tracking its " + "memory usage.") + .version("1.1.1") .bytesConf(ByteUnit.BYTE) .createWithDefault(5 * 1024 * 1024) @@ -1067,6 +1255,7 @@ package object config { ConfigBuilder("spark.shuffle.spill.batchSize") .internal() .doc("Size of object batches when reading/writing from serializers.") + .version("0.9.0") .longConf .createWithDefault(10000) @@ -1074,34 +1263,40 @@ package object config { ConfigBuilder("spark.shuffle.sort.bypassMergeThreshold") .doc("In the sort-based shuffle manager, avoid merge-sorting data if there is no " + "map-side aggregation and there are at most this many reduce partitions") + .version("1.1.1") .intConf .createWithDefault(200) private[spark] val SHUFFLE_MANAGER = ConfigBuilder("spark.shuffle.manager") + .version("1.1.0") .stringConf .createWithDefault("sort") private[spark] val SHUFFLE_REDUCE_LOCALITY_ENABLE = ConfigBuilder("spark.shuffle.reduceLocality.enabled") .doc("Whether to compute locality preferences for reduce tasks") + .version("1.5.0") .booleanConf .createWithDefault(true) private[spark] val SHUFFLE_MAPOUTPUT_MIN_SIZE_FOR_BROADCAST = ConfigBuilder("spark.shuffle.mapOutput.minSizeForBroadcast") .doc("The size at which we use Broadcast to send the map output statuses to the executors.") + .version("2.0.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("512k") private[spark] val SHUFFLE_MAPOUTPUT_DISPATCHER_NUM_THREADS = ConfigBuilder("spark.shuffle.mapOutput.dispatcher.numThreads") + .version("2.0.0") .intConf .createWithDefault(8) private[spark] val SHUFFLE_DETECT_CORRUPT = ConfigBuilder("spark.shuffle.detectCorrupt") .doc("Whether to detect any corruption in fetched blocks.") + .version("2.2.0") .booleanConf .createWithDefault(true) @@ -1111,28 +1306,21 @@ package object config { "by using extra memory to detect early corruption. Any IOException thrown will cause " + "the task to be retried once and if it fails again with same exception, then " + "FetchFailedException will be thrown to retry previous stage") + .version("3.0.0") .booleanConf .createWithDefault(false) - private[spark] val STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE = - ConfigBuilder("spark.storage.localDiskByExecutors.cacheSize") - .doc("The max number of executors for which the local dirs are stored. This size is " + - "both applied for the driver and both for the executors side to avoid having an " + - "unbounded store. This cache will be used to avoid the network in case of fetching disk " + - "persisted RDD blocks or shuffle blocks (when `spark.shuffle.readHostLocalDisk.enabled` " + - "is set) from the same host.") - .intConf - .createWithDefault(1000) - private[spark] val SHUFFLE_SYNC = ConfigBuilder("spark.shuffle.sync") .doc("Whether to force outstanding writes to disk.") + .version("0.8.0") .booleanConf .createWithDefault(false) private[spark] val SHUFFLE_UNSAFE_FAST_MERGE_ENABLE = ConfigBuilder("spark.shuffle.unsafe.fastMergeEnabled") .doc("Whether to perform a fast spill merge.") + .version("1.4.0") .booleanConf .createWithDefault(true) @@ -1140,6 +1328,7 @@ package object config { ConfigBuilder("spark.shuffle.sort.useRadixSort") .doc("Whether to use radix sort for sorting in-memory partition ids. 
Radix sort is much " + "faster, but requires additional memory to be reserved memory as pointers are added.") + .version("2.0.0") .booleanConf .createWithDefault(true) @@ -1147,6 +1336,7 @@ package object config { ConfigBuilder("spark.shuffle.minNumPartitionsToHighlyCompress") .internal() .doc("Number of partitions to determine if MapStatus should use HighlyCompressedMapStatus") + .version("2.4.0") .intConf .checkValue(v => v > 0, "The value should be a positive integer.") .createWithDefault(2000) @@ -1156,21 +1346,36 @@ package object config { .doc("Whether to use the old protocol while doing the shuffle block fetching. " + "It is only enabled while we need the compatibility in the scenario of new Spark " + "version job fetching shuffle blocks from old version external shuffle service.") + .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED = - ConfigBuilder("spark.shuffle.readHostLocalDisk.enabled") - .doc(s"If enabled (and `${SHUFFLE_USE_OLD_FETCH_PROTOCOL.key}` is disabled), shuffle " + + ConfigBuilder("spark.shuffle.readHostLocalDisk") + .doc(s"If enabled (and `${SHUFFLE_USE_OLD_FETCH_PROTOCOL.key}` is disabled and external " + + s"shuffle `${SHUFFLE_SERVICE_ENABLED.key}` is enabled), shuffle " + "blocks requested from those block managers which are running on the same host are read " + "from the disk directly instead of being fetched as remote blocks over the network.") + .version("3.0.0") .booleanConf .createWithDefault(true) + private[spark] val STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE = + ConfigBuilder("spark.storage.localDiskByExecutors.cacheSize") + .doc("The max number of executors for which the local dirs are stored. This size is " + + "both applied for the driver and both for the executors side to avoid having an " + + "unbounded store. This cache will be used to avoid the network in case of fetching disk " + + s"persisted RDD blocks or shuffle blocks " + + s"(when `${SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED.key}` is set) from the same host.") + .version("3.0.0") + .intConf + .createWithDefault(1000) + private[spark] val MEMORY_MAP_LIMIT_FOR_TESTS = ConfigBuilder("spark.storage.memoryMapLimitForTests") .internal() .doc("For testing only, controls the size of chunks when memory mapping a file") + .version("2.3.0") .bytesConf(ByteUnit.BYTE) .createWithDefault(ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) @@ -1180,6 +1385,7 @@ package object config { "coordinator didn't receive all the sync messages from barrier tasks within the " + "configured time, throw a SparkException to fail all the tasks. The default value is set " + "to 31536000(3600 * 24 * 365) so the barrier() call shall wait for one year.") + .version("2.4.0") .timeConf(TimeUnit.SECONDS) .checkValue(v => v > 0, "The value should be a positive time value.") .createWithDefaultString("365d") @@ -1188,6 +1394,7 @@ package object config { ConfigBuilder("spark.scheduler.blacklist.unschedulableTaskSetTimeout") .doc("The timeout in seconds to wait to acquire a new executor and schedule a task " + "before aborting a TaskSet which is unschedulable because of being completely blacklisted.") + .version("2.4.1") .timeConf(TimeUnit.SECONDS) .checkValue(v => v >= 0, "The value should be a non negative time value.") .createWithDefault(120) @@ -1202,6 +1409,7 @@ package object config { "configured max failure times for a job then fail current job submission. 
Note this " + "config only applies to jobs that contain one or more barrier stages, we won't perform " + "the check on non-barrier jobs.") + .version("2.4.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("15s") @@ -1215,6 +1423,7 @@ package object config { "max failure times for a job then fail current job submission. Note this config only " + "applies to jobs that contain one or more barrier stages, we won't perform the check on " + "non-barrier jobs.") + .version("2.4.0") .intConf .checkValue(v => v > 0, "The max failures should be a positive value.") .createWithDefault(40) @@ -1222,18 +1431,21 @@ package object config { private[spark] val UNSAFE_EXCEPTION_ON_MEMORY_LEAK = ConfigBuilder("spark.unsafe.exceptionOnMemoryLeak") .internal() + .version("1.4.0") .booleanConf .createWithDefault(false) private[spark] val UNSAFE_SORTER_SPILL_READ_AHEAD_ENABLED = ConfigBuilder("spark.unsafe.sorter.spill.read.ahead.enabled") .internal() + .version("2.3.0") .booleanConf .createWithDefault(true) private[spark] val UNSAFE_SORTER_SPILL_READER_BUFFER_SIZE = ConfigBuilder("spark.unsafe.sorter.spill.reader.buffer.size") .internal() + .version("2.1.0") .bytesConf(ByteUnit.BYTE) .checkValue(v => 1024 * 1024 <= v && v <= MAX_BUFFER_SIZE_BYTES, s"The value must be in allowed range [1,048,576, ${MAX_BUFFER_SIZE_BYTES}].") @@ -1246,63 +1458,83 @@ package object config { .withPrepended(DEFAULT_PLUGINS_LIST, separator = ",") .doc("Comma-separated list of class names implementing " + "org.apache.spark.api.plugin.SparkPlugin to load into the application.") + .version("3.0.0") .stringConf .toSequence .createWithDefault(Nil) private[spark] val CLEANER_PERIODIC_GC_INTERVAL = ConfigBuilder("spark.cleaner.periodicGC.interval") + .version("1.6.0") .timeConf(TimeUnit.SECONDS) .createWithDefaultString("30min") private[spark] val CLEANER_REFERENCE_TRACKING = ConfigBuilder("spark.cleaner.referenceTracking") + .version("1.0.0") .booleanConf .createWithDefault(true) private[spark] val CLEANER_REFERENCE_TRACKING_BLOCKING = ConfigBuilder("spark.cleaner.referenceTracking.blocking") + .version("1.0.0") .booleanConf .createWithDefault(true) private[spark] val CLEANER_REFERENCE_TRACKING_BLOCKING_SHUFFLE = ConfigBuilder("spark.cleaner.referenceTracking.blocking.shuffle") + .version("1.1.1") .booleanConf .createWithDefault(false) private[spark] val CLEANER_REFERENCE_TRACKING_CLEAN_CHECKPOINTS = ConfigBuilder("spark.cleaner.referenceTracking.cleanCheckpoints") + .version("1.4.0") .booleanConf .createWithDefault(false) private[spark] val EXECUTOR_LOGS_ROLLING_STRATEGY = - ConfigBuilder("spark.executor.logs.rolling.strategy").stringConf.createWithDefault("") + ConfigBuilder("spark.executor.logs.rolling.strategy") + .version("1.1.0") + .stringConf + .createWithDefault("") private[spark] val EXECUTOR_LOGS_ROLLING_TIME_INTERVAL = - ConfigBuilder("spark.executor.logs.rolling.time.interval").stringConf.createWithDefault("daily") + ConfigBuilder("spark.executor.logs.rolling.time.interval") + .version("1.1.0") + .stringConf + .createWithDefault("daily") private[spark] val EXECUTOR_LOGS_ROLLING_MAX_SIZE = ConfigBuilder("spark.executor.logs.rolling.maxSize") + .version("1.4.0") .stringConf .createWithDefault((1024 * 1024).toString) private[spark] val EXECUTOR_LOGS_ROLLING_MAX_RETAINED_FILES = - ConfigBuilder("spark.executor.logs.rolling.maxRetainedFiles").intConf.createWithDefault(-1) + ConfigBuilder("spark.executor.logs.rolling.maxRetainedFiles") + .version("1.1.0") + .intConf + .createWithDefault(-1) private[spark] val 
EXECUTOR_LOGS_ROLLING_ENABLE_COMPRESSION = ConfigBuilder("spark.executor.logs.rolling.enableCompression") + .version("2.0.2") .booleanConf .createWithDefault(false) private[spark] val MASTER_REST_SERVER_ENABLED = ConfigBuilder("spark.master.rest.enabled") + .version("1.3.0") .booleanConf .createWithDefault(false) private[spark] val MASTER_REST_SERVER_PORT = ConfigBuilder("spark.master.rest.port") + .version("1.3.0") .intConf .createWithDefault(6066) private[spark] val MASTER_UI_PORT = ConfigBuilder("spark.master.ui.port") + .version("1.1.0") .intConf .createWithDefault(8080) @@ -1311,6 +1543,7 @@ package object config { .doc("Block size in bytes used in Snappy compression, in the case when " + "Snappy compression codec is used. Lowering this block size " + "will also lower shuffle memory usage when Snappy is used") + .version("1.4.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("32k") @@ -1319,6 +1552,7 @@ package object config { .doc("Block size in bytes used in LZ4 compression, in the case when LZ4 compression" + "codec is used. Lowering this block size will also lower shuffle memory " + "usage when LZ4 is used.") + .version("1.4.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("32k") @@ -1328,6 +1562,7 @@ package object config { "broadcast variables and shuffle outputs. By default, Spark provides four codecs: " + "lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify " + "the codec") + .version("0.8.0") .stringConf .createWithDefaultString("lz4") @@ -1337,6 +1572,7 @@ package object config { "compression codec is used. Lowering this size will lower the shuffle " + "memory usage when Zstd is used, but it might increase the compression " + "cost because of excessive JNI call overhead") + .version("2.3.0") .bytesConf(ByteUnit.BYTE) .createWithDefaultString("32k") @@ -1344,6 +1580,7 @@ package object config { ConfigBuilder("spark.io.compression.zstd.level") .doc("Compression level for Zstd compression codec. Increasing the compression " + "level will result in better compression at the expense of more CPU and memory") + .version("2.3.0") .intConf .createWithDefault(1) @@ -1352,6 +1589,7 @@ package object config { .internal() .doc("If the size in bytes of a file loaded by Spark exceeds this threshold, " + "a warning is logged with the possible reasons.") + .version("3.0.0") .bytesConf(ByteUnit.BYTE) .createWithDefault(1024 * 1024 * 1024) @@ -1360,28 +1598,34 @@ package object config { .doc("The codec used to compress event log. By default, Spark provides four codecs: " + "lz4, lzf, snappy, and zstd. You can also use fully qualified class names to specify " + "the codec. 
If this is not given, spark.io.compression.codec will be used.") + .version("3.0.0") .fallbackConf(IO_COMPRESSION_CODEC) private[spark] val BUFFER_SIZE = ConfigBuilder("spark.buffer.size") + .version("0.5.0") .intConf .checkValue(_ >= 0, "The buffer size must not be negative") .createWithDefault(65536) private[spark] val LOCALITY_WAIT_PROCESS = ConfigBuilder("spark.locality.wait.process") + .version("0.8.0") .fallbackConf(LOCALITY_WAIT) private[spark] val LOCALITY_WAIT_NODE = ConfigBuilder("spark.locality.wait.node") + .version("0.8.0") .fallbackConf(LOCALITY_WAIT) private[spark] val LOCALITY_WAIT_RACK = ConfigBuilder("spark.locality.wait.rack") + .version("0.8.0") .fallbackConf(LOCALITY_WAIT) - private[spark] val REDUCER_MAX_SIZE_IN_FLIGHT = ConfigBuilder("spark.reducer.maxSizeInFlight") + private[spark] val REDUCER_MAX_SIZE_IN_FLIGHT = ConfigBuilder("spark.reducer.maxSizeInFlight") .doc("Maximum size of map outputs to fetch simultaneously from each reduce task, " + "in MiB unless otherwise specified. Since each output requires us to create a " + "buffer to receive it, this represents a fixed memory overhead per reduce task, " + "so keep it small unless you have a large amount of memory") + .version("1.4.0") .bytesConf(ByteUnit.MiB) .createWithDefaultString("48m") @@ -1391,12 +1635,14 @@ package object config { "it might lead to very large number of inbound connections to one or more nodes, " + "causing the workers to fail under load. By allowing it to limit the number of " + "fetch requests, this scenario can be mitigated") + .version("2.0.0") .intConf .createWithDefault(Int.MaxValue) private[spark] val BROADCAST_COMPRESS = ConfigBuilder("spark.broadcast.compress") .doc("Whether to compress broadcast variables before sending them. " + "Generally a good idea. Compression will use spark.io.compression.codec") + .version("0.6.0") .booleanConf.createWithDefault(true) private[spark] val BROADCAST_BLOCKSIZE = ConfigBuilder("spark.broadcast.blockSize") @@ -1404,6 +1650,7 @@ package object config { "KiB unless otherwise specified. Too large a value decreases " + "parallelism during broadcast (makes it slower); however, " + "if it is too small, BlockManager might take a performance hit") + .version("0.5.0") .bytesConf(ByteUnit.KiB) .createWithDefaultString("4m") @@ -1413,12 +1660,15 @@ package object config { "corrupted blocks, at the cost of computing and sending a little " + "more data. It's possible to disable it if the network has other " + "mechanisms to guarantee data won't be corrupted during broadcast") - .booleanConf.createWithDefault(true) + .version("2.1.1") + .booleanConf + .createWithDefault(true) private[spark] val BROADCAST_FOR_UDF_COMPRESSION_THRESHOLD = ConfigBuilder("spark.broadcast.UDFCompressionThreshold") .doc("The threshold at which user-defined functions (UDFs) and Python RDD commands " + "are compressed by broadcast in bytes unless otherwise specified") + .version("3.0.0") .bytesConf(ByteUnit.BYTE) .checkValue(v => v >= 0, "The threshold should be non-negative.") .createWithDefault(1L * 1024 * 1024) @@ -1429,92 +1679,112 @@ package object config { "or StorageLevel.MEMORY_ONLY in Python). Can save substantial " + "space at the cost of some extra CPU time. 
" + "Compression will use spark.io.compression.codec") - .booleanConf.createWithDefault(false) + .version("0.6.0") + .booleanConf + .createWithDefault(false) private[spark] val RDD_PARALLEL_LISTING_THRESHOLD = ConfigBuilder("spark.rdd.parallelListingThreshold") + .version("2.0.0") .intConf .createWithDefault(10) private[spark] val RDD_LIMIT_SCALE_UP_FACTOR = ConfigBuilder("spark.rdd.limit.scaleUpFactor") + .version("2.1.0") .intConf .createWithDefault(4) private[spark] val SERIALIZER = ConfigBuilder("spark.serializer") + .version("0.5.0") .stringConf .createWithDefault("org.apache.spark.serializer.JavaSerializer") private[spark] val SERIALIZER_OBJECT_STREAM_RESET = ConfigBuilder("spark.serializer.objectStreamReset") + .version("1.0.0") .intConf .createWithDefault(100) private[spark] val SERIALIZER_EXTRA_DEBUG_INFO = ConfigBuilder("spark.serializer.extraDebugInfo") + .version("1.3.0") .booleanConf .createWithDefault(true) private[spark] val JARS = ConfigBuilder("spark.jars") + .version("0.9.0") .stringConf .toSequence .createWithDefault(Nil) private[spark] val FILES = ConfigBuilder("spark.files") + .version("1.0.0") .stringConf .toSequence .createWithDefault(Nil) private[spark] val SUBMIT_DEPLOY_MODE = ConfigBuilder("spark.submit.deployMode") + .version("1.5.0") .stringConf .createWithDefault("client") private[spark] val SUBMIT_PYTHON_FILES = ConfigBuilder("spark.submit.pyFiles") + .version("1.0.1") .stringConf .toSequence .createWithDefault(Nil) private[spark] val SCHEDULER_ALLOCATION_FILE = ConfigBuilder("spark.scheduler.allocation.file") + .version("0.8.1") .stringConf .createOptional private[spark] val SCHEDULER_MIN_REGISTERED_RESOURCES_RATIO = ConfigBuilder("spark.scheduler.minRegisteredResourcesRatio") + .version("1.1.1") .doubleConf .createOptional private[spark] val SCHEDULER_MAX_REGISTERED_RESOURCE_WAITING_TIME = ConfigBuilder("spark.scheduler.maxRegisteredResourcesWaitingTime") + .version("1.1.1") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("30s") private[spark] val SCHEDULER_MODE = ConfigBuilder("spark.scheduler.mode") + .version("0.8.0") .stringConf .createWithDefault(SchedulingMode.FIFO.toString) private[spark] val SCHEDULER_REVIVE_INTERVAL = ConfigBuilder("spark.scheduler.revive.interval") + .version("0.8.1") .timeConf(TimeUnit.MILLISECONDS) .createOptional private[spark] val SPECULATION_ENABLED = ConfigBuilder("spark.speculation") + .version("0.6.0") .booleanConf .createWithDefault(false) private[spark] val SPECULATION_INTERVAL = ConfigBuilder("spark.speculation.interval") + .version("0.6.0") .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(100) private[spark] val SPECULATION_MULTIPLIER = ConfigBuilder("spark.speculation.multiplier") + .version("0.6.0") .doubleConf .createWithDefault(1.5) private[spark] val SPECULATION_QUANTILE = ConfigBuilder("spark.speculation.quantile") + .version("0.6.0") .doubleConf .createWithDefault(0.75) @@ -1528,17 +1798,26 @@ package object config { "large enough. E.g. tasks might be re-launched if there are enough successful runs " + "even though the threshold hasn't been reached. 
The number of slots is computed based " + "on the conf values of spark.executor.cores and spark.task.cpus minimum 1.") + .version("3.0.0") .timeConf(TimeUnit.MILLISECONDS) .createOptional private[spark] val STAGING_DIR = ConfigBuilder("spark.yarn.stagingDir") .doc("Staging directory used while submitting applications.") + .version("2.0.0") .stringConf .createOptional private[spark] val BUFFER_PAGESIZE = ConfigBuilder("spark.buffer.pageSize") .doc("The amount of memory used per page in bytes") + .version("1.5.0") .bytesConf(ByteUnit.BYTE) .createOptional + private[spark] val EXECUTOR_ALLOW_SPARK_CONTEXT = + ConfigBuilder("spark.executor.allowSparkContext") + .doc("If set to true, SparkContext can be created in executors.") + .version("3.0.1") + .booleanConf + .createWithDefault(true) } diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala index a619f10bbf064..a5d2c5c88ae03 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala @@ -18,7 +18,7 @@ package org.apache.spark.internal.io import java.text.NumberFormat -import java.util.{Date, Locale} +import java.util.{Date, Locale, UUID} import scala.reflect.ClassTag @@ -70,6 +70,11 @@ object SparkHadoopWriter extends Logging { // Assert the output format/key/value class is set in JobConf. config.assertConf(jobContext, rdd.conf) + // propagate the description UUID into the jobs, so that committers + // get an ID guaranteed to be unique. + jobContext.getConfiguration.set("spark.sql.sources.writeJobUUID", + UUID.randomUUID.toString) + val committer = config.createCommitter(commitJobId) committer.setupJob(jobContext) diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriterUtils.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriterUtils.scala index de828a6d6156e..657842c620f30 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriterUtils.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriterUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.internal.io import java.text.SimpleDateFormat import java.util.{Date, Locale} -import scala.util.DynamicVariable +import scala.util.{DynamicVariable, Random} import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred.{JobConf, JobID} @@ -37,14 +37,35 @@ private[spark] object SparkHadoopWriterUtils { private val RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES = 256 + private val RAND = new Random() + /** + * Create a job ID. + * + * @param time (current) time + * @param id job number + * @return a job ID + */ def createJobID(time: Date, id: Int): JobID = { + if (id < 0) { + throw new IllegalArgumentException("Job number is negative") + } val jobtrackerID = createJobTrackerID(time) new JobID(jobtrackerID, id) } + /** + * Generate an ID for a job tracker. 
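 *
 * A minimal illustrative sketch of the intended effect (the assertions are probabilistic,
 * not guaranteed by the API): IDs created for the same timestamp share the
 * "yyyyMMddHHmmss" prefix but almost surely differ in the random suffix, so two jobs
 * submitted within the same second no longer collide on committer job IDs.
 * {{{
 *   import java.util.Date
 *   val now = new Date()
 *   val a = SparkHadoopWriterUtils.createJobTrackerID(now)
 *   val b = SparkHadoopWriterUtils.createJobTrackerID(now)
 *   assert(a.take(14) == b.take(14))  // same timestamp prefix
 *   assert(a != b)                    // random suffixes almost surely differ
 * }}}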
+ * @param time (current) time + * @return a string for a job ID + */ def createJobTrackerID(time: Date): String = { - new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(time) + val base = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(time) + var l1 = RAND.nextLong() + if (l1 < 0) { + l1 = -l1 + } + base + l1 } def createPathFromString(path: String, conf: JobConf): Path = { diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala index 7c33bce78378d..59b863b89f75a 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala @@ -24,15 +24,18 @@ import com.codahale.metrics.MetricRegistry import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.{SecurityManager, SparkConf} +import org.apache.spark.annotation.Experimental import org.apache.spark.ui.JettyUtils._ /** + * :: Experimental :: * This exposes the metrics of the given registry with Prometheus format. * * The output is consistent with /metrics/json result in terms of item ordering * and with the previous result of Spark JMX Sink + Prometheus JMX Converter combination * in terms of key string format. */ +@Experimental private[spark] class PrometheusServlet( val property: Properties, val registry: MetricRegistry, @@ -53,58 +56,65 @@ private[spark] class PrometheusServlet( def getMetricsSnapshot(request: HttpServletRequest): String = { import scala.collection.JavaConverters._ + val guagesLabel = """{type="gauges"}""" + val countersLabel = """{type="counters"}""" + val metersLabel = countersLabel + val histogramslabels = """{type="histograms"}""" + val timersLabels = """{type="timers"}""" + val sb = new StringBuilder() registry.getGauges.asScala.foreach { case (k, v) => if (!v.getValue.isInstanceOf[String]) { - sb.append(s"${normalizeKey(k)}Value ${v.getValue}\n") + sb.append(s"${normalizeKey(k)}Number$guagesLabel ${v.getValue}\n") + sb.append(s"${normalizeKey(k)}Value$guagesLabel ${v.getValue}\n") } } registry.getCounters.asScala.foreach { case (k, v) => - sb.append(s"${normalizeKey(k)}Count ${v.getCount}\n") + sb.append(s"${normalizeKey(k)}Count$countersLabel ${v.getCount}\n") } registry.getHistograms.asScala.foreach { case (k, h) => val snapshot = h.getSnapshot val prefix = normalizeKey(k) - sb.append(s"${prefix}Count ${h.getCount}\n") - sb.append(s"${prefix}Max ${snapshot.getMax}\n") - sb.append(s"${prefix}Mean ${snapshot.getMean}\n") - sb.append(s"${prefix}Min ${snapshot.getMin}\n") - sb.append(s"${prefix}50thPercentile ${snapshot.getMedian}\n") - sb.append(s"${prefix}75thPercentile ${snapshot.get75thPercentile}\n") - sb.append(s"${prefix}95thPercentile ${snapshot.get95thPercentile}\n") - sb.append(s"${prefix}98thPercentile ${snapshot.get98thPercentile}\n") - sb.append(s"${prefix}99thPercentile ${snapshot.get99thPercentile}\n") - sb.append(s"${prefix}999thPercentile ${snapshot.get999thPercentile}\n") - sb.append(s"${prefix}StdDev ${snapshot.getStdDev}\n") + sb.append(s"${prefix}Count$histogramslabels ${h.getCount}\n") + sb.append(s"${prefix}Max$histogramslabels ${snapshot.getMax}\n") + sb.append(s"${prefix}Mean$histogramslabels ${snapshot.getMean}\n") + sb.append(s"${prefix}Min$histogramslabels ${snapshot.getMin}\n") + sb.append(s"${prefix}50thPercentile$histogramslabels ${snapshot.getMedian}\n") + sb.append(s"${prefix}75thPercentile$histogramslabels ${snapshot.get75thPercentile}\n") + 
sb.append(s"${prefix}95thPercentile$histogramslabels ${snapshot.get95thPercentile}\n") + sb.append(s"${prefix}98thPercentile$histogramslabels ${snapshot.get98thPercentile}\n") + sb.append(s"${prefix}99thPercentile$histogramslabels ${snapshot.get99thPercentile}\n") + sb.append(s"${prefix}999thPercentile$histogramslabels ${snapshot.get999thPercentile}\n") + sb.append(s"${prefix}StdDev$histogramslabels ${snapshot.getStdDev}\n") } registry.getMeters.entrySet.iterator.asScala.foreach { kv => val prefix = normalizeKey(kv.getKey) val meter = kv.getValue - sb.append(s"${prefix}Count ${meter.getCount}\n") - sb.append(s"${prefix}MeanRate ${meter.getMeanRate}\n") - sb.append(s"${prefix}OneMinuteRate ${meter.getOneMinuteRate}\n") - sb.append(s"${prefix}FiveMinuteRate ${meter.getFiveMinuteRate}\n") - sb.append(s"${prefix}FifteenMinuteRate ${meter.getFifteenMinuteRate}\n") + sb.append(s"${prefix}Count$metersLabel ${meter.getCount}\n") + sb.append(s"${prefix}MeanRate$metersLabel ${meter.getMeanRate}\n") + sb.append(s"${prefix}OneMinuteRate$metersLabel ${meter.getOneMinuteRate}\n") + sb.append(s"${prefix}FiveMinuteRate$metersLabel ${meter.getFiveMinuteRate}\n") + sb.append(s"${prefix}FifteenMinuteRate$metersLabel ${meter.getFifteenMinuteRate}\n") } registry.getTimers.entrySet.iterator.asScala.foreach { kv => val prefix = normalizeKey(kv.getKey) val timer = kv.getValue val snapshot = timer.getSnapshot - sb.append(s"${prefix}Count ${timer.getCount}\n") - sb.append(s"${prefix}Max ${snapshot.getMax}\n") - sb.append(s"${prefix}Mean ${snapshot.getMax}\n") - sb.append(s"${prefix}Min ${snapshot.getMin}\n") - sb.append(s"${prefix}50thPercentile ${snapshot.getMedian}\n") - sb.append(s"${prefix}75thPercentile ${snapshot.get75thPercentile}\n") - sb.append(s"${prefix}95thPercentile ${snapshot.get95thPercentile}\n") - sb.append(s"${prefix}98thPercentile ${snapshot.get98thPercentile}\n") - sb.append(s"${prefix}99thPercentile ${snapshot.get99thPercentile}\n") - sb.append(s"${prefix}999thPercentile ${snapshot.get999thPercentile}\n") - sb.append(s"${prefix}StdDev ${snapshot.getStdDev}\n") - sb.append(s"${prefix}FifteenMinuteRate ${timer.getFifteenMinuteRate}\n") - sb.append(s"${prefix}FiveMinuteRate ${timer.getFiveMinuteRate}\n") - sb.append(s"${prefix}OneMinuteRate ${timer.getOneMinuteRate}\n") - sb.append(s"${prefix}MeanRate ${timer.getMeanRate}\n") + sb.append(s"${prefix}Count$timersLabels ${timer.getCount}\n") + sb.append(s"${prefix}Max$timersLabels ${snapshot.getMax}\n") + sb.append(s"${prefix}Mean$timersLabels ${snapshot.getMax}\n") + sb.append(s"${prefix}Min$timersLabels ${snapshot.getMin}\n") + sb.append(s"${prefix}50thPercentile$timersLabels ${snapshot.getMedian}\n") + sb.append(s"${prefix}75thPercentile$timersLabels ${snapshot.get75thPercentile}\n") + sb.append(s"${prefix}95thPercentile$timersLabels ${snapshot.get95thPercentile}\n") + sb.append(s"${prefix}98thPercentile$timersLabels ${snapshot.get98thPercentile}\n") + sb.append(s"${prefix}99thPercentile$timersLabels ${snapshot.get99thPercentile}\n") + sb.append(s"${prefix}999thPercentile$timersLabels ${snapshot.get999thPercentile}\n") + sb.append(s"${prefix}StdDev$timersLabels ${snapshot.getStdDev}\n") + sb.append(s"${prefix}FifteenMinuteRate$timersLabels ${timer.getFifteenMinuteRate}\n") + sb.append(s"${prefix}FiveMinuteRate$timersLabels ${timer.getFiveMinuteRate}\n") + sb.append(s"${prefix}OneMinuteRate$timersLabels ${timer.getOneMinuteRate}\n") + sb.append(s"${prefix}MeanRate$timersLabels ${timer.getMeanRate}\n") } sb.toString() } diff --git 
a/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala b/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala index 2ac6d3c500f9d..d861e91771673 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala @@ -21,6 +21,7 @@ import java.io.File import java.util.Optional import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.resource.ResourceDiscoveryPlugin import org.apache.spark.internal.Logging import org.apache.spark.util.Utils.executeAndGetOutput @@ -28,10 +29,13 @@ import org.apache.spark.util.Utils.executeAndGetOutput /** * The default plugin that is loaded into a Spark application to control how custom * resources are discovered. This executes the discovery script specified by the user - * and gets the json output back and contructs ResourceInformation objects from that. + * and gets the json output back and constructs ResourceInformation objects from that. * If the user specifies custom plugins, this is the last one to be executed and * throws if the resource isn't discovered. + * + * @since 3.0.0 */ +@DeveloperApi class ResourceDiscoveryScriptPlugin extends ResourceDiscoveryPlugin with Logging { override def discoverResource( request: ResourceRequest, diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala b/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala index d5ac41b995559..be056e15b6d03 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceInformation.scala @@ -33,6 +33,8 @@ import org.apache.spark.annotation.Evolving * * @param name the name of the resource * @param addresses an array of strings describing the addresses of the resource + * + * @since 3.0.0 */ @Evolving class ResourceInformation( diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala index 14019d27fc2e6..f3c39d9107e98 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala @@ -34,9 +34,12 @@ import org.apache.spark.internal.config.Python.PYSPARK_EXECUTOR_MEMORY * specify executor and task requirements for an RDD that will get applied during a * stage. This allows the user to change the resource requirements between stages. * This is meant to be immutable so user can't change it after building. + * + * This api is currently private until the rest of the pieces are in place and then it + * will become public. 
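 *
 * A rough sketch of the intended usage once this becomes public (assuming the companion
 * builder API in ResourceProfileBuilder below together with the ExecutorResourceRequests and
 * TaskResourceRequests helpers; the "gpu" name and script path are made up):
 * {{{
 *   val execReqs = new ExecutorResourceRequests().cores(4).resource("gpu", 2, "/opt/getGpus.sh")
 *   val taskReqs = new TaskResourceRequests().cpus(1).resource("gpu", 1)
 *   val profile = new ResourceProfileBuilder().require(execReqs).require(taskReqs).build()
 *   // In later releases this can be attached to an RDD for stage-level scheduling,
 *   // e.g. rdd.withResources(profile).
 * }}}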
*/ @Evolving -class ResourceProfile( +private[spark] class ResourceProfile( val executorResources: Map[String, ExecutorResourceRequest], val taskResources: Map[String, TaskResourceRequest]) extends Serializable with Logging { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala index 0d55c176eeb65..db1c77d7c4d40 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala @@ -29,9 +29,12 @@ import org.apache.spark.annotation.Evolving * A ResourceProfile allows the user to specify executor and task requirements for an RDD * that will get applied during a stage. This allows the user to change the resource * requirements between stages. + * + * This api is currently private until the rest of the pieces are in place and then it + * will become public. */ @Evolving -class ResourceProfileBuilder() { +private[spark] class ResourceProfileBuilder() { private val _taskResources = new ConcurrentHashMap[String, TaskResourceRequest]() private val _executorResources = new ConcurrentHashMap[String, ExecutorResourceRequest]() diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 7dd7fc1b99353..16fe897827c33 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -148,8 +148,13 @@ private[spark] object ResourceUtils extends Logging { def listResourceIds(sparkConf: SparkConf, componentName: String): Seq[ResourceID] = { sparkConf.getAllWithPrefix(s"$componentName.$RESOURCE_PREFIX.").map { case (key, _) => - key.substring(0, key.indexOf('.')) - }.toSet.toSeq.map(name => new ResourceID(componentName, name)) + val index = key.indexOf('.') + if (index < 0) { + throw new SparkException(s"You must specify an amount config for resource: $key " + + s"config: $componentName.$RESOURCE_PREFIX.$key") + } + key.substring(0, index) + }.distinct.map(name => new ResourceID(componentName, name)) } def parseAllResourceRequests( diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala index 49d58929a97a4..a3d27b0d09923 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEndpointRef.scala @@ -108,17 +108,13 @@ private[spark] abstract class RpcEndpointRef(conf: SparkConf) /** * An exception thrown if the RPC is aborted. */ -class RpcAbortException(message: String) extends Exception(message) +private[spark] class RpcAbortException(message: String) extends Exception(message) /** * A wrapper for [[Future]] but add abort method. * This is used in long run RPC and provide an approach to abort the RPC. 
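 *
 * A minimal sketch of a caller (the endpoint ref and the RegisterWorker message are
 * placeholders; askAbortable is the endpoint-ref method whose Netty implementation is
 * updated below, and logWarning assumes a Logging mixin):
 * {{{
 *   val reply = endpointRef.askAbortable[Boolean](RegisterWorker(workerId), timeout)
 *   // Give up early, e.g. because the caller itself is shutting down:
 *   reply.abort(new SparkException("caller is shutting down"))
 *   reply.future.failed.foreach(e => logWarning(s"RPC aborted: $e"))(ThreadUtils.sameThread)
 * }}}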
*/ -private[spark] class AbortableRpcFuture[T: ClassTag]( - future: Future[T], - onAbort: String => Unit) { - - def abort(reason: String): Unit = onAbort(reason) - - def toFuture: Future[T] = future +private[spark] +class AbortableRpcFuture[T: ClassTag](val future: Future[T], onAbort: Throwable => Unit) { + def abort(t: Throwable): Unit = onAbort(t) } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala index 2ed03f7430c32..472401b23fe8e 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala @@ -200,6 +200,16 @@ private[netty] class Inbox(val endpointName: String, val endpoint: RpcEndpoint) * Calls action closure, and calls the endpoint's onError function in the case of exceptions. */ private def safelyCall(endpoint: RpcEndpoint)(action: => Unit): Unit = { + def dealWithFatalError(fatal: Throwable): Unit = { + inbox.synchronized { + assert(numActiveThreads > 0, "The number of active threads should be positive.") + // Should reduce the number of active threads before throw the error. + numActiveThreads -= 1 + } + logError(s"An error happened while processing message in the inbox for $endpointName", fatal) + throw fatal + } + try action catch { case NonFatal(e) => try endpoint.onError(e) catch { @@ -209,8 +219,18 @@ private[netty] class Inbox(val endpointName: String, val endpoint: RpcEndpoint) } else { logError("Ignoring error", ee) } + case fatal: Throwable => + dealWithFatalError(fatal) } + case fatal: Throwable => + dealWithFatalError(fatal) } } + // exposed only for testing + def getNumActiveThreads: Int = { + inbox.synchronized { + inbox.numActiveThreads + } + } } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index 265e158d7c5e3..9259ec7699262 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -208,6 +208,7 @@ private[netty] class NettyRpcEnv( message: RequestMessage, timeout: RpcTimeout): AbortableRpcFuture[T] = { val promise = Promise[Any]() val remoteAddr = message.receiver.address + var rpcMsg: Option[RpcOutboxMessage] = None def onFailure(e: Throwable): Unit = { if (!promise.tryFailure(e)) { @@ -226,8 +227,9 @@ private[netty] class NettyRpcEnv( } } - def onAbort(reason: String): Unit = { - onFailure(new RpcAbortException(reason)) + def onAbort(t: Throwable): Unit = { + onFailure(t) + rpcMsg.foreach(_.onAbort()) } try { @@ -242,10 +244,10 @@ private[netty] class NettyRpcEnv( val rpcMessage = RpcOutboxMessage(message.serialize(this), onFailure, (client, response) => onSuccess(deserialize[Any](client, response))) + rpcMsg = Option(rpcMessage) postToOutbox(message.receiver, rpcMessage) promise.future.failed.foreach { case _: TimeoutException => rpcMessage.onTimeout() - case _: RpcAbortException => rpcMessage.onAbort() case _ => }(ThreadUtils.sameThread) } @@ -270,7 +272,7 @@ private[netty] class NettyRpcEnv( } private[netty] def ask[T: ClassTag](message: RequestMessage, timeout: RpcTimeout): Future[T] = { - askAbortable(message, timeout).toFuture + askAbortable(message, timeout).future } private[netty] def serialize(content: Any): ByteBuffer = { @@ -547,7 +549,7 @@ private[netty] class NettyRpcEndpointRef( } override def ask[T: ClassTag](message: Any, timeout: RpcTimeout): Future[T] = { - askAbortable(message, timeout).toFuture + 
askAbortable(message, timeout).future } override def send(message: Any): Unit = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala index 1bcddaceb3576..5164c30fce0a1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala @@ -64,11 +64,14 @@ private class AsyncEventQueue( // processed (instead of just dequeued). private val eventCount = new AtomicLong() - /** A counter for dropped events. It will be reset every time we log it. */ + /** A counter for dropped events. */ private val droppedEventsCounter = new AtomicLong(0L) + /** A counter to keep number of dropped events last time it was logged */ + @volatile private var lastDroppedEventsCounter: Long = 0L + /** When `droppedEventsCounter` was logged last time in milliseconds. */ - @volatile private var lastReportTimestamp = 0L + private val lastReportTimestamp = new AtomicLong(0L) private val logDroppedEvent = new AtomicBoolean(false) @@ -167,21 +170,19 @@ private class AsyncEventQueue( } logTrace(s"Dropping event $event") - val droppedCount = droppedEventsCounter.get - if (droppedCount > 0) { - // Don't log too frequently - if (System.currentTimeMillis() - lastReportTimestamp >= 60 * 1000) { - // There may be multiple threads trying to decrease droppedEventsCounter. - // Use "compareAndSet" to make sure only one thread can win. - // And if another thread is increasing droppedEventsCounter, "compareAndSet" will fail and - // then that thread will update it. - if (droppedEventsCounter.compareAndSet(droppedCount, 0)) { - val prevLastReportTimestamp = lastReportTimestamp - lastReportTimestamp = System.currentTimeMillis() - val previous = new java.util.Date(prevLastReportTimestamp) - logWarning(s"Dropped $droppedCount events from $name since " + - s"${if (prevLastReportTimestamp == 0) "the application started" else s"$previous"}.") - } + val droppedEventsCount = droppedEventsCounter.get + val droppedCountIncreased = droppedEventsCount - lastDroppedEventsCounter + val lastReportTime = lastReportTimestamp.get + val curTime = System.currentTimeMillis() + // Don't log too frequently + if (droppedCountIncreased > 0 && curTime - lastReportTime >= LOGGING_INTERVAL) { + // There may be multiple threads trying to logging dropped events, + // Use 'compareAndSet' to make sure only one thread can win. 
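        // The same guard in isolation, as an illustrative sketch (this helper is not part of
        // the class and its name is made up): of all threads that observe an expired interval,
        // only the one whose compareAndSet succeeds performs the action.
        def runAtMostOncePerInterval(last: AtomicLong, intervalMs: Long)(action: => Unit): Unit = {
          val prev = last.get()
          val now = System.currentTimeMillis()
          if (now - prev >= intervalMs && last.compareAndSet(prev, now)) {
            action
          }
        }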
+ if (lastReportTimestamp.compareAndSet(lastReportTime, curTime)) { + val previous = new java.util.Date(lastReportTime) + lastDroppedEventsCounter = droppedEventsCount + logWarning(s"Dropped $droppedCountIncreased events from $name since " + + s"${if (lastReportTime == 0) "the application started" else s"$previous"}.") } } } @@ -213,4 +214,5 @@ private object AsyncEventQueue { val POISON_PILL = new SparkListenerEvent() { } + val LOGGING_INTERVAL = 60 * 1000 } diff --git a/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala b/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala index 2274e6898adf6..043c6b90384b4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala @@ -60,6 +60,6 @@ private[spark] object BarrierJobAllocationFailed { val ERROR_MESSAGE_BARRIER_REQUIRE_MORE_SLOTS_THAN_CURRENT_TOTAL_NUMBER = "[SPARK-24819]: Barrier execution mode does not allow run a barrier stage that requires " + "more slots than the total number of slots in the cluster currently. Please init a new " + - "cluster with more CPU cores or repartition the input RDD(s) to reduce the number of " + - "slots required to run this barrier stage." + "cluster with more resources(e.g. CPU, GPU) or repartition the input RDD(s) to reduce " + + "the number of slots required to run this barrier stage." } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 7bf363dd71c1b..b483b52662270 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -168,13 +168,34 @@ private[spark] class DAGScheduler( */ private val cacheLocs = new HashMap[Int, IndexedSeq[Seq[TaskLocation]]] - // For tracking failed nodes, we use the MapOutputTracker's epoch number, which is sent with - // every task. When we detect a node failing, we note the current epoch number and failed - // executor, increment it for new tasks, and use this to ignore stray ShuffleMapTask results. - // - // TODO: Garbage collect information about failure epochs when we know there are no more - // stray messages to detect. - private val failedEpoch = new HashMap[String, Long] + /** + * Tracks the latest epoch of a fully processed error related to the given executor. (We use + * the MapOutputTracker's epoch number, which is sent with every task.) + * + * When an executor fails, it can affect the results of many tasks, and we have to deal with + * all of them consistently. We don't simply ignore all future results from that executor, + * as the failures may have been transient; but we also don't want to "overreact" to follow- + * on errors we receive. Furthermore, we might receive notification of a task success, after + * we find out the executor has actually failed; we'll assume those successes are, in fact, + * simply delayed notifications and the results have been lost, if the tasks started in the + * same or an earlier epoch. In particular, we use this to control when we tell the + * BlockManagerMaster that the BlockManager has been lost. + */ + private val executorFailureEpoch = new HashMap[String, Long] + + /** + * Tracks the latest epoch of a fully processed error where shuffle files have been lost from + * the given executor. + * + * This is closely related to executorFailureEpoch. 
They only differ for the executor when + * there is an external shuffle service serving shuffle files and we haven't been notified that + * the entire worker has been lost. In that case, when an executor is lost, we do not update + * the shuffleFileLostEpoch; we wait for a fetch failure. This way, if only the executor + * fails, we do not unregister the shuffle data as it can still be served; but if there is + * a failure in the shuffle service (resulting in fetch failure), we unregister the shuffle + * data only once, even if we get many fetch failures. + */ + private val shuffleFileLostEpoch = new HashMap[String, Long] private [scheduler] val outputCommitCoordinator = env.outputCommitCoordinator @@ -432,10 +453,12 @@ private[spark] class DAGScheduler( * submission. */ private def checkBarrierStageWithNumSlots(rdd: RDD[_]): Unit = { - val numPartitions = rdd.getNumPartitions - val maxNumConcurrentTasks = sc.maxNumConcurrentTasks - if (rdd.isBarrier() && numPartitions > maxNumConcurrentTasks) { - throw new BarrierJobSlotsNumberCheckFailed(numPartitions, maxNumConcurrentTasks) + if (rdd.isBarrier()) { + val numPartitions = rdd.getNumPartitions + val maxNumConcurrentTasks = sc.maxNumConcurrentTasks + if (numPartitions > maxNumConcurrentTasks) { + throw new BarrierJobSlotsNumberCheckFailed(numPartitions, maxNumConcurrentTasks) + } } } @@ -1431,6 +1454,7 @@ private[spark] class DAGScheduler( // If the whole job has finished, remove it if (job.numFinished == job.numPartitions) { markStageAsFinished(resultStage) + cancelRunningIndependentStages(job, s"Job ${job.jobId} is finished.") cleanupStateForJobAndIndependentStages(job) try { // killAllTaskAttempts will fail if a SchedulerBackend does not implement @@ -1471,7 +1495,8 @@ private[spark] class DAGScheduler( val status = event.result.asInstanceOf[MapStatus] val execId = status.location.executorId logDebug("ShuffleMapTask finished on " + execId) - if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) { + if (executorFailureEpoch.contains(execId) && + smt.epoch <= executorFailureEpoch(execId)) { logInfo(s"Ignoring possibly bogus $smt completion from executor $execId") } else { // The epoch of the task is acceptable (i.e., the task was launched after the most @@ -1817,12 +1842,8 @@ private[spark] class DAGScheduler( * modify the scheduler's internal state. Use executorLost() to post a loss event from outside. * * We will also assume that we've lost all shuffle blocks associated with the executor if the - * executor serves its own blocks (i.e., we're not using external shuffle), the entire slave - * is lost (likely including the shuffle service), or a FetchFailed occurred, in which case we - * presume all shuffle data related to this executor to be lost. - * - * Optionally the epoch during which the failure was caught can be passed to avoid allowing - * stray fetch failures from possibly retriggering the detection of a node as lost. + * executor serves its own blocks (i.e., we're not using an external shuffle service), or the + * entire Standalone worker is lost. */ private[scheduler] def handleExecutorLost( execId: String, @@ -1838,29 +1859,44 @@ private[spark] class DAGScheduler( maybeEpoch = None) } + /** + * Handles removing an executor from the BlockManagerMaster as well as unregistering shuffle + * outputs for the executor or optionally its host. 
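 *
 * A condensed sketch of the two guards applied in the body below (comments stand in for
 * the real actions):
 * {{{
 *   if (!executorFailureEpoch.contains(execId) || executorFailureEpoch(execId) < currentEpoch) {
 *     // first fully-processed failure of this executor at this epoch:
 *     // remove it from the BlockManagerMaster and clear cached block locations
 *   }
 *   if (fileLost &&
 *       (!shuffleFileLostEpoch.contains(execId) || shuffleFileLostEpoch(execId) < currentEpoch)) {
 *     // its shuffle files are also considered gone: unregister map outputs for the
 *     // host (if given) or just this executor, at most once per epoch
 *   }
 * }}}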
+ * + * @param execId executor to be removed + * @param fileLost If true, indicates that we assume we've lost all shuffle blocks associated + * with the executor; this happens if the executor serves its own blocks (i.e., we're not + * using an external shuffle service), the entire Standalone worker is lost, or a FetchFailed + * occurred (in which case we presume all shuffle data related to this executor to be lost). + * @param hostToUnregisterOutputs (optional) executor host if we're unregistering all the + * outputs on the host + * @param maybeEpoch (optional) the epoch during which the failure was caught (this prevents + * reprocessing for follow-on fetch failures) + */ private def removeExecutorAndUnregisterOutputs( execId: String, fileLost: Boolean, hostToUnregisterOutputs: Option[String], maybeEpoch: Option[Long] = None): Unit = { val currentEpoch = maybeEpoch.getOrElse(mapOutputTracker.getEpoch) - if (!failedEpoch.contains(execId) || failedEpoch(execId) < currentEpoch) { - failedEpoch(execId) = currentEpoch - logInfo("Executor lost: %s (epoch %d)".format(execId, currentEpoch)) + logDebug(s"Considering removal of executor $execId; " + + s"fileLost: $fileLost, currentEpoch: $currentEpoch") + if (!executorFailureEpoch.contains(execId) || executorFailureEpoch(execId) < currentEpoch) { + executorFailureEpoch(execId) = currentEpoch + logInfo(s"Executor lost: $execId (epoch $currentEpoch)") blockManagerMaster.removeExecutor(execId) - if (fileLost) { - hostToUnregisterOutputs match { - case Some(host) => - logInfo("Shuffle files lost for host: %s (epoch %d)".format(host, currentEpoch)) - mapOutputTracker.removeOutputsOnHost(host) - case None => - logInfo("Shuffle files lost for executor: %s (epoch %d)".format(execId, currentEpoch)) - mapOutputTracker.removeOutputsOnExecutor(execId) - } - clearCacheLocs() - - } else { - logDebug("Additional executor lost message for %s (epoch %d)".format(execId, currentEpoch)) + clearCacheLocs() + } + if (fileLost && + (!shuffleFileLostEpoch.contains(execId) || shuffleFileLostEpoch(execId) < currentEpoch)) { + shuffleFileLostEpoch(execId) = currentEpoch + hostToUnregisterOutputs match { + case Some(host) => + logInfo(s"Shuffle files lost for host: $host (epoch $currentEpoch)") + mapOutputTracker.removeOutputsOnHost(host) + case None => + logInfo(s"Shuffle files lost for executor: $execId (epoch $currentEpoch)") + mapOutputTracker.removeOutputsOnExecutor(execId) } } } @@ -1886,11 +1922,12 @@ private[spark] class DAGScheduler( } private[scheduler] def handleExecutorAdded(execId: String, host: String): Unit = { - // remove from failedEpoch(execId) ? - if (failedEpoch.contains(execId)) { + // remove from executorFailureEpoch(execId) ? + if (executorFailureEpoch.contains(execId)) { logInfo("Host added was in lost list earlier: " + host) - failedEpoch -= execId + executorFailureEpoch -= execId } + shuffleFileLostEpoch -= execId } private[scheduler] def handleStageCancellation(stageId: Int, reason: Option[String]): Unit = { @@ -1975,18 +2012,12 @@ private[spark] class DAGScheduler( } } - /** Fails a job and all stages that are only used by that job, and cleans up relevant state. */ - private def failJobAndIndependentStages( - job: ActiveJob, - failureReason: String, - exception: Option[Throwable] = None): Unit = { - val error = new SparkException(failureReason, exception.orNull) + /** Cancel all independent, running stages that are only used by this job. 
*/ + private def cancelRunningIndependentStages(job: ActiveJob, reason: String): Boolean = { var ableToCancelStages = true - - // Cancel all independent, running stages. val stages = jobIdToStageIds(job.jobId) if (stages.isEmpty) { - logError("No stages registered for job " + job.jobId) + logError(s"No stages registered for job ${job.jobId}") } stages.foreach { stageId => val jobsForStage: Option[HashSet[Int]] = stageIdToStage.get(stageId).map(_.jobIds) @@ -1998,12 +2029,12 @@ private[spark] class DAGScheduler( if (!stageIdToStage.contains(stageId)) { logError(s"Missing Stage for stage with id $stageId") } else { - // This is the only job that uses this stage, so fail the stage if it is running. + // This stage is only used by the job, so finish the stage if it is running. val stage = stageIdToStage(stageId) if (runningStages.contains(stage)) { try { // cancelTasks will fail if a SchedulerBackend does not implement killTask taskScheduler.cancelTasks(stageId, shouldInterruptTaskThread(job)) - markStageAsFinished(stage, Some(failureReason)) + markStageAsFinished(stage, Some(reason)) } catch { case e: UnsupportedOperationException => logWarning(s"Could not cancel tasks for stage $stageId", e) @@ -2013,11 +2044,19 @@ private[spark] class DAGScheduler( } } } + ableToCancelStages + } - if (ableToCancelStages) { + /** Fails a job and all stages that are only used by that job, and cleans up relevant state. */ + private def failJobAndIndependentStages( + job: ActiveJob, + failureReason: String, + exception: Option[Throwable] = None): Unit = { + if (cancelRunningIndependentStages(job, failureReason)) { // SPARK-15783 important to cleanup state first, just for tests where we have some asserts // against the state. Otherwise we have a *little* bit of flakiness in the tests. cleanupStateForJobAndIndependentStages(job) + val error = new SparkException(failureReason, exception.orNull) job.listener.jobFailed(error) listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobFailed(error))) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 8c23388b37a3d..24e2a5e4d4a62 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -40,7 +40,7 @@ import org.apache.spark.util.{JsonProtocol, Utils} * spark.eventLog.enabled - Whether event logging is enabled. * spark.eventLog.dir - Path to the directory in which events are logged. * spark.eventLog.logBlockUpdates.enabled - Whether to log block updates - * spark.eventLog.logStageExecutorMetrics.enabled - Whether to log stage executor metrics + * spark.eventLog.logStageExecutorMetrics - Whether to log stage executor metrics * * Event log file writer maintains its own parameters: refer the doc of [[EventLogFileWriter]] * and its descendant for more details. 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala index fd04db8c09d76..508c6cebd9fe3 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala @@ -36,4 +36,5 @@ private[spark] class ExecutorResourceInfo( override protected def resourceName = this.name override protected def resourceAddresses = this.addresses override protected def slotsPerAddress: Int = numParts + def totalAddressAmount: Int = resourceAddresses.length * slotsPerAddress } diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index 7f8893ff3b9d8..60052402642f9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -206,6 +206,7 @@ private[spark] class HighlyCompressedMapStatus private ( override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { loc = BlockManagerId(in) + numNonEmptyBlocks = -1 // SPARK-32436 Scala 2.13 doesn't initialize this during deserialization emptyBlocks = new RoaringBitmap() emptyBlocks.readExternal(in) avgSize = in.readLong() diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala index 857c89d7a98f5..15f2161fac39d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala @@ -69,7 +69,7 @@ private[spark] class ResultTask[T, U]( with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { - if (locs == null) Nil else locs.toSet.toSeq + if (locs == null) Nil else locs.distinct } override def runTask(context: TaskContext): U = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala index 9159d2a0158d5..7b76af2f489f9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala @@ -77,7 +77,8 @@ private[spark] trait SchedulerBackend { def getDriverAttributes: Option[Map[String, String]] = None /** - * Get the max number of tasks that can be concurrent launched currently. + * Get the max number of tasks that can be concurrent launched based on the resources + * could be used, even if some of them are being used at the moment. * Note that please don't cache the value returned by this method, because the number can change * due to add/remove executors. 
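 *
 * For reference, the main consumer is the barrier-stage admission check in DAGScheduler
 * (see checkBarrierStageWithNumSlots earlier in this patch), which is roughly:
 * {{{
 *   if (rdd.isBarrier() && rdd.getNumPartitions > sc.maxNumConcurrentTasks) {
 *     throw new BarrierJobSlotsNumberCheckFailed(rdd.getNumPartitions, sc.maxNumConcurrentTasks)
 *   }
 * }}}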
* diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala index 4c0c30a3caf67..a0ba9208ea647 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala @@ -71,7 +71,7 @@ private[spark] class ShuffleMapTask( } @transient private val preferredLocs: Seq[TaskLocation] = { - if (locs == null) Nil else locs.toSet.toSeq + if (locs == null) Nil else locs.distinct } override def runTask(context: TaskContext): MapStatus = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index f25a36c7af22a..46641e5bf5580 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -57,6 +57,11 @@ import org.apache.spark.util.{AccumulatorV2, SystemClock, ThreadUtils, Utils} * * Periodic revival of all offers from the CoarseGrainedSchedulerBackend, to accommodate delay * scheduling * * task-result-getter threads + * + * CAUTION: Any non fatal exception thrown within Spark RPC framework can be swallowed. + * Thus, throwing exception in methods like resourceOffers, statusUpdate won't fail + * the application, but could lead to undefined behavior. Instead, we shall use method like + * TaskSetManger.abort() to abort a stage and then fail the application (SPARK-31485). */ private[spark] class TaskSchedulerImpl( val sc: SparkContext, @@ -356,9 +361,7 @@ private[spark] class TaskSchedulerImpl( // addresses are the same as that we allocated in taskSet.resourceOffer() since it's // synchronized. We don't remove the exact addresses allocated because the current // approach produces the identical result with less time complexity. - availableResources(i).getOrElse(rName, - throw new SparkException(s"Try to acquire resource $rName that doesn't exist.")) - .remove(0, rInfo.addresses.size) + availableResources(i)(rName).remove(0, rInfo.addresses.size) } // Only update hosts for a barrier task. if (taskSet.isBarrier) { @@ -408,7 +411,7 @@ private[spark] class TaskSchedulerImpl( newExecAvail = true } } - val hosts = offers.map(_.host).toSet.toSeq + val hosts = offers.map(_.host).distinct for ((host, Some(rack)) <- hosts.zip(getRacksForHosts(hosts))) { hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += host } @@ -430,7 +433,7 @@ private[spark] class TaskSchedulerImpl( val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK)) val availableResources = shuffledOffers.map(_.resources).toArray val availableCpus = shuffledOffers.map(o => o.cores).toArray - val sortedTaskSets = rootPool.getSortedTaskSetQueue + val sortedTaskSets = rootPool.getSortedTaskSetQueue.filterNot(_.isZombie) for (taskSet <- sortedTaskSets) { logDebug("parentName: %s, name: %s, runningTasks: %s".format( taskSet.parent.name, taskSet.name, taskSet.runningTasks)) @@ -443,7 +446,17 @@ private[spark] class TaskSchedulerImpl( // of locality levels so that it gets a chance to launch local tasks on all of them. 
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY for (taskSet <- sortedTaskSets) { - val availableSlots = availableCpus.map(c => c / CPUS_PER_TASK).sum + // we only need to calculate available slots if using barrier scheduling, otherwise the + // value is -1 + val availableSlots = if (taskSet.isBarrier) { + val availableResourcesAmount = availableResources.map { resourceMap => + // note that the addresses here have been expanded according to the numParts + resourceMap.map { case (name, addresses) => (name, addresses.length) } + } + calculateAvailableSlots(this, availableCpus, availableResourcesAmount) + } else { + -1 + } // Skip the barrier taskSet if the available slots are less than the number of pending tasks. if (taskSet.isBarrier && availableSlots < taskSet.numTasks) { // Skip the launch process. @@ -516,11 +529,18 @@ private[spark] class TaskSchedulerImpl( // Check whether the barrier tasks are partially launched. // TODO SPARK-24818 handle the assert failure case (that can happen when some locality // requirements are not fulfilled, and we should revert the launched tasks). - require(addressesWithDescs.size == taskSet.numTasks, - s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " + - s"because only ${addressesWithDescs.size} out of a total number of " + - s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " + - "been blacklisted or cannot fulfill task locality requirements.") + if (addressesWithDescs.size != taskSet.numTasks) { + val errorMsg = + s"Fail resource offers for barrier stage ${taskSet.stageId} because only " + + s"${addressesWithDescs.size} out of a total number of ${taskSet.numTasks}" + + s" tasks got resource offers. This happens because barrier execution currently " + + s"does not work gracefully with delay scheduling. We highly recommend you to " + + s"disable delay scheduling by setting spark.locality.wait=0 as a workaround if " + + s"you see this error frequently." + logWarning(errorMsg) + taskSet.abort(errorMsg) + throw new SparkException(errorMsg) + } // materialize the barrier coordinator. maybeInitBarrierCoordinator() @@ -582,8 +602,12 @@ private[spark] class TaskSchedulerImpl( if (state == TaskState.LOST) { // TaskState.LOST is only used by the deprecated Mesos fine-grained scheduling mode, // where each executor corresponds to a single task, so mark the executor as failed. - val execId = taskIdToExecutorId.getOrElse(tid, throw new IllegalStateException( - "taskIdToTaskSetManager.contains(tid) <=> taskIdToExecutorId.contains(tid)")) + val execId = taskIdToExecutorId.getOrElse(tid, { + val errorMsg = + "taskIdToTaskSetManager.contains(tid) <=> taskIdToExecutorId.contains(tid)" + taskSet.abort(errorMsg) + throw new SparkException(errorMsg) + }) if (executorIdToRunningTaskIds.contains(execId)) { reason = Some( SlaveLost(s"Task $tid was lost, so marking the executor as lost as well.")) @@ -919,6 +943,30 @@ private[spark] object TaskSchedulerImpl { val SCHEDULER_MODE_PROPERTY = SCHEDULER_MODE.key + /** + * Calculate the max available task slots given the `availableCpus` and `availableResources` + * from a collection of executors. + * + * @param scheduler the TaskSchedulerImpl instance + * @param availableCpus an Array of the amount of available cpus from the executors. + * @param availableResources an Array of the resources map from the executors. In the resource + * map, it maps from the resource name to its amount. 
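 *
 * Illustrative example (assuming spark.task.cpus = 1 and a per-task requirement of 1 gpu):
 * two executors with 4 cores each, one exposing 2 gpu addresses and the other 4, yield
 * {{{
 *   calculateAvailableSlots(scheduler, Array(4, 4), Array(Map("gpu" -> 2), Map("gpu" -> 4)))
 *   // == min(4, 2) + min(4, 4) == 6
 * }}}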
+ * @return the number of max task slots + */ + def calculateAvailableSlots( + scheduler: TaskSchedulerImpl, + availableCpus: Array[Int], + availableResources: Array[Map[String, Int]]): Int = { + val cpusPerTask = scheduler.CPUS_PER_TASK + val resourcesReqsPerTask = scheduler.resourcesReqsPerTask + availableCpus.zip(availableResources).map { case (cpu, resources) => + val cpuNum = cpu / cpusPerTask + resourcesReqsPerTask.map { req => + resources.get(req.resourceName).map(_ / req.amount).getOrElse(0) + }.reduceOption(Math.min).map(_.min(cpuNum)).getOrElse(cpuNum) + }.sum + } + /** * Used to balance containers across hosts. * diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 3b620ec69a9ab..2ce11347ade39 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -229,6 +229,8 @@ private[spark] class TaskSetManager( index: Int, resolveRacks: Boolean = true, speculatable: Boolean = false): Unit = { + // A zombie TaskSetManager may reach here while handling failed task. + if (isZombie) return val pendingTaskSetToAddTo = if (speculatable) pendingSpeculatableTasks else pendingTasks for (loc <- tasks(index).preferredLocations) { loc match { @@ -1082,6 +1084,8 @@ private[spark] class TaskSetManager( } def recomputeLocality(): Unit = { + // A zombie TaskSetManager may reach here while executorLost happens + if (isZombie) return val previousLocalityLevel = myLocalityLevels(currentLocalityIndex) myLocalityLevels = computeValidLocalityLevels() localityWaits = myLocalityLevels.map(getLocalityWait) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 55f4005ef1b45..2a06b1a98e91a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -236,7 +236,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } val data = new ExecutorData(executorRef, executorAddress, hostname, 0, cores, logUrlHandler.applyPattern(logUrls, attributes), attributes, - resourcesInfo, resourceProfileId) + resourcesInfo, resourceProfileId, registrationTs = System.currentTimeMillis()) // This must be synchronized because variables mutated // in this block are read when requesting executors CoarseGrainedSchedulerBackend.this.synchronized { @@ -249,10 +249,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp logDebug(s"Decremented number of pending executors ($numPendingExecutors left)") } } - // Note: some tests expect the reply to come after we put the executor in the map - context.reply(true) listenerBus.post( SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data)) + // Note: some tests expect the reply to come after we put the executor in the map + context.reply(true) } case StopDriver => @@ -557,16 +557,36 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp executorDataMap.keySet.toSeq } + def getExecutorsWithRegistrationTs(): Map[String, Long] = synchronized { + executorDataMap.mapValues(v => v.registrationTs).toMap + } + override def isExecutorActive(id: String): Boolean = synchronized { executorDataMap.contains(id) && 
!executorsPendingToRemove.contains(id) && !executorsPendingLossReason.contains(id) } + /** + * Get the max number of tasks that can be concurrent launched based on the resources + * could be used, even if some of them are being used at the moment. + * Note that please don't cache the value returned by this method, because the number can change + * due to add/remove executors. + * + * @return The max number of tasks that can be concurrent launched currently. + */ override def maxNumConcurrentTasks(): Int = synchronized { - executorDataMap.values.map { executor => - executor.totalCores / scheduler.CPUS_PER_TASK - }.sum + val (cpus, resources) = { + executorDataMap + .filter { case (id, _) => isExecutorActive(id) } + .values.toArray.map { executor => + ( + executor.totalCores, + executor.resourcesInfo.map { case (name, rInfo) => (name, rInfo.totalAddressAmount) } + ) + }.unzip + } + TaskSchedulerImpl.calculateAvailableSlots(scheduler, cpus, resources) } // this function is for testing only diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala index 062146174f6a8..86b44e835368c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala @@ -30,6 +30,7 @@ import org.apache.spark.scheduler.ExecutorResourceInfo * @param totalCores The total number of cores available to the executor * @param resourcesInfo The information of the currently available resources on the executor * @param resourceProfileId The id of the ResourceProfile being used by this executor + * @param registrationTs The registration timestamp of this executor */ private[cluster] class ExecutorData( val executorEndpoint: RpcEndpointRef, @@ -40,6 +41,7 @@ private[cluster] class ExecutorData( override val logUrlMap: Map[String, String], override val attributes: Map[String, String], override val resourcesInfo: Map[String, ExecutorResourceInfo], - override val resourceProfileId: Int + override val resourceProfileId: Int, + val registrationTs: Long ) extends ExecutorInfo(executorHost, totalCores, logUrlMap, attributes, resourcesInfo, resourceProfileId) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index a9b607d8cc38c..1b8b8cb4cf852 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -17,6 +17,7 @@ package org.apache.spark.scheduler.cluster +import java.util.Locale import java.util.concurrent.Semaphore import java.util.concurrent.atomic.AtomicBoolean @@ -216,6 +217,13 @@ private[spark] class StandaloneSchedulerBackend( } } + override def getDriverLogUrls: Option[Map[String, String]] = { + val prefix = "SPARK_DRIVER_LOG_URL_" + val driverLogUrls = sys.env.filterKeys(_.startsWith(prefix)) + .map(e => (e._1.substring(prefix.length).toLowerCase(Locale.ROOT), e._2)).toMap + if (driverLogUrls.nonEmpty) Some(driverLogUrls) else None + } + private def waitForRegistration() = { registrationBarrier.acquire() } diff --git a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala index a24f1902faa31..3664d3ff4d8cf 100644 --- 
a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala @@ -45,12 +45,12 @@ private[spark] class ExecutorMonitor( private val storageTimeoutNs = TimeUnit.SECONDS.toNanos( conf.get(DYN_ALLOCATION_CACHED_EXECUTOR_IDLE_TIMEOUT)) private val shuffleTimeoutNs = TimeUnit.MILLISECONDS.toNanos( - conf.get(DYN_ALLOCATION_SHUFFLE_TIMEOUT)) + conf.get(DYN_ALLOCATION_SHUFFLE_TRACKING_TIMEOUT)) private val fetchFromShuffleSvcEnabled = conf.get(SHUFFLE_SERVICE_ENABLED) && conf.get(SHUFFLE_SERVICE_FETCH_RDD_ENABLED) private val shuffleTrackingEnabled = !conf.get(SHUFFLE_SERVICE_ENABLED) && - conf.get(DYN_ALLOCATION_SHUFFLE_TRACKING) + conf.get(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED) private val executors = new ConcurrentHashMap[String, Tracker]() private val execResourceProfileCount = new ConcurrentHashMap[Int, Int]() diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 55ac2c410953b..cdaab599e2a0b 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -502,7 +502,6 @@ private[serializer] object KryoSerializer { "org.apache.spark.ml.attribute.NumericAttribute", "org.apache.spark.ml.feature.Instance", - "org.apache.spark.ml.feature.InstanceBlock", "org.apache.spark.ml.feature.LabeledPoint", "org.apache.spark.ml.feature.OffsetInstance", "org.apache.spark.ml.linalg.DenseMatrix", diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index af2c82e771970..b7caf4f0aeabf 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -209,6 +209,7 @@ private[spark] class IndexShuffleBlockResolver( } } } finally { + logDebug(s"Shuffle index for mapId $mapId: ${lengths.mkString("[", ",", "]")}") if (indexTmp.exists() && !indexTmp.delete()) { logError(s"Failed to delete temporary index file at ${indexTmp.getAbsolutePath}") } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index a5850fc2ac4b9..dfa4f92f8b87e 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -71,7 +71,7 @@ private[spark] class AppStatusListener( // causing too many writes to the underlying store, and other expensive operations). 
private val liveStages = new ConcurrentHashMap[(Int, Int), LiveStage]() private val liveJobs = new HashMap[Int, LiveJob]() - private val liveExecutors = new HashMap[String, LiveExecutor]() + private[spark] val liveExecutors = new HashMap[String, LiveExecutor]() private val deadExecutors = new HashMap[String, LiveExecutor]() private val liveTasks = new HashMap[Long, LiveTask]() private val liveRDDs = new HashMap[Int, LiveRDD]() @@ -281,10 +281,12 @@ private[spark] class AppStatusListener( // Implicitly blacklist every available executor for the stage associated with this node Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => - val executorIds = liveExecutors.values.filter(_.host == event.hostId).map(_.executorId).toSeq + val executorIds = liveExecutors.values.filter(exec => exec.host == event.hostId + && exec.executorId != SparkContext.DRIVER_IDENTIFIER).map(_.executorId).toSeq setStageBlackListStatus(stage, now, executorIds: _*) } - liveExecutors.values.filter(_.hostname == event.hostId).foreach { exec => + liveExecutors.values.filter(exec => exec.hostname == event.hostId + && exec.executorId != SparkContext.DRIVER_IDENTIFIER).foreach { exec => addBlackListedStageTo(exec, event.stageId, now) } } @@ -333,7 +335,7 @@ private[spark] class AppStatusListener( // Implicitly (un)blacklist every executor associated with the node. liveExecutors.values.foreach { exec => - if (exec.hostname == host) { + if (exec.hostname == host && exec.executorId != SparkContext.DRIVER_IDENTIFIER) { exec.isBlacklisted = blacklisted liveUpdate(exec, now) } @@ -772,6 +774,11 @@ private[spark] class AppStatusListener( event.maxOnHeapMem.foreach { _ => exec.totalOnHeap = event.maxOnHeapMem.get exec.totalOffHeap = event.maxOffHeapMem.get + // SPARK-30594: whenever(first time or re-register) a BlockManager added, all blocks + // from this BlockManager will be reported to driver later. So, we should clean up + // used memory to avoid overlapped count. + exec.usedOnHeap = 0 + exec.usedOffHeap = 0 } exec.isActive = true exec.maxMemory = event.maxMem @@ -1042,7 +1049,7 @@ private[spark] class AppStatusListener( } } - private def updateExecutorMemoryDiskInfo( + private[spark] def updateExecutorMemoryDiskInfo( exec: LiveExecutor, storageLevel: StorageLevel, memoryDelta: Long, diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 6b89812cc2bf0..a4fa256b18adb 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -22,7 +22,7 @@ import java.util.{List => JList} import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap -import org.apache.spark.{JobExecutionStatus, SparkConf} +import org.apache.spark.{JobExecutionStatus, SparkConf, SparkException} import org.apache.spark.status.api.v1 import org.apache.spark.ui.scope._ import org.apache.spark.util.Utils @@ -36,7 +36,14 @@ private[spark] class AppStatusStore( val listener: Option[AppStatusListener] = None) { def applicationInfo(): v1.ApplicationInfo = { - store.view(classOf[ApplicationInfoWrapper]).max(1).iterator().next().info + try { + // The ApplicationInfo may not be available when Spark is starting up. + store.view(classOf[ApplicationInfoWrapper]).max(1).iterator().next().info + } catch { + case _: NoSuchElementException => + throw new NoSuchElementException("Failed to get the application information. 
" + + "If you are starting up Spark, please wait a while until it's ready.") + } } def environmentInfo(): v1.ApplicationEnvironmentInfo = { @@ -512,7 +519,13 @@ private[spark] class AppStatusStore( } def appSummary(): AppSummary = { - store.read(classOf[AppSummary], classOf[AppSummary].getName()) + try { + store.read(classOf[AppSummary], classOf[AppSummary].getName()) + } catch { + case _: NoSuchElementException => + throw new NoSuchElementException("Failed to get the application summary. " + + "If you are starting up Spark, please wait a while until it's ready.") + } } def close(): Unit = { diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index e3046dce34e67..2714f30de14f0 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -245,7 +245,7 @@ private class LiveTask( } -private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveEntity { +private[spark] class LiveExecutor(val executorId: String, _addTime: Long) extends LiveEntity { var hostPort: String = null var host: String = null diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala index f9fb78e65a3d9..9658e5e627724 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala @@ -23,15 +23,19 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} import org.glassfish.jersey.server.ServerProperties import org.glassfish.jersey.servlet.ServletContainer +import org.apache.spark.{SPARK_REVISION, SPARK_VERSION_SHORT} +import org.apache.spark.annotation.Experimental import org.apache.spark.ui.SparkUI /** + * :: Experimental :: * This aims to expose Executor metrics like REST API which is documented in * * https://spark.apache.org/docs/3.0.0/monitoring.html#executor-metrics * * Note that this is based on ExecutorSummary which is different from ExecutorSource. 
*/ +@Experimental @Path("/executors") private[v1] class PrometheusResource extends ApiRequestContext { @GET @@ -39,6 +43,7 @@ private[v1] class PrometheusResource extends ApiRequestContext { @Produces(Array(MediaType.TEXT_PLAIN)) def executors(): String = { val sb = new StringBuilder + sb.append(s"""spark_info{version="$SPARK_VERSION_SHORT", revision="$SPARK_REVISION"} 1.0\n""") val store = uiRoot.asInstanceOf[SparkUI].store store.executorList(true).foreach { executor => val prefix = "metrics_executor_" @@ -47,27 +52,27 @@ private[v1] class PrometheusResource extends ApiRequestContext { "application_name" -> store.applicationInfo.name, "executor_id" -> executor.id ).map { case (k, v) => s"""$k="$v"""" }.mkString("{", ", ", "}") - sb.append(s"${prefix}rddBlocks_Count$labels ${executor.rddBlocks}\n") - sb.append(s"${prefix}memoryUsed_Count$labels ${executor.memoryUsed}\n") - sb.append(s"${prefix}diskUsed_Count$labels ${executor.diskUsed}\n") - sb.append(s"${prefix}totalCores_Count$labels ${executor.totalCores}\n") - sb.append(s"${prefix}maxTasks_Count$labels ${executor.maxTasks}\n") - sb.append(s"${prefix}activeTasks_Count$labels ${executor.activeTasks}\n") - sb.append(s"${prefix}failedTasks_Count$labels ${executor.failedTasks}\n") - sb.append(s"${prefix}completedTasks_Count$labels ${executor.completedTasks}\n") - sb.append(s"${prefix}totalTasks_Count$labels ${executor.totalTasks}\n") - sb.append(s"${prefix}totalDuration_Value$labels ${executor.totalDuration}\n") - sb.append(s"${prefix}totalGCTime_Value$labels ${executor.totalGCTime}\n") - sb.append(s"${prefix}totalInputBytes_Count$labels ${executor.totalInputBytes}\n") - sb.append(s"${prefix}totalShuffleRead_Count$labels ${executor.totalShuffleRead}\n") - sb.append(s"${prefix}totalShuffleWrite_Count$labels ${executor.totalShuffleWrite}\n") - sb.append(s"${prefix}maxMemory_Count$labels ${executor.maxMemory}\n") + sb.append(s"${prefix}rddBlocks$labels ${executor.rddBlocks}\n") + sb.append(s"${prefix}memoryUsed_bytes$labels ${executor.memoryUsed}\n") + sb.append(s"${prefix}diskUsed_bytes$labels ${executor.diskUsed}\n") + sb.append(s"${prefix}totalCores$labels ${executor.totalCores}\n") + sb.append(s"${prefix}maxTasks$labels ${executor.maxTasks}\n") + sb.append(s"${prefix}activeTasks$labels ${executor.activeTasks}\n") + sb.append(s"${prefix}failedTasks_total$labels ${executor.failedTasks}\n") + sb.append(s"${prefix}completedTasks_total$labels ${executor.completedTasks}\n") + sb.append(s"${prefix}totalTasks_total$labels ${executor.totalTasks}\n") + sb.append(s"${prefix}totalDuration_seconds_total$labels ${executor.totalDuration * 0.001}\n") + sb.append(s"${prefix}totalGCTime_seconds_total$labels ${executor.totalGCTime * 0.001}\n") + sb.append(s"${prefix}totalInputBytes_bytes_total$labels ${executor.totalInputBytes}\n") + sb.append(s"${prefix}totalShuffleRead_bytes_total$labels ${executor.totalShuffleRead}\n") + sb.append(s"${prefix}totalShuffleWrite_bytes_total$labels ${executor.totalShuffleWrite}\n") + sb.append(s"${prefix}maxMemory_bytes$labels ${executor.maxMemory}\n") executor.executorLogs.foreach { case (k, v) => } executor.memoryMetrics.foreach { m => - sb.append(s"${prefix}usedOnHeapStorageMemory_Count$labels ${m.usedOnHeapStorageMemory}\n") - sb.append(s"${prefix}usedOffHeapStorageMemory_Count$labels ${m.usedOffHeapStorageMemory}\n") - sb.append(s"${prefix}totalOnHeapStorageMemory_Count$labels ${m.totalOnHeapStorageMemory}\n") - sb.append(s"${prefix}totalOffHeapStorageMemory_Count$labels " + + 
sb.append(s"${prefix}usedOnHeapStorageMemory_bytes$labels ${m.usedOnHeapStorageMemory}\n") + sb.append(s"${prefix}usedOffHeapStorageMemory_bytes$labels ${m.usedOffHeapStorageMemory}\n") + sb.append(s"${prefix}totalOnHeapStorageMemory_bytes$labels ${m.totalOnHeapStorageMemory}\n") + sb.append(s"${prefix}totalOffHeapStorageMemory_bytes$labels " + s"${m.totalOffHeapStorageMemory}\n") } executor.peakMemoryMetrics.foreach { m => @@ -87,14 +92,16 @@ private[v1] class PrometheusResource extends ApiRequestContext { "ProcessTreePythonVMemory", "ProcessTreePythonRSSMemory", "ProcessTreeOtherVMemory", - "ProcessTreeOtherRSSMemory", - "MinorGCCount", - "MinorGCTime", - "MajorGCCount", - "MajorGCTime" + "ProcessTreeOtherRSSMemory" ) names.foreach { name => - sb.append(s"$prefix${name}_Count$labels ${m.getMetricValue(name)}\n") + sb.append(s"$prefix${name}_bytes$labels ${m.getMetricValue(name)}\n") + } + Seq("MinorGCCount", "MajorGCCount").foreach { name => + sb.append(s"$prefix${name}_total$labels ${m.getMetricValue(name)}\n") + } + Seq("MinorGCTime", "MajorGCTime").foreach { name => + sb.append(s"$prefix${name}_seconds_total$labels ${m.getMetricValue(name) * 0.001}\n") } } } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 5ec9b36393764..37db64abb9633 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -127,6 +127,10 @@ private[spark] class ExecutorMetricsJsonDeserializer new TypeReference[Option[Map[String, java.lang.Long]]] {}) metricsMap.map(metrics => new ExecutorMetrics(metrics)) } + + override def getNullValue(ctxt: DeserializationContext): Option[ExecutorMetrics] = { + None + } } /** serializer for peakMemoryMetrics: convert ExecutorMetrics to map with metric name as key */ private[spark] class ExecutorMetricsJsonSerializer @@ -135,11 +139,15 @@ private[spark] class ExecutorMetricsJsonSerializer metrics: Option[ExecutorMetrics], jsonGenerator: JsonGenerator, serializerProvider: SerializerProvider): Unit = { - metrics.foreach { m: ExecutorMetrics => - val metricsMap = ExecutorMetricType.metricToOffset.map { case (metric, _) => - metric -> m.getMetricValue(metric) + if (metrics.isEmpty) { + jsonGenerator.writeNull() + } else { + metrics.foreach { m: ExecutorMetrics => + val metricsMap = ExecutorMetricType.metricToOffset.map { case (metric, _) => + metric -> m.getMetricValue(metric) + } + jsonGenerator.writeObject(metricsMap) } - jsonGenerator.writeObject(metricsMap) } } diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index f0a94d84d8a04..c957ff75a501f 100644 --- a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -154,7 +154,7 @@ private[spark] object TaskIndexNames { private[spark] class TaskDataWrapper( // Storing this as an object actually saves memory; it's also used as the key in the in-memory // store, so in that case you'd save the extra copy of the value here. 
- @KVIndexParam + @KVIndexParam(parent = TaskIndexNames.STAGE) val taskId: JLong, @KVIndexParam(value = TaskIndexNames.TASK_INDEX, parent = TaskIndexNames.STAGE) val index: Int, diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index e335eb6ddb761..e440c1ab7bcd9 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -167,10 +167,12 @@ class BlockManagerMaster( * amount of remaining memory. */ def getMemoryStatus: Map[BlockManagerId, (Long, Long)] = { + if (driverEndpoint == null) return Map.empty driverEndpoint.askSync[Map[BlockManagerId, (Long, Long)]](GetMemoryStatus) } def getStorageStatus: Array[StorageStatus] = { + if (driverEndpoint == null) return Array.empty driverEndpoint.askSync[Array[StorageStatus]](GetStorageStatus) } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 41ef1909cd4c2..d7f7eedc7f33b 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -98,8 +98,13 @@ class BlockManagerMasterEndpoint( case _updateBlockInfo @ UpdateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size) => - context.reply(updateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size)) - listenerBus.post(SparkListenerBlockUpdated(BlockUpdatedInfo(_updateBlockInfo))) + val isSuccess = updateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size) + context.reply(isSuccess) + // SPARK-30594: we should not post `SparkListenerBlockUpdated` when updateBlockInfo + // returns false since the block info would be updated again later. + if (isSuccess) { + listenerBus.post(SparkListenerBlockUpdated(BlockUpdatedInfo(_updateBlockInfo))) + } case GetLocations(blockId) => context.reply(getLocations(blockId)) diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index ee43b76e17010..f2113947f6bf5 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -20,8 +20,6 @@ package org.apache.spark.storage import java.io.{File, IOException} import java.util.UUID -import scala.util.control.NonFatal - import org.apache.spark.SparkConf import org.apache.spark.executor.ExecutorExitCode import org.apache.spark.internal.{config, Logging} @@ -119,38 +117,20 @@ private[spark] class DiskBlockManager(conf: SparkConf, deleteFilesOnStop: Boolea /** Produces a unique block id and File suitable for storing local intermediate results. 
*/ def createTempLocalBlock(): (TempLocalBlockId, File) = { - var blockId = TempLocalBlockId(UUID.randomUUID()) - var tempLocalFile = getFile(blockId) - var count = 0 - while (!canCreateFile(tempLocalFile) && count < Utils.MAX_DIR_CREATION_ATTEMPTS) { - blockId = TempLocalBlockId(UUID.randomUUID()) - tempLocalFile = getFile(blockId) - count += 1 + var blockId = new TempLocalBlockId(UUID.randomUUID()) + while (getFile(blockId).exists()) { + blockId = new TempLocalBlockId(UUID.randomUUID()) } - (blockId, tempLocalFile) + (blockId, getFile(blockId)) } /** Produces a unique block id and File suitable for storing shuffled intermediate results. */ def createTempShuffleBlock(): (TempShuffleBlockId, File) = { - var blockId = TempShuffleBlockId(UUID.randomUUID()) - var tempShuffleFile = getFile(blockId) - var count = 0 - while (!canCreateFile(tempShuffleFile) && count < Utils.MAX_DIR_CREATION_ATTEMPTS) { - blockId = TempShuffleBlockId(UUID.randomUUID()) - tempShuffleFile = getFile(blockId) - count += 1 - } - (blockId, tempShuffleFile) - } - - private def canCreateFile(file: File): Boolean = { - try { - file.createNewFile() - } catch { - case NonFatal(_) => - logError("Failed to create temporary block file: " + file.getAbsoluteFile) - false + var blockId = new TempShuffleBlockId(UUID.randomUUID()) + while (getFile(blockId).exists()) { + blockId = new TempShuffleBlockId(UUID.randomUUID()) } + (blockId, getFile(blockId)) } /** diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index cd4c86006af5a..5efbc0703f729 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -91,8 +91,7 @@ final class ShuffleBlockFetcherIterator( private val targetRemoteRequestSize = math.max(maxBytesInFlight / 5, 1L) /** - * Total number of blocks to fetch. This should be equal to the total number of blocks - * in [[blocksByAddress]] because we already filter out zero-sized blocks in [[blocksByAddress]]. + * Total number of blocks to fetch. 
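createTempLocalBlock and createTempShuffleBlock above go back to drawing random UUID-based block ids until the corresponding file does not already exist. A standalone sketch of that retry loop against a plain directory, not Spark's block-manager layout:

```scala
import java.io.File
import java.util.UUID

object TempFileSketch {
  def createUniqueTempFile(dir: File): File = {
    var candidate = new File(dir, s"temp_${UUID.randomUUID()}")
    // UUID collisions are practically impossible, so this loop almost never iterates.
    while (candidate.exists()) {
      candidate = new File(dir, s"temp_${UUID.randomUUID()}")
    }
    candidate
  }

  def main(args: Array[String]): Unit = {
    println(createUniqueTempFile(new File(System.getProperty("java.io.tmpdir"))).getAbsolutePath)
  }
}
```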
*/ private[this] var numBlocksToFetch = 0 @@ -290,7 +289,6 @@ final class ShuffleBlockFetcherIterator( var localBlockBytes = 0L var hostLocalBlockBytes = 0L var remoteBlockBytes = 0L - var numRemoteBlocks = 0 val hostLocalDirReadingEnabled = blockManager.hostLocalDirManager != null && blockManager.hostLocalDirManager.isDefined @@ -299,25 +297,31 @@ final class ShuffleBlockFetcherIterator( if (address.executorId == blockManager.blockManagerId.executorId) { checkBlockSizes(blockInfos) val mergedBlockInfos = mergeContinuousShuffleBlockIdsIfNeeded( - blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)).to[ArrayBuffer]) + blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)), doBatchFetch) + numBlocksToFetch += mergedBlockInfos.size localBlocks ++= mergedBlockInfos.map(info => (info.blockId, info.mapIndex)) localBlockBytes += mergedBlockInfos.map(_.size).sum } else if (hostLocalDirReadingEnabled && address.host == blockManager.blockManagerId.host) { checkBlockSizes(blockInfos) val mergedBlockInfos = mergeContinuousShuffleBlockIdsIfNeeded( - blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)).to[ArrayBuffer]) + blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)), doBatchFetch) + numBlocksToFetch += mergedBlockInfos.size val blocksForAddress = mergedBlockInfos.map(info => (info.blockId, info.size, info.mapIndex)) hostLocalBlocksByExecutor += address -> blocksForAddress hostLocalBlocks ++= blocksForAddress.map(info => (info._1, info._3)) hostLocalBlockBytes += mergedBlockInfos.map(_.size).sum } else { - numRemoteBlocks += blockInfos.size remoteBlockBytes += blockInfos.map(_._2).sum collectFetchRequests(address, blockInfos, collectedRemoteRequests) } } + val numRemoteBlocks = collectedRemoteRequests.map(_.blocks.size).sum val totalBytes = localBlockBytes + remoteBlockBytes + hostLocalBlockBytes + assert(numBlocksToFetch == localBlocks.size + hostLocalBlocks.size + numRemoteBlocks, + s"The number of non-empty blocks $numBlocksToFetch doesn't equal to the number of local " + + s"blocks ${localBlocks.size} + the number of host-local blocks ${hostLocalBlocks.size} " + + s"+ the number of remote blocks ${numRemoteBlocks}.") logInfo(s"Getting $numBlocksToFetch (${Utils.bytesToString(totalBytes)}) non-empty blocks " + s"including ${localBlocks.size} (${Utils.bytesToString(localBlockBytes)}) local and " + s"${hostLocalBlocks.size} (${Utils.bytesToString(hostLocalBlockBytes)}) " + @@ -325,6 +329,39 @@ final class ShuffleBlockFetcherIterator( collectedRemoteRequests } + private def createFetchRequest( + blocks: Seq[FetchBlockInfo], + address: BlockManagerId): FetchRequest = { + logDebug(s"Creating fetch request of ${blocks.map(_.size).sum} at $address " + + s"with ${blocks.size} blocks") + FetchRequest(address, blocks) + } + + private def createFetchRequests( + curBlocks: Seq[FetchBlockInfo], + address: BlockManagerId, + isLast: Boolean, + collectedRemoteRequests: ArrayBuffer[FetchRequest]): Seq[FetchBlockInfo] = { + val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks, doBatchFetch) + numBlocksToFetch += mergedBlocks.size + var retBlocks = Seq.empty[FetchBlockInfo] + if (mergedBlocks.length <= maxBlocksInFlightPerAddress) { + collectedRemoteRequests += createFetchRequest(mergedBlocks, address) + } else { + mergedBlocks.grouped(maxBlocksInFlightPerAddress).foreach { blocks => + if (blocks.length == maxBlocksInFlightPerAddress || isLast) { + collectedRemoteRequests += createFetchRequest(blocks, address) + } else { + // The last group does not 
exceed `maxBlocksInFlightPerAddress`. Put it back + // to `curBlocks`. + retBlocks = blocks + numBlocksToFetch -= blocks.size + } + } + } + retBlocks + } + private def collectFetchRequests( address: BlockManagerId, blockInfos: Seq[(BlockId, Long, Int)], @@ -333,32 +370,6 @@ final class ShuffleBlockFetcherIterator( var curRequestSize = 0L var curBlocks = new ArrayBuffer[FetchBlockInfo] - def createFetchRequest(blocks: Seq[FetchBlockInfo]): Unit = { - collectedRemoteRequests += FetchRequest(address, blocks) - logDebug(s"Creating fetch request of $curRequestSize at $address " - + s"with ${blocks.size} blocks") - } - - def createFetchRequests(): Unit = { - val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks) - curBlocks = new ArrayBuffer[FetchBlockInfo] - if (mergedBlocks.length <= maxBlocksInFlightPerAddress) { - createFetchRequest(mergedBlocks) - } else { - mergedBlocks.grouped(maxBlocksInFlightPerAddress).foreach { blocks => - if (blocks.length == maxBlocksInFlightPerAddress) { - createFetchRequest(blocks) - } else { - // The last group does not exceed `maxBlocksInFlightPerAddress`. Put it back - // to `curBlocks`. - curBlocks = blocks - numBlocksToFetch -= blocks.size - } - } - } - curRequestSize = curBlocks.map(_.size).sum - } - while (iterator.hasNext) { val (blockId, size, mapIndex) = iterator.next() assertPositiveBlockSize(blockId, size) @@ -367,12 +378,16 @@ final class ShuffleBlockFetcherIterator( // For batch fetch, the actual block in flight should count for merged block. val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= maxBlocksInFlightPerAddress if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) { - createFetchRequests() + curBlocks = createFetchRequests(curBlocks, address, isLast = false, + collectedRemoteRequests).to[ArrayBuffer] + curRequestSize = curBlocks.map(_.size).sum } } // Add in the final request if (curBlocks.nonEmpty) { - createFetchRequests() + curBlocks = createFetchRequests(curBlocks, address, isLast = true, + collectedRemoteRequests).to[ArrayBuffer] + curRequestSize = curBlocks.map(_.size).sum } } @@ -388,73 +403,6 @@ final class ShuffleBlockFetcherIterator( blockInfos.foreach { case (blockId, size, _) => assertPositiveBlockSize(blockId, size) } } - private[this] def mergeContinuousShuffleBlockIdsIfNeeded( - blocks: ArrayBuffer[FetchBlockInfo]): ArrayBuffer[FetchBlockInfo] = { - val result = if (doBatchFetch) { - var curBlocks = new ArrayBuffer[FetchBlockInfo] - val mergedBlockInfo = new ArrayBuffer[FetchBlockInfo] - - def mergeFetchBlockInfo(toBeMerged: ArrayBuffer[FetchBlockInfo]): FetchBlockInfo = { - val startBlockId = toBeMerged.head.blockId.asInstanceOf[ShuffleBlockId] - - // The last merged block may comes from the input, and we can merge more blocks - // into it, if the map id is the same. - def shouldMergeIntoPreviousBatchBlockId = - mergedBlockInfo.last.blockId.asInstanceOf[ShuffleBlockBatchId].mapId == startBlockId.mapId - - val startReduceId = if (mergedBlockInfo.nonEmpty && shouldMergeIntoPreviousBatchBlockId) { - // Remove the previous batch block id as we will add a new one to replace it. 
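createFetchRequests above flushes every full group of maxBlocksInFlightPerAddress blocks as its own request and only flushes a trailing partial group when this is the last batch for the address; otherwise the leftover blocks are handed back to the caller and merged with the next batch. A standalone sketch of that grouping rule (not the patch itself):

```scala
import scala.collection.mutable.ArrayBuffer

object FetchGroupingSketch {
  def group[A](blocks: Seq[A], maxPerRequest: Int, isLast: Boolean,
      requests: ArrayBuffer[Seq[A]]): Seq[A] = {
    var leftover = Seq.empty[A]
    if (blocks.length <= maxPerRequest) {
      requests += blocks
    } else {
      blocks.grouped(maxPerRequest).foreach { g =>
        if (g.length == maxPerRequest || isLast) requests += g else leftover = g
      }
    }
    leftover
  }

  def main(args: Array[String]): Unit = {
    val requests = ArrayBuffer.empty[Seq[Int]]
    val carried = group((1 to 7).toList, maxPerRequest = 3, isLast = false, requests)
    println(requests) // ArrayBuffer(List(1, 2, 3), List(4, 5, 6))
    println(carried)  // List(7), carried over like curBlocks in the patch
  }
}
```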
- mergedBlockInfo.remove(mergedBlockInfo.length - 1).blockId - .asInstanceOf[ShuffleBlockBatchId].startReduceId - } else { - startBlockId.reduceId - } - - FetchBlockInfo( - ShuffleBlockBatchId( - startBlockId.shuffleId, - startBlockId.mapId, - startReduceId, - toBeMerged.last.blockId.asInstanceOf[ShuffleBlockId].reduceId + 1), - toBeMerged.map(_.size).sum, - toBeMerged.head.mapIndex) - } - - val iter = blocks.iterator - while (iter.hasNext) { - val info = iter.next() - // It's possible that the input block id is already a batch ID. For example, we merge some - // blocks, and then make fetch requests with the merged blocks according to "max blocks per - // request". The last fetch request may be too small, and we give up and put the remaining - // merged blocks back to the input list. - if (info.blockId.isInstanceOf[ShuffleBlockBatchId]) { - mergedBlockInfo += info - } else { - if (curBlocks.isEmpty) { - curBlocks += info - } else { - val curBlockId = info.blockId.asInstanceOf[ShuffleBlockId] - val currentMapId = curBlocks.head.blockId.asInstanceOf[ShuffleBlockId].mapId - if (curBlockId.mapId != currentMapId) { - mergedBlockInfo += mergeFetchBlockInfo(curBlocks) - curBlocks.clear() - } - curBlocks += info - } - } - } - if (curBlocks.nonEmpty) { - mergedBlockInfo += mergeFetchBlockInfo(curBlocks) - } - mergedBlockInfo - } else { - blocks - } - // update metrics - numBlocksToFetch += result.size - result - } - /** * Fetch the local blocks while we are fetching remote blocks. This is ok because * `ManagedBuffer`'s memory is allocated lazily when we create the input stream, so all we @@ -903,6 +851,86 @@ private class ShuffleFetchCompletionListener(var data: ShuffleBlockFetcherIterat private[storage] object ShuffleBlockFetcherIterator { + /** + * This function is used to merged blocks when doBatchFetch is true. Blocks which have the + * same `mapId` can be merged into one block batch. The block batch is specified by a range + * of reduceId, which implies the continuous shuffle blocks that we can fetch in a batch. + * For example, input blocks like (shuffle_0_0_0, shuffle_0_0_1, shuffle_0_1_0) can be + * merged into (shuffle_0_0_0_2, shuffle_0_1_0_1), and input blocks like (shuffle_0_0_0_2, + * shuffle_0_0_2, shuffle_0_0_3) can be merged into (shuffle_0_0_0_4). + * + * @param blocks blocks to be merged if possible. May contains already merged blocks. + * @param doBatchFetch whether to merge blocks. + * @return the input blocks if doBatchFetch=false, or the merged blocks if doBatchFetch=true. + */ + def mergeContinuousShuffleBlockIdsIfNeeded( + blocks: Seq[FetchBlockInfo], + doBatchFetch: Boolean): Seq[FetchBlockInfo] = { + val result = if (doBatchFetch) { + var curBlocks = new ArrayBuffer[FetchBlockInfo] + val mergedBlockInfo = new ArrayBuffer[FetchBlockInfo] + + def mergeFetchBlockInfo(toBeMerged: ArrayBuffer[FetchBlockInfo]): FetchBlockInfo = { + val startBlockId = toBeMerged.head.blockId.asInstanceOf[ShuffleBlockId] + + // The last merged block may comes from the input, and we can merge more blocks + // into it, if the map id is the same. + def shouldMergeIntoPreviousBatchBlockId = + mergedBlockInfo.last.blockId.asInstanceOf[ShuffleBlockBatchId].mapId == startBlockId.mapId + + val (startReduceId, size) = + if (mergedBlockInfo.nonEmpty && shouldMergeIntoPreviousBatchBlockId) { + // Remove the previous batch block id as we will add a new one to replace it. 
+ val removed = mergedBlockInfo.remove(mergedBlockInfo.length - 1) + (removed.blockId.asInstanceOf[ShuffleBlockBatchId].startReduceId, + removed.size + toBeMerged.map(_.size).sum) + } else { + (startBlockId.reduceId, toBeMerged.map(_.size).sum) + } + + FetchBlockInfo( + ShuffleBlockBatchId( + startBlockId.shuffleId, + startBlockId.mapId, + startReduceId, + toBeMerged.last.blockId.asInstanceOf[ShuffleBlockId].reduceId + 1), + size, + toBeMerged.head.mapIndex) + } + + val iter = blocks.iterator + while (iter.hasNext) { + val info = iter.next() + // It's possible that the input block id is already a batch ID. For example, we merge some + // blocks, and then make fetch requests with the merged blocks according to "max blocks per + // request". The last fetch request may be too small, and we give up and put the remaining + // merged blocks back to the input list. + if (info.blockId.isInstanceOf[ShuffleBlockBatchId]) { + mergedBlockInfo += info + } else { + if (curBlocks.isEmpty) { + curBlocks += info + } else { + val curBlockId = info.blockId.asInstanceOf[ShuffleBlockId] + val currentMapId = curBlocks.head.blockId.asInstanceOf[ShuffleBlockId].mapId + if (curBlockId.mapId != currentMapId) { + mergedBlockInfo += mergeFetchBlockInfo(curBlocks) + curBlocks.clear() + } + curBlocks += info + } + } + } + if (curBlocks.nonEmpty) { + mergedBlockInfo += mergeFetchBlockInfo(curBlocks) + } + mergedBlockInfo + } else { + blocks + } + result + } + /** * The block information to fetch used in FetchRequest. * @param blockId block id diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 94c99d48e773c..3820a880b2109 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -17,7 +17,7 @@ package org.apache.spark.ui -import java.net.{URI, URL} +import java.net.{URI, URL, URLDecoder} import java.util.EnumSet import javax.servlet.DispatcherType import javax.servlet.http._ @@ -376,8 +376,7 @@ private[spark] object JettyUtils extends Logging { if (baseRequest.isSecure) { return } - val httpsURI = createRedirectURI(scheme, baseRequest.getServerName, securePort, - baseRequest.getRequestURI, baseRequest.getQueryString) + val httpsURI = createRedirectURI(scheme, securePort, baseRequest) response.setContentLength(0) response.sendRedirect(response.encodeRedirectURL(httpsURI)) baseRequest.setHandled(true) @@ -401,17 +400,13 @@ private[spark] object JettyUtils extends Logging { uri.append(rest) } - val rewrittenURI = URI.create(uri.toString()) - if (query != null) { - return new URI( - rewrittenURI.getScheme(), - rewrittenURI.getAuthority(), - rewrittenURI.getPath(), - query, - rewrittenURI.getFragment() - ).normalize() + val queryString = if (query == null) { + "" + } else { + s"?$query" } - rewrittenURI.normalize() + // SPARK-33611: use method `URI.create` to avoid percent-encoding twice on the query string. + URI.create(uri.toString() + queryString).normalize() } def createProxyLocationHeader( @@ -439,16 +434,34 @@ private[spark] object JettyUtils extends Logging { handler.addFilter(holder, "/*", EnumSet.allOf(classOf[DispatcherType])) } + private def decodeURL(url: String, encoding: String): String = { + if (url == null) { + null + } else { + URLDecoder.decode(url, encoding) + } + } + // Create a new URI from the arguments, handling IPv6 host encoding and default ports. 
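The batching described above (for example shuffle_0_0_0 and shuffle_0_0_1 collapsing into shuffle_0_0_0_2) amounts to folding runs of blocks that share a map id into a single reduce-id range. A toy standalone version of that folding, with simplified stand-in types rather than Spark's block ids:

```scala
import scala.collection.mutable.ArrayBuffer

object BatchMergeSketch {
  case class Block(mapId: Long, reduceId: Int)
  case class BlockBatch(mapId: Long, startReduceId: Int, endReduceId: Int) // end is exclusive

  def merge(blocks: Seq[Block]): Seq[BlockBatch] = {
    val merged = ArrayBuffer.empty[BlockBatch]
    var run = List.empty[Block]
    def flush(): Unit = if (run.nonEmpty) {
      merged += BlockBatch(run.head.mapId, run.head.reduceId, run.last.reduceId + 1)
      run = Nil
    }
    blocks.foreach { b =>
      if (run.nonEmpty && run.head.mapId != b.mapId) flush()
      run = run :+ b
    }
    flush()
    merged.toSeq
  }

  def main(args: Array[String]): Unit = {
    // Mirrors the doc example: (shuffle_0_0_0, shuffle_0_0_1, shuffle_0_1_0)
    println(merge(Seq(Block(0, 0), Block(0, 1), Block(1, 0))))
    // yields BlockBatch(0,0,2) and BlockBatch(1,0,1)
  }
}
```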
- private def createRedirectURI( - scheme: String, server: String, port: Int, path: String, query: String) = { + private def createRedirectURI(scheme: String, port: Int, request: Request): String = { + val server = request.getServerName val redirectServer = if (server.contains(":") && !server.startsWith("[")) { s"[${server}]" } else { server } val authority = s"$redirectServer:$port" - new URI(scheme, authority, path, query, null).toString + val queryEncoding = if (request.getQueryEncoding != null) { + request.getQueryEncoding + } else { + // By default decoding the URI as "UTF-8" should be enough for SparkUI + "UTF-8" + } + // The request URL can be raw or encoded here. To avoid the request URL being + // encoded twice, let's decode it here. + val requestURI = decodeURL(request.getRequestURI, queryEncoding) + val queryString = decodeURL(request.getQueryString, queryEncoding) + new URI(scheme, authority, requestURI, queryString, null).toString } def toVirtualHosts(connectors: String*): Array[String] = connectors.map("@" + _).toArray diff --git a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala index 9a3cd53076c43..8ce2065de4cf8 100644 --- a/core/src/main/scala/org/apache/spark/ui/PagedTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/PagedTable.scala @@ -115,17 +115,18 @@ private[spark] trait PagedTable[T] { _dataSource.pageSize } - val pageNavi = pageNavigation(pageToShow, pageSize, totalPages) + val pageNaviTop = pageNavigation(pageToShow, pageSize, totalPages, tableId + "-top") + val pageNaviBottom = pageNavigation(pageToShow, pageSize, totalPages, tableId + "-bottom")
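The JettyUtils change above avoids percent-encoding the redirect target twice: the raw request URI and query string are decoded once, and the multi-argument java.net.URI constructor then re-encodes them exactly once. A self-contained sketch of that round trip (host, port and paths below are made up):

```scala
import java.net.{URI, URLDecoder}

object RedirectSketch {
  def httpsRedirect(serverName: String, securePort: Int, rawPath: String, rawQuery: String): String = {
    val path = URLDecoder.decode(rawPath, "UTF-8")
    val query = if (rawQuery == null) null else URLDecoder.decode(rawQuery, "UTF-8")
    // This URI constructor percent-encodes its components exactly once.
    new URI("https", s"$serverName:$securePort", path, query, null).toString
  }

  def main(args: Array[String]): Unit = {
    // The path is already encoded; passing it to the constructor undecoded would yield "%2520".
    println(httpsRedirect("example.com", 8443, "/jobs/job%20page", "order=desc"))
    // prints https://example.com:8443/jobs/job%20page?order=desc
  }
}
```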

- {pageNavi} + {pageNaviTop} {headers} {data.map(row)}
- {pageNavi} + {pageNaviBottom}
} catch { case e: IndexOutOfBoundsException => @@ -171,7 +172,11 @@ private[spark] trait PagedTable[T] { * > means jumping to the next page. * }}} */ - private[ui] def pageNavigation(page: Int, pageSize: Int, totalPages: Int): Seq[Node] = { + private[ui] def pageNavigation( + page: Int, + pageSize: Int, + totalPages: Int, + navigationId: String = tableId): Seq[Node] = { // A group includes all page numbers will be shown in the page navigation. // The size of group is 10 means there are 10 page numbers will be shown. // The first group is 1 to 10, the second is 2 to 20, and so on @@ -214,7 +219,7 @@ private[spark] trait PagedTable[T] {
-
{totalPages} Pages. Jump to - diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 94c45215b5ff2..87ab5436fa4b5 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -224,13 +224,14 @@ private[spark] object UIUtils extends Logging { - + - + @@ -288,6 +289,7 @@ private[spark] object UIUtils extends Logging { {commonHeaderNodes(request)} + {if (showVisualization) vizHeaderNodes(request) else Seq.empty} {if (useDataTables) dataTablesHeaderNodes(request) else Seq.empty} ' + | 'Status: ${status}
' + | 'Submitted: ${UIUtils.formatDate(new Date(submissionTime))}' + | '${ @@ -108,7 +109,7 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We "" } }">' + - | '${jsEscapedDesc} (Job ${jobId})
' + | '${jsEscapedDescForLabel} (Job ${jobId})
' |} """.stripMargin jobEventJsonAsStr diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 12f1aa25e8d2a..c80b6d13970a9 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -69,7 +69,8 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP // The timeline library treats contents as HTML, so we have to escape them. We need to add // extra layers of escaping in order to embed this in a Javascript string literal. val escapedName = Utility.escape(name) - val jsEscapedName = StringEscapeUtils.escapeEcmaScript(escapedName) + val jsEscapedNameForTooltip = StringEscapeUtils.escapeEcmaScript(Utility.escape(escapedName)) + val jsEscapedNameForLabel = StringEscapeUtils.escapeEcmaScript(escapedName) s""" |{ | 'className': 'stage job-timeline-object ${status}', @@ -78,7 +79,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP | 'end': new Date(${completionTime}), | 'content': '
' + | 'Status: ${status.toUpperCase(Locale.ROOT)}
' + | 'Submitted: ${UIUtils.formatDate(submissionTime)}' + | '${ @@ -88,7 +89,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP "" } }">' + - | '${jsEscapedName} (Stage ${stageId}.${attemptId})
', + | '${jsEscapedNameForLabel} (Stage ${stageId}.${attemptId})', |} """.stripMargin } @@ -279,9 +280,9 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP val pendingOrSkippedTableId = if (isComplete) { - "pending" - } else { "skipped" + } else { + "pending" } val activeStagesTable = diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index 9ace324322947..842ee7aaf49bf 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -21,6 +21,7 @@ import java.util.Objects import scala.collection.mutable import scala.collection.mutable.{ListBuffer, StringBuilder} +import scala.xml.Utility import org.apache.commons.text.StringEscapeUtils @@ -245,8 +246,9 @@ private[spark] object RDDOperationGraph extends Logging { } else { "" } - val label = s"${node.name} [${node.id}]$isCached$isBarrier\n${node.callsite}" - s"""${node.id} [label="${StringEscapeUtils.escapeJava(label)}"]""" + val escapedCallsite = Utility.escape(node.callsite) + val label = s"${node.name} [${node.id}]$isCached$isBarrier
${escapedCallsite}" + s"""${node.id} [labelType="html" label="${StringEscapeUtils.escapeJava(label)}"]""" } /** Update the dot representation of the RDDOperationGraph in cluster to subgraph. */ diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index d2ad14f2a1a96..6ffd6605f75b8 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -18,12 +18,15 @@ package org.apache.spark.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import java.lang.invoke.SerializedLambda +import java.lang.invoke.{MethodHandleInfo, SerializedLambda} +import scala.collection.JavaConverters._ import scala.collection.mutable.{Map, Set, Stack} -import org.apache.xbean.asm7.{ClassReader, ClassVisitor, MethodVisitor, Type} +import org.apache.commons.lang3.ClassUtils +import org.apache.xbean.asm7.{ClassReader, ClassVisitor, Handle, MethodVisitor, Type} import org.apache.xbean.asm7.Opcodes._ +import org.apache.xbean.asm7.tree.{ClassNode, MethodNode} import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.internal.Logging @@ -159,39 +162,6 @@ private[spark] object ClosureCleaner extends Logging { clean(closure, checkSerializable, cleanTransitively, Map.empty) } - /** - * Try to get a serialized Lambda from the closure. - * - * @param closure the closure to check. - */ - private def getSerializedLambda(closure: AnyRef): Option[SerializedLambda] = { - val isClosureCandidate = - closure.getClass.isSynthetic && - closure - .getClass - .getInterfaces.exists(_.getName == "scala.Serializable") - - if (isClosureCandidate) { - try { - Option(inspect(closure)) - } catch { - case e: Exception => - // no need to check if debug is enabled here the Spark - // logging api covers this. - logDebug("Closure is not a serialized lambda.", e) - None - } - } else { - None - } - } - - private def inspect(closure: AnyRef): SerializedLambda = { - val writeReplace = closure.getClass.getDeclaredMethod("writeReplace") - writeReplace.setAccessible(true) - writeReplace.invoke(closure).asInstanceOf[java.lang.invoke.SerializedLambda] - } - /** * Helper method to clean the given closure in place. * @@ -239,12 +209,12 @@ private[spark] object ClosureCleaner extends Logging { cleanTransitively: Boolean, accessedFields: Map[Class[_], Set[String]]): Unit = { - // most likely to be the case with 2.12, 2.13 + // indylambda check. 
Most likely to be the case with 2.12, 2.13 // so we check first // non LMF-closures should be less frequent from now on - val lambdaFunc = getSerializedLambda(func) + val maybeIndylambdaProxy = IndylambdaScalaClosures.getSerializationProxy(func) - if (!isClosure(func.getClass) && lambdaFunc.isEmpty) { + if (!isClosure(func.getClass) && maybeIndylambdaProxy.isEmpty) { logDebug(s"Expected a closure; got ${func.getClass.getName}") return } @@ -256,7 +226,7 @@ private[spark] object ClosureCleaner extends Logging { return } - if (lambdaFunc.isEmpty) { + if (maybeIndylambdaProxy.isEmpty) { logDebug(s"+++ Cleaning closure $func (${func.getClass.getName}) +++") // A list of classes that represents closures enclosed in the given one @@ -300,7 +270,7 @@ private[spark] object ClosureCleaner extends Logging { } } - logDebug(s" + fields accessed by starting closure: " + accessedFields.size) + logDebug(s" + fields accessed by starting closure: ${accessedFields.size} classes") accessedFields.foreach { f => logDebug(" " + f) } // List of outer (class, object) pairs, ordered from outermost to innermost @@ -372,14 +342,64 @@ private[spark] object ClosureCleaner extends Logging { logDebug(s" +++ closure $func (${func.getClass.getName}) is now cleaned +++") } else { - logDebug(s"Cleaning lambda: ${lambdaFunc.get.getImplMethodName}") + val lambdaProxy = maybeIndylambdaProxy.get + val implMethodName = lambdaProxy.getImplMethodName + + logDebug(s"Cleaning indylambda closure: $implMethodName") + + // capturing class is the class that declared this lambda + val capturingClassName = lambdaProxy.getCapturingClass.replace('/', '.') + val classLoader = func.getClass.getClassLoader // this is the safest option + // scalastyle:off classforname + val capturingClass = Class.forName(capturingClassName, false, classLoader) + // scalastyle:on classforname - val captClass = Utils.classForName(lambdaFunc.get.getCapturingClass.replace('/', '.'), - initialize = false, noSparkClassLoader = true) // Fail fast if we detect return statements in closures - getClassReader(captClass) - .accept(new ReturnStatementFinder(Some(lambdaFunc.get.getImplMethodName)), 0) - logDebug(s" +++ Lambda closure (${lambdaFunc.get.getImplMethodName}) is now cleaned +++") + val capturingClassReader = getClassReader(capturingClass) + capturingClassReader.accept(new ReturnStatementFinder(Option(implMethodName)), 0) + + val isClosureDeclaredInScalaRepl = capturingClassName.startsWith("$line") && + capturingClassName.endsWith("$iw") + val outerThisOpt = if (lambdaProxy.getCapturedArgCount > 0) { + Option(lambdaProxy.getCapturedArg(0)) + } else { + None + } + + // only need to clean when there is an enclosing "this" captured by the closure, and it + // should be something cleanable, i.e. a Scala REPL line object + val needsCleaning = isClosureDeclaredInScalaRepl && + outerThisOpt.isDefined && outerThisOpt.get.getClass.getName == capturingClassName + + if (needsCleaning) { + // indylambda closures do not reference enclosing closures via an `$outer` chain, so no + // transitive cleaning on the `$outer` chain is needed. + // Thus clean() shouldn't be recursively called with a non-empty accessedFields. 
+ assert(accessedFields.isEmpty) + + initAccessedFields(accessedFields, Seq(capturingClass)) + IndylambdaScalaClosures.findAccessedFields( + lambdaProxy, classLoader, accessedFields, cleanTransitively) + + logDebug(s" + fields accessed by starting closure: ${accessedFields.size} classes") + accessedFields.foreach { f => logDebug(" " + f) } + + if (accessedFields(capturingClass).size < capturingClass.getDeclaredFields.length) { + // clone and clean the enclosing `this` only when there are fields to null out + + val outerThis = outerThisOpt.get + + logDebug(s" + cloning instance of REPL class $capturingClassName") + val clonedOuterThis = cloneAndSetFields( + parent = null, outerThis, capturingClass, accessedFields) + + val outerField = func.getClass.getDeclaredField("arg$1") + outerField.setAccessible(true) + outerField.set(func, clonedOuterThis) + } + } + + logDebug(s" +++ indylambda closure ($implMethodName) is now cleaned +++") } if (checkSerializable) { @@ -414,6 +434,312 @@ private[spark] object ClosureCleaner extends Logging { } } +private[spark] object IndylambdaScalaClosures extends Logging { + // internal name of java.lang.invoke.LambdaMetafactory + val LambdaMetafactoryClassName = "java/lang/invoke/LambdaMetafactory" + // the method that Scala indylambda use for bootstrap method + val LambdaMetafactoryMethodName = "altMetafactory" + val LambdaMetafactoryMethodDesc = "(Ljava/lang/invoke/MethodHandles$Lookup;" + + "Ljava/lang/String;Ljava/lang/invoke/MethodType;[Ljava/lang/Object;)" + + "Ljava/lang/invoke/CallSite;" + + /** + * Check if the given reference is a indylambda style Scala closure. + * If so (e.g. for Scala 2.12+ closures), return a non-empty serialization proxy + * (SerializedLambda) of the closure; + * otherwise (e.g. for Scala 2.11 closures) return None. + * + * @param maybeClosure the closure to check. + */ + def getSerializationProxy(maybeClosure: AnyRef): Option[SerializedLambda] = { + def isClosureCandidate(cls: Class[_]): Boolean = { + // TODO: maybe lift this restriction to support other functional interfaces in the future + val implementedInterfaces = ClassUtils.getAllInterfaces(cls).asScala + implementedInterfaces.exists(_.getName.startsWith("scala.Function")) + } + + maybeClosure.getClass match { + // shortcut the fast check: + // 1. indylambda closure classes are generated by Java's LambdaMetafactory, and they're + // always synthetic. + // 2. We only care about Serializable closures, so let's check that as well + case c if !c.isSynthetic || !maybeClosure.isInstanceOf[Serializable] => None + + case c if isClosureCandidate(c) => + try { + Option(inspect(maybeClosure)).filter(isIndylambdaScalaClosure) + } catch { + case e: Exception => + logDebug("The given reference is not an indylambda Scala closure.", e) + None + } + + case _ => None + } + } + + def isIndylambdaScalaClosure(lambdaProxy: SerializedLambda): Boolean = { + lambdaProxy.getImplMethodKind == MethodHandleInfo.REF_invokeStatic && + lambdaProxy.getImplMethodName.contains("$anonfun$") + } + + def inspect(closure: AnyRef): SerializedLambda = { + val writeReplace = closure.getClass.getDeclaredMethod("writeReplace") + writeReplace.setAccessible(true) + writeReplace.invoke(closure).asInstanceOf[SerializedLambda] + } + + /** + * Check if the handle represents the LambdaMetafactory that indylambda Scala closures + * use for creating the lambda class and getting a closure instance. 
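getSerializationProxy above leans on the writeReplace method that LambdaMetafactory adds to serializable lambda classes: invoking it reflectively yields the SerializedLambda describing the capturing class and the implementation method. A standalone sketch of that inspection step, assuming a Scala 2.12+ compiler (the printed names are only indicative and depend on the compiler):

```scala
import java.lang.invoke.SerializedLambda

object LambdaInspectionSketch {
  def inspect(closure: AnyRef): Option[SerializedLambda] = {
    try {
      val writeReplace = closure.getClass.getDeclaredMethod("writeReplace")
      writeReplace.setAccessible(true)
      Some(writeReplace.invoke(closure).asInstanceOf[SerializedLambda])
    } catch {
      case _: NoSuchMethodException => None // not an indylambda-style closure
    }
  }

  def main(args: Array[String]): Unit = {
    val f: Int => Int = x => x + 1
    inspect(f).foreach { proxy =>
      println(proxy.getCapturingClass) // e.g. LambdaInspectionSketch$
      println(proxy.getImplMethodName) // e.g. $anonfun$main$1
    }
  }
}
```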
+ */ + def isLambdaMetafactory(bsmHandle: Handle): Boolean = { + bsmHandle.getOwner == LambdaMetafactoryClassName && + bsmHandle.getName == LambdaMetafactoryMethodName && + bsmHandle.getDesc == LambdaMetafactoryMethodDesc + } + + /** + * Check if the handle represents a target method that is: + * - a STATIC method that implements a Scala lambda body in the indylambda style + * - captures the enclosing `this`, i.e. the first argument is a reference to the same type as + * the owning class. + * Returns true if both criteria above are met. + */ + def isLambdaBodyCapturingOuter(handle: Handle, ownerInternalName: String): Boolean = { + handle.getTag == H_INVOKESTATIC && + handle.getName.contains("$anonfun$") && + handle.getOwner == ownerInternalName && + handle.getDesc.startsWith(s"(L$ownerInternalName;") + } + + /** + * Check if the callee of a call site is a inner class constructor. + * - A constructor has to be invoked via INVOKESPECIAL + * - A constructor's internal name is "<init>" and the return type is "V" (void) + * - An inner class' first argument in the signature has to be a reference to the + * enclosing "this", aka `$outer` in Scala. + */ + def isInnerClassCtorCapturingOuter( + op: Int, owner: String, name: String, desc: String, callerInternalName: String): Boolean = { + op == INVOKESPECIAL && name == "" && desc.startsWith(s"(L$callerInternalName;") + } + + /** + * Scans an indylambda Scala closure, along with its lexically nested closures, and populate + * the accessed fields info on which fields on the outer object are accessed. + * + * This is equivalent to getInnerClosureClasses() + InnerClosureFinder + FieldAccessFinder fused + * into one for processing indylambda closures. The traversal order along the call graph is the + * same for all three combined, so they can be fused together easily while maintaining the same + * ordering as the existing implementation. + * + * Precondition: this function expects the `accessedFields` to be populated with all known + * outer classes and their super classes to be in the map as keys, e.g. + * initializing via ClosureCleaner.initAccessedFields. + */ + // scalastyle:off line.size.limit + // Example: run the following code snippet in a Spark Shell w/ Scala 2.12+: + // val topLevelValue = "someValue"; val closure = (j: Int) => { + // class InnerFoo { + // val innerClosure = (x: Int) => (1 to x).map { y => y + topLevelValue } + // } + // val innerFoo = new InnerFoo + // (1 to j).flatMap(innerFoo.innerClosure) + // } + // sc.parallelize(0 to 2).map(closure).collect + // + // produces the following trace-level logs: + // (slightly simplified: + // - omitting the "ignoring ..." 
lines; + // - "$iw" is actually "$line14.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw"; + // - "invokedynamic" lines are simplified to just show the name+desc, omitting the bsm info) + // Cleaning indylambda closure: $anonfun$closure$1$adapted + // scanning $iw.$anonfun$closure$1$adapted(L$iw;Ljava/lang/Object;)Lscala/collection/immutable/IndexedSeq; + // found intra class call to $iw.$anonfun$closure$1(L$iw;I)Lscala/collection/immutable/IndexedSeq; + // scanning $iw.$anonfun$closure$1(L$iw;I)Lscala/collection/immutable/IndexedSeq; + // found inner class $iw$InnerFoo$1 + // found method innerClosure()Lscala/Function1; + // found method $anonfun$innerClosure$2(L$iw$InnerFoo$1;I)Ljava/lang/String; + // found method $anonfun$innerClosure$1(L$iw$InnerFoo$1;I)Lscala/collection/immutable/IndexedSeq; + // found method (L$iw;)V + // found method $anonfun$innerClosure$2$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Ljava/lang/String; + // found method $anonfun$innerClosure$1$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Lscala/collection/immutable/IndexedSeq; + // found method $deserializeLambda$(Ljava/lang/invoke/SerializedLambda;)Ljava/lang/Object; + // found call to outer $iw$InnerFoo$1.innerClosure()Lscala/Function1; + // scanning $iw$InnerFoo$1.innerClosure()Lscala/Function1; + // scanning $iw$InnerFoo$1.$deserializeLambda$(Ljava/lang/invoke/SerializedLambda;)Ljava/lang/Object; + // invokedynamic: lambdaDeserialize(Ljava/lang/invoke/SerializedLambda;)Ljava/lang/Object;, bsm...) + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$1$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Lscala/collection/immutable/IndexedSeq; + // found intra class call to $iw$InnerFoo$1.$anonfun$innerClosure$1(L$iw$InnerFoo$1;I)Lscala/collection/immutable/IndexedSeq; + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$1(L$iw$InnerFoo$1;I)Lscala/collection/immutable/IndexedSeq; + // invokedynamic: apply(L$iw$InnerFoo$1;)Lscala/Function1;, bsm...) + // found inner closure $iw$InnerFoo$1.$anonfun$innerClosure$2$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Ljava/lang/String; (6) + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$2$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Ljava/lang/String; + // found intra class call to $iw$InnerFoo$1.$anonfun$innerClosure$2(L$iw$InnerFoo$1;I)Ljava/lang/String; + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$2(L$iw$InnerFoo$1;I)Ljava/lang/String; + // found call to outer $iw.topLevelValue()Ljava/lang/String; + // scanning $iw.topLevelValue()Ljava/lang/String; + // found field access topLevelValue on $iw + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$2$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Ljava/lang/String; + // found intra class call to $iw$InnerFoo$1.$anonfun$innerClosure$2(L$iw$InnerFoo$1;I)Ljava/lang/String; + // scanning $iw$InnerFoo$1.(L$iw;)V + // invokedynamic: apply(L$iw$InnerFoo$1;)Lscala/Function1;, bsm...) + // found inner closure $iw$InnerFoo$1.$anonfun$innerClosure$1$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Lscala/collection/immutable/IndexedSeq; (6) + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$1(L$iw$InnerFoo$1;I)Lscala/collection/immutable/IndexedSeq; + // invokedynamic: apply(L$iw$InnerFoo$1;)Lscala/Function1;, bsm...) 
+ // found inner closure $iw$InnerFoo$1.$anonfun$innerClosure$2$adapted(L$iw$InnerFoo$1;Ljava/lang/Object;)Ljava/lang/String; (6) + // scanning $iw$InnerFoo$1.$anonfun$innerClosure$2(L$iw$InnerFoo$1;I)Ljava/lang/String; + // found call to outer $iw.topLevelValue()Ljava/lang/String; + // scanning $iw$InnerFoo$1.innerClosure()Lscala/Function1; + // + fields accessed by starting closure: 2 classes + // (class java.lang.Object,Set()) + // (class $iw,Set(topLevelValue)) + // + cloning instance of REPL class $iw + // +++ indylambda closure ($anonfun$closure$1$adapted) is now cleaned +++ + // + // scalastyle:on line.size.limit + def findAccessedFields( + lambdaProxy: SerializedLambda, + lambdaClassLoader: ClassLoader, + accessedFields: Map[Class[_], Set[String]], + findTransitively: Boolean): Unit = { + + // We may need to visit the same class multiple times for different methods on it, and we'll + // need to lookup by name. So we use ASM's Tree API and cache the ClassNode/MethodNode. + val classInfoByInternalName = Map.empty[String, (Class[_], ClassNode)] + val methodNodeById = Map.empty[MethodIdentifier[_], MethodNode] + def getOrUpdateClassInfo(classInternalName: String): (Class[_], ClassNode) = { + val classInfo = classInfoByInternalName.getOrElseUpdate(classInternalName, { + val classExternalName = classInternalName.replace('/', '.') + // scalastyle:off classforname + val clazz = Class.forName(classExternalName, false, lambdaClassLoader) + // scalastyle:on classforname + val classNode = new ClassNode() + val classReader = ClosureCleaner.getClassReader(clazz) + classReader.accept(classNode, 0) + + for (m <- classNode.methods.asScala) { + methodNodeById(MethodIdentifier(clazz, m.name, m.desc)) = m + } + + (clazz, classNode) + }) + classInfo + } + + val implClassInternalName = lambdaProxy.getImplClass + val (implClass, _) = getOrUpdateClassInfo(implClassInternalName) + + val implMethodId = MethodIdentifier( + implClass, lambdaProxy.getImplMethodName, lambdaProxy.getImplMethodSignature) + + // The set internal names of classes that we would consider following the calls into. + // Candidates are: known outer class which happens to be the starting closure's impl class, + // and all inner classes discovered below. + // Note that code in an inner class can make calls to methods in any of its enclosing classes, + // e.g. + // starting closure (in class T) + // inner class A + // inner class B + // inner closure + // we need to track calls from "inner closure" to outer classes relative to it (class T, A, B) + // to better find and track field accesses. + val trackedClassInternalNames = Set[String](implClassInternalName) + + // Depth-first search for inner closures and track the fields that were accessed in them. + // Start from the lambda body's implementation method, follow method invocations + val visited = Set.empty[MethodIdentifier[_]] + val stack = Stack[MethodIdentifier[_]](implMethodId) + def pushIfNotVisited(methodId: MethodIdentifier[_]): Unit = { + if (!visited.contains(methodId)) { + stack.push(methodId) + } + } + + while (!stack.isEmpty) { + val currentId = stack.pop + visited += currentId + + val currentClass = currentId.cls + val currentMethodNode = methodNodeById(currentId) + logTrace(s" scanning ${currentId.cls.getName}.${currentId.name}${currentId.desc}") + currentMethodNode.accept(new MethodVisitor(ASM7) { + val currentClassName = currentClass.getName + val currentClassInternalName = currentClassName.replace('.', '/') + + // Find and update the accessedFields info. 
Only fields on known outer classes are tracked. + // This is the FieldAccessFinder equivalent. + override def visitFieldInsn(op: Int, owner: String, name: String, desc: String): Unit = { + if (op == GETFIELD || op == PUTFIELD) { + val ownerExternalName = owner.replace('/', '.') + for (cl <- accessedFields.keys if cl.getName == ownerExternalName) { + logTrace(s" found field access $name on $ownerExternalName") + accessedFields(cl) += name + } + } + } + + override def visitMethodInsn( + op: Int, owner: String, name: String, desc: String, itf: Boolean): Unit = { + val ownerExternalName = owner.replace('/', '.') + if (owner == currentClassInternalName) { + logTrace(s" found intra class call to $ownerExternalName.$name$desc") + // could be invoking a helper method or a field accessor method, just follow it. + pushIfNotVisited(MethodIdentifier(currentClass, name, desc)) + } else if (isInnerClassCtorCapturingOuter( + op, owner, name, desc, currentClassInternalName)) { + // Discover inner classes. + // This is the InnerClassFinder equivalent for inner classes, which still use the + // `$outer` chain. So this is NOT controlled by the `findTransitively` flag. + logDebug(s" found inner class $ownerExternalName") + val innerClassInfo = getOrUpdateClassInfo(owner) + val innerClass = innerClassInfo._1 + val innerClassNode = innerClassInfo._2 + trackedClassInternalNames += owner + // We need to visit all methods on the inner class so that we don't miss anything. + for (m <- innerClassNode.methods.asScala) { + logTrace(s" found method ${m.name}${m.desc}") + pushIfNotVisited(MethodIdentifier(innerClass, m.name, m.desc)) + } + } else if (findTransitively && trackedClassInternalNames.contains(owner)) { + logTrace(s" found call to outer $ownerExternalName.$name$desc") + val (calleeClass, _) = getOrUpdateClassInfo(owner) // make sure MethodNodes are cached + pushIfNotVisited(MethodIdentifier(calleeClass, name, desc)) + } else { + // keep the same behavior as the original ClosureCleaner + logTrace(s" ignoring call to $ownerExternalName.$name$desc") + } + } + + // Find the lexically nested closures + // This is the InnerClosureFinder equivalent for indylambda nested closures + override def visitInvokeDynamicInsn( + name: String, desc: String, bsmHandle: Handle, bsmArgs: Object*): Unit = { + logTrace(s" invokedynamic: $name$desc, bsmHandle=$bsmHandle, bsmArgs=$bsmArgs") + + // fast check: we only care about Scala lambda creation + // TODO: maybe lift this restriction and support other functional interfaces + if (!name.startsWith("apply")) return + if (!Type.getReturnType(desc).getDescriptor.startsWith("Lscala/Function")) return + + if (isLambdaMetafactory(bsmHandle)) { + // OK we're in the right bootstrap method for serializable Java 8 style lambda creation + val targetHandle = bsmArgs(1).asInstanceOf[Handle] + if (isLambdaBodyCapturingOuter(targetHandle, currentClassInternalName)) { + // this is a lexically nested closure that also captures the enclosing `this` + logDebug(s" found inner closure $targetHandle") + val calleeMethodId = + MethodIdentifier(currentClass, targetHandle.getName, targetHandle.getDesc) + pushIfNotVisited(calleeMethodId) + } + } + } + }) + } + } +} + private[spark] class ReturnStatementInClosureException extends SparkException("Return statements aren't allowed in Spark closures") @@ -422,7 +748,7 @@ private class ReturnStatementFinder(targetMethodName: Option[String] = None) override def visitMethod(access: Int, name: String, desc: String, sig: String, exceptions: Array[String]):
MethodVisitor = { - // $anonfun$ covers Java 8 lambdas + // $anonfun$ covers indylambda closures if (name.contains("apply") || name.contains("$anonfun$")) { // A method with suffix "$adapted" will be generated in cases like // { _:Int => return; Seq()} but not { _:Int => return; true} diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 4d89c4f079f29..78fbd0cadc5c3 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -22,7 +22,7 @@ import java.util.{Properties, UUID} import scala.collection.JavaConverters._ import scala.collection.Map -import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.json4s.DefaultFormats import org.json4s.JsonAST._ @@ -59,6 +59,7 @@ private[spark] object JsonProtocol { private implicit val format = DefaultFormats private val mapper = new ObjectMapper().registerModule(DefaultScalaModule) + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) /** ------------------------------------------------- * * JSON serialization methods for SparkListenerEvents | @@ -350,12 +351,22 @@ private[spark] object JsonProtocol { case v: Long => JInt(v) // We only have 3 kind of internal accumulator types, so if it's not int or long, it must be // the blocks accumulator, whose type is `java.util.List[(BlockId, BlockStatus)]` - case v => - JArray(v.asInstanceOf[java.util.List[(BlockId, BlockStatus)]].asScala.toList.map { - case (id, status) => - ("Block ID" -> id.toString) ~ - ("Status" -> blockStatusToJson(status)) + case v: java.util.List[_] => + JArray(v.asScala.toList.flatMap { + case (id: BlockId, status: BlockStatus) => + Some( + ("Block ID" -> id.toString) ~ + ("Status" -> blockStatusToJson(status)) + ) + case _ => + // Ignore unsupported types. A user may put `METRICS_PREFIX` in the name. We should + // not crash. + None }) + case _ => + // Ignore unsupported types. A user may put `METRICS_PREFIX` in the name. We should not + // crash. + JNothing } } else { // For all external accumulators, just use strings @@ -474,6 +485,7 @@ private[spark] object JsonProtocol { ("Callsite" -> rddInfo.callSite) ~ ("Parent IDs" -> parentIds) ~ ("Storage Level" -> storageLevel) ~ + ("Barrier" -> rddInfo.isBarrier) ~ ("Number of Partitions" -> rddInfo.numPartitions) ~ ("Number of Cached Partitions" -> rddInfo.numCachedPartitions) ~ ("Memory Size" -> rddInfo.memSize) ~ @@ -981,7 +993,14 @@ private[spark] object JsonProtocol { val blockManagerAddress = blockManagerIdFromJson(json \ "Block Manager Address") val shuffleId = (json \ "Shuffle ID").extract[Int] val mapId = (json \ "Map ID").extract[Long] - val mapIndex = (json \ "Map Index").extract[Int] + val mapIndex = json \ "Map Index" match { + case JNothing => + // Note, we use the invalid value Int.MinValue here to fill the map index for backward + // compatibility. Otherwise, the fetch failed event will be dropped when the history + // server loads the event log written by the Spark version before 3.0. 
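// (Editor's aside, not part of the upstream patch.) A minimal json4s sketch of the fallback explained above,
// using a hypothetical pre-3.0 event fragment that lacks the "Map Index" field; names and values here are
// illustrative only:
//   import org.json4s._
//   import org.json4s.jackson.JsonMethods.parse
//   implicit val formats: Formats = DefaultFormats
//   val oldEvent = parse("""{"Shuffle ID": 1, "Map ID": 2, "Reduce ID": 0}""")
//   (oldEvent \ "Map Index") == JNothing        // true, so the code below falls back to Int.MinValue
//   val newEvent = parse("""{"Map Index": 7}""")
//   (newEvent \ "Map Index").extract[Int]       // 7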
+ Int.MinValue + case x => x.extract[Int] + } val reduceId = (json \ "Reduce ID").extract[Int] val message = jsonOption(json \ "Message").map(_.extract[String]) new FetchFailed(blockManagerAddress, shuffleId, mapId, mapIndex, reduceId, diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index 4f1311224bb95..4db268604a3e9 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -209,9 +209,7 @@ private [util] class SparkShutdownHookManager { private class SparkShutdownHook(private val priority: Int, hook: () => Unit) extends Comparable[SparkShutdownHook] { - override def compareTo(other: SparkShutdownHook): Int = { - other.priority - priority - } + override def compareTo(other: SparkShutdownHook): Int = other.priority.compareTo(priority) def run(): Unit = hook() diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index de39e4b410f25..78206c51c1028 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -18,6 +18,7 @@ package org.apache.spark.util import java.util.concurrent._ +import java.util.concurrent.{Future => JFuture} import java.util.concurrent.locks.ReentrantLock import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor, Future} @@ -28,7 +29,6 @@ import scala.util.control.NonFatal import com.google.common.util.concurrent.ThreadFactoryBuilder import org.apache.spark.SparkException -import org.apache.spark.rpc.RpcAbortException private[spark] object ThreadUtils { @@ -298,12 +298,28 @@ private[spark] object ThreadUtils { // TimeoutException and RpcAbortException are thrown in the current thread, so no need to wrap // the exception. case NonFatal(t) - if !t.isInstanceOf[TimeoutException] && !t.isInstanceOf[RpcAbortException] => + if !t.isInstanceOf[TimeoutException] => throw new SparkException("Exception thrown in awaitResult: ", t) } } // scalastyle:on awaitresult + @throws(classOf[SparkException]) + def awaitResult[T](future: JFuture[T], atMost: Duration): T = { + try { + atMost match { + case Duration.Inf => future.get() + case _ => future.get(atMost._1, atMost._2) + } + } catch { + case e: SparkFatalException => + throw e.throwable + case NonFatal(t) + if !t.isInstanceOf[TimeoutException] => + throw new SparkException("Exception thrown in awaitResult: ", t) + } + } + // scalastyle:off awaitready /** * Preferred alternative to `Await.ready()`. diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 297cc5e4cb100..867cd1983be8f 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -95,7 +95,7 @@ private[spark] object Utils extends Logging { */ val DEFAULT_DRIVER_MEM_MB = JavaUtils.DEFAULT_DRIVER_MEM_MB.toInt - val MAX_DIR_CREATION_ATTEMPTS: Int = 10 + private val MAX_DIR_CREATION_ATTEMPTS: Int = 10 @volatile private var localRootDirs: Array[String] = null /** Scheme used for files that are locally available on worker nodes in the cluster. */ @@ -1879,7 +1879,9 @@ private[spark] object Utils extends Logging { * Indicates whether Spark is currently running unit tests.
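 * (Editor's aside, not part of the upstream patch.) A minimal sketch of the allocation-free check
 * that the new body below switches to, assuming IS_TESTING.key is "spark.testing":
 *   System.getenv("SPARK_TESTING") != null || System.getProperty("spark.testing") != null
 * whereas the previous sys.env.contains("SPARK_TESTING") built a fresh immutable Scala Map around
 * System.getenv() on every call, which is the garbage the change avoids.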
*/ def isTesting: Boolean = { - sys.env.contains("SPARK_TESTING") || sys.props.contains(IS_TESTING.key) + // Scala's `sys.env` creates a ton of garbage by constructing Scala immutable maps, so + // we directly use the Java APIs instead. + System.getenv("SPARK_TESTING") != null || System.getProperty(IS_TESTING.key) != null } /** @@ -2546,28 +2548,6 @@ private[spark] object Utils extends Logging { new File(path.getAbsolutePath + "." + UUID.randomUUID()) } - /** - * Given a process id, return true if the process is still running. - */ - def isProcessRunning(pid: Int): Boolean = { - val process = executeCommand(Seq("kill", "-0", pid.toString)) - process.waitFor(10, TimeUnit.SECONDS) - process.exitValue() == 0 - } - - /** - * Returns the pid of this JVM process. - */ - def getProcessId: Int = { - val PROCESS = "(\\d+)@(.*)".r - val name = getProcessName() - name match { - case PROCESS(pid, _) => pid.toInt - case _ => - throw new SparkException(s"Unexpected process name: $name, expected to be PID@hostname.") - } - } - /** * Returns the name of this JVM process. This is OS dependent but typically (OSX, Linux, Windows), * this is formatted as PID@hostname. @@ -2772,19 +2752,16 @@ private[spark] object Utils extends Logging { } val masterScheme = new URI(masterWithoutK8sPrefix).getScheme - val resolvedURL = masterScheme.toLowerCase(Locale.ROOT) match { - case "https" => + + val resolvedURL = Option(masterScheme).map(_.toLowerCase(Locale.ROOT)) match { + case Some("https") => masterWithoutK8sPrefix - case "http" => + case Some("http") => logWarning("Kubernetes master URL uses HTTP instead of HTTPS.") masterWithoutK8sPrefix - case null => - val resolvedURL = s"https://$masterWithoutK8sPrefix" - logInfo("No scheme specified for kubernetes master URL, so defaulting to https. 
Resolved " + - s"URL is $resolvedURL.") - resolvedURL case _ => - throw new IllegalArgumentException("Invalid Kubernetes master scheme: " + masterScheme) + throw new IllegalArgumentException("Invalid Kubernetes master scheme: " + masterScheme + + " found in URL: " + masterWithoutK8sPrefix) } s"k8s://$resolvedURL" diff --git a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java index 6e995a3929a75..ab177cefa1cca 100644 --- a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java +++ b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java @@ -561,6 +561,8 @@ public void spillInIterator() throws IOException { iter2.next(); } assertFalse(iter2.hasNext()); + // calls hasNext twice deliberately, make sure it's idempotent + assertFalse(iter2.hasNext()); } finally { map.free(); for (File spillFile : spillFilesCreated) { diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index 43977717f6c97..9904a0a113fdd 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -23,7 +23,6 @@ import java.util.LinkedList; import java.util.UUID; -import org.hamcrest.Matchers; import scala.Tuple2$; import org.junit.After; @@ -38,7 +37,6 @@ import org.apache.spark.executor.TaskMetrics; import org.apache.spark.internal.config.package$; import org.apache.spark.memory.TestMemoryManager; -import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.serializer.JavaSerializer; import org.apache.spark.serializer.SerializerInstance; @@ -359,6 +357,39 @@ public void forcedSpillingWithReadIterator() throws Exception { assertSpillFilesWereCleanedUp(); } + @Test + public void forcedSpillingNullsWithReadIterator() throws Exception { + final UnsafeExternalSorter sorter = newSorter(); + long[] record = new long[100]; + final int recordSize = record.length * 8; + final int n = (int) pageSizeBytes / recordSize * 3; + for (int i = 0; i < n; i++) { + boolean isNull = i % 2 == 0; + sorter.insertRecord(record, Platform.LONG_ARRAY_OFFSET, recordSize, 0, isNull); + } + assertTrue(sorter.getNumberOfAllocatedPages() >= 2); + + UnsafeExternalSorter.SpillableIterator iter = + (UnsafeExternalSorter.SpillableIterator) sorter.getSortedIterator(); + final int numRecordsToReadBeforeSpilling = n / 3; + for (int i = 0; i < numRecordsToReadBeforeSpilling; i++) { + assertTrue(iter.hasNext()); + iter.loadNext(); + } + + assertTrue(iter.spill() > 0); + assertEquals(0, iter.spill()); + + for (int i = numRecordsToReadBeforeSpilling; i < n; i++) { + assertTrue(iter.hasNext()); + iter.loadNext(); + } + assertFalse(iter.hasNext()); + + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); + } + @Test public void forcedSpillingWithNotReadIterator() throws Exception { final UnsafeExternalSorter sorter = newSorter(); @@ -518,40 +549,28 @@ public void testGetIterator() throws Exception { } @Test - public void testOOMDuringSpill() throws Exception { + public void testNoOOMDuringSpill() throws Exception { final UnsafeExternalSorter sorter = newSorter(); - // we assume that given default configuration, - // the size of the data 
we insert to the sorter (ints) - // and assuming we shouldn't spill before pointers array is exhausted - // (memory manager is not configured to throw at this point) - // - so this loop runs a reasonable number of iterations (<2000). - // test indeed completed within <30ms (on a quad i7 laptop). - for (int i = 0; sorter.hasSpaceForAnotherRecord(); ++i) { + for (int i = 0; i < 100; i++) { insertNumber(sorter, i); } - // we expect the next insert to attempt growing the pointerssArray first - // allocation is expected to fail, then a spill is triggered which - // attempts another allocation which also fails and we expect to see this - // OOM here. the original code messed with a released array within the - // spill code and ended up with a failed assertion. we also expect the - // location of the OOM to be - // org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset - memoryManager.markconsequentOOM(2); - try { - insertNumber(sorter, 1024); - fail("expected OutOfMmoryError but it seems operation surprisingly succeeded"); - } - // we expect an SparkOutOfMemoryError here, anything else (i.e the original NPE is a failure) - catch (SparkOutOfMemoryError oom){ - String oomStackTrace = Utils.exceptionString(oom); - assertThat("expected SparkOutOfMemoryError in " + - "org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset", - oomStackTrace, - Matchers.containsString( - "org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.reset")); + + // Check that spilling still succeeds when the task is starved for memory. + memoryManager.markconsequentOOM(Integer.MAX_VALUE); + sorter.spill(); + memoryManager.resetConsequentOOM(); + + // Ensure that records can be appended after spilling, i.e. check that the sorter will allocate + // the new pointer array that it could not allocate while spilling. 
+ for (int i = 0; i < 100; ++i) { + insertNumber(sorter, i); } + + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); } + private void verifyIntIterator(UnsafeSorterIterator iter, int start, int end) throws IOException { for (int i = start; i < end; i++) { diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java index 2b8a0602730e1..9d4909ddce792 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorterSuite.java @@ -20,6 +20,7 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; +import org.apache.spark.unsafe.array.LongArray; import org.junit.Assert; import org.junit.Test; @@ -27,7 +28,6 @@ import org.apache.spark.SparkConf; import org.apache.spark.memory.TestMemoryConsumer; import org.apache.spark.memory.TestMemoryManager; -import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.memory.MemoryBlock; @@ -37,7 +37,6 @@ import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.isIn; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; import static org.mockito.Mockito.mock; public class UnsafeInMemorySorterSuite { @@ -147,7 +146,7 @@ public int compare( } @Test - public void freeAfterOOM() { + public void testNoOOMDuringReset() { final SparkConf sparkConf = new SparkConf(); sparkConf.set(package$.MODULE$.MEMORY_OFFHEAP_ENABLED(), false); @@ -156,12 +155,7 @@ public void freeAfterOOM() { final TaskMemoryManager memoryManager = new TaskMemoryManager( testMemoryManager, 0); final TestMemoryConsumer consumer = new TestMemoryConsumer(memoryManager); - final MemoryBlock dataPage = memoryManager.allocatePage(2048, consumer); - final Object baseObject = dataPage.getBaseObject(); - // Write the records into the data page: - long position = dataPage.getBaseOffset(); - final HashPartitioner hashPartitioner = new HashPartitioner(4); // Use integer comparison for comparing prefixes (which are partition ids, in this case) final PrefixComparator prefixComparator = PrefixComparators.LONG; final RecordComparator recordComparator = new RecordComparator() { @@ -179,18 +173,24 @@ public int compare( UnsafeInMemorySorter sorter = new UnsafeInMemorySorter(consumer, memoryManager, recordComparator, prefixComparator, 100, shouldUseRadixSort()); - testMemoryManager.markExecutionAsOutOfMemoryOnce(); - try { - sorter.reset(); - fail("expected SparkOutOfMemoryError but it seems operation surprisingly succeeded"); - } catch (SparkOutOfMemoryError oom) { - // as expected - } - // [SPARK-21907] this failed on NPE at - // org.apache.spark.memory.MemoryConsumer.freeArray(MemoryConsumer.java:108) - sorter.free(); - // simulate a 'back to back' free. - sorter.free(); + // Ensure that the sorter does not OOM while freeing its memory. + testMemoryManager.markconsequentOOM(Integer.MAX_VALUE); + sorter.freeMemory(); + testMemoryManager.resetConsequentOOM(); + Assert.assertFalse(sorter.hasSpaceForAnotherRecord()); + + // Get the sorter in an usable state again by allocating a new pointer array. 
+ LongArray array = consumer.allocateArray(1000); + sorter.expandPointerArray(array); + + // Ensure that it is safe to call freeMemory() multiple times. + testMemoryManager.markconsequentOOM(Integer.MAX_VALUE); + sorter.freeMemory(); + sorter.freeMemory(); + testMemoryManager.resetConsequentOOM(); + Assert.assertFalse(sorter.hasSpaceForAnotherRecord()); + + assertEquals(0L, memoryManager.cleanUpAllAllocatedMemory()); } } diff --git a/core/src/test/java/test/org/apache/spark/JavaSparkContextSuite.java b/core/src/test/java/test/org/apache/spark/JavaSparkContextSuite.java index 0f489fb219010..b188ee16b97d0 100644 --- a/core/src/test/java/test/org/apache/spark/JavaSparkContextSuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaSparkContextSuite.java @@ -28,6 +28,7 @@ import org.apache.spark.api.java.*; import org.apache.spark.*; +import org.apache.spark.util.Utils; /** * Java apps can use both Java-friendly JavaSparkContext and Scala SparkContext. @@ -35,14 +36,16 @@ public class JavaSparkContextSuite implements Serializable { @Test - public void javaSparkContext() { + public void javaSparkContext() throws IOException { + File tempDir = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark"); + String dummyJarFile = File.createTempFile(tempDir.toString(), "jarFile").toString(); String[] jars = new String[] {}; java.util.Map environment = new java.util.HashMap<>(); new JavaSparkContext(new SparkConf().setMaster("local").setAppName("name")).stop(); new JavaSparkContext("local", "name", new SparkConf()).stop(); new JavaSparkContext("local", "name").stop(); - new JavaSparkContext("local", "name", "sparkHome", "jarFile").stop(); + new JavaSparkContext("local", "name", "sparkHome", dummyJarFile).stop(); new JavaSparkContext("local", "name", "sparkHome", jars).stop(); new JavaSparkContext("local", "name", "sparkHome", jars, environment).stop(); } diff --git a/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json index 7c3f77d8c10cf..388e89ceff054 100644 --- a/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json @@ -16,7 +16,7 @@ "totalInputBytes" : 0, "totalShuffleRead" : 0, "totalShuffleWrite" : 0, - "isBlacklisted" : true, + "isBlacklisted" : false, "maxMemory" : 908381388, "addTime" : "2016-11-16T22:33:31.477GMT", "executorLogs" : { }, diff --git a/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json index 0986e85f16b3e..d030d47b2e8cc 100644 --- a/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json @@ -16,7 +16,7 @@ "totalInputBytes" : 0, "totalShuffleRead" : 0, "totalShuffleWrite" : 0, - "isBlacklisted" : true, + "isBlacklisted" : false, "maxMemory" : 908381388, "addTime" : "2016-11-16T22:33:31.477GMT", "executorLogs" : { }, diff --git a/core/src/test/resources/spark-events/application_1553914137147_0018 b/core/src/test/resources/spark-events/application_1553914137147_0018 index 8c34e7265f8da..03ea3040de1db 100644 --- a/core/src/test/resources/spark-events/application_1553914137147_0018 +++ 
b/core/src/test/resources/spark-events/application_1553914137147_0018 @@ -1,6 +1,6 @@ {"Event":"SparkListenerLogStart","Spark Version":"3.0.0-SNAPSHOT"} {"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"test-1.vpc.company.com","Port":44768},"Maximum Memory":956615884,"Timestamp":1554755989747,"Maximum Onheap Memory":956615884,"Maximum Offheap Memory":0} -{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/usr/java/jdk1.8.0_144/jre","Java Version":"1.8.0_144 (Oracle Corporation)","Scala Version":"version 2.12.8"},"Spark Properties":{"spark.lineage.log.dir":"/var/log/spark2/lineage","spark.serializer":"org.apache.spark.serializer.KryoSerializer","spark.yarn.jars":"local:/opt/cloudera/parcels/SPARK2/lib/spark2/jars/*","spark.executor.extraJavaOptions":"-Djava.security.egd=file:///dev/urandom","spark.driver.host":"test-1.vpc.company.com","spark.eventLog.enabled":"true","spark.executor.heartbeatInterval":"1000","spark.executor.memoryOverhead":"1024","spark.driver.port":"34194","spark.shuffle.service.enabled":"false","spark.driver.extraLibraryPath":"/opt/cloudera/parcels/CDH/lib/hadoop/lib/native","spark.lineage.enabled":"false","spark.jars":"file:/tmp/__spark_test__/spark3-tests-0.1.0-cdh5.9.0-SNAPSHOT-jar-with-dependencies.jar","spark.executor.metrics.pollingInterval":"100","spark.yarn.historyServer.address":"http://test-1.vpc.company.com:18089","spark.ui.enabled":"true","spark.app.name":"LargeBlocks","spark.ui.killEnabled":"true","spark.sql.hive.metastore.jars":"${env:HADOOP_COMMON_HOME}/../hive/lib/*:${env:HADOOP_COMMON_HOME}/client/*","spark.locality.wait.process":"0","spark.dynamicAllocation.schedulerBacklogTimeout":"1","spark.yarn.am.extraLibraryPath":"/opt/cloudera/parcels/CDH/lib/hadoop/lib/native","spark.scheduler.mode":"FIFO","spark.eventLog.logStageExecutorMetrics.enabled":"true","spark.driver.memory":"2g","spark.executor.instances":"3","spark.submit.pyFiles":"","spark.yarn.config.gatewayPath":"/opt/cloudera/parcels","spark.executor.id":"driver","spark.yarn.config.replacementPath":"{{HADOOP_COMMON_HOME}}/../../..","spark.driver.extraJavaOptions":"-Djava.security.egd=file:///dev/urandom","spark.eventLog.logStageExecutorProcessTreeMetrics.enabled":"true","spark.submit.deployMode":"client","spark.shuffle.service.port":"7337","spark.yarn.maxAppAttempts":"1","spark.master":"yarn","spark.authenticate":"false","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.network.crypto.enabled":"false","spark.executor.extraLibraryPath":"/opt/cloudera/parcels/CDH/lib/hadoop/lib/native","spark.executor.memory":"7g","spark.io.encryption.enabled":"false","spark.eventLog.dir":"hdfs://test-1.vpc.company.com:8020/user/spark/spark2ApplicationHistory","spark.dynamicAllocation.enabled":"false","spark.sql.catalogImplementation":"hive","spark.executor.cores":"1","spark.driver.appUIAddress":"http://test-1.vpc.company.com:4040","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"test-1.vpc.company.com","spark.dynamicAllocation.minExecutors":"0","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://test-1.vpc.company.com:8088/proxy/application_1553914137147_0018","spark.dynamicAllocation.executorIdleTimeout":"60","spark.app.id":"application_1553914137147_0018","spark.sql.hive.metastore.version":"1.1.0"},"Hadoop 
Properties":{"yarn.resourcemanager.amlauncher.thread-count":"50","dfs.namenode.resource.check.interval":"5000","fs.s3a.connection.maximum":"100","mapreduce.jobtracker.jobhistory.task.numberprogresssplits":"12","dfs.data.transfer.server.tcpnodelay":"true","mapreduce.tasktracker.healthchecker.script.timeout":"600000","fs.s3a.impl":"org.apache.hadoop.fs.s3a.S3AFileSystem","yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms":"1000","hadoop.security.kms.client.timeout":"60","hadoop.http.authentication.kerberos.principal":"HTTP/_HOST@LOCALHOST","mapreduce.jobhistory.loadedjob.tasks.max":"-1","mapreduce.framework.name":"yarn","yarn.nodemanager.linux-container-executor.nonsecure-mode.user-pattern":"^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$","dfs.cachereport.intervalMsec":"10000","dfs.namenode.checkpoint.txns":"1000000","tfile.fs.output.buffer.size":"262144","yarn.app.mapreduce.am.job.task.listener.thread-count":"30","mapreduce.tasktracker.local.dir.minspacekill":"0","hadoop.security.groups.cache.background.reload.threads":"3","dfs.namenode.lease-recheck-interval-ms":"2000","fs.s3.block.size":"67108864","dfs.client.domain.socket.data.traffic":"false","dfs.ha.zkfc.nn.http.timeout.ms":"20000","hadoop.registry.secure":"false","hadoop.hdfs.configuration.version":"1","dfs.bytes-per-checksum":"512","fs.s3.buffer.dir":"${hadoop.tmp.dir}/s3","mapreduce.job.acl-view-job":" ","fs.s3a.s3guard.ddb.background.sleep":"25","mapreduce.jobhistory.loadedjobs.cache.size":"5","mapreduce.jobtracker.persist.jobstatus.hours":"1","fs.s3a.s3guard.ddb.table.create":"false","dfs.datanode.slow.io.warning.threshold.ms":"300","dfs.namenode.handler.count":"10","dfs.namenode.list.reencryption.status.num.responses":"100","mapreduce.input.fileinputformat.split.minsize":"0","dfs.datanode.failed.volumes.tolerated":"0","yarn.resourcemanager.container.liveness-monitor.interval-ms":"600000","yarn.resourcemanager.amliveliness-monitor.interval-ms":"1000","yarn.resourcemanager.client.thread-count":"50","io.seqfile.compress.blocksize":"1000000","mapreduce.tasktracker.http.threads":"40","dfs.namenode.retrycache.expirytime.millis":"600000","dfs.namenode.backup.address":"0.0.0.0:50100","dfs.datanode.data.dir":"file://${hadoop.tmp.dir}/dfs/data","dfs.datanode.shared.file.descriptor.paths":"/dev/shm,/tmp","dfs.replication":"3","mapreduce.jobtracker.jobhistory.block.size":"3145728","dfs.encrypt.data.transfer.cipher.key.bitlength":"128","mapreduce.reduce.shuffle.fetch.retry.interval-ms":"1000","dfs.secondary.namenode.kerberos.internal.spnego.principal":"${dfs.web.authentication.kerberos.principal}","mapreduce.task.profile.maps":"0-2","dfs.datanode.block-pinning.enabled":"false","yarn.nodemanager.admin-env":"MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX","mapreduce.jobtracker.retiredjobs.cache.size":"1000","mapreduce.am.max-attempts":"2","hadoop.security.kms.client.failover.sleep.base.millis":"100","mapreduce.jobhistory.webapp.https.address":"test-1.vpc.company.com:19890","fs.trash.checkpoint.interval":"0","dfs.namenode.checkpoint.check.period":"60","yarn.nodemanager.container-monitor.interval-ms":"3000","mapreduce.job.map.output.collector.class":"org.apache.hadoop.mapred.MapTask$MapOutputBuffer","hadoop.http.authentication.signature.secret.file":"*********(redacted)","hadoop.jetty.logs.serve.aliases":"true","hadoop.proxyuser.HTTP.groups":"*","yarn.timeline-service.handler-thread-count":"10","yarn.resourcemanager.max-completed-applications":"10000","dfs.namenode.reencrypt.edek.threads":"10","yarn.resourcemanager.system-metrics-publisher.enabled":"false","h
adoop.security.groups.negative-cache.secs":"30","yarn.app.mapreduce.task.container.log.backups":"0","hadoop.security.group.mapping.ldap.posix.attr.gid.name":"gidNumber","ipc.client.fallback-to-simple-auth-allowed":"false","dfs.namenode.fs-limits.max-component-length":"255","mapreduce.tasktracker.taskcontroller":"org.apache.hadoop.mapred.DefaultTaskController","yarn.client.failover-proxy-provider":"org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider","yarn.timeline-service.http-authentication.simple.anonymous.allowed":"true","ha.health-monitor.check-interval.ms":"1000","dfs.namenode.top.window.num.buckets":"10","yarn.resourcemanager.store.class":"org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore","dfs.datanode.block.id.layout.upgrade.threads":"12","mapreduce.jobtracker.tasktracker.maxblacklists":"4","yarn.nodemanager.docker-container-executor.exec-name":"/usr/bin/docker","yarn.resourcemanager.nodemanagers.heartbeat-interval-ms":"1000","hadoop.common.configuration.version":"0.23.0","fs.s3a.s3guard.ddb.table.capacity.read":"500","yarn.nodemanager.remote-app-log-dir-suffix":"logs","dfs.namenode.decommission.max.concurrent.tracked.nodes":"100","file.blocksize":"67108864","hadoop.registry.zk.retry.ceiling.ms":"60000","mapreduce.jobhistory.principal":"jhs/_HOST@REALM.TLD","dfs.client.read.shortcircuit.skip.checksum":"false","mapreduce.task.profile.reduces":"0-2","dfs.datanode.address":"0.0.0.0:50010","dfs.https.server.keystore.resource":"ssl-server.xml","yarn.timeline-service.webapp.https.address":"${yarn.timeline-service.hostname}:8190","yarn.resourcemanager.scheduler.address":"test-1.vpc.company.com:8030","mapreduce.task.timeout":"600000","hadoop.security.crypto.cipher.suite":"AES/CTR/NoPadding","yarn.resourcemanager.connect.max-wait.ms":"900000","fs.defaultFS":"hdfs://test-1.vpc.company.com:8020","fs.har.impl.disable.cache":"true","io.compression.codec.bzip2.library":"system-native","dfs.namenode.audit.loggers":"default","dfs.block.access.key.update.interval":"600","mapreduce.shuffle.connection-keep-alive.timeout":"5","yarn.resourcemanager.webapp.https.address":"test-1.vpc.company.com:8090","dfs.namenode.max.objects":"0","mapreduce.jobhistory.address":"test-1.vpc.company.com:10020","yarn.nodemanager.address":"${yarn.nodemanager.hostname}:0","fs.AbstractFileSystem.s3a.impl":"org.apache.hadoop.fs.s3a.S3A","mapreduce.task.combine.progress.records":"10000","dfs.namenode.max.full.block.report.leases":"6","yarn.resourcemanager.am.max-attempts":"2","yarn.nodemanager.linux-container-executor.cgroups.hierarchy":"/hadoop-yarn","dfs.client.mmap.cache.timeout.ms":"3600000","dfs.mover.max-no-move-interval":"60000","fs.ftp.transfer.mode":"BLOCK_TRANSFER_MODE","dfs.client.datanode-restart.timeout":"30","dfs.datanode.drop.cache.behind.reads":"false","ipc.server.log.slow.rpc":"false","dfs.namenode.read-lock-reporting-threshold-ms":"5000","yarn.app.mapreduce.am.job.committer.cancel-timeout":"60000","yarn.nodemanager.default-container-executor.log-dirs.permissions":"710","dfs.namenode.checkpoint.edits.dir":"${dfs.namenode.checkpoint.dir}","yarn.app.attempt.diagnostics.limit.kc":"64","dfs.balancer.block-move.timeout":"0","dfs.client.block.write.replace-datanode-on-failure.enable":"true","ftp.bytes-per-checksum":"512","yarn.nodemanager.resource.memory-mb":"8192","io.compression.codecs":"org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.comp
ress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec","fs.s3a.fast.upload.active.blocks":"4","hadoop.security.credential.clear-text-fallback":"true","dfs.heartbeat.interval":"3","mapreduce.jobhistory.joblist.cache.size":"20000","fs.ftp.host":"0.0.0.0","dfs.ha.tail-edits.period":"60","dfs.datanode.max.locked.memory":"0","dfs.datanode.scan.period.hours":"504","mapreduce.jobtracker.expire.trackers.interval":"600000","yarn.resourcemanager.nodemanager-connect-retries":"10","hadoop.security.kms.client.encrypted.key.cache.low-watermark":"0.3f","yarn.timeline-service.client.max-retries":"30","dfs.ha.fencing.ssh.connect-timeout":"30000","yarn.log-aggregation-enable":"false","mapreduce.reduce.markreset.buffer.percent":"0.0","fs.AbstractFileSystem.viewfs.impl":"org.apache.hadoop.fs.viewfs.ViewFs","dfs.namenode.edits.noeditlogchannelflush":"false","mapreduce.task.io.sort.factor":"64","mapreduce.tasktracker.outofband.heartbeat":"false","ha.failover-controller.new-active.rpc-timeout.ms":"60000","dfs.webhdfs.ugi.expire.after.access":"600000","mapreduce.jobhistory.datestring.cache.size":"200000","mapreduce.job.acl-modify-job":" ","dfs.namenode.https-address":"test-1.vpc.company.com:20102","yarn.am.blacklisting.enabled":"true","yarn.timeline-service.webapp.address":"${yarn.timeline-service.hostname}:8188","dfs.image.transfer-bootstrap-standby.bandwidthPerSec":"0","yarn.app.mapreduce.am.job.committer.commit-window":"10000","yarn.nodemanager.container-manager.thread-count":"20","yarn.timeline-service.ttl-enable":"true","mapreduce.jobhistory.recovery.store.fs.uri":"${hadoop.tmp.dir}/mapred/history/recoverystore","hadoop.proxyuser.hive.groups":"*","ha.zookeeper.session-timeout.ms":"5000","mapreduce.map.java.opts":"-Djava.net.preferIPv4Stack=true","tfile.io.chunk.size":"1048576","fs.s3a.s3guard.ddb.table.capacity.write":"100","mapreduce.job.speculative.slowtaskthreshold":"1.0","io.serializations":"org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization","hadoop.security.kms.client.failover.sleep.max.millis":"2000","hadoop.security.group.mapping.ldap.directory.search.timeout":"10000","dfs.ha.automatic-failover.enabled":"false","mapreduce.job.counters.groups.max":"50","dfs.namenode.decommission.interval":"30","fs.swift.impl":"org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem","yarn.nodemanager.local-cache.max-files-per-directory":"8192","dfs.datanode.handler.count":"10","dfs.namenode.xattrs.enabled":"true","dfs.namenode.safemode.threshold-pct":"0.999f","dfs.client.socket.send.buffer.size":"0","mapreduce.map.sort.spill.percent":"0.8","yarn.resourcemanager.webapp.delegation-token-auth-filter.enabled":"*********(redacted)","hadoop.security.group.mapping.ldap.posix.attr.uid.name":"uidNumber","dfs.datanode.sync.behind.writes":"false","dfs.namenode.stale.datanode.interval":"30000","mapreduce.ifile.readahead":"true","yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms":"300000","dfs.datanode.transfer.socket.send.buffer.size":"0","hadoop.security.kms.client.encrypted.key.cache.num.refill.threads":"2","dfs.namenode.reencrypt.throttle.limit.handler.ratio":"1.0","yarn.resourcemanager.scheduler.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler","yarn.app.mapreduce.am.command-opts":"-Djava.net.preferIPv4Stack=true 
-Xmx825955249","dfs.journalnode.https-address":"0.0.0.0:8481","mapreduce.cluster.local.dir":"${hadoop.tmp.dir}/mapred/local","hadoop.proxyuser.hue.hosts":"*","io.mapfile.bloom.error.rate":"0.005","dfs.user.home.dir.prefix":"/user","hadoop.proxyuser.hue.groups":"*","ha.failover-controller.graceful-fence.rpc-timeout.ms":"5000","ftp.replication":"3","mapreduce.jobtracker.persist.jobstatus.dir":"/jobtracker/jobsInfo","hadoop.security.uid.cache.secs":"14400","mapreduce.job.maxtaskfailures.per.tracker":"3","fs.s3a.metadatastore.impl":"org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore","io.skip.checksum.errors":"false","dfs.namenode.snapshot.capture.openfiles":"false","dfs.datanode.directoryscan.interval":"21600","yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts":"3","dfs.client.read.shortcircuit.streams.cache.expiry.ms":"300000","fs.s3a.connection.timeout":"200000","mapreduce.job.max.split.locations":"10","dfs.namenode.write.stale.datanode.ratio":"0.5f","hadoop.registry.zk.session.timeout.ms":"60000","mapreduce.shuffle.transfer.buffer.size":"131072","yarn.timeline-service.client.retry-interval-ms":"1000","mapreduce.jobtracker.taskcache.levels":"2","yarn.http.policy":"HTTP_ONLY","fs.s3a.socket.send.buffer":"8192","hadoop.http.authentication.token.validity":"*********(redacted)","mapreduce.shuffle.max.connections":"0","mapreduce.job.emit-timeline-data":"false","hadoop.kerberos.min.seconds.before.relogin":"60","mapreduce.jobhistory.move.thread-count":"3","dfs.domain.socket.path":"/var/run/hdfs-sockets/dn","yarn.resourcemanager.admin.client.thread-count":"1","mapreduce.jobtracker.persist.jobstatus.active":"true","dfs.namenode.reencrypt.sleep.interval":"1m","fs.s3a.buffer.dir":"${hadoop.tmp.dir}/s3a","hadoop.ssl.enabled.protocols":"TLSv1,SSLv2Hello,TLSv1.1,TLSv1.2","mapreduce.jobhistory.admin.address":"test-1.vpc.company.com:10033","mapreduce.shuffle.port":"13562","yarn.resourcemanager.max-log-aggregation-diagnostics-in-memory":"10","yarn.nodemanager.health-checker.interval-ms":"600000","mapreduce.tasktracker.report.address":"127.0.0.1:0","dfs.namenode.edit.log.autoroll.multiplier.threshold":"2.0","io.seqfile.lazydecompress":"true","ftp.blocksize":"67108864","dfs.namenode.backup.http-address":"0.0.0.0:50105","dfs.disk.balancer.max.disk.throughputInMBperSec":"10","mapreduce.jobtracker.instrumentation":"org.apache.hadoop.mapred.JobTrackerMetricsInst","yarn.client.max-cached-nodemanagers-proxies":"0","yarn.nodemanager.delete.debug-delay-sec":"0","mapreduce.jobtracker.http.address":"0.0.0.0:50030","yarn.nodemanager.pmem-check-enabled":"true","yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":"90.0","mapreduce.app-submission.cross-platform":"false","yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms":"10000","hadoop.security.groups.cache.secs":"300","yarn.resourcemanager.zk-retry-interval-ms":"1000","yarn.scheduler.increment-allocation-mb":"512","nfs.mountd.port":"4242","mapreduce.shuffle.max.threads":"0","hadoop.security.authorization":"false","mapreduce.job.complete.cancel.delegation.tokens":"*********(redacted)","fs.s3a.paging.maximum":"5000","nfs.exports.allowed.hosts":"* 
rw","mapreduce.jobhistory.http.policy":"HTTP_ONLY","dfs.datanode.dns.interface":"default","mapreduce.reduce.java.opts":"-Djava.net.preferIPv4Stack=true","s3native.replication":"3","hadoop.security.group.mapping.ldap.ssl":"false","dfs.namenode.fs-limits.max-xattrs-per-inode":"32","yarn.client.application-client-protocol.poll-interval-ms":"200","hadoop.proxyuser.flume.groups":"*","dfs.namenode.fs-limits.max-xattr-size":"16384","dfs.namenode.maintenance.replication.min":"1","dfs.client.write.exclude.nodes.cache.expiry.interval.millis":"600000","ha.zookeeper.parent-znode":"/hadoop-ha","dfs.namenode.safemode.extension":"30000","mapreduce.reduce.shuffle.merge.percent":"0.66","hadoop.security.group.mapping.ldap.search.filter.group":"(objectClass=group)","dfs.blocksize":"134217728","dfs.namenode.servicerpc-address":"test-1.vpc.company.com:8022","yarn.nodemanager.resourcemanager.minimum.version":"NONE","mapreduce.job.speculative.speculative-cap-running-tasks":"0.1","yarn.admin.acl":"*","yarn.resourcemanager.ha.automatic-failover.enabled":"true","mapreduce.reduce.skip.maxgroups":"0","mapreduce.reduce.shuffle.connect.timeout":"180000","yarn.resourcemanager.address":"test-1.vpc.company.com:8032","ipc.client.ping":"true","fs.adl.oauth2.access.token.provider.type":"*********(redacted)","dfs.namenode.resource.checked.volumes.minimum":"1","hadoop.proxyuser.HTTP.hosts":"*","mapreduce.shuffle.ssl.file.buffer.size":"65536","yarn.resourcemanager.ha.automatic-failover.embedded":"true","dfs.namenode.quota.init-threads":"4","dfs.journalnode.http-address":"0.0.0.0:8480","dfs.block.scanner.volume.bytes.per.second":"1048576","hadoop.ssl.enabled":"false","fs.s3a.multipart.purge":"false","dfs.storage.policy.enabled":"true","mapreduce.job.end-notification.max.attempts":"5","mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec","yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled":"false","dfs.namenode.edits.dir":"${dfs.namenode.name.dir}","ha.health-monitor.connect-retry-interval.ms":"1000","yarn.nodemanager.keytab":"/etc/krb5.keytab","dfs.namenode.support.allow.format":"true","dfs.ha.tail-edits.rolledits.timeout":"60","mapreduce.jobhistory.keytab":"/etc/security/keytab/jhs.service.keytab","fs.s3a.threads.max":"10","mapreduce.reduce.shuffle.input.buffer.percent":"0.70","mapreduce.cluster.temp.dir":"${hadoop.tmp.dir}/mapred/temp","s3.replication":"3","dfs.client.failover.connection.retries":"0","hadoop.tmp.dir":"/tmp/hadoop-${user.name}","mapreduce.job.maps":"2","dfs.namenode.secondary.http-address":"0.0.0.0:50090","mapreduce.job.end-notification.max.retry.interval":"5000","yarn.log-aggregation.retain-check-interval-seconds":"-1","yarn.resourcemanager.resource-tracker.client.thread-count":"50","nfs.wtmax":"1048576","yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size":"10000","nfs.dump.dir":"/tmp/.hdfs-nfs","yarn.resourcemanager.ha.automatic-failover.zk-base-path":"/yarn-leader-election","io.seqfile.local.dir":"${hadoop.tmp.dir}/io/local","mapreduce.client.submit.file.replication":"3","mapreduce.jobhistory.minicluster.fixed.ports":"false","fs.s3a.multipart.threshold":"128M","dfs.namenode.service.handler.count":"10","dfs.datanode.data.dir.perm":"700","mapreduce.jobhistory.done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done","dfs.namenode.name.dir":"file:///dataroot/dataroot/dfs/nn","yarn.resourcemanager.zk-acl":"world:anyone:rwcda","ipc.client.idlethreshold":"4000","yarn.nodemanager.linux-container-executor.cgroups.strict-resource-us
age":"false","mapreduce.reduce.input.buffer.percent":"0.0","fs.ftp.host.port":"21","ipc.ping.interval":"60000","dfs.namenode.num.checkpoints.retained":"2","dfs.namenode.kerberos.internal.spnego.principal":"${dfs.web.authentication.kerberos.principal}","yarn.resourcemanager.admin.address":"test-1.vpc.company.com:8033","file.client-write-packet-size":"65536","hadoop.treat.subject.external":"false","ipc.client.kill.max":"10","mapreduce.reduce.speculative":"false","dfs.disk.balancer.plan.threshold.percent":"10","mapreduce.local.clientfactory.class.name":"org.apache.hadoop.mapred.LocalClientFactory","dfs.client.use.legacy.blockreader":"false","mapreduce.job.reducer.unconditional-preempt.delay.sec":"300","yarn.nodemanager.disk-health-checker.interval-ms":"120000","ipc.client.connection.maxidletime":"10000","mapreduce.task.io.sort.mb":"256","yarn.nodemanager.localizer.client.thread-count":"5","dfs.namenode.checkpoint.max-retries":"3","dfs.namenode.reject-unresolved-dn-topology-mapping":"false","dfs.namenode.delegation.token.max-lifetime":"*********(redacted)","dfs.client.block.write.replace-datanode-on-failure.min-replication":"0","yarn.nodemanager.localizer.cache.cleanup.interval-ms":"600000","hadoop.security.crypto.codec.classes.aes.ctr.nopadding":"org.apache.hadoop.crypto.OpensslAesCtrCryptoCodec,org.apache.hadoop.crypto.JceAesCtrCryptoCodec","fs.s3a.connection.ssl.enabled":"true","yarn.nodemanager.process-kill-wait.ms":"2000","dfs.namenode.num.extra.edits.retained":"1000000","mapreduce.job.hdfs-servers":"${fs.defaultFS}","yarn.scheduler.increment-allocation-vcores":"1","fs.df.interval":"60000","fs.s3.sleepTimeSeconds":"10","fs.s3a.multiobjectdelete.enable":"true","yarn.nodemanager.disk-health-checker.min-healthy-disks":"0.25","hadoop.shell.missing.defaultFs.warning":"true","io.file.buffer.size":"65536","hadoop.work.around.non.threadsafe.getpwuid":"false","dfs.permissions.superusergroup":"supergroup","hadoop.security.group.mapping.ldap.search.attr.member":"member","hadoop.security.random.device.file.path":"/dev/urandom","mapreduce.tasktracker.dns.interface":"default","hadoop.security.sensitive-config-keys":"*********(redacted)","fs.s3a.s3guard.ddb.max.retries":"9","hadoop.rpc.socket.factory.class.default":"org.apache.hadoop.net.StandardSocketFactory","dfs.permissions.enabled":"true","yarn.resourcemanager.connect.retry-interval.ms":"30000","yarn.scheduler.minimum-allocation-mb":"1024","yarn.app.mapreduce.am.staging-dir":"/user","mapreduce.reduce.shuffle.read.timeout":"180000","yarn.app.mapreduce.am.admin.user.env":"LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH","dfs.datanode.https.address":"0.0.0.0:50475","dfs.namenode.hosts.provider.classname":"org.apache.hadoop.hdfs.server.blockmanagement.HostFileManager","dfs.datanode.transfer.socket.recv.buffer.size":"0","fs.s3a.connection.establish.timeout":"5000","dfs.namenode.fslock.fair":"true","mapreduce.job.running.map.limit":"0","hadoop.ssl.require.client.cert":"false","hadoop.kerberos.kinit.command":"kinit","hadoop.fuse.connection.timeout":"300","mapreduce.reduce.log.level":"INFO","hadoop.security.dns.log-slow-lookups.threshold.ms":"1000","mapreduce.job.ubertask.enable":"false","adl.http.timeout":"5000","yarn.nodemanager.vmem-pmem-ratio":"2.1","dfs.client.slow.io.warning.threshold.ms":"30000","hadoop.rpc.protection":"authentication","ha.health-monitor.rpc-timeout.ms":"45000","s3native.stream-buffer-size":"4096","yarn.nodemanager.remote-app-log-dir":"/tmp/logs","fs.s3a.s3guard.cli.prune.age":"86400000","dfs.client.read.shortcircuit
.streams.cache.size":"256","dfs.client.use.legacy.blockreader.local":"false","yarn.app.mapreduce.am.containerlauncher.threadpool-initial-size":"10","fs.s3n.multipart.uploads.enabled":"false","dfs.namenode.path.based.cache.retry.interval.ms":"30000","hadoop.security.crypto.buffer.size":"8192","yarn.client.failover-retries-on-socket-timeouts":"0","dfs.balancer.keytab.enabled":"false","hadoop.security.instrumentation.requires.admin":"false","yarn.nodemanager.delete.thread-count":"4","dfs.datanode.balance.bandwidthPerSec":"10485760","dfs.namenode.name.dir.restore":"false","hadoop.registry.jaas.context":"Client","dfs.client.failover.sleep.max.millis":"15000","yarn.timeline-service.leveldb-timeline-store.path":"${hadoop.tmp.dir}/yarn/timeline","s3.blocksize":"67108864","yarn.am.blacklisting.disable-failure-threshold":"0.8f","io.map.index.interval":"128","mapreduce.job.counters.max":"120","dfs.namenode.max-lock-hold-to-release-lease-ms":"25","dfs.namenode.datanode.registration.ip-hostname-check":"true","yarn.timeline-service.store-class":"org.apache.hadoop.yarn.server.timeline.LeveldbTimelineStore","mapreduce.jobhistory.move.interval-ms":"180000","dfs.namenode.resource.du.reserved":"104857600","dfs.datanode.bp-ready.timeout":"20","yarn.nodemanager.localizer.fetch.thread-count":"4","yarn.resourcemanager.scheduler.client.thread-count":"50","hadoop.ssl.hostname.verifier":"DEFAULT","dfs.namenode.full.block.report.lease.length.ms":"300000","mapreduce.tasktracker.instrumentation":"org.apache.hadoop.mapred.TaskTrackerMetricsInst","mapreduce.job.classloader":"false","mapreduce.task.profile.map.params":"${mapreduce.task.profile.params}","ipc.client.connect.timeout":"20000","s3.stream-buffer-size":"4096","yarn.resourcemanager.nm.liveness-monitor.interval-ms":"1000","yarn.nm.liveness-monitor.expiry-interval-ms":"600000","dfs.namenode.secondary.https-address":"0.0.0.0:50091","s3native.bytes-per-checksum":"512","dfs.namenode.fs-limits.max-directory-items":"1048576","nfs.server.port":"2049","dfs.namenode.delegation.token.renew-interval":"*********(redacted)","mapreduce.jobtracker.address":"local","yarn.nodemanager.recovery.enabled":"false","mapreduce.job.end-notification.retry.interval":"1000","fs.du.interval":"600000","dfs.namenode.list.openfiles.num.responses":"1000","hadoop.security.groups.cache.warn.after.ms":"5000","file.bytes-per-checksum":"512","dfs.namenode.blocks.per.postponedblocks.rescan":"10000","dfs.namenode.checkpoint.period":"3600","hadoop.security.groups.cache.background.reload":"false","yarn.resourcemanager.amlauncher.log.command":"false","net.topology.script.number.args":"100","mapreduce.task.merge.progress.records":"10000","yarn.nodemanager.localizer.address":"${yarn.nodemanager.hostname}:8040","yarn.timeline-service.keytab":"/etc/krb5.keytab","mapreduce.reduce.shuffle.fetch.retry.timeout-ms":"30000","dfs.namenode.snapshot.skip.capture.accesstime-only-change":"false","dfs.webhdfs.user.provider.user.pattern":"^[A-Za-z_][A-Za-z0-9._-]*[$]?$","dfs.webhdfs.acl.provider.permission.pattern":"^(default:)?(user|group|mask|other):[[A-Za-z_][A-Za-z0-9._-]]*:([rwx-]{3})?(,(default:)?(user|group|mask|other):[[A-Za-z_][A-Za-z0-9._-]]*:([rwx-]{3})?)*$","mapreduce.fileoutputcommitter.algorithm.version":"1","yarn.resourcemanager.work-preserving-recovery.enabled":"false","mapreduce.map.skip.maxrecords":"0","mapreduce.jobtracker.handler.count":"10","hadoop.http.authentication.type":"simple","mapreduce.job.jvm.numtasks":"1","hadoop.proxyuser.flume.hosts":"*","mapreduce.task.userlog.limit.kb":"0","yarn.resourcem
anager.scheduler.monitor.enable":"false","fs.s3n.block.size":"67108864","ipc.client.connect.max.retries":"10","hadoop.registry.zk.retry.times":"5","mapreduce.jobtracker.staging.root.dir":"${hadoop.tmp.dir}/mapred/staging","dfs.namenode.http-address":"test-1.vpc.company.com:20101","mapreduce.jobtracker.jobhistory.lru.cache.size":"5","dfs.datanode.directoryscan.threads":"1","dfs.datanode.fsdatasetcache.max.threads.per.volume":"4","dfs.namenode.fs-limits.max-blocks-per-file":"1048576","dfs.disk.balancer.enabled":"false","mapreduce.shuffle.listen.queue.size":"128","mapreduce.tasktracker.local.dir.minspacestart":"0","mapreduce.map.cpu.vcores":"1","hadoop.user.group.static.mapping.overrides":"dr.who=;","dfs.datanode.cache.revocation.timeout.ms":"900000","mapreduce.jobhistory.recovery.store.class":"org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService","dfs.client.mmap.cache.size":"256","dfs.ha.log-roll.period":"120","dfs.client.failover.sleep.base.millis":"500","yarn.resourcemanager.fail-fast":"${yarn.fail-fast}","yarn.resourcemanager.proxy-user-privileges.enabled":"false","mapreduce.job.reducer.preempt.delay.sec":"0","hadoop.util.hash.type":"murmur","dfs.namenode.accesstime.precision":"3600000","yarn.app.mapreduce.client.job.max-retries":"3","mapreduce.reduce.shuffle.retry-delay.max.ms":"60000","mapreduce.task.profile.params":"-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s","yarn.app.mapreduce.shuffle.log.backups":"0","hadoop.registry.zk.retry.interval.ms":"1000","fs.AbstractFileSystem.file.impl":"org.apache.hadoop.fs.local.LocalFs","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds":"-1","dfs.client.context":"default","mapreduce.jobhistory.cleaner.interval-ms":"86400000","hadoop.registry.zk.quorum":"localhost:2181","mapreduce.output.fileoutputformat.compress":"false","yarn.resourcemanager.am-rm-tokens.master-key-rolling-interval-secs":"*********(redacted)","hadoop.ssl.server.conf":"ssl-server.xml","dfs.http.policy":"HTTP_ONLY","dfs.client.https.keystore.resource":"ssl-client.xml","mapreduce.client.completion.pollinterval":"5000","hadoop.ssl.keystores.factory.class":"org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory","yarn.app.mapreduce.am.resource.cpu-vcores":"1","yarn.timeline-service.enabled":"false","yarn.acl.enable":"true","dfs.domain.socket.disable.interval.seconds":"1","dfs.image.transfer.chunksize":"65536","dfs.balancer.max-no-move-interval":"60000","mapreduce.tasktracker.map.tasks.maximum":"2","dfs.namenode.edits.journal-plugin.qjournal":"org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager","mapreduce.task.profile":"false","dfs.webhdfs.enabled":"true","yarn.resourcemanager.fs.state-store.uri":"${hadoop.tmp.dir}/yarn/system/rmstore","yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user":"nobody","dfs.namenode.list.encryption.zones.num.responses":"100","yarn.resourcemanager.configuration.provider-class":"org.apache.hadoop.yarn.LocalConfigurationProvider","dfs.namenode.top.num.users":"10","dfs.disk.balancer.block.tolerance.percent":"10","yarn.nodemanager.resource.percentage-physical-cpu-limit":"100","mapreduce.jobhistory.client.thread-count":"10","tfile.fs.input.buffer.size":"262144","mapreduce.client.progressmonitor.pollinterval":"1000","yarn.nodemanager.log-dirs":"${yarn.log.dir}/userlogs","io.seqfile.sorter.recordlimit":"1000000","hadoop.security.auth_to_local":"DEFAULT","dfs.blockreport.initialDelay":"0","fs.automatic.close":"true","dfs.client.block.write.replace-datanode-on-failure.best-effor
t":"false","dfs.namenode.replication.min":"1","dfs.balancer.address":"0.0.0.0:0","fs.s3n.multipart.copy.block.size":"5368709120","yarn.nodemanager.hostname":"0.0.0.0","nfs.rtmax":"1048576","yarn.resourcemanager.zk-timeout-ms":"10000","ftp.stream-buffer-size":"4096","yarn.fail-fast":"false","hadoop.security.group.mapping.ldap.search.filter.user":"(&(objectClass=user)(sAMAccountName={0}))","dfs.datanode.directoryscan.throttle.limit.ms.per.sec":"1000","yarn.nodemanager.container-localizer.log.level":"INFO","yarn.timeline-service.address":"${yarn.timeline-service.hostname}:10200","dfs.namenode.replication.work.multiplier.per.iteration":"2","mapreduce.job.ubertask.maxmaps":"9","fs.s3a.threads.keepalivetime":"60","dfs.namenode.reencrypt.throttle.limit.updater.ratio":"1.0","dfs.namenode.avoid.write.stale.datanode":"false","dfs.short.circuit.shared.memory.watcher.interrupt.check.ms":"60000","dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction":"0.75f","mapreduce.task.files.preserve.failedtasks":"false","yarn.app.mapreduce.client.job.retry-interval":"2000","ha.failover-controller.graceful-fence.connection.retries":"1","dfs.client.mmap.enabled":"true","mapreduce.reduce.cpu.vcores":"1","hadoop.proxyuser.oozie.groups":"*","fs.client.resolve.remote.symlinks":"true","dfs.image.compression.codec":"org.apache.hadoop.io.compress.DefaultCodec","mapreduce.jobtracker.restart.recover":"false","dfs.namenode.decommission.blocks.per.interval":"500000","mapreduce.tasktracker.reduce.tasks.maximum":"2","yarn.nodemanager.local-dirs":"${hadoop.tmp.dir}/nm-local-dir","mapreduce.shuffle.connection-keep-alive.enable":"false","fs.s3a.path.style.access":"false","yarn.nodemanager.aux-services.mapreduce_shuffle.class":"org.apache.hadoop.mapred.ShuffleHandler","fs.adl.impl":"org.apache.hadoop.fs.adl.AdlFileSystem","yarn.resourcemanager.nodemanager.minimum.version":"NONE","net.topology.impl":"org.apache.hadoop.net.NetworkTopology","io.map.index.skip":"0","dfs.namenode.safemode.min.datanodes":"0","fs.ftp.data.connection.mode":"ACTIVE_LOCAL_DATA_CONNECTION_MODE","mapreduce.job.userlog.retain.hours":"24","yarn.scheduler.maximum-allocation-vcores":"4","yarn.nodemanager.log-aggregation.compression-type":"none","dfs.namenode.enable.retrycache":"true","yarn.ipc.rpc.class":"org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC","dfs.namenode.startup.delay.block.deletion.sec":"0","mapreduce.reduce.maxattempts":"4","hadoop.security.dns.log-slow-lookups.enabled":"false","mapreduce.job.committer.setup.cleanup.needed":"true","dfs.datanode.readahead.bytes":"4194304","mapreduce.jobtracker.heartbeats.in.second":"100","mapreduce.job.running.reduce.limit":"0","mapreduce.job.token.tracking.ids.enabled":"*********(redacted)","mapreduce.task.tmp.dir":"./tmp","hadoop.registry.system.acls":"sasl:yarn@, sasl:mapred@, 
sasl:mapred@hdfs@","yarn.nodemanager.recovery.dir":"${hadoop.tmp.dir}/yarn-nm-recovery","fs.s3a.fast.upload.buffer":"disk","mapreduce.jobhistory.intermediate-done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate","yarn.app.mapreduce.shuffle.log.separate":"true","dfs.namenode.delegation.key.update-interval":"86400000","fs.s3a.max.total.tasks":"5","dfs.client.file-block-storage-locations.num-threads":"10","mapreduce.tasktracker.healthchecker.interval":"60000","fs.s3a.readahead.range":"64K","hadoop.http.authentication.simple.anonymous.allowed":"true","fs.s3a.fast.upload":"false","fs.s3a.attempts.maximum":"20","dfs.namenode.avoid.read.stale.datanode":"false","hadoop.registry.zk.connection.timeout.ms":"15000","dfs.https.port":"20102","yarn.nodemanager.health-checker.script.timeout-ms":"1200000","yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size":"10000","mapreduce.map.log.level":"INFO","mapreduce.output.fileoutputformat.compress.type":"BLOCK","hadoop.registry.rm.enabled":"false","mapreduce.ifile.readahead.bytes":"4194304","mapreduce.tasktracker.tasks.sleeptimebeforesigkill":"5000","yarn.resourcemanager.fs.state-store.retry-policy-spec":"2000, 500","dfs.namenode.posix.acl.inheritance.enabled":"false","dfs.blockreport.intervalMsec":"21600000","yarn.nodemanager.linux-container-executor.nonsecure-mode.limit-users":"true","mapreduce.cluster.acls.enabled":"false","mapreduce.job.speculative.retry-after-no-speculate":"1000","dfs.namenode.path.based.cache.refresh.interval.ms":"30000","dfs.namenode.edekcacheloader.interval.ms":"1000","file.stream-buffer-size":"4096","mapreduce.map.output.compress.codec":"org.apache.hadoop.io.compress.SnappyCodec","mapreduce.map.speculative":"false","dfs.disk.balancer.max.disk.errors":"5","dfs.datanode.use.datanode.hostname":"false","mapreduce.job.speculative.retry-after-speculate":"15000","hadoop.proxyuser.hdfs.hosts":"*","dfs.namenode.fs-limits.min-block-size":"1048576","yarn.nodemanager.linux-container-executor.cgroups.mount":"false","yarn.app.mapreduce.am.container.log.backups":"0","mapreduce.job.reduce.slowstart.completedmaps":"0.8","dfs.client.read.shortcircuit":"false","yarn.timeline-service.http-authentication.type":"simple","hadoop.security.group.mapping.ldap.search.attr.group.name":"cn","hadoop.proxyuser.yarn.groups":"*","dfs.client.cached.conn.retry":"3","dfs.namenode.invalidate.work.pct.per.iteration":"0.32f","hadoop.http.logs.enabled":"true","fs.s3a.block.size":"32M","yarn.nodemanager.logaggregation.threadpool-size-max":"100","dfs.replication.max":"512","dfs.namenode.inotify.max.events.per.rpc":"1000","yarn.resourcemanager.hostname":"0.0.0.0","mapreduce.reduce.shuffle.fetch.retry.enabled":"${yarn.nodemanager.recovery.enabled}","mapreduce.map.memory.mb":"0","mapreduce.task.skip.start.attempts":"2","fs.AbstractFileSystem.hdfs.impl":"org.apache.hadoop.fs.Hdfs","ipc.client.rpc-timeout.ms":"0","fs.s3.maxRetries":"4","dfs.default.chunk.view.size":"32768","mapreduce.input.lineinputformat.linespermap":"1","ipc.client.connect.max.retries.on.timeouts":"45","yarn.timeline-service.leveldb-timeline-store.read-cache-size":"104857600","fs.AbstractFileSystem.har.impl":"org.apache.hadoop.fs.HarFs","mapreduce.job.split.metainfo.maxsize":"10000000","yarn.am.liveness-monitor.expiry-interval-ms":"600000","dfs.client.mmap.retry.timeout.ms":"300000","yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs":"*********(redacted)","dfs.namenode.list.cache.directives.num.responses":"100","fs.s3a.socket.recv.buffer":"8192","dfs.im
age.compress":"false","dfs.namenode.kerberos.principal.pattern":"*","yarn.application.classpath":"$HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*","fs.s3n.multipart.uploads.block.size":"67108864","mapreduce.tasktracker.http.address":"0.0.0.0:50060","yarn.resourcemanager.resource-tracker.address":"test-1.vpc.company.com:8031","hadoop.fuse.timer.period":"5","mapreduce.job.heap.memory-mb.ratio":"0.8","dfs.datanode.hdfs-blocks-metadata.enabled":"true","dfs.namenode.checkpoint.dir":"file://${hadoop.tmp.dir}/dfs/namesecondary","dfs.datanode.max.transfer.threads":"4096","dfs.namenode.edits.asynclogging":"true","nfs.allow.insecure.ports":"true","mapreduce.client.output.filter":"FAILED","hadoop.http.filter.initializers":"org.apache.hadoop.http.lib.StaticUserWebFilter","mapreduce.reduce.memory.mb":"0","s3native.client-write-packet-size":"65536","mapreduce.admin.user.env":"LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH","yarn.timeline-service.hostname":"0.0.0.0","file.replication":"1","yarn.nodemanager.container-metrics.unregister-delay-ms":"10000","hadoop.proxyuser.mapred.hosts":"*","hadoop.proxyuser.oozie.hosts":"*","yarn.nodemanager.log.retain-seconds":"10800","hadoop.proxyuser.mapred.groups":"*","yarn.resourcemanager.keytab":"/etc/krb5.keytab","mapreduce.reduce.merge.inmem.threshold":"1000","dfs.client.https.need-auth":"false","dfs.blockreport.split.threshold":"1000000","dfs.client.block.write.replace-datanode-on-failure.policy":"DEFAULT","mapreduce.shuffle.ssl.enabled":"false","dfs.namenode.write-lock-reporting-threshold-ms":"5000","dfs.block.access.token.enable":"*********(redacted)","yarn.resourcemanager.state-store.max-completed-applications":"${yarn.resourcemanager.max-completed-applications}","httpfs.buffer.size":"4096","dfs.client.file-block-storage-locations.timeout.millis":"1000","dfs.namenode.block-placement-policy.default.prefer-local-node":"true","mapreduce.job.speculative.minimum-allowed-tasks":"10","yarn.log-aggregation.retain-seconds":"-1","dfs.namenode.replication.considerLoad":"true","yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb":"0","mapreduce.jobhistory.max-age-ms":"604800000","hadoop.proxyuser.hdfs.groups":"*","dfs.namenode.retrycache.heap.percent":"0.03f","dfs.datanode.cache.revocation.polling.ms":"500","mapreduce.jobhistory.webapp.address":"test-1.vpc.company.com:19888","dfs.namenode.path.based.cache.block.map.allocation.percent":"0.25","mapreduce.jobtracker.system.dir":"${hadoop.tmp.dir}/mapred/system","mapreduce.tasktracker.taskmemorymanager.monitoringinterval":"5000","dfs.journalnode.rpc-address":"0.0.0.0:8485","yarn.client.nodemanager-connect.max-wait-ms":"180000","yarn.resourcemanager.webapp.address":"test-1.vpc.company.com:8088","mapreduce.jobhistory.recovery.enable":"false","dfs.client.short.circuit.replica.stale.threshold.ms":"1800000","mapreduce.reduce.shuffle.parallelcopies":"10","fs.trash.interval":"1","dfs.namenode.replication.interval":"3","yarn.app.mapreduce.client.max-retries":"3","hadoop.security.authentication":"simple","dfs.namenode.top.enabled":"true","mapreduce.task.profile.reduce.params":"${mapreduce.task.profile.params}","dfs.datanode.du.reserved":"0","yarn.app.mapreduce.am.resource.mb":"1024","mapreduce.input.fileinputformat.list-status.num-threads":"1","dfs.namenode.lazypersist.file.scrub.interval.sec":"300","yarn.nodemanager.container-executor.class":"org.apache.hadoop.yarn.server.nodema
nager.DefaultContainerExecutor","io.mapfile.bloom.size":"1048576","yarn.timeline-service.ttl-ms":"604800000","yarn.nodemanager.resource.cpu-vcores":"8","mapreduce.job.reduces":"6","fs.s3a.multipart.size":"64M","yarn.scheduler.minimum-allocation-vcores":"1","dfs.namenode.reencrypt.batch.size":"1000","mapreduce.job.speculative.speculative-cap-total-tasks":"0.01","dfs.datanode.http.address":"0.0.0.0:50075","hadoop.ssl.client.conf":"ssl-client.xml","mapreduce.job.queuename":"default","fs.s3a.metadatastore.authoritative":"false","ha.health-monitor.sleep-after-disconnect.ms":"1000","s3.bytes-per-checksum":"512","yarn.app.mapreduce.shuffle.log.limit.kb":"0","dfs.namenode.list.cache.pools.num.responses":"100","hadoop.security.group.mapping":"org.apache.hadoop.security.ShellBasedUnixGroupsMapping","mapreduce.jobhistory.jhist.format":"binary","yarn.resourcemanager.ha.enabled":"false","dfs.encrypt.data.transfer":"false","hadoop.http.staticuser.user":"dr.who","mapreduce.task.exit.timeout.check-interval-ms":"20000","mapreduce.task.exit.timeout":"60000","yarn.nodemanager.linux-container-executor.resources-handler.class":"org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler","mapreduce.reduce.shuffle.memory.limit.percent":"0.25","mapreduce.job.redacted-properties":"*********(redacted)","dfs.namenode.top.windows.minutes":"1,5,25","s3.client-write-packet-size":"65536","mapreduce.map.output.compress":"true","ha.zookeeper.acl":"world:anyone:rwcda","ipc.server.max.connections":"0","yarn.scheduler.maximum-allocation-mb":"12288","yarn.resourcemanager.scheduler.monitor.policies":"org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy","yarn.app.mapreduce.am.container.log.limit.kb":"0","s3native.blocksize":"67108864","ipc.client.connect.retry.interval":"1000","hadoop.proxyuser.httpfs.groups":"*","yarn.resourcemanager.zk-state-store.parent-path":"/rmstore","dfs.namenode.edit.log.autoroll.check.interval.ms":"300000","mapreduce.jobhistory.cleaner.enable":"true","hadoop.security.kms.client.encrypted.key.cache.expiry":"43200000","hadoop.proxyuser.httpfs.hosts":"*","dfs.client.use.datanode.hostname":"false","dfs.stream-buffer-size":"4096","yarn.client.nodemanager-client-async.thread-pool-max-size":"500","mapreduce.map.maxattempts":"4","dfs.datanode.drop.cache.behind.writes":"false","mapreduce.tasktracker.dns.nameserver":"default","yarn.nodemanager.sleep-delay-before-sigkill.ms":"250","mapreduce.job.end-notification.retry.attempts":"0","hadoop.proxyuser.yarn.hosts":"*","yarn.resourcemanager.zk-num-retries":"1000","dfs.client.failover.max.attempts":"15","mapreduce.tasktracker.indexcache.mb":"10","hadoop.registry.zk.root":"/registry","adl.feature.ownerandgroup.enableupn":"false","mapreduce.job.reduce.shuffle.consumer.plugin.class":"org.apache.hadoop.mapreduce.task.reduce.Shuffle","yarn.resourcemanager.delayed.delegation-token.removal-interval-ms":"*********(redacted)","dfs.namenode.snapshotdiff.allow.snap-root-descendant":"true","yarn.nodemanager.localizer.cache.target-size-mb":"10240","zlib.compress.level":"DEFAULT_COMPRESSION","ftp.client-write-packet-size":"65536","mapreduce.jobtracker.maxtasks.perjob":"-1","fs.AbstractFileSystem.adl.impl":"org.apache.hadoop.fs.adl.Adl","hadoop.proxyuser.hive.hosts":"*","dfs.block.access.token.lifetime":"*********(redacted)","dfs.namenode.max.extra.edits.segments.retained":"10000","yarn.client.failover-retries":"0","fs.s3a.multipart.purge.age":"86400","dfs.image.transfer.bandwidthPerSec":"0","io.native.lib.available":"tru
e","net.topology.node.switch.mapping.impl":"org.apache.hadoop.net.ScriptBasedMapping","ipc.server.listen.queue.size":"128","dfs.namenode.edekcacheloader.initial.delay.ms":"3000","map.sort.class":"org.apache.hadoop.util.QuickSort","dfs.namenode.acls.enabled":"false","hadoop.security.kms.client.authentication.retry-count":"1","fs.permissions.umask-mode":"022","dfs.datanode.ipc.address":"0.0.0.0:50020","yarn.nodemanager.vmem-check-enabled":"false","yarn.nodemanager.recovery.compaction-interval-secs":"3600","yarn.app.mapreduce.client-am.ipc.max-retries":"3","dfs.lock.suppress.warning.interval":"10s","dfs.client.block.write.retries":"3","mapreduce.job.ubertask.maxreduces":"1","hadoop.security.kms.client.encrypted.key.cache.size":"500","hadoop.security.java.secure.random.algorithm":"SHA1PRNG","ha.failover-controller.cli-check.rpc-timeout.ms":"20000","mapreduce.application.classpath":"$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$MR2_CLASSPATH","yarn.client.nodemanager-connect.retry-interval-ms":"10000","dfs.client-write-packet-size":"65536","yarn.nodemanager.env-whitelist":"JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME","dfs.datanode.dns.nameserver":"default","yarn.nodemanager.webapp.address":"${yarn.nodemanager.hostname}:8042","rpc.metrics.quantile.enable":"false","mapreduce.jobhistory.admin.acl":"*","yarn.resourcemanager.system-metrics-publisher.dispatcher.pool-size":"10","hadoop.http.authentication.kerberos.keytab":"${user.home}/hadoop.keytab","dfs.image.transfer.timeout":"60000","yarn.resourcemanager.recovery.enabled":"false","dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold":"10737418240","dfs.client.failover.connection.retries.on.timeouts":"0"},"System Properties":{"java.io.tmpdir":"/tmp","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.security.egd":"file:///dev/urandom","java.vm.specification.version":"1.8","user.home":"/home/systest","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","sun.arch.data.model":"64","sun.boot.library.path":"/usr/java/jdk1.8.0_144/jre/lib/amd64","user.dir":"/tmp","java.library.path":":/opt/cloudera/parcels/CDH/lib/hadoop/lib/native:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib","sun.cpu.isalist":"","os.arch":"amd64","java.vm.version":"25.144-b01","jetty.git.hash":"27208684755d94a92186989f695db2d7b21ebc51","java.endorsed.dirs":"/usr/java/jdk1.8.0_144/jre/lib/endorsed","java.runtime.version":"1.8.0_144-b01","java.vm.info":"mixed mode","java.ext.dirs":"/usr/java/jdk1.8.0_144/jre/lib/ext:/usr/java/packages/lib/ext","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/usr/java/jdk1.8.0_144/jre/lib/resources.jar:/usr/java/jdk1.8.0_144/jre/lib/rt.jar:/usr/java/jdk1.8.0_144/jre/lib/sunrsasign.jar:/usr/java/jdk1.8.0_144/jre/lib/jsse.jar:/usr/java/jdk1.8.0_144/jre/lib/jce.jar:/usr/java/jdk1.8.0_144/jre/lib/charsets.jar:/usr/java/jdk1.8.0_144/jre/lib/jfr.jar:/usr/java/jdk1.8.0_144/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Los_Angeles","java.specification.vendor":"Oracle 
Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"3.10.0-514.26.2.el7.x86_64","sun.os.patch.level":"unknown","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","user.language":"en","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.print.PSPrinterJob","java.awt.graphicsenv":"sun.awt.X11GraphicsEnvironment","awt.toolkit":"sun.awt.X11.XToolkit","os.name":"Linux","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"systest","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --master yarn --deploy-mode client --conf spark.driver.memory=2g --conf spark.executor.heartbeatInterval=1000 --conf spark.executor.metrics.pollingInterval=100 --conf spark.eventLog.logStageExecutorProcessTreeMetrics.enabled=true --conf spark.yarn.maxAppAttempts=1 --conf spark.locality.wait.process=0 --conf spark.executor.memoryOverhead=1024 --conf spark.executor.extraJavaOptions=-Djava.security.egd=file:///dev/urandom --conf spark.eventLog.logStageExecutorMetrics.enabled=true --conf spark.driver.extraJavaOptions=-Djava.security.egd=file:///dev/urandom --class com.company.spark.LargeBlocks --num-executors 3 --executor-memory 7g /tmp/__spark_test__/spark3-tests-0.1.0-cdh5.9.0-SNAPSHOT-jar-with-dependencies.jar --targetBlockSizeGb 2.5 --taskSleepMillis 200 --doCache true --cacheOnDisk true --replicas 1 --concurrentReadJobs 2","java.home":"/usr/java/jdk1.8.0_144/jre","java.version":"1.8.0_144","sun.io.unicode.encoding":"UnicodeLittle"},"Classpath Entries":{"/opt/cloudera/parcels/CDH/jars/jackson-mapper-asl-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/joni-2.1.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-core-2.2.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-external-blockcache-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-json-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/xz-1.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-annotations-2.2.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/cglib-2.2.1-v20090111.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-procedure-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-jackson-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/asm-3.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xml-apis-1.3.04.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-jvm-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-dbcp-1.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-streaming_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-xml-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/minlog-1.3.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hsqldb-1.8.0.10.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/datanucleus-rdbms-3.2.9.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/pmml-model-1.4.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spire-macros_2.12-0.13.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/zstd-jni-1.3.2-2.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/httpcore-4.2.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jta-1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-yarn_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-logging-1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/activation-1.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xbean-asm7-shaded-4.12.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/paranamer-2.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/lib/hadoop/NOTICE.txt":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arrow-format-0.12.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/httpclient-4.2.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/netty-3.9.9.Final.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/htrace-core-3.1.0-incubating.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.ws.rs-api-2.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-container-servlet-core-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-nativetask-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-client-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-xc-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-configuration-1.6.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-math3-3.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jsp-api-2.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-auth-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-registry-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/JavaEWAH-0.3.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-graphite-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/joda-time-2.9.9.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/aopalliance-1.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-gridmix-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-compress-1.4.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javolution-5.5.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-beanutils-1.7.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/flatbuffers-java-1.9.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/core-1.1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/antlr-runtime-3.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-hdfs-nfs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-hdfs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-net-3.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/curator-framework-2.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/snappy-java-1.1.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-assembly_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-examples-1.2.0-cdh5.15.2.jar":"System 
Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/shapeless_2.12-2.3.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-hs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xercesImpl-2.9.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-server-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-lang-2.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jtransforms-2.4.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/snappy-0.2.jar":"System Classpath","/etc/spark2/conf/yarn-conf/":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-cli-1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-core-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.annotation-api-1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-util-6.1.26.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stax-api-1.0-2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javassist-3.18.1-GA.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-applicationhistoryservice-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/kafka-clients-0.9.0-kafka-2.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/guice-3.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/antlr4-runtime-4.7.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/core-3.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/opencsv-2.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-datajoin-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-common-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jdo-api-3.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-webapp-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-compiler-3.0.11.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-annotations-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/libthrift-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/kafka_2.11-0.9.0-kafka-2.0.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-azure-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-proxy-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/antlr-2.7.7.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jettison-1.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-core-asl-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/libfb303-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/zookeeper-3.4.5-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-resource-bundle-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jaxb-api-2.2.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-streaming-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-hadoop2-compat-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/pyrolite-4.13.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/activation-1.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/aircompressor-0.10.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-ant-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jline-2.11.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/netty-all-4.0.23.Final.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-media-jaxb-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-servlets-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-xml_2.12-1.0.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-graphx_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-hs-plugins-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-web-proxy-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-math-2.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/avro-1.8.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-kvstore_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/objenesis-2.5.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/apacheds-i18n-2.0.0-M15.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stream-2.7.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-aws-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-module-scala_2.12-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/spymemcached-2.11.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/api-util-1.0.0-M20.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-hadoop-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/orc-core-1.5.5-nohive.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/mockito-all-1.8.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jets3t-0.9.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-xc-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-rsgroup-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hive-metastore-1.2.1.spark2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/py4j-0.10.8.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-applications-distributedshell-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/osgi-resource-locator-1.0.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-rumen-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsch-0.1.42.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/zkclient-0.7.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xmlenc-0.52.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-io-2.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-core-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jamon-runtime-2.4.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.inject-2.4.0-b34.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/hbase-common-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-common-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/netty-3.10.5.Final.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-app-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/machinist_2.12-0.6.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-util-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-network-shuffle_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/chill_2.12-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/findbugs-annotations-1.3.9-1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-rest-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-hadoop-compat-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/ST4-4.0.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jruby-cloudera-1.0.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-hive_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/leveldbjni-all-1.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-core_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-distcp-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-server-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/metrics-core-2.2.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/htrace-core-3.2.0-incubating.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/orc-mapreduce-1.5.5-nohive.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jcl-over-slf4j-1.7.16.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-digester-1.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-http-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-jackson_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/metrics-core-2.2.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-math3-3.4.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/slf4j-log4j12-1.7.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-jobclient-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-shell-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-server-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/chill-java-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hk2-locator-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stax-api-1.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-parser-combinators_2.12-1.1.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/RoaringBitmap-0.5.11.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsp-api-2.1-6.1.14.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/log4j-1.2.16.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.inject-1.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/jasper-compiler-5.5.23.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-security-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-shuffle-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-nfs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/netty-all-4.1.30.Final.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jaxb-api-2.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/breeze-macros_2.12-0.13.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jodd-core-3.5.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/azure-data-lake-store-sdk-2.2.9.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/guava-12.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-codec-1.10.jar":"System Classpath","/usr/java/jdk1.8.0_144/lib/tools.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-nodemanager-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-sql_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hive-exec-1.2.1.spark2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jcodings-1.0.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-azure-datalake-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/apacheds-kerberos-codec-2.0.0-M15.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/guava-11.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/avro-mapred-1.8.2-hadoop2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-catalyst_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-io-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-applications-unmanaged-am-launcher-2.6.0-cdh5.15.2.jar":"System Classpath","/etc/spark2/conf/":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-sslengine-6.1.26.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/httpclient-4.5.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-continuation-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-ast_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-jndi-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsr305-3.0.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/avro-1.7.6-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/univocity-parsers-2.7.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-annotations-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/api-asn1-api-1.0.0-M20.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jettison-1.3.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hk2-utils-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hppc-0.7.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/logredactor-1.0.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arrow-vector-0.12.0.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/jaxb-impl-2.2.3-1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-archives-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-plus-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-client-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hue-plugins-3.9.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/high-scale-lib-1.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-jaxrs-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-collections-3.2.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-library-2.12.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/java-xmlbuilder-0.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/apache-log4j-extras-1.2.17.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-format-2.4.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/paranamer-2.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spire_2.12-0.13.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/disruptor-3.3.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-mllib-local_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-pool-1.5.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-servlet-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arpack_combined_all-0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-annotations-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsp-2.1-6.1.14.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/log4j-1.2.17.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-repl_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/derby-10.12.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/orc-shims-1.5.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-core-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/aopalliance-repackaged-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hk2-api-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-daemon-1.0.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/ivy-2.4.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hamcrest-core-1.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-lang3-3.8.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/spark-streaming-kafka-0-8_2.11-2.4.0.cloudera1-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-databind-2.2.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/kryo-shaded-4.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-reflect-2.12.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-examples-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/httpcore-4.4.10.jar":"System 
Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arrow-memory-0.12.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/lib/hadoop/LICENSE.txt":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-compress-1.8.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/curator-client-2.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/oro-2.0.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-thrift-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/avro-ipc-1.8.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-httpclient-3.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-encoding-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/okhttp-2.4.0.jar":"System Classpath","spark://test-1.vpc.company.com:34194/jars/spark3-tests-0.1.0-cdh5.9.0-SNAPSHOT-jar-with-dependencies.jar":"Added By User","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-compiler-2.12.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-prefix-tree-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-6.1.26.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-api-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-codec-1.9.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-beanutils-core-1.8.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/lz4-1.3.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/janino-3.0.11.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/libthrift-0.12.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-launcher_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-core_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/lz4-java-1.5.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-jaxrs-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-network-common_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/macro-compat_2.12-1.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/breeze_2.12-0.13.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-unsafe_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-sls-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/snappy-java-1.0.4.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-guava-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/validation-api-1.1.0.Final.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/microsoft-windowsazure-storage-sdk-0.6.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/compress-lzf-1.0.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-openstack-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-mllib_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-client-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-it-1.2.0-cdh5.15.2.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-extras-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-container-servlet-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/curator-recipes-2.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-column-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/unused-1.0.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-crypto-1.0.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-databind-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-codec-1.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/htrace-core4-4.0.1-incubating.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/guava-14.0.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/aws-java-sdk-bundle-1.11.134.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/zookeeper-3.4.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-module-paranamer-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-protocol-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jasper-runtime-5.5.23.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-scalap_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-logging-1.1.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-resourcemanager-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-archive-logs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/paranamer-2.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/metrics-core-3.0.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-beanutils-1.9.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jul-to-slf4j-1.7.16.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stringtemplate-3.2.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/gson-2.2.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/okio-1.4.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/slf4j-api-1.7.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jsr305-3.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/bonecp-0.8.0.RELEASE.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-sketch_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-el-1.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-tags_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/datanucleus-core-3.2.10.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/protobuf-java-2.5.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-hadoop-bundle-1.6.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xz-1.5.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-client-2.6.0-cdh5.15.2.jar":"System Classpath"}} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/usr/java/jdk1.8.0_144/jre","Java Version":"1.8.0_144 (Oracle Corporation)","Scala Version":"version 2.12.8"},"Spark Properties":{"spark.lineage.log.dir":"/var/log/spark2/lineage","spark.serializer":"org.apache.spark.serializer.KryoSerializer","spark.yarn.jars":"local:/opt/cloudera/parcels/SPARK2/lib/spark2/jars/*","spark.executor.extraJavaOptions":"-Djava.security.egd=file:///dev/urandom","spark.driver.host":"test-1.vpc.company.com","spark.eventLog.enabled":"true","spark.executor.heartbeatInterval":"1000","spark.executor.memoryOverhead":"1024","spark.driver.port":"34194","spark.shuffle.service.enabled":"false","spark.driver.extraLibraryPath":"/opt/cloudera/parcels/CDH/lib/hadoop/lib/native","spark.lineage.enabled":"false","spark.jars":"file:/tmp/__spark_test__/spark3-tests-0.1.0-cdh5.9.0-SNAPSHOT-jar-with-dependencies.jar","spark.executor.metrics.pollingInterval":"100","spark.yarn.historyServer.address":"http://test-1.vpc.company.com:18089","spark.ui.enabled":"true","spark.app.name":"LargeBlocks","spark.ui.killEnabled":"true","spark.sql.hive.metastore.jars":"${env:HADOOP_COMMON_HOME}/../hive/lib/*:${env:HADOOP_COMMON_HOME}/client/*","spark.locality.wait.process":"0","spark.dynamicAllocation.schedulerBacklogTimeout":"1","spark.yarn.am.extraLibraryPath":"/opt/cloudera/parcels/CDH/lib/hadoop/lib/native","spark.scheduler.mode":"FIFO","spark.eventLog.logStageExecutorMetrics":"true","spark.driver.memory":"2g","spark.executor.instances":"3","spark.submit.pyFiles":"","spark.yarn.config.gatewayPath":"/opt/cloudera/parcels","spark.executor.id":"driver","spark.yarn.config.replacementPath":"{{HADOOP_COMMON_HOME}}/../../..","spark.driver.extraJavaOptions":"-Djava.security.egd=file:///dev/urandom","spark.eventLog.logStageExecutorProcessTreeMetrics.enabled":"true","spark.submit.deployMode":"client","spark.shuffle.service.port":"7337","spark.yarn.maxAppAttempts":"1","spark.master":"yarn","spark.authenticate":"false","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.network.crypto.enabled":"false","spark.executor.extraLibraryPath":"/opt/cloudera/parcels/CDH/lib/hadoop/lib/native","spark.executor.memory":"7g","spark.io.encryption.enabled":"false","spark.eventLog.dir":"hdfs://test-1.vpc.company.com:8020/user/spark/spark2ApplicationHistory","spark.dynamicAllocation.enabled":"false","spark.sql.catalogImplementation":"hive","spark.executor.cores":"1","spark.driver.appUIAddress":"http://test-1.vpc.company.com:4040","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"test-1.vpc.company.com","spark.dynamicAllocation.minExecutors":"0","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://test-1.vpc.company.com:8088/proxy/application_1553914137147_0018","spark.dynamicAllocation.executorIdleTimeout":"60","spark.app.id":"application_1553914137147_0018","spark.sql.hive.metastore.version":"1.1.0"},"Hadoop 
Properties":{"yarn.resourcemanager.amlauncher.thread-count":"50","dfs.namenode.resource.check.interval":"5000","fs.s3a.connection.maximum":"100","mapreduce.jobtracker.jobhistory.task.numberprogresssplits":"12","dfs.data.transfer.server.tcpnodelay":"true","mapreduce.tasktracker.healthchecker.script.timeout":"600000","fs.s3a.impl":"org.apache.hadoop.fs.s3a.S3AFileSystem","yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms":"1000","hadoop.security.kms.client.timeout":"60","hadoop.http.authentication.kerberos.principal":"HTTP/_HOST@LOCALHOST","mapreduce.jobhistory.loadedjob.tasks.max":"-1","mapreduce.framework.name":"yarn","yarn.nodemanager.linux-container-executor.nonsecure-mode.user-pattern":"^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$","dfs.cachereport.intervalMsec":"10000","dfs.namenode.checkpoint.txns":"1000000","tfile.fs.output.buffer.size":"262144","yarn.app.mapreduce.am.job.task.listener.thread-count":"30","mapreduce.tasktracker.local.dir.minspacekill":"0","hadoop.security.groups.cache.background.reload.threads":"3","dfs.namenode.lease-recheck-interval-ms":"2000","fs.s3.block.size":"67108864","dfs.client.domain.socket.data.traffic":"false","dfs.ha.zkfc.nn.http.timeout.ms":"20000","hadoop.registry.secure":"false","hadoop.hdfs.configuration.version":"1","dfs.bytes-per-checksum":"512","fs.s3.buffer.dir":"${hadoop.tmp.dir}/s3","mapreduce.job.acl-view-job":" ","fs.s3a.s3guard.ddb.background.sleep":"25","mapreduce.jobhistory.loadedjobs.cache.size":"5","mapreduce.jobtracker.persist.jobstatus.hours":"1","fs.s3a.s3guard.ddb.table.create":"false","dfs.datanode.slow.io.warning.threshold.ms":"300","dfs.namenode.handler.count":"10","dfs.namenode.list.reencryption.status.num.responses":"100","mapreduce.input.fileinputformat.split.minsize":"0","dfs.datanode.failed.volumes.tolerated":"0","yarn.resourcemanager.container.liveness-monitor.interval-ms":"600000","yarn.resourcemanager.amliveliness-monitor.interval-ms":"1000","yarn.resourcemanager.client.thread-count":"50","io.seqfile.compress.blocksize":"1000000","mapreduce.tasktracker.http.threads":"40","dfs.namenode.retrycache.expirytime.millis":"600000","dfs.namenode.backup.address":"0.0.0.0:50100","dfs.datanode.data.dir":"file://${hadoop.tmp.dir}/dfs/data","dfs.datanode.shared.file.descriptor.paths":"/dev/shm,/tmp","dfs.replication":"3","mapreduce.jobtracker.jobhistory.block.size":"3145728","dfs.encrypt.data.transfer.cipher.key.bitlength":"128","mapreduce.reduce.shuffle.fetch.retry.interval-ms":"1000","dfs.secondary.namenode.kerberos.internal.spnego.principal":"${dfs.web.authentication.kerberos.principal}","mapreduce.task.profile.maps":"0-2","dfs.datanode.block-pinning.enabled":"false","yarn.nodemanager.admin-env":"MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX","mapreduce.jobtracker.retiredjobs.cache.size":"1000","mapreduce.am.max-attempts":"2","hadoop.security.kms.client.failover.sleep.base.millis":"100","mapreduce.jobhistory.webapp.https.address":"test-1.vpc.company.com:19890","fs.trash.checkpoint.interval":"0","dfs.namenode.checkpoint.check.period":"60","yarn.nodemanager.container-monitor.interval-ms":"3000","mapreduce.job.map.output.collector.class":"org.apache.hadoop.mapred.MapTask$MapOutputBuffer","hadoop.http.authentication.signature.secret.file":"*********(redacted)","hadoop.jetty.logs.serve.aliases":"true","hadoop.proxyuser.HTTP.groups":"*","yarn.timeline-service.handler-thread-count":"10","yarn.resourcemanager.max-completed-applications":"10000","dfs.namenode.reencrypt.edek.threads":"10","yarn.resourcemanager.system-metrics-publisher.enabled":"false","h
adoop.security.groups.negative-cache.secs":"30","yarn.app.mapreduce.task.container.log.backups":"0","hadoop.security.group.mapping.ldap.posix.attr.gid.name":"gidNumber","ipc.client.fallback-to-simple-auth-allowed":"false","dfs.namenode.fs-limits.max-component-length":"255","mapreduce.tasktracker.taskcontroller":"org.apache.hadoop.mapred.DefaultTaskController","yarn.client.failover-proxy-provider":"org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider","yarn.timeline-service.http-authentication.simple.anonymous.allowed":"true","ha.health-monitor.check-interval.ms":"1000","dfs.namenode.top.window.num.buckets":"10","yarn.resourcemanager.store.class":"org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore","dfs.datanode.block.id.layout.upgrade.threads":"12","mapreduce.jobtracker.tasktracker.maxblacklists":"4","yarn.nodemanager.docker-container-executor.exec-name":"/usr/bin/docker","yarn.resourcemanager.nodemanagers.heartbeat-interval-ms":"1000","hadoop.common.configuration.version":"0.23.0","fs.s3a.s3guard.ddb.table.capacity.read":"500","yarn.nodemanager.remote-app-log-dir-suffix":"logs","dfs.namenode.decommission.max.concurrent.tracked.nodes":"100","file.blocksize":"67108864","hadoop.registry.zk.retry.ceiling.ms":"60000","mapreduce.jobhistory.principal":"jhs/_HOST@REALM.TLD","dfs.client.read.shortcircuit.skip.checksum":"false","mapreduce.task.profile.reduces":"0-2","dfs.datanode.address":"0.0.0.0:50010","dfs.https.server.keystore.resource":"ssl-server.xml","yarn.timeline-service.webapp.https.address":"${yarn.timeline-service.hostname}:8190","yarn.resourcemanager.scheduler.address":"test-1.vpc.company.com:8030","mapreduce.task.timeout":"600000","hadoop.security.crypto.cipher.suite":"AES/CTR/NoPadding","yarn.resourcemanager.connect.max-wait.ms":"900000","fs.defaultFS":"hdfs://test-1.vpc.company.com:8020","fs.har.impl.disable.cache":"true","io.compression.codec.bzip2.library":"system-native","dfs.namenode.audit.loggers":"default","dfs.block.access.key.update.interval":"600","mapreduce.shuffle.connection-keep-alive.timeout":"5","yarn.resourcemanager.webapp.https.address":"test-1.vpc.company.com:8090","dfs.namenode.max.objects":"0","mapreduce.jobhistory.address":"test-1.vpc.company.com:10020","yarn.nodemanager.address":"${yarn.nodemanager.hostname}:0","fs.AbstractFileSystem.s3a.impl":"org.apache.hadoop.fs.s3a.S3A","mapreduce.task.combine.progress.records":"10000","dfs.namenode.max.full.block.report.leases":"6","yarn.resourcemanager.am.max-attempts":"2","yarn.nodemanager.linux-container-executor.cgroups.hierarchy":"/hadoop-yarn","dfs.client.mmap.cache.timeout.ms":"3600000","dfs.mover.max-no-move-interval":"60000","fs.ftp.transfer.mode":"BLOCK_TRANSFER_MODE","dfs.client.datanode-restart.timeout":"30","dfs.datanode.drop.cache.behind.reads":"false","ipc.server.log.slow.rpc":"false","dfs.namenode.read-lock-reporting-threshold-ms":"5000","yarn.app.mapreduce.am.job.committer.cancel-timeout":"60000","yarn.nodemanager.default-container-executor.log-dirs.permissions":"710","dfs.namenode.checkpoint.edits.dir":"${dfs.namenode.checkpoint.dir}","yarn.app.attempt.diagnostics.limit.kc":"64","dfs.balancer.block-move.timeout":"0","dfs.client.block.write.replace-datanode-on-failure.enable":"true","ftp.bytes-per-checksum":"512","yarn.nodemanager.resource.memory-mb":"8192","io.compression.codecs":"org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.comp
ress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec","fs.s3a.fast.upload.active.blocks":"4","hadoop.security.credential.clear-text-fallback":"true","dfs.heartbeat.interval":"3","mapreduce.jobhistory.joblist.cache.size":"20000","fs.ftp.host":"0.0.0.0","dfs.ha.tail-edits.period":"60","dfs.datanode.max.locked.memory":"0","dfs.datanode.scan.period.hours":"504","mapreduce.jobtracker.expire.trackers.interval":"600000","yarn.resourcemanager.nodemanager-connect-retries":"10","hadoop.security.kms.client.encrypted.key.cache.low-watermark":"0.3f","yarn.timeline-service.client.max-retries":"30","dfs.ha.fencing.ssh.connect-timeout":"30000","yarn.log-aggregation-enable":"false","mapreduce.reduce.markreset.buffer.percent":"0.0","fs.AbstractFileSystem.viewfs.impl":"org.apache.hadoop.fs.viewfs.ViewFs","dfs.namenode.edits.noeditlogchannelflush":"false","mapreduce.task.io.sort.factor":"64","mapreduce.tasktracker.outofband.heartbeat":"false","ha.failover-controller.new-active.rpc-timeout.ms":"60000","dfs.webhdfs.ugi.expire.after.access":"600000","mapreduce.jobhistory.datestring.cache.size":"200000","mapreduce.job.acl-modify-job":" ","dfs.namenode.https-address":"test-1.vpc.company.com:20102","yarn.am.blacklisting.enabled":"true","yarn.timeline-service.webapp.address":"${yarn.timeline-service.hostname}:8188","dfs.image.transfer-bootstrap-standby.bandwidthPerSec":"0","yarn.app.mapreduce.am.job.committer.commit-window":"10000","yarn.nodemanager.container-manager.thread-count":"20","yarn.timeline-service.ttl-enable":"true","mapreduce.jobhistory.recovery.store.fs.uri":"${hadoop.tmp.dir}/mapred/history/recoverystore","hadoop.proxyuser.hive.groups":"*","ha.zookeeper.session-timeout.ms":"5000","mapreduce.map.java.opts":"-Djava.net.preferIPv4Stack=true","tfile.io.chunk.size":"1048576","fs.s3a.s3guard.ddb.table.capacity.write":"100","mapreduce.job.speculative.slowtaskthreshold":"1.0","io.serializations":"org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization","hadoop.security.kms.client.failover.sleep.max.millis":"2000","hadoop.security.group.mapping.ldap.directory.search.timeout":"10000","dfs.ha.automatic-failover.enabled":"false","mapreduce.job.counters.groups.max":"50","dfs.namenode.decommission.interval":"30","fs.swift.impl":"org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem","yarn.nodemanager.local-cache.max-files-per-directory":"8192","dfs.datanode.handler.count":"10","dfs.namenode.xattrs.enabled":"true","dfs.namenode.safemode.threshold-pct":"0.999f","dfs.client.socket.send.buffer.size":"0","mapreduce.map.sort.spill.percent":"0.8","yarn.resourcemanager.webapp.delegation-token-auth-filter.enabled":"*********(redacted)","hadoop.security.group.mapping.ldap.posix.attr.uid.name":"uidNumber","dfs.datanode.sync.behind.writes":"false","dfs.namenode.stale.datanode.interval":"30000","mapreduce.ifile.readahead":"true","yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms":"300000","dfs.datanode.transfer.socket.send.buffer.size":"0","hadoop.security.kms.client.encrypted.key.cache.num.refill.threads":"2","dfs.namenode.reencrypt.throttle.limit.handler.ratio":"1.0","yarn.resourcemanager.scheduler.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler","yarn.app.mapreduce.am.command-opts":"-Djava.net.preferIPv4Stack=true 
-Xmx825955249","dfs.journalnode.https-address":"0.0.0.0:8481","mapreduce.cluster.local.dir":"${hadoop.tmp.dir}/mapred/local","hadoop.proxyuser.hue.hosts":"*","io.mapfile.bloom.error.rate":"0.005","dfs.user.home.dir.prefix":"/user","hadoop.proxyuser.hue.groups":"*","ha.failover-controller.graceful-fence.rpc-timeout.ms":"5000","ftp.replication":"3","mapreduce.jobtracker.persist.jobstatus.dir":"/jobtracker/jobsInfo","hadoop.security.uid.cache.secs":"14400","mapreduce.job.maxtaskfailures.per.tracker":"3","fs.s3a.metadatastore.impl":"org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore","io.skip.checksum.errors":"false","dfs.namenode.snapshot.capture.openfiles":"false","dfs.datanode.directoryscan.interval":"21600","yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts":"3","dfs.client.read.shortcircuit.streams.cache.expiry.ms":"300000","fs.s3a.connection.timeout":"200000","mapreduce.job.max.split.locations":"10","dfs.namenode.write.stale.datanode.ratio":"0.5f","hadoop.registry.zk.session.timeout.ms":"60000","mapreduce.shuffle.transfer.buffer.size":"131072","yarn.timeline-service.client.retry-interval-ms":"1000","mapreduce.jobtracker.taskcache.levels":"2","yarn.http.policy":"HTTP_ONLY","fs.s3a.socket.send.buffer":"8192","hadoop.http.authentication.token.validity":"*********(redacted)","mapreduce.shuffle.max.connections":"0","mapreduce.job.emit-timeline-data":"false","hadoop.kerberos.min.seconds.before.relogin":"60","mapreduce.jobhistory.move.thread-count":"3","dfs.domain.socket.path":"/var/run/hdfs-sockets/dn","yarn.resourcemanager.admin.client.thread-count":"1","mapreduce.jobtracker.persist.jobstatus.active":"true","dfs.namenode.reencrypt.sleep.interval":"1m","fs.s3a.buffer.dir":"${hadoop.tmp.dir}/s3a","hadoop.ssl.enabled.protocols":"TLSv1,SSLv2Hello,TLSv1.1,TLSv1.2","mapreduce.jobhistory.admin.address":"test-1.vpc.company.com:10033","mapreduce.shuffle.port":"13562","yarn.resourcemanager.max-log-aggregation-diagnostics-in-memory":"10","yarn.nodemanager.health-checker.interval-ms":"600000","mapreduce.tasktracker.report.address":"127.0.0.1:0","dfs.namenode.edit.log.autoroll.multiplier.threshold":"2.0","io.seqfile.lazydecompress":"true","ftp.blocksize":"67108864","dfs.namenode.backup.http-address":"0.0.0.0:50105","dfs.disk.balancer.max.disk.throughputInMBperSec":"10","mapreduce.jobtracker.instrumentation":"org.apache.hadoop.mapred.JobTrackerMetricsInst","yarn.client.max-cached-nodemanagers-proxies":"0","yarn.nodemanager.delete.debug-delay-sec":"0","mapreduce.jobtracker.http.address":"0.0.0.0:50030","yarn.nodemanager.pmem-check-enabled":"true","yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":"90.0","mapreduce.app-submission.cross-platform":"false","yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms":"10000","hadoop.security.groups.cache.secs":"300","yarn.resourcemanager.zk-retry-interval-ms":"1000","yarn.scheduler.increment-allocation-mb":"512","nfs.mountd.port":"4242","mapreduce.shuffle.max.threads":"0","hadoop.security.authorization":"false","mapreduce.job.complete.cancel.delegation.tokens":"*********(redacted)","fs.s3a.paging.maximum":"5000","nfs.exports.allowed.hosts":"* 
rw","mapreduce.jobhistory.http.policy":"HTTP_ONLY","dfs.datanode.dns.interface":"default","mapreduce.reduce.java.opts":"-Djava.net.preferIPv4Stack=true","s3native.replication":"3","hadoop.security.group.mapping.ldap.ssl":"false","dfs.namenode.fs-limits.max-xattrs-per-inode":"32","yarn.client.application-client-protocol.poll-interval-ms":"200","hadoop.proxyuser.flume.groups":"*","dfs.namenode.fs-limits.max-xattr-size":"16384","dfs.namenode.maintenance.replication.min":"1","dfs.client.write.exclude.nodes.cache.expiry.interval.millis":"600000","ha.zookeeper.parent-znode":"/hadoop-ha","dfs.namenode.safemode.extension":"30000","mapreduce.reduce.shuffle.merge.percent":"0.66","hadoop.security.group.mapping.ldap.search.filter.group":"(objectClass=group)","dfs.blocksize":"134217728","dfs.namenode.servicerpc-address":"test-1.vpc.company.com:8022","yarn.nodemanager.resourcemanager.minimum.version":"NONE","mapreduce.job.speculative.speculative-cap-running-tasks":"0.1","yarn.admin.acl":"*","yarn.resourcemanager.ha.automatic-failover.enabled":"true","mapreduce.reduce.skip.maxgroups":"0","mapreduce.reduce.shuffle.connect.timeout":"180000","yarn.resourcemanager.address":"test-1.vpc.company.com:8032","ipc.client.ping":"true","fs.adl.oauth2.access.token.provider.type":"*********(redacted)","dfs.namenode.resource.checked.volumes.minimum":"1","hadoop.proxyuser.HTTP.hosts":"*","mapreduce.shuffle.ssl.file.buffer.size":"65536","yarn.resourcemanager.ha.automatic-failover.embedded":"true","dfs.namenode.quota.init-threads":"4","dfs.journalnode.http-address":"0.0.0.0:8480","dfs.block.scanner.volume.bytes.per.second":"1048576","hadoop.ssl.enabled":"false","fs.s3a.multipart.purge":"false","dfs.storage.policy.enabled":"true","mapreduce.job.end-notification.max.attempts":"5","mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec","yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled":"false","dfs.namenode.edits.dir":"${dfs.namenode.name.dir}","ha.health-monitor.connect-retry-interval.ms":"1000","yarn.nodemanager.keytab":"/etc/krb5.keytab","dfs.namenode.support.allow.format":"true","dfs.ha.tail-edits.rolledits.timeout":"60","mapreduce.jobhistory.keytab":"/etc/security/keytab/jhs.service.keytab","fs.s3a.threads.max":"10","mapreduce.reduce.shuffle.input.buffer.percent":"0.70","mapreduce.cluster.temp.dir":"${hadoop.tmp.dir}/mapred/temp","s3.replication":"3","dfs.client.failover.connection.retries":"0","hadoop.tmp.dir":"/tmp/hadoop-${user.name}","mapreduce.job.maps":"2","dfs.namenode.secondary.http-address":"0.0.0.0:50090","mapreduce.job.end-notification.max.retry.interval":"5000","yarn.log-aggregation.retain-check-interval-seconds":"-1","yarn.resourcemanager.resource-tracker.client.thread-count":"50","nfs.wtmax":"1048576","yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size":"10000","nfs.dump.dir":"/tmp/.hdfs-nfs","yarn.resourcemanager.ha.automatic-failover.zk-base-path":"/yarn-leader-election","io.seqfile.local.dir":"${hadoop.tmp.dir}/io/local","mapreduce.client.submit.file.replication":"3","mapreduce.jobhistory.minicluster.fixed.ports":"false","fs.s3a.multipart.threshold":"128M","dfs.namenode.service.handler.count":"10","dfs.datanode.data.dir.perm":"700","mapreduce.jobhistory.done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done","dfs.namenode.name.dir":"file:///dataroot/dataroot/dfs/nn","yarn.resourcemanager.zk-acl":"world:anyone:rwcda","ipc.client.idlethreshold":"4000","yarn.nodemanager.linux-container-executor.cgroups.strict-resource-us
age":"false","mapreduce.reduce.input.buffer.percent":"0.0","fs.ftp.host.port":"21","ipc.ping.interval":"60000","dfs.namenode.num.checkpoints.retained":"2","dfs.namenode.kerberos.internal.spnego.principal":"${dfs.web.authentication.kerberos.principal}","yarn.resourcemanager.admin.address":"test-1.vpc.company.com:8033","file.client-write-packet-size":"65536","hadoop.treat.subject.external":"false","ipc.client.kill.max":"10","mapreduce.reduce.speculative":"false","dfs.disk.balancer.plan.threshold.percent":"10","mapreduce.local.clientfactory.class.name":"org.apache.hadoop.mapred.LocalClientFactory","dfs.client.use.legacy.blockreader":"false","mapreduce.job.reducer.unconditional-preempt.delay.sec":"300","yarn.nodemanager.disk-health-checker.interval-ms":"120000","ipc.client.connection.maxidletime":"10000","mapreduce.task.io.sort.mb":"256","yarn.nodemanager.localizer.client.thread-count":"5","dfs.namenode.checkpoint.max-retries":"3","dfs.namenode.reject-unresolved-dn-topology-mapping":"false","dfs.namenode.delegation.token.max-lifetime":"*********(redacted)","dfs.client.block.write.replace-datanode-on-failure.min-replication":"0","yarn.nodemanager.localizer.cache.cleanup.interval-ms":"600000","hadoop.security.crypto.codec.classes.aes.ctr.nopadding":"org.apache.hadoop.crypto.OpensslAesCtrCryptoCodec,org.apache.hadoop.crypto.JceAesCtrCryptoCodec","fs.s3a.connection.ssl.enabled":"true","yarn.nodemanager.process-kill-wait.ms":"2000","dfs.namenode.num.extra.edits.retained":"1000000","mapreduce.job.hdfs-servers":"${fs.defaultFS}","yarn.scheduler.increment-allocation-vcores":"1","fs.df.interval":"60000","fs.s3.sleepTimeSeconds":"10","fs.s3a.multiobjectdelete.enable":"true","yarn.nodemanager.disk-health-checker.min-healthy-disks":"0.25","hadoop.shell.missing.defaultFs.warning":"true","io.file.buffer.size":"65536","hadoop.work.around.non.threadsafe.getpwuid":"false","dfs.permissions.superusergroup":"supergroup","hadoop.security.group.mapping.ldap.search.attr.member":"member","hadoop.security.random.device.file.path":"/dev/urandom","mapreduce.tasktracker.dns.interface":"default","hadoop.security.sensitive-config-keys":"*********(redacted)","fs.s3a.s3guard.ddb.max.retries":"9","hadoop.rpc.socket.factory.class.default":"org.apache.hadoop.net.StandardSocketFactory","dfs.permissions.enabled":"true","yarn.resourcemanager.connect.retry-interval.ms":"30000","yarn.scheduler.minimum-allocation-mb":"1024","yarn.app.mapreduce.am.staging-dir":"/user","mapreduce.reduce.shuffle.read.timeout":"180000","yarn.app.mapreduce.am.admin.user.env":"LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH","dfs.datanode.https.address":"0.0.0.0:50475","dfs.namenode.hosts.provider.classname":"org.apache.hadoop.hdfs.server.blockmanagement.HostFileManager","dfs.datanode.transfer.socket.recv.buffer.size":"0","fs.s3a.connection.establish.timeout":"5000","dfs.namenode.fslock.fair":"true","mapreduce.job.running.map.limit":"0","hadoop.ssl.require.client.cert":"false","hadoop.kerberos.kinit.command":"kinit","hadoop.fuse.connection.timeout":"300","mapreduce.reduce.log.level":"INFO","hadoop.security.dns.log-slow-lookups.threshold.ms":"1000","mapreduce.job.ubertask.enable":"false","adl.http.timeout":"5000","yarn.nodemanager.vmem-pmem-ratio":"2.1","dfs.client.slow.io.warning.threshold.ms":"30000","hadoop.rpc.protection":"authentication","ha.health-monitor.rpc-timeout.ms":"45000","s3native.stream-buffer-size":"4096","yarn.nodemanager.remote-app-log-dir":"/tmp/logs","fs.s3a.s3guard.cli.prune.age":"86400000","dfs.client.read.shortcircuit
.streams.cache.size":"256","dfs.client.use.legacy.blockreader.local":"false","yarn.app.mapreduce.am.containerlauncher.threadpool-initial-size":"10","fs.s3n.multipart.uploads.enabled":"false","dfs.namenode.path.based.cache.retry.interval.ms":"30000","hadoop.security.crypto.buffer.size":"8192","yarn.client.failover-retries-on-socket-timeouts":"0","dfs.balancer.keytab.enabled":"false","hadoop.security.instrumentation.requires.admin":"false","yarn.nodemanager.delete.thread-count":"4","dfs.datanode.balance.bandwidthPerSec":"10485760","dfs.namenode.name.dir.restore":"false","hadoop.registry.jaas.context":"Client","dfs.client.failover.sleep.max.millis":"15000","yarn.timeline-service.leveldb-timeline-store.path":"${hadoop.tmp.dir}/yarn/timeline","s3.blocksize":"67108864","yarn.am.blacklisting.disable-failure-threshold":"0.8f","io.map.index.interval":"128","mapreduce.job.counters.max":"120","dfs.namenode.max-lock-hold-to-release-lease-ms":"25","dfs.namenode.datanode.registration.ip-hostname-check":"true","yarn.timeline-service.store-class":"org.apache.hadoop.yarn.server.timeline.LeveldbTimelineStore","mapreduce.jobhistory.move.interval-ms":"180000","dfs.namenode.resource.du.reserved":"104857600","dfs.datanode.bp-ready.timeout":"20","yarn.nodemanager.localizer.fetch.thread-count":"4","yarn.resourcemanager.scheduler.client.thread-count":"50","hadoop.ssl.hostname.verifier":"DEFAULT","dfs.namenode.full.block.report.lease.length.ms":"300000","mapreduce.tasktracker.instrumentation":"org.apache.hadoop.mapred.TaskTrackerMetricsInst","mapreduce.job.classloader":"false","mapreduce.task.profile.map.params":"${mapreduce.task.profile.params}","ipc.client.connect.timeout":"20000","s3.stream-buffer-size":"4096","yarn.resourcemanager.nm.liveness-monitor.interval-ms":"1000","yarn.nm.liveness-monitor.expiry-interval-ms":"600000","dfs.namenode.secondary.https-address":"0.0.0.0:50091","s3native.bytes-per-checksum":"512","dfs.namenode.fs-limits.max-directory-items":"1048576","nfs.server.port":"2049","dfs.namenode.delegation.token.renew-interval":"*********(redacted)","mapreduce.jobtracker.address":"local","yarn.nodemanager.recovery.enabled":"false","mapreduce.job.end-notification.retry.interval":"1000","fs.du.interval":"600000","dfs.namenode.list.openfiles.num.responses":"1000","hadoop.security.groups.cache.warn.after.ms":"5000","file.bytes-per-checksum":"512","dfs.namenode.blocks.per.postponedblocks.rescan":"10000","dfs.namenode.checkpoint.period":"3600","hadoop.security.groups.cache.background.reload":"false","yarn.resourcemanager.amlauncher.log.command":"false","net.topology.script.number.args":"100","mapreduce.task.merge.progress.records":"10000","yarn.nodemanager.localizer.address":"${yarn.nodemanager.hostname}:8040","yarn.timeline-service.keytab":"/etc/krb5.keytab","mapreduce.reduce.shuffle.fetch.retry.timeout-ms":"30000","dfs.namenode.snapshot.skip.capture.accesstime-only-change":"false","dfs.webhdfs.user.provider.user.pattern":"^[A-Za-z_][A-Za-z0-9._-]*[$]?$","dfs.webhdfs.acl.provider.permission.pattern":"^(default:)?(user|group|mask|other):[[A-Za-z_][A-Za-z0-9._-]]*:([rwx-]{3})?(,(default:)?(user|group|mask|other):[[A-Za-z_][A-Za-z0-9._-]]*:([rwx-]{3})?)*$","mapreduce.fileoutputcommitter.algorithm.version":"1","yarn.resourcemanager.work-preserving-recovery.enabled":"false","mapreduce.map.skip.maxrecords":"0","mapreduce.jobtracker.handler.count":"10","hadoop.http.authentication.type":"simple","mapreduce.job.jvm.numtasks":"1","hadoop.proxyuser.flume.hosts":"*","mapreduce.task.userlog.limit.kb":"0","yarn.resourcem
anager.scheduler.monitor.enable":"false","fs.s3n.block.size":"67108864","ipc.client.connect.max.retries":"10","hadoop.registry.zk.retry.times":"5","mapreduce.jobtracker.staging.root.dir":"${hadoop.tmp.dir}/mapred/staging","dfs.namenode.http-address":"test-1.vpc.company.com:20101","mapreduce.jobtracker.jobhistory.lru.cache.size":"5","dfs.datanode.directoryscan.threads":"1","dfs.datanode.fsdatasetcache.max.threads.per.volume":"4","dfs.namenode.fs-limits.max-blocks-per-file":"1048576","dfs.disk.balancer.enabled":"false","mapreduce.shuffle.listen.queue.size":"128","mapreduce.tasktracker.local.dir.minspacestart":"0","mapreduce.map.cpu.vcores":"1","hadoop.user.group.static.mapping.overrides":"dr.who=;","dfs.datanode.cache.revocation.timeout.ms":"900000","mapreduce.jobhistory.recovery.store.class":"org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService","dfs.client.mmap.cache.size":"256","dfs.ha.log-roll.period":"120","dfs.client.failover.sleep.base.millis":"500","yarn.resourcemanager.fail-fast":"${yarn.fail-fast}","yarn.resourcemanager.proxy-user-privileges.enabled":"false","mapreduce.job.reducer.preempt.delay.sec":"0","hadoop.util.hash.type":"murmur","dfs.namenode.accesstime.precision":"3600000","yarn.app.mapreduce.client.job.max-retries":"3","mapreduce.reduce.shuffle.retry-delay.max.ms":"60000","mapreduce.task.profile.params":"-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s","yarn.app.mapreduce.shuffle.log.backups":"0","hadoop.registry.zk.retry.interval.ms":"1000","fs.AbstractFileSystem.file.impl":"org.apache.hadoop.fs.local.LocalFs","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds":"-1","dfs.client.context":"default","mapreduce.jobhistory.cleaner.interval-ms":"86400000","hadoop.registry.zk.quorum":"localhost:2181","mapreduce.output.fileoutputformat.compress":"false","yarn.resourcemanager.am-rm-tokens.master-key-rolling-interval-secs":"*********(redacted)","hadoop.ssl.server.conf":"ssl-server.xml","dfs.http.policy":"HTTP_ONLY","dfs.client.https.keystore.resource":"ssl-client.xml","mapreduce.client.completion.pollinterval":"5000","hadoop.ssl.keystores.factory.class":"org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory","yarn.app.mapreduce.am.resource.cpu-vcores":"1","yarn.timeline-service.enabled":"false","yarn.acl.enable":"true","dfs.domain.socket.disable.interval.seconds":"1","dfs.image.transfer.chunksize":"65536","dfs.balancer.max-no-move-interval":"60000","mapreduce.tasktracker.map.tasks.maximum":"2","dfs.namenode.edits.journal-plugin.qjournal":"org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager","mapreduce.task.profile":"false","dfs.webhdfs.enabled":"true","yarn.resourcemanager.fs.state-store.uri":"${hadoop.tmp.dir}/yarn/system/rmstore","yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user":"nobody","dfs.namenode.list.encryption.zones.num.responses":"100","yarn.resourcemanager.configuration.provider-class":"org.apache.hadoop.yarn.LocalConfigurationProvider","dfs.namenode.top.num.users":"10","dfs.disk.balancer.block.tolerance.percent":"10","yarn.nodemanager.resource.percentage-physical-cpu-limit":"100","mapreduce.jobhistory.client.thread-count":"10","tfile.fs.input.buffer.size":"262144","mapreduce.client.progressmonitor.pollinterval":"1000","yarn.nodemanager.log-dirs":"${yarn.log.dir}/userlogs","io.seqfile.sorter.recordlimit":"1000000","hadoop.security.auth_to_local":"DEFAULT","dfs.blockreport.initialDelay":"0","fs.automatic.close":"true","dfs.client.block.write.replace-datanode-on-failure.best-effor
t":"false","dfs.namenode.replication.min":"1","dfs.balancer.address":"0.0.0.0:0","fs.s3n.multipart.copy.block.size":"5368709120","yarn.nodemanager.hostname":"0.0.0.0","nfs.rtmax":"1048576","yarn.resourcemanager.zk-timeout-ms":"10000","ftp.stream-buffer-size":"4096","yarn.fail-fast":"false","hadoop.security.group.mapping.ldap.search.filter.user":"(&(objectClass=user)(sAMAccountName={0}))","dfs.datanode.directoryscan.throttle.limit.ms.per.sec":"1000","yarn.nodemanager.container-localizer.log.level":"INFO","yarn.timeline-service.address":"${yarn.timeline-service.hostname}:10200","dfs.namenode.replication.work.multiplier.per.iteration":"2","mapreduce.job.ubertask.maxmaps":"9","fs.s3a.threads.keepalivetime":"60","dfs.namenode.reencrypt.throttle.limit.updater.ratio":"1.0","dfs.namenode.avoid.write.stale.datanode":"false","dfs.short.circuit.shared.memory.watcher.interrupt.check.ms":"60000","dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction":"0.75f","mapreduce.task.files.preserve.failedtasks":"false","yarn.app.mapreduce.client.job.retry-interval":"2000","ha.failover-controller.graceful-fence.connection.retries":"1","dfs.client.mmap.enabled":"true","mapreduce.reduce.cpu.vcores":"1","hadoop.proxyuser.oozie.groups":"*","fs.client.resolve.remote.symlinks":"true","dfs.image.compression.codec":"org.apache.hadoop.io.compress.DefaultCodec","mapreduce.jobtracker.restart.recover":"false","dfs.namenode.decommission.blocks.per.interval":"500000","mapreduce.tasktracker.reduce.tasks.maximum":"2","yarn.nodemanager.local-dirs":"${hadoop.tmp.dir}/nm-local-dir","mapreduce.shuffle.connection-keep-alive.enable":"false","fs.s3a.path.style.access":"false","yarn.nodemanager.aux-services.mapreduce_shuffle.class":"org.apache.hadoop.mapred.ShuffleHandler","fs.adl.impl":"org.apache.hadoop.fs.adl.AdlFileSystem","yarn.resourcemanager.nodemanager.minimum.version":"NONE","net.topology.impl":"org.apache.hadoop.net.NetworkTopology","io.map.index.skip":"0","dfs.namenode.safemode.min.datanodes":"0","fs.ftp.data.connection.mode":"ACTIVE_LOCAL_DATA_CONNECTION_MODE","mapreduce.job.userlog.retain.hours":"24","yarn.scheduler.maximum-allocation-vcores":"4","yarn.nodemanager.log-aggregation.compression-type":"none","dfs.namenode.enable.retrycache":"true","yarn.ipc.rpc.class":"org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC","dfs.namenode.startup.delay.block.deletion.sec":"0","mapreduce.reduce.maxattempts":"4","hadoop.security.dns.log-slow-lookups.enabled":"false","mapreduce.job.committer.setup.cleanup.needed":"true","dfs.datanode.readahead.bytes":"4194304","mapreduce.jobtracker.heartbeats.in.second":"100","mapreduce.job.running.reduce.limit":"0","mapreduce.job.token.tracking.ids.enabled":"*********(redacted)","mapreduce.task.tmp.dir":"./tmp","hadoop.registry.system.acls":"sasl:yarn@, sasl:mapred@, 
sasl:mapred@hdfs@","yarn.nodemanager.recovery.dir":"${hadoop.tmp.dir}/yarn-nm-recovery","fs.s3a.fast.upload.buffer":"disk","mapreduce.jobhistory.intermediate-done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate","yarn.app.mapreduce.shuffle.log.separate":"true","dfs.namenode.delegation.key.update-interval":"86400000","fs.s3a.max.total.tasks":"5","dfs.client.file-block-storage-locations.num-threads":"10","mapreduce.tasktracker.healthchecker.interval":"60000","fs.s3a.readahead.range":"64K","hadoop.http.authentication.simple.anonymous.allowed":"true","fs.s3a.fast.upload":"false","fs.s3a.attempts.maximum":"20","dfs.namenode.avoid.read.stale.datanode":"false","hadoop.registry.zk.connection.timeout.ms":"15000","dfs.https.port":"20102","yarn.nodemanager.health-checker.script.timeout-ms":"1200000","yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size":"10000","mapreduce.map.log.level":"INFO","mapreduce.output.fileoutputformat.compress.type":"BLOCK","hadoop.registry.rm.enabled":"false","mapreduce.ifile.readahead.bytes":"4194304","mapreduce.tasktracker.tasks.sleeptimebeforesigkill":"5000","yarn.resourcemanager.fs.state-store.retry-policy-spec":"2000, 500","dfs.namenode.posix.acl.inheritance.enabled":"false","dfs.blockreport.intervalMsec":"21600000","yarn.nodemanager.linux-container-executor.nonsecure-mode.limit-users":"true","mapreduce.cluster.acls.enabled":"false","mapreduce.job.speculative.retry-after-no-speculate":"1000","dfs.namenode.path.based.cache.refresh.interval.ms":"30000","dfs.namenode.edekcacheloader.interval.ms":"1000","file.stream-buffer-size":"4096","mapreduce.map.output.compress.codec":"org.apache.hadoop.io.compress.SnappyCodec","mapreduce.map.speculative":"false","dfs.disk.balancer.max.disk.errors":"5","dfs.datanode.use.datanode.hostname":"false","mapreduce.job.speculative.retry-after-speculate":"15000","hadoop.proxyuser.hdfs.hosts":"*","dfs.namenode.fs-limits.min-block-size":"1048576","yarn.nodemanager.linux-container-executor.cgroups.mount":"false","yarn.app.mapreduce.am.container.log.backups":"0","mapreduce.job.reduce.slowstart.completedmaps":"0.8","dfs.client.read.shortcircuit":"false","yarn.timeline-service.http-authentication.type":"simple","hadoop.security.group.mapping.ldap.search.attr.group.name":"cn","hadoop.proxyuser.yarn.groups":"*","dfs.client.cached.conn.retry":"3","dfs.namenode.invalidate.work.pct.per.iteration":"0.32f","hadoop.http.logs.enabled":"true","fs.s3a.block.size":"32M","yarn.nodemanager.logaggregation.threadpool-size-max":"100","dfs.replication.max":"512","dfs.namenode.inotify.max.events.per.rpc":"1000","yarn.resourcemanager.hostname":"0.0.0.0","mapreduce.reduce.shuffle.fetch.retry.enabled":"${yarn.nodemanager.recovery.enabled}","mapreduce.map.memory.mb":"0","mapreduce.task.skip.start.attempts":"2","fs.AbstractFileSystem.hdfs.impl":"org.apache.hadoop.fs.Hdfs","ipc.client.rpc-timeout.ms":"0","fs.s3.maxRetries":"4","dfs.default.chunk.view.size":"32768","mapreduce.input.lineinputformat.linespermap":"1","ipc.client.connect.max.retries.on.timeouts":"45","yarn.timeline-service.leveldb-timeline-store.read-cache-size":"104857600","fs.AbstractFileSystem.har.impl":"org.apache.hadoop.fs.HarFs","mapreduce.job.split.metainfo.maxsize":"10000000","yarn.am.liveness-monitor.expiry-interval-ms":"600000","dfs.client.mmap.retry.timeout.ms":"300000","yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs":"*********(redacted)","dfs.namenode.list.cache.directives.num.responses":"100","fs.s3a.socket.recv.buffer":"8192","dfs.im
age.compress":"false","dfs.namenode.kerberos.principal.pattern":"*","yarn.application.classpath":"$HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*","fs.s3n.multipart.uploads.block.size":"67108864","mapreduce.tasktracker.http.address":"0.0.0.0:50060","yarn.resourcemanager.resource-tracker.address":"test-1.vpc.company.com:8031","hadoop.fuse.timer.period":"5","mapreduce.job.heap.memory-mb.ratio":"0.8","dfs.datanode.hdfs-blocks-metadata.enabled":"true","dfs.namenode.checkpoint.dir":"file://${hadoop.tmp.dir}/dfs/namesecondary","dfs.datanode.max.transfer.threads":"4096","dfs.namenode.edits.asynclogging":"true","nfs.allow.insecure.ports":"true","mapreduce.client.output.filter":"FAILED","hadoop.http.filter.initializers":"org.apache.hadoop.http.lib.StaticUserWebFilter","mapreduce.reduce.memory.mb":"0","s3native.client-write-packet-size":"65536","mapreduce.admin.user.env":"LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:$JAVA_LIBRARY_PATH","yarn.timeline-service.hostname":"0.0.0.0","file.replication":"1","yarn.nodemanager.container-metrics.unregister-delay-ms":"10000","hadoop.proxyuser.mapred.hosts":"*","hadoop.proxyuser.oozie.hosts":"*","yarn.nodemanager.log.retain-seconds":"10800","hadoop.proxyuser.mapred.groups":"*","yarn.resourcemanager.keytab":"/etc/krb5.keytab","mapreduce.reduce.merge.inmem.threshold":"1000","dfs.client.https.need-auth":"false","dfs.blockreport.split.threshold":"1000000","dfs.client.block.write.replace-datanode-on-failure.policy":"DEFAULT","mapreduce.shuffle.ssl.enabled":"false","dfs.namenode.write-lock-reporting-threshold-ms":"5000","dfs.block.access.token.enable":"*********(redacted)","yarn.resourcemanager.state-store.max-completed-applications":"${yarn.resourcemanager.max-completed-applications}","httpfs.buffer.size":"4096","dfs.client.file-block-storage-locations.timeout.millis":"1000","dfs.namenode.block-placement-policy.default.prefer-local-node":"true","mapreduce.job.speculative.minimum-allowed-tasks":"10","yarn.log-aggregation.retain-seconds":"-1","dfs.namenode.replication.considerLoad":"true","yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb":"0","mapreduce.jobhistory.max-age-ms":"604800000","hadoop.proxyuser.hdfs.groups":"*","dfs.namenode.retrycache.heap.percent":"0.03f","dfs.datanode.cache.revocation.polling.ms":"500","mapreduce.jobhistory.webapp.address":"test-1.vpc.company.com:19888","dfs.namenode.path.based.cache.block.map.allocation.percent":"0.25","mapreduce.jobtracker.system.dir":"${hadoop.tmp.dir}/mapred/system","mapreduce.tasktracker.taskmemorymanager.monitoringinterval":"5000","dfs.journalnode.rpc-address":"0.0.0.0:8485","yarn.client.nodemanager-connect.max-wait-ms":"180000","yarn.resourcemanager.webapp.address":"test-1.vpc.company.com:8088","mapreduce.jobhistory.recovery.enable":"false","dfs.client.short.circuit.replica.stale.threshold.ms":"1800000","mapreduce.reduce.shuffle.parallelcopies":"10","fs.trash.interval":"1","dfs.namenode.replication.interval":"3","yarn.app.mapreduce.client.max-retries":"3","hadoop.security.authentication":"simple","dfs.namenode.top.enabled":"true","mapreduce.task.profile.reduce.params":"${mapreduce.task.profile.params}","dfs.datanode.du.reserved":"0","yarn.app.mapreduce.am.resource.mb":"1024","mapreduce.input.fileinputformat.list-status.num-threads":"1","dfs.namenode.lazypersist.file.scrub.interval.sec":"300","yarn.nodemanager.container-executor.class":"org.apache.hadoop.yarn.server.nodema
nager.DefaultContainerExecutor","io.mapfile.bloom.size":"1048576","yarn.timeline-service.ttl-ms":"604800000","yarn.nodemanager.resource.cpu-vcores":"8","mapreduce.job.reduces":"6","fs.s3a.multipart.size":"64M","yarn.scheduler.minimum-allocation-vcores":"1","dfs.namenode.reencrypt.batch.size":"1000","mapreduce.job.speculative.speculative-cap-total-tasks":"0.01","dfs.datanode.http.address":"0.0.0.0:50075","hadoop.ssl.client.conf":"ssl-client.xml","mapreduce.job.queuename":"default","fs.s3a.metadatastore.authoritative":"false","ha.health-monitor.sleep-after-disconnect.ms":"1000","s3.bytes-per-checksum":"512","yarn.app.mapreduce.shuffle.log.limit.kb":"0","dfs.namenode.list.cache.pools.num.responses":"100","hadoop.security.group.mapping":"org.apache.hadoop.security.ShellBasedUnixGroupsMapping","mapreduce.jobhistory.jhist.format":"binary","yarn.resourcemanager.ha.enabled":"false","dfs.encrypt.data.transfer":"false","hadoop.http.staticuser.user":"dr.who","mapreduce.task.exit.timeout.check-interval-ms":"20000","mapreduce.task.exit.timeout":"60000","yarn.nodemanager.linux-container-executor.resources-handler.class":"org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler","mapreduce.reduce.shuffle.memory.limit.percent":"0.25","mapreduce.job.redacted-properties":"*********(redacted)","dfs.namenode.top.windows.minutes":"1,5,25","s3.client-write-packet-size":"65536","mapreduce.map.output.compress":"true","ha.zookeeper.acl":"world:anyone:rwcda","ipc.server.max.connections":"0","yarn.scheduler.maximum-allocation-mb":"12288","yarn.resourcemanager.scheduler.monitor.policies":"org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy","yarn.app.mapreduce.am.container.log.limit.kb":"0","s3native.blocksize":"67108864","ipc.client.connect.retry.interval":"1000","hadoop.proxyuser.httpfs.groups":"*","yarn.resourcemanager.zk-state-store.parent-path":"/rmstore","dfs.namenode.edit.log.autoroll.check.interval.ms":"300000","mapreduce.jobhistory.cleaner.enable":"true","hadoop.security.kms.client.encrypted.key.cache.expiry":"43200000","hadoop.proxyuser.httpfs.hosts":"*","dfs.client.use.datanode.hostname":"false","dfs.stream-buffer-size":"4096","yarn.client.nodemanager-client-async.thread-pool-max-size":"500","mapreduce.map.maxattempts":"4","dfs.datanode.drop.cache.behind.writes":"false","mapreduce.tasktracker.dns.nameserver":"default","yarn.nodemanager.sleep-delay-before-sigkill.ms":"250","mapreduce.job.end-notification.retry.attempts":"0","hadoop.proxyuser.yarn.hosts":"*","yarn.resourcemanager.zk-num-retries":"1000","dfs.client.failover.max.attempts":"15","mapreduce.tasktracker.indexcache.mb":"10","hadoop.registry.zk.root":"/registry","adl.feature.ownerandgroup.enableupn":"false","mapreduce.job.reduce.shuffle.consumer.plugin.class":"org.apache.hadoop.mapreduce.task.reduce.Shuffle","yarn.resourcemanager.delayed.delegation-token.removal-interval-ms":"*********(redacted)","dfs.namenode.snapshotdiff.allow.snap-root-descendant":"true","yarn.nodemanager.localizer.cache.target-size-mb":"10240","zlib.compress.level":"DEFAULT_COMPRESSION","ftp.client-write-packet-size":"65536","mapreduce.jobtracker.maxtasks.perjob":"-1","fs.AbstractFileSystem.adl.impl":"org.apache.hadoop.fs.adl.Adl","hadoop.proxyuser.hive.hosts":"*","dfs.block.access.token.lifetime":"*********(redacted)","dfs.namenode.max.extra.edits.segments.retained":"10000","yarn.client.failover-retries":"0","fs.s3a.multipart.purge.age":"86400","dfs.image.transfer.bandwidthPerSec":"0","io.native.lib.available":"tru
e","net.topology.node.switch.mapping.impl":"org.apache.hadoop.net.ScriptBasedMapping","ipc.server.listen.queue.size":"128","dfs.namenode.edekcacheloader.initial.delay.ms":"3000","map.sort.class":"org.apache.hadoop.util.QuickSort","dfs.namenode.acls.enabled":"false","hadoop.security.kms.client.authentication.retry-count":"1","fs.permissions.umask-mode":"022","dfs.datanode.ipc.address":"0.0.0.0:50020","yarn.nodemanager.vmem-check-enabled":"false","yarn.nodemanager.recovery.compaction-interval-secs":"3600","yarn.app.mapreduce.client-am.ipc.max-retries":"3","dfs.lock.suppress.warning.interval":"10s","dfs.client.block.write.retries":"3","mapreduce.job.ubertask.maxreduces":"1","hadoop.security.kms.client.encrypted.key.cache.size":"500","hadoop.security.java.secure.random.algorithm":"SHA1PRNG","ha.failover-controller.cli-check.rpc-timeout.ms":"20000","mapreduce.application.classpath":"$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$MR2_CLASSPATH","yarn.client.nodemanager-connect.retry-interval-ms":"10000","dfs.client-write-packet-size":"65536","yarn.nodemanager.env-whitelist":"JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME","dfs.datanode.dns.nameserver":"default","yarn.nodemanager.webapp.address":"${yarn.nodemanager.hostname}:8042","rpc.metrics.quantile.enable":"false","mapreduce.jobhistory.admin.acl":"*","yarn.resourcemanager.system-metrics-publisher.dispatcher.pool-size":"10","hadoop.http.authentication.kerberos.keytab":"${user.home}/hadoop.keytab","dfs.image.transfer.timeout":"60000","yarn.resourcemanager.recovery.enabled":"false","dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold":"10737418240","dfs.client.failover.connection.retries.on.timeouts":"0"},"System Properties":{"java.io.tmpdir":"/tmp","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle Corporation","java.security.egd":"file:///dev/urandom","java.vm.specification.version":"1.8","user.home":"/home/systest","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","sun.arch.data.model":"64","sun.boot.library.path":"/usr/java/jdk1.8.0_144/jre/lib/amd64","user.dir":"/tmp","java.library.path":":/opt/cloudera/parcels/CDH/lib/hadoop/lib/native:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native:/opt/cloudera/parcels/CDH/lib/hadoop/lib/native:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib","sun.cpu.isalist":"","os.arch":"amd64","java.vm.version":"25.144-b01","jetty.git.hash":"27208684755d94a92186989f695db2d7b21ebc51","java.endorsed.dirs":"/usr/java/jdk1.8.0_144/jre/lib/endorsed","java.runtime.version":"1.8.0_144-b01","java.vm.info":"mixed mode","java.ext.dirs":"/usr/java/jdk1.8.0_144/jre/lib/ext:/usr/java/packages/lib/ext","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","java.class.version":"52.0","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/usr/java/jdk1.8.0_144/jre/lib/resources.jar:/usr/java/jdk1.8.0_144/jre/lib/rt.jar:/usr/java/jdk1.8.0_144/jre/lib/sunrsasign.jar:/usr/java/jdk1.8.0_144/jre/lib/jsse.jar:/usr/java/jdk1.8.0_144/jre/lib/jce.jar:/usr/java/jdk1.8.0_144/jre/lib/charsets.jar:/usr/java/jdk1.8.0_144/jre/lib/jfr.jar:/usr/java/jdk1.8.0_144/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Los_Angeles","java.specification.vendor":"Oracle 
Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"3.10.0-514.26.2.el7.x86_64","sun.os.patch.level":"unknown","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","user.language":"en","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.print.PSPrinterJob","java.awt.graphicsenv":"sun.awt.X11GraphicsEnvironment","awt.toolkit":"sun.awt.X11.XToolkit","os.name":"Linux","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"systest","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --master yarn --deploy-mode client --conf spark.driver.memory=2g --conf spark.executor.heartbeatInterval=1000 --conf spark.executor.metrics.pollingInterval=100 --conf spark.eventLog.logStageExecutorProcessTreeMetrics.enabled=true --conf spark.yarn.maxAppAttempts=1 --conf spark.locality.wait.process=0 --conf spark.executor.memoryOverhead=1024 --conf spark.executor.extraJavaOptions=-Djava.security.egd=file:///dev/urandom --conf spark.eventLog.logStageExecutorMetrics=true --conf spark.driver.extraJavaOptions=-Djava.security.egd=file:///dev/urandom --class com.company.spark.LargeBlocks --num-executors 3 --executor-memory 7g /tmp/__spark_test__/spark3-tests-0.1.0-cdh5.9.0-SNAPSHOT-jar-with-dependencies.jar --targetBlockSizeGb 2.5 --taskSleepMillis 200 --doCache true --cacheOnDisk true --replicas 1 --concurrentReadJobs 2","java.home":"/usr/java/jdk1.8.0_144/jre","java.version":"1.8.0_144","sun.io.unicode.encoding":"UnicodeLittle"},"Classpath Entries":{"/opt/cloudera/parcels/CDH/jars/jackson-mapper-asl-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/joni-2.1.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-core-2.2.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-external-blockcache-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-json-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/xz-1.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-annotations-2.2.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/cglib-2.2.1-v20090111.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-procedure-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-jackson-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/asm-3.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xml-apis-1.3.04.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-jvm-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-dbcp-1.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-streaming_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-xml-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/minlog-1.3.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hsqldb-1.8.0.10.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/datanucleus-rdbms-3.2.9.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/pmml-model-1.4.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spire-macros_2.12-0.13.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/zstd-jni-1.3.2-2.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/httpcore-4.2.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jta-1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-yarn_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-logging-1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/activation-1.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xbean-asm7-shaded-4.12.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/paranamer-2.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/lib/hadoop/NOTICE.txt":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arrow-format-0.12.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/httpclient-4.2.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/netty-3.9.9.Final.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/htrace-core-3.1.0-incubating.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.ws.rs-api-2.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-container-servlet-core-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-nativetask-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-client-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-xc-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-configuration-1.6.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-math3-3.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jsp-api-2.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-auth-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-registry-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/JavaEWAH-0.3.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-graphite-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/joda-time-2.9.9.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/aopalliance-1.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-gridmix-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-compress-1.4.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javolution-5.5.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-beanutils-1.7.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/flatbuffers-java-1.9.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/core-1.1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/antlr-runtime-3.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-hdfs-nfs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-hdfs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-net-3.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/curator-framework-2.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/snappy-java-1.1.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-assembly_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-examples-1.2.0-cdh5.15.2.jar":"System 
Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/shapeless_2.12-2.3.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-hs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xercesImpl-2.9.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-server-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-lang-2.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jtransforms-2.4.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/snappy-0.2.jar":"System Classpath","/etc/spark2/conf/yarn-conf/":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-cli-1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-core-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.annotation-api-1.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-util-6.1.26.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stax-api-1.0-2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javassist-3.18.1-GA.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-applicationhistoryservice-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/kafka-clients-0.9.0-kafka-2.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/guice-3.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/antlr4-runtime-4.7.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/core-3.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/opencsv-2.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-datajoin-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-common-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jdo-api-3.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-webapp-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-compiler-3.0.11.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-annotations-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/libthrift-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/kafka_2.11-0.9.0-kafka-2.0.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-azure-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-proxy-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/antlr-2.7.7.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jettison-1.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-core-asl-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/libfb303-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/zookeeper-3.4.5-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-resource-bundle-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jaxb-api-2.2.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-streaming-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-hadoop2-compat-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/pyrolite-4.13.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/activation-1.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/aircompressor-0.10.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-ant-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jline-2.11.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/netty-all-4.0.23.Final.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-media-jaxb-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-servlets-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-xml_2.12-1.0.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-graphx_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-hs-plugins-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-web-proxy-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-math-2.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/avro-1.8.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-kvstore_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/objenesis-2.5.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/apacheds-i18n-2.0.0-M15.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stream-2.7.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-aws-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-module-scala_2.12-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/spymemcached-2.11.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/api-util-1.0.0-M20.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-hadoop-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/orc-core-1.5.5-nohive.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/mockito-all-1.8.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jets3t-0.9.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-xc-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-rsgroup-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hive-metastore-1.2.1.spark2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/py4j-0.10.8.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-applications-distributedshell-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/osgi-resource-locator-1.0.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-rumen-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsch-0.1.42.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/zkclient-0.7.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xmlenc-0.52.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-io-2.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/metrics-core-3.1.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jamon-runtime-2.4.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.inject-2.4.0-b34.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/hbase-common-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-common-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/netty-3.10.5.Final.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-app-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/machinist_2.12-0.6.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-util-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-network-shuffle_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/chill_2.12-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/findbugs-annotations-1.3.9-1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-rest-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-hadoop-compat-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/ST4-4.0.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jruby-cloudera-1.0.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-hive_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/leveldbjni-all-1.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-core_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-distcp-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-server-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/metrics-core-2.2.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/htrace-core-3.2.0-incubating.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/orc-mapreduce-1.5.5-nohive.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jcl-over-slf4j-1.7.16.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-digester-1.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-http-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-jackson_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/metrics-core-2.2.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-math3-3.4.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/slf4j-log4j12-1.7.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-jobclient-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-shell-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-server-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/chill-java-0.9.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hk2-locator-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stax-api-1.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-parser-combinators_2.12-1.1.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/RoaringBitmap-0.5.11.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsp-api-2.1-6.1.14.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/log4j-1.2.16.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.inject-1.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/jasper-compiler-5.5.23.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-security-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-shuffle-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-nfs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/netty-all-4.1.30.Final.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jaxb-api-2.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/breeze-macros_2.12-0.13.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jodd-core-3.5.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/azure-data-lake-store-sdk-2.2.9.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/guava-12.0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-codec-1.10.jar":"System Classpath","/usr/java/jdk1.8.0_144/lib/tools.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-nodemanager-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-sql_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hive-exec-1.2.1.spark2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jcodings-1.0.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-azure-datalake-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/apacheds-kerberos-codec-2.0.0-M15.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/guava-11.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/avro-mapred-1.8.2-hadoop2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-catalyst_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-io-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-applications-unmanaged-am-launcher-2.6.0-cdh5.15.2.jar":"System Classpath","/etc/spark2/conf/":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-sslengine-6.1.26.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/datanucleus-api-jdo-3.2.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/httpclient-4.5.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-continuation-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-ast_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-jndi-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsr305-3.0.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/avro-1.7.6-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/univocity-parsers-2.7.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-annotations-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/api-asn1-api-1.0.0-M20.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jettison-1.3.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hk2-utils-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hppc-0.7.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/logredactor-1.0.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arrow-vector-0.12.0.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/jaxb-impl-2.2.3-1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-archives-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-plus-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-client-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hue-plugins-3.9.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/high-scale-lib-1.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-jaxrs-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-collections-3.2.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-library-2.12.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/java-xmlbuilder-0.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/apache-log4j-extras-1.2.17.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-format-2.4.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/paranamer-2.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spire_2.12-0.13.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/disruptor-3.3.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-mllib-local_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-pool-1.5.4.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-servlet-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arpack_combined_all-0.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-annotations-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jsp-2.1-6.1.14.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/log4j-1.2.17.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-repl_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/derby-10.12.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/orc-shims-1.5.5.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-client-core-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/aopalliance-repackaged-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/hk2-api-2.4.0-b34.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-daemon-1.0.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/ivy-2.4.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hamcrest-core-1.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-lang3-3.8.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/spark-streaming-kafka-0-8_2.11-2.4.0.cloudera1-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-databind-2.2.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/kryo-shaded-4.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-reflect-2.12.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-mapreduce-examples-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/httpcore-4.4.10.jar":"System 
Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/arrow-memory-0.12.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/lib/hadoop/LICENSE.txt":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-compress-1.8.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/curator-client-2.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/oro-2.0.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-thrift-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/avro-ipc-1.8.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-httpclient-3.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-encoding-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/okhttp-2.4.0.jar":"System Classpath","spark://test-1.vpc.company.com:34194/jars/spark3-tests-0.1.0-cdh5.9.0-SNAPSHOT-jar-with-dependencies.jar":"Added By User","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/scala-compiler-2.12.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-prefix-tree-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-6.1.26.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-api-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-codec-1.9.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-beanutils-core-1.8.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/kafka-0.9/lz4-1.3.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/janino-3.0.11.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/libthrift-0.12.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-launcher_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-core_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/lz4-java-1.5.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jackson-jaxrs-1.8.8.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-network-common_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/macro-compat_2.12-1.1.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/breeze_2.12-0.13.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-unsafe_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-sls-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/snappy-java-1.0.4.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-guava-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/validation-api-1.1.0.Final.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/microsoft-windowsazure-storage-sdk-0.6.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/compress-lzf-1.0.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-openstack-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-mllib_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jetty-client-9.4.12.v20180830.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-it-1.2.0-cdh5.15.2.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-extras-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jersey-container-servlet-2.22.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/curator-recipes-2.7.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-column-1.10.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/unused-1.0.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/commons-crypto-1.0.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-databind-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-codec-1.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/htrace-core4-4.0.1-incubating.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/guava-14.0.1.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/aws-java-sdk-bundle-1.11.134.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/zookeeper-3.4.6.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jackson-module-paranamer-2.9.8.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hbase-protocol-1.2.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/jasper-runtime-5.5.23.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/json4s-scalap_2.12-3.5.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-logging-1.1.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-resourcemanager-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-archive-logs-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/paranamer-2.3.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/metrics-core-3.0.2.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-beanutils-1.9.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jul-to-slf4j-1.7.16.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/stringtemplate-3.2.1.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/gson-2.2.4.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/okio-1.4.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/slf4j-api-1.7.5.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/jsr305-3.0.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/bonecp-0.8.0.RELEASE.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-sketch_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/commons-el-1.0.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/spark-tags_2.12-3.0.0-SNAPSHOT.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/datanucleus-core-3.2.10.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/protobuf-java-2.5.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-server-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/parquet-hadoop-bundle-1.6.0.jar":"System Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-common-2.6.0-cdh5.15.2.jar":"System Classpath","/opt/cloudera/parcels/SPARK2/lib/spark2/jars/xz-1.5.jar":"System 
Classpath","/opt/cloudera/parcels/CDH/jars/hadoop-yarn-client-2.6.0-cdh5.15.2.jar":"System Classpath"}} {"Event":"SparkListenerApplicationStart","App Name":"LargeBlocks","App ID":"application_1553914137147_0018","Timestamp":1554755984286,"User":"systest"} {"Event":"SparkListenerExecutorAdded","Timestamp":1554755994596,"Executor ID":"1","Executor Info":{"Host":"test-2.vpc.company.com","Total Cores":1,"Log Urls":{"stdout":"http://test-2.vpc.company.com:8042/node/containerlogs/container_1553914137147_0018_01_000002/systest/stdout?start=-4096","stderr":"http://test-2.vpc.company.com:8042/node/containerlogs/container_1553914137147_0018_01_000002/systest/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"test-2.vpc.company.com:8042","USER":"systest","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"test-2.vpc.company.com","CONTAINER_ID":"container_1553914137147_0018_01_000002"}}} {"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"1","Host":"test-2.vpc.company.com","Port":43764},"Maximum Memory":3820172083,"Timestamp":1554755994649,"Maximum Onheap Memory":3820172083,"Maximum Offheap Memory":0} diff --git a/core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala b/core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala index 435b927068e60..7052d1a2028bb 100644 --- a/core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala +++ b/core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala @@ -19,9 +19,12 @@ package org.apache.spark import scala.concurrent.duration._ +import org.apache.spark.TestUtils.createTempScriptWithExpectedOutput import org.apache.spark.internal.config._ import org.apache.spark.rdd.{PartitionPruningRDD, RDD} +import org.apache.spark.resource.TestResourceIDs.{EXECUTOR_GPU_ID, TASK_GPU_ID, WORKER_GPU_ID} import org.apache.spark.scheduler.BarrierJobAllocationFailed._ +import org.apache.spark.scheduler.BarrierJobSlotsNumberCheckFailed import org.apache.spark.util.ThreadUtils /** @@ -259,4 +262,37 @@ class BarrierStageOnSubmittedSuite extends SparkFunSuite with LocalSparkContext testSubmitJob(sc, rdd, message = ERROR_MESSAGE_BARRIER_REQUIRE_MORE_SLOTS_THAN_CURRENT_TOTAL_NUMBER) } + + test("SPARK-32518: CoarseGrainedSchedulerBackend.maxNumConcurrentTasks should " + + "consider all kinds of resources for the barrier stage") { + withTempDir { dir => + val discoveryScript = createTempScriptWithExpectedOutput( + dir, "gpuDiscoveryScript", """{"name": "gpu","addresses":["0"]}""") + + val conf = new SparkConf() + .setMaster("local-cluster[1, 2, 1024]") + .setAppName("test-cluster") + .set(WORKER_GPU_ID.amountConf, "1") + .set(WORKER_GPU_ID.discoveryScriptConf, discoveryScript) + .set(EXECUTOR_GPU_ID.amountConf, "1") + .set(TASK_GPU_ID.amountConf, "1") + // disable barrier stage retry to fail the application as soon as possible + .set(BARRIER_MAX_CONCURRENT_TASKS_CHECK_MAX_FAILURES, 1) + // disable the check to simulate the behavior of Standalone in order to + // reproduce the issue. 
+ .set(Tests.SKIP_VALIDATE_CORES_TESTING, true) + sc = new SparkContext(conf) + // setup an executor which will have 2 CPUs and 1 GPU + TestUtils.waitUntilExecutorsUp(sc, 1, 60000) + + val exception = intercept[BarrierJobSlotsNumberCheckFailed] { + sc.parallelize(Range(1, 10), 2) + .barrier() + .mapPartitions { iter => iter } + .collect() + } + assert(exception.getMessage.contains("[SPARK-24819]: Barrier execution " + + "mode does not allow run a barrier stage that requires more slots")) + } + } } diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 3f309819065be..4d157b9607000 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -174,7 +174,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex private def testCaching(conf: SparkConf, storageLevel: StorageLevel): Unit = { sc = new SparkContext(conf.setMaster(clusterUrl).setAppName("test")) - TestUtils.waitUntilExecutorsUp(sc, 2, 30000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) val data = sc.parallelize(1 to 1000, 10) val cachedData = data.persist(storageLevel) assert(cachedData.count === 1000) diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 8d958494d52be..0b19146713966 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -1142,7 +1142,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { .set(config.DYN_ALLOCATION_TESTING, true) // SPARK-22864: effectively disable the allocation schedule by setting the period to a // really long value. - .set(TEST_SCHEDULE_INTERVAL, 10000L) + .set(TEST_SCHEDULE_INTERVAL, 30000L) } private def createManager( diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 3bc2061c4f2ad..72e7ee0214187 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -221,7 +221,7 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst conf.registerKryoClasses(Array(classOf[Class1])) assert(conf.get(KRYO_CLASSES_TO_REGISTER).toSet === Seq(classOf[Class1].getName).toSet) - conf.set(KRYO_USER_REGISTRATORS, classOf[CustomRegistrator].getName) + conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[CustomRegistrator].getName)) // Kryo doesn't expose a way to discover registered classes, but at least make sure this doesn't // blow up. 
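The SparkConfSuite change just above follows KRYO_USER_REGISTRATORS becoming a sequence-valued config entry, so registrators are now passed as a Seq of class names instead of a single string. As a rough sketch of the user-facing side (not part of this patch; MyRegistrator and KryoConfSketch are illustrative names, and the assumption is that the underlying property is the comma-separated spark.kryo.registrator), a custom registrator could be wired up like this:

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoRegistrator

// Hypothetical registrator, shown only to illustrate the Seq-based config above.
class MyRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit = {
    // Register whichever application classes will be serialized with Kryo.
    kryo.register(classOf[Array[Byte]])
  }
}

object KryoConfSketch {
  def buildConf(): SparkConf = new SparkConf()
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Comma-separated list of registrator class names, mirroring the Seq used in the test.
    .set("spark.kryo.registrator", Seq(classOf[MyRegistrator].getName).mkString(","))
}

Inside Spark's own test code the typed entry (KRYO_USER_REGISTRATORS) is used directly, which is exactly what the updated assertion exercises.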
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index df9c7c5eaa368..dc1c0451c628d 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -36,6 +36,7 @@ import org.scalatest.concurrent.Eventually import org.apache.spark.TestUtils._ import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests.RESOURCES_WARNING_TESTING import org.apache.spark.internal.config.UI._ import org.apache.spark.resource.ResourceAllocation import org.apache.spark.resource.ResourceUtils._ @@ -841,7 +842,6 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu .setAppName("test-cluster") .set(DRIVER_GPU_ID.amountConf, "3") .set(DRIVER_GPU_ID.discoveryScriptConf, scriptPath) - .set(SPARK_RESOURCES_DIR, dir.getName()) sc = new SparkContext(conf) @@ -891,6 +891,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu .setAppName("test-cluster") conf.set(TASK_GPU_ID.amountConf, "2") conf.set(EXECUTOR_GPU_ID.amountConf, "4") + conf.set(RESOURCES_WARNING_TESTING, true) var error = intercept[SparkException] { sc = new SparkContext(conf) @@ -924,7 +925,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu assume(!(Utils.isWindows)) withTempDir { dir => val discoveryScript = createTempScriptWithExpectedOutput(dir, "resourceDiscoveryScript", - """{"name": "gpu","addresses":["0", "1", "2", "3", "4", "5", "6", "7", "8"]}""") + """{"name": "gpu","addresses":["0", "1", "2"]}""") val conf = new SparkConf() .setMaster("local-cluster[3, 1, 1024]") @@ -933,7 +934,6 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu .set(WORKER_GPU_ID.discoveryScriptConf, discoveryScript) .set(TASK_GPU_ID.amountConf, "3") .set(EXECUTOR_GPU_ID.amountConf, "3") - .set(SPARK_RESOURCES_DIR, dir.getName()) sc = new SparkContext(conf) @@ -945,13 +945,34 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu context.resources().get(GPU).get.addresses.iterator } val gpus = rdd.collect() - assert(gpus.sorted === Seq("0", "1", "2", "3", "4", "5", "6", "7", "8")) + assert(gpus.sorted === Seq("0", "0", "0", "1", "1", "1", "2", "2", "2")) eventually(timeout(10.seconds)) { assert(sc.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0) } } } + + test("SPARK-32160: Disallow to create SparkContext in executors if the config is set") { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) + + val error = intercept[SparkException] { + sc.range(0, 1).foreach { _ => + new SparkContext(new SparkConf().setAppName("test").setMaster("local") + .set(EXECUTOR_ALLOW_SPARK_CONTEXT, false)) + } + }.getMessage() + + assert(error.contains("SparkContext should only be created and accessed on the driver.")) + } + + test("SPARK-32160: Allow to create SparkContext in executors") { + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local-cluster[3, 1, 1024]")) + + sc.range(0, 1).foreach { _ => + new SparkContext(new SparkConf().setAppName("test").setMaster("local")).stop() + } + } } object SparkContextSuite { diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala index cf4400e080e37..d40207423b7ae 100644 --- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark // scalastyle:off import java.io.File +import java.util.{Locale, TimeZone} import org.apache.log4j.spi.LoggingEvent @@ -63,6 +64,17 @@ abstract class SparkFunSuite with Logging { // scalastyle:on + // Initialize the logger forcibly to let the logger log timestamp + // based on the local time zone depending on environments. + // The default time zone will be set to America/Los_Angeles later + // so this initialization is necessary here. + log + + // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) + TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) + // Add Locale setting + Locale.setDefault(Locale.US) + protected val enableAutoThreadAudit = true protected override def beforeAll(): Unit = { diff --git a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala index 9629f5ab1a3dd..022fcbb25b0af 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala @@ -26,7 +26,6 @@ import scala.util.Try import org.apache.commons.io.output.TeeOutputStream import org.apache.commons.lang3.SystemUtils -import org.scalatest.Assertions._ import org.apache.spark.util.Utils diff --git a/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala b/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala index 31f065ec55749..b182b11a0e85e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala +++ b/core/src/test/scala/org/apache/spark/deploy/DeployTestUtils.scala @@ -102,6 +102,7 @@ private[deploy] object DeployTestUtils { createDriverDesc(), null, "spark://worker", + "http://publicAddress:80", new SecurityManager(conf)) } diff --git a/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceMetricsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceMetricsSuite.scala new file mode 100644 index 0000000000000..d681c13337e0d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceMetricsSuite.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.deploy + +import scala.collection.JavaConverters._ + +import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} +import org.apache.spark.internal.config.{SHUFFLE_SERVICE_DB_ENABLED, SHUFFLE_SERVICE_ENABLED} +import org.apache.spark.util.Utils + +class ExternalShuffleServiceMetricsSuite extends SparkFunSuite { + + var sparkConf: SparkConf = _ + var externalShuffleService: ExternalShuffleService = _ + + override def beforeAll(): Unit = { + super.beforeAll() + sparkConf = new SparkConf() + sparkConf.set(SHUFFLE_SERVICE_ENABLED, true) + sparkConf.set(SHUFFLE_SERVICE_DB_ENABLED, false) + sparkConf.set("spark.local.dir", System.getProperty("java.io.tmpdir")) + Utils.loadDefaultSparkProperties(sparkConf, null) + val securityManager = new SecurityManager(sparkConf) + externalShuffleService = new ExternalShuffleService(sparkConf, securityManager) + externalShuffleService.start() + } + + override def afterAll(): Unit = { + if (externalShuffleService != null) { + externalShuffleService.stop() + } + super.afterAll() + } + + test("SPARK-31646: metrics should be registered") { + val sourceRef = classOf[ExternalShuffleService].getDeclaredField("shuffleServiceSource") + sourceRef.setAccessible(true) + val source = sourceRef.get(externalShuffleService).asInstanceOf[ExternalShuffleServiceSource] + assert(source.metricRegistry.getMetrics.keySet().asScala == + Set( + "blockTransferRateBytes", + "numActiveConnections", + "numCaughtExceptions", + "numRegisteredConnections", + "openBlockRequestLatencyMillis", + "registeredExecutorsSize", + "registerExecutorRequestLatencyMillis", + "shuffle-server.usedDirectMemory", + "shuffle-server.usedHeapMemory") + ) + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 9d4736825618e..fb2a65e8c07cf 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -31,6 +31,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FSDataInputStream, Path} import org.scalatest.{BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} +import org.scalatest.time.Span import org.scalatest.time.SpanSugar._ import org.apache.spark._ @@ -565,7 +566,8 @@ class SparkSubmitSuite } } - val clArgs2 = Seq("--class", "org.SomeClass", "thejar.jar") + val dummyJarFile = TestUtils.createJarWithClasses(Seq.empty) + val clArgs2 = Seq("--class", "org.SomeClass", dummyJarFile.toString) val appArgs2 = new SparkSubmitArguments(clArgs2) val (_, _, conf2, _) = submit.prepareSubmitEnvironment(appArgs2) assert(!conf2.contains(UI_SHOW_CONSOLE_PROGRESS)) @@ -1215,6 +1217,86 @@ class SparkSubmitSuite testRemoteResources(enableHttpFs = true, blacklistSchemes = Seq("*")) } + test("SPARK-32119: Jars and files should be loaded when Executors launch for plugins") { + val tempDir = Utils.createTempDir() + val tempFileName = "test.txt" + val tempFile = new File(tempDir, tempFileName) + + // scalastyle:off println + Utils.tryWithResource { + new PrintWriter(tempFile) + } { writer => + writer.println("SparkPluginTest") + } + // scalastyle:on println + + val sparkPluginCodeBody = + """ + |@Override + |public org.apache.spark.api.plugin.ExecutorPlugin executorPlugin() { + | return new TestExecutorPlugin(); + |} + | + |@Override + |public org.apache.spark.api.plugin.DriverPlugin driverPlugin() { return 
null; } + """.stripMargin + val executorPluginCodeBody = + s""" + |@Override + |public void init( + | org.apache.spark.api.plugin.PluginContext ctx, + | java.util.Map extraConf) { + | String str = null; + | try (java.io.BufferedReader reader = + | new java.io.BufferedReader(new java.io.InputStreamReader( + | new java.io.FileInputStream("$tempFileName")))) { + | str = reader.readLine(); + | } catch (java.io.IOException e) { + | throw new RuntimeException(e); + | } finally { + | assert str == "SparkPluginTest"; + | } + |} + """.stripMargin + + val compiledExecutorPlugin = TestUtils.createCompiledClass( + "TestExecutorPlugin", + tempDir, + "", + null, + Seq.empty, + Seq("org.apache.spark.api.plugin.ExecutorPlugin"), + executorPluginCodeBody) + + val thisClassPath = + sys.props("java.class.path").split(File.pathSeparator).map(p => new File(p).toURI.toURL) + val compiledSparkPlugin = TestUtils.createCompiledClass( + "TestSparkPlugin", + tempDir, + "", + null, + Seq(tempDir.toURI.toURL) ++ thisClassPath, + Seq("org.apache.spark.api.plugin.SparkPlugin"), + sparkPluginCodeBody) + + val jarUrl = TestUtils.createJar( + Seq(compiledSparkPlugin, compiledExecutorPlugin), + new File(tempDir, "testplugin.jar")) + + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + val unusedFile = Files.createTempFile(tempDir.toPath, "unused", null) + val args = Seq( + "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), + "--name", "testApp", + "--master", "local-cluster[1,1,1024]", + "--conf", "spark.plugins=TestSparkPlugin", + "--conf", "spark.ui.enabled=false", + "--jars", jarUrl.toString + "," + unusedJar.toString, + "--files", tempFile.toString + "," + unusedFile.toString, + unusedJar.toString) + runSparkSubmit(args) + } + private def testRemoteResources( enableHttpFs: Boolean, blacklistSchemes: Seq[String] = Nil): Unit = { @@ -1417,7 +1499,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { implicit val defaultSignaler: Signaler = ThreadSignaler // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. - def runSparkSubmit(args: Seq[String], root: String = ".."): Unit = { + def runSparkSubmit(args: Seq[String], root: String = "..", timeout: Span = 1.minute): Unit = { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val sparkSubmitFile = if (Utils.isWindows) { new File(s"$root\\bin\\spark-submit.cmd") @@ -1430,7 +1512,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) try { - val exitCode = failAfter(1.minute) { process.waitFor() } + val exitCode = failAfter(timeout) { process.waitFor() } if (exitCode != 0) { fail(s"Process returned with exit code $exitCode. 
See the log4j logs for more detail.") } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala index 060b878fb8ef2..e9b739ce7a4c6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala @@ -213,7 +213,7 @@ class SingleEventLogFileWriterSuite extends EventLogFileWritersSuite { compressionCodecShortName) val finalLogPath = new Path(logPath) - assert(fileSystem.exists(finalLogPath) && fileSystem.isFile(finalLogPath)) + assert(fileSystem.exists(finalLogPath) && fileSystem.getFileStatus(finalLogPath).isFile) assert(expectedLines === readLinesFromEventLogFile(finalLogPath, fileSystem)) } } @@ -357,10 +357,10 @@ class RollingEventLogFilesWriterSuite extends EventLogFileWritersSuite { expectedLines: Seq[String]): Unit = { val logDirPath = getAppEventLogDirPath(logBaseDir, appId, appAttemptId) - assert(fileSystem.exists(logDirPath) && fileSystem.isDirectory(logDirPath)) + assert(fileSystem.exists(logDirPath) && fileSystem.getFileStatus(logDirPath).isDirectory) val appStatusFile = getAppStatusFilePath(logDirPath, appId, appAttemptId, inProgress = false) - assert(fileSystem.exists(appStatusFile) && fileSystem.isFile(appStatusFile)) + assert(fileSystem.exists(appStatusFile) && fileSystem.getFileStatus(appStatusFile).isFile) val eventLogFiles = listEventLogFiles(logDirPath) val allLines = mutable.ArrayBuffer[String]() diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index c2f34fc3a95ed..f3beb35f1f011 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -1470,6 +1470,55 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { } } + test("SPARK-33146: don't let one bad rolling log folder prevent loading other applications") { + withTempDir { dir => + val conf = createTestConf(true) + conf.set(HISTORY_LOG_DIR, dir.getAbsolutePath) + val hadoopConf = SparkHadoopUtil.newConfiguration(conf) + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val provider = new FsHistoryProvider(conf) + + val writer = new RollingEventLogFilesWriter("app", None, dir.toURI, conf, hadoopConf) + writer.start() + + writeEventsToRollingWriter(writer, Seq( + SparkListenerApplicationStart("app", Some("app"), 0, "user", None), + SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) + provider.checkForLogs() + provider.cleanLogs() + assert(dir.listFiles().size === 1) + assert(provider.getListing.length === 1) + + // Manually delete the appstatus file to make an invalid rolling event log + val appStatusPath = RollingEventLogFilesWriter.getAppStatusFilePath(new Path(writer.logPath), + "app", None, true) + fs.delete(appStatusPath, false) + provider.checkForLogs() + provider.cleanLogs() + assert(provider.getListing.length === 0) + + // Create a new application + val writer2 = new RollingEventLogFilesWriter("app2", None, dir.toURI, conf, hadoopConf) + writer2.start() + writeEventsToRollingWriter(writer2, Seq( + SparkListenerApplicationStart("app2", Some("app2"), 0, "user", None), + SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) + + // Both folders exist but only one 
application found + provider.checkForLogs() + provider.cleanLogs() + assert(provider.getListing.length === 1) + assert(dir.listFiles().size === 2) + + // Make sure a new provider sees the valid application + provider.stop() + val newProvider = new FsHistoryProvider(conf) + newProvider.checkForLogs() + assert(newProvider.getListing.length === 1) + } + } + /** * Asks the provider to check for logs and calls a function to perform checks on the updated * app list. Example: diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerDiskManagerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerDiskManagerSuite.scala index f78469e132490..9004e86323691 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerDiskManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerDiskManagerSuite.scala @@ -158,4 +158,56 @@ class HistoryServerDiskManagerSuite extends SparkFunSuite with BeforeAndAfter { assert(manager.approximateSize(50L, true) > 50L) } + test("SPARK-32024: update ApplicationStoreInfo.size during initializing") { + val manager = mockManager() + val leaseA = manager.lease(2) + doReturn(3L).when(manager).sizeOf(meq(leaseA.tmpPath)) + val dstPathA = manager.appStorePath("app1", None) + doReturn(3L).when(manager).sizeOf(meq(dstPathA)) + val dstA = leaseA.commit("app1", None) + assert(manager.free() === 0) + assert(manager.committed() === 3) + // Listing store tracks dstA now. + assert(store.read(classOf[ApplicationStoreInfo], dstA.getAbsolutePath).size === 3) + + // Simulate: service restarts, new disk manager (manager1) is initialized. + val manager1 = mockManager() + // Simulate: event KVstore compaction before restart, directory size reduces. + doReturn(2L).when(manager1).sizeOf(meq(dstA)) + doReturn(2L).when(manager1).sizeOf(meq(new File(testDir, "apps"))) + manager1.initialize() + // "ApplicationStoreInfo.size" is updated for dstA. + assert(store.read(classOf[ApplicationStoreInfo], dstA.getAbsolutePath).size === 2) + assert(manager1.free() === 1) + // If "ApplicationStoreInfo.size" is not correctly updated, "IllegalStateException" + // would be thrown. + val leaseB = manager1.lease(2) + assert(manager1.free() === 1) + doReturn(2L).when(manager1).sizeOf(meq(leaseB.tmpPath)) + val dstPathB = manager.appStorePath("app2", None) + doReturn(2L).when(manager1).sizeOf(meq(dstPathB)) + val dstB = leaseB.commit("app2", None) + assert(manager1.committed() === 2) + // Listing store tracks dstB only, dstA is evicted by "makeRoom()". + assert(store.read(classOf[ApplicationStoreInfo], dstB.getAbsolutePath).size === 2) + + val manager2 = mockManager() + // Simulate: cache entities are written after replaying, directory size increases. + doReturn(3L).when(manager2).sizeOf(meq(dstB)) + doReturn(3L).when(manager2).sizeOf(meq(new File(testDir, "apps"))) + manager2.initialize() + // "ApplicationStoreInfo.size" is updated for dstB. + assert(store.read(classOf[ApplicationStoreInfo], dstB.getAbsolutePath).size === 3) + assert(manager2.free() === 0) + val leaseC = manager2.lease(2) + doReturn(2L).when(manager2).sizeOf(meq(leaseC.tmpPath)) + val dstPathC = manager.appStorePath("app3", None) + doReturn(2L).when(manager2).sizeOf(meq(dstPathC)) + val dstC = leaseC.commit("app3", None) + assert(manager2.free() === 1) + assert(manager2.committed() === 2) + // Listing store tracks dstC only, dstB is evicted by "makeRoom()". 
+ assert(store.read(classOf[ApplicationStoreInfo], dstC.getAbsolutePath).size === 2) + } + } diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 206db0feb5716..c55b29b15051d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -314,7 +314,8 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers all (directSiteRelativeLinks) should not startWith (knoxBaseUrl) } - test("static relative links are prefixed with uiRoot (spark.ui.proxyBase)") { + // TODO (SPARK-31723): re-enable it + ignore("static relative links are prefixed with uiRoot (spark.ui.proxyBase)") { val uiRoot = Option(System.getenv("APPLICATION_WEB_PROXY_BASE")).getOrElse("/testwebproxybase") val page = new HistoryPage(server) val request = mock[HttpServletRequest] @@ -693,6 +694,17 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers out.close() } + test("SPARK-31697: HistoryServer should set Content-Type") { + val port = server.boundPort + val nonExistenceAppId = "local-non-existence" + val url = new URL(s"http://localhost:$port/history/$nonExistenceAppId") + val conn = url.openConnection().asInstanceOf[HttpURLConnection] + conn.setRequestMethod("GET") + conn.connect() + val expectedContentType = "text/html;charset=utf-8" + val actualContentType = conn.getContentType + assert(actualContentType === expectedContentType) + } } object HistoryServerSuite { diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index 0cf573c2490b3..994eb6f14e411 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -689,7 +689,16 @@ class MasterSuite extends SparkFunSuite val master = makeAliveMaster() var worker: MockExecutorLaunchFailWorker = null try { - worker = new MockExecutorLaunchFailWorker(master) + val conf = new SparkConf() + // SPARK-32250: When running test on Github Action machine, the available processors in JVM + // is only 2, while on Jenkins it's 32. For this specific test, 2 available processors, which + // also decides number of threads in Dispatcher, is not enough to consume the messages. In + // the worst situation, MockExecutorLaunchFailWorker would occupy these 2 threads for + // handling messages LaunchDriver, LaunchExecutor at the same time but leave no thread for + // the driver to handle the message RegisteredApplication. At the end, it results in the dead + // lock situation. Therefore, we need to set more threads to avoid the dead lock. 
+ conf.set(Network.RPC_NETTY_DISPATCHER_NUM_THREADS, 6) + worker = new MockExecutorLaunchFailWorker(master, conf) + worker.rpcEnv.setupEndpoint("worker", worker) + val workerRegMsg = RegisterWorker( + worker.id, diff --git a/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala index 275bca3459855..d9d559509f4fb 100644 --- a/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala @@ -19,10 +19,14 @@ package org.apache.spark.deploy.security import java.security.PrivilegedExceptionAction +import scala.util.control.NonFatal + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION import org.apache.hadoop.minikdc.MiniKdc import org.apache.hadoop.security.{Credentials, UserGroupInformation} +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil @@ -88,8 +92,30 @@ class HadoopDelegationTokenManagerSuite extends SparkFunSuite { // krb5.conf. MiniKdc sets "java.security.krb5.conf" in start and removes it when stop called. val kdcDir = Utils.createTempDir() val kdcConf = MiniKdc.createConf() - kdc = new MiniKdc(kdcConf, kdcDir) - kdc.start() + // The port for the MiniKdc service is selected in the constructor, but is only bound + // later, in MiniKdc.start() -> MiniKdc.initKDCServer() -> KdcServer.start(). In the + // meantime another service may grab the port, causing a BindException; this makes our + // tests, which run in dedicated JVMs and rely on MiniKdc, flaky. + // + // https://issues.apache.org/jira/browse/HADOOP-12656 is fixed in Hadoop 2.8.0. + // + // The workaround here is to periodically retry this process with a timeout, since we are + // using Hadoop 2.7.4 as the default. 
+ // https://issues.apache.org/jira/browse/SPARK-31631 + eventually(timeout(60.seconds), interval(1.second)) { + try { + kdc = new MiniKdc(kdcConf, kdcDir) + kdc.start() + } catch { + case NonFatal(e) => + if (kdc != null) { + kdc.stop() + kdc = null + } + throw e + } + } val krbConf = new Configuration() krbConf.set(HADOOP_SECURITY_AUTHENTICATION, "kerberos") diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala index c3b580e7ccac4..e429ddfd570de 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/DriverRunnerTest.scala @@ -40,7 +40,8 @@ class DriverRunnerTest extends SparkFunSuite { val worker = mock(classOf[RpcEndpointRef]) doNothing().when(worker).send(any()) spy(new DriverRunner(conf, "driverId", new File("workDir"), new File("sparkHome"), - driverDescription, worker, "spark://1.2.3.4/worker/", new SecurityManager(conf))) + driverDescription, worker, "spark://1.2.3.4/worker/", "http://publicAddress:80", + new SecurityManager(conf))) } private def createProcessBuilderAndProcess(): (ProcessBuilderLike, Process) = { diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala index bb541b4cad8bd..2d3d0afe3f80c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala @@ -36,7 +36,6 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.TestUtils.{createTempJsonFile, createTempScriptWithExpectedOutput} import org.apache.spark.deploy.{Command, ExecutorState, ExternalShuffleService} import org.apache.spark.deploy.DeployMessages.{DriverStateChanged, ExecutorStateChanged, WorkDirCleanup} -import org.apache.spark.deploy.StandaloneResourceUtils.{ALLOCATED_RESOURCES_FILE, SPARK_RESOURCES_COORDINATE_DIR} import org.apache.spark.deploy.master.DriverState import org.apache.spark.internal.config import org.apache.spark.internal.config.Worker._ @@ -64,7 +63,6 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { private def makeWorker( conf: SparkConf = new SparkConf(), shuffleServiceSupplier: Supplier[ExternalShuffleService] = null, - pid: Int = Utils.getProcessId, local: Boolean = false): Worker = { assert(_worker === null, "Some Worker's RpcEnv is leaked in tests") val securityMgr = new SecurityManager(conf) @@ -72,7 +70,7 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { val resourcesFile = conf.get(SPARK_WORKER_RESOURCE_FILE) val localWorker = new Worker(rpcEnv, 50000, 20, 1234 * 5, Array.fill(1)(RpcAddress("1.2.3.4", 1234)), "Worker", "/tmp", - conf, securityMgr, resourcesFile, shuffleServiceSupplier, pid) + conf, securityMgr, resourcesFile, shuffleServiceSupplier) if (local) { localWorker } else { @@ -81,14 +79,6 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { } } - private def assertResourcesFileDeleted(): Unit = { - assert(sys.props.contains("spark.test.home")) - val sparkHome = sys.props.get("spark.test.home") - val resourceFile = new File(sparkHome + "/" + SPARK_RESOURCES_COORDINATE_DIR, - ALLOCATED_RESOURCES_FILE) - assert(!resourceFile.exists()) - } - before { MockitoAnnotations.initMocks(this) } @@ -251,7 +241,6 @@ class WorkerSuite extends SparkFunSuite with Matchers with 
BeforeAndAfter { worker.rpcEnv.shutdown() worker.rpcEnv.awaitTermination() } - assertResourcesFileDeleted() } test("worker could load resources from resources file while launching") { @@ -273,7 +262,6 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { worker.rpcEnv.shutdown() worker.rpcEnv.awaitTermination() } - assertResourcesFileDeleted() } } @@ -292,7 +280,6 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { worker.rpcEnv.shutdown() worker.rpcEnv.awaitTermination() } - assertResourcesFileDeleted() } } @@ -316,65 +303,6 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { worker.rpcEnv.shutdown() worker.rpcEnv.awaitTermination() } - assertResourcesFileDeleted() - } - } - - test("Workers run on the same host should avoid resources conflict when coordinate is on") { - val conf = new SparkConf() - withTempDir { dir => - val scriptPath = createTempScriptWithExpectedOutput(dir, "fpgaDiscoverScript", - """{"name": "fpga","addresses":["f1", "f2", "f3", "f4", "f5"]}""") - conf.set(WORKER_FPGA_ID.discoveryScriptConf, scriptPath) - conf.set(WORKER_FPGA_ID.amountConf, "2") - val workers = (0 until 3).map(id => makeWorker(conf, pid = id, local = true)) - workers.zipWithIndex.foreach{case (w, i) => w.rpcEnv.setupEndpoint(s"worker$i", w)} - eventually(timeout(20.seconds)) { - val (empty, nonEmpty) = workers.partition(_.resources.isEmpty) - assert(empty.length === 1) - assert(nonEmpty.length === 2) - val totalResources = nonEmpty.flatMap(_.resources(FPGA).addresses).toSet.toSeq.sorted - assert(totalResources === Seq("f1", "f2", "f3", "f4")) - workers.foreach(_.rpcEnv.shutdown()) - workers.foreach(_.rpcEnv.awaitTermination()) - } - assertResourcesFileDeleted() - } - } - - test("Workers run on the same host should load resources naively when coordinate is off") { - val conf = new SparkConf() - // disable coordination - conf.set(config.SPARK_RESOURCES_COORDINATE, false) - withTempDir { dir => - val gpuArgs = ResourceAllocation(WORKER_GPU_ID, Seq("g0", "g1")) - val ja = Extraction.decompose(Seq(gpuArgs)) - val resourcesPath = createTempJsonFile(dir, "resources", ja) - val scriptPath = createTempScriptWithExpectedOutput(dir, "fpgaDiscoverScript", - """{"name": "fpga","addresses":["f1", "f2", "f3", "f4", "f5"]}""") - conf.set(SPARK_WORKER_RESOURCE_FILE.key, resourcesPath) - conf.set(WORKER_GPU_ID.amountConf, "2") - conf.set(WORKER_FPGA_ID.discoveryScriptConf, scriptPath) - conf.set(WORKER_FPGA_ID.amountConf, "2") - val workers = (0 until 3).map(id => makeWorker(conf, pid = id, local = true)) - workers.zipWithIndex.foreach{case (w, i) => w.rpcEnv.setupEndpoint(s"worker$i", w)} - eventually(timeout(20.seconds)) { - val (empty, nonEmpty) = workers.partition(_.resources.isEmpty) - assert(empty.length === 0) - assert(nonEmpty.length === 3) - // Each Worker should get the same resources from resources file and discovery script - // without coordination. Note that, normally, we must config different resources - // for workers run on the same host when coordinate config is off. Test here is used - // to validate the different behaviour comparing to the above test when coordinate config - // is on, so we admit the resources collision here. 
- nonEmpty.foreach { worker => - assert(worker.resources === Map(GPU -> gpuArgs.toResourceInformation, - FPGA -> new ResourceInformation(FPGA, Array("f1", "f2", "f3", "f4", "f5")))) - } - workers.foreach(_.rpcEnv.shutdown()) - workers.foreach(_.rpcEnv.awaitTermination()) - } - assertResourcesFileDeleted() } } diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 31049d104e63d..6b3df6d0c9970 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -17,8 +17,9 @@ package org.apache.spark.executor -import java.io.{Externalizable, ObjectInput, ObjectOutput} +import java.io.{Externalizable, File, ObjectInput, ObjectOutput} import java.lang.Thread.UncaughtExceptionHandler +import java.net.URL import java.nio.ByteBuffer import java.util.Properties import java.util.concurrent.{ConcurrentHashMap, CountDownLatch, TimeUnit} @@ -41,6 +42,7 @@ import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.broadcast.Broadcast +import org.apache.spark.deploy.{SimpleApplicationTest, SparkSubmitSuite} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.memory.TestMemoryManager @@ -52,7 +54,7 @@ import org.apache.spark.scheduler.{DirectTaskResult, FakeTask, ResultTask, Task, import org.apache.spark.serializer.{JavaSerializer, SerializerInstance, SerializerManager} import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockManager, BlockManagerId} -import org.apache.spark.util.{LongAccumulator, UninterruptibleThread} +import org.apache.spark.util.{LongAccumulator, SparkUncaughtExceptionHandler, UninterruptibleThread, Utils} class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar with Eventually with PrivateMethodTester { @@ -63,6 +65,33 @@ class ExecutorSuite extends SparkFunSuite super.afterEach() } + /** + * Creates an Executor with the provided arguments, is then passed to `f` + * and will be stopped after `f` returns. 
+ */ + def withExecutor( + executorId: String, + executorHostname: String, + env: SparkEnv, + userClassPath: Seq[URL] = Nil, + isLocal: Boolean = true, + uncaughtExceptionHandler: UncaughtExceptionHandler + = new SparkUncaughtExceptionHandler, + resources: immutable.Map[String, ResourceInformation] + = immutable.Map.empty[String, ResourceInformation])(f: Executor => Unit): Unit = { + var executor: Executor = null + try { + executor = new Executor(executorId, executorHostname, env, userClassPath, isLocal, + uncaughtExceptionHandler, resources) + + f(executor) + } finally { + if (executor != null) { + executor.stop() + } + } + } + test("SPARK-15963: Catch `TaskKilledException` correctly in Executor.TaskRunner") { // mock some objects to make Executor.launchTask() happy val conf = new SparkConf @@ -115,10 +144,8 @@ class ExecutorSuite extends SparkFunSuite } }) - var executor: Executor = null - try { - executor = new Executor("id", "localhost", env, userClassPath = Nil, isLocal = true, - resources = immutable.Map.empty[String, ResourceInformation]) + withExecutor("id", "localhost", env) { executor => + // the task will be launched in a dedicated worker thread executor.launchTask(mockExecutorBackend, taskDescription) @@ -138,11 +165,6 @@ class ExecutorSuite extends SparkFunSuite assert(executorSuiteHelper.testFailedReason.toErrorString === "TaskKilled (test)") assert(executorSuiteHelper.taskState === TaskState.KILLED) } - finally { - if (executor != null) { - executor.stop() - } - } } test("SPARK-19276: Handle FetchFailedExceptions that are hidden by user exceptions") { @@ -254,25 +276,24 @@ class ExecutorSuite extends SparkFunSuite confs.foreach { case (k, v) => conf.set(k, v) } val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) - val executor = - new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, - resources = immutable.Map.empty[String, ResourceInformation]) - val executorClass = classOf[Executor] - - // Save all heartbeats sent into an ArrayBuffer for verification - val heartbeats = ArrayBuffer[Heartbeat]() - val mockReceiver = mock[RpcEndpointRef] - when(mockReceiver.askSync(any[Heartbeat], any[RpcTimeout])(any)) - .thenAnswer((invocation: InvocationOnMock) => { - val args = invocation.getArguments() - heartbeats += args(0).asInstanceOf[Heartbeat] - HeartbeatResponse(false) - }) - val receiverRef = executorClass.getDeclaredField("heartbeatReceiverRef") - receiverRef.setAccessible(true) - receiverRef.set(executor, mockReceiver) + withExecutor("id", "localhost", SparkEnv.get) { executor => + val executorClass = classOf[Executor] - f(executor, heartbeats) + // Save all heartbeats sent into an ArrayBuffer for verification + val heartbeats = ArrayBuffer[Heartbeat]() + val mockReceiver = mock[RpcEndpointRef] + when(mockReceiver.askSync(any[Heartbeat], any[RpcTimeout])(any)) + .thenAnswer((invocation: InvocationOnMock) => { + val args = invocation.getArguments() + heartbeats += args(0).asInstanceOf[Heartbeat] + HeartbeatResponse(false) + }) + val receiverRef = executorClass.getDeclaredField("heartbeatReceiverRef") + receiverRef.setAccessible(true) + receiverRef.set(executor, mockReceiver) + + f(executor, heartbeats) + } } private def heartbeatZeroAccumulatorUpdateTest(dropZeroMetrics: Boolean): Unit = { @@ -353,10 +374,7 @@ class ExecutorSuite extends SparkFunSuite val taskDescription = createResultTaskDescription(serializer, taskBinary, rdd, 0) val mockBackend = mock[ExecutorBackend] - var executor: Executor = null - try { - 
executor = new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, - resources = immutable.Map.empty[String, ResourceInformation]) + withExecutor("id", "localhost", SparkEnv.get) { executor => executor.launchTask(mockBackend, taskDescription) // Ensure that the executor's metricsPoller is polled so that values are recorded for @@ -367,10 +385,6 @@ class ExecutorSuite extends SparkFunSuite eventually(timeout(5.seconds), interval(10.milliseconds)) { assert(executor.numRunningTasks === 0) } - } finally { - if (executor != null) { - executor.stop() - } } // Verify that peak values for task metrics get sent in the TaskResult @@ -465,12 +479,11 @@ class ExecutorSuite extends SparkFunSuite poll: Boolean = false): (TaskFailedReason, UncaughtExceptionHandler) = { val mockBackend = mock[ExecutorBackend] val mockUncaughtExceptionHandler = mock[UncaughtExceptionHandler] - var executor: Executor = null val timedOut = new AtomicBoolean(false) - try { - executor = new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, - uncaughtExceptionHandler = mockUncaughtExceptionHandler, - resources = immutable.Map.empty[String, ResourceInformation]) + + withExecutor("id", "localhost", SparkEnv.get, + uncaughtExceptionHandler = mockUncaughtExceptionHandler) { executor => + // the task will be launched in a dedicated worker thread executor.launchTask(mockBackend, taskDescription) if (killTask) { @@ -503,11 +516,8 @@ class ExecutorSuite extends SparkFunSuite assert(executor.numRunningTasks === 0) } assert(!timedOut.get(), "timed out waiting to be ready to kill tasks") - } finally { - if (executor != null) { - executor.stop() - } } + val orderedMock = inOrder(mockBackend) val statusCaptor = ArgumentCaptor.forClass(classOf[ByteBuffer]) orderedMock.verify(mockBackend) diff --git a/core/src/test/scala/org/apache/spark/internal/io/SparkHadoopWriterUtilsSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/SparkHadoopWriterUtilsSuite.scala new file mode 100644 index 0000000000000..33b58ec9e6665 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/internal/io/SparkHadoopWriterUtilsSuite.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.io + +import java.util.Date + +import org.apache.hadoop.mapreduce.JobID + +import org.apache.spark.SparkFunSuite +import org.apache.spark.internal.io.SparkHadoopWriterUtils.createJobID + +/** + * Unit tests for functions in SparkHadoopWriterUtils. + */ +class SparkHadoopWriterUtilsSuite extends SparkFunSuite { + + /** + * Core test of JobID generation: + * They are created. + * The job number is converted to the job ID. 
+ * They round trip to string and back + * (which implies that the full string matches the regexp + * in the JobID class). + */ + test("JobID Generation") { + val jobNumber = 1010 + val j1 = createJobID(new Date(), jobNumber) + assert(jobNumber == j1.getId, + s"Job number mismatch in $j1") + + val jobStr = j1.toString + // the string value begins with job_ + assert(jobStr.startsWith("job_"), + s"wrong prefix of $jobStr") + // and the hadoop code can parse it + val j2 = roundTrip(j1) + assert(j1.getId == j2.getId, "Job ID mismatch") + assert(j1.getJtIdentifier == j2.getJtIdentifier, "Job identifier mismatch") + } + + /** + * This is the problem surfacing in situations where committers expect + * Job IDs to be unique: if the timestamp is (exclusively) used + * then there will be conflicts in the directories created. + */ + test("JobIDs generated at same time are different") { + val now = new Date() + val j1 = createJobID(now, 1) + val j2 = createJobID(now, 1) + assert(j1.toString != j2.toString) + } + + /** + * There's nothing explicitly in the Hadoop classes to stop + * job numbers from being negative. + * There are some big assumptions in the FileOutputCommitter about attempt IDs + * being positive during any recovery operations; for safety the + * job number is validated. + */ + test("JobIDs with negative job number") { + intercept[IllegalArgumentException] { + createJobID(new Date(), -1) + } + } + + /** + * If someone ever does reinstate use of timestamps, + * make sure that the case of timestamp == 0 is handled. + */ + test("JobIDs on Epoch are different") { + val j1 = createJobID(new Date(0), 0) + val j2 = createJobID(new Date(0), 0) + assert (j1.toString != j2.toString) + } + + /** + * Do a round trip as a string and back again. + * This uses the JobID parser. 
+ * @param jobID job ID + * @return the returned jobID + */ + private def roundTrip(jobID: JobID): JobID = { + val parsedJobId = JobID.forName(jobID.toString) + assert(jobID == parsedJobId, "Round trip was inconsistent") + parsedJobId + } +} diff --git a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala index cf2d9293ef822..7888796dd55e6 100644 --- a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala @@ -139,7 +139,7 @@ class PluginContainerSuite extends SparkFunSuite with BeforeAndAfterEach with Lo .set(NonLocalModeSparkPlugin.TEST_PATH_CONF, path.getAbsolutePath()) sc = new SparkContext(conf) - TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) eventually(timeout(10.seconds), interval(100.millis)) { val children = path.listFiles() @@ -169,7 +169,7 @@ class PluginContainerSuite extends SparkFunSuite with BeforeAndAfterEach with Lo sc = new SparkContext(conf) // Ensure all executors has started - TestUtils.waitUntilExecutorsUp(sc, 1, 10000) + TestUtils.waitUntilExecutorsUp(sc, 1, 60000) var children = Array.empty[File] eventually(timeout(10.seconds), interval(100.millis)) { diff --git a/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala b/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala index 60f67699f81be..987f383c9c4fa 100644 --- a/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala +++ b/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala @@ -119,6 +119,14 @@ class TestMemoryManager(conf: SparkConf) consequentOOM += n } + /** + * Undoes the effects of [[markExecutionAsOutOfMemoryOnce]] and [[markconsequentOOM]] and lets + * calls to [[acquireExecutionMemory()]] succeed (if there is enough memory available). + */ + def resetConsequentOOM(): Unit = synchronized { + consequentOOM = 0 + } + def limit(avail: Long): Unit = synchronized { require(avail >= 0) available = avail diff --git a/core/src/test/scala/org/apache/spark/metrics/sink/StatsdSinkSuite.scala b/core/src/test/scala/org/apache/spark/metrics/sink/StatsdSinkSuite.scala index 0e21a36071c42..3d4b8c868d6fc 100644 --- a/core/src/test/scala/org/apache/spark/metrics/sink/StatsdSinkSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/sink/StatsdSinkSuite.scala @@ -35,12 +35,27 @@ class StatsdSinkSuite extends SparkFunSuite { STATSD_KEY_UNIT -> "seconds", STATSD_KEY_HOST -> "127.0.0.1" ) - private val socketTimeout = 30000 // milliseconds - private val socketBufferSize = 8192 + // The maximum size of a single datagram packet payload. Payloads + // larger than this will be truncated. + private val maxPayloadSize = 256 // bytes + + // The receive buffer must be large enough to hold all inflight + // packets. This includes any kernel and protocol overhead. + // This value was determined experimentally and should be + // increased if timeouts are seen. + private val socketMinRecvBufferSize = 16384 // bytes + private val socketTimeout = 30000 // milliseconds private def withSocketAndSink(testCode: (DatagramSocket, StatsdSink) => Any): Unit = { val socket = new DatagramSocket - socket.setReceiveBufferSize(socketBufferSize) + + // Leave the receive buffer size untouched unless it is too + // small. If the receive buffer is too small packets will be + // silently dropped and receive operations will time out. 
+ if (socket.getReceiveBufferSize() < socketMinRecvBufferSize) { + socket.setReceiveBufferSize(socketMinRecvBufferSize) + } + socket.setSoTimeout(socketTimeout) val props = new Properties defaultProps.foreach(e => props.put(e._1, e._2)) @@ -61,7 +76,7 @@ class StatsdSinkSuite extends SparkFunSuite { sink.registry.register("counter", counter) sink.report() - val p = new DatagramPacket(new Array[Byte](socketBufferSize), socketBufferSize) + val p = new DatagramPacket(new Array[Byte](maxPayloadSize), maxPayloadSize) socket.receive(p) val result = new String(p.getData, 0, p.getLength, UTF_8) @@ -77,7 +92,7 @@ class StatsdSinkSuite extends SparkFunSuite { sink.registry.register("gauge", gauge) sink.report() - val p = new DatagramPacket(new Array[Byte](socketBufferSize), socketBufferSize) + val p = new DatagramPacket(new Array[Byte](maxPayloadSize), maxPayloadSize) socket.receive(p) val result = new String(p.getData, 0, p.getLength, UTF_8) @@ -87,7 +102,7 @@ class StatsdSinkSuite extends SparkFunSuite { test("metrics StatsD sink with Histogram") { withSocketAndSink { (socket, sink) => - val p = new DatagramPacket(new Array[Byte](socketBufferSize), socketBufferSize) + val p = new DatagramPacket(new Array[Byte](maxPayloadSize), maxPayloadSize) val histogram = new Histogram(new UniformReservoir) histogram.update(10) histogram.update(20) @@ -121,7 +136,7 @@ class StatsdSinkSuite extends SparkFunSuite { test("metrics StatsD sink with Timer") { withSocketAndSink { (socket, sink) => - val p = new DatagramPacket(new Array[Byte](socketBufferSize), socketBufferSize) + val p = new DatagramPacket(new Array[Byte](maxPayloadSize), maxPayloadSize) val timer = new Timer() timer.update(1, SECONDS) timer.update(2, SECONDS) diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala index 7a05daa2ad715..ff7d680352177 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala @@ -50,13 +50,12 @@ class ResourceDiscoveryPluginSuite extends SparkFunSuite with LocalSparkContext .set(WORKER_GPU_ID.amountConf, "2") .set(TASK_GPU_ID.amountConf, "1") .set(EXECUTOR_GPU_ID.amountConf, "1") - .set(SPARK_RESOURCES_DIR, dir.getName()) .set(WORKER_FPGA_ID.amountConf, "2") .set(TASK_FPGA_ID.amountConf, "1") .set(EXECUTOR_FPGA_ID.amountConf, "1") sc = new SparkContext(conf) - TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) eventually(timeout(10.seconds), interval(100.millis)) { val children = dir.listFiles() @@ -81,10 +80,9 @@ class ResourceDiscoveryPluginSuite extends SparkFunSuite with LocalSparkContext .set(WORKER_GPU_ID.amountConf, "2") .set(TASK_GPU_ID.amountConf, "1") .set(EXECUTOR_GPU_ID.amountConf, "1") - .set(SPARK_RESOURCES_DIR, dir.getName()) sc = new SparkContext(conf) - TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) eventually(timeout(10.seconds), interval(100.millis)) { val children = dir.listFiles() @@ -108,10 +106,9 @@ class ResourceDiscoveryPluginSuite extends SparkFunSuite with LocalSparkContext .set(WORKER_GPU_ID.amountConf, "2") .set(TASK_GPU_ID.amountConf, "1") .set(EXECUTOR_GPU_ID.amountConf, "1") - .set(SPARK_RESOURCES_DIR, dir.getName()) sc = new SparkContext(conf) - TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) 
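+ // 60 seconds is a generous bound so that slow executor start-up (for example on loaded CI machines) does not fail the test before the executors register.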
eventually(timeout(10.seconds), interval(100.millis)) { val children = dir.listFiles() @@ -134,10 +131,9 @@ class ResourceDiscoveryPluginSuite extends SparkFunSuite with LocalSparkContext .set(RESOURCES_DISCOVERY_PLUGIN, Seq(classOf[TestResourceDiscoveryPluginEmpty].getName())) .set(DRIVER_GPU_ID.discoveryScriptConf, scriptPath) .set(DRIVER_GPU_ID.amountConf, "2") - .set(SPARK_RESOURCES_DIR, dir.getName()) sc = new SparkContext(conf) - TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) assert(sc.resources.size === 1) assert(sc.resources.get(GPU).get.addresses === Array("5", "6")) diff --git a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala index c10f2c244e133..01c67b3856df4 100644 --- a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala @@ -209,7 +209,7 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { // Use anotherEnv to find out the RpcEndpointRef val rpcEndpointRef = anotherEnv.setupEndpointRef(env.address, "ask-abort") try { - val e = intercept[RpcAbortException] { + val e = intercept[SparkException] { val timeout = new RpcTimeout(10.seconds, shortProp) val abortableRpcFuture = rpcEndpointRef.askAbortable[String]( "hello", timeout) @@ -217,15 +217,15 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { new Thread { override def run: Unit = { Thread.sleep(100) - abortableRpcFuture.abort("TestAbort") + abortableRpcFuture.abort(new RuntimeException("TestAbort")) } }.start() - timeout.awaitResult(abortableRpcFuture.toFuture) + timeout.awaitResult(abortableRpcFuture.future) } - // The SparkException cause should be a RpcAbortException with "TestAbort" message - assert(e.isInstanceOf[RpcAbortException]) - assert(e.getMessage.contains("TestAbort")) + // The SparkException cause should be a RuntimeException with "TestAbort" message + assert(e.getCause.isInstanceOf[RuntimeException]) + assert(e.getCause.getMessage.contains("TestAbort")) } finally { anotherEnv.shutdown() anotherEnv.awaitTermination() diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala index c74c728b3e3f3..8b1c602cd8e58 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala @@ -136,4 +136,17 @@ class InboxSuite extends SparkFunSuite { endpoint.verifySingleOnNetworkErrorMessage(cause, remoteAddress) } + + test("SPARK-32738: should reduce the number of active threads when fatal error happens") { + val endpoint = mock(classOf[TestRpcEndpoint]) + when(endpoint.receive).thenThrow(new OutOfMemoryError()) + + val dispatcher = mock(classOf[Dispatcher]) + val inbox = new Inbox("name", endpoint) + inbox.post(OneWayMessage(null, "hi")) + intercept[OutOfMemoryError] { + inbox.process(dispatcher) + } + assert(inbox.getNumActiveThreads == 0) + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala index fc8ac38479932..9b214afabdd37 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala @@ -21,19 +21,23 @@ import java.io.File import scala.util.Random +import org.scalatest.concurrent.Eventually 
+import org.scalatest.time.SpanSugar._ + import org.apache.spark._ import org.apache.spark.internal.config.Tests.TEST_NO_STAGE_RETRY -class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { +class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext with Eventually { - def initLocalClusterSparkContext(): Unit = { + def initLocalClusterSparkContext(numWorker: Int = 4): Unit = { val conf = new SparkConf() // Init local cluster here so each barrier task runs in a separated process, thus `barrier()` // call is actually useful. - .setMaster("local-cluster[4, 1, 1024]") + .setMaster(s"local-cluster[$numWorker, 1, 1024]") .setAppName("test-cluster") .set(TEST_NO_STAGE_RETRY, true) sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, numWorker, 60000) } test("global sync by barrier() call") { @@ -52,6 +56,70 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { assert(times.max - times.min <= 1000) } + test("share messages with allGather() call") { + initLocalClusterSparkContext() + val rdd = sc.makeRDD(1 to 10, 4) + val rdd2 = rdd.barrier().mapPartitions { it => + val context = BarrierTaskContext.get() + // Sleep for a random time before global sync. + Thread.sleep(Random.nextInt(1000)) + // Pass partitionId message in + val message: String = context.partitionId().toString + val messages: Array[String] = context.allGather(message) + Iterator.single(messages.toList) + } + val messages = rdd2.collect() + // All the task partitionIds are shared across all tasks + assert(messages.length === 4) + assert(messages.forall(_ == List("0", "1", "2", "3"))) + } + + test("throw exception if we attempt to synchronize with different blocking calls") { + initLocalClusterSparkContext() + val rdd = sc.makeRDD(1 to 10, 4) + val rdd2 = rdd.barrier().mapPartitions { it => + val context = BarrierTaskContext.get() + val partitionId = context.partitionId + if (partitionId == 0) { + context.barrier() + } else { + context.allGather(partitionId.toString) + } + Seq(null).iterator + } + val error = intercept[SparkException] { + rdd2.collect() + }.getMessage + assert(error.contains("Different barrier sync types found")) + } + + test("successively sync with allGather and barrier") { + initLocalClusterSparkContext() + val rdd = sc.makeRDD(1 to 10, 4) + val rdd2 = rdd.barrier().mapPartitions { it => + val context = BarrierTaskContext.get() + // Sleep for a random time before global sync. + Thread.sleep(Random.nextInt(1000)) + context.barrier() + val time1 = System.currentTimeMillis() + // Sleep for a random time before global sync. + Thread.sleep(Random.nextInt(1000)) + // Pass partitionId message in + val message = context.partitionId().toString + val messages = context.allGather(message) + val time2 = System.currentTimeMillis() + Seq((time1, time2)).iterator + } + val times = rdd2.collect() + // All the tasks shall finish the first round of global sync within a short time slot. + val times1 = times.map(_._1) + assert(times1.max - times1.min <= 1000) + + // All the tasks shall finish the second round of global sync within a short time slot. 
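+ // (within 1000 ms of each other, the same bound used for the first round above)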
+ val times2 = times.map(_._2) + assert(times2.max - times2.min <= 1000) + } + test("support multiple barrier() call within a single task") { initLocalClusterSparkContext() val rdd = sc.makeRDD(1 to 10, 4) @@ -148,12 +216,14 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { def testBarrierTaskKilled(interruptOnKill: Boolean): Unit = { withTempDir { dir => + val runningFlagFile = "barrier.task.running" val killedFlagFile = "barrier.task.killed" val rdd = sc.makeRDD(Seq(0, 1), 2) val rdd2 = rdd.barrier().mapPartitions { it => val context = BarrierTaskContext.get() if (context.partitionId() == 0) { try { + new File(dir, runningFlagFile).createNewFile() context.barrier() } catch { case _: TaskKilledException => @@ -172,8 +242,10 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { if (partitionId == 0) { new Thread { override def run: Unit = { - Thread.sleep(1000) - sc.killTaskAttempt(taskStart.taskInfo.taskId, interruptThread = interruptOnKill) + eventually(timeout(10.seconds)) { + assert(new File(dir, runningFlagFile).exists()) + sc.killTaskAttempt(taskStart.taskInfo.taskId, interruptThread = interruptOnKill) + } } }.start() } @@ -200,4 +272,21 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { initLocalClusterSparkContext() testBarrierTaskKilled(interruptOnKill = true) } + + test("SPARK-31485: barrier stage should fail if only partial tasks are launched") { + initLocalClusterSparkContext(2) + val id = sc.getExecutorIds().head + val rdd0 = sc.parallelize(Seq(0, 1, 2, 3), 2) + val dep = new OneToOneDependency[Int](rdd0) + // set up a barrier stage with 2 tasks where both tasks prefer the same executor (which has only 1 core) for + // scheduling. So, one of the tasks won't be scheduled in a single round of resource offers.
+ val rdd = new MyRDD(sc, 2, List(dep), Seq(Seq(s"executor_h_$id"), Seq(s"executor_h_$id"))) + val errorMsg = intercept[SparkException] { + rdd.barrier().mapPartitions { iter => + BarrierTaskContext.get().barrier() + iter + }.collect() + }.getMessage + assert(errorMsg.contains("Fail resource offers for barrier stage")) + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index c063301673598..fad971fd4ddec 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -251,6 +251,9 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo assert(execResources(GPU).assignedAddrs === Array("0")) } + // To avoid allocating any resources immediately after releasing the resource from the task to + // make sure that `availableAddrs` below won't change + when(ts.resourceOffers(any[IndexedSeq[WorkerOffer]])).thenReturn(Seq.empty) backend.driverEndpoint.send( StatusUpdate("1", 1, TaskState.FINISHED, buffer, taskResources)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 101e60c73e9f8..d92bd5bcba5ad 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -25,6 +25,9 @@ import scala.annotation.meta.param import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map} import scala.util.control.NonFatal +import org.mockito.Mockito.spy +import org.mockito.Mockito.times +import org.mockito.Mockito.verify import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} import org.scalatest.exceptions.TestFailedException import org.scalatest.time.SpanSugar._ @@ -232,6 +235,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi var sparkListener: EventInfoRecordingListener = null + var blockManagerMaster: BlockManagerMaster = null var mapOutputTracker: MapOutputTrackerMaster = null var broadcastManager: BroadcastManager = null var securityMgr: SecurityManager = null @@ -245,17 +249,18 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi */ val cacheLocations = new HashMap[(Int, Int), Seq[BlockManagerId]] // stub out BlockManagerMaster.getLocations to use our cacheLocations - val blockManagerMaster = new BlockManagerMaster(null, null, conf, true) { - override def getLocations(blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { - blockIds.map { - _.asRDDId.map(id => (id.rddId -> id.splitIndex)).flatMap(key => cacheLocations.get(key)). 
- getOrElse(Seq()) - }.toIndexedSeq - } - override def removeExecutor(execId: String): Unit = { - // don't need to propagate to the driver, which we don't have - } + class MyBlockManagerMaster(conf: SparkConf) extends BlockManagerMaster(null, null, conf, true) { + override def getLocations(blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { + blockIds.map { + _.asRDDId.map { id => (id.rddId -> id.splitIndex) + }.flatMap { key => cacheLocations.get(key) + }.getOrElse(Seq()) + }.toIndexedSeq } + override def removeExecutor(execId: String): Unit = { + // don't need to propagate to the driver, which we don't have + } + } /** The list of results that DAGScheduler has collected. */ val results = new HashMap[Int, Any]() @@ -273,6 +278,16 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi override def jobFailed(exception: Exception): Unit = { failure = exception } } + class MyMapOutputTrackerMaster( + conf: SparkConf, + broadcastManager: BroadcastManager) + extends MapOutputTrackerMaster(conf, broadcastManager, true) { + + override def sendTracker(message: Any): Unit = { + // no-op, just so we can stop this to avoid leaking threads + } + } + override def beforeEach(): Unit = { super.beforeEach() init(new SparkConf()) @@ -290,11 +305,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi results.clear() securityMgr = new SecurityManager(conf) broadcastManager = new BroadcastManager(true, conf, securityMgr) - mapOutputTracker = new MapOutputTrackerMaster(conf, broadcastManager, true) { - override def sendTracker(message: Any): Unit = { - // no-op, just so we can stop this to avoid leaking threads - } - } + mapOutputTracker = spy(new MyMapOutputTrackerMaster(conf, broadcastManager)) + blockManagerMaster = spy(new MyBlockManagerMaster(conf)) scheduler = new DAGScheduler( sc, taskScheduler, @@ -537,6 +549,59 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(mapStatus2(2).location.host === "hostB") } + test("SPARK-32003: All shuffle files for executor should be cleaned up on fetch failure") { + // reset the test context with the right shuffle service config + afterEach() + val conf = new SparkConf() + conf.set(config.SHUFFLE_SERVICE_ENABLED.key, "true") + init(conf) + + val shuffleMapRdd = new MyRDD(sc, 3, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(3)) + val shuffleId = shuffleDep.shuffleId + val reduceRdd = new MyRDD(sc, 3, List(shuffleDep), tracker = mapOutputTracker) + + submit(reduceRdd, Array(0, 1, 2)) + // Map stage completes successfully, + // two tasks are run on an executor on hostA and one on an executor on hostB + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostA", 3)), + (Success, makeMapStatus("hostA", 3)), + (Success, makeMapStatus("hostB", 3)))) + // Now the executor on hostA is lost + runEvent(ExecutorLost("exec-hostA", ExecutorExited(-100, false, "Container marked as failed"))) + // Executor is removed but shuffle files are not unregistered + verify(blockManagerMaster, times(1)).removeExecutor("exec-hostA") + verify(mapOutputTracker, times(0)).removeOutputsOnExecutor("exec-hostA") + + // The MapOutputTracker has all the shuffle files + val mapStatuses = mapOutputTracker.shuffleStatuses(shuffleId).mapStatuses + assert(mapStatuses.count(_ != null) === 3) + assert(mapStatuses.count(s => s != null && s.location.executorId == "exec-hostA") === 2) + assert(mapStatuses.count(s => s != null && s.location.executorId == "exec-hostB") 
=== 1) + + // Now a fetch failure from the lost executor occurs + complete(taskSets(1), Seq( + (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null) + )) + // blockManagerMaster.removeExecutor is not called again + // but shuffle files are unregistered + verify(blockManagerMaster, times(1)).removeExecutor("exec-hostA") + verify(mapOutputTracker, times(1)).removeOutputsOnExecutor("exec-hostA") + + // Shuffle files for exec-hostA should be lost + assert(mapStatuses.count(_ != null) === 1) + assert(mapStatuses.count(s => s != null && s.location.executorId == "exec-hostA") === 0) + assert(mapStatuses.count(s => s != null && s.location.executorId == "exec-hostB") === 1) + + // Additional fetch failure from the executor does not result in further call to + // mapOutputTracker.removeOutputsOnExecutor + complete(taskSets(1), Seq( + (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 1, 0, "ignored"), null) + )) + verify(mapOutputTracker, times(1)).removeOutputsOnExecutor("exec-hostA") + } + test("zero split job") { var numResults = 0 var failureReason: Option[Exception] = None @@ -762,8 +827,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi complete(taskSets(1), Seq( (Success, 42), (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null))) - // this will get called - // blockManagerMaster.removeExecutor("exec-hostA") + verify(blockManagerMaster, times(1)).removeExecutor("exec-hostA") // ask the scheduler to try it again scheduler.resubmitFailedStages() // have the 2nd attempt pass @@ -806,11 +870,14 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi (Success, makeMapStatus("hostA", 1)), (Success, makeMapStatus("hostB", 1)))) runEvent(ExecutorLost("exec-hostA", event)) + verify(blockManagerMaster, times(1)).removeExecutor("exec-hostA") if (expectFileLoss) { + verify(mapOutputTracker, times(1)).removeOutputsOnExecutor("exec-hostA") intercept[MetadataFetchFailedException] { mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0) } } else { + verify(mapOutputTracker, times(0)).removeOutputsOnExecutor("exec-hostA") assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostA"), makeBlockManagerId("hostB"))) } @@ -1931,6 +1998,53 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assertDataStructuresEmpty() } + test("SPARK-30388: shuffle fetch failed on speculative task, but original task succeed") { + var completedStage: List[Int] = Nil + val listener = new SparkListener() { + override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { + completedStage = completedStage :+ event.stageInfo.stageId + } + } + sc.addSparkListener(listener) + + val shuffleMapRdd = new MyRDD(sc, 2, Nil) + val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) + val reduceRdd = new MyRDD(sc, 2, List(shuffleDep)) + submit(reduceRdd, Array(0, 1)) + completeShuffleMapStageSuccessfully(0, 0, 2) + sc.listenerBus.waitUntilEmpty() + assert(completedStage === List(0)) + + // result task 0.0 succeed + runEvent(makeCompletionEvent(taskSets(1).tasks(0), Success, 42)) + // speculative result task 1.1 fetch failed + val info = new TaskInfo(4, index = 1, attemptNumber = 1, 0L, "", "", TaskLocality.ANY, true) + runEvent(makeCompletionEvent( + taskSets(1).tasks(1), + FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0L, 0, 1, "ignored"), + null, + Seq.empty, + Array.empty, + info + ) 
+ ) + sc.listenerBus.waitUntilEmpty() + assert(completedStage === List(0, 1)) + + Thread.sleep(DAGScheduler.RESUBMIT_TIMEOUT * 2) + // map stage resubmitted + assert(scheduler.runningStages.size === 1) + val mapStage = scheduler.runningStages.head + assert(mapStage.id === 0) + assert(mapStage.latestInfo.failureReason.isEmpty) + + // original result task 1.0 succeed + runEvent(makeCompletionEvent(taskSets(1).tasks(1), Success, 42)) + sc.listenerBus.waitUntilEmpty() + assert(completedStage === List(0, 1, 1, 0)) + assert(scheduler.activeJobs.isEmpty) + } + test("misbehaved accumulator should not crash DAGScheduler and SparkContext") { val acc = new LongAccumulator { override def add(v: java.lang.Long): Unit = throw new DAGSchedulerSuiteDummyException diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 286924001e920..046564d65b767 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -36,6 +36,7 @@ import org.apache.spark.deploy.history.{EventLogFileReader, SingleEventLogFileWr import org.apache.spark.deploy.history.EventLogTestHelper._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.{EVENT_LOG_DIR, EVENT_LOG_ENABLED} import org.apache.spark.io._ import org.apache.spark.metrics.{ExecutorMetricType, MetricsSystem} import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -99,6 +100,49 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit testStageExecutorMetricsEventLogging() } + test("SPARK-31764: isBarrier should be logged in event log") { + val conf = new SparkConf() + conf.set(EVENT_LOG_ENABLED, true) + conf.set(EVENT_LOG_DIR, testDirPath.toString) + val sc = new SparkContext("local", "test-SPARK-31764", conf) + val appId = sc.applicationId + + sc.parallelize(1 to 10) + .barrier() + .mapPartitions(_.map(elem => (elem, elem))) + .filter(elem => elem._1 % 2 == 0) + .reduceByKey(_ + _) + .collect + sc.stop() + + val eventLogStream = EventLogFileReader.openEventLog(new Path(testDirPath, appId), fileSystem) + val events = readLines(eventLogStream).map(line => JsonProtocol.sparkEventFromJson(parse(line))) + val jobStartEvents = events + .filter(event => event.isInstanceOf[SparkListenerJobStart]) + .map(_.asInstanceOf[SparkListenerJobStart]) + + assert(jobStartEvents.size === 1) + val stageInfos = jobStartEvents.head.stageInfos + assert(stageInfos.size === 2) + + val stage0 = stageInfos(0) + val rddInfosInStage0 = stage0.rddInfos + assert(rddInfosInStage0.size === 3) + val sortedRddInfosInStage0 = rddInfosInStage0.sortBy(_.scope.get.name) + assert(sortedRddInfosInStage0(0).scope.get.name === "filter") + assert(sortedRddInfosInStage0(0).isBarrier === true) + assert(sortedRddInfosInStage0(1).scope.get.name === "mapPartitions") + assert(sortedRddInfosInStage0(1).isBarrier === true) + assert(sortedRddInfosInStage0(2).scope.get.name === "parallelize") + assert(sortedRddInfosInStage0(2).isBarrier === false) + + val stage1 = stageInfos(1) + val rddInfosInStage1 = stage1.rddInfos + assert(rddInfosInStage1.size === 1) + assert(rddInfosInStage1(0).scope.get.name === "reduceByKey") + assert(rddInfosInStage1(0).isBarrier === false) // reduceByKey + } + /* ----------------- * * Actual test logic * * ----------------- */ diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index e7ecf847ff4f4..a083cdb95b20f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -758,7 +758,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B // that are explicitly blacklisted, plus those that have *any* executors blacklisted. val nodesForBlacklistedExecutors = offers.filter { offer => execBlacklist.contains(offer.executorId) - }.map(_.host).toSet.toSeq + }.map(_.host).distinct val nodesWithAnyBlacklisting = (nodeBlacklist ++ nodesForBlacklistedExecutors).toSet // Similarly, figure out which executors have any blacklisting. This means all executors // that are explicitly blacklisted, plus all executors on nodes that are blacklisted. diff --git a/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala index 615389ae5c2d4..c5f73150f5db5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala @@ -42,7 +42,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { private val conf = new SparkConf() .set(DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT.key, "60s") .set(DYN_ALLOCATION_CACHED_EXECUTOR_IDLE_TIMEOUT.key, "120s") - .set(DYN_ALLOCATION_SHUFFLE_TIMEOUT.key, "240s") + .set(DYN_ALLOCATION_SHUFFLE_TRACKING_TIMEOUT.key, "240s") .set(SHUFFLE_SERVICE_ENABLED, true) private var monitor: ExecutorMonitor = _ @@ -287,7 +287,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("shuffle block tracking") { val bus = mockListenerBus() - conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, false) + conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED, true).set(SHUFFLE_SERVICE_ENABLED, false) monitor = new ExecutorMonitor(conf, client, bus, clock) // 3 jobs: 2 and 3 share a shuffle, 1 has a separate shuffle. 
@@ -355,7 +355,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("SPARK-28839: Avoids NPE in context cleaner when shuffle service is on") { val bus = mockListenerBus() - conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, true) + conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED, true).set(SHUFFLE_SERVICE_ENABLED, true) monitor = new ExecutorMonitor(conf, client, bus, clock) { override def onOtherEvent(event: SparkListenerEvent): Unit = { throw new IllegalStateException("No event should be sent.") @@ -367,7 +367,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("shuffle tracking with multiple executors and concurrent jobs") { val bus = mockListenerBus() - conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, false) + conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED, true).set(SHUFFLE_SERVICE_ENABLED, false) monitor = new ExecutorMonitor(conf, client, bus, clock) monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) @@ -410,8 +410,8 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("SPARK-28455: avoid overflow in timeout calculation") { conf - .set(DYN_ALLOCATION_SHUFFLE_TIMEOUT, Long.MaxValue) - .set(DYN_ALLOCATION_SHUFFLE_TRACKING, true) + .set(DYN_ALLOCATION_SHUFFLE_TRACKING_TIMEOUT, Long.MaxValue) + .set(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED, true) .set(SHUFFLE_SERVICE_ENABLED, false) monitor = new ExecutorMonitor(conf, client, null, clock) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala b/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala index fd228cded783a..525e682dd5d42 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoBenchmark.scala @@ -125,7 +125,7 @@ object KryoBenchmark extends BenchmarkBase { def createSerializer(useUnsafe: Boolean): SerializerInstance = { val conf = new SparkConf() conf.set(SERIALIZER, "org.apache.spark.serializer.KryoSerializer") - conf.set(KRYO_USER_REGISTRATORS, classOf[MyRegistrator].getName) + conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[MyRegistrator].getName)) conf.set(KRYO_USE_UNSAFE, useUnsafe) new KryoSerializer(conf).newInstance() diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala index 953b651c72a83..dde0c98704d00 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala @@ -76,7 +76,7 @@ object KryoSerializerBenchmark extends BenchmarkBase { conf.set(EXECUTOR_EXTRA_JAVA_OPTIONS, "-XX:+UseParallelGC -XX:-UseDynamicNumberOfGCThreads") conf.set(SERIALIZER, "org.apache.spark.serializer.KryoSerializer") - conf.set(KRYO_USER_REGISTRATORS, classOf[MyRegistrator].getName) + conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[MyRegistrator].getName)) conf.set(KRYO_USE_POOL, usePool) if (sc != null) { diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala index d4fafab4a5d64..397fdce8ae6e3 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala @@ -29,7 +29,7 @@ class KryoSerializerDistributedSuite extends SparkFunSuite 
with LocalSparkContex test("kryo objects are serialised consistently in different processes") { val conf = new SparkConf(false) .set(config.SERIALIZER, "org.apache.spark.serializer.KryoSerializer") - .set(config.Kryo.KRYO_USER_REGISTRATORS, classOf[AppJarRegistrator].getName) + .set(config.Kryo.KRYO_USER_REGISTRATORS, Seq(classOf[AppJarRegistrator].getName)) .set(config.TASK_MAX_FAILURES, 1) .set(config.BLACKLIST_ENABLED, false) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 4c47a67ee9ffc..229ef69973775 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -42,7 +42,7 @@ import org.apache.spark.util.ThreadUtils class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { conf.set(SERIALIZER, "org.apache.spark.serializer.KryoSerializer") - conf.set(KRYO_USER_REGISTRATORS, classOf[MyRegistrator].getName) + conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[MyRegistrator].getName)) conf.set(KRYO_USE_UNSAFE, false) test("SPARK-7392 configuration limits") { @@ -313,7 +313,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { import org.apache.spark.SparkException val conf = new SparkConf(false) - conf.set(KRYO_USER_REGISTRATORS, "this.class.does.not.exist") + conf.set(KRYO_USER_REGISTRATORS, Seq("this.class.does.not.exist")) val thrown = intercept[SparkException](new KryoSerializer(conf).newInstance().serialize(1)) assert(thrown.getMessage.contains("Failed to register classes with Kryo")) @@ -412,7 +412,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { val ser = new KryoSerializer(new SparkConf).newInstance().asInstanceOf[KryoSerializerInstance] assert(ser.getAutoReset) val conf = new SparkConf().set(KRYO_USER_REGISTRATORS, - classOf[RegistratorWithoutAutoReset].getName) + Seq(classOf[RegistratorWithoutAutoReset].getName)) val ser2 = new KryoSerializer(conf).newInstance().asInstanceOf[KryoSerializerInstance] assert(!ser2.getAutoReset) } @@ -443,7 +443,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { .set(KRYO_REFERENCE_TRACKING, referenceTracking) .set(KRYO_USE_POOL, usePool) if (!autoReset) { - conf.set(KRYO_USER_REGISTRATORS, classOf[RegistratorWithoutAutoReset].getName) + conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[RegistratorWithoutAutoReset].getName)) } val ser = new KryoSerializer(conf) val serInstance = ser.newInstance().asInstanceOf[KryoSerializerInstance] @@ -530,7 +530,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { class KryoSerializerAutoResetDisabledSuite extends SparkFunSuite with SharedSparkContext { conf.set(SERIALIZER, classOf[KryoSerializer].getName) - conf.set(KRYO_USER_REGISTRATORS, classOf[RegistratorWithoutAutoReset].getName) + conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[RegistratorWithoutAutoReset].getName)) conf.set(KRYO_REFERENCE_TRACKING, true) conf.set(SHUFFLE_MANAGER, "sort") conf.set(SHUFFLE_SORT_BYPASS_MERGE_THRESHOLD, 200) diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala index dad080c5fc161..9747f5780dd1e 100644 --- a/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/SerializerPropertiesSuite.scala @@ 
-52,7 +52,7 @@ class SerializerPropertiesSuite extends SparkFunSuite { test("KryoSerializer does not support relocation when auto-reset is disabled") { val conf = new SparkConf().set(KRYO_USER_REGISTRATORS, - classOf[RegistratorWithoutAutoReset].getName) + Seq(classOf[RegistratorWithoutAutoReset].getName)) val ser = new KryoSerializer(conf) assert(!ser.newInstance().asInstanceOf[KryoSerializerInstance].getAutoReset()) testSupportsRelocationOfSerializedObjects(ser, generateRandomItem) diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index e7eed7bf4c879..255f91866ef58 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -1657,6 +1657,30 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("clean up used memory when BlockManager added") { + val listener = new AppStatusListener(store, conf, true) + // Add block manager at the first time + val driver = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, "localhost", 42) + listener.onBlockManagerAdded(SparkListenerBlockManagerAdded( + time, driver, 42L, Some(43L), Some(44L))) + // Update the memory metrics + listener.updateExecutorMemoryDiskInfo( + listener.liveExecutors(SparkContext.DRIVER_IDENTIFIER), + StorageLevel.MEMORY_AND_DISK, + 10L, + 10L + ) + // Re-add the same block manager again + listener.onBlockManagerAdded(SparkListenerBlockManagerAdded( + time, driver, 42L, Some(43L), Some(44L))) + + check[ExecutorSummaryWrapper](SparkContext.DRIVER_IDENTIFIER) { d => + val memoryMetrics = d.info.memoryMetrics.get + assert(memoryMetrics.usedOffHeapStorageMemory == 0) + assert(memoryMetrics.usedOnHeapStorageMemory == 0) + } + } + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptNumber) diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/ExecutorSummarySuite.scala b/core/src/test/scala/org/apache/spark/status/api/v1/ExecutorSummarySuite.scala new file mode 100644 index 0000000000000..2723af75227e4 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/api/v1/ExecutorSummarySuite.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status.api.v1 + +import java.util.Date + +import com.fasterxml.jackson.core.`type`.TypeReference +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule + +import org.apache.spark.SparkFunSuite + +class ExecutorSummarySuite extends SparkFunSuite { + + test("Check ExecutorSummary serialize and deserialize with empty peakMemoryMetrics") { + val mapper = new ObjectMapper().registerModule(DefaultScalaModule) + val executorSummary = new ExecutorSummary("id", "host:port", true, 1, + 10, 10, 1, 1, 1, + 0, 0, 1, 100, + 1, 100, 100, + 10, false, 20, new Date(1600984336352L), + Option.empty, Option.empty, Map(), Option.empty, Set(), Option.empty, Map(), Map()) + val expectedJson = "{\"id\":\"id\",\"hostPort\":\"host:port\",\"isActive\":true," + + "\"rddBlocks\":1,\"memoryUsed\":10,\"diskUsed\":10,\"totalCores\":1,\"maxTasks\":1," + + "\"activeTasks\":1,\"failedTasks\":0,\"completedTasks\":0,\"totalTasks\":1," + + "\"totalDuration\":100,\"totalGCTime\":1,\"totalInputBytes\":100," + + "\"totalShuffleRead\":100,\"totalShuffleWrite\":10,\"isBlacklisted\":false," + + "\"maxMemory\":20,\"addTime\":1600984336352,\"removeTime\":null,\"removeReason\":null," + + "\"executorLogs\":{},\"memoryMetrics\":null,\"blacklistedInStages\":[]," + + "\"peakMemoryMetrics\":null,\"attributes\":{},\"resources\":{}}" + val json = mapper.writeValueAsString(executorSummary) + assert(expectedJson.equals(json)) + val deserializeExecutorSummary = mapper.readValue(json, new TypeReference[ExecutorSummary] {}) + assert(deserializeExecutorSummary.peakMemoryMetrics == None) + } + +} diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerMasterSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerMasterSuite.scala new file mode 100644 index 0000000000000..0d54726af7ee8 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerMasterSuite.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.storage + +import org.junit.Assert.assertTrue + +import org.apache.spark.{SparkConf, SparkFunSuite} + +class BlockManagerMasterSuite extends SparkFunSuite { + + test("SPARK-31422: getMemoryStatus should not fail after BlockManagerMaster stops") { + val bmm = new BlockManagerMaster(null, null, new SparkConf, true) + assertTrue(bmm.getMemoryStatus.isEmpty) + } + + test("SPARK-31422: getStorageStatus should not fail after BlockManagerMaster stops") { + val bmm = new BlockManagerMaster(null, null, new SparkConf, true) + assertTrue(bmm.getStorageStatus.isEmpty) + } +} diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala index 59ace850d0bd2..6238d862d62df 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala @@ -92,8 +92,6 @@ trait BlockManagerReplicationBehavior extends SparkFunSuite conf.set(MEMORY_STORAGE_FRACTION, 0.999) conf.set(STORAGE_UNROLL_MEMORY_THRESHOLD, 512L) - // to make a replication attempt to inactive store fail fast - conf.set("spark.core.connection.ack.wait.timeout", "1s") // to make cached peers refresh frequently conf.set(STORAGE_CACHED_PEERS_TTL, 10) diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 89f00b5a9d902..9cf531383ce31 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -50,7 +50,7 @@ import org.apache.spark.network.server.{NoOpRpcHandler, TransportServer, Transpo import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, ExecutorDiskUtils, ExternalBlockStoreClient} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, RegisterExecutor} import org.apache.spark.rpc.RpcEnv -import org.apache.spark.scheduler.LiveListenerBus +import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerBlockUpdated} import org.apache.spark.security.{CryptoStreamUtils, EncryptionFunSuite} import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, SerializerManager} import org.apache.spark.shuffle.sort.SortShuffleManager @@ -71,6 +71,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val allStores = ArrayBuffer[BlockManager]() var rpcEnv: RpcEnv = null var master: BlockManagerMaster = null + var liveListenerBus: LiveListenerBus = null val securityMgr = new SecurityManager(new SparkConf(false)) val bcastManager = new BroadcastManager(true, new SparkConf(false), securityMgr) val mapOutputTracker = new MapOutputTrackerMaster(new SparkConf(false), bcastManager, true) @@ -129,10 +130,28 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE blockManager } + // Save modified system properties so that we can restore them after tests. 
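+ // SizeEstimator reads "os.arch" and the compressed-oops flag only when it is (re)initialized, so the original values must be restored and SizeEstimator re-initialized in afterEach to avoid leaking the forced 64-bit settings into other suites.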
+ val originalArch = System.getProperty("os.arch") + val originalCompressedOops = System.getProperty(TEST_USE_COMPRESSED_OOPS_KEY) + + def reinitializeSizeEstimator(arch: String, useCompressedOops: String): Unit = { + def set(k: String, v: String): Unit = { + if (v == null) { + System.clearProperty(k) + } else { + System.setProperty(k, v) + } + } + set("os.arch", arch) + set(TEST_USE_COMPRESSED_OOPS_KEY, useCompressedOops) + val initialize = PrivateMethod[Unit](Symbol("initialize")) + SizeEstimator invokePrivate initialize() + } + override def beforeEach(): Unit = { super.beforeEach() // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case - System.setProperty("os.arch", "amd64") + reinitializeSizeEstimator("amd64", "true") conf = new SparkConf(false) init(conf) @@ -145,17 +164,18 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE when(sc.conf).thenReturn(conf) val blockManagerInfo = new mutable.HashMap[BlockManagerId, BlockManagerInfo]() + liveListenerBus = spy(new LiveListenerBus(conf)) master = spy(new BlockManagerMaster(rpcEnv.setupEndpoint("blockmanager", new BlockManagerMasterEndpoint(rpcEnv, true, conf, - new LiveListenerBus(conf), None, blockManagerInfo)), + liveListenerBus, None, blockManagerInfo)), rpcEnv.setupEndpoint("blockmanagerHeartbeat", new BlockManagerMasterHeartbeatEndpoint(rpcEnv, true, blockManagerInfo)), conf, true)) - - val initialize = PrivateMethod[Unit](Symbol("initialize")) - SizeEstimator invokePrivate initialize() } override def afterEach(): Unit = { + // Restore system properties and SizeEstimator to their original states. + reinitializeSizeEstimator(originalArch, originalCompressedOops) + try { conf = null allStores.foreach(_.stop()) @@ -164,6 +184,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE rpcEnv.awaitTermination() rpcEnv = null master = null + liveListenerBus = null } finally { super.afterEach() } @@ -1693,6 +1714,16 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE assert(locs(blockIds(0)) == expectedLocs) } + test("SPARK-30594: Do not post SparkListenerBlockUpdated when updateBlockInfo returns false") { + // update block info for non-existent block manager + val updateInfo = UpdateBlockInfo(BlockManagerId("1", "host1", 100), + BlockId("test_1"), StorageLevel.MEMORY_ONLY, 1, 1) + val result = master.driverEndpoint.askSync[Boolean](updateInfo) + + assert(!result) + verify(liveListenerBus, never()).post(SparkListenerBlockUpdated(BlockUpdatedInfo(updateInfo))) + } + class MockBlockTransferService(val maxFailures: Int) extends BlockTransferService { var numCalls = 0 var tempFileManager: DownloadFileManager = null diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala index ccc525e854838..c757dee43808d 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala @@ -51,7 +51,7 @@ class DiskBlockManagerSuite extends SparkFunSuite with BeforeAndAfterEach with B override def beforeEach(): Unit = { super.beforeEach() val conf = testConf.clone - conf.set("spark.local.dir", rootDirs).set("spark.diskStore.subDirectories", "1") + conf.set("spark.local.dir", rootDirs) diskBlockManager = new DiskBlockManager(conf, deleteFilesOnStop = true) } @@ -90,45 +90,4 @@ class DiskBlockManagerSuite extends SparkFunSuite with 
BeforeAndAfterEach with B for (i <- 0 until numBytes) writer.write(i) writer.close() } - - test("temporary shuffle/local file should be able to handle disk failures") { - try { - // the following two lines pre-create subdirectories under each root dir of block manager - diskBlockManager.getFile("1") - diskBlockManager.getFile("2") - - val tempShuffleFile1 = diskBlockManager.createTempShuffleBlock()._2 - val tempLocalFile1 = diskBlockManager.createTempLocalBlock()._2 - assert(tempShuffleFile1.exists(), "There are no bad disks, so temp shuffle file exists") - assert(tempLocalFile1.exists(), "There are no bad disks, so temp local file exists") - - // partial disks damaged - rootDir0.setExecutable(false) - val tempShuffleFile2 = diskBlockManager.createTempShuffleBlock()._2 - val tempLocalFile2 = diskBlockManager.createTempLocalBlock()._2 - // It's possible that after 10 retries we still not able to find the healthy disk. we need to - // remove the flakiness of these two asserts - if (tempShuffleFile2.getParentFile.getParentFile.getParent === rootDir1.getAbsolutePath) { - assert(tempShuffleFile2.exists(), - "There is only one bad disk, so temp shuffle file should be created") - } - if (tempLocalFile2.getParentFile.getParentFile.getParent === rootDir1.getAbsolutePath) { - assert(tempLocalFile2.exists(), - "There is only one bad disk, so temp local file should be created") - } - - // all disks damaged - rootDir1.setExecutable(false) - val tempShuffleFile3 = diskBlockManager.createTempShuffleBlock()._2 - val tempLocalFile3 = diskBlockManager.createTempLocalBlock()._2 - assert(!tempShuffleFile3.exists(), - "All disks are broken, so there should be no temp shuffle file created") - assert(!tempLocalFile3.exists(), - "All disks are broken, so there should be no temp local file created") - } finally { - rootDir0.setExecutable(true) - rootDir1.setExecutable(true) - } - - } } diff --git a/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala b/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala index ccd7e4b62ad9e..c46ab2d199f0b 100644 --- a/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala @@ -26,6 +26,7 @@ import org.scalatest._ import org.apache.spark._ import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests.TEST_USE_COMPRESSED_OOPS_KEY import org.apache.spark.memory.{MemoryMode, UnifiedMemoryManager} import org.apache.spark.serializer.{KryoSerializer, SerializerManager} import org.apache.spark.storage.memory.{BlockEvictionHandler, MemoryStore, PartiallySerializedBlock, PartiallyUnrolledIterator} @@ -51,12 +52,34 @@ class MemoryStoreSuite implicit def StringToBlockId(value: String): BlockId = new TestBlockId(value) def rdd(rddId: Int, splitId: Int): RDDBlockId = RDDBlockId(rddId, splitId) + // Save modified system properties so that we can restore them after tests. 
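+ // (same save/restore pattern as in BlockManagerSuite, since SizeEstimator's cached settings are process-global)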
+ val originalArch = System.getProperty("os.arch") + val originalCompressedOops = System.getProperty(TEST_USE_COMPRESSED_OOPS_KEY) + + def reinitializeSizeEstimator(arch: String, useCompressedOops: String): Unit = { + def set(k: String, v: String): Unit = { + if (v == null) { + System.clearProperty(k) + } else { + System.setProperty(k, v) + } + } + set("os.arch", arch) + set(TEST_USE_COMPRESSED_OOPS_KEY, useCompressedOops) + val initialize = PrivateMethod[Unit](Symbol("initialize")) + SizeEstimator invokePrivate initialize() + } + override def beforeEach(): Unit = { super.beforeEach() // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case - System.setProperty("os.arch", "amd64") - val initialize = PrivateMethod[Unit](Symbol("initialize")) - SizeEstimator invokePrivate initialize() + reinitializeSizeEstimator("amd64", "true") + } + + override def afterEach(): Unit = { + super.afterEach() + // Restore system properties and SizeEstimator to their original states. + reinitializeSizeEstimator(originalArch, originalCompressedOops) } def makeMemoryStore(maxMem: Long): (MemoryStore, BlockInfoManager) = { diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index 45f47c7c49bca..43917a5b83bb0 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -36,6 +36,7 @@ import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, ExternalBlockStoreClient} import org.apache.spark.network.util.LimitedInputStream import org.apache.spark.shuffle.FetchFailedException +import org.apache.spark.storage.ShuffleBlockFetcherIterator.FetchBlockInfo import org.apache.spark.util.Utils @@ -254,6 +255,98 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT intercept[FetchFailedException] { iterator.next() } } + test("Hit maxBytesInFlight limitation before maxBlocksInFlightPerAddress") { + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-client", "test-local-host", 1) + doReturn(localBmId).when(blockManager).blockManagerId + + val remoteBmId1 = BlockManagerId("test-remote-client-1", "test-remote-host1", 1) + val remoteBmId2 = BlockManagerId("test-remote-client-2", "test-remote-host2", 2) + val blockId1 = ShuffleBlockId(0, 1, 0) + val blockId2 = ShuffleBlockId(1, 1, 0) + val blocksByAddress = Seq( + (remoteBmId1, Seq((blockId1, 1000L, 0))), + (remoteBmId2, Seq((blockId2, 1000L, 0)))).toIterator + val transfer = createMockTransfer(Map( + blockId1 -> createMockManagedBuffer(1000), + blockId2 -> createMockManagedBuffer(1000))) + val taskContext = TaskContext.empty() + val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 1000L, // allow 1 FetchRequests at most at the same time + Int.MaxValue, + Int.MaxValue, // set maxBlocksInFlightPerAddress to Int.MaxValue + Int.MaxValue, + true, + false, + metrics, + false) + // After initialize() we'll have 2 FetchRequests and each is 1000 bytes. So only the + // first FetchRequests can be sent, and the second one will hit maxBytesInFlight so + // it won't be sent. 
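+ // (the two blocks live on different hosts and maxBlocksInFlightPerAddress is unlimited here, so only the byte limit defers the second request)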
+ verify(transfer, times(1)).fetchBlocks(any(), any(), any(), any(), any(), any()) + assert(iterator.hasNext) + // next() will trigger off sending deferred request + iterator.next() + // the second FetchRequest should be sent at this time + verify(transfer, times(2)).fetchBlocks(any(), any(), any(), any(), any(), any()) + assert(iterator.hasNext) + iterator.next() + assert(!iterator.hasNext) + } + + test("Hit maxBlocksInFlightPerAddress limitation before maxBytesInFlight") { + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-client", "test-local-host", 1) + doReturn(localBmId).when(blockManager).blockManagerId + + val remoteBmId = BlockManagerId("test-remote-client-1", "test-remote-host", 2) + val blockId1 = ShuffleBlockId(0, 1, 0) + val blockId2 = ShuffleBlockId(0, 2, 0) + val blockId3 = ShuffleBlockId(0, 3, 0) + val blocksByAddress = Seq((remoteBmId, + Seq((blockId1, 1000L, 0), (blockId2, 1000L, 0), (blockId3, 1000L, 0)))).toIterator + val transfer = createMockTransfer(Map( + blockId1 -> createMockManagedBuffer(), + blockId2 -> createMockManagedBuffer(), + blockId3 -> createMockManagedBuffer())) + val taskContext = TaskContext.empty() + val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + Int.MaxValue, // set maxBytesInFlight to Int.MaxValue + Int.MaxValue, + 2, // set maxBlocksInFlightPerAddress to 2 + Int.MaxValue, + true, + false, + metrics, + false) + // After initialize(), we'll have 2 FetchRequests that one has 2 blocks inside and another one + // has only one block. So only the first FetchRequest can be sent. The second FetchRequest will + // hit maxBlocksInFlightPerAddress so it won't be sent. + verify(transfer, times(1)).fetchBlocks(any(), any(), any(), any(), any(), any()) + // the first request packaged 2 blocks, so we also need to + // call next() for 2 times to exhaust the iterator. 
+ assert(iterator.hasNext) + iterator.next() + assert(iterator.hasNext) + iterator.next() + verify(transfer, times(2)).fetchBlocks(any(), any(), any(), any(), any(), any()) + assert(iterator.hasNext) + iterator.next() + assert(!iterator.hasNext) + } + test("fetch continuous blocks in batch successful 3 local + 4 host local + 2 remote reads") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) @@ -341,32 +434,86 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) } - test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + test("fetch continuous blocks in batch should respect maxBytesInFlight") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure remote blocks would return the merged block - val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) - val remoteBlocks = Seq[BlockId]( + val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1) + val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2) + val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _)) + val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1)) + val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer()) + val transfer = createMockTransfer(mergedRemoteBlocks) + + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))), + (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 1)))).toIterator + + val taskContext = TaskContext.empty() + val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 1500, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + false, + metrics, + true) + + var numResults = 0 + // After initialize(), there will be 6 FetchRequests. And each of the first 5 requests + // includes 1 merged block which is merged from 3 shuffle blocks. The last request has 1 merged + // block which is merged from 2 shuffle blocks. So, only the first 5 requests (5 * 3 * 100 >= 1500) + // can be sent. The 6th FetchRequest will hit maxBytesInFlight so it won't + // be sent. + verify(transfer, times(5)).fetchBlocks(any(), any(), any(), any(), any(), any()) + while (iterator.hasNext) { + val (blockId, inputStream) = iterator.next() + // Make sure we release buffers when a wrapped input stream is closed. + val mockBuf = mergedRemoteBlocks(blockId) + verifyBufferRelease(mockBuf, inputStream) + numResults += 1 + } + // The 6th request will be sent after next() is called.
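+ // (once consuming earlier results has freed enough of the 1500-byte in-flight budget)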
+ verify(transfer, times(6)).fetchBlocks(any(), any(), any(), any(), any(), any()) + assert(numResults == 6) + } + + test("fetch continuous blocks in batch should respect maxBlocksInFlightPerAddress") { + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-client", "test-local-host", 1) + doReturn(localBmId).when(blockManager).blockManagerId + + // Make sure remote blocks would return the merged block + val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 1) + val remoteBlocks = Seq( ShuffleBlockId(0, 3, 0), ShuffleBlockId(0, 3, 1), - ShuffleBlockId(0, 3, 2), ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1), - ShuffleBlockId(0, 5, 0), - ShuffleBlockId(0, 5, 1), - ShuffleBlockId(0, 5, 2)) + ShuffleBlockId(0, 5, 0)) val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( - ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 0, 2) -> createMockManagedBuffer(), ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer(), - ShuffleBlockBatchId(0, 5, 0, 3) -> createMockManagedBuffer()) - val transfer = createMockTransfer(mergedRemoteBlocks) + ShuffleBlockBatchId(0, 5, 0, 1) -> createMockManagedBuffer()) + val transfer = createMockTransfer(mergedRemoteBlocks) val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( - (remoteBmId, remoteBlocks.map(blockId => (blockId, 1L, 1))) - ).toIterator - + (remoteBmId, remoteBlocks.map(blockId => (blockId, 100L, 1)))).toIterator val taskContext = TaskContext.empty() val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() val iterator = new ShuffleBlockFetcherIterator( @@ -375,7 +522,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT blockManager, blocksByAddress, (_, in) => in, - 35, + Int.MaxValue, Int.MaxValue, 2, Int.MaxValue, @@ -383,8 +530,12 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT false, metrics, true) - var numResults = 0 + // After initialize(), there will be 2 FetchRequests. First one has 2 merged blocks and each + // of them is merged from 2 shuffle blocks, second one has 1 merged block which is merged from + // 1 shuffle block. So only the first FetchRequest can be sent. The second FetchRequest will + // hit maxBlocksInFlightPerAddress so it won't be sent. + verify(transfer, times(1)).fetchBlocks(any(), any(), any(), any(), any(), any()) while (iterator.hasNext) { val (blockId, inputStream) = iterator.next() // Make sure we release buffers when a wrapped input stream is closed. @@ -392,8 +543,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT verifyBufferRelease(mockBuf, inputStream) numResults += 1 } - // The first 2 batch block ids are in the same fetch request as they don't exceed the max size - // and max blocks, so 2 requests in total. + // The second request will be sent after next() is called. 
verify(transfer, times(2)).fetchBlocks(any(), any(), any(), any(), any(), any()) assert(numResults == 3) } @@ -922,4 +1072,23 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val e = intercept[FetchFailedException] { iterator.next() } assert(e.getMessage.contains("Received a zero-size buffer")) } + + test("SPARK-31521: correct the fetch size when merging blocks into a merged block") { + val bId1 = ShuffleBlockBatchId(0, 0, 0, 5) + val bId2 = ShuffleBlockId(0, 0, 6) + val bId3 = ShuffleBlockId(0, 0, 7) + val block1 = FetchBlockInfo(bId1, 40, 0) + val block2 = FetchBlockInfo(bId2, 50, 0) + val block3 = FetchBlockInfo(bId3, 60, 0) + val inputBlocks = Seq(block1, block2, block3) + + val mergedBlocks = ShuffleBlockFetcherIterator. + mergeContinuousShuffleBlockIdsIfNeeded(inputBlocks, true) + assert(mergedBlocks.size === 1) + val mergedBlock = mergedBlocks.head + val mergedBlockId = mergedBlock.blockId.asInstanceOf[ShuffleBlockBatchId] + assert(mergedBlockId.startReduceId === bId1.startReduceId) + assert(mergedBlockId.endReduceId === bId3.reduceId + 1) + assert(mergedBlock.size === inputBlocks.map(_.size).sum) + } } diff --git a/core/src/test/scala/org/apache/spark/ui/ChromeUISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/ChromeUISeleniumSuite.scala new file mode 100644 index 0000000000000..9ba705c4abd75 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/ui/ChromeUISeleniumSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ui + +import org.openqa.selenium.WebDriver +import org.openqa.selenium.chrome.{ChromeDriver, ChromeOptions} + +import org.apache.spark.tags.ChromeUITest + +/** + * Selenium tests for the Spark Web UI with Chrome. 
+ */ +@ChromeUITest +class ChromeUISeleniumSuite extends RealBrowserUISeleniumSuite("webdriver.chrome.driver") { + + override var webDriver: WebDriver = _ + + override def beforeAll(): Unit = { + super.beforeAll() + val chromeOptions = new ChromeOptions + chromeOptions.addArguments("--headless", "--disable-gpu") + webDriver = new ChromeDriver(chromeOptions) + } + + override def afterAll(): Unit = { + try { + if (webDriver != null) { + webDriver.quit() + } + } finally { + super.afterAll() + } + } +} diff --git a/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala b/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala index d18f55474bdb3..4125436907007 100644 --- a/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala @@ -85,6 +85,35 @@ class PagedTableSuite extends SparkFunSuite { assert((pagedTable.pageNavigation(93, 10, 97).head \\ "li").map(_.text.trim) === Seq("<<", "<") ++ (91 to 97).map(_.toString) ++ Seq(">")) } + + test("pageNavigation with different id") { + val pagedTable = new PagedTable[Int] { + override def tableId: String = "testTable" + + override def tableCssClass: String = "" + + override def dataSource: PagedDataSource[Int] = null + + override def pageLink(page: Int): String = "" + + override def headers: Seq[Node] = Nil + + override def row(t: Int): Seq[Node] = Nil + + override def pageSizeFormField: String = "" + + override def pageNumberFormField: String = "" + + override def goButtonFormPath: String = "" + } + + val defaultIdNavigation = pagedTable.pageNavigation(1, 10, 2).head \\ "form" + assert(defaultIdNavigation \@ "id" === "form-testTable-page") + + val customIdNavigation = pagedTable.pageNavigation(1, 10, 2, "customIdTable").head \\ "form" + assert(customIdNavigation \@ "id" === "form-customIdTable-page") + assert(defaultIdNavigation !== customIdNavigation) + } } private[spark] class SeqPagedDataSource[T](seq: Seq[T], pageSize: Int) diff --git a/core/src/test/scala/org/apache/spark/ui/RealBrowserUISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/RealBrowserUISeleniumSuite.scala new file mode 100644 index 0000000000000..4b018f69b1660 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/ui/RealBrowserUISeleniumSuite.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ui + +import org.openqa.selenium.{By, WebDriver} +import org.scalatest._ +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ +import org.scalatestplus.selenium.WebBrowser + +import org.apache.spark._ +import org.apache.spark.LocalSparkContext.withSpark +import org.apache.spark.internal.config.MEMORY_OFFHEAP_SIZE +import org.apache.spark.internal.config.UI.{UI_ENABLED, UI_KILL_ENABLED, UI_PORT} +import org.apache.spark.util.CallSite + +/** + * Selenium tests for the Spark Web UI with real web browsers. + */ +abstract class RealBrowserUISeleniumSuite(val driverProp: String) + extends SparkFunSuite with WebBrowser with Matchers with BeforeAndAfterAll { + + implicit var webDriver: WebDriver + private val driverPropPrefix = "spark.test." + + override def beforeAll(): Unit = { + super.beforeAll() + assume( + sys.props(driverPropPrefix + driverProp) !== null, + "System property " + driverPropPrefix + driverProp + + " should be set to the corresponding driver path.") + sys.props(driverProp) = sys.props(driverPropPrefix + driverProp) + } + + override def afterAll(): Unit = { + sys.props.remove(driverProp) + super.afterAll() + } + + test("SPARK-31534: text for tooltip should be escaped") { + withSpark(newSparkContext()) { sc => + sc.setLocalProperty(CallSite.LONG_FORM, "collect at <console>:25") + sc.setLocalProperty(CallSite.SHORT_FORM, "collect at <console>:25") + sc.parallelize(1 to 10).collect + + eventually(timeout(10.seconds), interval(50.milliseconds)) { + goToUi(sc, "/jobs") + + val jobDesc = + webDriver.findElement(By.cssSelector("div[class='application-timeline-content']")) + jobDesc.getAttribute("data-title") should include ("collect at <console>:25") + + goToUi(sc, "/jobs/job/?id=0") + webDriver.get(sc.ui.get.webUrl.stripSuffix("/") + "/jobs/job/?id=0") + val stageDesc = webDriver.findElement(By.cssSelector("div[class='job-timeline-content']")) + stageDesc.getAttribute("data-title") should include ("collect at <console>:25") + + // Open DAG Viz. + webDriver.findElement(By.id("job-dag-viz")).click() + val nodeDesc = webDriver.findElement(By.cssSelector("g[class='node_0 node']")) + nodeDesc.getAttribute("name") should include ("collect at <console>:25") + } + } + } + + test("SPARK-31882: Link URL for Stage DAGs should not depend on paged table.") { + withSpark(newSparkContext()) { sc => + sc.parallelize(1 to 100).map(v => (v, v)).repartition(10).reduceByKey(_ + _).collect + + eventually(timeout(10.seconds), interval(50.microseconds)) { + val pathWithPagedTable = + "/jobs/job/?id=0&completedStage.page=2&completedStage.sort=Stage+Id&" + + "completedStage.desc=true&completedStage.pageSize=1#completed" + goToUi(sc, pathWithPagedTable) + + // Open DAG Viz.
+ webDriver.findElement(By.id("job-dag-viz")).click() + val stages = webDriver.findElements(By.cssSelector("svg[class='job'] > a")) + stages.size() should be (3) + + stages.get(0).getAttribute("href") should include ("/stages/stage/?id=0&attempt=0") + stages.get(1).getAttribute("href") should include ("/stages/stage/?id=1&attempt=0") + stages.get(2).getAttribute("href") should include ("/stages/stage/?id=2&attempt=0") + } + } + } + + test("SPARK-31886: Color barrier execution mode RDD correctly") { + withSpark(newSparkContext()) { sc => + sc.parallelize(1 to 10).barrier.mapPartitions(identity).repartition(1).collect() + + eventually(timeout(10.seconds), interval(50.milliseconds)) { + goToUi(sc, "/jobs/job/?id=0") + webDriver.findElement(By.id("job-dag-viz")).click() + + val stage0 = webDriver.findElement(By.cssSelector("g[id='graph_0']")) + val stage1 = webDriver.findElement(By.cssSelector("g[id='graph_1']")) + val barrieredOps = webDriver.findElements(By.className("barrier-rdd")).iterator() + + while (barrieredOps.hasNext) { + val barrieredOpId = barrieredOps.next().getAttribute("innerHTML") + val foundInStage0 = + stage0.findElements( + By.cssSelector("g.barrier.cluster.cluster_" + barrieredOpId)) + assert(foundInStage0.size === 1) + + val foundInStage1 = + stage1.findElements( + By.cssSelector("g.barrier.cluster.cluster_" + barrieredOpId)) + assert(foundInStage1.size === 0) + } + } + } + } + + /** + * Create a test SparkContext with the SparkUI enabled. + * It is safe to `get` the SparkUI directly from the SparkContext returned here. + */ + private def newSparkContext( + killEnabled: Boolean = true, + master: String = "local", + additionalConfs: Map[String, String] = Map.empty): SparkContext = { + val conf = new SparkConf() + .setMaster(master) + .setAppName("test") + .set(UI_ENABLED, true) + .set(UI_PORT, 0) + .set(UI_KILL_ENABLED, killEnabled) + .set(MEMORY_OFFHEAP_SIZE.key, "64m") + additionalConfs.foreach { case (k, v) => conf.set(k, v) } + val sc = new SparkContext(conf) + assert(sc.ui.isDefined) + sc + } + + def goToUi(sc: SparkContext, path: String): Unit = { + goToUi(sc.ui.get, path) + } + + def goToUi(ui: SparkUI, path: String): Unit = { + go to (ui.webUrl.stripSuffix("/") + path) + } +} diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 9f0cdeac9ca39..909056eab8c5a 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -44,10 +44,11 @@ import org.apache.spark.internal.config.Status._ import org.apache.spark.internal.config.UI._ import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.status.api.v1.{JacksonMessageWriter, RDDDataDistribution, StageStatus} +import org.apache.spark.util.CallSite private[spark] class SparkUICssErrorHandler extends DefaultCssErrorHandler { - private val cssWhiteList = List("bootstrap.min.css", "vis.min.css") + private val cssWhiteList = List("bootstrap.min.css", "vis-timeline-graph2d.min.css") private def isInWhileList(uri: String): Boolean = cssWhiteList.exists(uri.endsWith) @@ -687,29 +688,29 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B assert(stage0.contains("digraph G {\n subgraph clusterstage_0 {\n " + "label=&quot;Stage 0&quot;;\n subgraph ")) assert(stage0.contains("{\n label=&quot;parallelize&quot;;\n " + - "0 [label=&quot;ParallelCollectionRDD [0]")) + "0 [labelType=&quot;html&quot; label=&quot;ParallelCollectionRDD [0]"))
assert(stage0.contains("{\n label=&quot;map&quot;;\n " + - "1 [label=&quot;MapPartitionsRDD [1]")) + "1 [labelType=&quot;html&quot; label=&quot;MapPartitionsRDD [1]")) assert(stage0.contains("{\n label=&quot;groupBy&quot;;\n " + - "2 [label=&quot;MapPartitionsRDD [2]")) + "2 [labelType=&quot;html&quot; label=&quot;MapPartitionsRDD [2]")) val stage1 = Source.fromURL(sc.ui.get.webUrl + "/stages/stage/?id=1&attempt=0&expandDagViz=true").mkString assert(stage1.contains("digraph G {\n subgraph clusterstage_1 {\n " + "label=&quot;Stage 1&quot;;\n subgraph ")) assert(stage1.contains("{\n label=&quot;groupBy&quot;;\n " + - "3 [label=&quot;ShuffledRDD [3]")) + "3 [labelType=&quot;html&quot; label=&quot;ShuffledRDD [3]")) assert(stage1.contains("{\n label=&quot;map&quot;;\n " + - "4 [label=&quot;MapPartitionsRDD [4]")) + "4 [labelType=&quot;html&quot; label=&quot;MapPartitionsRDD [4]")) assert(stage1.contains("{\n label=&quot;groupBy&quot;;\n " + - "5 [label=&quot;MapPartitionsRDD [5]")) + "5 [labelType=&quot;html&quot; label=&quot;MapPartitionsRDD [5]")) val stage2 = Source.fromURL(sc.ui.get.webUrl + "/stages/stage/?id=2&attempt=0&expandDagViz=true").mkString assert(stage2.contains("digraph G {\n subgraph clusterstage_2 {\n " + "label=&quot;Stage 2&quot;;\n subgraph ")) assert(stage2.contains("{\n label=&quot;groupBy&quot;;\n " + - "6 [label=&quot;ShuffledRDD [6]")) + "6 [labelType=&quot;html&quot; label=&quot;ShuffledRDD [6]")) } } } diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala index 2ad4a634cd9a7..c7e1dfe71d563 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala @@ -216,6 +216,15 @@ class UISuite extends SparkFunSuite { assert(rewrittenURI === null) } + test("SPARK-33611: Avoid encoding twice on the query parameter of proxy rewrittenURI") { + val prefix = "/worker-id" + val target = "http://localhost:8081" + val path = "/worker-id/json" + val rewrittenURI = + JettyUtils.createProxyURI(prefix, target, path, "order%5B0%5D%5Bcolumn%5D=0") + assert(rewrittenURI.toString === "http://localhost:8081/json?order%5B0%5D%5Bcolumn%5D=0") + } + test("verify rewriting location header for reverse proxy") { val clientRequest = mock(classOf[HttpServletRequest]) var headerValue = "http://localhost:4040/jobs" @@ -262,6 +271,27 @@ class UISuite extends SparkFunSuite { } } + test("SPARK-32467: Avoid encoding URL twice on https redirect") { + val (conf, securityMgr, sslOptions) = sslEnabledConf() + val serverInfo = JettyUtils.startJettyServer("0.0.0.0", 0, sslOptions, conf) + try { + val serverAddr = s"http://localhost:${serverInfo.boundPort}" + + val (_, ctx) = newContext("/ctx1") + serverInfo.addHandler(ctx, securityMgr) + + TestUtils.withHttpConnection(new URL(s"$serverAddr/ctx%281%29?a%5B0%5D=b")) { conn => + assert(conn.getResponseCode() === HttpServletResponse.SC_FOUND) + val location = Option(conn.getHeaderFields().get("Location")) + .map(_.get(0)).orNull + val expectedLocation = s"https://localhost:${serverInfo.securePort.get}/ctx(1)?a[0]=b" + assert(location == expectedLocation) + } + } finally { + stopServer(serverInfo) + } + } + test("http -> https redirect applies to all URIs") { val (conf, securityMgr, sslOptions) = sslEnabledConf() val serverInfo = JettyUtils.startJettyServer("0.0.0.0", 0, sslOptions, conf) diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index 21e69550785a4..12d97573ff6ee 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -337,7 +337,7
@@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { assert(generatedFiles.size > 1) if (isCompressed) { assert( - generatedFiles.filter(_.getName.endsWith(RollingFileAppender.GZIP_LOG_SUFFIX)).size > 0) + generatedFiles.exists(_.getName.endsWith(RollingFileAppender.GZIP_LOG_SUFFIX))) } val allText = generatedFiles.map { file => if (file.getName.endsWith(RollingFileAppender.GZIP_LOG_SUFFIX)) { diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index a2a4b3aa974fc..b77cd810a0e61 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -305,6 +305,17 @@ class JsonProtocolSuite extends SparkFunSuite { assert(expectedFetchFailed === JsonProtocol.taskEndReasonFromJson(oldEvent)) } + test("SPARK-32124: FetchFailed Map Index backwards compatibility") { + // FetchFailed in Spark 2.4.0 does not have "Map Index" property. + val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, 18, 19, + "ignored") + val oldEvent = JsonProtocol.taskEndReasonToJson(fetchFailed) + .removeField({ _._1 == "Map Index" }) + val expectedFetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, + Int.MinValue, 19, "ignored") + assert(expectedFetchFailed === JsonProtocol.taskEndReasonFromJson(oldEvent)) + } + test("ShuffleReadMetrics: Local bytes read backwards compatibility") { // Metrics about local shuffle bytes read were added in 1.3.1. val metrics = makeTaskMetrics(1L, 2L, 3L, 4L, 5, 6, @@ -481,6 +492,76 @@ class JsonProtocolSuite extends SparkFunSuite { testAccumValue(Some("anything"), blocks, JString(blocks.toString)) testAccumValue(Some("anything"), 123, JString("123")) } + + /** Create an AccumulableInfo and verify we can serialize and deserialize it. */ + private def testAccumulableInfo( + name: String, + value: Option[Any], + expectedValue: Option[Any]): Unit = { + val isInternal = name.startsWith(InternalAccumulator.METRICS_PREFIX) + val accum = AccumulableInfo( + 123L, + Some(name), + update = value, + value = value, + internal = isInternal, + countFailedValues = false) + val json = JsonProtocol.accumulableInfoToJson(accum) + val newAccum = JsonProtocol.accumulableInfoFromJson(json) + assert(newAccum == accum.copy(update = expectedValue, value = expectedValue)) + } + + test("SPARK-31923: unexpected value type of internal accumulator") { + // Because a user may use `METRICS_PREFIX` in an accumulator name, we should test unexpected + // types to make sure we don't crash. 
+ import InternalAccumulator.METRICS_PREFIX + testAccumulableInfo( + METRICS_PREFIX + "fooString", + value = Some("foo"), + expectedValue = None) + testAccumulableInfo( + METRICS_PREFIX + "fooList", + value = Some(java.util.Arrays.asList("string")), + expectedValue = Some(java.util.Collections.emptyList()) + ) + val blocks = Seq( + (TestBlockId("block1"), BlockStatus(StorageLevel.MEMORY_ONLY, 1L, 2L)), + (TestBlockId("block2"), BlockStatus(StorageLevel.DISK_ONLY, 3L, 4L))) + testAccumulableInfo( + METRICS_PREFIX + "fooList", + value = Some(java.util.Arrays.asList( + "string", + blocks(0), + blocks(1))), + expectedValue = Some(blocks.asJava) + ) + testAccumulableInfo( + METRICS_PREFIX + "fooSet", + value = Some(Set("foo")), + expectedValue = None) + } + + test("SPARK-30936: forwards compatibility - ignore unknown fields") { + val expected = TestListenerEvent("foo", 123) + val unknownFieldsJson = + """{ + | "Event" : "org.apache.spark.util.TestListenerEvent", + | "foo" : "foo", + | "bar" : 123, + | "unknown" : "unknown" + |}""".stripMargin + assert(JsonProtocol.sparkEventFromJson(parse(unknownFieldsJson)) === expected) + } + + test("SPARK-30936: backwards compatibility - set default values for missing fields") { + val expected = TestListenerEvent("foo", 0) + val unknownFieldsJson = + """{ + | "Event" : "org.apache.spark.util.TestListenerEvent", + | "foo" : "foo" + |}""".stripMargin + assert(JsonProtocol.sparkEventFromJson(parse(unknownFieldsJson)) === expected) + } } @@ -1041,6 +1122,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 201, | "Number of Cached Partitions": 301, | "Memory Size": 401, @@ -1563,6 +1645,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 200, | "Number of Cached Partitions": 300, | "Memory Size": 400, @@ -1607,6 +1690,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 400, | "Number of Cached Partitions": 600, | "Memory Size": 800, @@ -1623,6 +1707,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 401, | "Number of Cached Partitions": 601, | "Memory Size": 801, @@ -1667,6 +1752,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 600, | "Number of Cached Partitions": 900, | "Memory Size": 1200, @@ -1683,6 +1769,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 601, | "Number of Cached Partitions": 901, | "Memory Size": 1201, @@ -1699,6 +1786,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 602, | "Number of Cached Partitions": 902, | "Memory Size": 1202, @@ -1743,6 +1831,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 800, | "Number of Cached Partitions": 1200, | "Memory Size": 1600, @@ -1759,6 +1848,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | 
"Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 801, | "Number of Cached Partitions": 1201, | "Memory Size": 1601, @@ -1775,6 +1865,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 802, | "Number of Cached Partitions": 1202, | "Memory Size": 1602, @@ -1791,6 +1882,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Deserialized": true, | "Replication": 1 | }, + | "Barrier" : false, | "Number of Partitions": 803, | "Number of Cached Partitions": 1203, | "Memory Size": 1603, @@ -2310,3 +2402,5 @@ private[spark] object JsonProtocolSuite extends Assertions { |} """.stripMargin } + +case class TestListenerEvent(foo: String, bar: Int) extends SparkListenerEvent diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala index d4f2053e0b2f4..6183ba9faa6b4 100644 --- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala @@ -73,15 +73,35 @@ class SizeEstimatorSuite with PrivateMethodTester with ResetSystemProperties { + // Save modified system properties so that we can restore them after tests. + val originalArch = System.getProperty("os.arch") + val originalCompressedOops = System.getProperty(TEST_USE_COMPRESSED_OOPS_KEY) + + def reinitializeSizeEstimator(arch: String, useCompressedOops: String): Unit = { + def set(k: String, v: String): Unit = { + if (v == null) { + System.clearProperty(k) + } else { + System.setProperty(k, v) + } + } + set("os.arch", arch) + set(TEST_USE_COMPRESSED_OOPS_KEY, useCompressedOops) + val initialize = PrivateMethod[Unit](Symbol("initialize")) + SizeEstimator invokePrivate initialize() + } + override def beforeEach(): Unit = { - // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case super.beforeEach() - System.setProperty("os.arch", "amd64") - System.setProperty(TEST_USE_COMPRESSED_OOPS_KEY, "true") + // Set the arch to 64-bit and compressedOops to true so that SizeEstimator + // provides identical results accross all systems in these tests. + reinitializeSizeEstimator("amd64", "true") } override def afterEach(): Unit = { super.afterEach() + // Restore system properties and SizeEstimator to their original states. + reinitializeSizeEstimator(originalArch, originalCompressedOops) } test("simple classes") { @@ -178,11 +198,7 @@ class SizeEstimatorSuite } test("32-bit arch") { - System.setProperty("os.arch", "x86") - - val initialize = PrivateMethod[Unit](Symbol("initialize")) - SizeEstimator invokePrivate initialize() - + reinitializeSizeEstimator("x86", "true") assertResult(40)(SizeEstimator.estimate(DummyString(""))) assertResult(48)(SizeEstimator.estimate(DummyString("a"))) assertResult(48)(SizeEstimator.estimate(DummyString("ab"))) @@ -192,11 +208,7 @@ class SizeEstimatorSuite // NOTE: The String class definition varies across JDK versions (1.6 vs. 1.7) and vendors // (Sun vs IBM). Use a DummyString class to make tests deterministic. 
test("64-bit arch with no compressed oops") { - System.setProperty("os.arch", "amd64") - System.setProperty(TEST_USE_COMPRESSED_OOPS_KEY, "false") - val initialize = PrivateMethod[Unit](Symbol("initialize")) - SizeEstimator invokePrivate initialize() - + reinitializeSizeEstimator("amd64", "false") assertResult(56)(SizeEstimator.estimate(DummyString(""))) assertResult(64)(SizeEstimator.estimate(DummyString("a"))) assertResult(64)(SizeEstimator.estimate(DummyString("ab"))) @@ -214,14 +226,13 @@ class SizeEstimatorSuite } test("class field blocks rounding on 64-bit VM without useCompressedOops") { + reinitializeSizeEstimator("amd64", "false") assertResult(24)(SizeEstimator.estimate(new DummyClass5)) assertResult(32)(SizeEstimator.estimate(new DummyClass6)) } test("check 64-bit detection for s390x arch") { - System.setProperty("os.arch", "s390x") - val initialize = PrivateMethod[Unit](Symbol("initialize")) - SizeEstimator invokePrivate initialize() + reinitializeSizeEstimator("s390x", "true") // Class should be 32 bytes on s390x if recognised as 64 bit platform assertResult(32)(SizeEstimator.estimate(new DummyClass7)) } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 8f8902e497d49..931eb6b5413f7 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -745,10 +745,14 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { manager.add(3, () => output += 3) manager.add(2, () => output += 2) manager.add(4, () => output += 4) + manager.add(Int.MinValue, () => output += Int.MinValue) + manager.add(Int.MinValue, () => output += Int.MinValue) + manager.add(Int.MaxValue, () => output += Int.MaxValue) + manager.add(Int.MaxValue, () => output += Int.MaxValue) manager.remove(hook1) manager.runAll() - assert(output.toList === List(4, 3, 2)) + assert(output.toList === List(Int.MaxValue, Int.MaxValue, 4, 3, 2, Int.MinValue, Int.MinValue)) } test("isInDirectory") { @@ -1243,6 +1247,10 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { intercept[IllegalArgumentException] { Utils.checkAndGetK8sMasterUrl("k8s://foo://host:port") } + + intercept[IllegalArgumentException] { + Utils.checkAndGetK8sMasterUrl("k8s:///https://host:port") + } } test("stringHalfWidth") { diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 73f461255de43..20a4002def2fb 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -27,8 +27,8 @@ d3.min.js dagre-d3.min.js graphlib-dot.min.js sorttable.js -vis.min.js -vis.min.css +vis-timeline-graph2d.min.js +vis-timeline-graph2d.min.css dataTables.bootstrap.css dataTables.bootstrap.min.js dataTables.rowsGroup.js @@ -39,11 +39,11 @@ jquery.dataTables.1.10.18.min.js jquery.mustache.js jsonFormatter.min.css jsonFormatter.min.js -.*avsc -.*txt -.*json -.*data -.*log +.*\.avsc +.*\.txt +.*\.json +.*\.data +.*\.log pyspark-coverage-site/ cloudpickle.py heapq3.py @@ -96,17 +96,17 @@ local-1430917381535_2 DESCRIPTION NAMESPACE test_support/* -.*Rd +.*\.Rd help/* html/* INDEX .lintr gen-java.* -.*avpr -.*parquet +.*\.avpr +.*\.parquet spark-deps-.* -.*csv -.*tsv +.*\.csv +.*\.tsv .*\.sql .Rbuildignore META-INF/* @@ -119,3 +119,11 @@ vote.tmpl SessionManager.java SessionHandler.java GangliaReporter.java +_metadata +_SUCCESS +part-00000 +.*\.res +flights_tiny.txt.1 +over1k +over10k +exported_table/* diff --git a/dev/check-license b/dev/check-license index 
0cc17ffe55c67..bd255954d6db4 100755 --- a/dev/check-license +++ b/dev/check-license @@ -67,7 +67,7 @@ mkdir -p "$FWDIR"/lib exit 1 } -mkdir target +mkdir -p target $java_cmd -jar "$rat_jar" -E "$FWDIR"/dev/.rat-excludes -d "$FWDIR" > target/rat-results.txt if [ $? -ne 0 ]; then diff --git a/dev/create-release/do-release-docker.sh b/dev/create-release/do-release-docker.sh index 694a87bf78084..8f53f4a4e13ad 100755 --- a/dev/create-release/do-release-docker.sh +++ b/dev/create-release/do-release-docker.sh @@ -54,7 +54,7 @@ WORKDIR= IMGTAG=latest JAVA= RELEASE_STEP= -while getopts "d:hj:ns:t:" opt; do +while getopts ":d:hj:ns:t:" opt; do case $opt in d) WORKDIR="$OPTARG" ;; n) DRY_RUN=1 ;; @@ -62,7 +62,7 @@ while getopts "d:hj:ns:t:" opt; do j) JAVA="$OPTARG" ;; s) RELEASE_STEP="$OPTARG" ;; h) usage ;; - ?) error "Invalid option. Run with -h for help." ;; + \?) error "Invalid option. Run with -h for help." ;; esac done @@ -93,7 +93,7 @@ done GPG_KEY_FILE="$WORKDIR/gpg.key" fcreate_secure "$GPG_KEY_FILE" -$GPG --export-secret-key --armor "$GPG_KEY" > "$GPG_KEY_FILE" +$GPG --export-secret-key --armor --pinentry-mode loopback --passphrase "$GPG_PASSPHRASE" "$GPG_KEY" > "$GPG_KEY_FILE" run_silent "Building spark-rm image with tag $IMGTAG..." "docker-build.log" \ docker build -t "spark-rm:$IMGTAG" --build-arg UID=$UID "$SELF/spark-rm" @@ -128,6 +128,7 @@ ASF_PASSWORD=$ASF_PASSWORD GPG_PASSPHRASE=$GPG_PASSPHRASE RELEASE_STEP=$RELEASE_STEP USER=$USER +ZINC_OPTS=${RELEASE_ZINC_OPTS:-"-Xmx4g -XX:ReservedCodeCacheSize=2g"} EOF JAVA_VOL= diff --git a/dev/create-release/do-release.sh b/dev/create-release/do-release.sh index f1d4f3ab5ddec..4f18a55a3bceb 100755 --- a/dev/create-release/do-release.sh +++ b/dev/create-release/do-release.sh @@ -20,11 +20,11 @@ SELF=$(cd $(dirname $0) && pwd) . "$SELF/release-util.sh" -while getopts "bn" opt; do +while getopts ":b:n" opt; do case $opt in b) GIT_BRANCH=$OPTARG ;; n) DRY_RUN=1 ;; - ?) error "Invalid option: $OPTARG" ;; + \?) error "Invalid option: $OPTARG" ;; esac done diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 022d3af95c05d..eb972589a995e 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -103,7 +103,7 @@ if [ -z "$SPARK_VERSION" ]; then # Run $MVN in a separate command so that 'set -e' does the right thing. TMP=$(mktemp) $MVN help:evaluate -Dexpression=project.version > $TMP - SPARK_VERSION=$(cat $TMP | grep -v INFO | grep -v WARNING | grep -v Download) + SPARK_VERSION=$(cat $TMP | grep -v INFO | grep -v WARNING | grep -vi Download) rm $TMP fi @@ -164,7 +164,7 @@ fi DEST_DIR_NAME="$SPARK_PACKAGE_VERSION" git clean -d -f -x -rm .gitignore +rm -f .gitignore cd .. 
if [[ "$1" == "package" ]]; then @@ -174,9 +174,9 @@ if [[ "$1" == "package" ]]; then # For source release in v2.4+, exclude copy of binary license/notice if [[ $SPARK_VERSION > "2.4" ]]; then - rm spark-$SPARK_VERSION/LICENSE-binary - rm spark-$SPARK_VERSION/NOTICE-binary - rm -r spark-$SPARK_VERSION/licenses-binary + rm -f spark-$SPARK_VERSION/LICENSE-binary + rm -f spark-$SPARK_VERSION/NOTICE-binary + rm -rf spark-$SPARK_VERSION/licenses-binary fi tar cvzf spark-$SPARK_VERSION.tgz --exclude spark-$SPARK_VERSION/.git spark-$SPARK_VERSION @@ -380,7 +380,7 @@ if [[ "$1" == "publish-snapshot" ]]; then echo "" >> $tmp_settings # Generate random point for Zinc - export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + export ZINC_PORT=$(python -S -c "import random; print(random.randrange(3030,4030))") $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $SCALA_2_12_PROFILES $PUBLISH_PROFILES deploy @@ -412,7 +412,7 @@ if [[ "$1" == "publish-release" ]]; then tmp_repo=$(mktemp -d spark-repo-XXXXX) # Generate random point for Zinc - export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") + export ZINC_PORT=$(python -S -c "import random; print(random.randrange(3030,4030))") # TODO: revisit for Scala 2.13 support diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh index 39856a9955955..a9a518f9e10d7 100755 --- a/dev/create-release/release-tag.sh +++ b/dev/create-release/release-tag.sh @@ -64,8 +64,12 @@ init_maven_sbt ASF_SPARK_REPO="gitbox.apache.org/repos/asf/spark.git" +function uriencode { jq -nSRr --arg v "$1" '$v|@uri'; } + +declare -r ENCODED_ASF_PASSWORD=$(uriencode "$ASF_PASSWORD") + rm -rf spark -git clone "https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO" -b $GIT_BRANCH +git clone "https://$ASF_USERNAME:$ENCODED_ASF_PASSWORD@$ASF_SPARK_REPO" -b $GIT_BRANCH cd spark git config user.name "$GIT_NAME" diff --git a/dev/create-release/release-util.sh b/dev/create-release/release-util.sh index 1282e15e79913..55e1d09da5898 100755 --- a/dev/create-release/release-util.sh +++ b/dev/create-release/release-util.sh @@ -159,10 +159,14 @@ function get_release_info { export SPARK_PACKAGE_VERSION="$RELEASE_TAG" # Gather some user information. - export ASF_USERNAME=$(read_config "ASF user" "$LOGNAME") + if [ -z "$ASF_USERNAME" ]; then + export ASF_USERNAME=$(read_config "ASF user" "$LOGNAME") + fi - GIT_NAME=$(git config user.name || echo "") - export GIT_NAME=$(read_config "Full name" "$GIT_NAME") + if [ -z "$GIT_NAME" ]; then + GIT_NAME=$(git config user.name || echo "") + export GIT_NAME=$(read_config "Full name" "$GIT_NAME") + fi export GIT_EMAIL="$ASF_USERNAME@apache.org" export GPG_KEY=$(read_config "GPG key" "$GIT_EMAIL") @@ -225,7 +229,7 @@ function init_maven_sbt { if [[ $JAVA_VERSION < "1.8." ]]; then # Needed for maven central when using Java 7. 
SBT_OPTS="-Dhttps.protocols=TLSv1.1,TLSv1.2" - MVN_EXTRA_OPTS="-Dhttps.protocols=TLSv1.1,TLSv1.2" + MVN_EXTRA_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dhttps.protocols=TLSv1.1,TLSv1.2" MVN="$MVN $MVN_EXTRA_OPTS" fi export MVN MVN_EXTRA_OPTS SBT_OPTS diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 3ba8e97929613..ff6af6ff57d35 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -21,7 +21,7 @@ # * Java 8 # * Ivy # * Python (2.7.15/3.6.7) -# * R-base/R-base-dev (3.6.1) +# * R-base/R-base-dev (4.0.2) # * Ruby 2.3 build utilities FROM ubuntu:18.04 @@ -33,8 +33,8 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true # These arguments are just for reuse and not really meant to be customized. ARG APT_INSTALL="apt-get install --no-install-recommends -y" -ARG BASE_PIP_PKGS="setuptools wheel" -ARG PIP_PKGS="pyopenssl numpy sphinx" +ARG PIP_PKGS="sphinx==2.3.1 mkdocs==1.0.4 numpy==1.18.1" +ARG GEM_PKGS="jekyll:4.0.0 jekyll-redirect-from:0.16.0 rouge:3.15.0" # Install extra needed repos and refresh. # - CRAN repo @@ -43,7 +43,7 @@ ARG PIP_PKGS="pyopenssl numpy sphinx" # This is all in a single "RUN" command so that if anything changes, "apt update" is run to fetch # the most current package versions (instead of potentially using old versions cached by docker). RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ - echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' >> /etc/apt/sources.list && \ + echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list && \ gpg --keyserver keyserver.ubuntu.com --recv-key E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ gpg -a --export E084DAB9 | apt-key add - && \ apt-get clean && \ @@ -62,12 +62,13 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ curl -sL https://deb.nodesource.com/setup_11.x | bash && \ $APT_INSTALL nodejs && \ # Install needed python packages. Use pip for installing packages (for consistency). - $APT_INSTALL libpython3-dev python3-pip && \ + $APT_INSTALL libpython3-dev python3-pip python3-setuptools && \ + # qpdf is required for CRAN checks to pass. + $APT_INSTALL qpdf jq && \ # Change default python version to python3. update-alternatives --install /usr/bin/python python /usr/bin/python2.7 1 && \ update-alternatives --install /usr/bin/python python /usr/bin/python3.6 2 && \ update-alternatives --set python /usr/bin/python3.6 && \ - pip3 install $BASE_PIP_PKGS && \ pip3 install $PIP_PKGS && \ # Install R packages and dependencies used when building. # R depends on pandoc*, libssl (which are installed above). @@ -76,10 +77,8 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" && \ Rscript -e "devtools::install_github('jimhester/lintr')" && \ # Install tools needed to build the documentation. 
- $APT_INSTALL ruby2.3 ruby2.3-dev mkdocs && \ - gem install jekyll --no-rdoc --no-ri -v 3.8.6 && \ - gem install jekyll-redirect-from && \ - gem install rouge + $APT_INSTALL ruby2.5 ruby2.5-dev && \ + gem install --no-document $GEM_PKGS WORKDIR /opt/spark-rm/output diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 b/dev/deps/spark-deps-hadoop-2.7-hive-1.2 index 62d5772ffc94a..8c6cc15c150b3 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-1.2 @@ -35,10 +35,10 @@ commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar commons-cli/1.2//commons-cli-1.2.jar commons-codec/1.10//commons-codec-1.10.jar commons-collections/3.2.2//commons-collections-3.2.2.jar -commons-compiler/3.0.15//commons-compiler-3.0.15.jar -commons-compress/1.8.1//commons-compress-1.8.1.jar +commons-compiler/3.0.16//commons-compiler-3.0.16.jar +commons-compress/1.20//commons-compress-1.20.jar commons-configuration/1.6//commons-configuration-1.6.jar -commons-crypto/1.0.0//commons-crypto-1.0.0.jar +commons-crypto/1.1.0//commons-crypto-1.1.0.jar commons-dbcp/1.4//commons-dbcp-1.4.jar commons-digester/1.8//commons-digester-1.8.jar commons-httpclient/3.1//commons-httpclient-3.1.jar @@ -93,6 +93,7 @@ jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar jackson-core/2.10.0//jackson-core-2.10.0.jar jackson-databind/2.10.0//jackson-databind-2.10.0.jar jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-datatype-jsr310/2.10.3//jackson-datatype-jsr310-2.10.3.jar jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar @@ -105,14 +106,14 @@ jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar -janino/3.0.15//janino-3.0.15.jar +janino/3.0.16//janino-3.0.16.jar javassist/3.25.0-GA//javassist-3.25.0-GA.jar javax.inject/1//javax.inject-1.jar javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.2//jaxb-api-2.2.2.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar -jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar jersey-common/2.30//jersey-common-2.30.jar @@ -135,11 +136,11 @@ json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client/4.6.4//kubernetes-client-4.6.4.jar -kubernetes-model-common/4.6.4//kubernetes-model-common-4.6.4.jar -kubernetes-model/4.6.4//kubernetes-model-4.6.4.jar +kubernetes-client/4.9.2//kubernetes-client-4.9.2.jar +kubernetes-model-common/4.9.2//kubernetes-model-common-4.9.2.jar +kubernetes-model/4.9.2//kubernetes-model-4.9.2.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar libfb303/0.9.3//libfb303-0.9.3.jar libthrift/0.12.0//libthrift-0.12.0.jar @@ -155,14 +156,14 @@ metrics-jmx/4.1.1//metrics-jmx-4.1.1.jar metrics-json/4.1.1//metrics-json-4.1.1.jar metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar minlog/1.3.0//minlog-1.3.0.jar -netty-all/4.1.42.Final//netty-all-4.1.42.Final.jar +netty-all/4.1.47.Final//netty-all-4.1.47.Final.jar objenesis/2.5.1//objenesis-2.5.1.jar 
okhttp/3.12.6//okhttp-3.12.6.jar okio/1.15.0//okio-1.15.0.jar opencsv/2.3//opencsv-2.3.jar -orc-core/1.5.9/nohive/orc-core-1.5.9-nohive.jar -orc-mapreduce/1.5.9/nohive/orc-mapreduce-1.5.9-nohive.jar -orc-shims/1.5.9//orc-shims-1.5.9.jar +orc-core/1.5.10/nohive/orc-core-1.5.10-nohive.jar +orc-mapreduce/1.5.10/nohive/orc-mapreduce-1.5.10-nohive.jar +orc-shims/1.5.10//orc-shims-1.5.10.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar @@ -174,7 +175,7 @@ parquet-hadoop-bundle/1.6.0//parquet-hadoop-bundle-1.6.0.jar parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar -py4j/0.10.8.1//py4j-0.10.8.1.jar +py4j/0.10.9//py4j-0.10.9.jar pyrolite/4.30//pyrolite-4.30.jar scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar scala-compiler/2.12.10//scala-compiler-2.12.10.jar @@ -184,10 +185,10 @@ scala-reflect/2.12.10//scala-reflect-2.12.10.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.7.45//shims-0.7.45.jar -slf4j-api/1.7.16//slf4j-api-1.7.16.jar -slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +slf4j-api/1.7.30//slf4j-api-1.7.30.jar +slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar snakeyaml/1.24//snakeyaml-1.24.jar -snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar +snappy-java/1.1.8.2//snappy-java-1.1.8.2.jar snappy/0.2//snappy-0.2.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar @@ -199,9 +200,10 @@ stream/2.9.6//stream-2.9.6.jar stringtemplate/3.2.1//stringtemplate-3.2.1.jar super-csv/2.2.0//super-csv-2.2.0.jar threeten-extra/1.5.0//threeten-extra-1.5.0.jar -univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar +univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar -xercesImpl/2.9.1//xercesImpl-2.9.1.jar +xercesImpl/2.12.0//xercesImpl-2.12.0.jar +xml-apis/1.4.01//xml-apis-1.4.01.jar xmlenc/0.52//xmlenc-0.52.jar xz/1.5//xz-1.5.jar zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index 1b57250c1fb54..3e82df2bcdfa1 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -33,10 +33,10 @@ commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar commons-cli/1.2//commons-cli-1.2.jar commons-codec/1.10//commons-codec-1.10.jar commons-collections/3.2.2//commons-collections-3.2.2.jar -commons-compiler/3.0.15//commons-compiler-3.0.15.jar -commons-compress/1.8.1//commons-compress-1.8.1.jar +commons-compiler/3.0.16//commons-compiler-3.0.16.jar +commons-compress/1.20//commons-compress-1.20.jar commons-configuration/1.6//commons-configuration-1.6.jar -commons-crypto/1.0.0//commons-crypto-1.0.0.jar +commons-crypto/1.1.0//commons-crypto-1.1.0.jar commons-dbcp/1.4//commons-dbcp-1.4.jar commons-digester/1.8//commons-digester-1.8.jar commons-httpclient/3.1//commons-httpclient-3.1.jar @@ -79,21 +79,20 @@ hadoop-yarn-client/2.7.4//hadoop-yarn-client-2.7.4.jar hadoop-yarn-common/2.7.4//hadoop-yarn-common-2.7.4.jar hadoop-yarn-server-common/2.7.4//hadoop-yarn-server-common-2.7.4.jar hadoop-yarn-server-web-proxy/2.7.4//hadoop-yarn-server-web-proxy-2.7.4.jar -hive-beeline/2.3.6//hive-beeline-2.3.6.jar -hive-cli/2.3.6//hive-cli-2.3.6.jar -hive-common/2.3.6//hive-common-2.3.6.jar -hive-exec/2.3.6/core/hive-exec-2.3.6-core.jar 
-hive-jdbc/2.3.6//hive-jdbc-2.3.6.jar -hive-llap-common/2.3.6//hive-llap-common-2.3.6.jar -hive-metastore/2.3.6//hive-metastore-2.3.6.jar -hive-serde/2.3.6//hive-serde-2.3.6.jar -hive-service-rpc/2.3.6//hive-service-rpc-2.3.6.jar -hive-shims-0.23/2.3.6//hive-shims-0.23-2.3.6.jar -hive-shims-common/2.3.6//hive-shims-common-2.3.6.jar -hive-shims-scheduler/2.3.6//hive-shims-scheduler-2.3.6.jar -hive-shims/2.3.6//hive-shims-2.3.6.jar +hive-beeline/2.3.7//hive-beeline-2.3.7.jar +hive-cli/2.3.7//hive-cli-2.3.7.jar +hive-common/2.3.7//hive-common-2.3.7.jar +hive-exec/2.3.7/core/hive-exec-2.3.7-core.jar +hive-jdbc/2.3.7//hive-jdbc-2.3.7.jar +hive-llap-common/2.3.7//hive-llap-common-2.3.7.jar +hive-metastore/2.3.7//hive-metastore-2.3.7.jar +hive-serde/2.3.7//hive-serde-2.3.7.jar +hive-shims-0.23/2.3.7//hive-shims-0.23-2.3.7.jar +hive-shims-common/2.3.7//hive-shims-common-2.3.7.jar +hive-shims-scheduler/2.3.7//hive-shims-scheduler-2.3.7.jar +hive-shims/2.3.7//hive-shims-2.3.7.jar hive-storage-api/2.7.1//hive-storage-api-2.7.1.jar -hive-vector-code-gen/2.3.6//hive-vector-code-gen-2.3.6.jar +hive-vector-code-gen/2.3.7//hive-vector-code-gen-2.3.7.jar hk2-api/2.6.1//hk2-api-2.6.1.jar hk2-locator/2.6.1//hk2-locator-2.6.1.jar hk2-utils/2.6.1//hk2-utils-2.6.1.jar @@ -107,6 +106,7 @@ jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar jackson-core/2.10.0//jackson-core-2.10.0.jar jackson-databind/2.10.0//jackson-databind-2.10.0.jar jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-datatype-jsr310/2.10.3//jackson-datatype-jsr310-2.10.3.jar jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar @@ -119,7 +119,7 @@ jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar -janino/3.0.15//janino-3.0.15.jar +janino/3.0.16//janino-3.0.16.jar javassist/3.25.0-GA//javassist-3.25.0-GA.jar javax.inject/1//javax.inject-1.jar javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar @@ -127,7 +127,7 @@ javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.2//jaxb-api-2.2.2.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar -jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar jersey-common/2.30//jersey-common-2.30.jar @@ -151,11 +151,11 @@ json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client/4.6.4//kubernetes-client-4.6.4.jar -kubernetes-model-common/4.6.4//kubernetes-model-common-4.6.4.jar -kubernetes-model/4.6.4//kubernetes-model-4.6.4.jar +kubernetes-client/4.9.2//kubernetes-client-4.9.2.jar +kubernetes-model-common/4.9.2//kubernetes-model-common-4.9.2.jar +kubernetes-model/4.9.2//kubernetes-model-4.9.2.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar libfb303/0.9.3//libfb303-0.9.3.jar libthrift/0.12.0//libthrift-0.12.0.jar @@ -171,14 +171,14 @@ metrics-jmx/4.1.1//metrics-jmx-4.1.1.jar metrics-json/4.1.1//metrics-json-4.1.1.jar metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar minlog/1.3.0//minlog-1.3.0.jar 
-netty-all/4.1.42.Final//netty-all-4.1.42.Final.jar +netty-all/4.1.47.Final//netty-all-4.1.47.Final.jar objenesis/2.5.1//objenesis-2.5.1.jar okhttp/3.12.6//okhttp-3.12.6.jar okio/1.15.0//okio-1.15.0.jar opencsv/2.3//opencsv-2.3.jar -orc-core/1.5.9//orc-core-1.5.9.jar -orc-mapreduce/1.5.9//orc-mapreduce-1.5.9.jar -orc-shims/1.5.9//orc-shims-1.5.9.jar +orc-core/1.5.10//orc-core-1.5.10.jar +orc-mapreduce/1.5.10//orc-mapreduce-1.5.10.jar +orc-shims/1.5.10//orc-shims-1.5.10.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar @@ -189,7 +189,7 @@ parquet-format/2.4.0//parquet-format-2.4.0.jar parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar -py4j/0.10.8.1//py4j-0.10.8.1.jar +py4j/0.10.9//py4j-0.10.9.jar pyrolite/4.30//pyrolite-4.30.jar scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar scala-compiler/2.12.10//scala-compiler-2.12.10.jar @@ -199,10 +199,10 @@ scala-reflect/2.12.10//scala-reflect-2.12.10.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.7.45//shims-0.7.45.jar -slf4j-api/1.7.16//slf4j-api-1.7.16.jar -slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +slf4j-api/1.7.30//slf4j-api-1.7.30.jar +slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar snakeyaml/1.24//snakeyaml-1.24.jar -snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar +snappy-java/1.1.8.2//snappy-java-1.1.8.2.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar @@ -213,10 +213,11 @@ stream/2.9.6//stream-2.9.6.jar super-csv/2.2.0//super-csv-2.2.0.jar threeten-extra/1.5.0//threeten-extra-1.5.0.jar transaction-api/1.1//transaction-api-1.1.jar -univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar +univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar velocity/1.5//velocity-1.5.jar xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar -xercesImpl/2.9.1//xercesImpl-2.9.1.jar +xercesImpl/2.12.0//xercesImpl-2.12.0.jar +xml-apis/1.4.01//xml-apis-1.4.01.jar xmlenc/0.52//xmlenc-0.52.jar xz/1.5//xz-1.5.jar zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index ffd2364a51317..8de3433f2d9fb 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -30,10 +30,10 @@ commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar commons-cli/1.2//commons-cli-1.2.jar commons-codec/1.10//commons-codec-1.10.jar commons-collections/3.2.2//commons-collections-3.2.2.jar -commons-compiler/3.0.15//commons-compiler-3.0.15.jar -commons-compress/1.8.1//commons-compress-1.8.1.jar +commons-compiler/3.0.16//commons-compiler-3.0.16.jar +commons-compress/1.20//commons-compress-1.20.jar commons-configuration2/2.1.1//commons-configuration2-2.1.1.jar -commons-crypto/1.0.0//commons-crypto-1.0.0.jar +commons-crypto/1.1.0//commons-crypto-1.1.0.jar commons-daemon/1.0.13//commons-daemon-1.0.13.jar commons-dbcp/1.4//commons-dbcp-1.4.jar commons-httpclient/3.1//commons-httpclient-3.1.jar @@ -78,21 +78,20 @@ hadoop-yarn-common/3.2.0//hadoop-yarn-common-3.2.0.jar hadoop-yarn-registry/3.2.0//hadoop-yarn-registry-3.2.0.jar hadoop-yarn-server-common/3.2.0//hadoop-yarn-server-common-3.2.0.jar hadoop-yarn-server-web-proxy/3.2.0//hadoop-yarn-server-web-proxy-3.2.0.jar 
-hive-beeline/2.3.6//hive-beeline-2.3.6.jar -hive-cli/2.3.6//hive-cli-2.3.6.jar -hive-common/2.3.6//hive-common-2.3.6.jar -hive-exec/2.3.6/core/hive-exec-2.3.6-core.jar -hive-jdbc/2.3.6//hive-jdbc-2.3.6.jar -hive-llap-common/2.3.6//hive-llap-common-2.3.6.jar -hive-metastore/2.3.6//hive-metastore-2.3.6.jar -hive-serde/2.3.6//hive-serde-2.3.6.jar -hive-service-rpc/2.3.6//hive-service-rpc-2.3.6.jar -hive-shims-0.23/2.3.6//hive-shims-0.23-2.3.6.jar -hive-shims-common/2.3.6//hive-shims-common-2.3.6.jar -hive-shims-scheduler/2.3.6//hive-shims-scheduler-2.3.6.jar -hive-shims/2.3.6//hive-shims-2.3.6.jar +hive-beeline/2.3.7//hive-beeline-2.3.7.jar +hive-cli/2.3.7//hive-cli-2.3.7.jar +hive-common/2.3.7//hive-common-2.3.7.jar +hive-exec/2.3.7/core/hive-exec-2.3.7-core.jar +hive-jdbc/2.3.7//hive-jdbc-2.3.7.jar +hive-llap-common/2.3.7//hive-llap-common-2.3.7.jar +hive-metastore/2.3.7//hive-metastore-2.3.7.jar +hive-serde/2.3.7//hive-serde-2.3.7.jar +hive-shims-0.23/2.3.7//hive-shims-0.23-2.3.7.jar +hive-shims-common/2.3.7//hive-shims-common-2.3.7.jar +hive-shims-scheduler/2.3.7//hive-shims-scheduler-2.3.7.jar +hive-shims/2.3.7//hive-shims-2.3.7.jar hive-storage-api/2.7.1//hive-storage-api-2.7.1.jar -hive-vector-code-gen/2.3.6//hive-vector-code-gen-2.3.6.jar +hive-vector-code-gen/2.3.7//hive-vector-code-gen-2.3.7.jar hk2-api/2.6.1//hk2-api-2.6.1.jar hk2-locator/2.6.1//hk2-locator-2.6.1.jar hk2-utils/2.6.1//hk2-utils-2.6.1.jar @@ -106,6 +105,7 @@ jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar jackson-core/2.10.0//jackson-core-2.10.0.jar jackson-databind/2.10.0//jackson-databind-2.10.0.jar jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-datatype-jsr310/2.10.3//jackson-datatype-jsr310-2.10.3.jar jackson-jaxrs-base/2.9.5//jackson-jaxrs-base-2.9.5.jar jackson-jaxrs-json-provider/2.9.5//jackson-jaxrs-json-provider-2.9.5.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar @@ -118,7 +118,7 @@ jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar -janino/3.0.15//janino-3.0.15.jar +janino/3.0.16//janino-3.0.16.jar javassist/3.25.0-GA//javassist-3.25.0-GA.jar javax.inject/1//javax.inject-1.jar javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar @@ -127,7 +127,7 @@ javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.11//jaxb-api-2.2.11.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar jcip-annotations/1.0-1//jcip-annotations-1.0-1.jar -jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar jersey-common/2.30//jersey-common-2.30.jar @@ -149,7 +149,7 @@ json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kerb-admin/1.0.1//kerb-admin-1.0.1.jar kerb-client/1.0.1//kerb-client-1.0.1.jar kerb-common/1.0.1//kerb-common-1.0.1.jar @@ -165,9 +165,9 @@ kerby-pkix/1.0.1//kerby-pkix-1.0.1.jar kerby-util/1.0.1//kerby-util-1.0.1.jar kerby-xdr/1.0.1//kerby-xdr-1.0.1.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client/4.6.4//kubernetes-client-4.6.4.jar -kubernetes-model-common/4.6.4//kubernetes-model-common-4.6.4.jar -kubernetes-model/4.6.4//kubernetes-model-4.6.4.jar +kubernetes-client/4.9.2//kubernetes-client-4.9.2.jar 
+kubernetes-model-common/4.9.2//kubernetes-model-common-4.9.2.jar +kubernetes-model/4.9.2//kubernetes-model-4.9.2.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar libfb303/0.9.3//libfb303-0.9.3.jar libthrift/0.12.0//libthrift-0.12.0.jar @@ -183,17 +183,16 @@ metrics-jmx/4.1.1//metrics-jmx-4.1.1.jar metrics-json/4.1.1//metrics-json-4.1.1.jar metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar minlog/1.3.0//minlog-1.3.0.jar -mssql-jdbc/6.2.1.jre7//mssql-jdbc-6.2.1.jre7.jar -netty-all/4.1.42.Final//netty-all-4.1.42.Final.jar +netty-all/4.1.47.Final//netty-all-4.1.47.Final.jar nimbus-jose-jwt/4.41.1//nimbus-jose-jwt-4.41.1.jar objenesis/2.5.1//objenesis-2.5.1.jar okhttp/2.7.5//okhttp-2.7.5.jar okhttp/3.12.6//okhttp-3.12.6.jar okio/1.15.0//okio-1.15.0.jar opencsv/2.3//opencsv-2.3.jar -orc-core/1.5.9//orc-core-1.5.9.jar -orc-mapreduce/1.5.9//orc-mapreduce-1.5.9.jar -orc-shims/1.5.9//orc-shims-1.5.9.jar +orc-core/1.5.10//orc-core-1.5.10.jar +orc-mapreduce/1.5.10//orc-mapreduce-1.5.10.jar +orc-shims/1.5.10//orc-shims-1.5.10.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar @@ -204,7 +203,7 @@ parquet-format/2.4.0//parquet-format-2.4.0.jar parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar -py4j/0.10.8.1//py4j-0.10.8.1.jar +py4j/0.10.9//py4j-0.10.9.jar pyrolite/4.30//pyrolite-4.30.jar re2j/1.1//re2j-1.1.jar scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar @@ -215,10 +214,10 @@ scala-reflect/2.12.10//scala-reflect-2.12.10.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.7.45//shims-0.7.45.jar -slf4j-api/1.7.16//slf4j-api-1.7.16.jar -slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +slf4j-api/1.7.30//slf4j-api-1.7.30.jar +slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar snakeyaml/1.24//snakeyaml-1.24.jar -snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar +snappy-java/1.1.8.2//snappy-java-1.1.8.2.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar @@ -230,7 +229,7 @@ super-csv/2.2.0//super-csv-2.2.0.jar threeten-extra/1.5.0//threeten-extra-1.5.0.jar token-provider/1.0.1//token-provider-1.0.1.jar transaction-api/1.1//transaction-api-1.1.jar -univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar +univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar velocity/1.5//velocity-1.5.jar woodstox-core/5.0.3//woodstox-core-5.0.3.jar xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar diff --git a/dev/lint-python b/dev/lint-python index e9ed83dec5fce..674896e3b5dd0 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -16,9 +16,6 @@ # limitations under the License. 
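The `dev/deps/spark-deps-*` manifests updated above pin each transitive jar as `artifactId/version/classifier//filename.jar`, so version bumps such as orc 1.5.9 -> 1.5.10 or slf4j 1.7.16 -> 1.7.30 show up as one-line changes. A minimal sketch of reading that layout to summarize bumps between two manifests is shown below; it is not part of this patch, and the function names and paths are illustrative.

```python
# Illustrative helper (not part of this patch): summarize version changes between two
# dependency manifests that use the "artifactId/version/classifier//jar" layout shown above.
def load_manifest(path):
    deps = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split("/")
            if len(parts) < 2:
                continue  # skip blank or non-dependency lines
            deps[parts[0]] = parts[1]
    return deps


def summarize_bumps(old_path, new_path):
    old, new = load_manifest(old_path), load_manifest(new_path)
    for artifact in sorted(set(old) | set(new)):
        if old.get(artifact) != new.get(artifact):
            print("%s: %s -> %s" % (artifact, old.get(artifact), new.get(artifact)))


# Hypothetical usage against two checkouts of dev/deps/spark-deps-hadoop-3.2-hive-2.3:
# summarize_bumps("before/spark-deps-hadoop-3.2-hive-2.3", "after/spark-deps-hadoop-3.2-hive-2.3")
```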
# # define test binaries + versions -PYDOCSTYLE_BUILD="pydocstyle" -MINIMUM_PYDOCSTYLE="3.0.0" - FLAKE8_BUILD="flake8" MINIMUM_FLAKE8="3.5.0" @@ -27,6 +24,19 @@ MINIMUM_PYCODESTYLE="2.4.0" SPHINX_BUILD="sphinx-build" +PYTHON_EXECUTABLE="python3" + +function satisfies_min_version { + local provided_version="$1" + local expected_version="$2" + echo "$( + "$PYTHON_EXECUTABLE" << EOM +from setuptools.extern.packaging import version +print(version.parse('$provided_version') >= version.parse('$expected_version')) +EOM + )" +} + function compile_python_test { local COMPILE_STATUS= local COMPILE_REPORT= @@ -56,7 +66,7 @@ function pycodestyle_test { local PYCODESTYLE_STATUS= local PYCODESTYLE_REPORT= local RUN_LOCAL_PYCODESTYLE= - local VERSION= + local PYCODESTYLE_VERSION= local EXPECTED_PYCODESTYLE= local PYCODESTYLE_SCRIPT_PATH="$SPARK_ROOT_DIR/dev/pycodestyle-$MINIMUM_PYCODESTYLE.py" local PYCODESTYLE_SCRIPT_REMOTE_PATH="https://raw.githubusercontent.com/PyCQA/pycodestyle/$MINIMUM_PYCODESTYLE/pycodestyle.py" @@ -69,11 +79,8 @@ function pycodestyle_test { # check for locally installed pycodestyle & version RUN_LOCAL_PYCODESTYLE="False" if hash "$PYCODESTYLE_BUILD" 2> /dev/null; then - VERSION=$( $PYCODESTYLE_BUILD --version 2> /dev/null) - EXPECTED_PYCODESTYLE=$( (python3 -c 'from distutils.version import LooseVersion; - print(LooseVersion("""'${VERSION[0]}'""") >= LooseVersion("""'$MINIMUM_PYCODESTYLE'"""))')\ - 2> /dev/null) - + PYCODESTYLE_VERSION="$($PYCODESTYLE_BUILD --version)" + EXPECTED_PYCODESTYLE="$(satisfies_min_version $PYCODESTYLE_VERSION $MINIMUM_PYCODESTYLE)" if [ "$EXPECTED_PYCODESTYLE" == "True" ]; then RUN_LOCAL_PYCODESTYLE="True" fi @@ -117,7 +124,6 @@ function pycodestyle_test { function flake8_test { local FLAKE8_VERSION= - local VERSION= local EXPECTED_FLAKE8= local FLAKE8_REPORT= local FLAKE8_STATUS= @@ -128,11 +134,9 @@ function flake8_test { exit 1 fi - FLAKE8_VERSION="$($FLAKE8_BUILD --version 2> /dev/null)" - VERSION=($FLAKE8_VERSION) - EXPECTED_FLAKE8=$( (python3 -c 'from distutils.version import LooseVersion; - print(LooseVersion("""'${VERSION[0]}'""") >= LooseVersion("""'$MINIMUM_FLAKE8'"""))') \ - 2> /dev/null) + _FLAKE8_VERSION=($($FLAKE8_BUILD --version)) + FLAKE8_VERSION="${_FLAKE8_VERSION[0]}" + EXPECTED_FLAKE8="$(satisfies_min_version $FLAKE8_VERSION $MINIMUM_FLAKE8)" if [[ "$EXPECTED_FLAKE8" == "False" ]]; then echo "\ @@ -158,50 +162,6 @@ flake8 checks failed." fi } -function pydocstyle_test { - local PYDOCSTYLE_REPORT= - local PYDOCSTYLE_STATUS= - local PYDOCSTYLE_VERSION= - local EXPECTED_PYDOCSTYLE= - - # Exclude auto-generated configuration file. - local DOC_PATHS_TO_CHECK="$( cd "${SPARK_ROOT_DIR}" && find . -name "*.py" | grep -vF 'functions.py' )" - - # Check python document style, skip check if pydocstyle is not installed. - if ! hash "$PYDOCSTYLE_BUILD" 2> /dev/null; then - echo "The pydocstyle command was not found. Skipping pydocstyle checks for now." - echo - return - fi - - PYDOCSTYLE_VERSION="$($PYDOCSTYLEBUILD --version 2> /dev/null)" - EXPECTED_PYDOCSTYLE=$(python3 -c 'from distutils.version import LooseVersion; \ - print(LooseVersion("""'$PYDOCSTYLE_VERSION'""") >= LooseVersion("""'$MINIMUM_PYDOCSTYLE'"""))' \ - 2> /dev/null) - - if [[ "$EXPECTED_PYDOCSTYLE" == "False" ]]; then - echo "\ -The minimum version of pydocstyle needs to be $MINIMUM_PYDOCSTYLE. -Your current version is $PYDOCSTYLE_VERSION. -Skipping pydocstyle checks for now." - echo - return - fi - - echo "starting $PYDOCSTYLE_BUILD test..." 
- PYDOCSTYLE_REPORT=$( ($PYDOCSTYLE_BUILD --config=dev/tox.ini $DOC_PATHS_TO_CHECK) 2>&1) - PYDOCSTYLE_STATUS=$? - - if [ "$PYDOCSTYLE_STATUS" -ne 0 ]; then - echo "pydocstyle checks failed:" - echo "$PYDOCSTYLE_REPORT" - exit "$PYDOCSTYLE_STATUS" - else - echo "pydocstyle checks passed." - echo - fi -} - function sphinx_test { local SPHINX_REPORT= local SPHINX_STATUS= @@ -247,7 +207,6 @@ PYTHON_SOURCE="$(find . -name "*.py")" compile_python_test "$PYTHON_SOURCE" pycodestyle_test "$PYTHON_SOURCE" flake8_test -pydocstyle_test sphinx_test echo diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 0b30eec76bb53..9e767ce5a3daa 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -39,6 +39,7 @@ NAME=none MVN="$SPARK_HOME/build/mvn" function exit_with_usage { + set +x echo "make-distribution.sh - tool for making binary distributions of Spark" echo "" echo "usage:" diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 470f21e69d46a..b322d3f61b444 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -63,11 +63,15 @@ fi PYSPARK_VERSION=$(python3 -c "exec(open('python/pyspark/version.py').read());print(__version__)") PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" # The pip install options we use for all the pip commands -PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " +PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall" # Test both regular user and edit/dev install modes. PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST" "pip install $PIP_OPTIONS -e python/") +# Jenkins has PySpark installed under user sitepackages shared for some reasons. +# In this test, explicitly exclude user sitepackages to prevent side effects +export PYTHONNOUSERSITE=1 + for python in "${PYTHON_EXECS[@]}"; do for install_command in "${PIP_COMMANDS[@]}"; do echo "Testing pip installation with python $python" @@ -76,8 +80,12 @@ for python in "${PYTHON_EXECS[@]}"; do VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python rm -rf "$VIRTUALENV_PATH" if [ -n "$USE_CONDA" ]; then + if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then + # See also https://github.com/conda/conda/issues/7980 + source "$CONDA_PREFIX/etc/profile.d/conda.sh" + fi conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools - source activate "$VIRTUALENV_PATH" + source activate "$VIRTUALENV_PATH" || conda activate "$VIRTUALENV_PATH" else mkdir -p "$VIRTUALENV_PATH" virtualenv --python=$python "$VIRTUALENV_PATH" @@ -120,7 +128,7 @@ for python in "${PYTHON_EXECS[@]}"; do # conda / virtualenv environments need to be deactivated differently if [ -n "$USE_CONDA" ]; then - source deactivate + source deactivate || conda deactivate else deactivate fi diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index c3adc696a5122..c155d4ea3f076 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -26,6 +26,7 @@ FWDIR="$( cd "$( dirname "$0" )/.." && pwd )" cd "$FWDIR" export PATH=/home/anaconda/envs/py36/bin:$PATH +export LANG="en_US.UTF-8" PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 72e32d4e16e14..13be9592d771f 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -198,7 +198,7 @@ def main(): # format: http://linux.die.net/man/1/timeout # must be less than the timeout configured on Jenkins. Usually Jenkins's timeout is higher # then this. 
Please consult with the build manager or a committer when it should be increased. - tests_timeout = "400m" + tests_timeout = "500m" # Array to capture all test names to run on the pull request. These tests are represented # by their file equivalents in the dev/tests/ directory. diff --git a/dev/run-tests.py b/dev/run-tests.py index 5255a77ec2081..d58884fc7783c 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -43,19 +43,21 @@ def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. If a file is not associated with a more specific submodule, then this method will consider that - file to belong to the 'root' module. GitHub Action and Appveyor files are ignored. + file to belong to the 'root' module. `.github` directory is counted only in GitHub Actions, + and `appveyor.yml` is always ignored because this file is dedicated only to AppVeyor builds. >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"])) ['pyspark-core', 'sql'] >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] ['root'] - >>> [x.name for x in determine_modules_for_files( \ - [".github/workflows/master.yml", "appveyor.yml"])] + >>> [x.name for x in determine_modules_for_files(["appveyor.yml"])] [] """ changed_modules = set() for filename in filenames: - if filename in (".github/workflows/master.yml", "appveyor.yml"): + if filename in ("appveyor.yml",): + continue + if ("GITHUB_ACTIONS" not in os.environ) and filename.startswith(".github"): continue matched_at_least_one_module = False for module in modules.all_modules: @@ -101,28 +103,52 @@ def setup_test_environ(environ): os.environ[k] = v -def determine_modules_to_test(changed_modules): +def determine_modules_to_test(changed_modules, deduplicated=True): """ Given a set of modules that have changed, compute the transitive closure of those modules' dependent modules in order to determine the set of modules that should be tested. Returns a topologically-sorted list of modules (ties are broken by sorting on module names). + If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication + by dependencies into account. >>> [x.name for x in determine_modules_to_test([modules.root])] ['root'] >>> [x.name for x in determine_modules_to_test([modules.build])] ['root'] + >>> [x.name for x in determine_modules_to_test([modules.core])] + ['root'] + >>> [x.name for x in determine_modules_to_test([modules.launcher])] + ['root'] >>> [x.name for x in determine_modules_to_test([modules.graphx])] ['graphx', 'examples'] - >>> x = [x.name for x in determine_modules_to_test([modules.sql])] - >>> x # doctest: +NORMALIZE_WHITESPACE + >>> [x.name for x in determine_modules_to_test([modules.sql])] + ... # doctest: +NORMALIZE_WHITESPACE ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml'] + >>> sorted([x.name for x in determine_modules_to_test( + ... [modules.sparkr, modules.sql], deduplicated=False)]) + ... # doctest: +NORMALIZE_WHITESPACE + ['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml', + 'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10'] + >>> sorted([x.name for x in determine_modules_to_test( + ... [modules.sql, modules.core], deduplicated=False)]) + ... 
# doctest: +NORMALIZE_WHITESPACE + ['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver', + 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', + 'pyspark-sql', 'pyspark-streaming', 'repl', 'root', + 'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', + 'streaming-kinesis-asl'] """ modules_to_test = set() for module in changed_modules: - modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + modules_to_test = modules_to_test.union( + determine_modules_to_test(module.dependent_modules, deduplicated)) modules_to_test = modules_to_test.union(set(changed_modules)) + + if not deduplicated: + return modules_to_test + # If we need to run all of the tests, then we should short-circuit and return 'root' if modules.root in modules_to_test: return [modules.root] @@ -415,7 +441,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): exec_sbt(profiles_and_goals) -def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags): +def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags): """Function to properly execute all tests passed in as a set from the `determine_test_suites` function""" set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") @@ -425,6 +451,8 @@ def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags): test_profiles = extra_profiles + \ list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) + if included_tags: + test_profiles += ['-Dtest.include.tags=' + ",".join(included_tags)] if excluded_tags: test_profiles += ['-Dtest.exclude.tags=' + ",".join(excluded_tags)] @@ -456,6 +484,12 @@ def run_python_tests(test_modules, parallelism, with_coverage=False): if test_modules != [modules.root]: command.append("--modules=%s" % ','.join(m.name for m in test_modules)) command.append("--parallelism=%i" % parallelism) + if "GITHUB_ACTIONS" in os.environ: + # See SPARK-33565. Python 3.8 was temporarily removed as its default Python executables + # to test because of Jenkins environment issue. Once Jenkins has Python 3.8 to test, + # we should remove this change back and add python3.8 into python/run-tests.py script. 
+ command.append("--python-executable=%s" % ','.join( + x for x in ["python3.8", "python2.7", "pypy3", "pypy"] if which(x))) run_cmd(command) if with_coverage: @@ -532,6 +566,24 @@ def parse_opts(): "-p", "--parallelism", type=int, default=8, help="The number of suites to test in parallel (default %(default)d)" ) + parser.add_argument( + "-m", "--modules", type=str, + default=None, + help="A comma-separated list of modules to test " + "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules])) + ) + parser.add_argument( + "-e", "--excluded-tags", type=str, + default=None, + help="A comma-separated list of tags to exclude in the tests, " + "e.g., org.apache.spark.tags.ExtendedHiveTest " + ) + parser.add_argument( + "-i", "--included-tags", type=str, + default=None, + help="A comma-separated list of tags to include in the tests, " + "e.g., org.apache.spark.tags.ExtendedHiveTest " + ) args, unknown = parser.parse_known_args() if unknown: @@ -564,11 +616,20 @@ def main(): " install one and retry.") sys.exit(2) - # install SparkR - if which("R"): - run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) - else: - print("Cannot install SparkR as R was not found in PATH") + # Install SparkR + should_only_test_modules = opts.modules is not None + test_modules = [] + if should_only_test_modules: + str_test_modules = [m.strip() for m in opts.modules.split(",")] + test_modules = [m for m in modules.all_modules if m.name in str_test_modules] + + if not should_only_test_modules or modules.sparkr in test_modules: + # If tests modules are specified, we will not run R linter. + # SparkR needs the manual SparkR installation. + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + else: + print("Cannot install SparkR as R was not found in PATH") if os.environ.get("AMPLAB_JENKINS"): # if we're on the Amplab Jenkins build servers setup variables @@ -582,27 +643,69 @@ def main(): # /home/jenkins/anaconda2/envs/py36/bin os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") else: - # else we're running locally and can use local settings + # else we're running locally or Github Actions. build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7") hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") - test_env = "local" + if "GITHUB_ACTIONS" in os.environ: + test_env = "github_actions" + else: + test_env = "local" print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, "and Hive profile", hive_version, "under environment", test_env) extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) - changed_modules = None - changed_files = None - if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + changed_modules = [] + changed_files = [] + included_tags = [] + excluded_tags = [] + if should_only_test_modules: + # If we're running the tests in Github Actions, attempt to detect and test + # only the affected modules. + if test_env == "github_actions": + if os.environ["GITHUB_BASE_REF"] != "": + # Pull requests + changed_files = identify_changed_files_from_git_commits( + os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"]) + else: + # Build for each commit. 
+ changed_files = identify_changed_files_from_git_commits( + os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"]) + + modules_to_test = determine_modules_to_test( + determine_modules_for_files(changed_files), deduplicated=False) + + if modules.root not in modules_to_test: + # If root module is not found, only test the intersected modules. + # If root module is found, just run the modules as specified initially. + test_modules = list(set(modules_to_test).intersection(test_modules)) + + changed_modules = test_modules + if len(changed_modules) == 0: + print("[info] There are no modules to test, exiting without testing.") + return + + # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and + # detect modules to test. + elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) + test_modules = determine_modules_to_test(changed_modules) excluded_tags = determine_tags_to_exclude(changed_modules) + # If there is no changed module found, tests all. if not changed_modules: changed_modules = [modules.root] - excluded_tags = [] + if not test_modules: + test_modules = determine_modules_to_test(changed_modules) + + if opts.excluded_tags: + excluded_tags.extend([t.strip() for t in opts.excluded_tags.split(",")]) + if opts.included_tags: + included_tags.extend([t.strip() for t in opts.included_tags.split(",")]) + print("[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)) @@ -615,33 +718,32 @@ def main(): test_environ.update(m.environ) setup_test_environ(test_environ) - test_modules = determine_modules_to_test(changed_modules) - - # license checks - run_apache_rat_checks() - - # style checks - if not changed_files or any(f.endswith(".scala") - or f.endswith("scalastyle-config.xml") - for f in changed_files): - run_scala_style_checks(extra_profiles) should_run_java_style_checks = False - if not changed_files or any(f.endswith(".java") - or f.endswith("checkstyle.xml") - or f.endswith("checkstyle-suppressions.xml") - for f in changed_files): - # Run SBT Checkstyle after the build to prevent a side-effect to the build. - should_run_java_style_checks = True - if not changed_files or any(f.endswith("lint-python") - or f.endswith("tox.ini") - or f.endswith(".py") - for f in changed_files): - run_python_style_checks() - if not changed_files or any(f.endswith(".R") - or f.endswith("lint-r") - or f.endswith(".lintr") - for f in changed_files): - run_sparkr_style_checks() + if not should_only_test_modules: + # license checks + run_apache_rat_checks() + + # style checks + if not changed_files or any(f.endswith(".scala") + or f.endswith("scalastyle-config.xml") + for f in changed_files): + run_scala_style_checks(extra_profiles) + if not changed_files or any(f.endswith(".java") + or f.endswith("checkstyle.xml") + or f.endswith("checkstyle-suppressions.xml") + for f in changed_files): + # Run SBT Checkstyle after the build to prevent a side-effect to the build. 
+ should_run_java_style_checks = True + if not changed_files or any(f.endswith("lint-python") + or f.endswith("tox.ini") + or f.endswith(".py") + for f in changed_files): + run_python_style_checks() + if not changed_files or any(f.endswith(".R") + or f.endswith("lint-r") + or f.endswith(".lintr") + for f in changed_files): + run_sparkr_style_checks() # determine if docs were changed and if we're inside the amplab environment # note - the below commented out until *all* Jenkins workers can get `jekyll` installed @@ -663,7 +765,7 @@ def main(): build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks) # run the test suites - run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags) + run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags) modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 40f2ca288d694..8705d523208fe 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -100,9 +100,75 @@ def __hash__(self): ] ) +kvstore = Module( + name="kvstore", + dependencies=[tags], + source_file_regexes=[ + "common/kvstore/", + ], + sbt_test_goals=[ + "kvstore/test", + ], +) + +network_common = Module( + name="network-common", + dependencies=[tags], + source_file_regexes=[ + "common/network-common/", + ], + sbt_test_goals=[ + "network-common/test", + ], +) + +network_shuffle = Module( + name="network-shuffle", + dependencies=[tags], + source_file_regexes=[ + "common/network-shuffle/", + ], + sbt_test_goals=[ + "network-shuffle/test", + ], +) + +unsafe = Module( + name="unsafe", + dependencies=[tags], + source_file_regexes=[ + "common/unsafe", + ], + sbt_test_goals=[ + "unsafe/test", + ], +) + +launcher = Module( + name="launcher", + dependencies=[tags], + source_file_regexes=[ + "launcher/", + ], + sbt_test_goals=[ + "launcher/test", + ], +) + +core = Module( + name="core", + dependencies=[kvstore, network_common, network_shuffle, unsafe, launcher], + source_file_regexes=[ + "core/", + ], + sbt_test_goals=[ + "core/test", + ], +) + catalyst = Module( name="catalyst", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "sql/catalyst/", ], @@ -111,7 +177,6 @@ def __hash__(self): ], ) - sql = Module( name="sql", dependencies=[catalyst], @@ -123,7 +188,6 @@ def __hash__(self): ], ) - hive = Module( name="hive", dependencies=[sql], @@ -142,7 +206,6 @@ def __hash__(self): ] ) - repl = Module( name="repl", dependencies=[hive], @@ -154,7 +217,6 @@ def __hash__(self): ], ) - hive_thriftserver = Module( name="hive-thriftserver", dependencies=[hive], @@ -192,7 +254,6 @@ def __hash__(self): ] ) - sketch = Module( name="sketch", dependencies=[tags], @@ -204,10 +265,9 @@ def __hash__(self): ] ) - graphx = Module( name="graphx", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "graphx/", ], @@ -216,10 +276,9 @@ def __hash__(self): ] ) - streaming = Module( name="streaming", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "streaming", ], @@ -235,7 +294,7 @@ def __hash__(self): # fail other PRs. 
streaming_kinesis_asl = Module( name="streaming-kinesis-asl", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "external/kinesis-asl/", "external/kinesis-asl-assembly/", @@ -254,21 +313,23 @@ def __hash__(self): streaming_kafka_0_10 = Module( name="streaming-kafka-0-10", - dependencies=[streaming], + dependencies=[streaming, core], source_file_regexes=[ # The ending "/" is necessary otherwise it will include "sql-kafka" codes "external/kafka-0-10/", "external/kafka-0-10-assembly", + "external/kafka-0-10-token-provider", ], sbt_test_goals=[ "streaming-kafka-0-10/test", + "token-provider-kafka-0-10/test" ] ) mllib_local = Module( name="mllib-local", - dependencies=[tags], + dependencies=[tags, core], source_file_regexes=[ "mllib-local", ], @@ -302,10 +363,9 @@ def __hash__(self): ] ) - pyspark_core = Module( name="pyspark-core", - dependencies=[], + dependencies=[core], source_file_regexes=[ "python/(?!pyspark/(ml|mllib|sql|streaming))" ], @@ -339,7 +399,6 @@ def __hash__(self): ] ) - pyspark_sql = Module( name="pyspark-sql", dependencies=[pyspark_core, hive, avro], @@ -364,7 +423,6 @@ def __hash__(self): "pyspark.sql.avro.functions", "pyspark.sql.pandas.conversion", "pyspark.sql.pandas.map_ops", - "pyspark.sql.pandas.functions", "pyspark.sql.pandas.group_ops", "pyspark.sql.pandas.types", "pyspark.sql.pandas.serializers", @@ -490,6 +548,7 @@ def __hash__(self): "pyspark.ml.tests.test_stat", "pyspark.ml.tests.test_training_summary", "pyspark.ml.tests.test_tuning", + "pyspark.ml.tests.test_util", "pyspark.ml.tests.test_wrapper", ], blacklisted_python_implementations=[ @@ -579,7 +638,7 @@ def __hash__(self): # No other modules should directly depend on this module. root = Module( name="root", - dependencies=[build], # Changes to build should trigger all tests. + dependencies=[build, core], # Changes to build should trigger all tests. source_file_regexes=[], # In order to run all of the tests, enable every test profile: build_profile_flags=list(set( diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 936ac00f6b9e7..46f1f55938120 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -47,7 +47,7 @@ OLD_VERSION=$($MVN -q \ -Dexec.executable="echo" \ -Dexec.args='${project.version}' \ --non-recursive \ - org.codehaus.mojo:exec-maven-plugin:1.6.0:exec) + org.codehaus.mojo:exec-maven-plugin:1.6.0:exec | grep -E '[0-9]+\.[0-9]+\.[0-9]+') if [ $? != 0 ]; then echo -e "Error while getting version string from Maven:\n$OLD_VERSION" exit 1 diff --git a/dev/tox.ini b/dev/tox.ini index 54f65692c8303..49b673bd63db6 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -16,6 +16,4 @@ [pycodestyle] ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504 max-line-length=100 -exclude=cloudpickle.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* -[pydocstyle] -ignore=D100,D101,D102,D103,D104,D105,D106,D107,D200,D201,D202,D203,D204,D205,D206,D207,D208,D209,D210,D211,D212,D213,D214,D215,D300,D301,D302,D400,D401,D402,D403,D404,D405,D406,D407,D408,D409,D410,D411,D412,D413,D414 +exclude=*/target/*,cloudpickle.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000000..9df83f37815b7 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +generated-*.html diff --git a/docs/README.md b/docs/README.md index 22039871cf63d..a1c569ed8712c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -39,6 +39,12 @@ installed. 
Also install the following libraries: $ sudo gem install jekyll jekyll-redirect-from rouge ``` +If your ruby version is 3.0 or higher, you should also install `webrick`. + +```sh +$ sudo gem install jekyll jekyll-redirect-from webrick +``` + Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0. ### R Documentation diff --git a/docs/_config.yml b/docs/_config.yml index a888620139207..abb3b49fff54d 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -8,14 +8,17 @@ gems: kramdown: entity_output: numeric +plugins: + - jekyll-redirect-from + include: - _static - _modules # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 3.0.0-SNAPSHOT -SPARK_VERSION_SHORT: 3.0.0 +SPARK_VERSION: 3.0.2-SNAPSHOT +SPARK_VERSION_SHORT: 3.0.2 SCALA_BINARY_VERSION: "2.12" SCALA_VERSION: "2.12.10" MESOS_VERSION: 1.0.0 diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 3e4db7107ec34..581fd1de6ad2d 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -24,6 +24,8 @@ subitems: - text: "Generic Load/Save Functions" url: sql-data-sources-load-save-functions.html + - text: "Generic File Source Options" + url: sql-data-sources-generic-options.html - text: Parquet Files url: sql-data-sources-parquet.html - text: ORC Files @@ -65,6 +67,8 @@ url: sql-pyspark-pandas-with-arrow.html#enabling-for-conversion-tofrom-pandas - text: "Pandas UDFs (a.k.a. Vectorized UDFs)" url: sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs + - text: "Pandas Function APIs" + url: sql-pyspark-pandas-with-arrow.html#pandas-function-apis - text: Usage Notes url: sql-pyspark-pandas-with-arrow.html#usage-notes - text: Migration Guide @@ -72,12 +76,36 @@ - text: SQL Reference url: sql-ref.html subitems: + - text: ANSI Compliance + url: sql-ref-ansi-compliance.html + subitems: + - text: Arithmetic Operations + url: sql-ref-ansi-compliance.html#arithmetic-operations + - text: Type Conversion + url: sql-ref-ansi-compliance.html#type-conversion + - text: SQL Keywords + url: sql-ref-ansi-compliance.html#sql-keywords - text: Data Types url: sql-ref-datatypes.html + - text: Datetime Pattern + url: sql-ref-datetime-pattern.html + - text: Functions + url: sql-ref-functions.html + subitems: + - text: Built-in Functions + url: sql-ref-functions-builtin.html + - text: Scalar UDFs (User-Defined Functions) + url: sql-ref-functions-udf-scalar.html + - text: UDAFs (User-Defined Aggregate Functions) + url: sql-ref-functions-udf-aggregate.html + - text: Integration with Hive UDFs/UDAFs/UDTFs + url: sql-ref-functions-udf-hive.html + - text: Identifiers + url: sql-ref-identifier.html + - text: Literals + url: sql-ref-literals.html - text: Null Semantics url: sql-ref-null-semantics.html - - text: NaN Semantics - url: sql-ref-nan-semantics.html - text: SQL Syntax url: sql-ref-syntax.html subitems: @@ -110,6 +138,8 @@ url: sql-ref-syntax-ddl-truncate-table.html - text: REPAIR TABLE url: sql-ref-syntax-ddl-repair-table.html + - text: USE DATABASE + url: sql-ref-syntax-ddl-usedb.html - text: Data Manipulation Statements url: sql-ref-syntax-dml.html subitems: @@ -139,19 +169,43 @@ url: sql-ref-syntax-qry-select-distribute-by.html - text: LIMIT Clause url: sql-ref-syntax-qry-select-limit.html - - text: USE database - url: sql-ref-syntax-qry-select-usedb.html + - text: Common Table Expression + url: sql-ref-syntax-qry-select-cte.html + - text: Hints + url: sql-ref-syntax-qry-select-hints.html + - text: Inline Table + 
url: sql-ref-syntax-qry-select-inline-table.html + - text: File + url: sql-ref-syntax-qry-select-file.html + - text: JOIN + url: sql-ref-syntax-qry-select-join.html + - text: LIKE Predicate + url: sql-ref-syntax-qry-select-like.html + - text: Set Operators + url: sql-ref-syntax-qry-select-setops.html + - text: TABLESAMPLE + url: sql-ref-syntax-qry-select-sampling.html + - text: Table-valued Function + url: sql-ref-syntax-qry-select-tvf.html + - text: Window Function + url: sql-ref-syntax-qry-select-window.html + - text: CASE Clause + url: sql-ref-syntax-qry-select-case.html + - text: LATERAL VIEW Clause + url: sql-ref-syntax-qry-select-lateral-view.html + - text: PIVOT Clause + url: sql-ref-syntax-qry-select-pivot.html - text: EXPLAIN url: sql-ref-syntax-qry-explain.html - text: Auxiliary Statements url: sql-ref-syntax-aux.html subitems: - - text: Analyze statement + - text: ANALYZE url: sql-ref-syntax-aux-analyze.html subitems: - text: ANALYZE TABLE url: sql-ref-syntax-aux-analyze-table.html - - text: Caching statements + - text: CACHE url: sql-ref-syntax-aux-cache.html subitems: - text: CACHE TABLE @@ -161,10 +215,10 @@ - text: CLEAR CACHE url: sql-ref-syntax-aux-cache-clear-cache.html - text: REFRESH TABLE - url: sql-ref-syntax-aux-refresh-table.html + url: sql-ref-syntax-aux-cache-refresh-table.html - text: REFRESH - url: sql-ref-syntax-aux-cache-refresh.md - - text: Describe Commands + url: sql-ref-syntax-aux-cache-refresh.html + - text: DESCRIBE url: sql-ref-syntax-aux-describe.html subitems: - text: DESCRIBE DATABASE @@ -175,33 +229,35 @@ url: sql-ref-syntax-aux-describe-function.html - text: DESCRIBE QUERY url: sql-ref-syntax-aux-describe-query.html - - text: Show commands + - text: SHOW url: sql-ref-syntax-aux-show.html subitems: - text: SHOW COLUMNS url: sql-ref-syntax-aux-show-columns.html + - text: SHOW CREATE TABLE + url: sql-ref-syntax-aux-show-create-table.html - text: SHOW DATABASES url: sql-ref-syntax-aux-show-databases.html - text: SHOW FUNCTIONS url: sql-ref-syntax-aux-show-functions.html + - text: SHOW PARTITIONS + url: sql-ref-syntax-aux-show-partitions.html - text: SHOW TABLE url: sql-ref-syntax-aux-show-table.html - text: SHOW TABLES url: sql-ref-syntax-aux-show-tables.html - text: SHOW TBLPROPERTIES url: sql-ref-syntax-aux-show-tblproperties.html - - text: SHOW PARTITIONS - url: sql-ref-syntax-aux-show-partitions.html - - text: SHOW CREATE TABLE - url: sql-ref-syntax-aux-show-create-table.html - - text: Configuration Management Commands + - text: SHOW VIEWS + url: sql-ref-syntax-aux-show-views.html + - text: CONFIGURATION MANAGEMENT url: sql-ref-syntax-aux-conf-mgmt.html subitems: - text: SET url: sql-ref-syntax-aux-conf-mgmt-set.html - text: RESET url: sql-ref-syntax-aux-conf-mgmt-reset.html - - text: Resource Management Commands + - text: RESOURCE MANAGEMENT url: sql-ref-syntax-aux-resource-mgmt.html subitems: - text: ADD FILE @@ -212,5 +268,3 @@ url: sql-ref-syntax-aux-resource-mgmt-list-file.html - text: LIST JAR url: sql-ref-syntax-aux-resource-mgmt-list-jar.html - - text: Arithmetic operations - url: sql-ref-arithmetic-ops.html diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index d5fb18bfb06c0..d05ac6bbe129d 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -82,7 +82,7 @@
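The `docs/_data/menu-sql.yaml` entries above form a recursive list of `text`/`url`/`subitems` nodes that drive the SQL reference navigation. A quick consistency check over that structure might look like the sketch below; it is not part of this patch, assumes PyYAML is installed, and is meant to be run from the `docs/` directory.

```python
# Illustrative check (not part of this patch): verify that every `url` in
# docs/_data/menu-sql.yaml has a matching markdown source under docs/.
import os
import yaml  # assumes PyYAML is available


def collect_urls(items):
    for item in items:
        if "url" in item:
            yield item["url"]
        for url in collect_urls(item.get("subitems", [])):
            yield url


with open("_data/menu-sql.yaml") as f:
    menu = yaml.safe_load(f)

for url in collect_urls(menu):
    source = url.split("#")[0].replace(".html", ".md")
    if source and not os.path.exists(source):
        print("missing page for menu entry: %s" % url)
```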
   * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
   * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`,
   * `snappy` and `deflate`). </li>
-   * <li>`dateFormat` (default `uuuu-MM-dd`): sets the string that indicates a date format.
-   * Custom date formats follow the formats at `java.time.format.DateTimeFormatter`.
+   * <li>`dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
+   * Custom date formats follow the formats at
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
   * This applies to date type.</li>
-   * <li>`timestampFormat` (default `uuuu-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
+   * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]`): sets the string that
   * indicates a timestamp format. Custom date formats follow the formats at
-   * `java.time.format.DateTimeFormatter`. This applies to timestamp type.</li>
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
+   * This applies to timestamp type.</li>
   * <li>`encoding` (by default it is not set): specifies encoding (charset) of saved json
   * files. If it is not set, the UTF-8 charset will be used.</li>
   * <li>`lineSep` (default `\n`): defines the line separator that should be used for writing.</li>
@@ -869,12 +928,16 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
   * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
   * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`,
   * `snappy` and `deflate`). </li>
-   * <li>`dateFormat` (default `uuuu-MM-dd`): sets the string that indicates a date format.
-   * Custom date formats follow the formats at `java.time.format.DateTimeFormatter`.
+   * <li>`dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
+   * Custom date formats follow the formats at
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
   * This applies to date type.</li>
-   * <li>`timestampFormat` (default `uuuu-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
+   * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]`): sets the string that
   * indicates a timestamp format. Custom date formats follow the formats at
-   * `java.time.format.DateTimeFormatter`. This applies to timestamp type.</li>
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
+   * This applies to timestamp type.</li>
   * <li>`ignoreLeadingWhiteSpace` (default `true`): a flag indicating whether or not leading
   * whitespaces from values being written should be skipped.</li>
   * <li>`ignoreTrailingWhiteSpace` (default `true`): a flag indicating defines whether or not
@@ -896,7 +959,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
   private def runCommand(session: SparkSession, name: String)(command: LogicalPlan): Unit = {
     val qe = session.sessionState.executePlan(command)
     // call `QueryExecution.toRDD` to trigger the execution of commands.
-    SQLExecution.withNewExecutionId(session, qe, Some(name))(qe.toRdd)
+    SQLExecution.withNewExecutionId(qe, Some(name))(qe.toRdd)
   }

   private def lookupV2Provider(): Option[TableProvider] = {
@@ -915,7 +978,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {

   private var mode: SaveMode = SaveMode.ErrorIfExists

-  private val extraOptions = new scala.collection.mutable.HashMap[String, String]
+  private var extraOptions = CaseInsensitiveMap[String](Map.empty)

   private var partitioningColumns: Option[Seq[String]] = None

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala
index f5dd7613d4103..87f35410172d6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala
@@ -23,8 +23,7 @@ import scala.collection.mutable
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Bucket, Days, Hours, Literal, Months, Years}
-import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect}
-import org.apache.spark.sql.connector.catalog.TableCatalog
+import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelectStatement, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelectStatement}
 import org.apache.spark.sql.connector.expressions.{LogicalExpressions, NamedReference, Transform}
 import org.apache.spark.sql.execution.SQLExecution
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
@@ -47,8 +46,6 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T])

   private val sparkSession = ds.sparkSession

-  private val catalogManager = sparkSession.sessionState.analyzer.catalogManager
-
   private val tableName = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(table)

   private val (catalog, identifier) = {
@@ -120,19 +117,19 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T])
   }

   override def create(): Unit = {
-    // create and replace could alternatively create ParsedPlan statements, like
-    // `CreateTableFromDataFrameStatement(UnresolvedRelation(tableName), ...)`, to keep the catalog
-    // resolution logic in the analyzer.
runCommand("create") { - CreateTableAsSelect( - catalog, - identifier, - partitioning.getOrElse(Seq.empty), + CreateTableAsSelectStatement( + tableName, logicalPlan, - properties = provider.map(p => properties + (TableCatalog.PROP_PROVIDER -> p)) - .getOrElse(properties).toMap, - writeOptions = options.toMap, - ignoreIfExists = false) + partitioning.getOrElse(Seq.empty), + None, + properties.toMap, + provider, + Map.empty, + None, + None, + options.toMap, + ifNotExists = false) } } @@ -226,18 +223,22 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T]) private def runCommand(name: String)(command: LogicalPlan): Unit = { val qe = sparkSession.sessionState.executePlan(command) // call `QueryExecution.toRDD` to trigger the execution of commands. - SQLExecution.withNewExecutionId(sparkSession, qe, Some(name))(qe.toRdd) + SQLExecution.withNewExecutionId(qe, Some(name))(qe.toRdd) } private def internalReplace(orCreate: Boolean): Unit = { runCommand("replace") { - ReplaceTableAsSelect( - catalog, - identifier, - partitioning.getOrElse(Seq.empty), + ReplaceTableAsSelectStatement( + tableName, logicalPlan, - properties = provider.map(p => properties + ("provider" -> p)).getOrElse(properties).toMap, - writeOptions = options.toMap, + partitioning.getOrElse(Seq.empty), + None, + properties.toMap, + provider, + Map.empty, + None, + None, + options.toMap, orCreate = orCreate) } } @@ -246,6 +247,7 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T]) /** * Configuration methods common to create/replace operations and insert/overwrite operations. * @tparam R builder type to return + * @since 3.0.0 */ trait WriteConfigMethods[R] { /** @@ -293,6 +295,8 @@ trait WriteConfigMethods[R] { /** * Trait to restrict calls to create and replace operations. + * + * @since 3.0.0 */ trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] { /** @@ -320,7 +324,7 @@ trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] { * [[org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException]]. 
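The scaladoc updates above change the documented writer defaults to `dateFormat=yyyy-MM-dd` and `timestampFormat=yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]`, and point readers at the Datetime Patterns page instead of `java.time.format.DateTimeFormatter`. A short PySpark sketch of overriding those options on a write is shown below; it is illustrative only (the output path and column names are made up) and is not part of this patch.

```python
# Illustrative use of the writer options documented above (not part of this patch).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("writer-options-demo").getOrCreate()

df = spark.createDataFrame([("a",)], ["id"]).withColumn("ts", F.current_timestamp())

# Override the documented default timestampFormat with a custom pattern
# taken from the Datetime Patterns reference page.
(df.write
   .option("timestampFormat", "yyyy/MM/dd HH:mm:ss")
   .mode("overwrite")
   .json("/tmp/writer_options_demo"))  # hypothetical output path
```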
* * @throws org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException - * If the table already exists + * If the table does not exist */ @throws(classOf[CannotReplaceMissingTableException]) def replace(): Unit diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index a1c33f92d17b4..3b6fd2fcc177c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -22,6 +22,7 @@ import java.io.{ByteArrayOutputStream, CharArrayWriter, DataOutputStream} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.language.implicitConversions +import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal import org.apache.commons.lang3.StringUtils @@ -34,6 +35,7 @@ import org.apache.spark.api.python.{PythonRDD, SerDeUtil} import org.apache.spark.api.r.RRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, ScalaReflection} import org.apache.spark.sql.catalyst.QueryPlanningTracker import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.HiveTableRelation @@ -48,6 +50,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningC import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.execution.arrow.{ArrowBatchStreamWriter, ArrowConverters} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation @@ -82,18 +85,19 @@ private[sql] object Dataset { dataset } - def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = { - val qe = sparkSession.sessionState.executePlan(logicalPlan) - qe.assertAnalyzed() - new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = + sparkSession.withActive { + val qe = sparkSession.sessionState.executePlan(logicalPlan) + qe.assertAnalyzed() + new Dataset[Row](qe, RowEncoder(qe.analyzed.schema)) } /** A variant of ofRows that allows passing in a tracker so we can track query parsing time. 
*/ def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan, tracker: QueryPlanningTracker) - : DataFrame = { + : DataFrame = sparkSession.withActive { val qe = new QueryExecution(sparkSession, logicalPlan, tracker) qe.assertAnalyzed() - new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) + new Dataset[Row](qe, RowEncoder(qe.analyzed.schema)) } } @@ -185,13 +189,12 @@ private[sql] object Dataset { */ @Stable class Dataset[T] private[sql]( - @transient private val _sparkSession: SparkSession, @DeveloperApi @Unstable @transient val queryExecution: QueryExecution, @DeveloperApi @Unstable @transient val encoder: Encoder[T]) extends Serializable { @transient lazy val sparkSession: SparkSession = { - if (_sparkSession == null) { + if (queryExecution == null || queryExecution.sparkSession == null) { throw new SparkException( "Dataset transformations and actions can only be invoked by the driver, not inside of" + " other Dataset transformations; for example, dataset1.map(x => dataset2.values.count()" + @@ -199,7 +202,7 @@ class Dataset[T] private[sql]( "performed inside of the dataset1.map transformation. For more information," + " see SPARK-28702.") } - _sparkSession + queryExecution.sparkSession } // A globally unique id of this Dataset. @@ -211,7 +214,7 @@ class Dataset[T] private[sql]( // you wrap it with `withNewExecutionId` if this actions doesn't call other action. def this(sparkSession: SparkSession, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { - this(sparkSession, sparkSession.sessionState.executePlan(logicalPlan), encoder) + this(sparkSession.sessionState.executePlan(logicalPlan), encoder) } def this(sqlContext: SQLContext, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { @@ -445,7 +448,7 @@ class Dataset[T] private[sql]( */ // This is declared with parentheses to prevent the Scala compiler from treating // `ds.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. - def toDF(): DataFrame = new Dataset[Row](sparkSession, queryExecution, RowEncoder(schema)) + def toDF(): DataFrame = new Dataset[Row](queryExecution, RowEncoder(schema)) /** * Returns a new Dataset where each record has been mapped on to the specified type. The @@ -503,7 +506,9 @@ class Dataset[T] private[sql]( * @group basic * @since 1.6.0 */ - def schema: StructType = queryExecution.analyzed.schema + def schema: StructType = sparkSession.withActive { + queryExecution.analyzed.schema + } /** * Prints the schema to the console in a nice tree format. @@ -539,7 +544,7 @@ class Dataset[T] private[sql]( * @group basic * @since 3.0.0 */ - def explain(mode: String): Unit = { + def explain(mode: String): Unit = sparkSession.withActive { // Because temporary views are resolved during analysis when we create a Dataset, and // `ExplainCommand` analyzes input query plan and resolves temporary views again. Using // `ExplainCommand` here will probably output different query plans, compared to the results @@ -1430,7 +1435,25 @@ class Dataset[T] private[sql]( */ @scala.annotation.varargs def select(cols: Column*): DataFrame = withPlan { - Project(cols.map(_.named), logicalPlan) + val untypedCols = cols.map { + case typedCol: TypedColumn[_, _] => + // Checks if a `TypedColumn` has been inserted with + // specific input type and schema by `withInputType`. 
+ val needInputType = typedCol.expr.find { + case ta: TypedAggregateExpression if ta.inputDeserializer.isEmpty => true + case _ => false + }.isDefined + + if (!needInputType) { + typedCol + } else { + throw new AnalysisException(s"Typed column $typedCol that needs input type and schema " + + "cannot be passed in untyped `select` API. Use the typed `Dataset.select` API instead.") + } + + case other => other + } + Project(untypedCols.map(_.named), logicalPlan) } /** @@ -1502,7 +1525,7 @@ class Dataset[T] private[sql]( val namedColumns = columns.map(_.withInputType(exprEnc, logicalPlan.output).named) val execution = new QueryExecution(sparkSession, Project(namedColumns, logicalPlan)) - new Dataset(sparkSession, execution, ExpressionEncoder.tuple(encoders)) + new Dataset(execution, ExpressionEncoder.tuple(encoders)) } /** @@ -2244,6 +2267,90 @@ class Dataset[T] private[sql]( randomSplit(weights.toArray, seed) } + /** + * (Scala-specific) Returns a new Dataset where each row has been expanded to zero or more + * rows by the provided function. This is similar to a `LATERAL VIEW` in HiveQL. The columns of + * the input row are implicitly joined with each row that is output by the function. + * + * Given that this is deprecated, as an alternative, you can explode columns either using + * `functions.explode()` or `flatMap()`. The following example uses these alternatives to count + * the number of books that contain a given word: + * + * {{{ + * case class Book(title: String, words: String) + * val ds: Dataset[Book] + * + * val allWords = ds.select('title, explode(split('words, " ")).as("word")) + * + * val bookCountPerWord = allWords.groupBy("word").agg(countDistinct("title")) + * }}} + * + * Using `flatMap()` this can similarly be exploded as: + * + * {{{ + * ds.flatMap(_.words.split(" ")) + * }}} + * + * @group untypedrel + * @since 2.0.0 + */ + @deprecated("use flatMap() or select() with functions.explode() instead", "2.0.0") + def explode[A <: Product : TypeTag](input: Column*)(f: Row => TraversableOnce[A]): DataFrame = { + val elementSchema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] + + val convert = CatalystTypeConverters.createToCatalystConverter(elementSchema) + + val rowFunction = + f.andThen(_.map(convert(_).asInstanceOf[InternalRow])) + val generator = UserDefinedGenerator(elementSchema, rowFunction, input.map(_.expr)) + + withPlan { + Generate(generator, unrequiredChildIndex = Nil, outer = false, + qualifier = None, generatorOutput = Nil, logicalPlan) + } + } + + /** + * (Scala-specific) Returns a new Dataset where a single column has been expanded to zero + * or more rows by the provided function. This is similar to a `LATERAL VIEW` in HiveQL. All + * columns of the input row are implicitly joined with each value that is output by the function. + * + * Given that this is deprecated, as an alternative, you can explode columns either using + * `functions.explode()`: + * + * {{{ + * ds.select(explode(split('words, " ")).as("word")) + * }}} + * + * or `flatMap()`: + * + * {{{ + * ds.flatMap(_.words.split(" ")) + * }}} + * + * @group untypedrel + * @since 2.0.0 + */ + @deprecated("use flatMap() or select() with functions.explode() instead", "2.0.0") + def explode[A, B : TypeTag](inputColumn: String, outputColumn: String)(f: A => TraversableOnce[B]) + : DataFrame = { + val dataType = ScalaReflection.schemaFor[B].dataType + val attributes = AttributeReference(outputColumn, dataType)() :: Nil + // TODO handle the metadata? 
+ val elementSchema = attributes.toStructType + + def rowFunction(row: Row): TraversableOnce[InternalRow] = { + val convert = CatalystTypeConverters.createToCatalystConverter(dataType) + f(row(0).asInstanceOf[A]).map(o => InternalRow(convert(o))) + } + val generator = UserDefinedGenerator(elementSchema, rowFunction, apply(inputColumn).expr :: Nil) + + withPlan { + Generate(generator, unrequiredChildIndex = Nil, outer = false, + qualifier = None, generatorOutput = Nil, logicalPlan) + } + } + /** * Returns a new Dataset by adding a column or replacing the existing column that has * the same name. @@ -2434,6 +2541,8 @@ class Dataset[T] private[sql]( def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan { val resolver = sparkSession.sessionState.analyzer.resolver val allColumns = queryExecution.analyzed.output + // SPARK-31990: We must keep `toSet.toSeq` here because of the backward compatibility issue + // (the Streaming's state store depends on the `groupCols` order). val groupCols = colNames.toSet.toSeq.flatMap { (colName: String) => // It is possibly there are more than one columns with the same name, // so we call filter instead of find. @@ -2858,9 +2967,8 @@ class Dataset[T] private[sql]( */ def toLocalIterator(): java.util.Iterator[T] = { withAction("toLocalIterator", queryExecution) { plan => - // `ExpressionEncoder` is not thread-safe, here we create a new encoder. - val enc = resolvedEnc.copy() - plan.executeToIterator().map(enc.fromRow).asJava + val fromRow = resolvedEnc.createDeserializer() + plan.executeToIterator().map(fromRow).asJava } } @@ -3108,6 +3216,18 @@ class Dataset[T] private[sql]( */ def javaRDD: JavaRDD[T] = toJavaRDD + /** + * Registers this Dataset as a temporary table using the given name. The lifetime of this + * temporary table is tied to the [[SparkSession]] that was used to create this Dataset. + * + * @group basic + * @since 1.6.0 + */ + @deprecated("Use createOrReplaceTempView(viewName) instead.", "2.0.0") + def registerTempTable(tableName: String): Unit = { + createOrReplaceTempView(tableName) + } + /** * Creates a local temporary view using the given name. The lifetime of this * temporary view is tied to the [[SparkSession]] that was used to create this Dataset. 
@@ -3268,9 +3388,10 @@ class Dataset[T] private[sql]( new JSONOptions(Map.empty[String, String], sessionLocalTimeZone)) new Iterator[String] { + private val toRow = exprEnc.createSerializer() override def hasNext: Boolean = iter.hasNext override def next(): String = { - gen.write(exprEnc.toRow(iter.next())) + gen.write(toRow(iter.next())) gen.flush() val json = writer.toString @@ -3362,8 +3483,8 @@ class Dataset[T] private[sql]( private[sql] def collectAsArrowToR(): Array[Any] = { val timeZoneId = sparkSession.sessionState.conf.sessionLocalTimeZone - withAction("collectAsArrowToR", queryExecution) { plan => - RRDD.serveToStream("serve-Arrow") { outputStream => + RRDD.serveToStream("serve-Arrow") { outputStream => + withAction("collectAsArrowToR", queryExecution) { plan => val buffer = new ByteArrayOutputStream() val out = new DataOutputStream(outputStream) val batchWriter = new ArrowBatchStreamWriter(schema, buffer, timeZoneId) @@ -3371,7 +3492,7 @@ class Dataset[T] private[sql]( val numPartitions = arrowBatchRdd.partitions.length // Store collection results for worst case of 1 to N-1 partitions - val results = new Array[Array[Array[Byte]]](numPartitions - 1) + val results = new Array[Array[Array[Byte]]](Math.max(0, numPartitions - 1)) var lastIndex = -1 // index of last partition written // Handler to eagerly write partitions to Python in order @@ -3416,8 +3537,8 @@ class Dataset[T] private[sql]( private[sql] def collectAsArrowToPython: Array[Any] = { val timeZoneId = sparkSession.sessionState.conf.sessionLocalTimeZone - withAction("collectAsArrowToPython", queryExecution) { plan => - PythonRDD.serveToStream("serve-Arrow") { outputStream => + PythonRDD.serveToStream("serve-Arrow") { outputStream => + withAction("collectAsArrowToPython", queryExecution) { plan => val out = new DataOutputStream(outputStream) val batchWriter = new ArrowBatchStreamWriter(schema, out, timeZoneId) @@ -3472,7 +3593,7 @@ class Dataset[T] private[sql]( * an execution. */ private def withNewExecutionId[U](body: => U): U = { - SQLExecution.withNewExecutionId(sparkSession, queryExecution)(body) + SQLExecution.withNewExecutionId(queryExecution)(body) } /** @@ -3481,7 +3602,7 @@ class Dataset[T] private[sql]( * reset. */ private def withNewRDDExecutionId[U](body: => U): U = { - SQLExecution.withNewExecutionId(sparkSession, rddQueryExecution) { + SQLExecution.withNewExecutionId(rddQueryExecution) { rddQueryExecution.executedPlan.resetMetrics() body } @@ -3492,7 +3613,7 @@ class Dataset[T] private[sql]( * user-registered callback functions. */ private def withAction[U](name: String, qe: QueryExecution)(action: SparkPlan => U) = { - SQLExecution.withNewExecutionId(sparkSession, qe, Some(name)) { + SQLExecution.withNewExecutionId(qe, Some(name)) { qe.executedPlan.resetMetrics() action(qe.executedPlan) } @@ -3502,9 +3623,8 @@ class Dataset[T] private[sql]( * Collect all elements from a spark plan. */ private def collectFromPlan(plan: SparkPlan): Array[T] = { - // `ExpressionEncoder` is not thread-safe, here we create a new encoder. 
- val enc = resolvedEnc.copy() - plan.executeCollect().map(enc.fromRow) + val fromRow = resolvedEnc.createDeserializer() + plan.executeCollect().map(fromRow) } private def sortInternal(global: Boolean, sortExprs: Seq[Column]): Dataset[T] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 89cc9735e4f6a..76ee297dfca79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -449,10 +449,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( val aggregate = Aggregate(groupingAttributes, keyColumn +: namedColumns, logicalPlan) val execution = new QueryExecution(sparkSession, aggregate) - new Dataset( - sparkSession, - execution, - ExpressionEncoder.tuple(kExprEnc +: encoders)) + new Dataset(execution, ExpressionEncoder.tuple(kExprEnc +: encoders)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index b1ba7d4538732..c37d8eaa294bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -546,9 +546,10 @@ class RelationalGroupedDataset protected[sql]( case ne: NamedExpression => ne case other => Alias(other, other.toString)() } - val groupingAttributes = groupingNamedExpressions.map(_.toAttribute) val child = df.logicalPlan - val project = Project(groupingNamedExpressions ++ child.output, child) + val project = df.sparkSession.sessionState.executePlan( + Project(groupingNamedExpressions ++ child.output, child)).analyzed + val groupingAttributes = project.output.take(groupingNamedExpressions.length) val output = expr.dataType.asInstanceOf[StructType].toAttributes val plan = FlatMapGroupsInPandas(groupingAttributes, expr, output, project) @@ -583,14 +584,16 @@ class RelationalGroupedDataset protected[sql]( case other => Alias(other, other.toString)() } - val leftAttributes = leftGroupingNamedExpressions.map(_.toAttribute) - val rightAttributes = rightGroupingNamedExpressions.map(_.toAttribute) - val leftChild = df.logicalPlan val rightChild = r.df.logicalPlan - val left = Project(leftGroupingNamedExpressions ++ leftChild.output, leftChild) - val right = Project(rightGroupingNamedExpressions ++ rightChild.output, rightChild) + val left = df.sparkSession.sessionState.executePlan( + Project(leftGroupingNamedExpressions ++ leftChild.output, leftChild)).analyzed + val right = r.df.sparkSession.sessionState.executePlan( + Project(rightGroupingNamedExpressions ++ rightChild.output, rightChild)).analyzed + + val leftAttributes = left.output.take(leftGroupingNamedExpressions.length) + val rightAttributes = right.output.take(rightGroupingNamedExpressions.length) val output = expr.dataType.asInstanceOf[StructType].toAttributes val plan = FlatMapCoGroupsInPandas(leftAttributes, rightAttributes, expr, output, left, right) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index e1b44b5918143..e9bc25d489718 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.internal.SQLConf.{DeprecatedConfig, RemovedConfig} * 
@since 2.0.0 */ @Stable -class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) extends Logging { +class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { /** * Sets the given Spark runtime configuration property. @@ -40,8 +40,6 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) extends Logging */ def set(key: String, value: String): Unit = { requireNonStaticConf(key) - requireDefaultValueOfRemovedConf(key, value) - logDeprecationWarning(key) sqlConf.setConfString(key, value) } @@ -130,7 +128,6 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) extends Logging */ def unset(key: String): Unit = { requireNonStaticConf(key) - logDeprecationWarning(key) sqlConf.unsetConf(key) } @@ -161,26 +158,4 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) extends Logging throw new AnalysisException(s"Cannot modify the value of a Spark config: $key") } } - - private def requireDefaultValueOfRemovedConf(key: String, value: String): Unit = { - SQLConf.removedSQLConfigs.get(key).foreach { - case RemovedConfig(configName, version, defaultValue, comment) => - if (value != defaultValue) { - throw new AnalysisException( - s"The SQL config '$configName' was removed in the version $version. $comment") - } - } - } - - /** - * Logs a warning message if the given config key is deprecated. - */ - private def logDeprecationWarning(key: String): Unit = { - SQLConf.deprecatedSQLConfigs.get(key).foreach { - case DeprecatedConfig(configName, version, comment) => - logWarning( - s"The SQL config '$configName' has been deprecated in Spark v$version " + - s"and may be removed in the future. $comment") - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 2054874e5e07b..dd237962110ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -24,7 +24,7 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Experimental, Stable, Unstable} -import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.ConfigEntry import org.apache.spark.rdd.RDD @@ -64,6 +64,15 @@ class SQLContext private[sql](val sparkSession: SparkSession) // Note: Since Spark 2.0 this class has become a wrapper of SparkSession, where the // real functionality resides. This class remains mainly for backward compatibility. + + @deprecated("Use SparkSession.builder instead", "2.0.0") + def this(sc: SparkContext) = { + this(SparkSession.builder().sparkContext(sc).getOrCreate()) + } + + @deprecated("Use SparkSession.builder instead", "2.0.0") + def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) + // TODO: move this logic into SparkSession private[sql] def sessionState: SessionState = sparkSession.sessionState @@ -479,6 +488,97 @@ class SQLContext private[sql](val sparkSession: SparkSession) def readStream: DataStreamReader = sparkSession.readStream + /** + * Creates an external table from the given path and returns the corresponding DataFrame. + * It will use the default data source configured by spark.sql.sources.default. 
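The SQLContext hunk above restores the deprecated constructors that wrap an existing SparkContext. A hedged sketch of the legacy entry point next to the replacement the deprecation message points to, assuming an already-running SparkContext:

```scala
import org.apache.spark.SparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}

val sc: SparkContext = SparkContext.getOrCreate()

// Legacy entry point, kept only for source compatibility (deprecated since 2.0.0).
val sqlContext = new SQLContext(sc)

// Preferred entry point; the builder reuses the already-running SparkContext.
val spark = SparkSession.builder().getOrCreate()
```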
+ * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable(tableName: String, path: String): DataFrame = { + sparkSession.catalog.createTable(tableName, path) + } + + /** + * Creates an external table from the given path based on a data source + * and returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + path: String, + source: String): DataFrame = { + sparkSession.catalog.createTable(tableName, path, source) + } + + /** + * Creates an external table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + options: java.util.Map[String, String]): DataFrame = { + sparkSession.catalog.createTable(tableName, source, options) + } + + /** + * (Scala-specific) + * Creates an external table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + options: Map[String, String]): DataFrame = { + sparkSession.catalog.createTable(tableName, source, options) + } + + /** + * Create an external table from the given path based on a data source, a schema and + * a set of options. Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: java.util.Map[String, String]): DataFrame = { + sparkSession.catalog.createTable(tableName, source, schema, options) + } + + /** + * (Scala-specific) + * Create an external table from the given path based on a data source, a schema and + * a set of options. Then, returns the corresponding DataFrame. + * + * @group ddl_ops + * @since 1.3.0 + */ + @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: Map[String, String]): DataFrame = { + sparkSession.catalog.createTable(tableName, source, schema, options) + } + /** * Registers the given `DataFrame` as a temporary table in the catalog. Temporary tables exist * only during the lifetime of this instance of SQLContext. @@ -541,8 +641,8 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Executes a SQL query using Spark, returning the result as a `DataFrame`. The dialect that is - * used for SQL parsing can be configured with 'spark.sql.dialect'. + * Executes a SQL query using Spark, returning the result as a `DataFrame`. + * This API eagerly runs DDL/DML commands, but not for SELECT queries. * * @group basic * @since 1.3.0 @@ -561,7 +661,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Returns a `DataFrame` containing names of existing tables in the current database. 
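Every createExternalTable overload above simply forwards to the Catalog API. A sketch of the migration, assuming an active SparkSession `spark`; the table name, format, and path are placeholders:

```scala
// Deprecated since 2.2.0:
//   sqlContext.createExternalTable("people", "csv",
//     Map("path" -> "/data/people", "header" -> "true"))

// Replacement with the same behavior:
val people = spark.catalog.createTable(
  "people", "csv", Map("path" -> "/data/people", "header" -> "true"))
```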
- * The returned DataFrame has two columns, tableName and isTemporary (a Boolean + * The returned DataFrame has three columns, database, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). * * @group ddl_ops @@ -573,7 +673,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Returns a `DataFrame` containing names of existing tables in the given database. - * The returned DataFrame has two columns, tableName and isTemporary (a Boolean + * The returned DataFrame has three columns, database, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). * * @group ddl_ops @@ -611,6 +711,289 @@ class SQLContext private[sql](val sparkSession: SparkSession) sessionState.catalog.listTables(databaseName).map(_.table).toArray } + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + // Deprecated methods + //////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + /** + * @deprecated As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rowRDD: RDD[Row], schema: StructType): DataFrame = { + createDataFrame(rowRDD, schema) + } + + /** + * @deprecated As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { + createDataFrame(rowRDD, schema) + } + + /** + * @deprecated As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rdd: RDD[_], beanClass: Class[_]): DataFrame = { + createDataFrame(rdd, beanClass) + } + + /** + * @deprecated As of 1.3.0, replaced by `createDataFrame()`. + */ + @deprecated("Use createDataFrame instead.", "1.3.0") + def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): DataFrame = { + createDataFrame(rdd, beanClass) + } + + /** + * Loads a Parquet file, returning the result as a `DataFrame`. This function returns an empty + * `DataFrame` if no paths are passed in. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().parquet()`. + */ + @deprecated("Use read.parquet() instead.", "1.4.0") + @scala.annotation.varargs + def parquetFile(paths: String*): DataFrame = { + if (paths.isEmpty) { + emptyDataFrame + } else { + read.parquet(paths : _*) + } + } + + /** + * Loads a JSON file (one object per line), returning the result as a `DataFrame`. + * It goes through the entire dataset once to determine the schema. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonFile(path: String): DataFrame = { + read.json(path) + } + + /** + * Loads a JSON file (one object per line) and applies the given schema, + * returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonFile(path: String, schema: StructType): DataFrame = { + read.schema(schema).json(path) + } + + /** + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. 
+ */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonFile(path: String, samplingRatio: Double): DataFrame = { + read.option("samplingRatio", samplingRatio.toString).json(path) + } + + /** + * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a + * `DataFrame`. + * It goes through the entire dataset once to determine the schema. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: RDD[String]): DataFrame = read.json(json) + + /** + * Loads an RDD[String] storing JSON objects (one object per record), returning the result as a + * `DataFrame`. + * It goes through the entire dataset once to determine the schema. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: JavaRDD[String]): DataFrame = read.json(json) + + /** + * Loads an RDD[String] storing JSON objects (one object per record) and applies the given schema, + * returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: RDD[String], schema: StructType): DataFrame = { + read.schema(schema).json(json) + } + + /** + * Loads an JavaRDD[String] storing JSON objects (one object per record) and applies the given + * schema, returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: JavaRDD[String], schema: StructType): DataFrame = { + read.schema(schema).json(json) + } + + /** + * Loads an RDD[String] storing JSON objects (one object per record) inferring the + * schema, returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: RDD[String], samplingRatio: Double): DataFrame = { + read.option("samplingRatio", samplingRatio.toString).json(json) + } + + /** + * Loads a JavaRDD[String] storing JSON objects (one object per record) inferring the + * schema, returning the result as a `DataFrame`. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().json()`. + */ + @deprecated("Use read.json() instead.", "1.4.0") + def jsonRDD(json: JavaRDD[String], samplingRatio: Double): DataFrame = { + read.option("samplingRatio", samplingRatio.toString).json(json) + } + + /** + * Returns the dataset stored at path as a DataFrame, + * using the default data source configured by spark.sql.sources.default. + * + * @group genericdata + * @deprecated As of 1.4.0, replaced by `read().load(path)`. + */ + @deprecated("Use read.load(path) instead.", "1.4.0") + def load(path: String): DataFrame = { + read.load(path) + } + + /** + * Returns the dataset stored at path as a DataFrame, using the given data source. + * + * @group genericdata + * @deprecated As of 1.4.0, replaced by `read().format(source).load(path)`. + */ + @deprecated("Use read.format(source).load(path) instead.", "1.4.0") + def load(path: String, source: String): DataFrame = { + read.format(source).load(path) + } + + /** + * (Java-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame. 
+ * + * @group genericdata + * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`. + */ + @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") + def load(source: String, options: java.util.Map[String, String]): DataFrame = { + read.options(options).format(source).load() + } + + /** + * (Scala-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame. + * + * @group genericdata + * @deprecated As of 1.4.0, replaced by `read().format(source).options(options).load()`. + */ + @deprecated("Use read.format(source).options(options).load() instead.", "1.4.0") + def load(source: String, options: Map[String, String]): DataFrame = { + read.options(options).format(source).load() + } + + /** + * (Java-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. + * + * @group genericdata + * @deprecated As of 1.4.0, replaced by + * `read().format(source).schema(schema).options(options).load()`. + */ + @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") + def load( + source: String, + schema: StructType, + options: java.util.Map[String, String]): DataFrame = { + read.format(source).schema(schema).options(options).load() + } + + /** + * (Scala-specific) Returns the dataset specified by the given data source and + * a set of options as a DataFrame, using the given schema as the schema of the DataFrame. + * + * @group genericdata + * @deprecated As of 1.4.0, replaced by + * `read().format(source).schema(schema).options(options).load()`. + */ + @deprecated("Use read.format(source).schema(schema).options(options).load() instead.", "1.4.0") + def load(source: String, schema: StructType, options: Map[String, String]): DataFrame = { + read.format(source).schema(schema).options(options).load() + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL + * url named table. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().jdbc()`. + */ + @deprecated("Use read.jdbc() instead.", "1.4.0") + def jdbc(url: String, table: String): DataFrame = { + read.jdbc(url, table, new Properties) + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL + * url named table. Partitions of the table will be retrieved in parallel based on the parameters + * passed to this function. + * + * @param columnName the name of a column of integral type that will be used for partitioning. + * @param lowerBound the minimum value of `columnName` used to decide partition stride + * @param upperBound the maximum value of `columnName` used to decide partition stride + * @param numPartitions the number of partitions. the range `minValue`-`maxValue` will be split + * evenly into this many partitions + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().jdbc()`. + */ + @deprecated("Use read.jdbc() instead.", "1.4.0") + def jdbc( + url: String, + table: String, + columnName: String, + lowerBound: Long, + upperBound: Long, + numPartitions: Int): DataFrame = { + read.jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, new Properties) + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL + * url named table. 
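The deprecated load and jdbc shims above map one-to-one onto DataFrameReader calls. For the partitioned JDBC overload the mapping looks roughly like this; the URL, table, and credentials are placeholders and `spark` is an active SparkSession:

```scala
import java.util.Properties

val url = "jdbc:postgresql://db-host:5432/shop" // placeholder
val props = new Properties()
props.setProperty("user", "reader")             // placeholder

// Deprecated since 1.4.0:
//   sqlContext.jdbc(url, "orders", "id", 0L, 1000000L, 8)

// Replacement: the same partitioned scan through DataFrameReader
// (columnName, lowerBound, upperBound, numPartitions, connectionProperties).
val orders = spark.read.jdbc(url, "orders", "id", 0L, 1000000L, 8, props)
```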
The theParts parameter gives a list expressions + * suitable for inclusion in WHERE clauses; each one defines one partition + * of the `DataFrame`. + * + * @group specificdata + * @deprecated As of 1.4.0, replaced by `read().jdbc()`. + */ + @deprecated("Use read.jdbc() instead.", "1.4.0") + def jdbc(url: String, table: String, theParts: Array[String]): DataFrame = { + read.jdbc(url, table, theParts, new Properties) + } } /** @@ -623,6 +1006,45 @@ class SQLContext private[sql](val sparkSession: SparkSession) */ object SQLContext { + /** + * Get the singleton SQLContext if it exists or create a new one using the given SparkContext. + * + * This function can be used to create a singleton SQLContext object that can be shared across + * the JVM. + * + * If there is an active SQLContext for current thread, it will be returned instead of the global + * one. + * + * @since 1.5.0 + */ + @deprecated("Use SparkSession.builder instead", "2.0.0") + def getOrCreate(sparkContext: SparkContext): SQLContext = { + SparkSession.builder().sparkContext(sparkContext).getOrCreate().sqlContext + } + + /** + * Changes the SQLContext that will be returned in this thread and its children when + * SQLContext.getOrCreate() is called. This can be used to ensure that a given thread receives + * a SQLContext with an isolated session, instead of the global (first created) context. + * + * @since 1.6.0 + */ + @deprecated("Use SparkSession.setActiveSession instead", "2.0.0") + def setActive(sqlContext: SQLContext): Unit = { + SparkSession.setActiveSession(sqlContext.sparkSession) + } + + /** + * Clears the active SQLContext for current thread. Subsequent calls to getOrCreate will + * return the first created context instead of a thread-local override. + * + * @since 1.6.0 + */ + @deprecated("Use SparkSession.clearActiveSession instead", "2.0.0") + def clearActive(): Unit = { + SparkSession.clearActiveSession() + } + /** * Converts an iterator of Java Beans to InternalRow using the provided * bean info & schema. This is not related to the singleton, but is a static diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index abefb348cafc7..f89e58c6e971c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql import java.io.Closeable import java.util.concurrent.TimeUnit._ -import java.util.concurrent.atomic.AtomicReference +import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference} import scala.collection.JavaConverters._ import scala.reflect.runtime.universe.TypeTag @@ -29,6 +29,7 @@ import org.apache.spark.{SPARK_VERSION, SparkConf, SparkContext, TaskContext} import org.apache.spark.annotation.{DeveloperApi, Experimental, Stable, Unstable} import org.apache.spark.api.java.JavaRDD import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXECUTOR_ALLOW_SPARK_CONTEXT import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} import org.apache.spark.sql.catalog.Catalog @@ -49,7 +50,6 @@ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.util.ExecutionListenerManager import org.apache.spark.util.{CallSite, Utils} - /** * The entry point to programming Spark with the Dataset and DataFrame API. 
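The deprecated SQLContext companion helpers restored above have direct SparkSession counterparts; a short mapping sketch:

```scala
import org.apache.spark.sql.SparkSession

// SQLContext.getOrCreate(sc) -> SparkSession.builder().getOrCreate().sqlContext
// SQLContext.setActive(ctx)  -> SparkSession.setActiveSession(ctx.sparkSession)
// SQLContext.clearActive()   -> SparkSession.clearActiveSession()
val spark = SparkSession.builder().getOrCreate()
SparkSession.setActiveSession(spark)
// ... thread-confined work against `spark` ...
SparkSession.clearActiveSession()
```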
* @@ -293,8 +293,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { - SparkSession.setActiveSession(this) + def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = withActive { val encoder = Encoders.product[A] Dataset.ofRows(self, ExternalRDD(rdd, self)(encoder)) } @@ -304,8 +303,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { - SparkSession.setActiveSession(this) + def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = withActive { val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] val attributeSeq = schema.toAttributes Dataset.ofRows(self, LocalRelation.fromProduct(attributeSeq, data)) @@ -343,11 +341,12 @@ class SparkSession private( * @since 2.0.0 */ @DeveloperApi - def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { + def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = withActive { // TODO: use MutableProjection when rowRDD is another DataFrame and the applied // schema differs from the existing schema on any field data type. val encoder = RowEncoder(schema) - val catalystRows = rowRDD.map(encoder.toRow) + val toRow = encoder.createSerializer() + val catalystRows = rowRDD.map(toRow) internalCreateDataFrame(catalystRows.setName(rowRDD.name), schema) } @@ -373,7 +372,7 @@ class SparkSession private( * @since 2.0.0 */ @DeveloperApi - def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = { + def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = withActive { Dataset.ofRows(self, LocalRelation.fromExternalRows(schema.toAttributes, rows.asScala)) } @@ -385,7 +384,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = withActive { val attributeSeq: Seq[AttributeReference] = getSchema(beanClass) val className = beanClass.getName val rowRdd = rdd.mapPartitions { iter => @@ -414,7 +413,7 @@ class SparkSession private( * SELECT * queries will return the columns in an undefined order. * @since 1.6.0 */ - def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = withActive { val attrSeq = getSchema(beanClass) val rows = SQLContext.beansToRows(data.asScala.iterator, beanClass, attrSeq) Dataset.ofRows(self, LocalRelation(attrSeq, rows.toSeq)) @@ -462,8 +461,9 @@ class SparkSession private( */ def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = { val enc = encoderFor[T] + val toRow = enc.createSerializer() val attributes = enc.schema.toAttributes - val encoded = data.map(d => enc.toRow(d).copy()) + val encoded = data.map(d => toRow(d).copy()) val plan = new LocalRelation(attributes, encoded) Dataset[T](self, plan) } @@ -595,11 +595,11 @@ class SparkSession private( /** * Executes a SQL query using Spark, returning the result as a `DataFrame`. - * The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect'. + * This API eagerly runs DDL/DML commands, but not for SELECT queries. 
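The reworded sql() contract above is easy to demonstrate, again assuming an active SparkSession `spark`: DDL/DML commands run as soon as sql() returns, while a SELECT only builds a lazy DataFrame.

```scala
// Executed eagerly, when sql() returns.
spark.sql("CREATE TABLE IF NOT EXISTS t (id INT) USING parquet")
spark.sql("INSERT INTO t VALUES (1), (2), (3)")

// Only a logical plan is built here; nothing runs yet.
val q = spark.sql("SELECT id FROM t WHERE id > 1")

// Execution happens at the action.
q.show()
```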
* * @since 2.0.0 */ - def sql(sqlText: String): DataFrame = { + def sql(sqlText: String): DataFrame = withActive { val tracker = new QueryPlanningTracker val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) { sessionState.sqlParser.parsePlan(sqlText) @@ -751,6 +751,20 @@ class SparkSession private( } } + /** + * Execute a block of code with the this session set as the active session, and restore the + * previous session on completion. + */ + private[sql] def withActive[T](block: => T): T = { + // Use the active session thread local directly to make sure we get the session that is actually + // set and not the default session. This to prevent that we promote the default session to the + // active session once we are done. + val old = SparkSession.activeThreadSession.get() + SparkSession.setActiveSession(this) + try block finally { + SparkSession.setActiveSession(old) + } + } } @@ -881,20 +895,23 @@ object SparkSession extends Logging { * SparkSession exists, the method creates a new SparkSession and assigns the * newly created SparkSession as the global default. * - * In case an existing SparkSession is returned, the config options specified in + * In case an existing SparkSession is returned, the non-static config options specified in * this builder will be applied to the existing SparkSession. * * @since 2.0.0 */ def getOrCreate(): SparkSession = synchronized { - assertOnDriver() + val sparkConf = new SparkConf() + options.foreach { case (k, v) => sparkConf.set(k, v) } + + if (!sparkConf.get(EXECUTOR_ALLOW_SPARK_CONTEXT)) { + assertOnDriver() + } + // Get the session from current thread's active session. var session = activeThreadSession.get() if ((session ne null) && !session.sparkContext.isStopped) { - options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } - if (options.nonEmpty) { - logWarning("Using an existing SparkSession; some configuration may not take effect.") - } + applyModifiableSettings(session) return session } @@ -903,18 +920,12 @@ object SparkSession extends Logging { // If the current thread does not have an active session, get it from the global session. session = defaultSession.get() if ((session ne null) && !session.sparkContext.isStopped) { - options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } - if (options.nonEmpty) { - logWarning("Using an existing SparkSession; some configuration may not take effect.") - } + applyModifiableSettings(session) return session } // No active nor global default session. Create a new one. val sparkContext = userSuppliedContext.getOrElse { - val sparkConf = new SparkConf() - options.foreach { case (k, v) => sparkConf.set(k, v) } - // set a random app name if not given. if (!sparkConf.contains("spark.app.name")) { sparkConf.setAppName(java.util.UUID.randomUUID().toString) @@ -932,19 +943,27 @@ object SparkSession extends Logging { options.foreach { case (k, v) => session.initialSessionOptions.put(k, v) } setDefaultSession(session) setActiveSession(session) - - // Register a successfully instantiated context to the singleton. This should be at the - // end of the class definition so that the singleton is updated only if there is no - // exception in the construction of the instance. 
- sparkContext.addSparkListener(new SparkListener { - override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { - defaultSession.set(null) - } - }) + registerContextListener(sparkContext) } return session } + + private def applyModifiableSettings(session: SparkSession): Unit = { + val (staticConfs, otherConfs) = + options.partition(kv => SQLConf.staticConfKeys.contains(kv._1)) + + otherConfs.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } + + if (staticConfs.nonEmpty) { + logWarning("Using an existing SparkSession; the static sql configurations will not take" + + " effect.") + } + if (otherConfs.nonEmpty) { + logWarning("Using an existing SparkSession; some spark core configurations may not take" + + " effect.") + } + } } /** @@ -1040,6 +1059,20 @@ object SparkSession extends Logging { // Private methods from now on //////////////////////////////////////////////////////////////////////////////////////// + private val listenerRegistered: AtomicBoolean = new AtomicBoolean(false) + + /** Register the AppEnd listener onto the Context */ + private def registerContextListener(sparkContext: SparkContext): Unit = { + if (!listenerRegistered.get()) { + sparkContext.addSparkListener(new SparkListener { + override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { + defaultSession.set(null) + } + }) + listenerRegistered.set(true) + } + } + /** The active SparkSession for the current thread. */ private val activeThreadSession = new InheritableThreadLocal[SparkSession] @@ -1057,7 +1090,7 @@ object SparkSession extends Logging { } private def assertOnDriver(): Unit = { - if (Utils.isTesting && TaskContext.get != null) { + if (TaskContext.get != null) { // we're accessing it during task execution, fail. throw new IllegalStateException( "SparkSession should only be created and accessed on the driver.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala index 1c2bf9e7c2a57..bd870fb8dcae6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.ColumnarRule +import org.apache.spark.sql.execution.{ColumnarRule, SparkPlan} /** * :: Experimental :: @@ -44,6 +44,7 @@ import org.apache.spark.sql.execution.ColumnarRule *
 * <li>Customized Parser.</li>
 * <li>(External) Catalog listeners.</li>
 * <li>Columnar Rules.</li>
+ * <li>Adaptive Query Stage Preparation Rules.</li>
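The extension point added to the list above is wired in further down via `injectQueryStagePrepRule`. A minimal, hypothetical wiring sketch in which `MyQueryStagePrepRule` is a stand-in that leaves the plan untouched:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan

// Stand-in rule: a real rule would rewrite the physical plan
// before adaptive query stages are created.
case class MyQueryStagePrepRule() extends Rule[SparkPlan] {
  override def apply(plan: SparkPlan): SparkPlan = plan
}

val spark = SparkSession.builder()
  .withExtensions { extensions =>
    extensions.injectQueryStagePrepRule(session => MyQueryStagePrepRule())
  }
  .getOrCreate()
```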
  • * * * The extensions can be used by calling `withExtensions` on the [[SparkSession.Builder]], for @@ -96,8 +97,10 @@ class SparkSessionExtensions { type ParserBuilder = (SparkSession, ParserInterface) => ParserInterface type FunctionDescription = (FunctionIdentifier, ExpressionInfo, FunctionBuilder) type ColumnarRuleBuilder = SparkSession => ColumnarRule + type QueryStagePrepRuleBuilder = SparkSession => Rule[SparkPlan] private[this] val columnarRuleBuilders = mutable.Buffer.empty[ColumnarRuleBuilder] + private[this] val queryStagePrepRuleBuilders = mutable.Buffer.empty[QueryStagePrepRuleBuilder] /** * Build the override rules for columnar execution. @@ -106,6 +109,13 @@ class SparkSessionExtensions { columnarRuleBuilders.map(_.apply(session)) } + /** + * Build the override rules for the query stage preparation phase of adaptive query execution. + */ + private[sql] def buildQueryStagePrepRules(session: SparkSession): Seq[Rule[SparkPlan]] = { + queryStagePrepRuleBuilders.map(_.apply(session)).toSeq + } + /** * Inject a rule that can override the columnar execution of an executor. */ @@ -113,6 +123,14 @@ class SparkSessionExtensions { columnarRuleBuilders += builder } + /** + * Inject a rule that can override the the query stage preparation phase of adaptive query + * execution. + */ + def injectQueryStagePrepRule(builder: QueryStagePrepRuleBuilder): Unit = { + queryStagePrepRuleBuilders += builder + } + private[this] val resolutionRuleBuilders = mutable.Buffer.empty[RuleBuilder] /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 0f08e10c00d22..ced4af46c3f30 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -125,7 +125,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends (0 to 22).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) - val inputSchemas = (1 to x).foldRight("Nil")((i, s) => {s"Try(ScalaReflection.schemaFor[A$i]).toOption :: $s"}) + val inputEncoders = (1 to x).foldRight("Nil")((i, s) => {s"Try(ExpressionEncoder[A$i]()).toOption :: $s"}) println(s""" |/** | * Registers a deterministic Scala closure of $x arguments as user-defined function (UDF). 
@@ -134,8 +134,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends | */ |def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - | val inputSchemas: Seq[Option[ScalaReflection.Schema]] = $inputSchemas - | val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + | val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = $inputEncoders + | val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) | val finalUdf = if (nullable) udf else udf.asNonNullable() | def builder(e: Seq[Expression]) = if (e.length == $x) { | finalUdf.createScalaUDF(e) @@ -163,7 +163,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends |def register(name: String, f: UDF$i[$extTypeArgs], returnType: DataType): Unit = { | val func = $funcCall | def builder(e: Seq[Expression]) = if (e.length == $i) { - | ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + | ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) | } else { | throw new AnalysisException("Invalid number of arguments for function " + name + | ". Expected: $i; Found: " + e.length) @@ -180,8 +180,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag](name: String, func: Function0[RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 0) { finalUdf.createScalaUDF(e) @@ -200,8 +200,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag](name: String, func: Function1[A1, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 1) { finalUdf.createScalaUDF(e) @@ -220,8 +220,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag](name: String, func: Function2[A1, A2, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = 
if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 2) { finalUdf.createScalaUDF(e) @@ -240,8 +240,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](name: String, func: Function3[A1, A2, A3, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 3) { finalUdf.createScalaUDF(e) @@ -260,8 +260,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](name: String, func: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 4) { finalUdf.createScalaUDF(e) @@ -280,8 +280,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](name: String, func: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 5) { finalUdf.createScalaUDF(e) @@ -300,8 +300,8 @@ class UDFRegistration private[sql] (functionRegistry: 
FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](name: String, func: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 6) { finalUdf.createScalaUDF(e) @@ -320,8 +320,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](name: String, func: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 7) { finalUdf.createScalaUDF(e) @@ -340,8 +340,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](name: String, func: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Nil - val udf = 
SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 8) { finalUdf.createScalaUDF(e) @@ -360,8 +360,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](name: String, func: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 9) { finalUdf.createScalaUDF(e) @@ -380,8 +380,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](name: String, func: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: 
Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 10) { finalUdf.createScalaUDF(e) @@ -400,8 +400,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag](name: String, func: Function11[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 11) { finalUdf.createScalaUDF(e) @@ -420,8 +420,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag](name: String, func: Function12[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Nil - val udf = 
SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 12) { finalUdf.createScalaUDF(e) @@ -440,8 +440,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag](name: String, func: Function13[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 13) { finalUdf.createScalaUDF(e) @@ -460,8 +460,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag](name: String, func: Function14[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: 
Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 14) { finalUdf.createScalaUDF(e) @@ -480,8 +480,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag](name: String, func: Function15[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: 
Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 15) { finalUdf.createScalaUDF(e) @@ -500,8 +500,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag](name: String, func: Function16[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 16) { finalUdf.createScalaUDF(e) @@ -520,8 +520,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag](name: String, func: Function17[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: 
Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Try(ScalaReflection.schemaFor[A17]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Try(ExpressionEncoder[A17]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 17) { finalUdf.createScalaUDF(e) @@ -540,8 +540,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag](name: String, func: Function18[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Try(ScalaReflection.schemaFor[A17]).toOption :: Try(ScalaReflection.schemaFor[A18]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: 
Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Try(ExpressionEncoder[A17]()).toOption :: Try(ExpressionEncoder[A18]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 18) { finalUdf.createScalaUDF(e) @@ -560,8 +560,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag](name: String, func: Function19[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Try(ScalaReflection.schemaFor[A17]).toOption :: Try(ScalaReflection.schemaFor[A18]).toOption :: Try(ScalaReflection.schemaFor[A19]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Try(ExpressionEncoder[A17]()).toOption :: Try(ExpressionEncoder[A18]()).toOption :: Try(ExpressionEncoder[A19]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 19) { finalUdf.createScalaUDF(e) @@ -580,8 +580,8 @@ class UDFRegistration 
private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag](name: String, func: Function20[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Try(ScalaReflection.schemaFor[A17]).toOption :: Try(ScalaReflection.schemaFor[A18]).toOption :: Try(ScalaReflection.schemaFor[A19]).toOption :: Try(ScalaReflection.schemaFor[A20]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Try(ExpressionEncoder[A17]()).toOption :: Try(ExpressionEncoder[A18]()).toOption :: Try(ExpressionEncoder[A19]()).toOption :: Try(ExpressionEncoder[A20]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 20) { finalUdf.createScalaUDF(e) @@ -600,8 +600,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag](name: String, func: Function21[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: 
Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Try(ScalaReflection.schemaFor[A17]).toOption :: Try(ScalaReflection.schemaFor[A18]).toOption :: Try(ScalaReflection.schemaFor[A19]).toOption :: Try(ScalaReflection.schemaFor[A20]).toOption :: Try(ScalaReflection.schemaFor[A21]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Try(ExpressionEncoder[A17]()).toOption :: Try(ExpressionEncoder[A18]()).toOption :: Try(ExpressionEncoder[A19]()).toOption :: Try(ExpressionEncoder[A20]()).toOption :: Try(ExpressionEncoder[A21]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 21) { finalUdf.createScalaUDF(e) @@ -620,8 +620,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends */ def register[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag, A11: TypeTag, A12: TypeTag, A13: TypeTag, A14: TypeTag, A15: TypeTag, A16: TypeTag, A17: TypeTag, A18: TypeTag, A19: TypeTag, A20: TypeTag, A21: TypeTag, A22: TypeTag](name: String, func: Function22[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas: Seq[Option[ScalaReflection.Schema]] = Try(ScalaReflection.schemaFor[A1]).toOption :: Try(ScalaReflection.schemaFor[A2]).toOption :: Try(ScalaReflection.schemaFor[A3]).toOption :: Try(ScalaReflection.schemaFor[A4]).toOption :: Try(ScalaReflection.schemaFor[A5]).toOption :: Try(ScalaReflection.schemaFor[A6]).toOption :: Try(ScalaReflection.schemaFor[A7]).toOption :: Try(ScalaReflection.schemaFor[A8]).toOption :: Try(ScalaReflection.schemaFor[A9]).toOption :: Try(ScalaReflection.schemaFor[A10]).toOption :: Try(ScalaReflection.schemaFor[A11]).toOption :: Try(ScalaReflection.schemaFor[A12]).toOption :: Try(ScalaReflection.schemaFor[A13]).toOption :: 
Try(ScalaReflection.schemaFor[A14]).toOption :: Try(ScalaReflection.schemaFor[A15]).toOption :: Try(ScalaReflection.schemaFor[A16]).toOption :: Try(ScalaReflection.schemaFor[A17]).toOption :: Try(ScalaReflection.schemaFor[A18]).toOption :: Try(ScalaReflection.schemaFor[A19]).toOption :: Try(ScalaReflection.schemaFor[A20]).toOption :: Try(ScalaReflection.schemaFor[A21]).toOption :: Try(ScalaReflection.schemaFor[A22]).toOption :: Nil - val udf = SparkUserDefinedFunction(func, dataType, inputSchemas).withName(name) + val inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Try(ExpressionEncoder[A11]()).toOption :: Try(ExpressionEncoder[A12]()).toOption :: Try(ExpressionEncoder[A13]()).toOption :: Try(ExpressionEncoder[A14]()).toOption :: Try(ExpressionEncoder[A15]()).toOption :: Try(ExpressionEncoder[A16]()).toOption :: Try(ExpressionEncoder[A17]()).toOption :: Try(ExpressionEncoder[A18]()).toOption :: Try(ExpressionEncoder[A19]()).toOption :: Try(ExpressionEncoder[A20]()).toOption :: Try(ExpressionEncoder[A21]()).toOption :: Try(ExpressionEncoder[A22]()).toOption :: Nil + val udf = SparkUserDefinedFunction(func, dataType, inputEncoders).withName(name) val finalUdf = if (nullable) udf else udf.asNonNullable() def builder(e: Seq[Expression]) = if (e.length == 22) { finalUdf.createScalaUDF(e) @@ -731,7 +731,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF0[_], returnType: DataType): Unit = { val func = () => f.asInstanceOf[UDF0[Any]].call() def builder(e: Seq[Expression]) = if (e.length == 0) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 0; Found: " + e.length) @@ -746,7 +746,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) def builder(e: Seq[Expression]) = if (e.length == 1) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 1; Found: " + e.length) @@ -761,7 +761,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 2) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 2; Found: " + e.length) @@ -776,7 +776,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 3) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 3; Found: " + e.length) @@ -791,7 +791,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 4) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 4; Found: " + e.length) @@ -806,7 +806,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 5) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 5; Found: " + e.length) @@ -821,7 +821,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 6) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 6; Found: " + e.length) @@ -836,7 +836,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 7) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 7; Found: " + e.length) @@ -851,7 +851,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 8) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 8; Found: " + e.length) @@ -866,7 +866,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 9) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 9; Found: " + e.length) @@ -881,7 +881,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 10) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 10; Found: " + e.length) @@ -896,7 +896,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 11) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 11; Found: " + e.length) @@ -911,7 +911,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 12) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 12; Found: " + e.length) @@ -926,7 +926,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 13) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 13; Found: " + e.length) @@ -941,7 +941,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 14) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 14; Found: " + e.length) @@ -956,7 +956,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 15) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 15; Found: " + e.length) @@ -971,7 +971,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 16) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 16; Found: " + e.length) @@ -986,7 +986,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 17) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 17; Found: " + e.length) @@ -1001,7 +1001,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 18) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 18; Found: " + e.length) @@ -1016,7 +1016,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 19) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 19; Found: " + e.length) @@ -1031,7 +1031,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 20) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 20; Found: " + e.length) @@ -1046,7 +1046,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 21) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 21; Found: " + e.length) @@ -1061,7 +1061,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { val func = f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 22) { - ScalaUDF(func, returnType, e, e.map(_ => false), udfName = Some(name)) + ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 22; Found: " + e.length) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index b232aa18c816e..38254609e8f67 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -20,18 +20,23 @@ package org.apache.spark.sql.api.python import java.io.InputStream import java.nio.channels.Channels +import scala.util.control.NonFatal + import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.PythonRDDServer +import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Dataset, SQLContext} +import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.{ExplainMode, QueryExecution} import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.DataType -private[sql] object PythonSQLUtils { +private[sql] object PythonSQLUtils extends Logging { def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText) // This is needed when generating SQL documentation for built-in functions. 
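
A minimal, editor-added sketch (not part of this patch; it assumes a local SparkSession and the hypothetical names UdfRegistrationSketch, plusOne and timesTwo) of the two registration paths the UDFRegistration hunks above touch: the typed Scala overloads, where the TypeTags now yield per-argument ExpressionEncoders (inputEncoders) instead of ScalaReflection schemas, and the untyped Java-UDF overloads, which carry no input type information and therefore build the ScalaUDF with Nil.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.types.IntegerType

object UdfRegistrationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("udf-registration-sketch").getOrCreate()

    // Typed Scala overload: the TypeTags let Spark derive an encoder per argument
    // (the inputEncoders introduced above) rather than a per-argument schema.
    spark.udf.register("plusOne", (x: Int) => x + 1)

    // Untyped Java-UDF overload: only the return DataType is declared, so no input
    // type information is available (the Nil passed to ScalaUDF in the hunks above).
    spark.udf.register("timesTwo", new UDF1[Integer, Integer] {
      override def call(x: Integer): Integer = Integer.valueOf(x.intValue() * 2)
    }, IntegerType)

    spark.sql("SELECT plusOne(1) AS a, timesTwo(2) AS b").show()
    spark.stop()
  }
}
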
@@ -39,6 +44,30 @@ private[sql] object PythonSQLUtils { FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray } + private def listAllSQLConfigs(): Seq[(String, String, String, String)] = { + val conf = new SQLConf() + // Force to build static SQL configurations + StaticSQLConf + // Force to build SQL configurations from Hive module + try { + val symbol = ScalaReflection.mirror.staticModule("org.apache.spark.sql.hive.HiveUtils") + ScalaReflection.mirror.reflectModule(symbol).instance + } catch { + case NonFatal(e) => + logWarning("Cannot generated sql configurations from hive module", e) + } + conf.getAllDefinedConfs + } + + def listRuntimeSQLConfigs(): Array[(String, String, String, String)] = { + // Py4J doesn't seem to translate Seq well, so we convert to an Array. + listAllSQLConfigs().filterNot(p => SQLConf.staticConfKeys.contains(p._1)).toArray + } + + def listStaticSQLConfigs(): Array[(String, String, String, String)] = { + listAllSQLConfigs().filter(p => SQLConf.staticConfKeys.contains(p._1)).toArray + } + /** * Python callable function to read a file in Arrow stream format and create a [[RDD]] * using each serialized ArrowRecordBatch as a partition. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index 318cc629e7a34..60738e6d4ef9e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalog import scala.collection.JavaConverters._ -import org.apache.spark.annotation.Stable +import org.apache.spark.annotation.{Evolving, Experimental, Stable} import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset} import org.apache.spark.sql.types.StructType import org.apache.spark.storage.StorageLevel @@ -208,6 +208,20 @@ abstract class Catalog { */ def functionExists(dbName: String, functionName: String): Boolean + /** + * Creates a table from the given path and returns the corresponding DataFrame. + * It will use the default data source configured by spark.sql.sources.default. + * + * @param tableName is either a qualified or unqualified name that designates a table. + * If no database identifier is provided, it refers to a table in + * the current database. + * @since 2.0.0 + */ + @deprecated("use createTable instead.", "2.2.0") + def createExternalTable(tableName: String, path: String): DataFrame = { + createTable(tableName, path) + } + /** * Creates a table from the given path and returns the corresponding DataFrame. * It will use the default data source configured by spark.sql.sources.default. @@ -219,6 +233,20 @@ abstract class Catalog { */ def createTable(tableName: String, path: String): DataFrame + /** + * Creates a table from the given path based on a data source and returns the corresponding + * DataFrame. + * + * @param tableName is either a qualified or unqualified name that designates a table. + * If no database identifier is provided, it refers to a table in + * the current database. + * @since 2.0.0 + */ + @deprecated("use createTable instead.", "2.2.0") + def createExternalTable(tableName: String, path: String, source: String): DataFrame = { + createTable(tableName, path, source) + } + /** * Creates a table from the given path based on a data source and returns the corresponding * DataFrame. 
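
The PythonSQLUtils helpers added above split the defined SQL configurations into static and runtime sets for PySpark. They are private[sql]; the sketch below (editor-added, assuming a local SparkSession and the hypothetical object name SqlConfKindSketch) shows the same static-versus-runtime distinction through the public RuntimeConfig.isModifiable API.

import org.apache.spark.sql.SparkSession

object SqlConfKindSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sql-conf-sketch").getOrCreate()

    // Static SQL configs cannot be changed at runtime, which is the property the new
    // listStaticSQLConfigs/listRuntimeSQLConfigs helpers key on.
    println(spark.conf.isModifiable("spark.sql.warehouse.dir"))      // false: static config
    println(spark.conf.isModifiable("spark.sql.shuffle.partitions")) // true: runtime config

    spark.stop()
  }
}
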
@@ -230,6 +258,23 @@ abstract class Catalog { */ def createTable(tableName: String, path: String, source: String): DataFrame + /** + * Creates a table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + * + * @param tableName is either a qualified or unqualified name that designates a table. + * If no database identifier is provided, it refers to a table in + * the current database. + * @since 2.0.0 + */ + @deprecated("use createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + options: java.util.Map[String, String]): DataFrame = { + createTable(tableName, source, options) + } + /** * Creates a table based on the dataset in a data source and a set of options. * Then, returns the corresponding DataFrame. @@ -246,6 +291,24 @@ abstract class Catalog { createTable(tableName, source, options.asScala.toMap) } + /** + * (Scala-specific) + * Creates a table from the given path based on a data source and a set of options. + * Then, returns the corresponding DataFrame. + * + * @param tableName is either a qualified or unqualified name that designates a table. + * If no database identifier is provided, it refers to a table in + * the current database. + * @since 2.0.0 + */ + @deprecated("use createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + options: Map[String, String]): DataFrame = { + createTable(tableName, source, options) + } + /** * (Scala-specific) * Creates a table based on the dataset in a data source and a set of options. @@ -261,6 +324,24 @@ abstract class Catalog { source: String, options: Map[String, String]): DataFrame + /** + * Create a table from the given path based on a data source, a schema and a set of options. + * Then, returns the corresponding DataFrame. + * + * @param tableName is either a qualified or unqualified name that designates a table. + * If no database identifier is provided, it refers to a table in + * the current database. + * @since 2.0.0 + */ + @deprecated("use createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: java.util.Map[String, String]): DataFrame = { + createTable(tableName, source, schema, options) + } + /** * Create a table based on the dataset in a data source, a schema and a set of options. * Then, returns the corresponding DataFrame. @@ -278,6 +359,25 @@ abstract class Catalog { createTable(tableName, source, schema, options.asScala.toMap) } + /** + * (Scala-specific) + * Create a table from the given path based on a data source, a schema and a set of options. + * Then, returns the corresponding DataFrame. + * + * @param tableName is either a qualified or unqualified name that designates a table. + * If no database identifier is provided, it refers to a table in + * the current database. + * @since 2.0.0 + */ + @deprecated("use createTable instead.", "2.2.0") + def createExternalTable( + tableName: String, + source: String, + schema: StructType, + options: Map[String, String]): DataFrame = { + createTable(tableName, source, schema, options) + } + /** * (Scala-specific) * Create a table based on the dataset in a data source, a schema and a set of options. 
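
The Catalog changes above restore the deprecated createExternalTable overloads, each of which simply forwards to the corresponding createTable. A short editor-added sketch (assuming a local SparkSession; the path and table names are placeholders) of migrating from the deprecated spelling to the preferred one:

import org.apache.spark.sql.SparkSession

object CreateExternalTableSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("catalog-sketch").getOrCreate()
    import spark.implicits._

    // Hypothetical scratch location; write some Parquet data to point the table at.
    val path = "/tmp/people_parquet"
    Seq((1, "alice"), (2, "bob")).toDF("id", "name").write.mode("overwrite").parquet(path)

    // Preferred, non-deprecated API; the restored createExternalTable overloads
    // above delegate to this.
    val people = spark.catalog.createTable("people", path)
    people.show()

    // Source-compatible deprecated spelling (same behaviour, deprecation warning only):
    // spark.catalog.createExternalTable("people_legacy", path)

    spark.sql("DROP TABLE people")
    spark.stop()
  }
}
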
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 486e7f1f84b46..9a8d2f0a142d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, CatalogUtils} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, CatalogV2Util, Identifier, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, CatalogV2Util, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, RefreshTable} @@ -39,7 +39,8 @@ import org.apache.spark.sql.types.{HIVE_TYPE_STRING, HiveStringType, MetadataBui class ResolveSessionCatalog( val catalogManager: CatalogManager, conf: SQLConf, - isView: Seq[String] => Boolean) + isTempView: Seq[String] => Boolean, + isTempFunction: String => Boolean) extends Rule[LogicalPlan] with LookupCatalog { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ import org.apache.spark.sql.connector.catalog.CatalogV2Util._ @@ -49,6 +50,9 @@ class ResolveSessionCatalog( nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => loadTable(catalog, tbl.asIdentifier).collect { case v1Table: V1Table => + if (!DDLUtils.isHiveTable(v1Table.v1Table)) { + cols.foreach(c => failCharType(c.dataType)) + } cols.foreach { c => assertTopLevelColumn(c.name, "AlterTableAddColumnsCommand") if (!c.nullable) { @@ -58,6 +62,7 @@ class ResolveSessionCatalog( } AlterTableAddColumnsCommand(tbl.asTableIdentifier, cols.map(convertToStructField)) }.getOrElse { + cols.foreach(c => failCharType(c.dataType)) val changes = cols.map { col => TableChange.addColumn( col.name.toArray, @@ -73,14 +78,14 @@ class ResolveSessionCatalog( nameParts @ SessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => loadTable(catalog, tbl.asIdentifier).collect { case v1Table: V1Table => + if (!DDLUtils.isHiveTable(v1Table.v1Table)) { + a.dataType.foreach(failCharType) + } + if (a.column.length > 1) { throw new AnalysisException( "ALTER COLUMN with qualified column is only supported with v2 tables.") } - if (a.dataType.isEmpty) { - throw new AnalysisException( - "ALTER COLUMN with v1 tables must specify new data type.") - } if (a.nullable.isDefined) { throw new AnalysisException( "ALTER COLUMN with v1 tables cannot specify NOT NULL.") @@ -92,18 +97,29 @@ class ResolveSessionCatalog( val builder = new MetadataBuilder // Add comment to metadata a.comment.map(c => builder.putString("comment", c)) + val colName = a.column(0) + val dataType = a.dataType.getOrElse { + v1Table.schema.findNestedField(Seq(colName), resolver = conf.resolver) + .map(_._2.dataType) + .getOrElse { + throw new AnalysisException( + s"ALTER COLUMN cannot find column ${quoteIfNeeded(colName)} in v1 table. 
" + + s"Available: ${v1Table.schema.fieldNames.mkString(", ")}") + } + } // Add Hive type string to metadata. - val cleanedDataType = HiveStringType.replaceCharType(a.dataType.get) - if (a.dataType.get != cleanedDataType) { - builder.putString(HIVE_TYPE_STRING, a.dataType.get.catalogString) + val cleanedDataType = HiveStringType.replaceCharType(dataType) + if (dataType != cleanedDataType) { + builder.putString(HIVE_TYPE_STRING, dataType.catalogString) } val newColumn = StructField( - a.column(0), + colName, cleanedDataType, nullable = true, builder.build()) - AlterTableChangeColumnCommand(tbl.asTableIdentifier, a.column(0), newColumn) + AlterTableChangeColumnCommand(tbl.asTableIdentifier, colName, newColumn) }.getOrElse { + a.dataType.foreach(failCharType) val colName = a.column.toArray val typeChange = a.dataType.map { newDataType => TableChange.updateColumnType(colName, newDataType) @@ -209,8 +225,9 @@ class ResolveSessionCatalog( } AlterDatabaseSetLocationCommand(ns.head, location) - case RenameTableStatement(SessionCatalogAndTable(_, oldName), newNameParts, isView) => - AlterTableRenameCommand(oldName.asTableIdentifier, newNameParts.asTableIdentifier, isView) + // v1 RENAME TABLE supports temp view. + case RenameTableStatement(TempViewOrV1Table(oldName), newName, isView) => + AlterTableRenameCommand(oldName.asTableIdentifier, newName.asTableIdentifier, isView) case DescribeRelation(ResolvedTable(_, ident, _: V1Table), partitionSpec, isExtended) => DescribeTableCommand(ident.asTableIdentifier, partitionSpec, isExtended) @@ -219,45 +236,42 @@ class ResolveSessionCatalog( case DescribeRelation(ResolvedView(ident), partitionSpec, isExtended) => DescribeTableCommand(ident.asTableIdentifier, partitionSpec, isExtended) - case DescribeColumnStatement( - SessionCatalogAndTable(catalog, tbl), colNameParts, isExtended) => - loadTable(catalog, tbl.asIdentifier).collect { - case v1Table: V1Table => - DescribeColumnCommand(tbl.asTableIdentifier, colNameParts, isExtended) - }.getOrElse { - if (isView(tbl)) { - DescribeColumnCommand(tbl.asTableIdentifier, colNameParts, isExtended) - } else { - throw new AnalysisException("Describing columns is not supported for v2 tables.") - } - } + case DescribeColumnStatement(tbl, colNameParts, isExtended) => + val name = parseTempViewOrV1Table(tbl, "Describing columns") + DescribeColumnCommand(name.asTableIdentifier, colNameParts, isExtended) // For CREATE TABLE [AS SELECT], we should use the v1 command if the catalog is resolved to the // session catalog and the table provider is not v2. 
case c @ CreateTableStatement( SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => - if (!isV2Provider(c.provider)) { + val provider = c.provider.getOrElse(conf.defaultDataSourceName) + if (!isV2Provider(provider)) { + if (!DDLUtils.isHiveTable(Some(provider))) { + assertNoCharTypeInSchema(c.tableSchema) + } val tableDesc = buildCatalogTable(tbl.asTableIdentifier, c.tableSchema, - c.partitioning, c.bucketSpec, c.properties, c.provider, c.options, c.location, + c.partitioning, c.bucketSpec, c.properties, provider, c.options, c.location, c.comment, c.ifNotExists) val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTable(tableDesc, mode, None) } else { + assertNoCharTypeInSchema(c.tableSchema) CreateV2Table( catalog.asTableCatalog, tbl.asIdentifier, c.tableSchema, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), ignoreIfExists = c.ifNotExists) } case c @ CreateTableAsSelectStatement( - SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => - if (!isV2Provider(c.provider)) { + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => + val provider = c.provider.getOrElse(conf.defaultDataSourceName) + if (!isV2Provider(provider)) { val tableDesc = buildCatalogTable(tbl.asTableIdentifier, new StructType, - c.partitioning, c.bucketSpec, c.properties, c.provider, c.options, c.location, + c.partitioning, c.bucketSpec, c.properties, provider, c.options, c.location, c.comment, c.ifNotExists) val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTable(tableDesc, mode, Some(c.asSelect)) @@ -268,34 +282,38 @@ class ResolveSessionCatalog( // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), c.asSelect, - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), - writeOptions = c.options, + convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), + writeOptions = c.writeOptions, ignoreIfExists = c.ifNotExists) } - case RefreshTableStatement(SessionCatalogAndTable(_, tbl)) => - RefreshTable(tbl.asTableIdentifier) + // v1 REFRESH TABLE supports temp view. + case RefreshTableStatement(TempViewOrV1Table(name)) => + RefreshTable(name.asTableIdentifier) // For REPLACE TABLE [AS SELECT], we should fail if the catalog is resolved to the // session catalog and the table provider is not v2. 
case c @ ReplaceTableStatement( SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => - if (!isV2Provider(c.provider)) { + val provider = c.provider.getOrElse(conf.defaultDataSourceName) + if (!isV2Provider(provider)) { throw new AnalysisException("REPLACE TABLE is only supported with v2 tables.") } else { + assertNoCharTypeInSchema(c.tableSchema) ReplaceTable( catalog.asTableCatalog, tbl.asIdentifier, c.tableSchema, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), orCreate = c.orCreate) } case c @ ReplaceTableAsSelectStatement( - SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => - if (!isV2Provider(c.provider)) { + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => + val provider = c.provider.getOrElse(conf.defaultDataSourceName) + if (!isV2Provider(provider)) { throw new AnalysisException("REPLACE TABLE AS SELECT is only supported with v2 tables.") } else { ReplaceTableAsSelect( @@ -304,16 +322,18 @@ class ResolveSessionCatalog( // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), c.asSelect, - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), - writeOptions = c.options, + convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), + writeOptions = c.writeOptions, orCreate = c.orCreate) } - case DropTableStatement(SessionCatalogAndTable(catalog, tbl), ifExists, purge) => - DropTableCommand(tbl.asTableIdentifier, ifExists, isView = false, purge = purge) + // v1 DROP TABLE supports temp view. + case DropTableStatement(TempViewOrV1Table(name), ifExists, purge) => + DropTableCommand(name.asTableIdentifier, ifExists, isView = false, purge = purge) - case DropViewStatement(SessionCatalogAndTable(catalog, viewName), ifExists) => - DropTableCommand(viewName.asTableIdentifier, ifExists, isView = true, purge = false) + // v1 DROP TABLE supports temp view. + case DropViewStatement(TempViewOrV1Table(name), ifExists) => + DropTableCommand(name.asTableIdentifier, ifExists, isView = true, purge = false) case c @ CreateNamespaceStatement(CatalogAndNamespace(catalog, ns), _, _) if isSessionCatalog(catalog) => @@ -360,7 +380,7 @@ class ResolveSessionCatalog( } case AnalyzeColumnStatement(tbl, columnNames, allColumns) => - val v1TableName = parseV1Table(tbl, "ANALYZE TABLE") + val v1TableName = parseTempViewOrV1Table(tbl, "ANALYZE TABLE") AnalyzeColumnCommand(v1TableName.asTableIdentifier, columnNames, allColumns) case RepairTableStatement(tbl) => @@ -379,20 +399,26 @@ class ResolveSessionCatalog( partition) case ShowCreateTableStatement(tbl, asSerde) if !asSerde => - val v1TableName = parseV1Table(tbl, "SHOW CREATE TABLE") - ShowCreateTableCommand(v1TableName.asTableIdentifier) + val name = parseTempViewOrV1Table(tbl, "SHOW CREATE TABLE") + ShowCreateTableCommand(name.asTableIdentifier) case ShowCreateTableStatement(tbl, asSerde) if asSerde => val v1TableName = parseV1Table(tbl, "SHOW CREATE TABLE AS SERDE") ShowCreateTableAsSerdeCommand(v1TableName.asTableIdentifier) case CacheTableStatement(tbl, plan, isLazy, options) => - val v1TableName = parseV1Table(tbl, "CACHE TABLE") - CacheTableCommand(v1TableName.asTableIdentifier, plan, isLazy, options) + val name = if (plan.isDefined) { + // CACHE TABLE ... 
AS SELECT creates a temp view with the input query. + // Temp view doesn't belong to any catalog and we shouldn't resolve catalog in the name. + tbl + } else { + parseTempViewOrV1Table(tbl, "CACHE TABLE") + } + CacheTableCommand(name.asTableIdentifier, plan, isLazy, options) case UncacheTableStatement(tbl, ifExists) => - val v1TableName = parseV1Table(tbl, "UNCACHE TABLE") - UncacheTableCommand(v1TableName.asTableIdentifier, ifExists) + val name = parseTempViewOrV1Table(tbl, "UNCACHE TABLE") + UncacheTableCommand(name.asTableIdentifier, ifExists) case TruncateTableStatement(tbl, partitionSpec) => val v1TableName = parseV1Table(tbl, "TRUNCATE TABLE") @@ -407,24 +433,27 @@ class ResolveSessionCatalog( partitionSpec) case ShowColumnsStatement(tbl, ns) => + if (ns.isDefined && ns.get.length > 1) { + throw new AnalysisException( + s"Namespace name should have only one part if specified: ${ns.get.quoted}") + } + // Use namespace only if table name doesn't specify it. If namespace is already specified + // in the table name, it's checked against the given namespace below. + val nameParts = if (ns.isDefined && tbl.length == 1) { + ns.get ++ tbl + } else { + tbl + } val sql = "SHOW COLUMNS" - val v1TableName = parseV1Table(tbl, sql).asTableIdentifier + val v1TableName = parseTempViewOrV1Table(nameParts, sql).asTableIdentifier val resolver = conf.resolver val db = ns match { - case Some(db) if (v1TableName.database.exists(!resolver(_, db.head))) => + case Some(db) if v1TableName.database.exists(!resolver(_, db.head)) => throw new AnalysisException( s"SHOW COLUMNS with conflicting databases: " + s"'${db.head}' != '${v1TableName.database.get}'") case _ => ns.map(_.head) } - if (ns.isDefined && ns.get.length > 1) { - throw new AnalysisException( - s"Namespace name should have only one part if specified: ${ns.get.quoted}") - } - if (tbl.length > 2) { - throw new AnalysisException( - s"Table name should have at most two parts: ${tbl.quoted}") - } ShowColumnsCommand(db, v1TableName) case AlterTableRecoverPartitionsStatement(tbl) => @@ -464,10 +493,10 @@ class ResolveSessionCatalog( serdeProperties, partitionSpec) - case AlterViewAsStatement(tbl, originalText, query) => - val v1TableName = parseV1Table(tbl, "ALTER VIEW QUERY") + case AlterViewAsStatement(name, originalText, query) => + val viewName = parseTempViewOrV1Table(name, "ALTER VIEW QUERY") AlterViewAsCommand( - v1TableName.asTableIdentifier, + viewName.asTableIdentifier, originalText, query) @@ -475,7 +504,12 @@ class ResolveSessionCatalog( tbl, userSpecifiedColumns, comment, properties, originalText, child, allowExisting, replace, viewType) => - val v1TableName = parseV1Table(tbl, "CREATE VIEW") + val v1TableName = if (viewType != PersistedView) { + // temp view doesn't belong to any catalog and we shouldn't resolve catalog in the name. 
+ tbl + } else { + parseV1Table(tbl, "CREATE VIEW") + } CreateViewCommand( v1TableName.asTableIdentifier, userSpecifiedColumns, @@ -487,59 +521,110 @@ class ResolveSessionCatalog( replace, viewType) - case ShowTableProperties(r: ResolvedTable, propertyKey) if isSessionCatalog(r.catalog) => + case ShowViews(resolved: ResolvedNamespace, pattern) => + resolved match { + case SessionCatalogAndNamespace(_, ns) => + // Fallback to v1 ShowViewsCommand since there is no view API in v2 catalog + assert(ns.nonEmpty) + if (ns.length != 1) { + throw new AnalysisException(s"The database name is not valid: ${ns.quoted}") + } + ShowViewsCommand(ns.head, pattern) + case _ => + throw new AnalysisException(s"Catalog ${resolved.catalog.name} doesn't support " + + "SHOW VIEWS, only SessionCatalog supports this command.") + } + + case ShowTableProperties( + r @ ResolvedTable(_, _, _: V1Table), propertyKey) if isSessionCatalog(r.catalog) => ShowTablePropertiesCommand(r.identifier.asTableIdentifier, propertyKey) - case DescribeFunctionStatement(CatalogAndIdentifier(catalog, ident), extended) => + case ShowTableProperties(r: ResolvedView, propertyKey) => + ShowTablePropertiesCommand(r.identifier.asTableIdentifier, propertyKey) + + case DescribeFunctionStatement(nameParts, extended) => val functionIdent = - parseSessionCatalogFunctionIdentifier("DESCRIBE FUNCTION", catalog, ident) + parseSessionCatalogFunctionIdentifier(nameParts, "DESCRIBE FUNCTION") DescribeFunctionCommand(functionIdent, extended) case ShowFunctionsStatement(userScope, systemScope, pattern, fun) => val (database, function) = fun match { - case Some(CatalogAndIdentifier(catalog, ident)) => + case Some(nameParts) => val FunctionIdentifier(fn, db) = - parseSessionCatalogFunctionIdentifier("SHOW FUNCTIONS", catalog, ident) + parseSessionCatalogFunctionIdentifier(nameParts, "SHOW FUNCTIONS") (db, Some(fn)) case None => (None, pattern) } ShowFunctionsCommand(database, function, userScope, systemScope) - case DropFunctionStatement(CatalogAndIdentifier(catalog, ident), ifExists, isTemp) => + case DropFunctionStatement(nameParts, ifExists, isTemp) => val FunctionIdentifier(function, database) = - parseSessionCatalogFunctionIdentifier("DROP FUNCTION", catalog, ident) + parseSessionCatalogFunctionIdentifier(nameParts, "DROP FUNCTION") DropFunctionCommand(database, function, ifExists, isTemp) - case CreateFunctionStatement(CatalogAndIdentifier(catalog, ident), + case CreateFunctionStatement(nameParts, className, resources, isTemp, ignoreIfExists, replace) => - val FunctionIdentifier(function, database) = - parseSessionCatalogFunctionIdentifier("CREATE FUNCTION", catalog, ident) - CreateFunctionCommand(database, function, className, resources, isTemp, ignoreIfExists, - replace) + if (isTemp) { + // temp func doesn't belong to any catalog and we shouldn't resolve catalog in the name. + val database = if (nameParts.length > 2) { + throw new AnalysisException(s"Unsupported function name '${nameParts.quoted}'") + } else if (nameParts.length == 2) { + Some(nameParts.head) + } else { + None + } + CreateFunctionCommand( + database, + nameParts.last, + className, + resources, + isTemp, + ignoreIfExists, + replace) + } else { + val FunctionIdentifier(function, database) = + parseSessionCatalogFunctionIdentifier(nameParts, "CREATE FUNCTION") + CreateFunctionCommand(database, function, className, resources, isTemp, ignoreIfExists, + replace) + } } + // TODO: move function related v2 statements to the new framework. 
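An aside on the temporary-function branch above: the name handling reduces to a small arity check on the multi-part name. A standalone sketch of that logic, with tempFunctionDatabase as a hypothetical helper rather than anything in this patch:

def tempFunctionDatabase(nameParts: Seq[String]): Option[String] = nameParts match {
  // e.g. Seq("f") resolves with no database qualifier
  case Seq(_) => None
  // e.g. Seq("db", "f") treats the first part as the database
  case Seq(db, _) => Some(db)
  // three or more parts cannot name a temporary function
  case _ => throw new IllegalArgumentException(
    s"Unsupported function name '${nameParts.mkString(".")}'")
}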
private def parseSessionCatalogFunctionIdentifier( - sql: String, - catalog: CatalogPlugin, - functionIdent: Identifier): FunctionIdentifier = { - if (isSessionCatalog(catalog)) { - functionIdent.asMultipartIdentifier match { - case Seq(db, fn) => FunctionIdentifier(fn, Some(db)) - case Seq(fn) => FunctionIdentifier(fn, None) - case _ => - throw new AnalysisException(s"Unsupported function name '${functionIdent.quoted}'") - } - } else { - throw new AnalysisException(s"$sql is only supported in v1 catalog") + nameParts: Seq[String], + sql: String): FunctionIdentifier = { + if (nameParts.length == 1 && isTempFunction(nameParts.head)) { + return FunctionIdentifier(nameParts.head) } - } - private def parseV1Table(tableName: Seq[String], sql: String): Seq[String] = { - val CatalogAndIdentifier(catalog, ident) = tableName - if (!isSessionCatalog(catalog)) { - throw new AnalysisException(s"$sql is only supported with v1 tables.") + nameParts match { + case SessionCatalogAndIdentifier(_, ident) => + if (nameParts.length == 1) { + // If there is only one name part, it means the current catalog is the session catalog. + // Here we don't fill the default database, to keep the error message unchanged for + // v1 commands. + FunctionIdentifier(nameParts.head, None) + } else { + ident.namespace match { + case Array(db) => FunctionIdentifier(ident.name, Some(db)) + case _ => + throw new AnalysisException(s"Unsupported function name '$ident'") + } + } + + case _ => throw new AnalysisException(s"$sql is only supported in v1 catalog") } - ident.asMultipartIdentifier + } + + private def parseV1Table(tableName: Seq[String], sql: String): Seq[String] = tableName match { + case SessionCatalogAndTable(_, tbl) => tbl + case _ => throw new AnalysisException(s"$sql is only supported with v1 tables.") + } + + private def parseTempViewOrV1Table( + nameParts: Seq[String], sql: String): Seq[String] = nameParts match { + case TempViewOrV1Table(name) => name + case _ => throw new AnalysisException(s"$sql is only supported with temp views or v1 tables.") } private def buildCatalogTable( @@ -583,6 +668,14 @@ class ResolveSessionCatalog( } } + object TempViewOrV1Table { + def unapply(nameParts: Seq[String]): Option[Seq[String]] = nameParts match { + case _ if isTempView(nameParts) => Some(nameParts) + case SessionCatalogAndIdentifier(_, tbl) => Some(tbl.asMultipartIdentifier) + case _ => None + } + } + object SessionCatalogAndNamespace { def unapply(resolved: ResolvedNamespace): Option[(CatalogPlugin, Seq[String])] = if (isSessionCatalog(resolved.catalog)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 413bd7b29cf45..cee8585e387a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, SubqueryExpression} import org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan, ResolvedHint} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.command.CommandUtils import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation} @@ -45,7 
+46,7 @@ case class CachedData(plan: LogicalPlan, cachedRepresentation: InMemoryRelation) * * Internal to Spark SQL. */ -class CacheManager extends Logging { +class CacheManager extends Logging with AdaptiveSparkPlanHelper { /** * Maintains the list of cached plans as an immutable sequence. Any updates to the list @@ -79,20 +80,16 @@ class CacheManager extends Logging { if (lookupCachedData(planToCache).nonEmpty) { logWarning("Asked to cache already cached data.") } else { - val sparkSession = query.sparkSession - val qe = sparkSession.sessionState.executePlan(planToCache) - val originalValue = sparkSession.sessionState.conf.getConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED) - val inMemoryRelation = try { - // Avoiding changing the output partitioning, here disable AQE. - sparkSession.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + // Turn off AQE so that the outputPartitioning of the underlying plan can be leveraged. + val sessionWithAqeOff = getOrCloneSessionWithAqeOff(query.sparkSession) + val inMemoryRelation = sessionWithAqeOff.withActive { + val qe = sessionWithAqeOff.sessionState.executePlan(planToCache) InMemoryRelation( - sparkSession.sessionState.conf.useCompression, - sparkSession.sessionState.conf.columnBatchSize, storageLevel, + sessionWithAqeOff.sessionState.conf.useCompression, + sessionWithAqeOff.sessionState.conf.columnBatchSize, storageLevel, qe.executedPlan, tableName, optimizedPlan = qe.optimizedPlan) - } finally { - sparkSession.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, originalValue) } this.synchronized { @@ -192,12 +189,16 @@ class CacheManager extends Logging { // Remove the cache entry before creating a new ones. cachedData = cachedData.filterNot(cd => needToRecache.exists(_ eq cd)) } - needToRecache.map { cd => + needToRecache.foreach { cd => cd.cachedRepresentation.cacheBuilder.clearCache() - val qe = spark.sessionState.executePlan(cd.plan) - val newCache = InMemoryRelation( - cacheBuilder = cd.cachedRepresentation.cacheBuilder.copy(cachedPlan = qe.executedPlan), - optimizedPlan = qe.optimizedPlan) + // Turn off AQE so that the outputPartitioning of the underlying plan can be leveraged. + val sessionWithAqeOff = getOrCloneSessionWithAqeOff(spark) + val newCache = sessionWithAqeOff.withActive { + val qe = sessionWithAqeOff.sessionState.executePlan(cd.plan) + InMemoryRelation( + cacheBuilder = cd.cachedRepresentation.cacheBuilder.copy(cachedPlan = qe.executedPlan), + optimizedPlan = qe.optimizedPlan) + } val recomputedPlan = cd.copy(cachedRepresentation = newCache) this.synchronized { if (lookupCachedData(recomputedPlan.plan).nonEmpty) { @@ -247,12 +248,17 @@ class CacheManager extends Logging { * `HadoopFsRelation` node(s) as part of its logical plan. */ def recacheByPath(spark: SparkSession, resourcePath: String): Unit = { - val (fs, qualifiedPath) = { - val path = new Path(resourcePath) - val fs = path.getFileSystem(spark.sessionState.newHadoopConf()) - (fs, fs.makeQualified(path)) - } + val path = new Path(resourcePath) + val fs = path.getFileSystem(spark.sessionState.newHadoopConf()) + recacheByPath(spark, path, fs) + } + /** + * Tries to re-cache all the cache entries that contain `resourcePath` in one or more + * `HadoopFsRelation` node(s) as part of its logical plan. 
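A note on getOrCloneSessionWithAqeOff, which the caching paths above now rely on: it can be pictured as returning the session unchanged when AQE is already disabled, and otherwise cloning the session and pinning the flag off on the clone. The sketch below is a rough approximation under that assumption, not the helper's actual source (cloneSession and sessionState are private[sql], so this is illustrative only):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

def cloneWithAqeOff(session: SparkSession): SparkSession = {
  if (!session.sessionState.conf.getConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED)) {
    // AQE is already off, so the session can be used as-is.
    session
  } else {
    // Clone so the flag change does not leak into the user's session.
    val cloned = session.cloneSession()
    cloned.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false)
    cloned
  }
}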
+ */ + def recacheByPath(spark: SparkSession, resourcePath: Path, fs: FileSystem): Unit = { + val qualifiedPath = fs.makeQualified(resourcePath) recacheByCondition(spark, _.plan.find(lookupAndRefresh(_, fs, qualifiedPath)).isDefined) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala index e482bc9941ea9..e1b9c8f430c56 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala @@ -87,7 +87,7 @@ object CollectMetricsExec { * Recursively collect all collected metrics from a query tree. */ def collect(plan: SparkPlan): Map[String, Row] = { - val metrics = plan.collectInPlanAndSubqueries { + val metrics = plan.collectWithSubqueries { case collector: CollectMetricsExec => collector.name -> collector.collectedMetrics } metrics.toMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 0d759085a7e2c..e2bf132a2e18f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -75,10 +75,10 @@ trait DataSourceScanExec extends LeafExecNode { } s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Output: ${producedAttributes.mkString("[", ", ", "]")} + |$formattedNodeName + |${ExplainUtils.generateFieldString("Output", output)} |${metadataStr.mkString("\n")} - """.stripMargin + |""".stripMargin } /** @@ -175,7 +175,7 @@ case class FileSourceScanExec( private lazy val needsUnsafeRowConversion: Boolean = { if (relation.fileFormat.isInstanceOf[ParquetSource]) { - SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled + sqlContext.conf.parquetVectorizedReaderEnabled } else { false } @@ -209,9 +209,6 @@ case class FileSourceScanExec( val ret = relation.location.listFiles( partitionFilters.filterNot(isDynamicPruningFilter), dataFilters) - if (relation.partitionSchemaOption.isDefined) { - driverMetrics("numPartitions") = ret.length - } setFilesNumAndSizeMetric(ret, true) val timeTakenMs = NANOSECONDS.toMillis( (System.nanoTime() - startTime) + optimizerMetadataTimeNs) @@ -253,79 +250,83 @@ case class FileSourceScanExec( partitionFilters.exists(ExecSubqueryExpression.hasSubquery) } - override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = { - val bucketSpec = if (relation.sparkSession.sessionState.conf.bucketingEnabled) { - relation.bucketSpec + private def toAttribute(colName: String): Option[Attribute] = + output.find(_.name == colName) + + // exposed for testing + lazy val bucketedScan: Boolean = { + if (relation.sparkSession.sessionState.conf.bucketingEnabled && relation.bucketSpec.isDefined) { + val spec = relation.bucketSpec.get + val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n)) + bucketColumns.size == spec.bucketColumnNames.size } else { - None + false } - bucketSpec match { - case Some(spec) => - // For bucketed columns: - // ----------------------- - // `HashPartitioning` would be used only when: - // 1. ALL the bucketing columns are being read from the table - // - // For sorted columns: - // --------------------- - // Sort ordering should be used when ALL these criteria's match: - // 1. 
`HashPartitioning` is being used - // 2. A prefix (or all) of the sort columns are being read from the table. - // - // Sort ordering would be over the prefix subset of `sort columns` being read - // from the table. - // eg. - // Assume (col0, col2, col3) are the columns read from the table - // If sort columns are (col0, col1), then sort ordering would be considered as (col0) - // If sort columns are (col1, col0), then sort ordering would be empty as per rule #2 - // above - - def toAttribute(colName: String): Option[Attribute] = - output.find(_.name == colName) - - val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n)) - if (bucketColumns.size == spec.bucketColumnNames.size) { - val partitioning = HashPartitioning(bucketColumns, spec.numBuckets) - val sortColumns = - spec.sortColumnNames.map(x => toAttribute(x)).takeWhile(x => x.isDefined).map(_.get) - val shouldCalculateSortOrder = - conf.getConf(SQLConf.LEGACY_BUCKETED_TABLE_SCAN_OUTPUT_ORDERING) && - sortColumns.nonEmpty && - !hasPartitionsAvailableAtRunTime - - val sortOrder = if (shouldCalculateSortOrder) { - // In case of bucketing, its possible to have multiple files belonging to the - // same bucket in a given relation. Each of these files are locally sorted - // but those files combined together are not globally sorted. Given that, - // the RDD partition will not be sorted even if the relation has sort columns set - // Current solution is to check if all the buckets have a single file in it - - val files = selectedPartitions.flatMap(partition => partition.files) - val bucketToFilesGrouping = - files.map(_.getPath.getName).groupBy(file => BucketingUtils.getBucketId(file)) - val singleFilePartitions = bucketToFilesGrouping.forall(p => p._2.length <= 1) - - if (singleFilePartitions) { - // TODO Currently Spark does not support writing columns sorting in descending order - // so using Ascending order. This can be fixed in future - sortColumns.map(attribute => SortOrder(attribute, Ascending)) - } else { - Nil - } - } else { - Nil - } - (partitioning, sortOrder) + } + + override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = { + if (bucketedScan) { + // For bucketed columns: + // ----------------------- + // `HashPartitioning` would be used only when: + // 1. ALL the bucketing columns are being read from the table + // + // For sorted columns: + // --------------------- + // Sort ordering should be used when ALL these criteria's match: + // 1. `HashPartitioning` is being used + // 2. A prefix (or all) of the sort columns are being read from the table. + // + // Sort ordering would be over the prefix subset of `sort columns` being read + // from the table. + // eg. 
+ // Assume (col0, col2, col3) are the columns read from the table + // If sort columns are (col0, col1), then sort ordering would be considered as (col0) + // If sort columns are (col1, col0), then sort ordering would be empty as per rule #2 + // above + val spec = relation.bucketSpec.get + val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n)) + val partitioning = HashPartitioning(bucketColumns, spec.numBuckets) + val sortColumns = + spec.sortColumnNames.map(x => toAttribute(x)).takeWhile(x => x.isDefined).map(_.get) + val shouldCalculateSortOrder = + conf.getConf(SQLConf.LEGACY_BUCKETED_TABLE_SCAN_OUTPUT_ORDERING) && + sortColumns.nonEmpty && + !hasPartitionsAvailableAtRunTime + + val sortOrder = if (shouldCalculateSortOrder) { + // In case of bucketing, its possible to have multiple files belonging to the + // same bucket in a given relation. Each of these files are locally sorted + // but those files combined together are not globally sorted. Given that, + // the RDD partition will not be sorted even if the relation has sort columns set + // Current solution is to check if all the buckets have a single file in it + + val files = selectedPartitions.flatMap(partition => partition.files) + val bucketToFilesGrouping = + files.map(_.getPath.getName).groupBy(file => BucketingUtils.getBucketId(file)) + val singleFilePartitions = bucketToFilesGrouping.forall(p => p._2.length <= 1) + + if (singleFilePartitions) { + // TODO Currently Spark does not support writing columns sorting in descending order + // so using Ascending order. This can be fixed in future + sortColumns.map(attribute => SortOrder(attribute, Ascending)) } else { - (UnknownPartitioning(0), Nil) + Nil } - case _ => - (UnknownPartitioning(0), Nil) + } else { + Nil + } + (partitioning, sortOrder) + } else { + (UnknownPartitioning(0), Nil) } } @transient - private lazy val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) + private lazy val pushedDownFilters = { + val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation) + dataFilters.flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) + } override lazy val metadata: Map[String, String] = { def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]") @@ -376,10 +377,10 @@ case class FileSourceScanExec( } s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Output: ${producedAttributes.mkString("[", ", ", "]")} + |$formattedNodeName + |${ExplainUtils.generateFieldString("Output", output)} |${metadataStr.mkString("\n")} - """.stripMargin + |""".stripMargin } lazy val inputRDD: RDD[InternalRow] = { @@ -393,11 +394,11 @@ case class FileSourceScanExec( options = relation.options, hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options)) - val readRDD = relation.bucketSpec match { - case Some(bucketing) if relation.sparkSession.sessionState.conf.bucketingEnabled => - createBucketedReadRDD(bucketing, readFile, dynamicallySelectedPartitions, relation) - case _ => - createNonBucketedReadRDD(readFile, dynamicallySelectedPartitions, relation) + val readRDD = if (bucketedScan) { + createBucketedReadRDD(relation.bucketSpec.get, readFile, dynamicallySelectedPartitions, + relation) + } else { + createNonBucketedReadRDD(readFile, dynamicallySelectedPartitions, relation) } sendDriverMetrics() readRDD @@ -408,7 +409,7 @@ case class FileSourceScanExec( } /** SQL metrics generated only for scans using dynamic partition pruning. 
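Stepping back from the bucketing comments above: the sort order is only trusted when every bucket maps to at most one file, because per-file local sort order does not compose across multiple files in the same bucket. That check, reduced to a standalone predicate (names here are illustrative; the real code uses BucketingUtils.getBucketId):

// True only when no bucket id is backed by more than one file, in which case the
// per-file local sort order also holds for the whole RDD partition.
def allBucketsHaveSingleFile(
    fileNames: Seq[String],
    bucketIdOf: String => Option[Int]): Boolean = {
  fileNames.groupBy(bucketIdOf).forall { case (_, files) => files.length <= 1 }
}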
*/ - private lazy val staticMetrics = if (partitionFilters.filter(isDynamicPruningFilter).nonEmpty) { + private lazy val staticMetrics = if (partitionFilters.exists(isDynamicPruningFilter)) { Map("staticFilesNum" -> SQLMetrics.createMetric(sparkContext, "static number of files read"), "staticFilesSize" -> SQLMetrics.createSizeMetric(sparkContext, "static size of files read")) } else { @@ -421,22 +422,23 @@ case class FileSourceScanExec( static: Boolean): Unit = { val filesNum = partitions.map(_.files.size.toLong).sum val filesSize = partitions.map(_.files.map(_.getLen).sum).sum - if (!static || partitionFilters.filter(isDynamicPruningFilter).isEmpty) { + if (!static || !partitionFilters.exists(isDynamicPruningFilter)) { driverMetrics("numFiles") = filesNum driverMetrics("filesSize") = filesSize } else { driverMetrics("staticFilesNum") = filesNum driverMetrics("staticFilesSize") = filesSize } + if (relation.partitionSchemaOption.isDefined) { + driverMetrics("numPartitions") = partitions.length + } } override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files read"), "metadataTime" -> SQLMetrics.createTimingMetric(sparkContext, "metadata time"), - "filesSize" -> SQLMetrics.createSizeMetric(sparkContext, "size of files read"), - "pruningTime" -> - SQLMetrics.createTimingMetric(sparkContext, "dynamic partition pruning time") + "filesSize" -> SQLMetrics.createSizeMetric(sparkContext, "size of files read") ) ++ { // Tracking scan time has overhead, we can't afford to do it for each row, and can only do // it for each batch. @@ -447,9 +449,12 @@ case class FileSourceScanExec( } } ++ { if (relation.partitionSchemaOption.isDefined) { - Some("numPartitions" -> SQLMetrics.createMetric(sparkContext, "number of partitions read")) + Map( + "numPartitions" -> SQLMetrics.createMetric(sparkContext, "number of partitions read"), + "pruningTime" -> + SQLMetrics.createTimingMetric(sparkContext, "dynamic partition pruning time")) } else { - None + Map.empty[String, SQLMetric] } } ++ staticMetrics @@ -585,12 +590,20 @@ case class FileSourceScanExec( new FileScanRDD(fsRelation.sparkSession, readFile, partitions) } + // Filters unused DynamicPruningExpression expressions - one which has been replaced + // with DynamicPruningExpression(Literal.TrueLiteral) during Physical Planning + private def filterUnusedDynamicPruningExpressions( + predicates: Seq[Expression]): Seq[Expression] = { + predicates.filterNot(_ == DynamicPruningExpression(Literal.TrueLiteral)) + } + override def doCanonicalize(): FileSourceScanExec = { FileSourceScanExec( relation, output.map(QueryPlan.normalizeExpressions(_, output)), requiredSchema, - QueryPlan.normalizePredicates(partitionFilters, output), + QueryPlan.normalizePredicates( + filterUnusedDynamicPruningExpressions(partitionFilters), output), optionalBucketSet, QueryPlan.normalizePredicates(dataFilters, output), None) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala index d4fe272f8c95f..b54bd6a579b66 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala @@ -23,9 +23,9 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Expression, PlanExpression} 
import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.trees.TreeNodeTag +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, QueryStageExec} -object ExplainUtils { +object ExplainUtils extends AdaptiveSparkPlanHelper { /** * Given a input physical plan, performs the following tasks. * 1. Computes the operator id for current operator and records it in the operaror @@ -145,15 +145,26 @@ object ExplainUtils { case p: WholeStageCodegenExec => case p: InputAdapter => case other: QueryPlan[_] => - if (!other.getTagValue(QueryPlan.OP_ID_TAG).isDefined) { + + def setOpId(): Unit = if (other.getTagValue(QueryPlan.OP_ID_TAG).isEmpty) { currentOperationID += 1 other.setTagValue(QueryPlan.OP_ID_TAG, currentOperationID) operatorIDs += ((currentOperationID, other)) } - other.innerChildren.foreach { plan => - currentOperationID = generateOperatorIDs(plan, - currentOperationID, - operatorIDs) + + other match { + case p: AdaptiveSparkPlanExec => + currentOperationID = + generateOperatorIDs(p.executedPlan, currentOperationID, operatorIDs) + setOpId() + case p: QueryStageExec => + currentOperationID = generateOperatorIDs(p.plan, currentOperationID, operatorIDs) + setOpId() + case _ => + setOpId() + other.innerChildren.foldLeft(currentOperationID) { + (curId, plan) => generateOperatorIDs(plan, curId, operatorIDs) + } } } currentOperationID @@ -164,24 +175,39 @@ object ExplainUtils { * whole stage code gen id in the plan via setting a tag. */ private def generateWholeStageCodegenIds(plan: QueryPlan[_]): Unit = { + var currentCodegenId = -1 + + def setCodegenId(p: QueryPlan[_], children: Seq[QueryPlan[_]]): Unit = { + if (currentCodegenId != -1) { + p.setTagValue(QueryPlan.CODEGEN_ID_TAG, currentCodegenId) + } + children.foreach(generateWholeStageCodegenIds) + } + // Skip the subqueries as they are not printed as part of main query block. if (plan.isInstanceOf[BaseSubqueryExec]) { return } - var currentCodegenId = -1 plan.foreach { case p: WholeStageCodegenExec => currentCodegenId = p.codegenStageId - case p: InputAdapter => currentCodegenId = -1 - case other: QueryPlan[_] => - if (currentCodegenId != -1) { - other.setTagValue(QueryPlan.CODEGEN_ID_TAG, currentCodegenId) - } - other.innerChildren.foreach { plan => - generateWholeStageCodegenIds(plan) - } + case _: InputAdapter => currentCodegenId = -1 + case p: AdaptiveSparkPlanExec => setCodegenId(p, Seq(p.executedPlan)) + case p: QueryStageExec => setCodegenId(p, Seq(p.plan)) + case other: QueryPlan[_] => setCodegenId(other, other.innerChildren) } } + /** + * Generate detailed field string with different format based on type of input value + */ + def generateFieldString(fieldName: String, values: Any): String = values match { + case iter: Iterable[_] if (iter.size == 0) => s"${fieldName}: []" + case iter: Iterable[_] => s"${fieldName} [${iter.size}]: ${iter.mkString("[", ", ", "]")}" + case str: String if (str == null || str.isEmpty) => s"${fieldName}: None" + case str: String => s"${fieldName}: ${str}" + case _ => throw new IllegalArgumentException(s"Unsupported type for argument values: $values") + } + /** * Given a input plan, returns an array of tuples comprising of : * 1. Hosting opeator id. @@ -207,28 +233,23 @@ object ExplainUtils { /** * Returns the operator identifier for the supplied plan by retrieving the - * `operationId` tag value.` + * `operationId` tag value. 
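For reference, the generateFieldString helper introduced above is what produces lines such as `Output [2]: [id#1L, name#2]` in the formatted explain output. A simplified standalone restatement (a sketch, not the helper itself) with example results:

def fieldString(fieldName: String, values: Any): String = values match {
  case vals: Iterable[_] if vals.isEmpty => s"$fieldName: []"
  case vals: Iterable[_] => s"$fieldName [${vals.size}]: ${vals.mkString("[", ", ", "]")}"
  case str: String if str.isEmpty => s"$fieldName: None"
  case str: String => s"$fieldName: $str"
  case other => throw new IllegalArgumentException(s"Unsupported type for values: $other")
}

// fieldString("Output", Seq("id#1L", "name#2"))  returns "Output [2]: [id#1L, name#2]"
// fieldString("Output", Nil)                     returns "Output: []"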
*/ def getOpId(plan: QueryPlan[_]): String = { plan.getTagValue(QueryPlan.OP_ID_TAG).map(v => s"$v").getOrElse("unknown") } - /** - * Returns the operator identifier for the supplied plan by retrieving the - * `codegenId` tag value.` - */ - def getCodegenId(plan: QueryPlan[_]): String = { - plan.getTagValue(QueryPlan.CODEGEN_ID_TAG).map(v => s"[codegen id : $v]").getOrElse("") - } - def removeTags(plan: QueryPlan[_]): Unit = { + def remove(p: QueryPlan[_], children: Seq[QueryPlan[_]]): Unit = { + p.unsetTagValue(QueryPlan.OP_ID_TAG) + p.unsetTagValue(QueryPlan.CODEGEN_ID_TAG) + children.foreach(removeTags) + } + plan foreach { - case plan: QueryPlan[_] => - plan.unsetTagValue(QueryPlan.OP_ID_TAG) - plan.unsetTagValue(QueryPlan.CODEGEN_ID_TAG) - plan.innerChildren.foreach { p => - removeTags(p) - } + case p: AdaptiveSparkPlanExec => remove(p, Seq(p.executedPlan)) + case p: QueryStageExec => remove(p, Seq(p.plan)) + case plan: QueryPlan[_] => remove(plan, plan.innerChildren) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala index bbe47a63f4d61..9f99bf5011569 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala @@ -19,10 +19,12 @@ package org.apache.spark.sql.execution import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} +import java.time.{Instant, LocalDate, ZoneOffset} import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} -import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand} +import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, LegacyDateFormats, TimestampFormatter} +import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand, ShowViewsCommand} +import org.apache.spark.sql.execution.datasources.v2.{DescribeTableExec, ShowTablesExec} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -37,18 +39,21 @@ object HiveResult { */ def hiveResultString(executedPlan: SparkPlan): Seq[String] = executedPlan match { case ExecutedCommandExec(_: DescribeCommandBase) => - // If it is a describe command for a Hive table, we want to have the output format - // be similar with Hive. - executedPlan.executeCollectPublic().map { - case Row(name: String, dataType: String, comment) => - Seq(name, dataType, - Option(comment.asInstanceOf[String]).getOrElse("")) - .map(s => String.format(s"%-20s", s)) - .mkString("\t") - } - // SHOW TABLES in Hive only output table names, while ours output database, table name, isTemp. + formatDescribeTableOutput(executedPlan.executeCollectPublic()) + case _: DescribeTableExec => + formatDescribeTableOutput(executedPlan.executeCollectPublic()) + // SHOW TABLES in Hive only output table names while our v1 command outputs + // database, table name, isTemp. case command @ ExecutedCommandExec(s: ShowTablesCommand) if !s.isExtended => command.executeCollect().map(_.getString(1)) + // SHOW TABLES in Hive only output table names while our v2 command outputs + // namespace and table name. 
+ case command : ShowTablesExec => + command.executeCollect().map(_.getString(1)) + // SHOW VIEWS in Hive only outputs view names while our v1 command outputs + // namespace, viewName, and isTemporary. + case command @ ExecutedCommandExec(_: ShowViewsCommand) => + command.executeCollect().map(_.getString(1)) case other => val result: Seq[Seq[Any]] = other.executeCollectPublic().map(_.toSeq).toSeq // We need the types so we can output struct field names @@ -58,17 +63,42 @@ object HiveResult { .map(_.mkString("\t")) } - private lazy val zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone) - private lazy val dateFormatter = DateFormatter(zoneId) - private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId) + private def formatDescribeTableOutput(rows: Array[Row]): Seq[String] = { + rows.map { + case Row(name: String, dataType: String, comment) => + Seq(name, dataType, Option(comment.asInstanceOf[String]).getOrElse("")) + .map(s => String.format(s"%-20s", s)) + .mkString("\t") + } + } + + // We can create the date formatter only once because it does not depend on Spark's + // session time zone controlled by the SQL config `spark.sql.session.timeZone`. + // The `zoneId` parameter is used only in parsing of special date values like `now`, + // `yesterday` and etc. but not in date formatting. While formatting of: + // - `java.time.LocalDate`, zone id is not used by `DateTimeFormatter` at all. + // - `java.sql.Date`, the date formatter delegates formatting to the legacy formatter + // which uses the default system time zone `TimeZone.getDefault`. This works correctly + // due to `DateTimeUtils.toJavaDate` which is based on the system time zone too. + private val dateFormatter = DateFormatter( + format = DateFormatter.defaultPattern, + // We can set any time zone id. UTC was taken for simplicity. + zoneId = ZoneOffset.UTC, + locale = DateFormatter.defaultLocale, + // Use `FastDateFormat` as the legacy formatter because it is thread-safe. + legacyFormat = LegacyDateFormats.FAST_DATE_FORMAT, + isParsing = false) + private def timestampFormatter = TimestampFormatter.getFractionFormatter( + DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)) /** Formats a datum (based on the given data type) and returns the string representation. 
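The datum formatting that follows gains extra cases because collected values can arrive either as the legacy java.sql.Date/java.sql.Timestamp types or as java.time.LocalDate/java.time.Instant (for example when the Java 8 datetime API is enabled). A rough standalone illustration of that dual handling, with the formatter parameters standing in for the real DateFormatter and TimestampFormatter:

import java.sql.{Date, Timestamp}
import java.time.{Instant, LocalDate}

def renderDatum(
    value: Any,
    formatDate: LocalDate => String,
    formatTimestamp: Instant => String): String = value match {
  case d: Date => formatDate(d.toLocalDate)           // legacy external date type
  case ld: LocalDate => formatDate(ld)                // java.time external date type
  case t: Timestamp => formatTimestamp(t.toInstant)   // legacy external timestamp type
  case i: Instant => formatTimestamp(i)               // java.time external timestamp type
  case other => String.valueOf(other)
}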
*/ def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match { case (null, _) => if (nested) "null" else "NULL" case (b, BooleanType) => b.toString - case (d: Date, DateType) => dateFormatter.format(DateTimeUtils.fromJavaDate(d)) - case (t: Timestamp, TimestampType) => - timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t)) + case (d: Date, DateType) => dateFormatter.format(d) + case (ld: LocalDate, DateType) => dateFormatter.format(ld) + case (t: Timestamp, TimestampType) => timestampFormatter.format(t) + case (i: Instant, TimestampType) => timestampFormatter.format(i) case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) case (decimal: java.math.BigDecimal, DecimalType()) => decimal.toPlainString case (n, _: NumericType) => n.toString diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala index 1b5115f2e29a3..b452213cd6cc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala @@ -45,10 +45,14 @@ case class LocalTableScanExec( } } - private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), - sqlContext.sparkContext.defaultParallelism) - - private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) + @transient private lazy val rdd: RDD[InternalRow] = { + if (rows.isEmpty) { + sqlContext.sparkContext.emptyRDD + } else { + val numSlices = math.min(unsafeRows.length, sqlContext.sparkContext.defaultParallelism) + sqlContext.sparkContext.parallelize(unsafeRows, numSlices) + } + } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala index 45e5f415e8da1..492d177c7c773 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuery.scala @@ -117,7 +117,7 @@ case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[Logic case a: AttributeReference => a.withName(relation.output.find(_.semanticEquals(a)).get.name) } - } + }.filterNot(SubqueryExpression.hasSubquery) child transform { case plan if plan eq relation => @@ -133,10 +133,14 @@ case class OptimizeMetadataOnlyQuery(catalog: SessionCatalog) extends Rule[Logic CaseInsensitiveMap(relation.tableMeta.storage.properties) val timeZoneId = caseInsensitiveProperties.get(DateTimeUtils.TIMEZONE_OPTION) .getOrElse(SQLConf.get.sessionLocalTimeZone) - val partitions = if (partFilters.nonEmpty) { - catalog.listPartitionsByFilter(relation.tableMeta.identifier, normalizedFilters) - } else { - catalog.listPartitions(relation.tableMeta.identifier) + val partitions = relation.prunedPartitions match { + // for the case where partitions have already been pruned by PruneHiveTablePartitions + case Some(parts) => parts + case None => if (partFilters.nonEmpty) { + catalog.listPartitionsByFilter(relation.tableMeta.identifier, normalizedFilters) + } else { + catalog.listPartitions(relation.tableMeta.identifier) + } } val partitionData = partitions.map { p => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 38ef66682c413..ed36c78a77460 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -32,8 +32,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat import org.apache.spark.sql.catalyst.util.truncatedString -import org.apache.spark.sql.dynamicpruning.PlanDynamicPruningFilters import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan} +import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange} import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata} import org.apache.spark.sql.internal.SQLConf @@ -63,13 +63,12 @@ class QueryExecution( } } - lazy val analyzed: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.ANALYSIS) { - SparkSession.setActiveSession(sparkSession) + lazy val analyzed: LogicalPlan = executePhase(QueryPlanningTracker.ANALYSIS) { // We can't clone `logical` here, which will reset the `_analyzed` flag. sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) } - lazy val withCachedData: LogicalPlan = { + lazy val withCachedData: LogicalPlan = sparkSession.withActive { assertAnalyzed() assertSupported() // clone the plan to avoid sharing the plan instance between different stages like analyzing, @@ -77,23 +76,41 @@ class QueryExecution( sparkSession.sharedState.cacheManager.useCachedData(analyzed.clone()) } - lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) { + lazy val optimizedPlan: LogicalPlan = executePhase(QueryPlanningTracker.OPTIMIZATION) { // clone the plan to avoid sharing the plan instance between different stages like analyzing, // optimizing and planning. - sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker) + val plan = sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker) + // We do not want optimized plans to be re-analyzed as literals that have been constant folded + // and such can cause issues during analysis. While `clone` should maintain the `analyzed` state + // of the LogicalPlan, we set the plan as analyzed here as well out of paranoia. + plan.setAnalyzed() + plan } - lazy val sparkPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) { - // Clone the logical plan here, in case the planner rules change the states of the logical plan. - QueryExecution.createSparkPlan(sparkSession, planner, optimizedPlan.clone()) + private def assertOptimized(): Unit = optimizedPlan + + lazy val sparkPlan: SparkPlan = { + // We need to materialize the optimizedPlan here because sparkPlan is also tracked under + // the planning phase + assertOptimized() + executePhase(QueryPlanningTracker.PLANNING) { + // Clone the logical plan here, in case the planner rules change the states of the logical + // plan. + QueryExecution.createSparkPlan(sparkSession, planner, optimizedPlan.clone()) + } } // executedPlan should not be used to initialize any SparkPlan. It should be // only used for execution. 
- lazy val executedPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) { - // clone the plan to avoid sharing the plan instance between different stages like analyzing, - // optimizing and planning. - QueryExecution.prepareForExecution(preparations, sparkPlan.clone()) + lazy val executedPlan: SparkPlan = { + // We need to materialize the optimizedPlan here, before tracking the planning phase, to ensure + // that the optimization time is not counted as part of the planning phase. + assertOptimized() + executePhase(QueryPlanningTracker.PLANNING) { + // clone the plan to avoid sharing the plan instance between different stages like analyzing, + // optimizing and planning. + QueryExecution.prepareForExecution(preparations, sparkPlan.clone()) + } } /** @@ -113,7 +130,12 @@ class QueryExecution( def observedMetrics: Map[String, Row] = CollectMetricsExec.collect(executedPlan) protected def preparations: Seq[Rule[SparkPlan]] = { - QueryExecution.preparations(sparkSession) + QueryExecution.preparations(sparkSession, + Option(InsertAdaptiveSparkPlan(AdaptiveExecutionContext(sparkSession, this)))) + } + + protected def executePhase[T](phase: String)(block: => T): T = sparkSession.withActive { + tracker.measurePhase(phase)(block) } def simpleString: String = simpleString(false) @@ -271,20 +293,26 @@ object QueryExecution { * are correct, insert whole stage code gen, and try to reduce the work done by reusing exchanges * and subqueries. */ - private[execution] def preparations(sparkSession: SparkSession): Seq[Rule[SparkPlan]] = + private[execution] def preparations( + sparkSession: SparkSession, + adaptiveExecutionRule: Option[InsertAdaptiveSparkPlan] = None): Seq[Rule[SparkPlan]] = { + // `AdaptiveSparkPlanExec` is a leaf node. If inserted, all the following rules will be no-op + // as the original plan is hidden behind `AdaptiveSparkPlanExec`. + adaptiveExecutionRule.toSeq ++ Seq( - // `AdaptiveSparkPlanExec` is a leaf node. If inserted, all the following rules will be no-op - // as the original plan is hidden behind `AdaptiveSparkPlanExec`. - InsertAdaptiveSparkPlan(AdaptiveExecutionContext(sparkSession)), PlanDynamicPruningFilters(sparkSession), PlanSubqueries(sparkSession), EnsureRequirements(sparkSession.sessionState.conf), + // `RemoveRedundantSorts` needs to be added before `EnsureRequirements` to guarantee the same + // number of partitions when instantiating PartitioningCollection. + RemoveRedundantSorts(sparkSession.sessionState.conf), ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf, sparkSession.sessionState.columnarRules), CollapseCodegenStages(sparkSession.sessionState.conf), ReuseExchange(sparkSession.sessionState.conf), ReuseSubquery(sparkSession.sessionState.conf) ) + } /** * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal @@ -305,7 +333,6 @@ object QueryExecution { sparkSession: SparkSession, planner: SparkPlanner, plan: LogicalPlan): SparkPlan = { - SparkSession.setActiveSession(sparkSession) // TODO: We use next(), i.e. take the first plan returned by the planner, here for now, // but we will implement to choose the best plan. 
planner.plan(ReturnAnswer(plan)).next() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantSorts.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantSorts.scala new file mode 100644 index 0000000000000..71f36c8c1dd5a --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantSorts.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.catalyst.expressions.SortOrder +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.internal.SQLConf + +/** + * Remove redundant SortExec node from the spark plan. A sort node is redundant when + * its child satisfies both its sort orders and its required child distribution. Note + * this rule differs from the Optimizer rule EliminateSorts in that this rule also checks + * if the child satisfies the required distribution so that it is safe to remove not only a + * local sort but also a global sort when its child already satisfies required sort orders. + */ +case class RemoveRedundantSorts(conf: SQLConf) extends Rule[SparkPlan] { + def apply(plan: SparkPlan): SparkPlan = { + if (!conf.getConf(SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED)) { + plan + } else { + removeSorts(plan) + } + } + + private def removeSorts(plan: SparkPlan): SparkPlan = plan transform { + case s @ SortExec(orders, _, child, _) + if SortOrder.orderingSatisfies(child.outputOrdering, orders) && + child.outputPartitioning.satisfies(s.requiredChildDistribution.head) => + child + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index 995d94ef5eac7..5e4f30a5edaf1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -17,11 +17,9 @@ package org.apache.spark.sql.execution -import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future => JFuture} import java.util.concurrent.atomic.AtomicLong -import scala.concurrent.{ExecutionContext, Future} - import org.apache.spark.SparkContext import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.sql.SparkSession @@ -62,9 +60,9 @@ object SQLExecution { * we can connect them with an execution. 
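A usage-level note on the new RemoveRedundantSorts rule added above: with SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED on, a local sort whose child already provides both the ordering and the required distribution is the kind of node it targets. A sketch, assuming an active `spark` session (a sort like this may also be removed earlier by the logical EliminateSorts rule; the point is only to show when a sort is redundant):

import spark.implicits._

val base = spark.range(0, 100)
  .repartition($"id")            // child partitioning already satisfies the distribution
  .sortWithinPartitions("id")    // child output is already ordered by id

// The second local sort adds no new ordering or distribution requirement, so it is
// redundant by the rule's definition and can be dropped from the physical plan.
base.sortWithinPartitions("id").explain()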
*/ def withNewExecutionId[T]( - sparkSession: SparkSession, queryExecution: QueryExecution, - name: Option[String] = None)(body: => T): T = { + name: Option[String] = None)(body: => T): T = queryExecution.sparkSession.withActive { + val sparkSession = queryExecution.sparkSession val sc = sparkSession.sparkContext val oldExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) val executionId = SQLExecution.nextExecutionId @@ -172,14 +170,24 @@ object SQLExecution { * SparkContext local properties are forwarded to execution thread */ def withThreadLocalCaptured[T]( - sparkSession: SparkSession, exec: ExecutionContext)(body: => T): Future[T] = { + sparkSession: SparkSession, exec: ExecutorService) (body: => T): JFuture[T] = { val activeSession = sparkSession val sc = sparkSession.sparkContext val localProps = Utils.cloneProperties(sc.getLocalProperties) - Future { + exec.submit(() => { + val originalSession = SparkSession.getActiveSession + val originalLocalProps = sc.getLocalProperties SparkSession.setActiveSession(activeSession) sc.setLocalProperties(localProps) - body - }(exec) + val res = body + // reset active session and local props. + sc.setLocalProperties(originalLocalProps) + if (originalSession.nonEmpty) { + SparkSession.setActiveSession(originalSession.get) + } else { + SparkSession.clearActiveSession() + } + res + }) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index efa493923ccc1..53ab0493f47eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -26,17 +26,28 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} import org.apache.spark.sql.internal.SQLConf +sealed trait ShufflePartitionSpec + +// A partition that reads data of one or more reducers, from `startReducerIndex` (inclusive) to +// `endReducerIndex` (exclusive). +case class CoalescedPartitionSpec( + startReducerIndex: Int, endReducerIndex: Int) extends ShufflePartitionSpec + +// A partition that reads partial data of one reducer, from `startMapIndex` (inclusive) to +// `endMapIndex` (exclusive). +case class PartialReducerPartitionSpec( + reducerIndex: Int, startMapIndex: Int, endMapIndex: Int) extends ShufflePartitionSpec + +// A partition that reads partial data of one mapper, from `startReducerIndex` (inclusive) to +// `endReducerIndex` (exclusive). +case class PartialMapperPartitionSpec( + mapIndex: Int, startReducerIndex: Int, endReducerIndex: Int) extends ShufflePartitionSpec + /** - * The [[Partition]] used by [[ShuffledRowRDD]]. A post-shuffle partition - * (identified by `postShufflePartitionIndex`) contains a range of pre-shuffle partitions - * (`startPreShufflePartitionIndex` to `endPreShufflePartitionIndex - 1`, inclusive). + * The [[Partition]] used by [[ShuffledRowRDD]]. */ -private final class ShuffledRowRDDPartition( - val postShufflePartitionIndex: Int, - val startPreShufflePartitionIndex: Int, - val endPreShufflePartitionIndex: Int) extends Partition { - override val index: Int = postShufflePartitionIndex -} +private final case class ShuffledRowRDDPartition( + index: Int, spec: ShufflePartitionSpec) extends Partition /** * A dummy partitioner for use with records whose partition ids have been pre-computed (i.e. 
for @@ -94,8 +105,7 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A * interfaces / internals. * * This RDD takes a [[ShuffleDependency]] (`dependency`), - * and an optional array of partition start indices as input arguments - * (`specifiedPartitionStartIndices`). + * and an array of [[ShufflePartitionSpec]] as input arguments. * * The `dependency` has the parent RDD of this RDD, which represents the dataset before shuffle * (i.e. map output). Elements of this RDD are (partitionId, Row) pairs. @@ -103,78 +113,97 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A * `dependency.partitioner` is the original partitioner used to partition * map output, and `dependency.partitioner.numPartitions` is the number of pre-shuffle partitions * (i.e. the number of partitions of the map output). - * - * When `specifiedPartitionStartIndices` is defined, `specifiedPartitionStartIndices.length` - * will be the number of post-shuffle partitions. For this case, the `i`th post-shuffle - * partition includes `specifiedPartitionStartIndices[i]` to - * `specifiedPartitionStartIndices[i+1] - 1` (inclusive). - * - * When `specifiedPartitionStartIndices` is not defined, there will be - * `dependency.partitioner.numPartitions` post-shuffle partitions. For this case, - * a post-shuffle partition is created for every pre-shuffle partition. */ class ShuffledRowRDD( var dependency: ShuffleDependency[Int, InternalRow, InternalRow], metrics: Map[String, SQLMetric], - specifiedPartitionIndices: Option[Array[(Int, Int)]] = None) + partitionSpecs: Array[ShufflePartitionSpec]) extends RDD[InternalRow](dependency.rdd.context, Nil) { - if (SQLConf.get.fetchShuffleBlocksInBatchEnabled) { - dependency.rdd.context.setLocalProperty( - SortShuffleManager.FETCH_SHUFFLE_BLOCKS_IN_BATCH_ENABLED_KEY, "true") + def this( + dependency: ShuffleDependency[Int, InternalRow, InternalRow], + metrics: Map[String, SQLMetric]) = { + this(dependency, metrics, + Array.tabulate(dependency.partitioner.numPartitions)(i => CoalescedPartitionSpec(i, i + 1))) } - private[this] val numPreShufflePartitions = dependency.partitioner.numPartitions - - private[this] val partitionStartIndices: Array[Int] = specifiedPartitionIndices match { - case Some(indices) => indices.map(_._1) - case None => - // When specifiedPartitionStartIndices is not defined, every post-shuffle partition - // corresponds to a pre-shuffle partition. - (0 until numPreShufflePartitions).toArray + if (SQLConf.get.fetchShuffleBlocksInBatch) { + dependency.rdd.context.setLocalProperty( + SortShuffleManager.FETCH_SHUFFLE_BLOCKS_IN_BATCH_ENABLED_KEY, "true") } - private[this] val part: Partitioner = - new CoalescedPartitioner(dependency.partitioner, partitionStartIndices) - override def getDependencies: Seq[Dependency[_]] = List(dependency) - override val partitioner: Option[Partitioner] = Some(part) + override val partitioner: Option[Partitioner] = + if (partitionSpecs.forall(_.isInstanceOf[CoalescedPartitionSpec])) { + val indices = partitionSpecs.map(_.asInstanceOf[CoalescedPartitionSpec].startReducerIndex) + // TODO this check is based on assumptions of callers' behavior but is sufficient for now. 
+ if (indices.toSet.size == partitionSpecs.length) { + Some(new CoalescedPartitioner(dependency.partitioner, indices)) + } else { + None + } + } else { + None + } override def getPartitions: Array[Partition] = { - specifiedPartitionIndices match { - case Some(indices) => - Array.tabulate[Partition](indices.length) { i => - new ShuffledRowRDDPartition(i, indices(i)._1, indices(i)._2) - } - case None => - Array.tabulate[Partition](numPreShufflePartitions) { i => - new ShuffledRowRDDPartition(i, i, i + 1) - } + Array.tabulate[Partition](partitionSpecs.length) { i => + ShuffledRowRDDPartition(i, partitionSpecs(i)) } } override def getPreferredLocations(partition: Partition): Seq[String] = { val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - val dep = dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] - tracker.getPreferredLocationsForShuffle(dep, partition.index) + partition.asInstanceOf[ShuffledRowRDDPartition].spec match { + case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => + // TODO order by partition size. + startReducerIndex.until(endReducerIndex).flatMap { reducerIndex => + tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) + } + + case PartialReducerPartitionSpec(_, startMapIndex, endMapIndex) => + tracker.getMapLocation(dependency, startMapIndex, endMapIndex) + + case PartialMapperPartitionSpec(mapIndex, _, _) => + tracker.getMapLocation(dependency, mapIndex, mapIndex + 1) + } } override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { - val shuffledRowPartition = split.asInstanceOf[ShuffledRowRDDPartition] val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, // as well as the `tempMetrics` for basic shuffle metrics. val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) - // The range of pre-shuffle partitions that we are fetching at here is - // [startPreShufflePartitionIndex, endPreShufflePartitionIndex - 1]. 
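For intuition about where CoalescedPartitionSpec ranges come from: they are produced by AQE's coalescing of small post-shuffle partitions, which lives outside this file. Below is a deliberately simplified stand-in that groups contiguous reducers under a size target; it is an illustration only, not Spark's actual coalescing algorithm.

// Greedy grouping of contiguous reducers so each group stays near the target size.
def coalesceReducers(sizes: Array[Long], targetSize: Long): Seq[(Int, Int)] = {
  val ranges = scala.collection.mutable.ArrayBuffer.empty[(Int, Int)]
  var start = 0
  var acc = 0L
  for (i <- sizes.indices) {
    if (i > start && acc + sizes(i) > targetSize) {
      ranges += ((start, i))
      start = i
      acc = 0L
    }
    acc += sizes(i)
  }
  ranges += ((start, sizes.length))
  // Each (startReducerIndex, endReducerIndex) pair maps to one CoalescedPartitionSpec.
  ranges.toSeq
}

// coalesceReducers(Array(10L, 10L, 70L, 5L, 5L), targetSize = 64L)
//   returns Seq((0, 2), (2, 3), (3, 5))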
- val reader = - SparkEnv.get.shuffleManager.getReader( - dependency.shuffleHandle, - shuffledRowPartition.startPreShufflePartitionIndex, - shuffledRowPartition.endPreShufflePartitionIndex, - context, - sqlMetricsReporter) + val reader = split.asInstanceOf[ShuffledRowRDDPartition].spec match { + case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => + SparkEnv.get.shuffleManager.getReader( + dependency.shuffleHandle, + startReducerIndex, + endReducerIndex, + context, + sqlMetricsReporter) + + case PartialReducerPartitionSpec(reducerIndex, startMapIndex, endMapIndex) => + SparkEnv.get.shuffleManager.getReaderForRange( + dependency.shuffleHandle, + startMapIndex, + endMapIndex, + reducerIndex, + reducerIndex + 1, + context, + sqlMetricsReporter) + + case PartialMapperPartitionSpec(mapIndex, startReducerIndex, endReducerIndex) => + SparkEnv.get.shuffleManager.getReaderForRange( + dependency.shuffleHandle, + mapIndex, + mapIndex + 1, + startReducerIndex, + endReducerIndex, + context, + sqlMetricsReporter) + } reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 013d94768a2a8..33b86a2b5340c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -23,10 +23,10 @@ import org.apache.spark.sql.catalyst.optimizer._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.connector.catalog.CatalogManager -import org.apache.spark.sql.dynamicpruning.{CleanupDynamicPruningFilters, PartitionPruning} import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions import org.apache.spark.sql.execution.datasources.SchemaPruning import org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown +import org.apache.spark.sql.execution.dynamicpruning.{CleanupDynamicPruningFilters, PartitionPruning} import org.apache.spark.sql.execution.python.{ExtractGroupingPythonUDFFromAggregate, ExtractPythonUDFFromAggregate, ExtractPythonUDFs} class SparkOptimizer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 3301e9b5ab180..062aa69b3adb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.ColumnarBatch object SparkPlan { @@ -134,7 +135,12 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ def longMetric(name: String): SQLMetric = metrics(name) // TODO: Move to `DistributedPlan` - /** Specifies how data is partitioned across different nodes in the cluster. */ + /** + * Specifies how data is partitioned across different nodes in the cluster. 
+ * Note this method may fail if it is invoked before `EnsureRequirements` is applied + * since `PartitioningCollection` requires all its partitionings to have + * the same number of partitions. + */ def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH! /** @@ -512,10 +518,21 @@ trait LeafExecNode extends SparkPlan { override final def children: Seq[SparkPlan] = Nil override def producedAttributes: AttributeSet = outputSet override def verboseStringWithOperatorId(): String = { - s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Output: ${producedAttributes.mkString("[", ", ", "]")} - """.stripMargin + val argumentString = argString(SQLConf.get.maxToStringFields) + val outputStr = s"${ExplainUtils.generateFieldString("Output", output)}" + + if (argumentString.nonEmpty) { + s""" + |$formattedNodeName + |$outputStr + |Arguments: $argumentString + |""".stripMargin + } else { + s""" + |$formattedNodeName + |$outputStr + |""".stripMargin + } } } @@ -531,10 +548,21 @@ trait UnaryExecNode extends SparkPlan { override final def children: Seq[SparkPlan] = child :: Nil override def verboseStringWithOperatorId(): String = { - s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Input: ${child.output.mkString("[", ", ", "]")} - """.stripMargin + val argumentString = argString(SQLConf.get.maxToStringFields) + val inputStr = s"${ExplainUtils.generateFieldString("Input", child.output)}" + + if (argumentString.nonEmpty) { + s""" + |$formattedNodeName + |$inputStr + |Arguments: $argumentString + |""".stripMargin + } else { + s""" + |$formattedNodeName + |$inputStr + |""".stripMargin + } } } @@ -544,10 +572,23 @@ trait BinaryExecNode extends SparkPlan { override final def children: Seq[SparkPlan] = Seq(left, right) override def verboseStringWithOperatorId(): String = { - s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Left output: ${left.output.mkString("[", ", ", "]")} - |Right output: ${right.output.mkString("[", ", ", "]")} - """.stripMargin + val argumentString = argString(SQLConf.get.maxToStringFields) + val leftOutputStr = s"${ExplainUtils.generateFieldString("Left output", left.output)}" + val rightOutputStr = s"${ExplainUtils.generateFieldString("Right output", right.output)}" + + if (argumentString.nonEmpty) { + s""" + |$formattedNodeName + |$leftOutputStr + |$rightOutputStr + |Arguments: $argumentString + |""".stripMargin + } else { + s""" + |$formattedNodeName + |$leftOutputStr + |$rightOutputStr + |""".stripMargin + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala index 5b72ec058e127..357820a9d63d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, LocalShuffleReaderExec, QueryStageExec} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, QueryStageExec} import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.metric.SQLMetricInfo import org.apache.spark.sql.internal.SQLConf diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index aa139cb6b0c3b..3e4bc48553c5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -196,7 +196,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { operationNotAllowed("CREATE TEMPORARY TABLE IF NOT EXISTS", ctx) } - val (_, _, _, options, _, _) = visitCreateTableClauses(ctx.createTableClauses()) + val (_, _, _, options, location, _) = visitCreateTableClauses(ctx.createTableClauses()) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText).getOrElse( throw new ParseException("CREATE TEMPORARY TABLE without a provider is not allowed.", ctx)) val schema = Option(ctx.colTypeList()).map(createSchema) @@ -205,7 +205,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { "CREATE TEMPORARY VIEW ... USING ... instead") val table = tableIdentifier(ident, "CREATE TEMPORARY VIEW", ctx) - CreateTempViewUsing(table, schema, replace = false, global = false, provider, options) + val optionsWithLocation = location.map(l => options + ("path" -> l)).getOrElse(options) + CreateTempViewUsing(table, schema, replace = false, global = false, provider, + optionsWithLocation) } } @@ -587,10 +589,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { */ override def visitRowFormatDelimited( ctx: RowFormatDelimitedContext): CatalogStorageFormat = withOrigin(ctx) { - // Collect the entries if any. - def entry(key: String, value: Token): Seq[(String, String)] = { - Option(value).toSeq.map(x => key -> string(x)) - } // TODO we need proper support for the NULL format. val entries = entry("field.delim", ctx.fieldsTerminatedBy) ++ @@ -689,9 +687,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { // expects a seq of pairs in which the old parsers' token names are used as keys. // Transforming the result of visitRowFormatDelimited would be quite a bit messier than // retrieving the key value pairs ourselves. - def entry(key: String, value: Token): Seq[(String, String)] = { - Option(value).map(t => key -> t.getText).toSeq - } val entries = entry("TOK_TABLEROWFORMATFIELD", c.fieldsTerminatedBy) ++ entry("TOK_TABLEROWFORMATCOLLITEMS", c.collectionItemsTerminatedBy) ++ entry("TOK_TABLEROWFORMATMAPKEYS", c.keysTerminatedBy) ++ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index bd2684d92a1d2..689d1eb62ffa5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -244,7 +244,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // 4. Pick cartesian product if join type is inner like. // 5. Pick broadcast nested loop join as the final solution. It may OOM but we don't have // other choice. 
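As a rough schematic of the numbered comment above, the equi-join planning reads as an orElse cascade: try each strategy in priority order and fall back to the next when it does not apply. The helper names below are placeholders, not SparkStrategies' actual private methods.

object JoinSelectionSketch {
  type Plan = String // stand-in for Seq[SparkPlan]

  def planEquiJoin(
      tryBroadcastHash: () => Option[Plan],
      tryShuffleHash: () => Option[Plan],
      trySortMerge: () => Option[Plan],
      tryCartesian: () => Option[Plan],
      broadcastNestedLoop: () => Plan): Plan = {
    tryBroadcastHash()
      .orElse(tryShuffleHash())
      .orElse(trySortMerge())
      .orElse(tryCartesian())
      .getOrElse(broadcastNestedLoop()) // may OOM, but it is the last resort
  }

  def main(args: Array[String]): Unit = {
    // Example: only sort-merge join applies (keys orderable, neither side broadcastable).
    val plan = planEquiJoin(
      () => None, () => None, () => Some("SortMergeJoin"), () => None,
      () => "BroadcastNestedLoopJoin")
    println(plan) // SortMergeJoin
  }
}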
- case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right, hint) => + case j @ ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, nonEquiCond, left, right, hint) => def createBroadcastHashJoin(buildLeft: Boolean, buildRight: Boolean) = { val wantToBuildLeft = canBuildLeft(joinType) && buildLeft val wantToBuildRight = canBuildRight(joinType) && buildRight @@ -254,7 +254,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { rightKeys, joinType, buildSide, - condition, + nonEquiCond, planLater(left), planLater(right))) } @@ -269,7 +269,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { rightKeys, joinType, buildSide, - condition, + nonEquiCond, planLater(left), planLater(right))) } @@ -278,7 +278,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { def createSortMergeJoin() = { if (RowOrdering.isOrderable(leftKeys)) { Some(Seq(joins.SortMergeJoinExec( - leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)))) + leftKeys, rightKeys, joinType, nonEquiCond, planLater(left), planLater(right)))) } else { None } @@ -286,7 +286,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { def createCartesianProduct() = { if (joinType.isInstanceOf[InnerLike]) { - Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), condition))) + // `CartesianProductExec` can't implicitly evaluate equal join condition, here we should + // pass the original condition which includes both equal and non-equal conditions. + Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), j.condition))) } else { None } @@ -311,7 +313,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // This join could be very slow or OOM val buildSide = getSmallerSide(left, right) Seq(joins.BroadcastNestedLoopJoinExec( - planLater(left), planLater(right), buildSide, joinType, condition)) + planLater(left), planLater(right), buildSide, joinType, nonEquiCond)) } } @@ -515,7 +517,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { val (functionsWithDistinct, functionsWithoutDistinct) = aggregateExpressions.partition(_.isDistinct) - if (functionsWithDistinct.map(_.aggregateFunction.children.toSet).distinct.length > 1) { + if (functionsWithDistinct.map( + _.aggregateFunction.children.filterNot(_.foldable).toSet).distinct.length > 1) { // This is a sanity check. We should not reach here when we have multiple distinct // column sets. Our `RewriteDistinctAggregates` should take care this case. sys.error("You hit a query analyzer bug. Please report your query to " + @@ -527,6 +530,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { val normalizedGroupingExpressions = groupingExpressions.map { e => NormalizeFloatingNumbers.normalize(e) match { case n: NamedExpression => n + // Keep the name of the original expression. case other => Alias(other, e.name)(exprId = e.exprId) } } @@ -539,10 +543,35 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { resultExpressions, planLater(child)) } else { + // functionsWithDistinct is guaranteed to be non-empty. Even though it may contain + // more than one DISTINCT aggregate function, all of those functions will have the + // same column expressions. For example, it would be valid for functionsWithDistinct + // to be [COUNT(DISTINCT foo), MAX(DISTINCT foo)], but + // [COUNT(DISTINCT bar), COUNT(DISTINCT foo)] is disallowed because those two distinct + // aggregates have different column expressions. 
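A toy model of the sanity check and comment above: all DISTINCT aggregates must share the same set of non-foldable child expressions, otherwise RewriteDistinctAggregates should already have rewritten the plan. The types here are illustrative, not Catalyst's; the point is that foldable children (literals) are ignored when comparing the distinct column sets.

object DistinctCheckSketch {
  case class Expr(name: String, foldable: Boolean = false)
  case class DistinctAgg(func: String, children: Seq[Expr])

  def distinctColumnSets(aggs: Seq[DistinctAgg]): Seq[Set[Expr]] =
    aggs.map(_.children.filterNot(_.foldable).toSet).distinct

  def main(args: Array[String]): Unit = {
    val foo = Expr("foo"); val bar = Expr("bar"); val one = Expr("1", foldable = true)

    // COUNT(DISTINCT foo), COUNT(DISTINCT foo, 1): one distinct column set, so
    // planAggregateWithOneDistinct can handle it (the literal is dropped by filterNot).
    val ok = Seq(DistinctAgg("count", Seq(foo)), DistinctAgg("count", Seq(foo, one)))
    assert(distinctColumnSets(ok).length == 1)

    // COUNT(DISTINCT bar), COUNT(DISTINCT foo): two different sets, the planner errors out.
    val bad = Seq(DistinctAgg("count", Seq(bar)), DistinctAgg("count", Seq(foo)))
    assert(distinctColumnSets(bad).length > 1)
  }
}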
+ val distinctExpressions = + functionsWithDistinct.head.aggregateFunction.children.filterNot(_.foldable) + val normalizedNamedDistinctExpressions = distinctExpressions.map { e => + // Ideally this should be done in `NormalizeFloatingNumbers`, but we do it here + // because `distinctExpressions` is not extracted during logical phase. + NormalizeFloatingNumbers.normalize(e) match { + case ne: NamedExpression => ne + case other => + // Keep the name of the original expression. + val name = e match { + case ne: NamedExpression => ne.name + case _ => e.toString + } + Alias(other, name)() + } + } + AggUtils.planAggregateWithOneDistinct( normalizedGroupingExpressions, functionsWithDistinct, functionsWithoutDistinct, + distinctExpressions, + normalizedNamedDistinctExpressions, resultExpressions, planLater(child)) } @@ -656,7 +685,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case MemoryPlan(sink, output) => val encoder = RowEncoder(StructType.fromAttributes(output)) - LocalTableScanExec(output, sink.allData.map(r => encoder.toRow(r).copy())) :: Nil + val toRow = encoder.createSerializer() + LocalTableScanExec(output, sink.allData.map(r => toRow(r).copy())) :: Nil case logical.Distinct(child) => throw new IllegalStateException( @@ -724,7 +754,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.Repartition(numPartitions, shuffle, child) => if (shuffle) { ShuffleExchangeExec(RoundRobinPartitioning(numPartitions), - planLater(child), canChangeNumPartitions = false) :: Nil + planLater(child), noUserSpecifiedNumPartition = false) :: Nil } else { execution.CoalesceExec(numPartitions, planLater(child)) :: Nil } @@ -758,7 +788,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { execution.RangeExec(r) :: Nil case r: logical.RepartitionByExpression => exchange.ShuffleExchangeExec( - r.partitioning, planLater(r.child), canChangeNumPartitions = false) :: Nil + r.partitioning, planLater(r.child), noUserSpecifiedNumPartition = false) :: Nil case ExternalRDD(outputObjAttr, rdd) => ExternalRDDScanExec(outputObjAttr, rdd) :: Nil case r: LogicalRDD => RDDScanExec(r.output, r.rdd, "ExistingRDD", r.outputPartitioning, r.outputOrdering) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 3f20b59361988..187827ca6005e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -37,7 +37,6 @@ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ThreadUtils @@ -67,6 +66,15 @@ case class AdaptiveSparkPlanExec( @transient private val lock = new Object() + @transient private val logOnLevel: ( => String) => Unit = conf.adaptiveExecutionLogLevel match { + case "TRACE" => logTrace(_) + case "DEBUG" => logDebug(_) + case "INFO" => logInfo(_) + case "WARN" => logWarning(_) + case "ERROR" => logError(_) + case _ => logDebug(_) + } + // 
The logical plan optimizer for re-optimizing the current logical plan. @transient private val optimizer = new RuleExecutor[LogicalPlan] { // TODO add more optimization rules @@ -75,27 +83,31 @@ case class AdaptiveSparkPlanExec( ) } + @transient private val removeRedundantSorts = RemoveRedundantSorts(conf) @transient private val ensureRequirements = EnsureRequirements(conf) // A list of physical plan rules to be applied before creation of query stages. The physical // plan should reach a final status of query stages (i.e., no more addition or removal of // Exchange nodes) after running these rules. private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( - ensureRequirements - ) + ensureRequirements, + removeRedundantSorts + ) ++ context.session.sessionState.queryStagePrepRules // A list of physical optimizer rules to be applied to a new stage before its execution. These // optimizations should be stage-independent. @transient private val queryStageOptimizerRules: Seq[Rule[SparkPlan]] = Seq( ReuseAdaptiveSubquery(conf, context.subqueryCache), - // Here the 'OptimizeSkewedJoin' rule should be executed - // before 'ReduceNumShufflePartitions', as the skewed partition handled - // in 'OptimizeSkewedJoin' rule, should be omitted in 'ReduceNumShufflePartitions'. + CoalesceShufflePartitions(context.session), + // The following two rules need to make use of 'CustomShuffleReaderExec.partitionSpecs' + // added by `CoalesceShufflePartitions`. So they must be executed after it. OptimizeSkewedJoin(conf), - ReduceNumShufflePartitions(conf), - // The rule of 'OptimizeLocalShuffleReader' need to make use of the 'partitionStartIndices' - // in 'ReduceNumShufflePartitions' rule. So it must be after 'ReduceNumShufflePartitions' rule. - OptimizeLocalShuffleReader(conf), + OptimizeLocalShuffleReader(conf) + ) + + // A list of physical optimizer rules to be applied right after a new stage is created. The input + // plan to these rules has exchange as its root node. + @transient private val postStageCreationRules = Seq( ApplyColumnarRulesAndInsertTransitions(conf, context.session.sessionState.columnarRules), CollapseCodegenStages(conf) ) @@ -133,39 +145,33 @@ case class AdaptiveSparkPlanExec( executedPlan.resetMetrics() } - private def collectSQLMetrics(plan: SparkPlan): Seq[SQLMetric] = { - val metrics = new mutable.ArrayBuffer[SQLMetric]() - plan.foreach { - case p: ShuffleQueryStageExec if (p.resultOption.isEmpty) => - collectSQLMetrics(p.plan).foreach(metrics += _) - case p: BroadcastQueryStageExec if (p.resultOption.isEmpty) => - collectSQLMetrics(p.plan).foreach(metrics += _) - case p: SparkPlan => - p.metrics.foreach { case metric => - metrics += metric._2 - } - } - metrics + private def getExecutionId: Option[Long] = { + // If the `QueryExecution` does not match the current execution ID, it means the execution ID + // belongs to another (parent) query, and we should not call update UI in this query. + Option(context.session.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)) + .map(_.toLong).filter(SQLExecution.getQueryExecution(_) eq context.qe) } private def getFinalPhysicalPlan(): SparkPlan = lock.synchronized { - if (!isFinalPlan) { - // Subqueries do not have their own execution IDs and therefore rely on the main query to - // update UI. 
- val executionId = Option(context.session.sparkContext.getLocalProperty( - SQLExecution.EXECUTION_ID_KEY)).map(_.toLong) + if (isFinalPlan) return currentPhysicalPlan + + // In case of this adaptive plan being executed out of `withActive` scoped functions, e.g., + // `plan.queryExecution.rdd`, we need to set active session here as new plan nodes can be + // created in the middle of the execution. + context.session.withActive { + val executionId = getExecutionId var currentLogicalPlan = currentPhysicalPlan.logicalLink.get var result = createQueryStages(currentPhysicalPlan) val events = new LinkedBlockingQueue[StageMaterializationEvent]() - val errors = new mutable.ArrayBuffer[SparkException]() + val errors = new mutable.ArrayBuffer[Throwable]() var stagesToReplace = Seq.empty[QueryStageExec] while (!result.allChildStagesMaterialized) { currentPhysicalPlan = result.newPlan if (result.newStages.nonEmpty) { stagesToReplace = result.newStages ++ stagesToReplace - executionId.foreach(onUpdatePlan) + executionId.foreach(onUpdatePlan(_, result.newStages.map(_.plan))) - // Start materialization of all new stages. + // Start materialization of all new stages and fail fast if any stages failed eagerly result.newStages.foreach { stage => try { stage.materialize().onComplete { res => @@ -176,7 +182,8 @@ case class AdaptiveSparkPlanExec( } }(AdaptiveSparkPlanExec.executionContext) } catch { - case e: Throwable => events.offer(StageFailure(stage, e)) + case e: Throwable => + cleanUpAndThrowException(Seq(e), Some(stage.id)) } } } @@ -191,14 +198,12 @@ case class AdaptiveSparkPlanExec( case StageSuccess(stage, res) => stage.resultOption = Some(res) case StageFailure(stage, ex) => - errors.append( - new SparkException(s"Failed to materialize query stage: ${stage.treeString}." + - s" and the cause is ${ex.getMessage}", ex)) + errors.append(ex) } // In case of errors, we cancel all running stages and throw exception. if (errors.nonEmpty) { - cleanUpAndThrowException(errors) + cleanUpAndThrowException(errors, None) } // Try re-optimizing and re-planning. Adopt the new plan if its cost is equal to or less @@ -218,6 +223,7 @@ case class AdaptiveSparkPlanExec( val newCost = costEvaluator.evaluateCost(newPhysicalPlan) if (newCost < origCost || (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) { + logOnLevel(s"Plan changed from $currentPhysicalPlan to $newPhysicalPlan") cleanUpTempTags(newPhysicalPlan) currentPhysicalPlan = newPhysicalPlan currentLogicalPlan = newLogicalPlan @@ -228,34 +234,50 @@ case class AdaptiveSparkPlanExec( } // Run the final plan when there's no more unfinished stages. - currentPhysicalPlan = applyPhysicalRules(result.newPlan, queryStageOptimizerRules) + currentPhysicalPlan = applyPhysicalRules( + result.newPlan, queryStageOptimizerRules ++ postStageCreationRules) isFinalPlan = true - executionId.foreach(onUpdatePlan) - logDebug(s"Final plan: $currentPhysicalPlan") + executionId.foreach(onUpdatePlan(_, Seq(currentPhysicalPlan))) + currentPhysicalPlan + } + } + + // Use a lazy val to avoid this being called more than once. + @transient private lazy val finalPlanUpdate: Unit = { + // Subqueries that don't belong to any query stage of the main query will execute after the + // last UI update in `getFinalPhysicalPlan`, so we need to update UI here again to make sure + // the newly generated nodes of those subqueries are updated. 
+ if (!isSubquery && currentPhysicalPlan.find(_.subqueries.nonEmpty).isDefined) { + getExecutionId.foreach(onUpdatePlan(_, Seq.empty)) } - currentPhysicalPlan + logOnLevel(s"Final plan: $currentPhysicalPlan") } override def executeCollect(): Array[InternalRow] = { - getFinalPhysicalPlan().executeCollect() + val rdd = getFinalPhysicalPlan().executeCollect() + finalPlanUpdate + rdd } override def executeTake(n: Int): Array[InternalRow] = { - getFinalPhysicalPlan().executeTake(n) + val rdd = getFinalPhysicalPlan().executeTake(n) + finalPlanUpdate + rdd } override def executeTail(n: Int): Array[InternalRow] = { - getFinalPhysicalPlan().executeTail(n) + val rdd = getFinalPhysicalPlan().executeTail(n) + finalPlanUpdate + rdd } override def doExecute(): RDD[InternalRow] = { - getFinalPhysicalPlan().execute() + val rdd = getFinalPhysicalPlan().execute() + finalPlanUpdate + rdd } - override def verboseString(maxFields: Int): String = simpleString(maxFields) - - override def simpleString(maxFields: Int): String = - s"AdaptiveSparkPlan(isFinalPlan=$isFinalPlan)" + protected override def stringArgs: Iterator[Any] = Iterator(s"isFinalPlan=$isFinalPlan") override def generateTreeString( depth: Int, @@ -361,10 +383,22 @@ case class AdaptiveSparkPlanExec( private def newQueryStage(e: Exchange): QueryStageExec = { val optimizedPlan = applyPhysicalRules(e.child, queryStageOptimizerRules) val queryStage = e match { - case s: ShuffleExchangeExec => - ShuffleQueryStageExec(currentStageId, s.copy(child = optimizedPlan)) - case b: BroadcastExchangeExec => - BroadcastQueryStageExec(currentStageId, b.copy(child = optimizedPlan)) + case s: ShuffleExchangeLike => + val newShuffle = applyPhysicalRules( + s.withNewChildren(Seq(optimizedPlan)), postStageCreationRules) + if (!newShuffle.isInstanceOf[ShuffleExchangeLike]) { + throw new IllegalStateException( + "Custom columnar rules cannot transform shuffle node to something else.") + } + ShuffleQueryStageExec(currentStageId, newShuffle) + case b: BroadcastExchangeLike => + val newBroadcast = applyPhysicalRules( + b.withNewChildren(Seq(optimizedPlan)), postStageCreationRules) + if (!newBroadcast.isInstanceOf[BroadcastExchangeLike]) { + throw new IllegalStateException( + "Custom columnar rules cannot transform broadcast node to something else.") + } + BroadcastQueryStageExec(currentStageId, newBroadcast) } currentStageId += 1 setLogicalLinkForNewQueryStage(queryStage, e) @@ -467,7 +501,6 @@ case class AdaptiveSparkPlanExec( private def reOptimize(logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = { logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) - SparkSession.setActiveSession(context.session) val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() val newPlan = applyPhysicalRules(sparkPlan, preprocessingRules ++ queryStagePreparationRules) (newPlan, optimized) @@ -495,62 +528,58 @@ case class AdaptiveSparkPlanExec( /** * Notify the listeners of the physical plan change. */ - private def onUpdatePlan(executionId: Long): Unit = { + private def onUpdatePlan(executionId: Long, newSubPlans: Seq[SparkPlan]): Unit = { if (isSubquery) { // When executing subqueries, we can't update the query plan in the UI as the // UI doesn't support partial update yet. However, the subquery may have been // optimized into a different plan and we must let the UI know the SQL metrics // of the new plan nodes, so that it can track the valid accumulator updates later // and display SQL metrics correctly. 
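The finalPlanUpdate pattern above leans on a small Scala idiom: a lazy val of type Unit runs its body at most once, no matter how many of the execute* methods reference it. A minimal standalone sketch (all names made up):

object RunOnceSketch {
  private var updates = 0

  // The body runs the first time the val is referenced and never again.
  private lazy val finalUpdate: Unit = {
    updates += 1
    println("posting final plan to the UI listeners (once)")
  }

  def executeCollect(): String = { finalUpdate; "rows" }
  def executeTake(n: Int): String = { finalUpdate; s"first $n rows" }

  def main(args: Array[String]): Unit = {
    executeCollect(); executeTake(3); executeCollect()
    assert(updates == 1) // the side effect happened exactly once
  }
}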
- onUpdateSQLMetrics(collectSQLMetrics(currentPhysicalPlan), executionId) + val newMetrics = newSubPlans.flatMap { p => + p.flatMap(_.metrics.values.map(m => SQLPlanMetric(m.name.get, m.id, m.metricType))) + } + context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveSQLMetricUpdates( + executionId.toLong, newMetrics)) } else { context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate( executionId, - SQLExecution.getQueryExecution(executionId).toString, - SparkPlanInfo.fromSparkPlan(this))) - } - } - - private def onUpdateSQLMetrics(sqlMetrics: Seq[SQLMetric], executionId: Long): Unit = { - val sqlPlanMetrics = sqlMetrics.map { case sqlMetric => - SQLPlanMetric(sqlMetric.name.get, sqlMetric.id, sqlMetric.metricType) + context.qe.toString, + SparkPlanInfo.fromSparkPlan(context.qe.executedPlan))) } - context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveSQLMetricUpdates( - executionId.toLong, sqlPlanMetrics)) } /** * Cancel all running stages with best effort and throw an Exception containing all stage * materialization errors and stage cancellation errors. */ - private def cleanUpAndThrowException(errors: Seq[SparkException]): Unit = { - val runningStages = currentPhysicalPlan.collect { - case s: QueryStageExec => s - } - val cancelErrors = new mutable.ArrayBuffer[SparkException]() - try { - runningStages.foreach { s => + private def cleanUpAndThrowException( + errors: Seq[Throwable], + earlyFailedStage: Option[Int]): Unit = { + currentPhysicalPlan.foreach { + // earlyFailedStage is the stage which failed before calling doMaterialize, + // so we should avoid calling cancel on it to re-trigger the failure again. + case s: QueryStageExec if !earlyFailedStage.contains(s.id) => try { s.cancel() } catch { case NonFatal(t) => - cancelErrors.append( - new SparkException(s"Failed to cancel query stage: ${s.treeString}", t)) + logError(s"Exception in cancelling query stage: ${s.treeString}", t) } - } - } finally { - val ex = new SparkException( - "Adaptive execution failed due to stage materialization failures." + - s" and the cause is ${errors.head.getMessage}", errors.head) - errors.tail.foreach(ex.addSuppressed) - cancelErrors.foreach(ex.addSuppressed) - throw ex + case _ => + } + val e = if (errors.size == 1) { + errors.head + } else { + val se = new SparkException("Multiple failures in stage materialization.", errors.head) + errors.tail.foreach(se.addSuppressed) + se } + throw e } } object AdaptiveSparkPlanExec { - private val executionContext = ExecutionContext.fromExecutorService( + private[adaptive] val executionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("QueryStageCreator", 16)) /** @@ -576,7 +605,7 @@ object AdaptiveSparkPlanExec { /** * The execution context shared between the main query and all sub-queries. */ -case class AdaptiveExecutionContext(session: SparkSession) { +case class AdaptiveExecutionContext(session: SparkSession, qe: QueryExecution) { /** * The subquery-reuse map shared across the entire query. 
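The reworked cleanUpAndThrowException keeps the common single-failure case untouched and only wraps when several stages failed, attaching the rest as suppressed exceptions so nothing is lost. A small sketch of that aggregation pattern, with a plain RuntimeException standing in for SparkException:

object ErrorAggregationSketch {
  def aggregate(errors: Seq[Throwable]): Throwable = {
    require(errors.nonEmpty)
    if (errors.size == 1) {
      // Rethrow a single failure as-is to keep its original type and stack trace.
      errors.head
    } else {
      val wrapper = new RuntimeException("Multiple failures in stage materialization.", errors.head)
      errors.tail.foreach(wrapper.addSuppressed)
      wrapper
    }
  }

  def main(args: Array[String]): Unit = {
    val e = aggregate(Seq(new RuntimeException("stage 1 failed"), new RuntimeException("stage 2 failed")))
    println(e.getSuppressed.length) // 1: the second failure rides along instead of being dropped
  }
}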
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala index 61ae6cb14ccd3..3cf6a13a4a892 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.execution.adaptive +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.internal.SQLConf /** * This class provides utility methods related to tree traversal of an [[AdaptiveSparkPlanExec]] @@ -109,7 +111,7 @@ trait AdaptiveSparkPlanHelper { * Returns a sequence containing the result of applying a partial function to all elements in this * plan, also considering all the plans in its (nested) subqueries */ - def collectInPlanAndSubqueries[B](p: SparkPlan)(f: PartialFunction[SparkPlan, B]): Seq[B] = { + def collectWithSubqueries[B](p: SparkPlan)(f: PartialFunction[SparkPlan, B]): Seq[B] = { (p +: subqueriesAll(p)).flatMap(collect(_)(f)) } @@ -135,4 +137,18 @@ trait AdaptiveSparkPlanHelper { case a: AdaptiveSparkPlanExec => a.executedPlan case other => other } - } + + /** + * Returns a cloned [[SparkSession]] with adaptive execution disabled, or the original + * [[SparkSession]] if its adaptive execution is already disabled. + */ + def getOrCloneSessionWithAqeOff[T](session: SparkSession): SparkSession = { + if (!session.sessionState.conf.adaptiveExecutionEnabled) { + session + } else { + val newSession = session.cloneSession() + newSession.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + newSession + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala new file mode 100644 index 0000000000000..096d65f16e42f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.internal.SQLConf + +/** + * A rule to coalesce the shuffle partitions based on the map output statistics, which can + * avoid many small reduce tasks that hurt performance. 
+ */ +case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPlan] { + import CoalesceShufflePartitions._ + private def conf = session.sessionState.conf + + override def apply(plan: SparkPlan): SparkPlan = { + if (!conf.coalesceShufflePartitionsEnabled) { + return plan + } + if (!plan.collectLeaves().forall(_.isInstanceOf[QueryStageExec]) + || plan.find(_.isInstanceOf[CustomShuffleReaderExec]).isDefined) { + // If not all leaf nodes are query stages, it's not safe to reduce the number of + // shuffle partitions, because we may break the assumption that all children of a spark plan + // have same number of output partitions. + return plan + } + + def collectShuffleStages(plan: SparkPlan): Seq[ShuffleQueryStageExec] = plan match { + case stage: ShuffleQueryStageExec => Seq(stage) + case _ => plan.children.flatMap(collectShuffleStages) + } + + val shuffleStages = collectShuffleStages(plan) + // ShuffleExchanges introduced by repartition do not support changing the number of partitions. + // We change the number of partitions in the stage only if all the ShuffleExchanges support it. + if (!shuffleStages.forall(_.shuffle.canChangeNumPartitions)) { + plan + } else { + // `ShuffleQueryStageExec#mapStats` returns None when the input RDD has 0 partitions, + // we should skip it when calculating the `partitionStartIndices`. + val validMetrics = shuffleStages.flatMap(_.mapStats) + + // We may have different pre-shuffle partition numbers, don't reduce shuffle partition number + // in that case. For example when we union fully aggregated data (data is arranged to a single + // partition) and a result of a SortMergeJoin (multiple partitions). + val distinctNumPreShufflePartitions = + validMetrics.map(stats => stats.bytesByPartitionId.length).distinct + if (validMetrics.nonEmpty && distinctNumPreShufflePartitions.length == 1) { + // We fall back to Spark default parallelism if the minimum number of coalesced partitions + // is not set, so to avoid perf regressions compared to no coalescing. + val minPartitionNum = conf.getConf(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM) + .getOrElse(session.sparkContext.defaultParallelism) + val partitionSpecs = ShufflePartitionsUtil.coalescePartitions( + validMetrics.toArray, + advisoryTargetSize = conf.getConf(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES), + minNumPartitions = minPartitionNum) + // This transformation adds new nodes, so we must use `transformUp` here. + val stageIds = shuffleStages.map(_.id).toSet + plan.transformUp { + // even for shuffle exchange whose input RDD has 0 partition, we should still update its + // `partitionStartIndices`, so that all the leaf shuffles in a stage have the same + // number of output partitions. + case stage: ShuffleQueryStageExec if stageIds.contains(stage.id) => + CustomShuffleReaderExec(stage, partitionSpecs, COALESCED_SHUFFLE_READER_DESCRIPTION) + } + } else { + plan + } + } + } +} + +object CoalesceShufflePartitions { + val COALESCED_SHUFFLE_READER_DESCRIPTION = "coalesced" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderExec.scala new file mode 100644 index 0000000000000..8fd572088b620 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderExec.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.exchange.{ReusedExchangeExec, ShuffleExchangeExec, ShuffleExchangeLike} +import org.apache.spark.sql.vectorized.ColumnarBatch + + +/** + * A wrapper of shuffle query stage, which follows the given partition arrangement. + * + * @param child It is usually `ShuffleQueryStageExec`, but can be the shuffle exchange + * node during canonicalization. + * @param partitionSpecs The partition specs that defines the arrangement. + * @param description The string description of this shuffle reader. + */ +case class CustomShuffleReaderExec private( + child: SparkPlan, + partitionSpecs: Seq[ShufflePartitionSpec], + description: String) extends UnaryExecNode { + + override def supportsColumnar: Boolean = child.supportsColumnar + + override def output: Seq[Attribute] = child.output + override lazy val outputPartitioning: Partitioning = { + // If it is a local shuffle reader with one mapper per task, then the output partitioning is + // the same as the plan before shuffle. + // TODO this check is based on assumptions of callers' behavior but is sufficient for now. 
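The partitionSpecs handed to this reader typically come from CoalesceShufflePartitions above. As a rough idea of how coalesced specs are derived from map output statistics, here is a simplified greedy packing; it only sketches the intent of ShufflePartitionsUtil.coalescePartitions (whose actual algorithm is not shown in this diff), and all names are illustrative.

object CoalesceSketch {
  case class CoalescedSpec(startReducerIndex: Int, endReducerIndex: Int) // [start, end)

  // Sum each reducer partition's size across all map outputs, then merge consecutive
  // reducer partitions until the advisory target size would be exceeded.
  def coalesce(bytesByPartitionId: Array[Array[Long]], targetSize: Long): Seq[CoalescedSpec] = {
    val numReducers = bytesByPartitionId.head.length
    val totalBytes = (0 until numReducers).map(r => bytesByPartitionId.map(_(r)).sum)
    val specs = Seq.newBuilder[CoalescedSpec]
    var start = 0
    var acc = 0L
    (0 until numReducers).foreach { r =>
      if (r > start && acc + totalBytes(r) > targetSize) {
        specs += CoalescedSpec(start, r); start = r; acc = 0L
      }
      acc += totalBytes(r)
    }
    specs += CoalescedSpec(start, numReducers)
    specs.result()
  }

  def main(args: Array[String]): Unit = {
    // Two mappers, five reducer partitions; target 100 bytes per coalesced partition.
    val stats = Array(Array(10L, 5L, 40L, 5L, 90L), Array(10L, 5L, 40L, 5L, 90L))
    println(coalesce(stats, targetSize = 100))
    // List(CoalescedSpec(0,2), CoalescedSpec(2,4), CoalescedSpec(4,5))
  }
}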
+ if (partitionSpecs.forall(_.isInstanceOf[PartialMapperPartitionSpec]) && + partitionSpecs.map(_.asInstanceOf[PartialMapperPartitionSpec].mapIndex).toSet.size == + partitionSpecs.length) { + child match { + case ShuffleQueryStageExec(_, s: ShuffleExchangeLike) => + s.child.outputPartitioning + case ShuffleQueryStageExec(_, r @ ReusedExchangeExec(_, s: ShuffleExchangeLike)) => + s.child.outputPartitioning match { + case e: Expression => r.updateAttr(e).asInstanceOf[Partitioning] + case other => other + } + case _ => + throw new IllegalStateException("operating on canonicalization plan") + } + } else { + UnknownPartitioning(partitionSpecs.length) + } + } + + override def stringArgs: Iterator[Any] = Iterator(description) + + private def shuffleStage = child match { + case stage: ShuffleQueryStageExec => Some(stage) + case _ => None + } + + private lazy val shuffleRDD: RDD[_] = { + shuffleStage.map { stage => + stage.shuffle.getShuffleRDD(partitionSpecs.toArray) + }.getOrElse { + throw new IllegalStateException("operating on canonicalized plan") + } + } + + override protected def doExecute(): RDD[InternalRow] = { + shuffleRDD.asInstanceOf[RDD[InternalRow]] + } + + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = { + shuffleRDD.asInstanceOf[RDD[ColumnarBatch]] + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala index e5642991c59a3..0f2868e41cc39 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.adaptive -import org.apache.spark.MapOutputStatistics import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Join, LogicalPlan, NO_BROADCAST_HASH} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf @@ -30,10 +29,10 @@ case class DemoteBroadcastHashJoin(conf: SQLConf) extends Rule[LogicalPlan] { private def shouldDemote(plan: LogicalPlan): Boolean = plan match { case LogicalQueryStage(_, stage: ShuffleQueryStageExec) if stage.resultOption.isDefined - && stage.resultOption.get != null => - val mapOutputStatistics = stage.resultOption.get.asInstanceOf[MapOutputStatistics] - val partitionCnt = mapOutputStatistics.bytesByPartitionId.length - val nonZeroCnt = mapOutputStatistics.bytesByPartitionId.count(_ > 0) + && stage.mapStats.isDefined => + val mapStats = stage.mapStats.get + val partitionCnt = mapStats.bytesByPartitionId.length + val nonZeroCnt = mapStats.bytesByPartitionId.count(_ > 0) partitionCnt > 0 && nonZeroCnt > 0 && (nonZeroCnt * 1.0 / partitionCnt) < conf.nonEmptyPartitionRatioForBroadcastJoin case _ => false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala index 04696209ce10e..754225dd3fe95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution.adaptive import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, 
DynamicPruningSubquery, ListQuery, Literal, SubqueryExpression} +import org.apache.spark.sql.catalyst.expressions.{DynamicPruningSubquery, ListQuery, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.UnspecifiedDistribution import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.command.ExecutedCommandExec +import org.apache.spark.sql.execution.command.{DataWritingCommandExec, ExecutedCommandExec} +import org.apache.spark.sql.execution.datasources.v2.V2CommandExec import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.internal.SQLConf @@ -41,49 +41,62 @@ case class InsertAdaptiveSparkPlan( private val conf = adaptiveExecutionContext.session.sessionState.conf - def containShuffle(plan: SparkPlan): Boolean = { - plan.find { - case _: Exchange => true - case s: SparkPlan => !s.requiredChildDistribution.forall(_ == UnspecifiedDistribution) - }.isDefined - } - - def containSubQuery(plan: SparkPlan): Boolean = { - plan.find(_.expressions.exists(_.find { - case _: SubqueryExpression => true - case _ => false - }.isDefined)).isDefined - } - override def apply(plan: SparkPlan): SparkPlan = applyInternal(plan, false) private def applyInternal(plan: SparkPlan, isSubquery: Boolean): SparkPlan = plan match { + case _ if !conf.adaptiveExecutionEnabled => plan case _: ExecutedCommandExec => plan - case _ if conf.adaptiveExecutionEnabled && supportAdaptive(plan) - && (isSubquery || containShuffle(plan) || containSubQuery(plan)) => - try { - // Plan sub-queries recursively and pass in the shared stage cache for exchange reuse. Fall - // back to non-adaptive mode if adaptive execution is supported in any of the sub-queries. - val subqueryMap = buildSubqueryMap(plan) - val planSubqueriesRule = PlanAdaptiveSubqueries(subqueryMap) - val preprocessingRules = Seq( - planSubqueriesRule) - // Run pre-processing rules. - val newPlan = AdaptiveSparkPlanExec.applyPhysicalRules(plan, preprocessingRules) - logDebug(s"Adaptive execution enabled for plan: $plan") - AdaptiveSparkPlanExec(newPlan, adaptiveExecutionContext, preprocessingRules, isSubquery) - } catch { - case SubqueryAdaptiveNotSupportedException(subquery) => - logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + - s"but is not supported for sub-query: $subquery.") - plan - } - case _ => - if (conf.adaptiveExecutionEnabled) { + case c: DataWritingCommandExec => c.copy(child = apply(c.child)) + case c: V2CommandExec => c.withNewChildren(c.children.map(apply)) + case _ if shouldApplyAQE(plan, isSubquery) => + if (supportAdaptive(plan)) { + try { + // Plan sub-queries recursively and pass in the shared stage cache for exchange reuse. + // Fall back to non-AQE mode if AQE is not supported in any of the sub-queries. + val subqueryMap = buildSubqueryMap(plan) + val planSubqueriesRule = PlanAdaptiveSubqueries(subqueryMap) + val preprocessingRules = Seq( + planSubqueriesRule) + // Run pre-processing rules. 
+ val newPlan = AdaptiveSparkPlanExec.applyPhysicalRules(plan, preprocessingRules) + logDebug(s"Adaptive execution enabled for plan: $plan") + AdaptiveSparkPlanExec(newPlan, adaptiveExecutionContext, preprocessingRules, isSubquery) + } catch { + case SubqueryAdaptiveNotSupportedException(subquery) => + logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + + s"but is not supported for sub-query: $subquery.") + plan + } + } else { logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + s"but is not supported for query: $plan.") + plan } - plan + + case _ => plan + } + + // AQE is only useful when the query has exchanges or sub-queries. This method returns true if + // one of the following conditions is satisfied: + // - The config ADAPTIVE_EXECUTION_FORCE_APPLY is true. + // - The input query is from a sub-query. When this happens, it means we've already decided to + // apply AQE for the main query and we must continue to do it. + // - The query contains exchanges. + // - The query may need to add exchanges. It's an overkill to run `EnsureRequirements` here, so + // we just check `SparkPlan.requiredChildDistribution` and see if it's possible that the + // the query needs to add exchanges later. + // - The query contains sub-query. + private def shouldApplyAQE(plan: SparkPlan, isSubquery: Boolean): Boolean = { + conf.getConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY) || isSubquery || { + plan.find { + case _: Exchange => true + case p if !p.requiredChildDistribution.forall(_ == UnspecifiedDistribution) => true + case p => p.expressions.exists(_.find { + case _: SubqueryExpression => true + case _ => false + }.isDefined) + }.isDefined + } } private def supportAdaptive(plan: SparkPlan): Boolean = { @@ -102,36 +115,25 @@ case class InsertAdaptiveSparkPlan( * For each sub-query, generate the adaptive execution plan for each sub-query by applying this * rule, or reuse the execution plan from another sub-query of the same semantics if possible. 
*/ - private def buildSubqueryMap(plan: SparkPlan): mutable.HashMap[Long, ExecSubqueryExpression] = { - val subqueryMap = mutable.HashMap.empty[Long, ExecSubqueryExpression] + private def buildSubqueryMap(plan: SparkPlan): Map[Long, SubqueryExec] = { + val subqueryMap = mutable.HashMap.empty[Long, SubqueryExec] plan.foreach(_.expressions.foreach(_.foreach { case expressions.ScalarSubquery(p, _, exprId) if !subqueryMap.contains(exprId.id) => val executedPlan = compileSubquery(p) verifyAdaptivePlan(executedPlan, p) - val scalarSubquery = execution.ScalarSubquery( - SubqueryExec(s"subquery${exprId.id}", executedPlan), exprId) - subqueryMap.put(exprId.id, scalarSubquery) - case expressions.InSubquery(values, ListQuery(query, _, exprId, _)) + val subquery = SubqueryExec(s"subquery#${exprId.id}", executedPlan) + subqueryMap.put(exprId.id, subquery) + case expressions.InSubquery(_, ListQuery(query, _, exprId, _)) if !subqueryMap.contains(exprId.id) => val executedPlan = compileSubquery(query) verifyAdaptivePlan(executedPlan, query) - val expr = if (values.length == 1) { - values.head - } else { - CreateNamedStruct( - values.zipWithIndex.flatMap { case (v, index) => - Seq(Literal(s"col_$index"), v) - } - ) - } - val inSubquery = InSubqueryExec(expr, - SubqueryExec(s"subquery#${exprId.id}", executedPlan), exprId) - subqueryMap.put(exprId.id, inSubquery) + val subquery = SubqueryExec(s"subquery#${exprId.id}", executedPlan) + subqueryMap.put(exprId.id, subquery) case _ => })) - subqueryMap + subqueryMap.toMap } def compileSubquery(plan: LogicalPlan): SparkPlan = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala deleted file mode 100644 index 19b78f5e36c9b..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.adaptive - -import scala.collection.mutable.ArrayBuffer - -import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} - -/** - * The [[Partition]] used by [[LocalShuffledRowRDD]]. - * @param mapIndex the index of mapper. - * @param startPartition the start partition ID in mapIndex mapper. - * @param endPartition the end partition ID in mapIndex mapper. 
- */ -private final class LocalShuffledRowRDDPartition( - override val index: Int, - val mapIndex: Int, - val startPartition: Int, - val endPartition: Int) extends Partition { -} - -/** - * This is a specialized version of [[org.apache.spark.sql.execution.ShuffledRowRDD]]. This is used - * in Spark SQL adaptive execution when a shuffle join is converted to broadcast join at runtime - * because the map output of one input table is small enough for broadcast. This RDD represents the - * data of another input table of the join that reads from shuffle. Each partition of the RDD reads - * the whole data from just one mapper output locally. So actually there is no data transferred - * from the network. - * - * This RDD takes a [[ShuffleDependency]] (`dependency`). - * - * The `dependency` has the parent RDD of this RDD, which represents the dataset before shuffle - * (i.e. map output). Elements of this RDD are (partitionId, Row) pairs. - * Partition ids should be in the range [0, numPartitions - 1]. - * `dependency.partitioner.numPartitions` is the number of pre-shuffle partitions. (i.e. the number - * of partitions of the map output). The post-shuffle partition number is the same to the parent - * RDD's partition number. - * - * `partitionStartIndicesPerMapper` specifies how to split the shuffle blocks of each mapper into - * one or more partitions. For a mapper `i`, the `j`th partition includes shuffle blocks from - * `partitionStartIndicesPerMapper[i][j]` to `partitionStartIndicesPerMapper[i][j+1]` (exclusive). - */ -class LocalShuffledRowRDD( - var dependency: ShuffleDependency[Int, InternalRow, InternalRow], - metrics: Map[String, SQLMetric], - partitionStartIndicesPerMapper: Array[Array[Int]]) - extends RDD[InternalRow](dependency.rdd.context, Nil) { - - private[this] val numReducers = dependency.partitioner.numPartitions - private[this] val numMappers = dependency.rdd.partitions.length - - override def getDependencies: Seq[Dependency[_]] = List(dependency) - - override def getPartitions: Array[Partition] = { - val partitions = ArrayBuffer[LocalShuffledRowRDDPartition]() - for (mapIndex <- 0 until numMappers) { - (partitionStartIndicesPerMapper(mapIndex) :+ numReducers).sliding(2, 1).foreach { - case Array(start, end) => - partitions += new LocalShuffledRowRDDPartition(partitions.length, mapIndex, start, end) - } - } - partitions.toArray - } - - override def getPreferredLocations(partition: Partition): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - tracker.getMapLocation(dependency, partition.index, partition.index + 1) - } - - override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { - val localRowPartition = split.asInstanceOf[LocalShuffledRowRDDPartition] - val mapIndex = localRowPartition.mapIndex - val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() - // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, - // as well as the `tempMetrics` for basic shuffle metrics. 
- val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) - - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - mapIndex, - mapIndex + 1, - localRowPartition.startPartition, - localRowPartition.endPartition, - context, - sqlMetricsReporter) - reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) - } - - override def clearDependencies() { - super.clearDependencies() - dependency = null - } -} - diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala index a8d8f358ab660..31d1f34b64a65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala @@ -17,13 +17,9 @@ package org.apache.spark.sql.execution.adaptive -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} -import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ShuffleExchangeExec} +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.internal.SQLConf @@ -67,27 +63,33 @@ case class OptimizeLocalShuffleReader(conf: SQLConf) extends Rule[SparkPlan] { } } - private def createLocalReader(plan: SparkPlan): LocalShuffleReaderExec = { + private def createLocalReader(plan: SparkPlan): CustomShuffleReaderExec = { plan match { - case c @ CoalescedShuffleReaderExec(s: ShuffleQueryStageExec, _) => - LocalShuffleReaderExec( - s, getPartitionStartIndices(s, Some(c.partitionIndices.length))) + case c @ CustomShuffleReaderExec(s: ShuffleQueryStageExec, _, _) => + CustomShuffleReaderExec( + s, getPartitionSpecs(s, Some(c.partitionSpecs.length)), LOCAL_SHUFFLE_READER_DESCRIPTION) case s: ShuffleQueryStageExec => - LocalShuffleReaderExec(s, getPartitionStartIndices(s, None)) + CustomShuffleReaderExec(s, getPartitionSpecs(s, None), LOCAL_SHUFFLE_READER_DESCRIPTION) } } // TODO: this method assumes all shuffle blocks are the same data size. We should calculate the // partition start indices based on block size to avoid data skew. 
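To make the local-reader construction below concrete: each mapper gets one PartialMapperPartitionSpec per reducer range, and the ranges come from splitting the reducer space as evenly as possible. The equallyDivide helper here is an assumed even-split implementation (the real one lives elsewhere in Spark and is not shown in this diff); the sliding-window construction mirrors the hunk that follows.

object LocalReaderSpecSketch {
  case class PartialMapperSpec(mapIndex: Int, startReducerIndex: Int, endReducerIndex: Int)

  // Split [0, numElements) into numBuckets contiguous ranges, returning each range's start index.
  def equallyDivide(numElements: Int, numBuckets: Int): Seq[Int] = {
    val perBucket = numElements / numBuckets
    val remaining = numElements % numBuckets
    val splitPoint = (perBucket + 1) * remaining
    (0 until remaining).map(_ * (perBucket + 1)) ++
      (remaining until numBuckets).map(i => splitPoint + (i - remaining) * perBucket)
  }

  def buildSpecs(numMappers: Int, numReducers: Int, expectedParallelism: Int): Seq[PartialMapperSpec] = {
    val splitPoints =
      if (numMappers == 0) Seq.empty
      else equallyDivide(numReducers, math.max(1, expectedParallelism / numMappers))
    (0 until numMappers).flatMap { mapIndex =>
      (splitPoints :+ numReducers).sliding(2).map {
        case Seq(start, end) => PartialMapperSpec(mapIndex, start, end)
      }
    }
  }

  def main(args: Array[String]): Unit = {
    // 2 mappers, 6 reducer partitions, advisory parallelism 4 -> 2 reducer ranges per mapper.
    buildSpecs(numMappers = 2, numReducers = 6, expectedParallelism = 4).foreach(println)
    // PartialMapperSpec(0,0,3), PartialMapperSpec(0,3,6), PartialMapperSpec(1,0,3), PartialMapperSpec(1,3,6)
  }
}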
- private def getPartitionStartIndices( + private def getPartitionSpecs( shuffleStage: ShuffleQueryStageExec, - advisoryParallelism: Option[Int]): Array[Array[Int]] = { - val shuffleDep = shuffleStage.shuffle.shuffleDependency - val numReducers = shuffleDep.partitioner.numPartitions + advisoryParallelism: Option[Int]): Seq[ShufflePartitionSpec] = { + val numMappers = shuffleStage.shuffle.numMappers + val numReducers = shuffleStage.shuffle.numPartitions val expectedParallelism = advisoryParallelism.getOrElse(numReducers) - val numMappers = shuffleDep.rdd.getNumPartitions - Array.fill(numMappers) { - equallyDivide(numReducers, math.max(1, expectedParallelism / numMappers)).toArray + val splitPoints = if (numMappers == 0) { + Seq.empty + } else { + equallyDivide(numReducers, math.max(1, expectedParallelism / numMappers)) + } + (0 until numMappers).flatMap { mapIndex => + (splitPoints :+ numReducers).sliding(2).map { + case Seq(start, end) => PartialMapperPartitionSpec(mapIndex, start, end) + } } } @@ -120,6 +122,8 @@ case class OptimizeLocalShuffleReader(conf: SQLConf) extends Rule[SparkPlan] { object OptimizeLocalShuffleReader { + val LOCAL_SHUFFLE_READER_DESCRIPTION: String = "local" + object BroadcastJoinWithShuffleLeft { def unapply(plan: SparkPlan): Option[(SparkPlan, BuildSide)] = plan match { case join: BroadcastHashJoinExec if canUseLocalShuffleReader(join.left) => @@ -137,61 +141,10 @@ object OptimizeLocalShuffleReader { } def canUseLocalShuffleReader(plan: SparkPlan): Boolean = plan match { - case s: ShuffleQueryStageExec => s.shuffle.canChangeNumPartitions - case CoalescedShuffleReaderExec(s: ShuffleQueryStageExec, _) => s.shuffle.canChangeNumPartitions + case s: ShuffleQueryStageExec => + s.shuffle.canChangeNumPartitions && s.mapStats.isDefined + case CustomShuffleReaderExec(s: ShuffleQueryStageExec, _, _) => + s.shuffle.canChangeNumPartitions && s.mapStats.isDefined case _ => false } } - -/** - * A wrapper of shuffle query stage, which submits one or more reduce tasks per mapper to read the - * shuffle files written by one mapper. By doing this, it's very likely to read the shuffle files - * locally, as the shuffle files that a reduce task needs to read are in one node. - * - * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during - * canonicalization. - * @param partitionStartIndicesPerMapper A mapper usually writes many shuffle blocks, and it's - * better to launch multiple tasks to read shuffle blocks of - * one mapper. This array contains the partition start - * indices for each mapper. - */ -case class LocalShuffleReaderExec( - child: SparkPlan, - partitionStartIndicesPerMapper: Array[Array[Int]]) extends UnaryExecNode { - - override def output: Seq[Attribute] = child.output - - override lazy val outputPartitioning: Partitioning = { - // when we read one mapper per task, then the output partitioning is the same as the plan - // before shuffle. 
- if (partitionStartIndicesPerMapper.forall(_.length == 1)) { - child match { - case ShuffleQueryStageExec(_, s: ShuffleExchangeExec) => - s.child.outputPartitioning - case ShuffleQueryStageExec(_, r @ ReusedExchangeExec(_, s: ShuffleExchangeExec)) => - s.child.outputPartitioning match { - case e: Expression => r.updateAttr(e).asInstanceOf[Partitioning] - case other => other - } - case _ => - throw new IllegalStateException("operating on canonicalization plan") - } - } else { - UnknownPartitioning(partitionStartIndicesPerMapper.map(_.length).sum) - } - } - - private var cachedShuffleRDD: RDD[InternalRow] = null - - override protected def doExecute(): RDD[InternalRow] = { - if (cachedShuffleRDD == null) { - cachedShuffleRDD = child match { - case stage: ShuffleQueryStageExec => - stage.shuffle.createLocalShuffleRDD(partitionStartIndicesPerMapper) - case _ => - throw new IllegalStateException("operating on canonicalization plan") - } - } - cachedShuffleRDD - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 74b7fbd317fc8..b3b3eb2151f5e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -18,20 +18,41 @@ package org.apache.spark.sql.execution.adaptive import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer + +import org.apache.commons.io.FileUtils import org.apache.spark.{MapOutputStatistics, MapOutputTrackerMaster, SparkEnv} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.internal.SQLConf +/** + * A rule to optimize skewed joins to avoid straggler tasks whose share of data is significantly + * larger than that of the rest of the tasks. + * + * The general idea is to divide each skewed partition into smaller partitions and replicate its + * matching partition on the other side of the join so that they can run in parallel tasks. + * Note that when matching partitions from the left side and the right side both have skew, + * it will become a cartesian product of splits from left and right joining together. + * + * For example, assume the Sort-Merge join has 4 partitions: + * left: [L1, L2, L3, L4] + * right: [R1, R2, R3, R4] + * + * Let's say L2, L4 and R3, R4 are skewed, and each of them gets split into 2 sub-partitions. This + * is scheduled to run 4 tasks at the beginning: (L1, R1), (L2, R2), (L3, R3), (L4, R4). + * This rule expands it to 9 tasks to increase parallelism: + * (L1, R1), + * (L2-1, R2), (L2-2, R2), + * (L3, R3-1), (L3, R3-2), + * (L4-1, R4-1), (L4-2, R4-1), (L4-1, R4-2), (L4-2, R4-2) + * + * Note that, when this rule is enabled, it also coalesces non-skewed partitions like + * `CoalesceShufflePartitions` does.
+ */ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { private val ensureRequirements = EnsureRequirements(conf) @@ -41,21 +62,17 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { /** * A partition is considered as a skewed partition if its size is larger than the median - * partition size * spark.sql.adaptive.skewedPartitionFactor and also larger than - * spark.sql.adaptive.skewedPartitionSizeThreshold. + * partition size * SKEW_JOIN_SKEWED_PARTITION_FACTOR and also larger than + * SKEW_JOIN_SKEWED_PARTITION_THRESHOLD. */ - private def isSkewed( - stats: MapOutputStatistics, - partitionId: Int, - medianSize: Long): Boolean = { - val size = stats.bytesByPartitionId(partitionId) - size > medianSize * conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_FACTOR) && - size > conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD) + private def isSkewed(size: Long, medianSize: Long): Boolean = { + size > medianSize * conf.getConf(SQLConf.SKEW_JOIN_SKEWED_PARTITION_FACTOR) && + size > conf.getConf(SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD) } - private def medianSize(stats: MapOutputStatistics): Long = { - val numPartitions = stats.bytesByPartitionId.length - val bytes = stats.bytesByPartitionId.sorted + private def medianSize(sizes: Seq[Long]): Long = { + val numPartitions = sizes.length + val bytes = sizes.sorted numPartitions match { case _ if (numPartitions % 2 == 0) => math.max((bytes(numPartitions / 2) + bytes(numPartitions / 2 - 1)) / 2, 1) @@ -63,6 +80,19 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { } } + /** + * The goal of skew join optimization is to make the data distribution more even. The target size + * to split skewed partitions is the average size of the non-skewed partitions, or the + * advisory partition size if the average size is smaller than it. + */ + private def targetSize(sizes: Seq[Long], medianSize: Long): Long = { + val advisorySize = conf.getConf(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) + val nonSkewSizes = sizes.filterNot(isSkewed(_, medianSize)) + // It's impossible that all the partitions are skewed, as we use median size to define skew. + assert(nonSkewSizes.nonEmpty) + math.max(advisorySize, nonSkewSizes.sum / nonSkewSizes.length) + } + /** * Get the map size of the specific reduce shuffle Id. */ @@ -72,57 +102,43 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { } /** - * Split the skewed partition based on the map size and the max split number. + * Splits the skewed partition based on the map size and the target partition size + * after split, and creates a list of `PartialReducerPartitionSpec`. Returns None if it can't be split.
*/ - private def getMapStartIndices(stage: ShuffleQueryStageExec, partitionId: Int): Array[Int] = { - val shuffleId = stage.shuffle.shuffleDependency.shuffleHandle.shuffleId - val mapPartitionSizes = getMapSizesForReduceId(shuffleId, partitionId) - val maxSplits = math.min(conf.getConf( - SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_MAX_SPLITS), mapPartitionSizes.length) - val avgPartitionSize = mapPartitionSizes.sum / maxSplits - val advisoryPartitionSize = math.max(avgPartitionSize, - conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD)) - val partitionStartIndices = ArrayBuffer[Int]() - partitionStartIndices += 0 - var i = 0 - var postMapPartitionSize = 0L - while (i < mapPartitionSizes.length) { - val nextMapPartitionSize = mapPartitionSizes(i) - if (i > 0 && postMapPartitionSize + nextMapPartitionSize > advisoryPartitionSize) { - partitionStartIndices += i - postMapPartitionSize = nextMapPartitionSize - } else { - postMapPartitionSize += nextMapPartitionSize - } - i += 1 + private def createSkewPartitionSpecs( + shuffleId: Int, + reducerId: Int, + targetSize: Long): Option[Seq[PartialReducerPartitionSpec]] = { + val mapPartitionSizes = getMapSizesForReduceId(shuffleId, reducerId) + val mapStartIndices = ShufflePartitionsUtil.splitSizeListByTargetSize( + mapPartitionSizes, targetSize) + if (mapStartIndices.length > 1) { + Some(mapStartIndices.indices.map { i => + val startMapIndex = mapStartIndices(i) + val endMapIndex = if (i == mapStartIndices.length - 1) { + mapPartitionSizes.length + } else { + mapStartIndices(i + 1) + } + PartialReducerPartitionSpec(reducerId, startMapIndex, endMapIndex) + }) + } else { + None } - - if (partitionStartIndices.size > maxSplits) { - partitionStartIndices.take(maxSplits).toArray - } else partitionStartIndices.toArray - } - - private def getStatistics(stage: ShuffleQueryStageExec): MapOutputStatistics = { - assert(stage.resultOption.isDefined, "ShuffleQueryStageExec should" + - " already be ready when executing OptimizeSkewedPartitions rule") - stage.resultOption.get.asInstanceOf[MapOutputStatistics] } - private def supportSplitOnLeftPartition(joinType: JoinType) = { + private def canSplitLeftSide(joinType: JoinType) = { joinType == Inner || joinType == Cross || joinType == LeftSemi || joinType == LeftAnti || joinType == LeftOuter } - private def supportSplitOnRightPartition(joinType: JoinType) = { + private def canSplitRightSide(joinType: JoinType) = { joinType == Inner || joinType == Cross || joinType == RightOuter } - private def getNumMappers(stage: ShuffleQueryStageExec): Int = { - stage.shuffle.shuffleDependency.rdd.partitions.length - } - - private def getSizeInfo(medianSize: Long, maxSize: Long): String = { - s"median size: $medianSize, max size: ${maxSize}" + private def getSizeInfo(medianSize: Long, sizes: Seq[Long]): String = { + s"median size: $medianSize, max size: ${sizes.max}, min size: ${sizes.min}, avg size: " + + sizes.sum / sizes.length } /* @@ -130,88 +146,108 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { * 1. Check whether the shuffle partition is skewed based on the median size * and the skewed partition threshold in origin smj. * 2. Assuming partition0 is skewed in left side, and it has 5 mappers (Map0, Map1...Map4). - * And we will split the 5 Mappers into 3 mapper ranges [(Map0, Map1), (Map2, Map3), (Map4)] + * And we may split the 5 Mappers into 3 mapper ranges [(Map0, Map1), (Map2, Map3), (Map4)] * based on the map size and the max split number. - * 3. 
Create the 3 smjs with separately reading the above mapper ranges and then join with - * the Partition0 in right side. - * 4. Finally union the above 3 split smjs and the origin smj. + * 3. Wrap the join left child with a special shuffle reader that reads each mapper range with one + * task, so total 3 tasks. + * 4. Wrap the join right child with a special shuffle reader that reads partition0 3 times by + * 3 tasks separately. */ def optimizeSkewJoin(plan: SparkPlan): SparkPlan = plan.transformUp { - case smj @ SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, - s1 @ SortExec(_, _, left: ShuffleQueryStageExec, _), - s2 @ SortExec(_, _, right: ShuffleQueryStageExec, _), _) - if (supportedJoinTypes.contains(joinType)) => - val leftStats = getStatistics(left) - val rightStats = getStatistics(right) - val numPartitions = leftStats.bytesByPartitionId.length - - val leftMedSize = medianSize(leftStats) - val rightMedSize = medianSize(rightStats) + case smj @ SortMergeJoinExec(_, _, joinType, _, + s1 @ SortExec(_, _, ShuffleStage(left: ShuffleStageInfo), _), + s2 @ SortExec(_, _, ShuffleStage(right: ShuffleStageInfo), _), _) + if supportedJoinTypes.contains(joinType) => + assert(left.partitionsWithSizes.length == right.partitionsWithSizes.length) + val numPartitions = left.partitionsWithSizes.length + // Use the median size of the actual (coalesced) partition sizes to detect skewed partitions. + val leftMedSize = medianSize(left.partitionsWithSizes.map(_._2)) + val rightMedSize = medianSize(right.partitionsWithSizes.map(_._2)) logDebug( s""" - |Try to optimize skewed join. - |Left side partition size: - |${getSizeInfo(leftMedSize, leftStats.bytesByPartitionId.max)} - |Right side partition size: - |${getSizeInfo(rightMedSize, rightStats.bytesByPartitionId.max)} + |Optimizing skewed join. + |Left side partitions size info: + |${getSizeInfo(leftMedSize, left.partitionsWithSizes.map(_._2))} + |Right side partitions size info: + |${getSizeInfo(rightMedSize, right.partitionsWithSizes.map(_._2))} """.stripMargin) - - val skewedPartitions = mutable.HashSet[Int]() - val subJoins = mutable.ArrayBuffer[SparkPlan]() - for (partitionId <- 0 until numPartitions) { - val isLeftSkew = isSkewed(leftStats, partitionId, leftMedSize) - val isRightSkew = isSkewed(rightStats, partitionId, rightMedSize) - val leftMapIdStartIndices = if (isLeftSkew && supportSplitOnLeftPartition(joinType)) { - getMapStartIndices(left, partitionId) + val canSplitLeft = canSplitLeftSide(joinType) + val canSplitRight = canSplitRightSide(joinType) + // We use the actual partition sizes (may be coalesced) to calculate target size, so that + // the final data distribution is even (coalesced partitions + split partitions). 
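// Illustrative numbers only (the skew factor of 10, skew threshold of 256MB and advisory size of
// 64MB below are assumed defaults for the sketch, not values taken from this patch): with
// left-side sizes [20MB, 30MB, 25MB, 1GB], medianSize is (25MB + 30MB) / 2 ~= 27MB, so only the
// 1GB partition is skewed (1GB > 10 * 27MB and 1GB > 256MB). The non-skewed average is 25MB,
// hence targetSize = max(64MB, 25MB) = 64MB, and createSkewPartitionSpecs would split the skewed
// partition into map-range chunks of roughly 64MB each.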
+ val leftActualSizes = left.partitionsWithSizes.map(_._2) + val rightActualSizes = right.partitionsWithSizes.map(_._2) + val leftTargetSize = targetSize(leftActualSizes, leftMedSize) + val rightTargetSize = targetSize(rightActualSizes, rightMedSize) + + val leftSidePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec] + val rightSidePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec] + val leftSkewDesc = new SkewDesc + val rightSkewDesc = new SkewDesc + for (partitionIndex <- 0 until numPartitions) { + val isLeftSkew = isSkewed(leftActualSizes(partitionIndex), leftMedSize) && canSplitLeft + val leftPartSpec = left.partitionsWithSizes(partitionIndex)._1 + val isLeftCoalesced = leftPartSpec.startReducerIndex + 1 < leftPartSpec.endReducerIndex + + val isRightSkew = isSkewed(rightActualSizes(partitionIndex), rightMedSize) && canSplitRight + val rightPartSpec = right.partitionsWithSizes(partitionIndex)._1 + val isRightCoalesced = rightPartSpec.startReducerIndex + 1 < rightPartSpec.endReducerIndex + + // A skewed partition should never be coalesced, but skip it here just to be safe. + val leftParts = if (isLeftSkew && !isLeftCoalesced) { + val reducerId = leftPartSpec.startReducerIndex + val skewSpecs = createSkewPartitionSpecs( + left.mapStats.shuffleId, reducerId, leftTargetSize) + if (skewSpecs.isDefined) { + logDebug(s"Left side partition $partitionIndex is skewed, split it into " + + s"${skewSpecs.get.length} parts.") + leftSkewDesc.addPartitionSize(leftActualSizes(partitionIndex)) + } + skewSpecs.getOrElse(Seq(leftPartSpec)) } else { - Array(0) + Seq(leftPartSpec) } - val rightMapIdStartIndices = if (isRightSkew && supportSplitOnRightPartition(joinType)) { - getMapStartIndices(right, partitionId) + + // A skewed partition should never be coalesced, but skip it here just to be safe. 
+ val rightParts = if (isRightSkew && !isRightCoalesced) { + val reducerId = rightPartSpec.startReducerIndex + val skewSpecs = createSkewPartitionSpecs( + right.mapStats.shuffleId, reducerId, rightTargetSize) + if (skewSpecs.isDefined) { + logDebug(s"Right side partition $partitionIndex is skewed, split it into " + + s"${skewSpecs.get.length} parts.") + rightSkewDesc.addPartitionSize(rightActualSizes(partitionIndex)) + } + skewSpecs.getOrElse(Seq(rightPartSpec)) } else { - Array(0) + Seq(rightPartSpec) } - if (leftMapIdStartIndices.length > 1 || rightMapIdStartIndices.length > 1) { - skewedPartitions += partitionId - for (i <- 0 until leftMapIdStartIndices.length; - j <- 0 until rightMapIdStartIndices.length) { - val leftEndMapId = if (i == leftMapIdStartIndices.length - 1) { - getNumMappers(left) - } else { - leftMapIdStartIndices(i + 1) - } - val rightEndMapId = if (j == rightMapIdStartIndices.length - 1) { - getNumMappers(right) - } else { - rightMapIdStartIndices(j + 1) - } - // TODO: we may can optimize the sort merge join to broad cast join after - // obtaining the raw data size of per partition, - val leftSkewedReader = SkewedPartitionReaderExec( - left, partitionId, leftMapIdStartIndices(i), leftEndMapId) - val rightSkewedReader = SkewedPartitionReaderExec(right, partitionId, - rightMapIdStartIndices(j), rightEndMapId) - subJoins += SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, - s1.copy(child = leftSkewedReader), s2.copy(child = rightSkewedReader), true) - } + for { + leftSidePartition <- leftParts + rightSidePartition <- rightParts + } { + leftSidePartitions += leftSidePartition + rightSidePartitions += rightSidePartition } } - logDebug(s"number of skewed partitions is ${skewedPartitions.size}") - if (skewedPartitions.nonEmpty) { - val optimizedSmj = smj.copy( - left = s1.copy(child = PartialShuffleReaderExec(left, skewedPartitions.toSet)), - right = s2.copy(child = PartialShuffleReaderExec(right, skewedPartitions.toSet)), - isPartial = true) - subJoins += optimizedSmj - UnionExec(subJoins) + + logDebug("number of skewed partitions: " + + s"left ${leftSkewDesc.numPartitions}, right ${rightSkewDesc.numPartitions}") + if (leftSkewDesc.numPartitions > 0 || rightSkewDesc.numPartitions > 0) { + val newLeft = CustomShuffleReaderExec( + left.shuffleStage, leftSidePartitions, leftSkewDesc.toString) + val newRight = CustomShuffleReaderExec( + right.shuffleStage, rightSidePartitions, rightSkewDesc.toString) + smj.copy( + left = s1.copy(child = newLeft), right = s2.copy(child = newRight), isSkewJoin = true) } else { smj } } override def apply(plan: SparkPlan): SparkPlan = { - if (!conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_JOIN_ENABLED)) { + if (!conf.getConf(SQLConf.SKEW_JOIN_ENABLED)) { return plan } @@ -224,7 +260,7 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { if (shuffleStages.length == 2) { // When multi table join, there will be too many complex combination to consider. - // Currently we only handle 2 table join like following two use cases. + // Currently we only handle 2 table join like following use case. // SMJ // Sort // Shuffle @@ -248,81 +284,71 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { } } -/** - * A wrapper of shuffle query stage, which submits one reduce task to read a single - * shuffle partition 'partitionIndex' produced by the mappers in range [startMapIndex, endMapIndex). - * This is used to increase the parallelism when reading skewed partitions. 
- * - * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange - * node during canonicalization. - * @param partitionIndex The pre shuffle partition index. - * @param startMapIndex The start map index. - * @param endMapIndex The end map index. - */ -case class SkewedPartitionReaderExec( - child: QueryStageExec, - partitionIndex: Int, - startMapIndex: Int, - endMapIndex: Int) extends LeafExecNode { - - override def output: Seq[Attribute] = child.output - - override def outputPartitioning: Partitioning = { - UnknownPartitioning(1) - } - private var cachedSkewedShuffleRDD: SkewedShuffledRowRDD = null - - override def doExecute(): RDD[InternalRow] = { - if (cachedSkewedShuffleRDD == null) { - cachedSkewedShuffleRDD = child match { - case stage: ShuffleQueryStageExec => - stage.shuffle.createSkewedShuffleRDD(partitionIndex, startMapIndex, endMapIndex) - case _ => - throw new IllegalStateException("operating on canonicalization plan") +private object ShuffleStage { + def unapply(plan: SparkPlan): Option[ShuffleStageInfo] = plan match { + case s: ShuffleQueryStageExec if s.mapStats.isDefined => + val mapStats = s.mapStats.get + val sizes = mapStats.bytesByPartitionId + val partitions = sizes.zipWithIndex.map { + case (size, i) => CoalescedPartitionSpec(i, i + 1) -> size } - } - cachedSkewedShuffleRDD + Some(ShuffleStageInfo(s, mapStats, partitions)) + + case CustomShuffleReaderExec(s: ShuffleQueryStageExec, partitionSpecs, _) + if s.mapStats.isDefined && partitionSpecs.nonEmpty => + val mapStats = s.mapStats.get + val sizes = mapStats.bytesByPartitionId + val partitions = partitionSpecs.map { + case spec @ CoalescedPartitionSpec(start, end) => + var sum = 0L + var i = start + while (i < end) { + sum += sizes(i) + i += 1 + } + spec -> sum + case other => throw new IllegalArgumentException( + s"Expect CoalescedPartitionSpec but got $other") + } + Some(ShuffleStageInfo(s, mapStats, partitions)) + + case _ => None } } -/** - * A wrapper of shuffle query stage, which skips some partitions when reading the shuffle blocks. - * - * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during - * canonicalization. - * @param excludedPartitions The partitions to skip when reading. 
- */ -case class PartialShuffleReaderExec( - child: QueryStageExec, - excludedPartitions: Set[Int]) extends UnaryExecNode { - - override def output: Seq[Attribute] = child.output +private case class ShuffleStageInfo( + shuffleStage: ShuffleQueryStageExec, + mapStats: MapOutputStatistics, + partitionsWithSizes: Seq[(CoalescedPartitionSpec, Long)]) - override def outputPartitioning: Partitioning = { - UnknownPartitioning(1) - } +private class SkewDesc { + private[this] var numSkewedPartitions: Int = 0 + private[this] var totalSize: Long = 0 + private[this] var maxSize: Long = 0 + private[this] var minSize: Long = 0 - private def shuffleExchange(): ShuffleExchangeExec = child match { - case stage: ShuffleQueryStageExec => stage.shuffle - case _ => - throw new IllegalStateException("operating on canonicalization plan") - } + def numPartitions: Int = numSkewedPartitions - private def getPartitionIndexRanges(): Array[(Int, Int)] = { - val length = shuffleExchange().shuffleDependency.partitioner.numPartitions - (0 until length).filterNot(excludedPartitions.contains).map(i => (i, i + 1)).toArray + def addPartitionSize(size: Long): Unit = { + if (numSkewedPartitions == 0) { + maxSize = size + minSize = size + } + numSkewedPartitions += 1 + totalSize += size + if (size > maxSize) maxSize = size + if (size < minSize) minSize = size } - private var cachedShuffleRDD: RDD[InternalRow] = null - - override def doExecute(): RDD[InternalRow] = { - if (cachedShuffleRDD == null) { - cachedShuffleRDD = if (excludedPartitions.isEmpty) { - child.execute() - } else { - shuffleExchange().createShuffledRDD(Some(getPartitionIndexRanges())) - } + override def toString: String = { + if (numSkewedPartitions == 0) { + "no skewed partition" + } else { + val maxSizeStr = FileUtils.byteCountToDisplaySize(maxSize) + val minSizeStr = FileUtils.byteCountToDisplaySize(minSize) + val avgSizeStr = FileUtils.byteCountToDisplaySize(totalSize / numSkewedPartitions) + s"$numSkewedPartitions skewed partitions with " + + s"size(max=$maxSizeStr, min=$minSizeStr, avg=$avgSizeStr)" } - cachedShuffleRDD } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala index 91d4359224a6a..f845b6b16ee3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala @@ -18,19 +18,28 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions.ListQuery +import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, ListQuery, Literal} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{ExecSubqueryExpression, SparkPlan} +import org.apache.spark.sql.execution +import org.apache.spark.sql.execution.{InSubqueryExec, SparkPlan, SubqueryExec} -case class PlanAdaptiveSubqueries( - subqueryMap: scala.collection.Map[Long, ExecSubqueryExpression]) extends Rule[SparkPlan] { +case class PlanAdaptiveSubqueries(subqueryMap: Map[Long, SubqueryExec]) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { plan.transformAllExpressions { case expressions.ScalarSubquery(_, _, exprId) => - subqueryMap(exprId.id) - case expressions.InSubquery(_, ListQuery(_, _, exprId, _)) => - subqueryMap(exprId.id) + 
execution.ScalarSubquery(subqueryMap(exprId.id), exprId) + case expressions.InSubquery(values, ListQuery(_, _, exprId, _)) => + val expr = if (values.length == 1) { + values.head + } else { + CreateNamedStruct( + values.zipWithIndex.flatMap { case (v, index) => + Seq(Literal(s"col_$index"), v) + } + ) + } + InSubqueryExec(expr, subqueryMap(exprId.id), exprId) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala index d5dc1be63f06e..74fe1eaab6f64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala @@ -17,17 +17,23 @@ package org.apache.spark.sql.execution.adaptive -import scala.concurrent.Future +import java.util.concurrent.TimeUnit -import org.apache.spark.{FutureAction, MapOutputStatistics} +import scala.concurrent.{Future, Promise} + +import org.apache.spark.{FutureAction, MapOutputStatistics, SparkException} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.exchange._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.ThreadUtils /** * A query stage is an independent subgraph of the query plan. Query stage materializes its output @@ -75,6 +81,11 @@ abstract class QueryStageExec extends LeafExecNode { def newReuseInstance(newStageId: Int, newOutput: Seq[Attribute]): QueryStageExec + /** + * Returns the runtime statistics after stage materialization. + */ + def getRuntimeStatistics: Statistics + /** * Compute the statistics of the query stage if executed, otherwise None. */ @@ -100,8 +111,10 @@ abstract class QueryStageExec extends LeafExecNode { override def executeTail(n: Int): Array[InternalRow] = plan.executeTail(n) override def executeToIterator(): Iterator[InternalRow] = plan.executeToIterator() - override def doPrepare(): Unit = plan.prepare() - override def doExecute(): RDD[InternalRow] = plan.execute() + protected override def doPrepare(): Unit = plan.prepare() + protected override def doExecute(): RDD[InternalRow] = plan.execute() + override def supportsColumnar: Boolean = plan.supportsColumnar + protected override def doExecuteColumnar(): RDD[ColumnarBatch] = plan.executeColumnar() override def doExecuteBroadcast[T](): Broadcast[T] = plan.executeBroadcast() override def doCanonicalize(): SparkPlan = plan.canonicalized @@ -130,20 +143,20 @@ abstract class QueryStageExec extends LeafExecNode { } /** - * A shuffle query stage whose child is a [[ShuffleExchangeExec]] or [[ReusedExchangeExec]]. + * A shuffle query stage whose child is a [[ShuffleExchangeLike]] or [[ReusedExchangeExec]]. 
*/ case class ShuffleQueryStageExec( override val id: Int, override val plan: SparkPlan) extends QueryStageExec { @transient val shuffle = plan match { - case s: ShuffleExchangeExec => s - case ReusedExchangeExec(_, s: ShuffleExchangeExec) => s + case s: ShuffleExchangeLike => s + case ReusedExchangeExec(_, s: ShuffleExchangeLike) => s case _ => throw new IllegalStateException("wrong plan for shuffle stage:\n " + plan.treeString) } - override def doMaterialize(): Future[Any] = { + override def doMaterialize(): Future[Any] = attachTree(this, "execute") { shuffle.mapOutputStatisticsFuture } @@ -161,24 +174,52 @@ case class ShuffleQueryStageExec( case _ => } } + + /** + * Returns the Option[MapOutputStatistics]. If the shuffle map stage has no partition, + * this method returns None, as there is no map statistics. + */ + def mapStats: Option[MapOutputStatistics] = { + assert(resultOption.isDefined, "ShuffleQueryStageExec should already be ready") + val stats = resultOption.get.asInstanceOf[MapOutputStatistics] + Option(stats) + } + + override def getRuntimeStatistics: Statistics = shuffle.runtimeStatistics } /** - * A broadcast query stage whose child is a [[BroadcastExchangeExec]] or [[ReusedExchangeExec]]. + * A broadcast query stage whose child is a [[BroadcastExchangeLike]] or [[ReusedExchangeExec]]. */ case class BroadcastQueryStageExec( override val id: Int, override val plan: SparkPlan) extends QueryStageExec { @transient val broadcast = plan match { - case b: BroadcastExchangeExec => b - case ReusedExchangeExec(_, b: BroadcastExchangeExec) => b + case b: BroadcastExchangeLike => b + case ReusedExchangeExec(_, b: BroadcastExchangeLike) => b case _ => throw new IllegalStateException("wrong plan for broadcast stage:\n " + plan.treeString) } + @transient private lazy val materializeWithTimeout = { + val broadcastFuture = broadcast.completionFuture + val timeout = SQLConf.get.broadcastTimeout + val promise = Promise[Any]() + val fail = BroadcastQueryStageExec.scheduledExecutor.schedule(new Runnable() { + override def run(): Unit = { + promise.tryFailure(new SparkException(s"Could not execute broadcast in $timeout secs. 
" + + s"You can increase the timeout for broadcasts via ${SQLConf.BROADCAST_TIMEOUT.key} or " + + s"disable broadcast join by setting ${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key} to -1")) + } + }, timeout, TimeUnit.SECONDS) + broadcastFuture.onComplete(_ => fail.cancel(false))(AdaptiveSparkPlanExec.executionContext) + Future.firstCompletedOf( + Seq(broadcastFuture, promise.future))(AdaptiveSparkPlanExec.executionContext) + } + override def doMaterialize(): Future[Any] = { - broadcast.completionFuture + materializeWithTimeout } override def newReuseInstance(newStageId: Int, newOutput: Seq[Attribute]): QueryStageExec = { @@ -193,4 +234,11 @@ case class BroadcastQueryStageExec( broadcast.relationFuture.cancel(true) } } + + override def getRuntimeStatistics: Statistics = broadcast.runtimeStatistics +} + +object BroadcastQueryStageExec { + private val scheduledExecutor = + ThreadUtils.newDaemonSingleThreadScheduledExecutor("BroadcastStageTimeout") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala deleted file mode 100644 index 2c50b638b4d45..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.adaptive - -import scala.collection.mutable.{ArrayBuffer, HashSet} - -import org.apache.spark.MapOutputStatistics -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{ShuffledRowRDD, SparkPlan, UnaryExecNode} -import org.apache.spark.sql.internal.SQLConf - -/** - * A rule to adjust the post shuffle partitions based on the map output statistics. - * - * The strategy used to determine the number of post-shuffle partitions is described as follows. - * To determine the number of post-shuffle partitions, we have a target input size for a - * post-shuffle partition. Once we have size statistics of all pre-shuffle partitions, we will do - * a pass of those statistics and pack pre-shuffle partitions with continuous indices to a single - * post-shuffle partition until adding another pre-shuffle partition would cause the size of a - * post-shuffle partition to be greater than the target size. 
- * - * For example, we have two stages with the following pre-shuffle partition size statistics: - * stage 1: [100 MiB, 20 MiB, 100 MiB, 10MiB, 30 MiB] - * stage 2: [10 MiB, 10 MiB, 70 MiB, 5 MiB, 5 MiB] - * assuming the target input size is 128 MiB, we will have four post-shuffle partitions, - * which are: - * - post-shuffle partition 0: pre-shuffle partition 0 (size 110 MiB) - * - post-shuffle partition 1: pre-shuffle partition 1 (size 30 MiB) - * - post-shuffle partition 2: pre-shuffle partition 2 (size 170 MiB) - * - post-shuffle partition 3: pre-shuffle partition 3 and 4 (size 50 MiB) - */ -case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { - - override def apply(plan: SparkPlan): SparkPlan = { - if (!conf.reducePostShufflePartitionsEnabled) { - return plan - } - // 'SkewedShufflePartitionReader' is added by us, so it's safe to ignore it when changing - // number of reducers. - val leafNodes = plan.collectLeaves().filter(!_.isInstanceOf[SkewedPartitionReaderExec]) - if (!leafNodes.forall(_.isInstanceOf[QueryStageExec])) { - // If not all leaf nodes are query stages, it's not safe to reduce the number of - // shuffle partitions, because we may break the assumption that all children of a spark plan - // have same number of output partitions. - return plan - } - - def collectShuffles(plan: SparkPlan): Seq[SparkPlan] = plan match { - case _: LocalShuffleReaderExec => Nil - case p: PartialShuffleReaderExec => Seq(p) - case stage: ShuffleQueryStageExec => Seq(stage) - case _ => plan.children.flatMap(collectShuffles) - } - - val shuffles = collectShuffles(plan) - val shuffleStages = shuffles.map { - case PartialShuffleReaderExec(s: ShuffleQueryStageExec, _) => s - case s: ShuffleQueryStageExec => s - } - // ShuffleExchanges introduced by repartition do not support changing the number of partitions. - // We change the number of partitions in the stage only if all the ShuffleExchanges support it. - if (!shuffleStages.forall(_.shuffle.canChangeNumPartitions)) { - plan - } else { - val shuffleMetrics = shuffleStages.map { stage => - assert(stage.resultOption.isDefined, "ShuffleQueryStageExec should already be ready") - stage.resultOption.get.asInstanceOf[MapOutputStatistics] - } - - // `ShuffleQueryStageExec` gives null mapOutputStatistics when the input RDD has 0 partitions, - // we should skip it when calculating the `partitionStartIndices`. - val validMetrics = shuffleMetrics.filter(_ != null) - // We may have different pre-shuffle partition numbers, don't reduce shuffle partition number - // in that case. For example when we union fully aggregated data (data is arranged to a single - // partition) and a result of a SortMergeJoin (multiple partitions). - val distinctNumPreShufflePartitions = - validMetrics.map(stats => stats.bytesByPartitionId.length).distinct - val distinctExcludedPartitions = shuffles.map { - case PartialShuffleReaderExec(_, excludedPartitions) => excludedPartitions - case _: ShuffleQueryStageExec => Set.empty[Int] - }.distinct - if (validMetrics.nonEmpty && distinctNumPreShufflePartitions.length == 1 - && distinctExcludedPartitions.length == 1) { - val excludedPartitions = distinctExcludedPartitions.head - val partitionIndices = estimatePartitionStartAndEndIndices( - validMetrics.toArray, excludedPartitions) - // This transformation adds new nodes, so we must use `transformUp` here. 
- // Even for shuffle exchange whose input RDD has 0 partition, we should still update its - // `partitionStartIndices`, so that all the leaf shuffles in a stage have the same - // number of output partitions. - val visitedStages = HashSet.empty[Int] - plan.transformDown { - // Replace `PartialShuffleReaderExec` with `CoalescedShuffleReaderExec`, which keeps the - // "excludedPartition" requirement and also merges some partitions. - case PartialShuffleReaderExec(stage: ShuffleQueryStageExec, _) => - visitedStages.add(stage.id) - CoalescedShuffleReaderExec(stage, partitionIndices) - - // We are doing `transformDown`, so the `ShuffleQueryStageExec` may already be optimized - // and wrapped by `CoalescedShuffleReaderExec`. - case stage: ShuffleQueryStageExec if !visitedStages.contains(stage.id) => - visitedStages.add(stage.id) - CoalescedShuffleReaderExec(stage, partitionIndices) - } - } else { - plan - } - } - } - - /** - * Estimates partition start and end indices for post-shuffle partitions based on - * mapOutputStatistics provided by all pre-shuffle stages and skip the omittedPartitions - * already handled in skewed partition optimization. - */ - // visible for testing. - private[sql] def estimatePartitionStartAndEndIndices( - mapOutputStatistics: Array[MapOutputStatistics], - excludedPartitions: Set[Int] = Set.empty): Array[(Int, Int)] = { - val minNumPostShufflePartitions = conf.minNumPostShufflePartitions - excludedPartitions.size - val advisoryTargetPostShuffleInputSize = conf.targetPostShuffleInputSize - // If minNumPostShufflePartitions is defined, it is possible that we need to use a - // value less than advisoryTargetPostShuffleInputSize as the target input size of - // a post shuffle task. - val totalPostShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId.sum).sum - // The max at here is to make sure that when we have an empty table, we - // only have a single post-shuffle partition. - // There is no particular reason that we pick 16. We just need a number to - // prevent maxPostShuffleInputSize from being set to 0. - val maxPostShuffleInputSize = math.max( - math.ceil(totalPostShuffleInputSize / minNumPostShufflePartitions.toDouble).toLong, 16) - val targetPostShuffleInputSize = - math.min(maxPostShuffleInputSize, advisoryTargetPostShuffleInputSize) - - logInfo( - s"advisoryTargetPostShuffleInputSize: $advisoryTargetPostShuffleInputSize, " + - s"targetPostShuffleInputSize $targetPostShuffleInputSize.") - - // Make sure we do get the same number of pre-shuffle partitions for those stages. - val distinctNumPreShufflePartitions = - mapOutputStatistics.map(stats => stats.bytesByPartitionId.length).distinct - // The reason that we are expecting a single value of the number of pre-shuffle partitions - // is that when we add Exchanges, we set the number of pre-shuffle partitions - // (i.e. map output partitions) using a static setting, which is the value of - // spark.sql.shuffle.partitions. Even if two input RDDs are having different - // number of partitions, they will have the same number of pre-shuffle partitions - // (i.e. map output partitions). 
- assert( - distinctNumPreShufflePartitions.length == 1, - "There should be only one distinct value of the number pre-shuffle partitions " + - "among registered Exchange operator.") - - val partitionStartIndices = ArrayBuffer[Int]() - val partitionEndIndices = ArrayBuffer[Int]() - val numPartitions = distinctNumPreShufflePartitions.head - val includedPartitions = (0 until numPartitions).filter(!excludedPartitions.contains(_)) - val firstStartIndex = includedPartitions(0) - partitionStartIndices += firstStartIndex - var postShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId(firstStartIndex)).sum - var i = firstStartIndex - includedPartitions.drop(1).foreach { nextPartitionIndex => - val nextShuffleInputSize = - mapOutputStatistics.map(_.bytesByPartitionId(nextPartitionIndex)).sum - // If nextPartitionIndices is skewed and omitted, or including - // the nextShuffleInputSize would exceed the target partition size, - // then start a new partition. - if (nextPartitionIndex != i + 1 || - (postShuffleInputSize + nextShuffleInputSize > targetPostShuffleInputSize)) { - partitionEndIndices += i + 1 - partitionStartIndices += nextPartitionIndex - // reset postShuffleInputSize. - postShuffleInputSize = nextShuffleInputSize - i = nextPartitionIndex - } else { - postShuffleInputSize += nextShuffleInputSize - i += 1 - } - } - partitionEndIndices += i + 1 - partitionStartIndices.zip(partitionEndIndices).toArray - } -} - -/** - * A wrapper of shuffle query stage, which submits fewer reduce task as one reduce task may read - * multiple shuffle partitions. This can avoid many small reduce tasks that hurt performance. - * - * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during - * canonicalization. - */ -case class CoalescedShuffleReaderExec( - child: SparkPlan, - partitionIndices: Array[(Int, Int)]) extends UnaryExecNode { - - override def output: Seq[Attribute] = child.output - - override def outputPartitioning: Partitioning = { - UnknownPartitioning(partitionIndices.length) - } - - private var cachedShuffleRDD: ShuffledRowRDD = null - - override protected def doExecute(): RDD[InternalRow] = { - if (cachedShuffleRDD == null) { - cachedShuffleRDD = child match { - case stage: ShuffleQueryStageExec => - stage.shuffle.createShuffledRDD(Some(partitionIndices)) - case _ => - throw new IllegalStateException("operating on canonicalization plan") - } - } - cachedShuffleRDD - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala new file mode 100644 index 0000000000000..e10ed4f481cf7 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsUtil.scala @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.MapOutputStatistics +import org.apache.spark.internal.Logging +import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionSpec} + +object ShufflePartitionsUtil extends Logging { + final val SMALL_PARTITION_FACTOR = 0.2 + final val MERGED_PARTITION_FACTOR = 1.2 + + /** + * Coalesce the partitions from multiple shuffles. This method assumes that all the shuffles + * have the same number of partitions, and the partitions of same index will be read together + * by one task. + * + * The strategy used to determine the number of coalesced partitions is described as follows. + * To determine the number of coalesced partitions, we have a target size for a coalesced + * partition. Once we have size statistics of all shuffle partitions, we will do + * a pass of those statistics and pack shuffle partitions with continuous indices to a single + * coalesced partition until adding another shuffle partition would cause the size of a + * coalesced partition to be greater than the target size. + * + * For example, we have two shuffles with the following partition size statistics: + * - shuffle 1 (5 partitions): [100 MiB, 20 MiB, 100 MiB, 10MiB, 30 MiB] + * - shuffle 2 (5 partitions): [10 MiB, 10 MiB, 70 MiB, 5 MiB, 5 MiB] + * Assuming the target size is 128 MiB, we will have 4 coalesced partitions, which are: + * - coalesced partition 0: shuffle partition 0 (size 110 MiB) + * - coalesced partition 1: shuffle partition 1 (size 30 MiB) + * - coalesced partition 2: shuffle partition 2 (size 170 MiB) + * - coalesced partition 3: shuffle partition 3 and 4 (size 50 MiB) + * + * @return A sequence of [[CoalescedPartitionSpec]]s. For example, if partitions [0, 1, 2, 3, 4] + * split at indices [0, 2, 3], the returned partition specs will be: + * CoalescedPartitionSpec(0, 2), CoalescedPartitionSpec(2, 3) and + * CoalescedPartitionSpec(3, 5). + */ + def coalescePartitions( + mapOutputStatistics: Array[MapOutputStatistics], + advisoryTargetSize: Long, + minNumPartitions: Int): Seq[ShufflePartitionSpec] = { + // If `minNumPartitions` is very large, it is possible that we need to use a value less than + // `advisoryTargetSize` as the target size of a coalesced task. + val totalPostShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId.sum).sum + // The max at here is to make sure that when we have an empty table, we only have a single + // coalesced partition. + // There is no particular reason that we pick 16. We just need a number to prevent + // `maxTargetSize` from being set to 0. + val maxTargetSize = math.max( + math.ceil(totalPostShuffleInputSize / minNumPartitions.toDouble).toLong, 16) + val targetSize = math.min(maxTargetSize, advisoryTargetSize) + + val shuffleIds = mapOutputStatistics.map(_.shuffleId).mkString(", ") + logInfo(s"For shuffle($shuffleIds), advisory target size: $advisoryTargetSize, " + + s"actual target size $targetSize.") + + // Make sure these shuffles have the same number of partitions. + val distinctNumShufflePartitions = + mapOutputStatistics.map(stats => stats.bytesByPartitionId.length).distinct + // The reason that we are expecting a single value of the number of shuffle partitions + // is that when we add Exchanges, we set the number of shuffle partitions + // (i.e. 
map output partitions) using a static setting, which is the value of + // `spark.sql.shuffle.partitions`. Even if two input RDDs are having different + // number of partitions, they will have the same number of shuffle partitions + // (i.e. map output partitions). + assert( + distinctNumShufflePartitions.length == 1, + "There should be only one distinct value of the number of shuffle partitions " + + "among registered Exchange operators.") + + val numPartitions = distinctNumShufflePartitions.head + val partitionSpecs = ArrayBuffer[CoalescedPartitionSpec]() + var latestSplitPoint = 0 + var coalescedSize = 0L + var i = 0 + while (i < numPartitions) { + // We calculate the total size of i-th shuffle partitions from all shuffles. + var totalSizeOfCurrentPartition = 0L + var j = 0 + while (j < mapOutputStatistics.length) { + totalSizeOfCurrentPartition += mapOutputStatistics(j).bytesByPartitionId(i) + j += 1 + } + + // If including the `totalSizeOfCurrentPartition` would exceed the target size, then start a + // new coalesced partition. + if (i > latestSplitPoint && coalescedSize + totalSizeOfCurrentPartition > targetSize) { + partitionSpecs += CoalescedPartitionSpec(latestSplitPoint, i) + latestSplitPoint = i + // reset postShuffleInputSize. + coalescedSize = totalSizeOfCurrentPartition + } else { + coalescedSize += totalSizeOfCurrentPartition + } + i += 1 + } + partitionSpecs += CoalescedPartitionSpec(latestSplitPoint, numPartitions) + + partitionSpecs + } + + /** + * Given a list of size, return an array of indices to split the list into multiple partitions, + * so that the size sum of each partition is close to the target size. Each index indicates the + * start of a partition. + */ + def splitSizeListByTargetSize(sizes: Seq[Long], targetSize: Long): Array[Int] = { + val partitionStartIndices = ArrayBuffer[Int]() + partitionStartIndices += 0 + var i = 0 + var currentPartitionSize = 0L + var lastPartitionSize = -1L + + def tryMergePartitions() = { + // When we are going to start a new partition, it's possible that the current partition or + // the previous partition is very small and it's better to merge the current partition into + // the previous partition. + val shouldMergePartitions = lastPartitionSize > -1 && + ((currentPartitionSize + lastPartitionSize) < targetSize * MERGED_PARTITION_FACTOR || + (currentPartitionSize < targetSize * SMALL_PARTITION_FACTOR || + lastPartitionSize < targetSize * SMALL_PARTITION_FACTOR)) + if (shouldMergePartitions) { + // We decide to merge the current partition into the previous one, so the start index of + // the current partition should be removed. + partitionStartIndices.remove(partitionStartIndices.length - 1) + lastPartitionSize += currentPartitionSize + } else { + lastPartitionSize = currentPartitionSize + } + } + + while (i < sizes.length) { + // If including the next size in the current partition exceeds the target size, package the + // current partition and start a new partition. 
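// Illustrative trace (not part of the original patch, values are made up): with
// sizes = [50, 3, 50] and targetSize = 40, this branch fires at i = 1 (adding start index 1) and
// again at i = 2, where tryMergePartitions() first notices that the just-finished partition
// (size 3) is smaller than targetSize * SMALL_PARTITION_FACTOR = 8 and folds it back into the
// previous partition (dropping index 1) before index 2 is added, so the method returns [0, 2],
// i.e. partitions [50, 3] and [50].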
+ if (i > 0 && currentPartitionSize + sizes(i) > targetSize) { + tryMergePartitions() + partitionStartIndices += i + currentPartitionSize = sizes(i) + } else { + currentPartitionSize += sizes(i) + } + i += 1 + } + tryMergePartitions() + partitionStartIndices.toArray + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala deleted file mode 100644 index 52f793b24aa17..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/SkewedShuffledRowRDD.scala +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.adaptive - -import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} - -/** - * The [[Partition]] used by [[SkewedShuffledRowRDD]]. - */ -class SkewedShuffledRowRDDPartition(override val index: Int) extends Partition - -/** - * This is a specialized version of [[org.apache.spark.sql.execution.ShuffledRowRDD]]. This is used - * in Spark SQL adaptive execution to solve data skew issues. This RDD includes rearranged - * partitions from mappers. - * - * This RDD takes a [[ShuffleDependency]] (`dependency`), a partitionIndex - * and the range of startMapIndex to endMapIndex. - */ -class SkewedShuffledRowRDD( - var dependency: ShuffleDependency[Int, InternalRow, InternalRow], - partitionIndex: Int, - startMapIndex: Int, - endMapIndex: Int, - metrics: Map[String, SQLMetric]) - extends RDD[InternalRow](dependency.rdd.context, Nil) { - - override def getDependencies: Seq[Dependency[_]] = List(dependency) - - override def getPartitions: Array[Partition] = { - Array(new SkewedShuffledRowRDDPartition(0)) - } - - override def getPreferredLocations(partition: Partition): Seq[String] = { - val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] - tracker.getMapLocation(dependency, startMapIndex, endMapIndex) - } - - override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { - val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() - // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, - // as well as the `tempMetrics` for basic shuffle metrics. 
- val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) - - val reader = SparkEnv.get.shuffleManager.getReaderForRange( - dependency.shuffleHandle, - startMapIndex, - endMapIndex, - partitionIndex, - partitionIndex + 1, - context, - sqlMetricsReporter) - reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) - } - - override def clearDependencies() { - super.clearDependencies() - dependency = null - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala index 67cd720bb5b33..cdc57dbc7dcc2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.execution.exchange.{ShuffleExchangeExec, ShuffleExchangeLike} /** * A simple implementation of [[Cost]], which takes a number of [[Long]] as the cost value. @@ -35,13 +35,13 @@ case class SimpleCost(value: Long) extends Cost { /** * A simple implementation of [[CostEvaluator]], which counts the number of - * [[ShuffleExchangeExec]] nodes in the plan. + * [[ShuffleExchangeLike]] nodes in the plan. */ object SimpleCostEvaluator extends CostEvaluator { override def evaluateCost(plan: SparkPlan): Cost = { val cost = plan.collect { - case s: ShuffleExchangeExec => s + case s: ShuffleExchangeLike => s }.size SimpleCost(cost) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala index 56a287d4d0279..761ac20e84744 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala @@ -135,20 +135,12 @@ object AggUtils { groupingExpressions: Seq[NamedExpression], functionsWithDistinct: Seq[AggregateExpression], functionsWithoutDistinct: Seq[AggregateExpression], + distinctExpressions: Seq[Expression], + normalizedNamedDistinctExpressions: Seq[NamedExpression], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { - // functionsWithDistinct is guaranteed to be non-empty. Even though it may contain more than one - // DISTINCT aggregate function, all of those functions will have the same column expressions. - // For example, it would be valid for functionsWithDistinct to be - // [COUNT(DISTINCT foo), MAX(DISTINCT foo)], but [COUNT(DISTINCT bar), COUNT(DISTINCT foo)] is - // disallowed because those two distinct aggregates have different column expressions. - val distinctExpressions = functionsWithDistinct.head.aggregateFunction.children - val namedDistinctExpressions = distinctExpressions.map { - case ne: NamedExpression => ne - case other => Alias(other, other.toString)() - } - val distinctAttributes = namedDistinctExpressions.map(_.toAttribute) + val distinctAttributes = normalizedNamedDistinctExpressions.map(_.toAttribute) val groupingAttributes = groupingExpressions.map(_.toAttribute) // 1. Create an Aggregate Operator for partial aggregations. @@ -159,7 +151,7 @@ object AggUtils { // DISTINCT column. 
For example, for AVG(DISTINCT value) GROUP BY key, the grouping // expressions will be [key, value]. createAggregate( - groupingExpressions = groupingExpressions ++ namedDistinctExpressions, + groupingExpressions = groupingExpressions ++ normalizedNamedDistinctExpressions, aggregateExpressions = aggregateExpressions, aggregateAttributes = aggregateAttributes, resultExpressions = groupingAttributes ++ distinctAttributes ++ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala new file mode 100644 index 0000000000000..f1e053f7fb2a5 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Final, PartialMerge} +import org.apache.spark.sql.execution.{ExplainUtils, UnaryExecNode} + +/** + * Holds common logic for aggregate operators + */ +trait BaseAggregateExec extends UnaryExecNode { + def groupingExpressions: Seq[NamedExpression] + def aggregateExpressions: Seq[AggregateExpression] + def aggregateAttributes: Seq[Attribute] + def resultExpressions: Seq[NamedExpression] + + override def verboseStringWithOperatorId(): String = { + s""" + |$formattedNodeName + |${ExplainUtils.generateFieldString("Input", child.output)} + |${ExplainUtils.generateFieldString("Keys", groupingExpressions)} + |${ExplainUtils.generateFieldString("Functions", aggregateExpressions)} + |${ExplainUtils.generateFieldString("Aggregate Attributes", aggregateAttributes)} + |${ExplainUtils.generateFieldString("Results", resultExpressions)} + |""".stripMargin + } + + protected def inputAttributes: Seq[Attribute] = { + val modes = aggregateExpressions.map(_.mode).distinct + if (modes.contains(Final) || modes.contains(PartialMerge)) { + // SPARK-31620: when planning aggregates, the partial aggregate uses aggregate function's + // `inputAggBufferAttributes` as its output. And Final and PartialMerge aggregate rely on the + // output to bind references for `DeclarativeAggregate.mergeExpressions`. But if we copy the + // aggregate function somehow after aggregate planning, like `PlanSubqueries`, the + // `DeclarativeAggregate` will be replaced by a new instance with new + // `inputAggBufferAttributes` and `mergeExpressions`. 
Then Final and PartialMerge aggregate + // can't bind the `mergeExpressions` with the output of the partial aggregate, as they use + // the `inputAggBufferAttributes` of the original `DeclarativeAggregate` before copy. Instead, + // we shall use `inputAggBufferAttributes` after copy to match the new `mergeExpressions`. + val aggAttrs = aggregateExpressions + // there're exactly four cases needs `inputAggBufferAttributes` from child according to the + // agg planning in `AggUtils`: Partial -> Final, PartialMerge -> Final, + // Partial -> PartialMerge, PartialMerge -> PartialMerge. + .filter(a => a.mode == Final || a.mode == PartialMerge).map(_.aggregateFunction) + .flatMap(_.inputAggBufferAttributes) + child.output.dropRight(aggAttrs.length) ++ aggAttrs + } else { + child.output + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index f73e214a6b41f..56641bee2111b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -53,7 +53,7 @@ case class HashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with BlockingOperatorWithCodegen with AliasAwareOutputPartitioning { + extends BaseAggregateExec with BlockingOperatorWithCodegen with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -127,7 +127,7 @@ case class HashAggregateExec( resultExpressions, (expressions, inputSchema) => MutableProjection.create(expressions, inputSchema), - child.output, + inputAttributes, iter, testFallbackStartsAt, numOutputRows, @@ -332,7 +332,7 @@ case class HashAggregateExec( private def doConsumeWithoutKeys(ctx: CodegenContext, input: Seq[ExprCode]): String = { // only have DeclarativeAggregate val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) - val inputAttrs = functions.flatMap(_.aggBufferAttributes) ++ child.output + val inputAttrs = functions.flatMap(_.aggBufferAttributes) ++ inputAttributes // To individually generate code for each aggregate function, an element in `updateExprs` holds // all the expressions for the buffer of an aggregation function. val updateExprs = aggregateExpressions.map { e => @@ -367,10 +367,10 @@ case class HashAggregateExec( """.stripMargin } code""" - |// do aggregate for ${aggNames(i)} - |// evaluate aggregate function + |${ctx.registerComment(s"do aggregate for ${aggNames(i)}")} + |${ctx.registerComment("evaluate aggregate function")} |${evaluateVariables(bufferEvalsForOneFunc)} - |// update aggregation buffers + |${ctx.registerComment("update aggregation buffers")} |${updates.mkString("\n").trim} """.stripMargin } @@ -929,7 +929,7 @@ case class HashAggregateExec( } } - val inputAttr = aggregateBufferAttributes ++ child.output + val inputAttr = aggregateBufferAttributes ++ inputAttributes // Here we set `currentVars(0)` to `currentVars(numBufferSlots)` to null, so that when // generating code for buffer columns, we use `INPUT_ROW`(will be the buffer row), while // generating input columns, we use `currentVars`. 
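The inputAttributes helper added above swaps the trailing input-agg-buffer slots of the child's output for the attributes owned by the current (possibly re-copied) aggregate functions. A plain-strings sketch of that trailing swap, using made-up attribute names in place of Catalyst Attributes:

// The child emits its regular output followed by one buffer attribute per partial
// aggregate function; after a plan copy the final aggregate may own fresh buffer
// attributes, so the stale trailing ones are substituted.
def inputAttributesSketch(childOutput: Seq[String], currentBufferAttrs: Seq[String]): Seq[String] =
  childOutput.dropRight(currentBufferAttrs.length) ++ currentBufferAttrs

// e.g. the partial aggregate produced ["key#0", "sum#1"], but the copied final
// aggregate's DeclarativeAggregate now exposes "sum#2":
inputAttributesSketch(Seq("key#0", "sum#1"), Seq("sum#2"))  // Seq("key#0", "sum#2")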
@@ -975,9 +975,9 @@ case class HashAggregateExec( CodeGenerator.updateColumn(unsafeRowBuffer, dt, bufferOffset + j, ev, nullable) } code""" - |// evaluate aggregate function for ${aggNames(i)} + |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")} |${evaluateVariables(rowBufferEvalsForOneFunc)} - |// update unsafe row buffer + |${ctx.registerComment("update unsafe row buffer")} |${updateRowBuffers.mkString("\n").trim} """.stripMargin } @@ -1030,9 +1030,9 @@ case class HashAggregateExec( isVectorized = true) } code""" - |// evaluate aggregate function for ${aggNames(i)} + |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")} |${evaluateVariables(fastRowEvalsForOneFunc)} - |// update fast row + |${ctx.registerComment("update fast row")} |${updateRowBuffer.mkString("\n").trim} """.stripMargin } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala index 4376f6b6edd57..f1c0719ff8948 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala @@ -67,7 +67,7 @@ case class ObjectHashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with AliasAwareOutputPartitioning { + extends BaseAggregateExec with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -123,7 +123,7 @@ case class ObjectHashAggregateExec( resultExpressions, (expressions, inputSchema) => MutableProjection.create(expressions, inputSchema), - child.output, + inputAttributes, iter, fallbackCountThreshold, numOutputRows) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala index b6e684e62ea5c..c23e19d3b6262 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.util.truncatedString -import org.apache.spark.sql.execution.{AliasAwareOutputPartitioning, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.{AliasAwareOutputPartitioning, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** @@ -38,7 +38,7 @@ case class SortAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with AliasAwareOutputPartitioning { + extends BaseAggregateExec with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -86,7 +86,7 @@ case class SortAggregateExec( val outputIter = new SortBasedAggregationIterator( partIndex, groupingExpressions, - child.output, + inputAttributes, iter, aggregateExpressions, aggregateAttributes, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index dfae5c07e0373..44bc9c2e3a9d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -27,6 +27,8 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateMutableProjection, GenerateSafeProjection} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ @@ -458,7 +460,8 @@ case class ScalaUDAF( case class ScalaAggregator[IN, BUF, OUT]( children: Seq[Expression], agg: Aggregator[IN, BUF, OUT], - inputEncoderNR: ExpressionEncoder[IN], + inputEncoder: ExpressionEncoder[IN], + bufferEncoder: ExpressionEncoder[BUF], nullable: Boolean = true, isDeterministic: Boolean = true, mutableAggBufferOffset: Int = 0, @@ -469,10 +472,12 @@ case class ScalaAggregator[IN, BUF, OUT]( with ImplicitCastInputTypes with Logging { - private[this] lazy val inputEncoder = inputEncoderNR.resolveAndBind() - private[this] lazy val bufferEncoder = - agg.bufferEncoder.asInstanceOf[ExpressionEncoder[BUF]].resolveAndBind() + // input and buffer encoders are resolved by ResolveEncodersInScalaAgg + private[this] lazy val inputDeserializer = inputEncoder.createDeserializer() + private[this] lazy val bufferSerializer = bufferEncoder.createSerializer() + private[this] lazy val bufferDeserializer = bufferEncoder.createDeserializer() private[this] lazy val outputEncoder = agg.outputEncoder.asInstanceOf[ExpressionEncoder[OUT]] + private[this] lazy val outputSerializer = outputEncoder.createSerializer() def dataType: DataType = outputEncoder.objSerializer.dataType @@ -491,26 +496,41 @@ case class ScalaAggregator[IN, BUF, OUT]( def createAggregationBuffer(): BUF = agg.zero def update(buffer: BUF, input: InternalRow): BUF = - agg.reduce(buffer, inputEncoder.fromRow(inputProjection(input))) + agg.reduce(buffer, inputDeserializer(inputProjection(input))) def merge(buffer: BUF, input: BUF): BUF = agg.merge(buffer, input) def eval(buffer: BUF): Any = { - val row = outputEncoder.toRow(agg.finish(buffer)) + val row = outputSerializer(agg.finish(buffer)) if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType) } private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) def serialize(agg: BUF): Array[Byte] = - bufferEncoder.toRow(agg).asInstanceOf[UnsafeRow].getBytes() + bufferSerializer(agg).asInstanceOf[UnsafeRow].getBytes() def deserialize(storageFormat: Array[Byte]): BUF = { bufferRow.pointTo(storageFormat, storageFormat.length) - bufferEncoder.fromRow(bufferRow) + bufferDeserializer(bufferRow) } override def toString: String = s"""${nodeName}(${children.mkString(",")})""" override def nodeName: String = agg.getClass.getSimpleName } + +/** + * An extension rule to resolve encoder expressions from a [[ScalaAggregator]] + */ +object ResolveEncodersInScalaAgg extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { + case p if !p.resolved => p + case p => 
p.transformExpressionsUp { + case agg: ScalaAggregator[_, _, _] => + agg.copy( + inputEncoder = agg.inputEncoder.resolveAndBind(), + bufferEncoder = agg.bufferEncoder.resolveAndBind()) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala index 614d6c2846bfa..136f7c47f5341 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala @@ -76,6 +76,8 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] { // We always remove the special metadata from `AttributeReference` at the end of this rule, so // Dataset column reference only exists in the root node via Dataset transformations like // `Dataset#select`. + if (plan.find(_.isInstanceOf[Join]).isEmpty) return stripColumnReferenceMetadataInPlan(plan) + val colRefAttrs = plan.expressions.flatMap(_.collect { case a: AttributeReference if isColumnReference(a) => a }) @@ -153,6 +155,10 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] { } } + stripColumnReferenceMetadataInPlan(plan) + } + + private def stripColumnReferenceMetadataInPlan(plan: LogicalPlan): LogicalPlan = { plan.transformExpressions { case a: AttributeReference if isColumnReference(a) => // Remove the special metadata from this `AttributeReference`, as the detection is done. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index c35c48496e1c9..036b02953a430 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.execution +import java.util.concurrent.{Future => JFuture} import java.util.concurrent.TimeUnit._ import scala.collection.mutable -import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.{ExecutionContext} import scala.concurrent.duration.Duration import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} @@ -85,10 +86,10 @@ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) override def verboseStringWithOperatorId(): String = { s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Output : ${projectList.mkString("[", ", ", "]")} - |Input : ${child.output.mkString("[", ", ", "]")} - """.stripMargin + |$formattedNodeName + |${ExplainUtils.generateFieldString("Output", projectList)} + |${ExplainUtils.generateFieldString("Input", child.output)} + |""".stripMargin } } @@ -242,10 +243,10 @@ case class FilterExec(condition: Expression, child: SparkPlan) override def verboseStringWithOperatorId(): String = { s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Input : ${child.output.mkString("[", ", ", "]")} + |$formattedNodeName + |${ExplainUtils.generateFieldString("Input", child.output)} |Condition : ${condition} - """.stripMargin + |""".stripMargin } } @@ -746,7 +747,7 @@ case class SubqueryExec(name: String, child: SparkPlan) "collectTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to collect")) @transient - private lazy val relationFuture: Future[Array[InternalRow]] = { + 
private lazy val relationFuture: JFuture[Array[InternalRow]] = { // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here. val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLExecution.withThreadLocalCaptured[Array[InternalRow]]( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala index 00a1d54b41709..3cc59af9b7ce3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala @@ -318,7 +318,8 @@ private[columnar] case object RunLengthEncoding extends CompressionScheme { var valueCountLocal = 0 var currentValueLocal: Long = 0 - while (valueCountLocal < runLocal || (pos < capacity)) { + while (pos < capacity) { + assert(valueCountLocal <= runLocal) if (pos != nextNullIndex) { if (valueCountLocal == runLocal) { currentValueLocal = getFunction(buffer) @@ -616,7 +617,6 @@ private[columnar] case object BooleanBitSet extends CompressionScheme { override def hasNext: Boolean = visited < count override def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = { - val countLocal = count var currentWordLocal: Long = 0 var visitedLocal: Int = 0 val nullsBuffer = buffer.duplicate().order(ByteOrder.nativeOrder()) @@ -626,7 +626,7 @@ private[columnar] case object BooleanBitSet extends CompressionScheme { var pos = 0 var seenNulls = 0 - while (visitedLocal < countLocal) { + while (pos < capacity) { if (pos != nextNullIndex) { val bit = visitedLocal % BITS_PER_LONG diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala new file mode 100644 index 0000000000000..dedace4af4d14 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.util.SchemaUtils + +/** + * Checks legitimization of various execution commands. 
+ */ +case class CommandCheck(conf: SQLConf) extends (LogicalPlan => Unit) { + + override def apply(plan: LogicalPlan): Unit = { + plan.foreach { + case AnalyzeColumnCommand(_, colsOpt, allColumns) if !allColumns => + colsOpt.foreach(SchemaUtils.checkColumnNameDuplication( + _, "in analyze columns.", conf.caseSensitiveAnalysis)) + + case _ => + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index b229b238238fd..34e0ac954010e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -385,4 +385,12 @@ object CommandUtils extends Logging { private def isDataPath(path: Path, stagingDir: String): Boolean = { !path.getName.startsWith(stagingDir) && DataSourceUtils.isDataPath(path) } + + def uncacheTableOrView(sparkSession: SparkSession, name: String): Unit = { + try { + sparkSession.catalog.uncacheTable(name) + } catch { + case NonFatal(e) => logWarning(s"Exception when attempting to uncache $name", e) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index 39b08e2894dcd..3dc1d52697714 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -115,14 +115,19 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm case Some(("-v", None)) => val runFunc = (sparkSession: SparkSession) => { sparkSession.sessionState.conf.getAllDefinedConfs.sorted.map { - case (key, defaultValue, doc) => - Row(key, Option(defaultValue).getOrElse(""), doc) + case (key, defaultValue, doc, version) => + Row( + key, + Option(defaultValue).getOrElse(""), + doc, + Option(version).getOrElse("")) } } val schema = StructType( StructField("key", StringType, nullable = false) :: StructField("value", StringType, nullable = false) :: - StructField("meaning", StringType, nullable = false) :: Nil) + StructField("meaning", StringType, nullable = false) :: + StructField("Since version", StringType, nullable = false) :: Nil) (schema.toAttributes, runFunc) // Queries the deprecated "mapred.reduce.tasks" property. @@ -157,7 +162,8 @@ object SetCommand { } /** - * This command is for resetting SQLConf to the default values. Command that runs + * This command is for resetting SQLConf to the default values. Any configurations that were set + * via [[SetCommand]] will get reset to default value. 
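The CommandCheck rule above only needs to reject duplicate column names in ANALYZE TABLE ... FOR COLUMNS, honouring the session's case sensitivity. A simplified stand-in for the SchemaUtils check it delegates to; the function name, exception type and message here are illustrative:

def checkNoDuplicateAnalyzeColumns(cols: Seq[String], caseSensitive: Boolean): Unit = {
  val normalized = if (caseSensitive) cols else cols.map(_.toLowerCase)
  // diff against distinct leaves exactly the names that occur more than once
  val dups = normalized.diff(normalized.distinct).distinct
  if (dups.nonEmpty) {
    throw new IllegalArgumentException(
      s"Found duplicate column(s) in analyze columns: ${dups.mkString(", ")}")
  }
}

checkNoDuplicateAnalyzeColumns(Seq("id", "name"), caseSensitive = false)   // passes
// checkNoDuplicateAnalyzeColumns(Seq("id", "ID"), caseSensitive = false)  // throws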
Command that runs * {{{ * reset; * }}} @@ -165,7 +171,11 @@ object SetCommand { case object ResetCommand extends RunnableCommand with IgnoreCachedData { override def run(sparkSession: SparkSession): Seq[Row] = { - sparkSession.sessionState.conf.clear() + val conf = sparkSession.sessionState.conf + conf.clear() + sparkSession.sparkContext.conf.getAll.foreach { case (k, v) => + conf.setConfString(k, v) + } Seq.empty[Row] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index f41c4eca203af..588f52d2e69b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -484,17 +484,18 @@ case class AlterTableAddPartitionCommand( catalog.createPartitions(table.identifier, batch, ignoreIfExists = ifNotExists) } - if (table.stats.nonEmpty) { - if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) { - val addedSize = CommandUtils.calculateTotalLocationSize(sparkSession, table.identifier, - parts.map(_.storage.locationUri)) - if (addedSize > 0) { - val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize) - catalog.alterTableStats(table.identifier, Some(newStats)) - } - } else { - catalog.alterTableStats(table.identifier, None) + sparkSession.catalog.refreshTable(table.identifier.quotedString) + if (table.stats.nonEmpty && sparkSession.sessionState.conf.autoSizeUpdateEnabled) { + // Updating table stats only if new partition is not empty + val addedSize = CommandUtils.calculateTotalLocationSize(sparkSession, table.identifier, + parts.map(_.storage.locationUri)) + if (addedSize > 0) { + val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize) + catalog.alterTableStats(table.identifier, Some(newStats)) } + } else { + // Re-calculating of table size including all partitions + CommandUtils.updateTableStats(sparkSession, table) } Seq.empty[Row] } @@ -535,6 +536,7 @@ case class AlterTableRenamePartitionCommand( catalog.renamePartitions( tableName, Seq(normalizedOldPartition), Seq(normalizedNewPartition)) + sparkSession.catalog.refreshTable(table.identifier.quotedString) Seq.empty[Row] } @@ -580,6 +582,7 @@ case class AlterTableDropPartitionCommand( table.identifier, normalizedSpecs, ignoreIfNotExists = ifExists, purge = purge, retainData = retainData) + sparkSession.catalog.refreshTable(table.identifier.quotedString) CommandUtils.updateTableStats(sparkSession, table) Seq.empty[Row] @@ -672,7 +675,7 @@ case class AlterTableRecoverPartitionsCommand( // This is always the case for Hive format tables, but is not true for Datasource tables created // before Spark 2.1 unless they are converted via `msck repair table`. 
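The ResetCommand change above no longer wipes every setting: after clearing the session-level overrides it copies the SparkContext's conf entries back in, so values supplied at submit time survive a RESET. A map-based sketch of that behaviour, with mutable and immutable maps standing in for SQLConf and SparkConf:

import scala.collection.mutable

def resetSketch(sessionConf: mutable.Map[String, String], sparkConf: Map[String, String]): Unit = {
  sessionConf.clear()                                     // drop everything set in the session
  sparkConf.foreach { case (k, v) => sessionConf(k) = v } // restore submit-time values
}

val sparkConf = Map("spark.sql.shuffle.partitions" -> "64")      // e.g. passed via --conf
val sessionConf = mutable.Map(
  "spark.sql.shuffle.partitions" -> "5",                         // overridden by SET
  "spark.sql.cbo.enabled" -> "true")                             // added by SET
resetSketch(sessionConf, sparkConf)
// sessionConf now holds only spark.sql.shuffle.partitions -> 64: the SET override is
// gone, but the submit-time value is restored rather than the hard-coded default.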
spark.sessionState.catalog.alterTable(table.copy(tracksPartitionsInCatalog = true)) - catalog.refreshTable(tableName) + spark.catalog.refreshTable(tableIdentWithDB) logInfo(s"Recovered all partitions ($total).") Seq.empty[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index 6fdc7f4a58195..d55d6967a0291 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -88,7 +88,9 @@ case class CreateFunctionCommand( } else { // For a permanent, we will store the metadata into underlying external catalog. // This function will be loaded into the FunctionRegistry when a query uses it. - // We do not load it into FunctionRegistry right now. + // We do not load it into FunctionRegistry right now, to avoid loading the resource and + // UDF class immediately, as the Spark application to create the function may not have + // access to the resource and/or UDF class. catalog.createFunction(func, ignoreIfExists) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala index 1119e5cb1d288..549477dbae6ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala @@ -47,7 +47,7 @@ case class AddJarCommand(path: String) extends RunnableCommand { */ case class AddFileCommand(path: String) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - val recursive = sparkSession.sessionState.conf.addDirectoryRecursiveEnabled + val recursive = !sparkSession.sessionState.conf.addSingleFileInAddFile sparkSession.sparkContext.addFile(path, recursive) Seq.empty[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 468ca505cce1f..42f4a7245031c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -19,12 +19,12 @@ package org.apache.spark.sql.execution.command import java.net.{URI, URISyntaxException} +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer -import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileContext, FsConstants, Path} -import org.apache.hadoop.fs.permission.{AclEntry, FsPermission} +import org.apache.hadoop.fs.permission.{AclEntry, AclEntryScope, AclEntryType, FsAction, FsPermission} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -123,7 +123,8 @@ case class CreateTableLikeCommand( provider = newProvider, partitionColumnNames = sourceTableDesc.partitionColumnNames, bucketSpec = sourceTableDesc.bucketSpec, - properties = properties) + properties = properties, + tracksPartitionsInCatalog = sourceTableDesc.tracksPartitionsInCatalog) catalog.createTable(newTableDesc, ifNotExists) Seq.empty[Row] @@ -190,22 +191,19 @@ case class AlterTableRenameCommand( } else { val table = catalog.getTableMetadata(oldName) DDLUtils.verifyAlterTableType(catalog, table, isView) - // If an exception is thrown here we can just assume the 
table is uncached; - // this can happen with Hive tables when the underlying catalog is in-memory. - val wasCached = Try(sparkSession.catalog.isCached(oldName.unquotedString)).getOrElse(false) - if (wasCached) { - try { - sparkSession.catalog.uncacheTable(oldName.unquotedString) - } catch { - case NonFatal(e) => log.warn(e.toString, e) - } + // If `optStorageLevel` is defined, the old table was cached. + val optCachedData = sparkSession.sharedState.cacheManager.lookupCachedData( + sparkSession.table(oldName.unquotedString)) + val optStorageLevel = optCachedData.map(_.cachedRepresentation.cacheBuilder.storageLevel) + if (optStorageLevel.isDefined) { + CommandUtils.uncacheTableOrView(sparkSession, oldName.unquotedString) } // Invalidate the table last, otherwise uncaching the table would load the logical plan // back into the hive metastore cache catalog.refreshTable(oldName) catalog.renameTable(oldName, newName) - if (wasCached) { - sparkSession.catalog.cacheTable(newName.unquotedString) + optStorageLevel.foreach { storageLevel => + sparkSession.catalog.cacheTable(newName.unquotedString, storageLevel) } } Seq.empty[Row] @@ -228,12 +226,7 @@ case class AlterTableAddColumnsCommand( val catalog = sparkSession.sessionState.catalog val catalogTable = verifyAlterTableAddColumn(sparkSession.sessionState.conf, catalog, table) - try { - sparkSession.catalog.uncacheTable(table.quotedString) - } catch { - case NonFatal(e) => - log.warn(s"Exception when attempting to uncache table ${table.quotedString}", e) - } + CommandUtils.uncacheTableOrView(sparkSession, table.quotedString) catalog.refreshTable(table) SchemaUtils.checkColumnNameDuplication( @@ -506,8 +499,8 @@ case class TruncateTableCommand( var optPermission: Option[FsPermission] = None var optAcls: Option[java.util.List[AclEntry]] = None if (!ignorePermissionAcl) { - val fileStatus = fs.getFileStatus(path) try { + val fileStatus = fs.getFileStatus(path) optPermission = Some(fileStatus.getPermission()) } catch { case NonFatal(_) => // do nothing @@ -538,12 +531,27 @@ case class TruncateTableCommand( } } optAcls.foreach { acls => + val aclEntries = acls.asScala.filter(_.getName != null).asJava + + // If the path doesn't have default ACLs, `setAcl` API will throw an error + // as it expects user/group/other permissions must be in ACL entries. + // So we need to add tradition user/group/other permission + // in the form of ACL. + optPermission.map { permission => + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.USER, permission.getUserAction())) + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.GROUP, permission.getGroupAction())) + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.OTHER, permission.getOtherAction())) + } + try { - fs.setAcl(path, acls) + fs.setAcl(path, aclEntries) } catch { case NonFatal(e) => throw new SecurityException( - s"Failed to set original ACL $acls back to " + + s"Failed to set original ACL $aclEntries back to " + s"the created path: $path. 
Exception: ${e.getMessage}") } } @@ -574,6 +582,16 @@ case class TruncateTableCommand( } Seq.empty[Row] } + + private def newAclEntry( + scope: AclEntryScope, + aclType: AclEntryType, + permission: FsAction): AclEntry = { + new AclEntry.Builder() + .setScope(scope) + .setType(aclType) + .setPermission(permission).build() + } } abstract class DescribeCommandBase extends RunnableCommand { @@ -866,12 +884,20 @@ case class ShowTablesCommand( // // Note: tableIdentifierPattern should be non-empty, otherwise a [[ParseException]] // should have been thrown by the sql parser. - val tableIdent = TableIdentifier(tableIdentifierPattern.get, Some(db)) - val table = catalog.getTableMetadata(tableIdent).identifier - val partition = catalog.getPartition(tableIdent, partitionSpec.get) - val database = table.database.getOrElse("") - val tableName = table.table - val isTemp = catalog.isTemporaryTable(table) + val table = catalog.getTableMetadata(TableIdentifier(tableIdentifierPattern.get, Some(db))) + + DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "SHOW TABLE EXTENDED") + + val tableIdent = table.identifier + val normalizedSpec = PartitioningUtils.normalizePartitionSpec( + partitionSpec.get, + table.partitionColumnNames, + tableIdent.quotedString, + sparkSession.sessionState.conf.resolver) + val partition = catalog.getPartition(tableIdent, normalizedSpec) + val database = tableIdent.database.getOrElse("") + val tableName = tableIdent.table + val isTemp = catalog.isTemporaryTable(tableIdent) val information = partition.simpleString Seq(Row(database, tableName, isTemp, s"$information\n")) } @@ -900,15 +926,20 @@ case class ShowTablePropertiesCommand(table: TableIdentifier, propertyKey: Optio } override def run(sparkSession: SparkSession): Seq[Row] = { - val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(table) - propertyKey match { - case Some(p) => - val propValue = catalogTable - .properties - .getOrElse(p, s"Table ${catalogTable.qualifiedName} does not have property: $p") - Seq(Row(propValue)) - case None => - catalogTable.properties.map(p => Row(p._1, p._2)).toSeq + val catalog = sparkSession.sessionState.catalog + if (catalog.isTemporaryTable(table)) { + Seq.empty[Row] + } else { + val catalogTable = catalog.getTableMetadata(table) + propertyKey match { + case Some(p) => + val propValue = catalogTable + .properties + .getOrElse(p, s"Table ${catalogTable.qualifiedName} does not have property: $p") + Seq(Row(propValue)) + case None => + catalogTable.properties.map(p => Row(p._1, p._2)).toSeq + } } } } @@ -984,20 +1015,18 @@ case class ShowPartitionsCommand( DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "SHOW PARTITIONS") /** - * Validate the partitioning spec by making sure all the referenced columns are + * Normalizes the partition spec w.r.t the partition columns and case sensitivity settings, + * and validates the spec by making sure all the referenced columns are * defined as partitioning columns in table definition. An AnalysisException exception is * thrown if the partitioning spec is invalid. 
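Both the SHOW TABLE EXTENDED and SHOW PARTITIONS changes in this file now route the user-supplied spec through PartitioningUtils.normalizePartitionSpec. A standalone sketch of what that normalization does when the resolver is case-insensitive; the exception type and message are illustrative, and the real helper also checks for duplicate keys:

type Resolver = (String, String) => Boolean
val caseInsensitiveResolution: Resolver = (a, b) => a.equalsIgnoreCase(b)

// Map each user-supplied key onto the table's declared partition column (fixing its
// case) and reject keys that are not partition columns at all.
def normalizePartitionSpecSketch(
    spec: Map[String, String],
    partitionColumns: Seq[String],
    resolver: Resolver): Map[String, String] =
  spec.map { case (key, value) =>
    val normalizedKey = partitionColumns.find(resolver(_, key)).getOrElse {
      throw new IllegalArgumentException(
        s"$key is not a partition column. Partition columns: ${partitionColumns.mkString(", ")}")
    }
    normalizedKey -> value
  }

normalizePartitionSpecSketch(Map("DS" -> "2020-01-01"), Seq("ds", "hr"), caseInsensitiveResolution)
// returns Map("ds" -> "2020-01-01")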
*/ - if (spec.isDefined) { - val badColumns = spec.get.keySet.filterNot(table.partitionColumnNames.contains) - if (badColumns.nonEmpty) { - val badCols = badColumns.mkString("[", ", ", "]") - throw new AnalysisException( - s"Non-partitioning column(s) $badCols are specified for SHOW PARTITIONS") - } - } + val normalizedSpec = spec.map(partitionSpec => PartitioningUtils.normalizePartitionSpec( + partitionSpec, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver)) - val partNames = catalog.listPartitionNames(tableName, spec) + val partNames = catalog.listPartitionNames(tableName, normalizedSpec) partNames.map(Row(_)) } } @@ -1039,6 +1068,42 @@ trait ShowCreateTableCommandBase { protected def concatByMultiLines(iter: Iterable[String]): String = { iter.mkString("(\n ", ",\n ", ")\n") } + + protected def showCreateView(metadata: CatalogTable, builder: StringBuilder): Unit = { + showViewDataColumns(metadata, builder) + showTableComment(metadata, builder) + showViewProperties(metadata, builder) + showViewText(metadata, builder) + } + + private def showViewDataColumns(metadata: CatalogTable, builder: StringBuilder): Unit = { + if (metadata.schema.nonEmpty) { + val viewColumns = metadata.schema.map { f => + val comment = f.getComment() + .map(escapeSingleQuotedString) + .map(" COMMENT '" + _ + "'") + + // view columns shouldn't have data type info + s"${quoteIdentifier(f.name)}${comment.getOrElse("")}" + } + builder ++= concatByMultiLines(viewColumns) + } + } + + private def showViewProperties(metadata: CatalogTable, builder: StringBuilder): Unit = { + val viewProps = metadata.properties.filterKeys(!_.startsWith(CatalogTable.VIEW_PREFIX)) + if (viewProps.nonEmpty) { + val props = viewProps.map { case (key, value) => + s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" + } + + builder ++= s"TBLPROPERTIES ${concatByMultiLines(props)}" + } + } + + private def showViewText(metadata: CatalogTable, builder: StringBuilder): Unit = { + builder ++= metadata.viewText.mkString("AS ", "", "\n") + } } /** @@ -1076,23 +1141,39 @@ case class ShowCreateTableCommand(table: TableIdentifier) "Failed to execute SHOW CREATE TABLE against table " + s"${tableMetadata.identifier}, which is created by Hive and uses the " + "following unsupported feature(s)\n" + - tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n") + tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n") + ". " + + s"Please use `SHOW CREATE TABLE ${tableMetadata.identifier} AS SERDE` " + + "to show Hive DDL instead." ) } - if (tableMetadata.tableType == VIEW) { - throw new AnalysisException("Hive view isn't supported by SHOW CREATE TABLE") - } - if ("true".equalsIgnoreCase(tableMetadata.properties.getOrElse("transactional", "false"))) { throw new AnalysisException( - "SHOW CREATE TABLE doesn't support transactional Hive table") + "SHOW CREATE TABLE doesn't support transactional Hive table. 
" + + s"Please use `SHOW CREATE TABLE ${tableMetadata.identifier} AS SERDE` " + + "to show Hive DDL instead.") } - convertTableMetadata(tableMetadata) + if (tableMetadata.tableType == VIEW) { + tableMetadata + } else { + convertTableMetadata(tableMetadata) + } } - val stmt = showCreateDataSourceTable(metadata) + val builder = StringBuilder.newBuilder + + val stmt = if (tableMetadata.tableType == VIEW) { + builder ++= s"CREATE VIEW ${table.quotedString} " + showCreateView(metadata, builder) + + builder.toString() + } else { + builder ++= s"CREATE TABLE ${table.quotedString} " + + showCreateDataSourceTable(metadata, builder) + builder.toString() + } Seq(Row(stmt)) } @@ -1172,18 +1253,13 @@ case class ShowCreateTableCommand(table: TableIdentifier) } } - private def showCreateDataSourceTable(metadata: CatalogTable): String = { - val builder = StringBuilder.newBuilder - - builder ++= s"CREATE TABLE ${table.quotedString} " + private def showCreateDataSourceTable(metadata: CatalogTable, builder: StringBuilder): Unit = { showDataSourceTableDataColumns(metadata, builder) showDataSourceTableOptions(metadata, builder) showDataSourceTableNonDataColumns(metadata, builder) showTableComment(metadata, builder) showTableLocation(metadata, builder) showTableProperties(metadata, builder) - - builder.toString() } } @@ -1242,10 +1318,7 @@ case class ShowCreateTableAsSerdeCommand(table: TableIdentifier) builder ++= s"CREATE$tableTypeString ${table.quotedString}" if (metadata.tableType == VIEW) { - showViewDataColumns(metadata, builder) - showTableComment(metadata, builder) - showViewProperties(metadata, builder) - showViewText(metadata, builder) + showCreateView(metadata, builder) } else { showHiveTableHeader(metadata, builder) showTableComment(metadata, builder) @@ -1258,35 +1331,6 @@ case class ShowCreateTableAsSerdeCommand(table: TableIdentifier) builder.toString() } - private def showViewDataColumns(metadata: CatalogTable, builder: StringBuilder): Unit = { - if (metadata.schema.nonEmpty) { - val viewColumns = metadata.schema.map { f => - val comment = f.getComment() - .map(escapeSingleQuotedString) - .map(" COMMENT '" + _ + "'") - - // view columns shouldn't have data type info - s"${quoteIdentifier(f.name)}${comment.getOrElse("")}" - } - builder ++= concatByMultiLines(viewColumns) - } - } - - private def showViewProperties(metadata: CatalogTable, builder: StringBuilder): Unit = { - val viewProps = metadata.properties.filterKeys(!_.startsWith(CatalogTable.VIEW_PREFIX)) - if (viewProps.nonEmpty) { - val props = viewProps.map { case (key, value) => - s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" - } - - builder ++= s"TBLPROPERTIES ${concatByMultiLines(props)}" - } - } - - private def showViewText(metadata: CatalogTable, builder: StringBuilder): Unit = { - builder ++= metadata.viewText.mkString("AS ", "", "\n") - } - private def showHiveTableHeader(metadata: CatalogTable, builder: StringBuilder): Unit = { val columns = metadata.schema.filterNot { column => metadata.partitionColumnNames.contains(column.name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 38481dda428a5..0ba76ee94a860 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -23,10 +23,12 @@ import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import 
org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, UnresolvedFunction, UnresolvedRelation, ViewType} import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, SessionCatalog} -import org.apache.spark.sql.catalyst.expressions.{Alias, SubqueryExpression} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View} -import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper +import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.types.{BooleanType, MetadataBuilder, StringType} import org.apache.spark.sql.util.SchemaUtils /** @@ -108,9 +110,21 @@ case class CreateViewCommand( verifyTemporaryObjectsNotExists(catalog) if (viewType == LocalTempView) { + if (replace && catalog.getTempView(name.table).isDefined && + !catalog.getTempView(name.table).get.sameResult(child)) { + logInfo(s"Try to uncache ${name.quotedString} before replacing.") + CommandUtils.uncacheTableOrView(sparkSession, name.quotedString) + } val aliasedPlan = aliasPlan(sparkSession, analyzedPlan) catalog.createTempView(name.table, aliasedPlan, overrideIfExists = replace) } else if (viewType == GlobalTempView) { + if (replace && catalog.getGlobalTempView(name.table).isDefined && + !catalog.getGlobalTempView(name.table).get.sameResult(child)) { + val db = sparkSession.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + val globalTempView = TableIdentifier(name.table, Option(db)) + logInfo(s"Try to uncache ${globalTempView.quotedString} before replacing.") + CommandUtils.uncacheTableOrView(sparkSession, globalTempView.quotedString) + } val aliasedPlan = aliasPlan(sparkSession, analyzedPlan) catalog.createGlobalTempView(name.table, aliasedPlan, overrideIfExists = replace) } else if (catalog.tableExists(name)) { @@ -125,6 +139,10 @@ case class CreateViewCommand( val viewIdent = tableMetadata.identifier checkCyclicViewReference(analyzedPlan, Seq(viewIdent), viewIdent) + // uncache the cached data before replacing an existing view + logDebug(s"Try to uncache ${viewIdent.quotedString} before replacing.") + CommandUtils.uncacheTableOrView(sparkSession, viewIdent.quotedString) + // Handles `CREATE OR REPLACE VIEW v0 AS SELECT ...` // Nothing we need to retain from the old view, so just drop and create a new one catalog.dropTable(viewIdent, ignoreIfNotExists = false, purge = false) @@ -280,6 +298,40 @@ case class AlterViewAsCommand( } } +/** + * A command for users to get views in the given database. + * If a databaseName is not given, the current database will be used. + * The syntax of using this command in SQL is: + * {{{ + * SHOW VIEWS [(IN|FROM) database_name] [[LIKE] 'identifier_with_wildcards']; + * }}} + */ +case class ShowViewsCommand( + databaseName: String, + tableIdentifierPattern: Option[String]) extends RunnableCommand { + + // The result of SHOW VIEWS has three basic columns: namespace, viewName and isTemporary.
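With this patch applied, the new command can be exercised straight from SQL. A hypothetical session (it assumes a SparkSession named spark on a build that contains this change, and the view names are made up):

spark.sql("CREATE TEMPORARY VIEW sales_tmp AS SELECT 1 AS id")
spark.sql("CREATE VIEW default.sales_v AS SELECT 2 AS id")
spark.sql("SHOW VIEWS IN default LIKE 'sales*'").show()
// Each row carries the three columns declared just below: namespace, viewName and
// isTemporary; temporary views are flagged with isTemporary = true.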
+ override val output: Seq[Attribute] = Seq( + AttributeReference("namespace", StringType, nullable = false)(), + AttributeReference("viewName", StringType, nullable = false)(), + AttributeReference("isTemporary", BooleanType, nullable = false)()) + + override def run(sparkSession: SparkSession): Seq[Row] = { + val catalog = sparkSession.sessionState.catalog + + // Show the information of views. + val views = tableIdentifierPattern.map(catalog.listViews(databaseName, _)) + .getOrElse(catalog.listViews(databaseName, "*")) + views.map { tableIdent => + val namespace = tableIdent.database.toArray.quoted + val tableName = tableIdent.table + val isTemp = catalog.isTemporaryTable(tableIdent) + + Row(namespace, tableName, isTemp) + } + } +} + object ViewHelper { import CatalogTable._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 3615afcf86c7a..54faadf73b25f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -67,8 +67,9 @@ import org.apache.spark.util.Utils * metadata. For example, when reading a partitioned table from a file system, partition columns * will be inferred from the directory layout even if they are not specified. * - * @param paths A list of file system paths that hold data. These will be globbed before and - * qualified. This option only works when reading from a [[FileFormat]]. + * @param paths A list of file system paths that hold data. These will be globbed before if + * the "__globPaths__" option is true, and will be qualified. This option only works + * when reading from a [[FileFormat]]. * @param userSpecifiedSchema An optional specification of the schema of the data. When present * we skip attempting to infer the schema. * @param partitionColumns A list of column names that the relation is partitioned by. This list is @@ -109,10 +110,22 @@ case class DataSource( private def providingInstance() = providingClass.getConstructor().newInstance() + private def newHadoopConfiguration(): Configuration = + sparkSession.sessionState.newHadoopConfWithOptions(options) + lazy val sourceInfo: SourceInfo = sourceSchema() private val caseInsensitiveOptions = CaseInsensitiveMap(options) private val equality = sparkSession.sessionState.conf.resolver + /** + * Whether or not paths should be globbed before being used to access files. + */ + def globPaths: Boolean = { + options.get(DataSource.GLOB_PATHS_KEY) + .map(_ == "true") + .getOrElse(true) + } + bucketSpec.map { bucket => SchemaUtils.checkColumnNameDuplication( bucket.bucketColumnNames, "in the bucket definition", equality) @@ -187,9 +200,11 @@ case class DataSource( val dataSchema = userSpecifiedSchema.map { schema => StructType(schema.filterNot(f => partitionSchema.exists(p => equality(p.name, f.name)))) }.orElse { + // Remove "path" option so that it is not added to the paths returned by + // `tempFileIndex.allFiles()`. format.inferSchema( sparkSession, - caseInsensitiveOptions, + caseInsensitiveOptions - "path", tempFileIndex.allFiles()) }.getOrElse { throw new AnalysisException( @@ -229,8 +244,8 @@ case class DataSource( // For glob pattern, we do not check it because the glob pattern might only make sense // once the streaming job starts and some upstream source starts dropping data. 
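For the globbing switch introduced in this file, the following standalone sketch shows how a path is either expanded or taken literally depending on the flag. It uses the Hadoop FileSystem API directly and is simplified: the real code also handles the empty-glob and file-existence checks, and the function name here is illustrative.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

// When enableGlobbing is false the qualified path is returned untouched, which is
// what setting the "__globPaths__" option to "false" ultimately requests.
def resolveDataPaths(
    paths: Seq[String],
    hadoopConf: Configuration,
    enableGlobbing: Boolean): Seq[Path] =
  paths.flatMap { p =>
    val hdfsPath = new Path(p)
    val fs = hdfsPath.getFileSystem(hadoopConf)
    val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
    if (enableGlobbing) {
      Option(fs.globStatus(qualified)).toSeq.flatten.map(_.getPath)
    } else {
      qualified :: Nil
    }
  }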
val hdfsPath = new Path(path) - if (!SparkHadoopUtil.get.isGlobPath(hdfsPath)) { - val fs = hdfsPath.getFileSystem(sparkSession.sessionState.newHadoopConf()) + if (!globPaths || !SparkHadoopUtil.get.isGlobPath(hdfsPath)) { + val fs = hdfsPath.getFileSystem(newHadoopConfiguration()) if (!fs.exists(hdfsPath)) { throw new AnalysisException(s"Path does not exist: $path") } @@ -357,15 +372,17 @@ case class DataSource( case (format: FileFormat, _) if FileStreamSink.hasMetadata( caseInsensitiveOptions.get("path").toSeq ++ paths, - sparkSession.sessionState.newHadoopConf(), + newHadoopConfiguration(), sparkSession.sessionState.conf) => val basePath = new Path((caseInsensitiveOptions.get("path").toSeq ++ paths).head) val fileCatalog = new MetadataLogFileIndex(sparkSession, basePath, caseInsensitiveOptions, userSpecifiedSchema) val dataSchema = userSpecifiedSchema.orElse { + // Remove "path" option so that it is not added to the paths returned by + // `fileCatalog.allFiles()`. format.inferSchema( sparkSession, - caseInsensitiveOptions, + caseInsensitiveOptions - "path", fileCatalog.allFiles()) }.getOrElse { throw new AnalysisException( @@ -449,7 +466,7 @@ case class DataSource( val allPaths = paths ++ caseInsensitiveOptions.get("path") val outputPath = if (allPaths.length == 1) { val path = new Path(allPaths.head) - val fs = path.getFileSystem(sparkSession.sessionState.newHadoopConf()) + val fs = path.getFileSystem(newHadoopConfiguration()) path.makeQualified(fs.getUri, fs.getWorkingDirectory) } else { throw new IllegalArgumentException("Expected exactly one path to be specified, but " + @@ -569,10 +586,8 @@ case class DataSource( checkEmptyGlobPath: Boolean, checkFilesExist: Boolean): Seq[Path] = { val allPaths = caseInsensitiveOptions.get("path") ++ paths - val hadoopConf = sparkSession.sessionState.newHadoopConf() - - DataSource.checkAndGlobPathIfNecessary(allPaths.toSeq, hadoopConf, - checkEmptyGlobPath, checkFilesExist) + DataSource.checkAndGlobPathIfNecessary(allPaths.toSeq, newHadoopConfiguration(), + checkEmptyGlobPath, checkFilesExist, enableGlobbing = globPaths) } } @@ -735,6 +750,11 @@ object DataSource extends Logging { } } + /** + * The key in the "options" map for deciding whether or not to glob paths before use. + */ + val GLOB_PATHS_KEY = "__globPaths__" + /** * Checks and returns files in all the paths. 
*/ @@ -742,12 +762,17 @@ object DataSource extends Logging { paths: Seq[String], hadoopConf: Configuration, checkEmptyGlobPath: Boolean, - checkFilesExist: Boolean): Seq[Path] = { + checkFilesExist: Boolean, + enableGlobbing: Boolean): Seq[Path] = { val allGlobPath = paths.flatMap { path => val hdfsPath = new Path(path) val fs = hdfsPath.getFileSystem(hadoopConf) val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - val globPath = SparkHadoopUtil.get.globPathIfNecessary(fs, qualified) + val globPath = if (enableGlobbing) { + SparkHadoopUtil.get.globPathIfNecessary(fs, qualified) + } else { + qualified :: Nil + } if (checkEmptyGlobPath && globPath.isEmpty) { throw new AnalysisException(s"Path does not exist: $qualified") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index e3a0a0a6c34e5..23454d7d5e7f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -104,7 +105,17 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast None } else if (potentialSpecs.size == 1) { val partValue = potentialSpecs.head._2 - Some(Alias(cast(Literal(partValue), field.dataType), field.name)()) + conf.storeAssignmentPolicy match { + // SPARK-30844: try our best to follow StoreAssignmentPolicy for static partition + // values but not completely follow because we can't do static type checking due to + // the reason that the parser has erased the type info of static partition values + // and converted them to string. 
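The behavioural difference encoded by the StoreAssignmentPolicy branch right after this comment can be shown without Spark: a legacy-style cast of a malformed static partition value silently yields null, while an ANSI-style cast fails fast. A toy sketch for an INT partition column; the function names and exception message are illustrative:

// Legacy behaviour: malformed values silently become null partition values.
def legacyCastToInt(value: String): Option[Int] =
  scala.util.Try(value.trim.toInt).toOption

// ANSI/STRICT-style behaviour: surface the bad value instead of writing null.
def ansiCastToInt(value: String): Int =
  scala.util.Try(value.trim.toInt).getOrElse(
    throw new NumberFormatException(s"invalid input syntax for type int: $value"))

legacyCastToInt("15q")    // None, the row would land in a null partition
// ansiCastToInt("15q")   // throws, matching the stricter store assignment policies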
+ case StoreAssignmentPolicy.ANSI | StoreAssignmentPolicy.STRICT => + Some(Alias(AnsiCast(Literal(partValue), field.dataType, + Option(conf.sessionLocalTimeZone)), field.name)()) + case _ => + Some(Alias(cast(Literal(partValue), field.dataType), field.name)()) + } } else { throw new AnalysisException( s"Partition column ${field.name} have multiple values specified, " + @@ -437,61 +448,63 @@ object DataSourceStrategy { } } - private def translateLeafNodeFilter(predicate: Expression): Option[Filter] = predicate match { - case expressions.EqualTo(a: Attribute, Literal(v, t)) => - Some(sources.EqualTo(a.name, convertToScala(v, t))) - case expressions.EqualTo(Literal(v, t), a: Attribute) => - Some(sources.EqualTo(a.name, convertToScala(v, t))) - - case expressions.EqualNullSafe(a: Attribute, Literal(v, t)) => - Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) - case expressions.EqualNullSafe(Literal(v, t), a: Attribute) => - Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) - - case expressions.GreaterThan(a: Attribute, Literal(v, t)) => - Some(sources.GreaterThan(a.name, convertToScala(v, t))) - case expressions.GreaterThan(Literal(v, t), a: Attribute) => - Some(sources.LessThan(a.name, convertToScala(v, t))) - - case expressions.LessThan(a: Attribute, Literal(v, t)) => - Some(sources.LessThan(a.name, convertToScala(v, t))) - case expressions.LessThan(Literal(v, t), a: Attribute) => - Some(sources.GreaterThan(a.name, convertToScala(v, t))) - - case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, t)) => - Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) - case expressions.GreaterThanOrEqual(Literal(v, t), a: Attribute) => - Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) - - case expressions.LessThanOrEqual(a: Attribute, Literal(v, t)) => - Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) - case expressions.LessThanOrEqual(Literal(v, t), a: Attribute) => - Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) - - case expressions.InSet(a: Attribute, set) => - val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) - Some(sources.In(a.name, set.toArray.map(toScala))) + private def translateLeafNodeFilter( + predicate: Expression, + pushableColumn: PushableColumnBase): Option[Filter] = predicate match { + case expressions.EqualTo(pushableColumn(name), Literal(v, t)) => + Some(sources.EqualTo(name, convertToScala(v, t))) + case expressions.EqualTo(Literal(v, t), pushableColumn(name)) => + Some(sources.EqualTo(name, convertToScala(v, t))) + + case expressions.EqualNullSafe(pushableColumn(name), Literal(v, t)) => + Some(sources.EqualNullSafe(name, convertToScala(v, t))) + case expressions.EqualNullSafe(Literal(v, t), pushableColumn(name)) => + Some(sources.EqualNullSafe(name, convertToScala(v, t))) + + case expressions.GreaterThan(pushableColumn(name), Literal(v, t)) => + Some(sources.GreaterThan(name, convertToScala(v, t))) + case expressions.GreaterThan(Literal(v, t), pushableColumn(name)) => + Some(sources.LessThan(name, convertToScala(v, t))) + + case expressions.LessThan(pushableColumn(name), Literal(v, t)) => + Some(sources.LessThan(name, convertToScala(v, t))) + case expressions.LessThan(Literal(v, t), pushableColumn(name)) => + Some(sources.GreaterThan(name, convertToScala(v, t))) + + case expressions.GreaterThanOrEqual(pushableColumn(name), Literal(v, t)) => + Some(sources.GreaterThanOrEqual(name, convertToScala(v, t))) + case expressions.GreaterThanOrEqual(Literal(v, t), pushableColumn(name)) => + 
Some(sources.LessThanOrEqual(name, convertToScala(v, t))) + + case expressions.LessThanOrEqual(pushableColumn(name), Literal(v, t)) => + Some(sources.LessThanOrEqual(name, convertToScala(v, t))) + case expressions.LessThanOrEqual(Literal(v, t), pushableColumn(name)) => + Some(sources.GreaterThanOrEqual(name, convertToScala(v, t))) + + case expressions.InSet(e @ pushableColumn(name), set) => + val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) + Some(sources.In(name, set.toArray.map(toScala))) // Because we only convert In to InSet in Optimizer when there are more than certain // items. So it is possible we still get an In expression here that needs to be pushed // down. - case expressions.In(a: Attribute, list) if list.forall(_.isInstanceOf[Literal]) => + case expressions.In(e @ pushableColumn(name), list) if list.forall(_.isInstanceOf[Literal]) => val hSet = list.map(_.eval(EmptyRow)) - val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) - Some(sources.In(a.name, hSet.toArray.map(toScala))) + val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) + Some(sources.In(name, hSet.toArray.map(toScala))) - case expressions.IsNull(a: Attribute) => - Some(sources.IsNull(a.name)) - case expressions.IsNotNull(a: Attribute) => - Some(sources.IsNotNull(a.name)) - case expressions.StartsWith(a: Attribute, Literal(v: UTF8String, StringType)) => - Some(sources.StringStartsWith(a.name, v.toString)) + case expressions.IsNull(pushableColumn(name)) => + Some(sources.IsNull(name)) + case expressions.IsNotNull(pushableColumn(name)) => + Some(sources.IsNotNull(name)) + case expressions.StartsWith(pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(sources.StringStartsWith(name, v.toString)) - case expressions.EndsWith(a: Attribute, Literal(v: UTF8String, StringType)) => - Some(sources.StringEndsWith(a.name, v.toString)) + case expressions.EndsWith(pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(sources.StringEndsWith(name, v.toString)) - case expressions.Contains(a: Attribute, Literal(v: UTF8String, StringType)) => - Some(sources.StringContains(a.name, v.toString)) + case expressions.Contains(pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(sources.StringContains(name, v.toString)) case expressions.Literal(true, BooleanType) => Some(sources.AlwaysTrue) @@ -507,8 +520,9 @@ object DataSourceStrategy { * * @return a `Some[Filter]` if the input [[Expression]] is convertible, otherwise a `None`. */ - protected[sql] def translateFilter(predicate: Expression): Option[Filter] = { - translateFilterWithMapping(predicate, None) + protected[sql] def translateFilter( + predicate: Expression, supportNestedPredicatePushdown: Boolean): Option[Filter] = { + translateFilterWithMapping(predicate, None, supportNestedPredicatePushdown) } /** @@ -518,11 +532,13 @@ object DataSourceStrategy { * @param translatedFilterToExpr An optional map from leaf node filter expressions to its * translated [[Filter]]. The map is used for rebuilding * [[Expression]] from [[Filter]]. + * @param nestedPredicatePushdownEnabled Whether nested predicate pushdown is enabled. * @return a `Some[Filter]` if the input [[Expression]] is convertible, otherwise a `None`. 
*/ protected[sql] def translateFilterWithMapping( predicate: Expression, - translatedFilterToExpr: Option[mutable.HashMap[sources.Filter, Expression]]) + translatedFilterToExpr: Option[mutable.HashMap[sources.Filter, Expression]], + nestedPredicatePushdownEnabled: Boolean) : Option[Filter] = { predicate match { case expressions.And(left, right) => @@ -536,21 +552,26 @@ object DataSourceStrategy { // Pushing one leg of AND down is only safe to do at the top level. // You can see ParquetFilters' createFilter for more details. for { - leftFilter <- translateFilterWithMapping(left, translatedFilterToExpr) - rightFilter <- translateFilterWithMapping(right, translatedFilterToExpr) + leftFilter <- translateFilterWithMapping( + left, translatedFilterToExpr, nestedPredicatePushdownEnabled) + rightFilter <- translateFilterWithMapping( + right, translatedFilterToExpr, nestedPredicatePushdownEnabled) } yield sources.And(leftFilter, rightFilter) case expressions.Or(left, right) => for { - leftFilter <- translateFilterWithMapping(left, translatedFilterToExpr) - rightFilter <- translateFilterWithMapping(right, translatedFilterToExpr) + leftFilter <- translateFilterWithMapping( + left, translatedFilterToExpr, nestedPredicatePushdownEnabled) + rightFilter <- translateFilterWithMapping( + right, translatedFilterToExpr, nestedPredicatePushdownEnabled) } yield sources.Or(leftFilter, rightFilter) case expressions.Not(child) => - translateFilterWithMapping(child, translatedFilterToExpr).map(sources.Not) + translateFilterWithMapping(child, translatedFilterToExpr, nestedPredicatePushdownEnabled) + .map(sources.Not) case other => - val filter = translateLeafNodeFilter(other) + val filter = translateLeafNodeFilter(other, PushableColumn(nestedPredicatePushdownEnabled)) if (filter.isDefined && translatedFilterToExpr.isDefined) { translatedFilterToExpr.get(filter.get) = predicate } @@ -597,8 +618,9 @@ object DataSourceStrategy { // A map from original Catalyst expressions to corresponding translated data source filters. // If a predicate is not in this map, it means it cannot be pushed down. + val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation) val translatedMap: Map[Expression, Filter] = predicates.flatMap { p => - translateFilter(p).map(f => p -> f) + translateFilter(p, supportNestedPredicatePushdown).map(f => p -> f) }.toMap val pushedFilters: Seq[Filter] = translatedMap.values.toSeq @@ -626,12 +648,53 @@ object DataSourceStrategy { output: Seq[Attribute], rdd: RDD[Row]): RDD[InternalRow] = { if (relation.needConversion) { - val converters = RowEncoder(StructType.fromAttributes(output)) + val toRow = RowEncoder(StructType.fromAttributes(output)).createSerializer() rdd.mapPartitions { iterator => - iterator.map(converters.toRow) + iterator.map(toRow) } } else { rdd.asInstanceOf[RDD[InternalRow]] } } } + +/** + * Find the column name of an expression that can be pushed down. 
+ */ +abstract class PushableColumnBase { + val nestedPredicatePushdownEnabled: Boolean + + def unapply(e: Expression): Option[String] = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper + def helper(e: Expression): Option[Seq[String]] = e match { + case a: Attribute => + if (nestedPredicatePushdownEnabled || !a.name.contains(".")) { + Some(Seq(a.name)) + } else { + None + } + case s: GetStructField if nestedPredicatePushdownEnabled => + helper(s.child).map(_ :+ s.childSchema(s.ordinal).name) + case _ => None + } + helper(e).map(_.quoted) + } +} + +object PushableColumn { + def apply(nestedPredicatePushdownEnabled: Boolean): PushableColumnBase = { + if (nestedPredicatePushdownEnabled) { + PushableColumnAndNestedColumn + } else { + PushableColumnWithoutNestedColumn + } + } +} + +object PushableColumnAndNestedColumn extends PushableColumnBase { + override val nestedPredicatePushdownEnabled = true +} + +object PushableColumnWithoutNestedColumn extends PushableColumnBase { + override val nestedPredicatePushdownEnabled = false +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala index bd56635084c34..abb74d8d09ec6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala @@ -17,12 +17,21 @@ package org.apache.spark.sql.execution.datasources +import java.util.Locale + import org.apache.hadoop.fs.Path import org.json4s.NoTypeHints import org.json4s.jackson.Serialization +import org.apache.spark.SparkUpgradeException +import org.apache.spark.sql.{SPARK_LEGACY_DATETIME, SPARK_VERSION_METADATA_KEY} import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.util.RebaseDateTime +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy +import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils object DataSourceUtils { @@ -64,4 +73,121 @@ object DataSourceUtils { private[sql] def isDataFile(fileName: String) = !(fileName.startsWith("_") || fileName.startsWith(".")) + + /** + * Returns if the given relation's V1 datasource provider supports nested predicate pushdown. + */ + private[sql] def supportNestedPredicatePushdown(relation: BaseRelation): Boolean = + relation match { + case hs: HadoopFsRelation => + val supportedDatasources = + Utils.stringToSeq(SQLConf.get.getConf(SQLConf.NESTED_PREDICATE_PUSHDOWN_FILE_SOURCE_LIST) + .toLowerCase(Locale.ROOT)) + supportedDatasources.contains(hs.toString) + case _ => false + } + + def datetimeRebaseMode( + lookupFileMeta: String => String, + modeByConfig: String): LegacyBehaviorPolicy.Value = { + if (Utils.isTesting && SQLConf.get.getConfString("spark.test.forceNoRebase", "") == "true") { + return LegacyBehaviorPolicy.CORRECTED + } + // If there is no version, we return the mode specified by the config. + Option(lookupFileMeta(SPARK_VERSION_METADATA_KEY)).map { version => + // Files written by Spark 2.4 and earlier follow the legacy hybrid calendar and we need to + // rebase the datetime values. + // Files written by Spark 3.0 and later may also need the rebase if they were written with + // the "LEGACY" rebase mode.
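// Simplified, standalone restatement of the decision this method makes (illustrative
// names, no Spark types): files written before 3.0.0, or carrying the legacy-datetime
// marker, must be rebased (LEGACY); other 3.0+ files are read as-is (CORRECTED); files
// with no Spark version metadata at all fall back to the configured default.
object RebaseModeSketch {
  sealed trait Mode
  case object Legacy extends Mode
  case object Corrected extends Mode

  def decide(
      writerVersion: Option[String],    // value of the Spark version metadata key, if present
      hasLegacyDatetimeMarker: Boolean, // true if the legacy-datetime metadata key is set
      configuredDefault: Mode): Mode = writerVersion match {
    case Some(v) if v < "3.0.0" || hasLegacyDatetimeMarker => Legacy
    case Some(_) => Corrected
    case None => configuredDefault
  }
}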
+ if (version < "3.0.0" || lookupFileMeta(SPARK_LEGACY_DATETIME) != null) { + LegacyBehaviorPolicy.LEGACY + } else { + LegacyBehaviorPolicy.CORRECTED + } + }.getOrElse(LegacyBehaviorPolicy.withName(modeByConfig)) + } + + def newRebaseExceptionInRead(format: String): SparkUpgradeException = { + val config = if (format == "Parquet") { + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ.key + } else if (format == "Avro") { + SQLConf.LEGACY_AVRO_REBASE_MODE_IN_READ.key + } else { + throw new IllegalStateException("unrecognized format " + format) + } + new SparkUpgradeException("3.0", "reading dates before 1582-10-15 or timestamps before " + + s"1900-01-01T00:00:00Z from $format files can be ambiguous, as the files may be written by " + + "Spark 2.x or legacy versions of Hive, which uses a legacy hybrid calendar that is " + + "different from Spark 3.0+'s Proleptic Gregorian calendar. See more details in " + + s"SPARK-31404. You can set $config to 'LEGACY' to rebase the datetime values w.r.t. " + + s"the calendar difference during reading. Or set $config to 'CORRECTED' to read the " + + "datetime values as it is.", null) + } + + def newRebaseExceptionInWrite(format: String): SparkUpgradeException = { + val config = if (format == "Parquet") { + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key + } else if (format == "Avro") { + SQLConf.LEGACY_AVRO_REBASE_MODE_IN_WRITE.key + } else { + throw new IllegalStateException("unrecognized format " + format) + } + new SparkUpgradeException("3.0", "writing dates before 1582-10-15 or timestamps before " + + s"1900-01-01T00:00:00Z into $format files can be dangerous, as the files may be read by " + + "Spark 2.x or legacy versions of Hive later, which uses a legacy hybrid calendar that is " + + "different from Spark 3.0+'s Proleptic Gregorian calendar. See more details in " + + s"SPARK-31404. You can set $config to 'LEGACY' to rebase the datetime values w.r.t. " + + "the calendar difference during writing, to get maximum interoperability. 
Or set " + + s"$config to 'CORRECTED' to write the datetime values as it is, if you are 100% sure that " + + "the written files will only be read by Spark 3.0+ or other systems that use Proleptic " + + "Gregorian calendar.", null) + } + + def creteDateRebaseFuncInRead( + rebaseMode: LegacyBehaviorPolicy.Value, + format: String): Int => Int = rebaseMode match { + case LegacyBehaviorPolicy.EXCEPTION => days: Int => + if (days < RebaseDateTime.lastSwitchJulianDay) { + throw DataSourceUtils.newRebaseExceptionInRead(format) + } + days + case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseJulianToGregorianDays + case LegacyBehaviorPolicy.CORRECTED => identity[Int] + } + + def creteDateRebaseFuncInWrite( + rebaseMode: LegacyBehaviorPolicy.Value, + format: String): Int => Int = rebaseMode match { + case LegacyBehaviorPolicy.EXCEPTION => days: Int => + if (days < RebaseDateTime.lastSwitchGregorianDay) { + throw DataSourceUtils.newRebaseExceptionInWrite(format) + } + days + case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseGregorianToJulianDays + case LegacyBehaviorPolicy.CORRECTED => identity[Int] + } + + def creteTimestampRebaseFuncInRead( + rebaseMode: LegacyBehaviorPolicy.Value, + format: String): Long => Long = rebaseMode match { + case LegacyBehaviorPolicy.EXCEPTION => micros: Long => + if (micros < RebaseDateTime.lastSwitchJulianTs) { + throw DataSourceUtils.newRebaseExceptionInRead(format) + } + micros + case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseJulianToGregorianMicros + case LegacyBehaviorPolicy.CORRECTED => identity[Long] + } + + def creteTimestampRebaseFuncInWrite( + rebaseMode: LegacyBehaviorPolicy.Value, + format: String): Long => Long = rebaseMode match { + case LegacyBehaviorPolicy.EXCEPTION => micros: Long => + if (micros < RebaseDateTime.lastSwitchGregorianTs) { + throw DataSourceUtils.newRebaseExceptionInWrite(format) + } + micros + case LegacyBehaviorPolicy.LEGACY => RebaseDateTime.rebaseGregorianToJulianMicros + case LegacyBehaviorPolicy.CORRECTED => identity[Long] + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala new file mode 100644 index 0000000000000..a04c2fcbbac12 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DaysWritable.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import java.io.{DataInput, DataOutput, IOException} +import java.sql.Date + +import org.apache.hadoop.hive.serde2.io.DateWritable +import org.apache.hadoop.io.WritableUtils + +import org.apache.spark.sql.catalyst.util.RebaseDateTime.{rebaseGregorianToJulianDays, rebaseJulianToGregorianDays} + +/** + * The class accepts/returns days in Gregorian calendar and rebase them + * via conversion to local date in Julian calendar for dates before 1582-10-15 + * in read/write for backward compatibility with Spark 2.4 and earlier versions. + * + * @param gregorianDays The number of days since the epoch 1970-01-01 in + * Gregorian calendar. + * @param julianDays The number of days since the epoch 1970-01-01 in + * Julian calendar. + */ +class DaysWritable( + var gregorianDays: Int, + var julianDays: Int) + extends DateWritable { + + def this() = this(0, 0) + def this(gregorianDays: Int) = + this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays)) + def this(dateWritable: DateWritable) = { + this( + gregorianDays = dateWritable match { + case daysWritable: DaysWritable => daysWritable.gregorianDays + case dateWritable: DateWritable => + rebaseJulianToGregorianDays(dateWritable.getDays) + }, + julianDays = dateWritable.getDays) + } + + override def getDays: Int = julianDays + override def get: Date = { + new Date(DateWritable.daysToMillis(julianDays)) + } + override def get(doesTimeMatter: Boolean): Date = { + new Date(DateWritable.daysToMillis(julianDays, doesTimeMatter)) + } + + override def set(d: Int): Unit = { + gregorianDays = d + julianDays = rebaseGregorianToJulianDays(d) + } + + @throws[IOException] + override def write(out: DataOutput): Unit = { + WritableUtils.writeVInt(out, julianDays) + } + + @throws[IOException] + override def readFields(in: DataInput): Unit = { + julianDays = WritableUtils.readVInt(in) + gregorianDays = rebaseJulianToGregorianDays(julianDays) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala index 50c4f6cd57a96..edb49d3f90ca3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala @@ -182,8 +182,7 @@ class DynamicPartitionDataWriter( val partitionName = ScalaUDF( ExternalCatalogUtils.getPartitionPathString _, StringType, - Seq(Literal(c.name), Cast(c, StringType, Option(description.timeZoneId))), - Seq(false, false)) + Seq(Literal(c.name), Cast(c, StringType, Option(description.timeZoneId)))) if (i == 0) Seq(partitionName) else Seq(Literal(Path.SEPARATOR), partitionName) }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 219c778b9164a..a71aeb47872ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCo import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.{ProjectExec, SortExec, 
SparkPlan, SQLExecution} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StringType import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -132,7 +133,7 @@ object FileFormatWriter extends Logging { fileFormat.prepareWrite(sparkSession, job, caseInsensitiveOptions, dataSchema) val description = new WriteJobDescription( - uuid = UUID.randomUUID().toString, + uuid = UUID.randomUUID.toString, serializableHadoopConf = new SerializableConfiguration(job.getConfiguration), outputWriterFactory = outputWriterFactory, allColumns = outputSpec.outputColumns, @@ -163,6 +164,10 @@ object FileFormatWriter extends Logging { SQLExecution.checkSQLExecutionId(sparkSession) + // propagate the description UUID into the jobs, so that committers + // get an ID guaranteed to be unique. + job.getConfiguration.set("spark.sql.sources.writeJobUUID", description.uuid) + // This call shouldn't be put into the `try` block below because it only initializes and // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called. committer.setupJob(job) @@ -283,7 +288,7 @@ object FileFormatWriter extends Logging { } catch { case e: FetchFailedException => throw e - case f: FileAlreadyExistsException => + case f: FileAlreadyExistsException if SQLConf.get.fastFailFileFormatOutput => // If any output file to write already exists, it does not make sense to re-run this task. // We throw the exception and let Executor throw ExceptionFailure to abort the job. throw new TaskOutputFileAlreadyExistException(f) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 542c996a5342d..fc59336d6107c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -21,7 +21,7 @@ import java.io.{FileNotFoundException, IOException} import org.apache.parquet.io.ParquetDecodingException -import org.apache.spark.{Partition => RDDPartition, TaskContext} +import org.apache.spark.{Partition => RDDPartition, SparkUpgradeException, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession @@ -178,7 +178,9 @@ class FileScanRDD( s"Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}" throw new QueryExecutionException(message, e) case e: ParquetDecodingException => - if (e.getMessage.contains("Can not read value at")) { + if (e.getCause.isInstanceOf[SparkUpgradeException]) { + throw e.getCause + } else if (e.getMessage.contains("Can not read value at")) { val message = "Encounter error while reading parquet files. " + "One possible cause: Parquet column cannot be converted in the " + "corresponding files.
Details: " diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index f45495121a980..4b9e0c6da11dd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -89,9 +89,8 @@ object FileSourceStrategy extends Strategy with Logging { case expressions.In(a: Attribute, list) if list.forall(_.isInstanceOf[Literal]) && a.name == bucketColumnName => getBucketSetFromIterable(a, list.map(e => e.eval(EmptyRow))) - case expressions.InSet(a: Attribute, hset) - if hset.forall(_.isInstanceOf[Literal]) && a.name == bucketColumnName => - getBucketSetFromIterable(a, hset.map(e => expressions.Literal(e).eval(EmptyRow))) + case expressions.InSet(a: Attribute, hset) if a.name == bucketColumnName => + getBucketSetFromIterable(a, hset) case expressions.IsNull(a: Attribute) if a.name == bucketColumnName => getBucketSetFromValue(a, null) case expressions.And(left, right) => @@ -154,11 +153,15 @@ object FileSourceStrategy extends Strategy with Logging { l.resolve( fsRelation.partitionSchema, fsRelation.sparkSession.sessionState.analyzer.resolver) val partitionSet = AttributeSet(partitionColumns) - val partitionKeyFilters = - ExpressionSet(normalizedFilters + val partitionKeyFilters = if (partitionColumns.isEmpty) { + ExpressionSet(Nil) + } else { + val predicates = ExpressionSet(normalizedFilters .filter(_.references.subsetOf(partitionSet))) + logInfo(s"Pruning directories with: ${predicates.mkString(",")}") + predicates + } - logInfo(s"Pruning directories with: ${partitionKeyFilters.mkString(",")}") // subquery expressions are filtered out because they can't be used to prune buckets or pushed // down as data filters, yet they would be executed @@ -178,8 +181,11 @@ object FileSourceStrategy extends Strategy with Logging { // Partition keys are not available in the statistics of the files. val dataFilters = normalizedFiltersWithoutSubqueries.filter(_.references.intersect(partitionSet).isEmpty) - logInfo(s"Pushed Filters: " + - s"${dataFilters.flatMap(DataSourceStrategy.translateFilter).mkString(",")}") + val supportNestedPredicatePushdown = + DataSourceUtils.supportNestedPredicatePushdown(fsRelation) + val pushedFilters = dataFilters + .flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) + logInfo(s"Pushed Filters: ${pushedFilters.mkString(",")}") // Predicates with both partition keys and attributes need to be evaluated after the scan. 
val afterScanFilters = filterSet -- partitionKeyFilters.filter(_.references.nonEmpty) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index f11972115e09f..fe733f4238e1a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -192,7 +192,7 @@ case class InsertIntoHadoopFsRelationCommand( // refresh cached files in FileIndex fileIndex.foreach(_.refresh()) // refresh data cache if table is cached - sparkSession.catalog.refreshByPath(outputPath.toString) + sparkSession.sharedState.cacheManager.recacheByPath(sparkSession, outputPath, fs) if (catalogTable.nonEmpty) { CommandUtils.updateTableStats(sparkSession, catalogTable.get) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index 2e09c729529a6..5341e22f5e670 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -56,14 +56,17 @@ abstract class PartitioningAwareFileIndex( protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]] - protected lazy val pathGlobFilter = parameters.get("pathGlobFilter").map(new GlobFilter(_)) + private val caseInsensitiveMap = CaseInsensitiveMap(parameters) + + protected lazy val pathGlobFilter: Option[GlobFilter] = + caseInsensitiveMap.get("pathGlobFilter").map(new GlobFilter(_)) protected def matchGlobPattern(file: FileStatus): Boolean = { pathGlobFilter.forall(_.accept(file.getPath)) } - protected lazy val recursiveFileLookup = { - parameters.getOrElse("recursiveFileLookup", "false").toBoolean + protected lazy val recursiveFileLookup: Boolean = { + caseInsensitiveMap.getOrElse("recursiveFileLookup", "false").toBoolean } override def listFiles( @@ -215,7 +218,7 @@ abstract class PartitioningAwareFileIndex( * and the returned DataFrame will have the column of `something`. 
*/ private def basePaths: Set[Path] = { - parameters.get(BASE_PATH_PARAM).map(new Path(_)) match { + caseInsensitiveMap.get(BASE_PATH_PARAM).map(new Path(_)) match { case Some(userDefinedBasePath) => val fs = userDefinedBasePath.getFileSystem(hadoopConf) if (!fs.isDirectory(userDefinedBasePath)) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index fdad43b23c5aa..5846d46e146ed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -60,7 +60,7 @@ object PartitionSpec { object PartitioningUtils { - val timestampPartitionPattern = "uuuu-MM-dd HH:mm:ss[.S]" + val timestampPartitionPattern = "yyyy-MM-dd HH:mm:ss[.S]" private[datasources] case class PartitionValues(columnNames: Seq[String], literals: Seq[Literal]) { @@ -131,7 +131,10 @@ object PartitioningUtils { } val dateFormatter = DateFormatter(zoneId) - val timestampFormatter = TimestampFormatter(timestampPartitionPattern, zoneId) + val timestampFormatter = TimestampFormatter( + timestampPartitionPattern, + zoneId, + isParsing = true) // First, we need to parse every partition's path and see if we can find partition values. val (partitionValues, optDiscoveredBasePaths) = paths.map { path => parsePartition(path, typeInference, basePaths, userSpecifiedDataTypes, @@ -542,6 +545,9 @@ object PartitioningUtils { partitionColumns: Seq[String], caseSensitive: Boolean): Unit = { + SchemaUtils.checkColumnNameDuplication( + partitionColumns, partitionColumns.mkString(", "), caseSensitive) + partitionColumnsSchema(schema, partitionColumns, caseSensitive).foreach { field => field.dataType match { case _: AtomicType => // OK diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index 1ea19c187e51a..a7129fb14d1a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -26,6 +26,19 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, FileScan} import org.apache.spark.sql.types.StructType +/** + * Prune the partitions of file source based table using partition filters. Currently, this rule + * is applied to [[HadoopFsRelation]] with [[CatalogFileIndex]] and [[DataSourceV2ScanRelation]] + * with [[FileScan]]. + * + * For [[HadoopFsRelation]], the location will be replaced by pruned file index, and corresponding + * statistics will be updated. And the partition filters will be kept in the filters of returned + * logical plan. + * + * For [[DataSourceV2ScanRelation]], both partition filters and data filters will be added to + * its underlying [[FileScan]]. And the partition filters will be removed in the filters of + * returned logical plan. 
+ */ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { private def getPartitionKeyFiltersAndDataFilters( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala index 99882b0f7c7b0..28097c35401c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala @@ -32,10 +32,12 @@ object SchemaMergeUtils extends Logging { */ def mergeSchemasInParallel( sparkSession: SparkSession, + parameters: Map[String, String], files: Seq[FileStatus], schemaReader: (Seq[FileStatus], Configuration, Boolean) => Seq[StructType]) : Option[StructType] = { - val serializedConf = new SerializableConfiguration(sparkSession.sessionState.newHadoopConf()) + val serializedConf = new SerializableConfiguration( + sparkSession.sessionState.newHadoopConfWithOptions(parameters)) // !! HACK ALERT !! // Here is a hack for Parquet, but it can be used by Orc as well. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 375cec597166c..d0283f39707c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -156,7 +156,7 @@ object TextInputCSVDataSource extends CSVDataSource { sparkSession, paths = paths, className = classOf[TextFileFormat].getName, - options = options.parameters + options = options.parameters.originalMap ++ Map(DataSource.GLOB_PATHS_KEY -> "false") ).resolveRelation(checkFilesExist = false)) .select("value").as[String](Encoders.STRING) } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala index 21fabac472f4b..d8b52c503ad34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala @@ -33,11 +33,12 @@ object CSVUtils { // with the one below, `filterCommentAndEmpty` but execution path is different. One of them // might have to be removed in the near future if possible. 
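// Standalone, collection-based sketch of what this helper does over a Dataset[String]
// (illustrative names; the real code works on Spark Datasets): blank lines are dropped,
// and when a comment character is configured, lines starting with it are dropped as well.
object CsvLineFilterSketch {
  def filterCommentAndEmpty(lines: Seq[String], comment: Option[Char]): Seq[String] =
    lines.filter { line =>
      line.trim.nonEmpty && comment.forall(c => !line.startsWith(c.toString))
    }
}
// e.g. filterCommentAndEmpty(Seq("# header", "a,b", ""), Some('#')) == Seq("a,b")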
import lines.sqlContext.implicits._ - val nonEmptyLines = lines.filter(length(trim($"value")) > 0) + val aliased = lines.toDF("value") + val nonEmptyLines = aliased.filter(length(trim($"value")) > 0) if (options.isCommentSet) { - nonEmptyLines.filter(!$"value".startsWith(options.comment.toString)) + nonEmptyLines.filter(!$"value".startsWith(options.comment.toString)).as[String] } else { - nonEmptyLines + nonEmptyLines.as[String] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index c1e1aed83bae5..3f44312fd6985 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -331,9 +331,9 @@ object JdbcUtils extends Logging { def resultSetToRows(resultSet: ResultSet, schema: StructType): Iterator[Row] = { val inputMetrics = Option(TaskContext.get()).map(_.taskMetrics().inputMetrics).getOrElse(new InputMetrics) - val encoder = RowEncoder(schema).resolveAndBind() + val fromRow = RowEncoder(schema).resolveAndBind().createDeserializer() val internalRows = resultSetToSparkInternalRows(resultSet, schema, inputMetrics) - internalRows.map(encoder.fromRow) + internalRows.map(fromRow) } private[spark] def resultSetToSparkInternalRows( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala index 7ec2267e3461f..920f83e6bea97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonDataSource.scala @@ -120,7 +120,7 @@ object TextInputJsonDataSource extends JsonDataSource { sparkSession, paths = inputPaths.map(_.getPath.toString), className = classOf[TextFileFormat].getName, - options = parsedOptions.parameters + options = parsedOptions.parameters.originalMap ++ Map(DataSource.GLOB_PATHS_KEY -> "false") ).resolveRelation(checkFilesExist = false)) .select("value").as(Encoders.STRING) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala index 4fad0a2484cde..8a6c4dce75f30 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} -import org.apache.spark.sql.internal.connector.SimpleTableProvider +import org.apache.spark.sql.internal.connector.{SimpleTableProvider, SupportsStreamingUpdate} import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -52,8 +52,10 @@ private[noop] object NoopTable extends Table with SupportsWrite { } } -private[noop] 
object NoopWriteBuilder extends WriteBuilder with SupportsTruncate { +private[noop] object NoopWriteBuilder extends WriteBuilder + with SupportsTruncate with SupportsStreamingUpdate { override def truncate(): WriteBuilder = this + override def update(): WriteBuilder = this override def buildForBatch(): BatchWrite = NoopBatchWrite override def buildForStreaming(): StreamingWrite = NoopStreamingWrite } @@ -61,6 +63,7 @@ private[noop] object NoopWriteBuilder extends WriteBuilder with SupportsTruncate private[noop] object NoopBatchWrite extends BatchWrite { override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = NoopWriterFactory + override def useCommitCoordinator(): Boolean = false override def commit(messages: Array[WriterCommitMessage]): Unit = {} override def abort(messages: Array[WriterCommitMessage]): Unit = {} } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala index 6d52d40d6dd03..4ab009c6bd014 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala @@ -23,6 +23,7 @@ import org.apache.orc.mapred.{OrcList, OrcMap, OrcStruct, OrcTimestamp} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.catalyst.util.RebaseDateTime.rebaseJulianToGregorianDays import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -108,7 +109,7 @@ class OrcDeserializer( updater.set(ordinal, bytes) case DateType => (ordinal, value) => - updater.setInt(ordinal, DateTimeUtils.fromJavaDate(OrcShimUtils.getSqlDate(value))) + updater.setInt(ordinal, OrcShimUtils.getGregorianDays(value)) case TimestampType => (ordinal, value) => updater.setLong(ordinal, DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp])) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index fd791ce7c5e19..c540007d5aad1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -153,24 +153,19 @@ class OrcFileFormat filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { - if (sparkSession.sessionState.conf.orcFilterPushDown) { - OrcFilters.createFilter(dataSchema, filters).foreach { f => - OrcInputFormat.setSearchArgument(hadoopConf, f, dataSchema.fieldNames) - } - } val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) val sqlConf = sparkSession.sessionState.conf val enableVectorizedReader = supportBatch(sparkSession, resultSchema) val capacity = sqlConf.orcVectorizedReaderBatchSize - val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) - OrcConf.MAPRED_INPUT_SCHEMA.setString(hadoopConf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(hadoopConf, sqlConf.caseSensitiveAnalysis) val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) val isCaseSensitive = 
sparkSession.sessionState.conf.caseSensitiveAnalysis + val orcFilterPushDown = sparkSession.sessionState.conf.orcFilterPushDown + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles (file: PartitionedFile) => { val conf = broadcastedConf.value.value @@ -179,16 +174,27 @@ class OrcFileFormat val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val requestedColIdsOrEmptyFile = + val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, requiredSchema, reader, conf) } - if (requestedColIdsOrEmptyFile.isEmpty) { + if (resultedColPruneInfo.isEmpty) { Iterator.empty } else { - val requestedColIds = requestedColIdsOrEmptyFile.get + // ORC predicate pushdown + if (orcFilterPushDown) { + OrcUtils.readCatalystSchema(filePath, conf, ignoreCorruptFiles).foreach { fileSchema => + OrcFilters.createFilter(fileSchema, filters).foreach { f => + OrcInputFormat.setSearchArgument(conf, f, fileSchema.fieldNames) + } + } + } + + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = OrcUtils.orcResultSchemaString(canPruneCols, + dataSchema, resultSchema, partitionSchema, conf) assert(requestedColIds.length == requiredSchema.length, "[BUG] requested column IDs do not match required schema") val taskConf = new Configuration(conf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFiltersBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFiltersBase.scala index 0b5658715377a..4554899ec2827 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFiltersBase.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFiltersBase.scala @@ -17,14 +17,45 @@ package org.apache.spark.sql.execution.datasources.orc +import java.util.Locale + +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.quoteIfNeeded import org.apache.spark.sql.sources.{And, Filter} -import org.apache.spark.sql.types.{AtomicType, BinaryType, DataType} +import org.apache.spark.sql.types.{AtomicType, BinaryType, DataType, StructType} /** * Methods that can be shared when upgrading the built-in Hive. */ trait OrcFiltersBase { + case class OrcPrimitiveField(fieldName: String, fieldType: DataType) + + protected[sql] def getDataTypeMap( + schema: StructType, + caseSensitive: Boolean): Map[String, OrcPrimitiveField] = { + val fields = schema.flatMap { f => + if (isSearchableType(f.dataType)) { + Some(quoteIfNeeded(f.name) -> OrcPrimitiveField(quoteIfNeeded(f.name), f.dataType)) + } else { + None + } + } + + if (caseSensitive) { + fields.toMap + } else { + // Don't consider ambiguity here, i.e. more than one field are matched in case insensitive + // mode, just skip pushdown for these fields, they will trigger Exception when reading, + // See: SPARK-25175. + val dedupPrimitiveFields = fields + .groupBy(_._1.toLowerCase(Locale.ROOT)) + .filter(_._2.size == 1) + .mapValues(_.head._2) + CaseInsensitiveMap(dedupPrimitiveFields) + } + } + private[sql] def buildTree(filters: Seq[Filter]): Option[Filter] = { filters match { case Seq() => None @@ -36,21 +67,11 @@ trait OrcFiltersBase { } } - // Since ORC 1.5.0 (ORC-323), we need to quote for column names with `.` characters - // in order to distinguish predicate pushdown for nested columns. 
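// Standalone sketch of the case-insensitive de-duplication introduced in getDataTypeMap
// above: when two field names collide after lower-casing, neither is kept, so no filter
// is pushed down for the ambiguous name (plain Scala, illustrative names only).
object CaseInsensitiveDedupSketch {
  import java.util.Locale
  def dedup(fieldNames: Seq[String]): Map[String, String] =
    fieldNames
      .groupBy(_.toLowerCase(Locale.ROOT))
      .filter { case (_, names) => names.size == 1 }
      .map { case (lower, names) => lower -> names.head }
}
// e.g. dedup(Seq("id", "Name", "NAME")) == Map("id" -> "id")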
- protected[sql] def quoteAttributeNameIfNeeded(name: String) : String = { - if (!name.contains("`") && name.contains(".")) { - s"`$name`" - } else { - name - } - } - /** * Return true if this is a searchable type in ORC. * Both CharType and VarcharType are cleaned at AstBuilder. */ - protected[sql] def isSearchableType(dataType: DataType) = dataType match { + private def isSearchableType(dataType: DataType) = dataType match { case BinaryType => false case _: AtomicType => true case _ => false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index eea9b2a8f9613..c29287f1789e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.orc.{OrcFile, Reader, TypeDescription, Writer} +import org.apache.orc.{OrcConf, OrcFile, Reader, TypeDescription, Writer} import org.apache.spark.{SPARK_VERSION_SHORT, SparkException} import org.apache.spark.deploy.SparkHadoopUtil @@ -81,10 +81,10 @@ object OrcUtils extends Logging { } } - def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]) + def readSchema(sparkSession: SparkSession, files: Seq[FileStatus], options: Map[String, String]) : Option[StructType] = { val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles - val conf = sparkSession.sessionState.newHadoopConf() + val conf = sparkSession.sessionState.newHadoopConfWithOptions(options) files.toIterator.map(file => readSchema(file.getPath, conf, ignoreCorruptFiles)).collectFirst { case Some(schema) => logDebug(s"Reading schema from file $files, got Hive schema string: $schema") @@ -92,6 +92,20 @@ object OrcUtils extends Logging { } } + def readCatalystSchema( + file: Path, + conf: Configuration, + ignoreCorruptFiles: Boolean): Option[StructType] = { + readSchema(file, conf, ignoreCorruptFiles) match { + case Some(schema) => + Some(CatalystSqlParser.parseDataType(schema.toString).asInstanceOf[StructType]) + + case None => + // Field names is empty or `FileFormatException` was thrown but ignoreCorruptFiles is true. + None + } + } + /** * Reads ORC file schemas in multi-threaded manner, using native version of ORC. * This is visible for testing. @@ -109,22 +123,24 @@ object OrcUtils extends Logging { val orcOptions = new OrcOptions(options, sparkSession.sessionState.conf) if (orcOptions.mergeSchema) { SchemaMergeUtils.mergeSchemasInParallel( - sparkSession, files, OrcUtils.readOrcSchemasInParallel) + sparkSession, options, files, OrcUtils.readOrcSchemasInParallel) } else { - OrcUtils.readSchema(sparkSession, files) + OrcUtils.readSchema(sparkSession, files, options) } } /** - * Returns the requested column ids from the given ORC file. Column id can be -1, which means the - * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + * @return Returns the combination of requested column ids from the given ORC file and + * boolean flag to find if the pruneCols is allowed or not. Requested Column id can be + * -1, which means the requested column doesn't exist in the ORC file. Returns None + * if the given ORC file is empty. 
*/ def requestedColumnIds( isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, reader: Reader, - conf: Configuration): Option[Array[Int]] = { + conf: Configuration): Option[(Array[Int], Boolean)] = { val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. @@ -136,6 +152,10 @@ object OrcUtils extends Logging { assert(orcFieldNames.length <= dataSchema.length, "The given data schema " + s"${dataSchema.catalogString} has less fields than the actual ORC physical schema, " + "no idea which columns were dropped, fail to read.") + // for ORC file written by Hive, no field names + // in the physical schema, there is a need to send the + // entire dataSchema instead of required schema. + // So pruneCols is not done in this case Some(requiredSchema.fieldNames.map { name => val index = dataSchema.fieldIndex(name) if (index < orcFieldNames.length) { @@ -143,7 +163,7 @@ object OrcUtils extends Logging { } else { -1 } - }) + }, false) } else { if (isCaseSensitive) { Some(requiredSchema.fieldNames.zipWithIndex.map { case (name, idx) => @@ -152,7 +172,7 @@ object OrcUtils extends Logging { } else { -1 } - }) + }, true) } else { // Do case-insensitive resolution only if in case-insensitive mode val caseInsensitiveOrcFieldMap = orcFieldNames.groupBy(_.toLowerCase(Locale.ROOT)) @@ -170,7 +190,7 @@ object OrcUtils extends Logging { idx } }.getOrElse(-1) - }) + }, true) } } } @@ -199,4 +219,25 @@ object OrcUtils extends Logging { s"map<${orcTypeDescriptionString(m.keyType)},${orcTypeDescriptionString(m.valueType)}>" case _ => dt.catalogString } + + /** + * @return Returns the result schema string based on the canPruneCols flag. + * resultSchemaString will be created using resultsSchema in case of + * canPruneCols is true and for canPruneCols as false value + * resultSchemaString will be created using the actual dataSchema. 
+ */ + def orcResultSchemaString( + canPruneCols: Boolean, + dataSchema: StructType, + resultSchema: StructType, + partitionSchema: StructType, + conf: Configuration): String = { + val resultSchemaString = if (canPruneCols) { + OrcUtils.orcTypeDescriptionString(resultSchema) + } else { + OrcUtils.orcTypeDescriptionString(StructType(dataSchema.fields ++ partitionSchema.fields)) + } + OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) + resultSchemaString + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index f52aaf0140e1d..68f49f9442579 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -295,11 +295,15 @@ class ParquetFileFormat val convertTz = if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getTimeZone(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) } else { None } + val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( + footerFileMetaData.getKeyValueMetaData.get, + SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ)) + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(broadcastedHadoopConf.value.value, attemptId) @@ -312,7 +316,10 @@ class ParquetFileFormat val taskContext = Option(TaskContext.get()) if (enableVectorizedReader) { val vectorizedReader = new VectorizedParquetRecordReader( - convertTz.orNull, enableOffHeapColumnVector && taskContext.isDefined, capacity) + convertTz.orNull, + datetimeRebaseMode.toString, + enableOffHeapColumnVector && taskContext.isDefined, + capacity) val iter = new RecordReaderIterator(vectorizedReader) // SPARK-23457 Register a task completion listener before `initialization`. taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) @@ -328,7 +335,8 @@ class ParquetFileFormat } else { logDebug(s"Falling back to parquet-mr") // ParquetRecordReader returns InternalRow - val readSupport = new ParquetReadSupport(convertTz, enableVectorizedReader = false) + val readSupport = new ParquetReadSupport( + convertTz, enableVectorizedReader = false, datetimeRebaseMode) val reader = if (pushed.isDefined && enableRecordFilter) { val parquetFilter = FilterCompat.get(pushed.get, null) new ParquetRecordReader[InternalRow](readSupport, parquetFilter) @@ -467,6 +475,7 @@ object ParquetFileFormat extends Logging { * S3 nodes). 
*/ def mergeSchemasInParallel( + parameters: Map[String, String], filesToTouch: Seq[FileStatus], sparkSession: SparkSession): Option[StructType] = { val assumeBinaryIsString = sparkSession.sessionState.conf.isParquetBinaryAsString @@ -482,7 +491,7 @@ object ParquetFileFormat extends Logging { .map(ParquetFileFormat.readSchemaFromFooter(_, converter)) } - SchemaMergeUtils.mergeSchemasInParallel(sparkSession, filesToTouch, reader) + SchemaMergeUtils.mergeSchemasInParallel(sparkSession, parameters, filesToTouch, reader) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index b9b86adb438e6..13dee484bd98a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -20,14 +20,15 @@ package org.apache.spark.sql.execution.datasources.parquet import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Long => JLong} import java.math.{BigDecimal => JBigDecimal} import java.sql.{Date, Timestamp} +import java.time.{Instant, LocalDate} import java.util.Locale import scala.collection.JavaConverters.asScalaBufferConverter import org.apache.parquet.filter2.predicate._ -import org.apache.parquet.filter2.predicate.FilterApi._ +import org.apache.parquet.filter2.predicate.SparkFilterApi._ import org.apache.parquet.io.api.Binary -import org.apache.parquet.schema.{DecimalMetadata, MessageType, OriginalType, PrimitiveComparator} +import org.apache.parquet.schema.{DecimalMetadata, GroupType, MessageType, OriginalType, PrimitiveComparator, PrimitiveType, Type} import org.apache.parquet.schema.OriginalType._ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ @@ -49,15 +50,35 @@ class ParquetFilters( pushDownInFilterThreshold: Int, caseSensitive: Boolean) { // A map which contains parquet field name and data type, if predicate push down applies. - private val nameToParquetField : Map[String, ParquetField] = { - // Here we don't flatten the fields in the nested schema but just look up through - // root fields. Currently, accessing to nested fields does not push down filters - // and it does not support to create filters for them. - val primitiveFields = - schema.getFields.asScala.filter(_.isPrimitive).map(_.asPrimitiveType()).map { f => - f.getName -> ParquetField(f.getName, - ParquetSchemaType(f.getOriginalType, - f.getPrimitiveTypeName, f.getTypeLength, f.getDecimalMetadata)) + // + // Each key in `nameToParquetField` represents a column; `dots` are used as separators for + // nested columns. If any part of the names contains `dots`, it is quoted to avoid confusion. + // See `org.apache.spark.sql.connector.catalog.quote` for implementation details. + private val nameToParquetField : Map[String, ParquetPrimitiveField] = { + // Recursively traverse the parquet schema to get primitive fields that can be pushed-down. + // `parentFieldNames` is used to keep track of the current nested level when traversing. 
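// Standalone sketch of the traversal described in the comment above, over a toy schema
// model (Leaf/Group are illustrative stand-ins, not Parquet's Type/GroupType): every
// primitive leaf is collected as a dotted multi-part name, and any single part that
// itself contains a dot is backtick-quoted so it cannot be confused with a nested path.
object NestedLeafPathsSketch {
  sealed trait Node
  final case class Leaf(name: String) extends Node
  final case class Group(name: String, children: Seq[Node]) extends Node

  private def quote(part: String): String =
    if (part.contains(".")) s"`${part.replace("`", "``")}`" else part

  def leafPaths(nodes: Seq[Node], parents: Seq[String] = Nil): Seq[String] =
    nodes.flatMap {
      case Leaf(name)            => Seq((parents :+ name).map(quote).mkString("."))
      case Group(name, children) => leafPaths(children, parents :+ name)
    }
}
// e.g. leafPaths(Seq(Group("a", Seq(Leaf("b"))), Leaf("c.d"))) == Seq("a.b", "`c.d`")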
+ def getPrimitiveFields( + fields: Seq[Type], + parentFieldNames: Array[String] = Array.empty): Seq[ParquetPrimitiveField] = { + fields.flatMap { + case p: PrimitiveType => + Some(ParquetPrimitiveField(fieldNames = parentFieldNames :+ p.getName, + fieldType = ParquetSchemaType(p.getOriginalType, + p.getPrimitiveTypeName, p.getTypeLength, p.getDecimalMetadata))) + // Note that when g is a `Struct`, `g.getOriginalType` is `null`. + // When g is a `Map`, `g.getOriginalType` is `MAP`. + // When g is a `List`, `g.getOriginalType` is `LIST`. + case g: GroupType if g.getOriginalType == null => + getPrimitiveFields(g.getFields.asScala, parentFieldNames :+ g.getName) + // Parquet only supports push-down for primitive types; as a result, Map and List types + // are removed. + case _ => None + } + } + + val primitiveFields = getPrimitiveFields(schema.getFields.asScala).map { field => + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper + (field.fieldNames.toSeq.quoted, field) } if (caseSensitive) { primitiveFields.toMap @@ -75,13 +96,13 @@ class ParquetFilters( } /** - * Holds a single field information stored in the underlying parquet file. + * Holds a single primitive field information stored in the underlying parquet file. * - * @param fieldName field name in parquet file + * @param fieldNames a field name as an array of string multi-identifier in parquet file * @param fieldType field type related info in parquet file */ - private case class ParquetField( - fieldName: String, + private case class ParquetPrimitiveField( + fieldNames: Array[String], fieldType: ParquetSchemaType) private case class ParquetSchemaType( @@ -103,8 +124,14 @@ class ParquetFilters( private val ParquetTimestampMicrosType = ParquetSchemaType(TIMESTAMP_MICROS, INT64, 0, null) private val ParquetTimestampMillisType = ParquetSchemaType(TIMESTAMP_MILLIS, INT64, 0, null) - private def dateToDays(date: Date): SQLDate = { - DateTimeUtils.fromJavaDate(date) + private def dateToDays(date: Any): SQLDate = date match { + case d: Date => DateTimeUtils.fromJavaDate(d) + case ld: LocalDate => DateTimeUtils.localDateToDays(ld) + } + + private def timestampToMicros(v: Any): JLong = v match { + case i: Instant => DateTimeUtils.instantToMicros(i) + case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) } private def decimalToInt32(decimal: JBigDecimal): Integer = decimal.unscaledValue().intValue() @@ -126,264 +153,258 @@ class ParquetFilters( Binary.fromConstantByteArray(fixedLengthBytes, 0, numBytes) } - private val makeEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + private def timestampToMillis(v: Any): JLong = { + val micros = timestampToMicros(v) + val millis = DateTimeUtils.toMillis(micros) + millis.asInstanceOf[JLong] + } + + private val makeEq: + PartialFunction[ParquetSchemaType, (Array[String], Any) => FilterPredicate] = { case ParquetBooleanType => - (n: String, v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[JBoolean]) + (n: Array[String], v: Any) => FilterApi.eq(booleanColumn(n), v.asInstanceOf[JBoolean]) case ParquetByteType | ParquetShortType | ParquetIntegerType => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( intColumn(n), Option(v).map(_.asInstanceOf[Number].intValue.asInstanceOf[Integer]).orNull) case ParquetLongType => - (n: String, v: Any) => FilterApi.eq(longColumn(n), v.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.eq(longColumn(n), v.asInstanceOf[JLong]) case ParquetFloatType => - 
(n: String, v: Any) => FilterApi.eq(floatColumn(n), v.asInstanceOf[JFloat]) + (n: Array[String], v: Any) => FilterApi.eq(floatColumn(n), v.asInstanceOf[JFloat]) case ParquetDoubleType => - (n: String, v: Any) => FilterApi.eq(doubleColumn(n), v.asInstanceOf[JDouble]) + (n: Array[String], v: Any) => FilterApi.eq(doubleColumn(n), v.asInstanceOf[JDouble]) // Binary.fromString and Binary.fromByteArray don't accept null values case ParquetStringType => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(s => Binary.fromString(s.asInstanceOf[String])).orNull) case ParquetBinaryType => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(b => Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])).orNull) case ParquetDateType if pushDownDate => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( intColumn(n), - Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) + Option(v).map(date => dateToDays(date).asInstanceOf[Integer]).orNull) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( longColumn(n), - Option(v).map(t => DateTimeUtils.fromJavaTimestamp(t.asInstanceOf[Timestamp]) - .asInstanceOf[JLong]).orNull) + Option(v).map(timestampToMicros).orNull) case ParquetTimestampMillisType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( longColumn(n), - Option(v).map(_.asInstanceOf[Timestamp].getTime.asInstanceOf[JLong]).orNull) + Option(v).map(timestampToMillis).orNull) case ParquetSchemaType(DECIMAL, INT32, _, _) if pushDownDecimal => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( intColumn(n), Option(v).map(d => decimalToInt32(d.asInstanceOf[JBigDecimal])).orNull) case ParquetSchemaType(DECIMAL, INT64, _, _) if pushDownDecimal => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( longColumn(n), Option(v).map(d => decimalToInt64(d.asInstanceOf[JBigDecimal])).orNull) case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, length, _) if pushDownDecimal => - (n: String, v: Any) => FilterApi.eq( + (n: Array[String], v: Any) => FilterApi.eq( binaryColumn(n), Option(v).map(d => decimalToByteArray(d.asInstanceOf[JBigDecimal], length)).orNull) } - private val makeNotEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + private val makeNotEq: + PartialFunction[ParquetSchemaType, (Array[String], Any) => FilterPredicate] = { case ParquetBooleanType => - (n: String, v: Any) => FilterApi.notEq(booleanColumn(n), v.asInstanceOf[JBoolean]) + (n: Array[String], v: Any) => FilterApi.notEq(booleanColumn(n), v.asInstanceOf[JBoolean]) case ParquetByteType | ParquetShortType | ParquetIntegerType => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( intColumn(n), Option(v).map(_.asInstanceOf[Number].intValue.asInstanceOf[Integer]).orNull) case ParquetLongType => - (n: String, v: Any) => FilterApi.notEq(longColumn(n), v.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.notEq(longColumn(n), v.asInstanceOf[JLong]) case ParquetFloatType => - (n: String, v: Any) => FilterApi.notEq(floatColumn(n), v.asInstanceOf[JFloat]) + (n: Array[String], v: Any) => FilterApi.notEq(floatColumn(n), v.asInstanceOf[JFloat]) case ParquetDoubleType => - (n: 
String, v: Any) => FilterApi.notEq(doubleColumn(n), v.asInstanceOf[JDouble]) + (n: Array[String], v: Any) => FilterApi.notEq(doubleColumn(n), v.asInstanceOf[JDouble]) case ParquetStringType => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( binaryColumn(n), Option(v).map(s => Binary.fromString(s.asInstanceOf[String])).orNull) case ParquetBinaryType => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( binaryColumn(n), Option(v).map(b => Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])).orNull) case ParquetDateType if pushDownDate => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( intColumn(n), - Option(v).map(date => dateToDays(date.asInstanceOf[Date]).asInstanceOf[Integer]).orNull) + Option(v).map(date => dateToDays(date).asInstanceOf[Integer]).orNull) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( longColumn(n), - Option(v).map(t => DateTimeUtils.fromJavaTimestamp(t.asInstanceOf[Timestamp]) - .asInstanceOf[JLong]).orNull) + Option(v).map(timestampToMicros).orNull) case ParquetTimestampMillisType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( longColumn(n), - Option(v).map(_.asInstanceOf[Timestamp].getTime.asInstanceOf[JLong]).orNull) + Option(v).map(timestampToMillis).orNull) case ParquetSchemaType(DECIMAL, INT32, _, _) if pushDownDecimal => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( intColumn(n), Option(v).map(d => decimalToInt32(d.asInstanceOf[JBigDecimal])).orNull) case ParquetSchemaType(DECIMAL, INT64, _, _) if pushDownDecimal => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( longColumn(n), Option(v).map(d => decimalToInt64(d.asInstanceOf[JBigDecimal])).orNull) case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, length, _) if pushDownDecimal => - (n: String, v: Any) => FilterApi.notEq( + (n: Array[String], v: Any) => FilterApi.notEq( binaryColumn(n), Option(v).map(d => decimalToByteArray(d.asInstanceOf[JBigDecimal], length)).orNull) } - private val makeLt: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + private val makeLt: + PartialFunction[ParquetSchemaType, (Array[String], Any) => FilterPredicate] = { case ParquetByteType | ParquetShortType | ParquetIntegerType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.lt(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => - (n: String, v: Any) => FilterApi.lt(longColumn(n), v.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.lt(longColumn(n), v.asInstanceOf[JLong]) case ParquetFloatType => - (n: String, v: Any) => FilterApi.lt(floatColumn(n), v.asInstanceOf[JFloat]) + (n: Array[String], v: Any) => FilterApi.lt(floatColumn(n), v.asInstanceOf[JFloat]) case ParquetDoubleType => - (n: String, v: Any) => FilterApi.lt(doubleColumn(n), v.asInstanceOf[JDouble]) + (n: Array[String], v: Any) => FilterApi.lt(doubleColumn(n), v.asInstanceOf[JDouble]) case ParquetStringType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.lt(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) case ParquetBinaryType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.lt(binaryColumn(n), 
Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) case ParquetDateType if pushDownDate => - (n: String, v: Any) => - FilterApi.lt(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) + (n: Array[String], v: Any) => + FilterApi.lt(intColumn(n), dateToDays(v).asInstanceOf[Integer]) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.lt( - longColumn(n), - DateTimeUtils.fromJavaTimestamp(v.asInstanceOf[Timestamp]).asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.lt(longColumn(n), timestampToMicros(v)) case ParquetTimestampMillisType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.lt( - longColumn(n), - v.asInstanceOf[Timestamp].getTime.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.lt(longColumn(n), timestampToMillis(v)) case ParquetSchemaType(DECIMAL, INT32, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.lt(intColumn(n), decimalToInt32(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, INT64, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.lt(longColumn(n), decimalToInt64(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, length, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.lt(binaryColumn(n), decimalToByteArray(v.asInstanceOf[JBigDecimal], length)) } - private val makeLtEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + private val makeLtEq: + PartialFunction[ParquetSchemaType, (Array[String], Any) => FilterPredicate] = { case ParquetByteType | ParquetShortType | ParquetIntegerType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.ltEq(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => - (n: String, v: Any) => FilterApi.ltEq(longColumn(n), v.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.ltEq(longColumn(n), v.asInstanceOf[JLong]) case ParquetFloatType => - (n: String, v: Any) => FilterApi.ltEq(floatColumn(n), v.asInstanceOf[JFloat]) + (n: Array[String], v: Any) => FilterApi.ltEq(floatColumn(n), v.asInstanceOf[JFloat]) case ParquetDoubleType => - (n: String, v: Any) => FilterApi.ltEq(doubleColumn(n), v.asInstanceOf[JDouble]) + (n: Array[String], v: Any) => FilterApi.ltEq(doubleColumn(n), v.asInstanceOf[JDouble]) case ParquetStringType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.ltEq(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) case ParquetBinaryType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.ltEq(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) case ParquetDateType if pushDownDate => - (n: String, v: Any) => - FilterApi.ltEq(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) + (n: Array[String], v: Any) => + FilterApi.ltEq(intColumn(n), dateToDays(v).asInstanceOf[Integer]) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.ltEq( - longColumn(n), - DateTimeUtils.fromJavaTimestamp(v.asInstanceOf[Timestamp]).asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.ltEq(longColumn(n), timestampToMicros(v)) case ParquetTimestampMillisType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.ltEq( - longColumn(n), - v.asInstanceOf[Timestamp].getTime.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.ltEq(longColumn(n), 
timestampToMillis(v)) case ParquetSchemaType(DECIMAL, INT32, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.ltEq(intColumn(n), decimalToInt32(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, INT64, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.ltEq(longColumn(n), decimalToInt64(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, length, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.ltEq(binaryColumn(n), decimalToByteArray(v.asInstanceOf[JBigDecimal], length)) } - private val makeGt: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + private val makeGt: + PartialFunction[ParquetSchemaType, (Array[String], Any) => FilterPredicate] = { case ParquetByteType | ParquetShortType | ParquetIntegerType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gt(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => - (n: String, v: Any) => FilterApi.gt(longColumn(n), v.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.gt(longColumn(n), v.asInstanceOf[JLong]) case ParquetFloatType => - (n: String, v: Any) => FilterApi.gt(floatColumn(n), v.asInstanceOf[JFloat]) + (n: Array[String], v: Any) => FilterApi.gt(floatColumn(n), v.asInstanceOf[JFloat]) case ParquetDoubleType => - (n: String, v: Any) => FilterApi.gt(doubleColumn(n), v.asInstanceOf[JDouble]) + (n: Array[String], v: Any) => FilterApi.gt(doubleColumn(n), v.asInstanceOf[JDouble]) case ParquetStringType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gt(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) case ParquetBinaryType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gt(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) case ParquetDateType if pushDownDate => - (n: String, v: Any) => - FilterApi.gt(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) + (n: Array[String], v: Any) => + FilterApi.gt(intColumn(n), dateToDays(v).asInstanceOf[Integer]) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.gt( - longColumn(n), - DateTimeUtils.fromJavaTimestamp(v.asInstanceOf[Timestamp]).asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.gt(longColumn(n), timestampToMicros(v)) case ParquetTimestampMillisType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.gt( - longColumn(n), - v.asInstanceOf[Timestamp].getTime.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.gt(longColumn(n), timestampToMillis(v)) case ParquetSchemaType(DECIMAL, INT32, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gt(intColumn(n), decimalToInt32(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, INT64, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gt(longColumn(n), decimalToInt64(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, length, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gt(binaryColumn(n), decimalToByteArray(v.asInstanceOf[JBigDecimal], length)) } - private val makeGtEq: PartialFunction[ParquetSchemaType, (String, Any) => FilterPredicate] = { + private val makeGtEq: + PartialFunction[ParquetSchemaType, (Array[String], Any) => FilterPredicate] = { case 
ParquetByteType | ParquetShortType | ParquetIntegerType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gtEq(intColumn(n), v.asInstanceOf[Number].intValue.asInstanceOf[Integer]) case ParquetLongType => - (n: String, v: Any) => FilterApi.gtEq(longColumn(n), v.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.gtEq(longColumn(n), v.asInstanceOf[JLong]) case ParquetFloatType => - (n: String, v: Any) => FilterApi.gtEq(floatColumn(n), v.asInstanceOf[JFloat]) + (n: Array[String], v: Any) => FilterApi.gtEq(floatColumn(n), v.asInstanceOf[JFloat]) case ParquetDoubleType => - (n: String, v: Any) => FilterApi.gtEq(doubleColumn(n), v.asInstanceOf[JDouble]) + (n: Array[String], v: Any) => FilterApi.gtEq(doubleColumn(n), v.asInstanceOf[JDouble]) case ParquetStringType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gtEq(binaryColumn(n), Binary.fromString(v.asInstanceOf[String])) case ParquetBinaryType => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gtEq(binaryColumn(n), Binary.fromReusedByteArray(v.asInstanceOf[Array[Byte]])) case ParquetDateType if pushDownDate => - (n: String, v: Any) => - FilterApi.gtEq(intColumn(n), dateToDays(v.asInstanceOf[Date]).asInstanceOf[Integer]) + (n: Array[String], v: Any) => + FilterApi.gtEq(intColumn(n), dateToDays(v).asInstanceOf[Integer]) case ParquetTimestampMicrosType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.gtEq( - longColumn(n), - DateTimeUtils.fromJavaTimestamp(v.asInstanceOf[Timestamp]).asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.gtEq(longColumn(n), timestampToMicros(v)) case ParquetTimestampMillisType if pushDownTimestamp => - (n: String, v: Any) => FilterApi.gtEq( - longColumn(n), - v.asInstanceOf[Timestamp].getTime.asInstanceOf[JLong]) + (n: Array[String], v: Any) => FilterApi.gtEq(longColumn(n), timestampToMillis(v)) case ParquetSchemaType(DECIMAL, INT32, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gtEq(intColumn(n), decimalToInt32(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, INT64, _, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gtEq(longColumn(n), decimalToInt64(v.asInstanceOf[JBigDecimal])) case ParquetSchemaType(DECIMAL, FIXED_LEN_BYTE_ARRAY, length, _) if pushDownDecimal => - (n: String, v: Any) => + (n: Array[String], v: Any) => FilterApi.gtEq(binaryColumn(n), decimalToByteArray(v.asInstanceOf[JBigDecimal], length)) } @@ -445,9 +466,10 @@ class ParquetFilters( case ParquetDoubleType => value.isInstanceOf[JDouble] case ParquetStringType => value.isInstanceOf[String] case ParquetBinaryType => value.isInstanceOf[Array[Byte]] - case ParquetDateType => value.isInstanceOf[Date] + case ParquetDateType => + value.isInstanceOf[Date] || value.isInstanceOf[LocalDate] case ParquetTimestampMicrosType | ParquetTimestampMillisType => - value.isInstanceOf[Timestamp] + value.isInstanceOf[Timestamp] || value.isInstanceOf[Instant] case ParquetSchemaType(DECIMAL, INT32, _, decimalMeta) => isDecimalMatched(value, decimalMeta) case ParquetSchemaType(DECIMAL, INT64, _, decimalMeta) => @@ -466,13 +488,8 @@ class ParquetFilters( case _ => false } - // Parquet does not allow dots in the column name because dots are used as a column path - // delimiter. Since Parquet 1.8.2 (PARQUET-389), Parquet accepts the filter predicates - // with missing columns. 
The incorrect results could be got from Parquet when we push down - // filters for the column having dots in the names. Thus, we do not push down such filters. - // See SPARK-20364. private def canMakeFilterOn(name: String, value: Any): Boolean = { - nameToParquetField.contains(name) && !name.contains(".") && valueCanMakeFilterOn(name, value) + nameToParquetField.contains(name) && valueCanMakeFilterOn(name, value) } /** @@ -503,38 +520,38 @@ class ParquetFilters( predicate match { case sources.IsNull(name) if canMakeFilterOn(name, null) => makeEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, null)) + .map(_(nameToParquetField(name).fieldNames, null)) case sources.IsNotNull(name) if canMakeFilterOn(name, null) => makeNotEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, null)) + .map(_(nameToParquetField(name).fieldNames, null)) case sources.EqualTo(name, value) if canMakeFilterOn(name, value) => makeEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.Not(sources.EqualTo(name, value)) if canMakeFilterOn(name, value) => makeNotEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.EqualNullSafe(name, value) if canMakeFilterOn(name, value) => makeEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.Not(sources.EqualNullSafe(name, value)) if canMakeFilterOn(name, value) => makeNotEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.LessThan(name, value) if canMakeFilterOn(name, value) => makeLt.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.LessThanOrEqual(name, value) if canMakeFilterOn(name, value) => makeLtEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.GreaterThan(name, value) if canMakeFilterOn(name, value) => makeGt.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.GreaterThanOrEqual(name, value) if canMakeFilterOn(name, value) => makeGtEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, value)) + .map(_(nameToParquetField(name).fieldNames, value)) case sources.And(lhs, rhs) => // At here, it is not safe to just convert one side and remove the other side @@ -585,13 +602,13 @@ class ParquetFilters( && values.distinct.length <= pushDownInFilterThreshold => values.distinct.flatMap { v => makeEq.lift(nameToParquetField(name).fieldType) - .map(_(nameToParquetField(name).fieldName, v)) + .map(_(nameToParquetField(name).fieldNames, v)) }.reduceLeftOption(FilterApi.or) case sources.StringStartsWith(name, prefix) if pushDownStartWith && canMakeFilterOn(name, prefix) => Option(prefix).map { v => - FilterApi.userDefined(binaryColumn(name), + FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames), new UserDefinedPredicate[Binary] with Serializable { private val strToBinary = 
Binary.fromReusedByteArray(v.getBytes) private val size = strToBinary.length diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 69c8bad5f1c83..a30d1c26b3b2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.util.{Locale, Map => JMap, TimeZone} +import java.time.ZoneId +import java.util.{Locale, Map => JMap} import scala.collection.JavaConverters._ @@ -31,6 +32,7 @@ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types._ /** @@ -49,16 +51,18 @@ import org.apache.spark.sql.types._ * Due to this reason, we no longer rely on [[ReadContext]] to pass requested schema from [[init()]] * to [[prepareForRead()]], but use a private `var` for simplicity. */ -class ParquetReadSupport(val convertTz: Option[TimeZone], - enableVectorizedReader: Boolean) +class ParquetReadSupport( + val convertTz: Option[ZoneId], + enableVectorizedReader: Boolean, + datetimeRebaseMode: LegacyBehaviorPolicy.Value) extends ReadSupport[InternalRow] with Logging { private var catalystRequestedSchema: StructType = _ def this() { // We need a zero-arg constructor for SpecificParquetRecordReaderBase. But that is only - // used in the vectorized reader, where we get the convertTz value directly, and the value here - // is ignored. - this(None, enableVectorizedReader = true) + // used in the vectorized reader, where we get the convertTz/rebaseDateTime value directly, + // and the values here are ignored. 
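To see the effect of the ParquetFilters changes above from the public API, a small end-to-end example helps. The path, column names and plan output below are illustrative only; whether a given predicate actually gets pushed still depends on the data source and session configuration.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.col

    object NestedPushdownDemo {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("nested-pushdown").getOrCreate()
        import spark.implicits._

        // Write a tiny dataset with a struct column: addr is struct<_1:string,_2:int>.
        Seq((1, ("us", 94000)), (2, ("ca", 10000)))
          .toDF("id", "addr")
          .write.mode("overwrite").parquet("/tmp/nested_pushdown_demo")

        // With push-down on nested fields, the scan node of this plan can report the
        // predicate under PushedFilters instead of evaluating it only after the rows
        // have been materialized.
        spark.read.parquet("/tmp/nested_pushdown_demo")
          .filter(col("addr._2") > 50000)
          .explain()

        spark.stop()
      }
    }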
+ this(None, enableVectorizedReader = true, datetimeRebaseMode = LegacyBehaviorPolicy.CORRECTED) } /** @@ -126,7 +130,8 @@ class ParquetReadSupport(val convertTz: Option[TimeZone], parquetRequestedSchema, ParquetReadSupport.expandUDT(catalystRequestedSchema), new ParquetToSparkSchemaConverter(conf), - convertTz) + convertTz, + datetimeRebaseMode) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala index 3098a332d3027..bb528d548b6ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala @@ -17,12 +17,13 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.util.TimeZone +import java.time.ZoneId import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer} import org.apache.parquet.schema.MessageType import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types.StructType /** @@ -31,16 +32,20 @@ import org.apache.spark.sql.types.StructType * @param parquetSchema Parquet schema of the records to be read * @param catalystSchema Catalyst schema of the rows to be constructed * @param schemaConverter A Parquet-Catalyst schema converter that helps initializing row converters + * @param convertTz the optional time zone to convert to int96 data + * @param datetimeRebaseMode the mode of rebasing date/timestamp from Julian to Proleptic Gregorian + * calendar */ private[parquet] class ParquetRecordMaterializer( parquetSchema: MessageType, catalystSchema: StructType, schemaConverter: ParquetToSparkSchemaConverter, - convertTz: Option[TimeZone]) + convertTz: Option[ZoneId], + datetimeRebaseMode: LegacyBehaviorPolicy.Value) extends RecordMaterializer[InternalRow] { - private val rootConverter = - new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, convertTz, NoopUpdater) + private val rootConverter = new ParquetRowConverter( + schemaConverter, parquetSchema, catalystSchema, convertTz, datetimeRebaseMode, NoopUpdater) override def getCurrentRecord: InternalRow = rootConverter.currentRecord diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 98ac2ecd2955c..9d37f17a24fb7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.math.{BigDecimal, BigInteger} import java.nio.ByteOrder -import java.util.TimeZone +import java.time.{ZoneId, ZoneOffset} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -33,8 +33,11 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, DOUBLE import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} +import 
org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CaseInsensitiveMap, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLTimestamp +import org.apache.spark.sql.execution.datasources.DataSourceUtils +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -118,14 +121,17 @@ private[parquet] class ParquetPrimitiveConverter(val updater: ParentContainerUpd * @param parquetType Parquet schema of Parquet records * @param catalystType Spark SQL schema that corresponds to the Parquet record type. User-defined * types should have been expanded. - * @param convertTz the optional time zone to convert to for int96 data + * @param convertTz the optional time zone to convert to int96 data + * @param datetimeRebaseMode the mode of rebasing date/timestamp from Julian to Proleptic Gregorian + * calendar * @param updater An updater which propagates converted field values to the parent container */ private[parquet] class ParquetRowConverter( schemaConverter: ParquetToSparkSchemaConverter, parquetType: GroupType, catalystType: StructType, - convertTz: Option[TimeZone], + convertTz: Option[ZoneId], + datetimeRebaseMode: LegacyBehaviorPolicy.Value, updater: ParentContainerUpdater) extends ParquetGroupConverter(updater) with Logging { @@ -154,8 +160,6 @@ private[parquet] class ParquetRowConverter( |${catalystType.prettyJson} """.stripMargin) - private[this] val UTC = DateTimeUtils.TimeZoneUTC - /** * Updater used together with field converters within a [[ParquetRowConverter]]. It propagates * converted filed values to the `ordinal`-th cell in `currentRow`. @@ -178,10 +182,23 @@ private[parquet] class ParquetRowConverter( */ def currentRecord: InternalRow = currentRow + private val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead( + datetimeRebaseMode, "Parquet") + + private val timestampRebaseFunc = DataSourceUtils.creteTimestampRebaseFuncInRead( + datetimeRebaseMode, "Parquet") + // Converters for each field. 
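The dateRebaseFunc/timestampRebaseFunc wiring above boils down to picking one of three behaviours from the configured LegacyBehaviorPolicy. A simplified, self-contained model is sketched below; the rebase stub and the error message are placeholders for Spark's real rebasing logic, which only kicks in for values before the 1582-10-15 Gregorian cutover.

    object RebaseSketch {
      object RebaseMode extends Enumeration { val EXCEPTION, LEGACY, CORRECTED = Value }

      // Placeholder for the real Julian -> Proleptic Gregorian rebasing of a day count.
      def rebaseJulianToGregorianDays(days: Int): Int = days

      // Loosely mirrors the shape of DataSourceUtils.creteDateRebaseFuncInRead above: the
      // mode chosen at read time decides whether stored values are passed through,
      // rebased, or rejected.
      def dateRebaseFuncInRead(mode: RebaseMode.Value): Int => Int = mode match {
        case RebaseMode.CORRECTED => (days: Int) => days                          // already Proleptic Gregorian
        case RebaseMode.LEGACY    => (days: Int) => rebaseJulianToGregorianDays(days)
        case RebaseMode.EXCEPTION => (days: Int) =>
          throw new IllegalStateException(s"Date value $days may need rebasing; failing fast")
      }
    }

At the session level these modes are carried into the readers and writers by SQLConf entries such as the LEGACY_PARQUET_REBASE_MODE_IN_WRITE key referenced further down in this patch.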
private[this] val fieldConverters: Array[Converter with HasParentContainerUpdater] = { + // (SPARK-31116) Use case insensitive map if spark.sql.caseSensitive is false + // to prevent throwing IllegalArgumentException when searching catalyst type's field index + val catalystFieldNameToIndex = if (SQLConf.get.caseSensitiveAnalysis) { + catalystType.fieldNames.zipWithIndex.toMap + } else { + CaseInsensitiveMap(catalystType.fieldNames.zipWithIndex.toMap) + } parquetType.getFields.asScala.map { parquetField => - val fieldIndex = catalystType.fieldIndex(parquetField.getName) + val fieldIndex = catalystFieldNameToIndex(parquetField.getName) val catalystField = catalystType(fieldIndex) // Converted field value should be set to the `fieldIndex`-th cell of `currentRow` newConverter(parquetField, catalystField.dataType, new RowUpdater(currentRow, fieldIndex)) @@ -267,14 +284,15 @@ private[parquet] class ParquetRowConverter( case TimestampType if parquetType.getOriginalType == OriginalType.TIMESTAMP_MICROS => new ParquetPrimitiveConverter(updater) { override def addLong(value: Long): Unit = { - updater.setLong(value) + updater.setLong(timestampRebaseFunc(value)) } } case TimestampType if parquetType.getOriginalType == OriginalType.TIMESTAMP_MILLIS => new ParquetPrimitiveConverter(updater) { override def addLong(value: Long): Unit = { - updater.setLong(DateTimeUtils.fromMillis(value)) + val micros = DateTimeUtils.fromMillis(value) + updater.setLong(timestampRebaseFunc(micros)) } } @@ -292,7 +310,8 @@ private[parquet] class ParquetRowConverter( val timeOfDayNanos = buf.getLong val julianDay = buf.getInt val rawTime = DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos) - val adjTime = convertTz.map(DateTimeUtils.convertTz(rawTime, _, UTC)).getOrElse(rawTime) + val adjTime = convertTz.map(DateTimeUtils.convertTz(rawTime, _, ZoneOffset.UTC)) + .getOrElse(rawTime) updater.setLong(adjTime) } } @@ -300,8 +319,7 @@ private[parquet] class ParquetRowConverter( case DateType => new ParquetPrimitiveConverter(updater) { override def addInt(value: Int): Unit = { - // DateType is not specialized in `SpecificMutableRow`, have to box it here. 
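The SPARK-31116 fix in the field-converter initialization above can be pictured without any Spark classes. This toy version only shows why a plain case-sensitive lookup breaks when Parquet and Catalyst disagree on letter case.

    import java.util.Locale

    val catalystFieldNames = Seq("id", "Name")
    val caseSensitive = false   // corresponds to spark.sql.caseSensitive

    // Build the lookup once; when the analysis is case-insensitive, normalize the keys so
    // a Parquet field named "ID" still resolves to the Catalyst field "id".
    val fieldNameToIndex: Map[String, Int] =
      if (caseSensitive) catalystFieldNames.zipWithIndex.toMap
      else catalystFieldNames.zipWithIndex.map { case (n, i) => n.toLowerCase(Locale.ROOT) -> i }.toMap

    def indexOf(parquetFieldName: String): Int =
      if (caseSensitive) fieldNameToIndex(parquetFieldName)
      else fieldNameToIndex(parquetFieldName.toLowerCase(Locale.ROOT))

    // indexOf("ID") == 0, whereas a case-sensitive map lookup would simply fail.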
- updater.set(value.asInstanceOf[DateType#InternalType]) + updater.set(dateRebaseFunc(value)) } } @@ -349,7 +367,12 @@ private[parquet] class ParquetRowConverter( } } new ParquetRowConverter( - schemaConverter, parquetType.asGroupType(), t, convertTz, wrappedUpdater) + schemaConverter, + parquetType.asGroupType(), + t, + convertTz, + datetimeRebaseMode, + wrappedUpdater) case t => throw new RuntimeException( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala index 7e7dba92f37b5..b91d75c55c513 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala @@ -104,7 +104,7 @@ object ParquetUtils { .orElse(filesByType.data.headOption) .toSeq } - ParquetFileFormat.mergeSchemasInParallel(filesToTouch, sparkSession) + ParquetFileFormat.mergeSchemasInParallel(parameters, filesToTouch, sparkSession) } case class FileTypes( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala index f6490614ab05b..4e535c0c5ea99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala @@ -31,11 +31,13 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.spark.SPARK_VERSION_SHORT import org.apache.spark.internal.Logging -import org.apache.spark.sql.SPARK_VERSION_METADATA_KEY +import org.apache.spark.sql.{SPARK_LEGACY_DATETIME, SPARK_VERSION_METADATA_KEY} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecializedGetters import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.DataSourceUtils import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types._ /** @@ -77,6 +79,15 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { private val decimalBuffer = new Array[Byte](Decimal.minBytesForPrecision(DecimalType.MAX_PRECISION)) + private val datetimeRebaseMode = LegacyBehaviorPolicy.withName( + SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE)) + + private val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInWrite( + datetimeRebaseMode, "Parquet") + + private val timestampRebaseFunc = DataSourceUtils.creteTimestampRebaseFuncInWrite( + datetimeRebaseMode, "Parquet") + override def init(configuration: Configuration): WriteContext = { val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA) this.schema = StructType.fromString(schemaString) @@ -98,7 +109,13 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { val metadata = Map( SPARK_VERSION_METADATA_KEY -> SPARK_VERSION_SHORT, ParquetReadSupport.SPARK_METADATA_KEY -> schemaString - ).asJava + ) ++ { + if (datetimeRebaseMode == LegacyBehaviorPolicy.LEGACY) { + Some(SPARK_LEGACY_DATETIME -> "") + } else { + None + } + } logInfo( s"""Initialized Parquet WriteSupport with Catalyst schema: @@ -107,7 +124,7 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with 
Logging { |$messageType """.stripMargin) - new WriteContext(messageType, metadata) + new WriteContext(messageType, metadata.asJava) } override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { @@ -147,7 +164,11 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getShort(ordinal)) - case IntegerType | DateType => + case DateType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addInteger(dateRebaseFunc(row.getInt(ordinal))) + + case IntegerType => (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getInt(ordinal)) @@ -179,11 +200,13 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { case SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS => (row: SpecializedGetters, ordinal: Int) => - recordConsumer.addLong(row.getLong(ordinal)) + val micros = row.getLong(ordinal) + recordConsumer.addLong(timestampRebaseFunc(micros)) case SQLConf.ParquetOutputTimestampType.TIMESTAMP_MILLIS => (row: SpecializedGetters, ordinal: Int) => - val millis = DateTimeUtils.toMillis(row.getLong(ordinal)) + val micros = row.getLong(ordinal) + val millis = DateTimeUtils.toMillis(timestampRebaseFunc(micros)) recordConsumer.addLong(millis) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 95343e2872def..888678d6ff36d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -240,7 +240,7 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi c.copy(tableDesc = normalizedTable.copy(schema = reorderedSchema)) } - case create: V2CreateTablePlan => + case create: V2CreateTablePlan if create.childrenResolved => val schema = create.tableSchema val partitioning = create.partitioning val identifier = create.tableName diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala index e4e7887017a1d..c199df676ced3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala @@ -40,7 +40,7 @@ case class BatchScanExec( override def hashCode(): Int = batch.hashCode() - override lazy val partitions: Seq[InputPartition] = batch.planInputPartitions() + @transient override lazy val partitions: Seq[InputPartition] = batch.planInputPartitions() override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 8f4e2d256c714..877aea178c552 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -130,7 +130,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat } case RefreshTable(catalog, ident) => - RefreshTableExec(catalog, ident) :: Nil + RefreshTableExec(session, catalog, ident) :: Nil case 
ReplaceTable(catalog, ident, schema, parts, props, orCreate) => val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) @@ -180,8 +180,9 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case OverwriteByExpression(r: DataSourceV2Relation, deleteExpr, query, writeOptions, _) => // fail if any filter cannot be converted. correctness depends on removing all matching data. val filters = splitConjunctivePredicates(deleteExpr).map { - filter => DataSourceStrategy.translateFilter(deleteExpr).getOrElse( - throw new AnalysisException(s"Cannot translate expression to source filter: $filter")) + filter => DataSourceStrategy.translateFilter(deleteExpr, + supportNestedPredicatePushdown = true).getOrElse( + throw new AnalysisException(s"Cannot translate expression to source filter: $filter")) }.toArray r.table.asWritable match { case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => @@ -205,7 +206,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat // correctness depends on removing all matching data. val filters = DataSourceStrategy.normalizeExprs(condition.toSeq, output) .flatMap(splitConjunctivePredicates(_).map { - f => DataSourceStrategy.translateFilter(f).getOrElse( + f => DataSourceStrategy.translateFilter(f, true).getOrElse( throw new AnalysisException(s"Exec update failed:" + s" cannot translate expression to source filter: $f")) }).toArray diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala index 64b98fb83b8fa..b4a14c6face31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala @@ -34,7 +34,9 @@ case class DescribeNamespaceExec( catalog: SupportsNamespaces, namespace: Seq[String], isExtended: Boolean) extends V2CommandExec { - private val encoder = RowEncoder(StructType.fromAttributes(output)).resolveAndBind() + private val toRow = { + RowEncoder(StructType.fromAttributes(output)).resolveAndBind().createSerializer() + } override protected def run(): Seq[InternalRow] = { val rows = new ArrayBuffer[InternalRow]() @@ -57,6 +59,6 @@ case class DescribeNamespaceExec( } private def toCatalystRow(strs: String*): InternalRow = { - encoder.toRow(new GenericRowWithSchema(strs.toArray, schema)).copy() + toRow(new GenericRowWithSchema(strs.toArray, schema)).copy() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala index 9c280206c548e..bc6bb175f979e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala @@ -31,7 +31,9 @@ case class DescribeTableExec( table: Table, isExtended: Boolean) extends V2CommandExec { - private val encoder = RowEncoder(StructType.fromAttributes(output)).resolveAndBind() + private val toRow = { + RowEncoder(StructType.fromAttributes(output)).resolveAndBind().createSerializer() + } override protected def run(): Seq[InternalRow] = { val rows = new ArrayBuffer[InternalRow]() @@ -85,6 +87,6 @@ case class DescribeTableExec( private def emptyRow(): InternalRow = toCatalystRow("", "", "") private 
def toCatalystRow(strs: String*): InternalRow = { - encoder.toRow(new GenericRowWithSchema(strs.toArray, schema)).copy() + toRow(new GenericRowWithSchema(strs.toArray, schema)).copy() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala index 30a964d7e643f..fa871fe866fcd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala @@ -18,7 +18,10 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util +import scala.collection.JavaConverters._ + import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession @@ -53,14 +56,23 @@ trait FileDataSourceV2 extends TableProvider with DataSourceRegister { paths ++ Option(map.get("path")).toSeq } - protected def getTableName(paths: Seq[String]): String = { - val name = shortName() + " " + paths.map(qualifiedPathName).mkString(",") + protected def getOptionsWithoutPaths(map: CaseInsensitiveStringMap): CaseInsensitiveStringMap = { + val withoutPath = map.asCaseSensitiveMap().asScala.filterKeys { k => + !k.equalsIgnoreCase("path") && !k.equalsIgnoreCase("paths") + } + new CaseInsensitiveStringMap(withoutPath.asJava) + } + + protected def getTableName(map: CaseInsensitiveStringMap, paths: Seq[String]): String = { + val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions( + map.asCaseSensitiveMap().asScala.toMap) + val name = shortName() + " " + paths.map(qualifiedPathName(_, hadoopConf)).mkString(",") Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, name) } - private def qualifiedPathName(path: String): String = { + private def qualifiedPathName(path: String, hadoopConf: Configuration): String = { val hdfsPath = new Path(path) - val fs = hdfsPath.getFileSystem(sparkSession.sessionState.newHadoopConf()) + val fs = hdfsPath.getFileSystem(hadoopConf) hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala index 59dc3ae56bf25..1658a250462b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala @@ -53,7 +53,7 @@ abstract class FileTable( } else { // This is a non-streaming file based datasource. val rootPathsSpecified = DataSource.checkAndGlobPathIfNecessary(paths, hadoopConf, - checkEmptyGlobPath = true, checkFilesExist = true) + checkEmptyGlobPath = true, checkFilesExist = true, enableGlobbing = globPaths) val fileStatusCache = FileStatusCache.getOrCreate(sparkSession) new InMemoryFileIndex( sparkSession, rootPathsSpecified, caseSensitiveMap, userSpecifiedSchema, fileStatusCache) @@ -139,6 +139,14 @@ abstract class FileTable( * 2. Catalog support is required, which is still under development for data source V2. */ def fallbackFileFormat: Class[_ <: FileFormat] + + /** + * Whether or not paths should be globbed before being used to access files. 
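The getOptionsWithoutPaths change above strips the "path"/"paths" entries before the remaining options reach the file table, since paths are carried separately. Pulled out as a standalone snippet with made-up option values, the same filtering looks like this:

    import scala.collection.JavaConverters._
    import org.apache.spark.sql.util.CaseInsensitiveStringMap

    def withoutPaths(map: CaseInsensitiveStringMap): CaseInsensitiveStringMap = {
      val filtered = map.asCaseSensitiveMap().asScala.filterKeys { k =>
        !k.equalsIgnoreCase("path") && !k.equalsIgnoreCase("paths")
      }
      new CaseInsensitiveStringMap(filtered.toMap.asJava)
    }

    val opts = new CaseInsensitiveStringMap(
      Map("path" -> "/data/in", "mergeSchema" -> "true").asJava)
    // withoutPaths(opts).containsKey("path") == false; "mergeSchema" survives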
+ */ + private def globPaths: Boolean = { + val entry = options.get(DataSource.GLOB_PATHS_KEY) + Option(entry).map(_ == "true").getOrElse(true) + } } object FileTable { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala index d519832c57501..cd62ee7814bf2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala @@ -16,7 +16,6 @@ */ package org.apache.spark.sql.execution.datasources.v2 -import java.io.IOException import java.util.UUID import scala.collection.JavaConverters._ @@ -27,7 +26,7 @@ import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.spark.internal.io.FileCommitProtocol -import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} +import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.connector.write.{BatchWrite, LogicalWriteInfo, WriteBuilder} @@ -46,12 +45,6 @@ abstract class FileWriteBuilder( private val schema = info.schema() private val queryId = info.queryId() private val options = info.options() - private var mode: SaveMode = _ - - def mode(mode: SaveMode): WriteBuilder = { - this.mode = mode - this - } override def buildForBatch(): BatchWrite = { val sparkSession = SparkSession.active @@ -68,26 +61,8 @@ abstract class FileWriteBuilder( lazy val description = createWriteJobDescription(sparkSession, hadoopConf, job, paths.head, options.asScala.toMap) - val fs = path.getFileSystem(hadoopConf) - mode match { - case SaveMode.ErrorIfExists if fs.exists(path) => - val qualifiedOutputPath = path.makeQualified(fs.getUri, fs.getWorkingDirectory) - throw new AnalysisException(s"path $qualifiedOutputPath already exists.") - - case SaveMode.Ignore if fs.exists(path) => - null - - case SaveMode.Overwrite => - if (fs.exists(path) && !committer.deleteWithJob(fs, path, true)) { - throw new IOException(s"Unable to clear directory $path prior to writing to it") - } - committer.setupJob(job) - new FileBatchWrite(job, description, committer) - - case _ => - committer.setupJob(job) - new FileBatchWrite(job, description, committer) - } + committer.setupJob(job) + new FileBatchWrite(job, description, committer) } /** @@ -104,7 +79,6 @@ abstract class FileWriteBuilder( private def validateInputs(caseSensitiveAnalysis: Boolean): Unit = { assert(schema != null, "Missing input data schema") assert(queryId != null, "Missing query ID") - assert(mode != null, "Missing save mode") if (paths.length != 1) { throw new IllegalArgumentException("Expected exactly one path to be specified, but " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala index 33338b06565c9..1a6f03f54f2e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala @@ -48,7 +48,8 @@ object PushDownUtils extends PredicateHelper { for (filterExpr <- filters) { val translated = - 
DataSourceStrategy.translateFilterWithMapping(filterExpr, Some(translatedFilterToExpr)) + DataSourceStrategy.translateFilterWithMapping(filterExpr, Some(translatedFilterToExpr), + nestedPredicatePushdownEnabled = true) if (translated.isEmpty) { untranslatableExprs += filterExpr } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala index 2a19ff304a9e0..9717b9f954671 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala @@ -17,15 +17,26 @@ package org.apache.spark.sql.execution.datasources.v2 +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} +import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog} case class RefreshTableExec( + session: SparkSession, catalog: TableCatalog, ident: Identifier) extends V2CommandExec { override protected def run(): Seq[InternalRow] = { catalog.invalidateTable(ident) + + if (catalog.tableExists(ident)) { + val table = catalog.loadTable(ident) + // invalidate all caches referencing the given table + // TODO(SPARK-33437): re-cache the table itself once we support caching a DSv2 table + val v2Relation = DataSourceV2Relation.create(table, Some(catalog), Some(ident)) + session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade = true) + } + Seq.empty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala index 9e6f00e0923ea..b13cea266707b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala @@ -32,8 +32,8 @@ case class SetCatalogAndNamespaceExec( override protected def run(): Seq[InternalRow] = { // The catalog is updated first because CatalogManager resets the current namespace // when the current catalog is set. 
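The SetCatalogAndNamespaceExec hunk that follows swaps Option.map for Option.foreach. Both run the side effects in the same order, so the ordering constraint noted above is unaffected; the tiny self-contained comparison below shows the only difference.

    // Option.map allocates a discarded Option[Unit] and suggests a meaningful return value,
    // while Option.foreach states plainly that only the side effect matters.
    val catalogName: Option[String] = Some("my_catalog")   // illustrative value

    catalogName.map(name => println(s"setCurrentCatalog($name)"))      // Option[Unit], discarded
    catalogName.foreach(name => println(s"setCurrentCatalog($name)"))  // Unit, intent explicit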
- catalogName.map(catalogManager.setCurrentCatalog) - namespace.map(ns => catalogManager.setCurrentNamespace(ns.toArray)) + catalogName.foreach(catalogManager.setCurrentCatalog) + namespace.foreach(ns => catalogManager.setCurrentNamespace(ns.toArray)) Seq.empty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala index 42b80a15080a6..5f7b6f4061467 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala @@ -31,10 +31,11 @@ case class ShowCurrentNamespaceExec( catalogManager: CatalogManager) extends V2CommandExec { override protected def run(): Seq[InternalRow] = { - val encoder = RowEncoder(schema).resolveAndBind() - Seq(encoder - .toRow(new GenericRowWithSchema( - Array(catalogManager.currentCatalog.name, catalogManager.currentNamespace.quoted), schema)) - .copy()) + val toRow = RowEncoder(schema).resolveAndBind().createSerializer() + val result = new GenericRowWithSchema(Array[Any]( + catalogManager.currentCatalog.name, + catalogManager.currentNamespace.quoted), + schema) + Seq(toRow(result).copy()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala index fe3ab8023db6f..9188f4eb60d56 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchem import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper import org.apache.spark.sql.connector.catalog.SupportsNamespaces +import org.apache.spark.sql.execution.LeafExecNode /** * Physical plan node for showing namespaces. 
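The encoder change in ShowCurrentNamespaceExec above repeats across DescribeNamespaceExec, DescribeTableExec, ShowNamespacesExec, ShowTablePropertiesExec and ShowTablesExec. Stripped of the command plumbing, it amounts to the sketch below; the schema and values are made up.

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.encoders.RowEncoder
    import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    val schema = StructType(Seq(
      StructField("catalog", StringType, nullable = false),
      StructField("namespace", StringType, nullable = false)))

    // Build the Row -> InternalRow serializer once and reuse it per output row, instead of
    // calling the removed encoder.toRow for every row.
    val toRow = RowEncoder(schema).resolveAndBind().createSerializer()

    val row = new GenericRowWithSchema(Array[Any]("spark_catalog", "default"), schema)
    val internal: InternalRow = toRow(row).copy()   // copy() because the serializer may reuse its buffer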
@@ -33,8 +34,7 @@ case class ShowNamespacesExec( output: Seq[Attribute], catalog: SupportsNamespaces, namespace: Seq[String], - pattern: Option[String]) - extends V2CommandExec { + pattern: Option[String]) extends V2CommandExec with LeafExecNode { override protected def run(): Seq[InternalRow] = { val namespaces = if (namespace.nonEmpty) { @@ -44,13 +44,11 @@ case class ShowNamespacesExec( } val rows = new ArrayBuffer[InternalRow]() - val encoder = RowEncoder(schema).resolveAndBind() + val toRow = RowEncoder(schema).resolveAndBind().createSerializer() namespaces.map(_.quoted).map { ns => if (pattern.map(StringUtils.filterPattern(Seq(ns), _).nonEmpty).getOrElse(true)) { - rows += encoder - .toRow(new GenericRowWithSchema(Array(ns), schema)) - .copy() + rows += toRow(new GenericRowWithSchema(Array(ns), schema)).copy() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala index 7905c35f55de0..95715fd1af56e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} -import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, GenericRowWithSchema} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table, TableCatalog} /** * Physical plan node for showing table properties. 
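The pattern handling in ShowNamespacesExec above (and in ShowTablesExec further down) filters names with a SQL-style pattern. A rough, dependency-free stand-in for StringUtils.filterPattern, ignoring some of its corner cases:

    // '*' matches any sequence of characters and '|' separates alternative patterns,
    // matched case-insensitively; an absent pattern keeps every name.
    def matchesPattern(name: String, pattern: Option[String]): Boolean =
      pattern.forall { p =>
        p.split("\\|").exists { alt =>
          val regex = alt.trim.replace("*", ".*")
          name.matches(s"(?i)$regex")
        }
      }

    Seq("default", "tmp_db", "prod").filter(n => matchesPattern(n, Some("tmp*|prod")))
    // -> List("tmp_db", "prod"); matchesPattern("default", None) == true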
@@ -30,19 +30,23 @@ case class ShowTablePropertiesExec( catalogTable: Table, propertyKey: Option[String]) extends V2CommandExec { + override def producedAttributes: AttributeSet = AttributeSet(output) + override protected def run(): Seq[InternalRow] = { import scala.collection.JavaConverters._ - val encoder = RowEncoder(schema).resolveAndBind() + val toRow = RowEncoder(schema).resolveAndBind().createSerializer() + // The reserved properties are accessible through DESCRIBE val properties = catalogTable.properties.asScala + .filter { case (k, v) => !CatalogV2Util.TABLE_RESERVED_PROPERTIES.contains(k) } propertyKey match { case Some(p) => val propValue = properties .getOrElse(p, s"Table ${catalogTable.name} does not have property: $p") - Seq(encoder.toRow(new GenericRowWithSchema(Array(p, propValue), schema)).copy()) + Seq(toRow(new GenericRowWithSchema(Array(p, propValue), schema)).copy()) case None => properties.keys.map(k => - encoder.toRow(new GenericRowWithSchema(Array(k, properties(k)), schema)).copy()).toSeq + toRow(new GenericRowWithSchema(Array(k, properties(k)), schema)).copy()).toSeq } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala index 995b00871fc2a..820f5ae8f1b12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchem import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper import org.apache.spark.sql.connector.catalog.TableCatalog +import org.apache.spark.sql.execution.LeafExecNode /** * Physical plan node for showing tables. 
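The property filtering added to ShowTablePropertiesExec above hides catalog-reserved keys from SHOW TBLPROPERTIES output. A standalone illustration follows; the reserved key set shown is only an example, the authoritative list being CatalogV2Util.TABLE_RESERVED_PROPERTIES.

    // Reserved keys remain visible through DESCRIBE; they are only hidden from this command.
    val reservedKeys = Set("provider", "location", "owner", "comment")   // illustrative subset
    val tableProperties = Map(
      "provider"  -> "parquet",
      "owner"     -> "alice",
      "retention" -> "30d")

    val visible = tableProperties.filterNot { case (k, _) => reservedKeys.contains(k) }
    // visible == Map("retention" -> "30d")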
@@ -33,21 +34,18 @@ case class ShowTablesExec( output: Seq[Attribute], catalog: TableCatalog, namespace: Seq[String], - pattern: Option[String]) - extends V2CommandExec { + pattern: Option[String]) extends V2CommandExec with LeafExecNode { override protected def run(): Seq[InternalRow] = { val rows = new ArrayBuffer[InternalRow]() - val encoder = RowEncoder(schema).resolveAndBind() + val toRow = RowEncoder(schema).resolveAndBind().createSerializer() val tables = catalog.listTables(namespace.toArray) tables.map { table => if (pattern.map(StringUtils.filterPattern(Seq(table.name()), _).nonEmpty).getOrElse(true)) { - rows += encoder - .toRow( - new GenericRowWithSchema( - Array(table.namespace().quoted, table.name()), - schema)) - .copy() + val result = new GenericRowWithSchema( + Array(table.namespace().quoted, table.name()), + schema) + rows += toRow(result).copy() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala index f97300025400d..95082bc466b90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util.UUID import org.apache.spark.SparkException -import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute @@ -41,7 +40,7 @@ case class AppendDataExecV1( writeOptions: CaseInsensitiveStringMap, plan: LogicalPlan) extends V1FallbackWriters { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { writeWithV1(newWriteBuilder().buildForV1Write()) } } @@ -67,7 +66,7 @@ case class OverwriteByExpressionExecV1( filters.length == 1 && filters(0).isInstanceOf[AlwaysTrue] } - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { newWriteBuilder() match { case builder: SupportsTruncate if isTruncate(deleteWhere) => writeWithV1(builder.truncate().asV1Builder.buildForV1Write()) @@ -82,7 +81,7 @@ case class OverwriteByExpressionExecV1( } /** Some helper interfaces that use V2 write semantics through the V1 writer interface. */ -sealed trait V1FallbackWriters extends SupportsV1Write { +sealed trait V1FallbackWriters extends V2CommandExec with SupportsV1Write { override def output: Seq[Attribute] = Nil override final def children: Seq[SparkPlan] = Nil @@ -112,11 +111,10 @@ sealed trait V1FallbackWriters extends SupportsV1Write { * A trait that allows Tables that use V1 Writer interfaces to append data. */ trait SupportsV1Write extends SparkPlan { - // TODO: We should be able to work on SparkPlans at this point. 
def plan: LogicalPlan - protected def writeWithV1(relation: InsertableRelation): RDD[InternalRow] = { + protected def writeWithV1(relation: InsertableRelation): Seq[InternalRow] = { relation.insert(Dataset.ofRows(sqlContext.sparkSession, plan), overwrite = false) - sparkContext.emptyRDD + Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala index a1f685d47a346..4be4a6b30edcd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala @@ -19,13 +19,13 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.execution.SparkPlan /** * A physical operator that executes run() and saves the result to prevent multiple executions. * Any V2 commands that do not require triggering a spark job should extend this class. */ -abstract class V2CommandExec extends LeafExecNode { +abstract class V2CommandExec extends SparkPlan { /** * Abstract method that each concrete command needs to implement to compute the result. @@ -53,4 +53,6 @@ abstract class V2CommandExec extends LeafExecNode { protected override def doExecute(): RDD[InternalRow] = { sqlContext.sparkContext.parallelize(result, 1) } + + override def children: Seq[SparkPlan] = Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala index 59089fa6b77e9..b168e848f0b6f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala @@ -59,7 +59,7 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] { val wrappedScan = scan match { case v1: V1Scan => - val translated = filters.flatMap(DataSourceStrategy.translateFilter) + val translated = filters.flatMap(DataSourceStrategy.translateFilter(_, true)) V1ScanWrapper(v1, translated, pushedFilters) case _ => scan } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index cef9b5f675889..2ed33b867183b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -167,14 +167,15 @@ class V2SessionCatalog(catalog: SessionCatalog, conf: SQLConf) } implicit class TableIdentifierHelper(ident: Identifier) { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper + def asTableIdentifier: TableIdentifier = { ident.namespace match { case Array(db) => TableIdentifier(ident.name, Some(db)) - case Array() => - TableIdentifier(ident.name, Some(catalog.getCurrentDatabase)) case _ => - throw new NoSuchTableException(ident) + throw new NoSuchTableException( + s"V2 session catalog requires a single-part namespace: ${ident.quoted}") } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index e360a9e656a16..616e18ee85a6b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -70,10 +70,10 @@ case class CreateTableAsSelectExec( import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { if (catalog.tableExists(ident)) { if (ifNotExists) { - return sparkContext.parallelize(Seq.empty, 1) + return Nil } throw new TableAlreadyExistsException(ident) @@ -125,10 +125,10 @@ case class AtomicCreateTableAsSelectExec( writeOptions: CaseInsensitiveStringMap, ifNotExists: Boolean) extends AtomicTableWriteExec { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { if (catalog.tableExists(ident)) { if (ifNotExists) { - return sparkContext.parallelize(Seq.empty, 1) + return Nil } throw new TableAlreadyExistsException(ident) @@ -161,7 +161,7 @@ case class ReplaceTableAsSelectExec( import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { // Note that this operation is potentially unsafe, but these are the strict semantics of // RTAS if the catalog does not support atomic operations. // @@ -225,7 +225,7 @@ case class AtomicReplaceTableAsSelectExec( writeOptions: CaseInsensitiveStringMap, orCreate: Boolean) extends AtomicTableWriteExec { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { val schema = query.schema.asNullable val staged = if (orCreate) { catalog.stageCreateOrReplace( @@ -255,7 +255,7 @@ case class AppendDataExec( writeOptions: CaseInsensitiveStringMap, query: SparkPlan) extends V2TableWriteExec with BatchWriteHelper { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { writeWithV2(newWriteBuilder().buildForBatch()) } } @@ -280,7 +280,7 @@ case class OverwriteByExpressionExec( filters.length == 1 && filters(0).isInstanceOf[AlwaysTrue] } - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { newWriteBuilder() match { case builder: SupportsTruncate if isTruncate(deleteWhere) => writeWithV2(builder.truncate().buildForBatch()) @@ -308,7 +308,7 @@ case class OverwritePartitionsDynamicExec( writeOptions: CaseInsensitiveStringMap, query: SparkPlan) extends V2TableWriteExec with BatchWriteHelper { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { newWriteBuilder() match { case builder: SupportsDynamicOverwrite => writeWithV2(builder.overwriteDynamicPartitions().buildForBatch()) @@ -325,7 +325,7 @@ case class WriteToDataSourceV2Exec( def writeOptions: CaseInsensitiveStringMap = CaseInsensitiveStringMap.empty() - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { writeWithV2(batchWrite) } } @@ -350,7 +350,7 @@ trait BatchWriteHelper { /** * The base physical plan for writing data into data source v2. 
*/ -trait V2TableWriteExec extends UnaryExecNode { +trait V2TableWriteExec extends V2CommandExec with UnaryExecNode { def query: SparkPlan var commitProgress: Option[StreamWriterCommitProgress] = None @@ -358,7 +358,7 @@ trait V2TableWriteExec extends UnaryExecNode { override def child: SparkPlan = query override def output: Seq[Attribute] = Nil - protected def writeWithV2(batchWrite: BatchWrite): RDD[InternalRow] = { + protected def writeWithV2(batchWrite: BatchWrite): Seq[InternalRow] = { val rdd: RDD[InternalRow] = { val tempRdd = query.execute() // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single @@ -415,7 +415,7 @@ trait V2TableWriteExec extends UnaryExecNode { } } - sparkContext.emptyRDD + Nil } } @@ -485,7 +485,7 @@ private[v2] trait AtomicTableWriteExec extends V2TableWriteExec with SupportsV1W protected def writeToStagedTable( stagedTable: StagedTable, writeOptions: CaseInsensitiveStringMap, - ident: Identifier): RDD[InternalRow] = { + ident: Identifier): Seq[InternalRow] = { Utils.tryWithSafeFinallyAndFailureCallbacks({ stagedTable match { case table: SupportsWrite => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala index 1f99d4282f6da..c577cbf8d756a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala @@ -31,13 +31,15 @@ class CSVDataSourceV2 extends FileDataSourceV2 { override def getTable(options: CaseInsensitiveStringMap): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - CSVTable(tableName, sparkSession, options, paths, None, fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + CSVTable(tableName, sparkSession, optionsWithoutPaths, paths, None, fallbackFileFormat) } override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - CSVTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + CSVTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema), fallbackFileFormat) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala index 7a0949e586cd8..cd0eba0ca9c51 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala @@ -31,14 +31,16 @@ class JsonDataSourceV2 extends FileDataSourceV2 { override def getTable(options: CaseInsensitiveStringMap): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - JsonTable(tableName, sparkSession, options, paths, None, fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + JsonTable(tableName, sparkSession, optionsWithoutPaths, paths, None, fallbackFileFormat) } override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { 
val paths = getPaths(options) - val tableName = getTableName(paths) - JsonTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + JsonTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema), fallbackFileFormat) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala index 8665af33b976a..6303723e4b805 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala @@ -31,14 +31,16 @@ class OrcDataSourceV2 extends FileDataSourceV2 { override def getTable(options: CaseInsensitiveStringMap): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - OrcTable(tableName, sparkSession, options, paths, None, fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + OrcTable(tableName, sparkSession, optionsWithoutPaths, paths, None, fallbackFileFormat) } override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - OrcTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + OrcTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema), fallbackFileFormat) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala index 03d58fdcb7f67..b0ddee0a6b336 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala @@ -31,9 +31,10 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader} import org.apache.spark.sql.execution.datasources.PartitionedFile -import org.apache.spark.sql.execution.datasources.orc.{OrcColumnarBatchReader, OrcDeserializer, OrcUtils} +import org.apache.spark.sql.execution.datasources.orc.{OrcColumnarBatchReader, OrcDeserializer, OrcFilters, OrcUtils} import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.{AtomicType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -52,10 +53,13 @@ case class OrcPartitionReaderFactory( broadcastedConf: Broadcast[SerializableConfiguration], dataSchema: StructType, readDataSchema: StructType, - partitionSchema: StructType) extends FilePartitionReaderFactory { + partitionSchema: StructType, + filters: Array[Filter]) extends FilePartitionReaderFactory { private val resultSchema = StructType(readDataSchema.fields ++ partitionSchema.fields) private val isCaseSensitive = sqlConf.caseSensitiveAnalysis private val capacity = 
sqlConf.orcVectorizedReaderBatchSize + private val orcFilterPushDown = sqlConf.orcFilterPushDown + private val ignoreCorruptFiles = sqlConf.ignoreCorruptFiles override def supportColumnarReads(partition: InputPartition): Boolean = { sqlConf.orcVectorizedReaderEnabled && sqlConf.wholeStageEnabled && @@ -63,27 +67,39 @@ case class OrcPartitionReaderFactory( resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) } + private def pushDownPredicates(filePath: Path, conf: Configuration): Unit = { + if (orcFilterPushDown) { + OrcUtils.readCatalystSchema(filePath, conf, ignoreCorruptFiles).foreach { fileSchema => + OrcFilters.createFilter(fileSchema, filters).foreach { f => + OrcInputFormat.setSearchArgument(conf, f, fileSchema.fieldNames) + } + } + } + } + override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { val conf = broadcastedConf.value.value - val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) - OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) + pushDownPredicates(filePath, conf) + val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val requestedColIdsOrEmptyFile = + val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } - if (requestedColIdsOrEmptyFile.isEmpty) { + if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader[InternalRow] } else { - val requestedColIds = requestedColIdsOrEmptyFile.get + val (requestedColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = OrcUtils.orcResultSchemaString(canPruneCols, + dataSchema, resultSchema, partitionSchema, conf) assert(requestedColIds.length == readDataSchema.length, "[BUG] requested column IDs do not match required schema") @@ -112,24 +128,27 @@ case class OrcPartitionReaderFactory( override def buildColumnarReader(file: PartitionedFile): PartitionReader[ColumnarBatch] = { val conf = broadcastedConf.value.value - val resultSchemaString = OrcUtils.orcTypeDescriptionString(resultSchema) - OrcConf.MAPRED_INPUT_SCHEMA.setString(conf, resultSchemaString) OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(conf, isCaseSensitive) val filePath = new Path(new URI(file.filePath)) + pushDownPredicates(filePath, conf) + val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val requestedColIdsOrEmptyFile = + val resultedColPruneInfo = Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => OrcUtils.requestedColumnIds( isCaseSensitive, dataSchema, readDataSchema, reader, conf) } - if (requestedColIdsOrEmptyFile.isEmpty) { + if (resultedColPruneInfo.isEmpty) { new EmptyPartitionReader } else { - val requestedColIds = requestedColIdsOrEmptyFile.get ++ Array.fill(partitionSchema.length)(-1) + val (requestedDataColIds, canPruneCols) = resultedColPruneInfo.get + val resultSchemaString = OrcUtils.orcResultSchemaString(canPruneCols, + dataSchema, resultSchema, partitionSchema, conf) + val requestedColIds = requestedDataColIds ++ Array.fill(partitionSchema.length)(-1) assert(requestedColIds.length == resultSchema.length, "[BUG] requested column IDs do not match required schema") val taskConf = new Configuration(conf) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala index 62894fa7a2538..35e3b1a88e087 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala @@ -48,7 +48,7 @@ case class OrcScan( // The partition values are already truncated in `FileScan.partitions`. // We should use `readPartitionSchema` as the partition schema here. OrcPartitionReaderFactory(sparkSession.sessionState.conf, broadcastedConf, - dataSchema, readDataSchema, readPartitionSchema) + dataSchema, readDataSchema, readPartitionSchema, pushedFilters) } override def equals(obj: Any): Boolean = obj match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala index 1421ffd8b6de4..6a9cb250ca3d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.orc.OrcFilters import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -54,13 +55,10 @@ case class OrcScanBuilder( override def pushFilters(filters: Array[Filter]): Array[Filter] = { if (sparkSession.sessionState.conf.orcFilterPushDown) { - OrcFilters.createFilter(schema, filters).foreach { f => - // The pushed filters will be set in `hadoopConf`. After that, we can simply use the - // changed `hadoopConf` in executors. - OrcInputFormat.setSearchArgument(hadoopConf, f, schema.fieldNames) - } - val dataTypeMap = schema.map(f => f.name -> f.dataType).toMap - _pushedFilters = OrcFilters.convertibleFilters(schema, dataTypeMap, filters).toArray + val dataTypeMap = OrcFilters.getDataTypeMap(schema, SQLConf.get.caseSensitiveAnalysis) + // TODO (SPARK-25557): ORC doesn't support nested predicate pushdown, so they are removed. 
+ val newFilters = filters.filter(!_.containsNestedColumn) + _pushedFilters = OrcFilters.convertibleFilters(schema, dataTypeMap, newFilters).toArray } filters } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala index 8cb6186c12ff3..4590660bc7b7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala @@ -31,14 +31,17 @@ class ParquetDataSourceV2 extends FileDataSourceV2 { override def getTable(options: CaseInsensitiveStringMap): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - ParquetTable(tableName, sparkSession, options, paths, None, fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + ParquetTable(tableName, sparkSession, optionsWithoutPaths, paths, None, fallbackFileFormat) } override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - ParquetTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + ParquetTable( + tableName, sparkSession, optionsWithoutPaths, paths, Some(schema), fallbackFileFormat) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala index b2fc724057eba..3b482b0c8ab62 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.v2.parquet import java.net.URI -import java.util.TimeZone +import java.time.ZoneId import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ @@ -33,10 +33,11 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader} -import org.apache.spark.sql.execution.datasources.{PartitionedFile, RecordReaderIterator} +import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator} import org.apache.spark.sql.execution.datasources.parquet._ import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.{AtomicType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch @@ -116,8 +117,9 @@ case class ParquetPartitionReaderFactory( private def buildReaderBase[T]( file: PartitionedFile, buildReaderFunc: ( - ParquetInputSplit, InternalRow, TaskAttemptContextImpl, Option[FilterPredicate], - Option[TimeZone]) => RecordReader[Void, T]): RecordReader[Void, T] = { + ParquetInputSplit, InternalRow, TaskAttemptContextImpl, + Option[FilterPredicate], 
Option[ZoneId], + LegacyBehaviorPolicy.Value) => RecordReader[Void, T]): RecordReader[Void, T] = { val conf = broadcastedConf.value.value val filePath = new Path(new URI(file.filePath)) @@ -156,7 +158,7 @@ case class ParquetPartitionReaderFactory( val convertTz = if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getTimeZone(conf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + Some(DateTimeUtils.getZoneId(conf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) } else { None } @@ -169,8 +171,11 @@ case class ParquetPartitionReaderFactory( if (pushed.isDefined) { ParquetInputFormat.setFilterPredicate(hadoopAttemptContext.getConfiguration, pushed.get) } - val reader = - buildReaderFunc(split, file.partitionValues, hadoopAttemptContext, pushed, convertTz) + val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( + footerFileMetaData.getKeyValueMetaData.get, + SQLConf.get.getConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ)) + val reader = buildReaderFunc( + split, file.partitionValues, hadoopAttemptContext, pushed, convertTz, datetimeRebaseMode) reader.initialize(split, hadoopAttemptContext) reader } @@ -184,11 +189,13 @@ case class ParquetPartitionReaderFactory( partitionValues: InternalRow, hadoopAttemptContext: TaskAttemptContextImpl, pushed: Option[FilterPredicate], - convertTz: Option[TimeZone]): RecordReader[Void, InternalRow] = { + convertTz: Option[ZoneId], + datetimeRebaseMode: LegacyBehaviorPolicy.Value): RecordReader[Void, InternalRow] = { logDebug(s"Falling back to parquet-mr") val taskContext = Option(TaskContext.get()) // ParquetRecordReader returns InternalRow - val readSupport = new ParquetReadSupport(convertTz, enableVectorizedReader = false) + val readSupport = new ParquetReadSupport( + convertTz, enableVectorizedReader = false, datetimeRebaseMode) val reader = if (pushed.isDefined && enableRecordFilter) { val parquetFilter = FilterCompat.get(pushed.get, null) new ParquetRecordReader[InternalRow](readSupport, parquetFilter) @@ -213,10 +220,14 @@ case class ParquetPartitionReaderFactory( partitionValues: InternalRow, hadoopAttemptContext: TaskAttemptContextImpl, pushed: Option[FilterPredicate], - convertTz: Option[TimeZone]): VectorizedParquetRecordReader = { + convertTz: Option[ZoneId], + datetimeRebaseMode: LegacyBehaviorPolicy.Value): VectorizedParquetRecordReader = { val taskContext = Option(TaskContext.get()) val vectorizedReader = new VectorizedParquetRecordReader( - convertTz.orNull, enableOffHeapColumnVector && taskContext.isDefined, capacity) + convertTz.orNull, + datetimeRebaseMode.toString, + enableOffHeapColumnVector && taskContext.isDefined, + capacity) val iter = new RecordReaderIterator(vectorizedReader) // SPARK-23457 Register a task completion listener before `initialization`. 
taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala index 049c717effa26..f375a128d8177 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala @@ -31,14 +31,16 @@ class TextDataSourceV2 extends FileDataSourceV2 { override def getTable(options: CaseInsensitiveStringMap): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - TextTable(tableName, sparkSession, options, paths, None, fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + TextTable(tableName, sparkSession, optionsWithoutPaths, paths, None, fallbackFileFormat) } override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { val paths = getPaths(options) - val tableName = getTableName(paths) - TextTable(tableName, sparkSession, options, paths, Some(schema), fallbackFileFormat) + val tableName = getTableName(options, paths) + val optionsWithoutPaths = getOptionsWithoutPaths(options) + TextTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema), fallbackFileFormat) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 6a57ef2cafe23..3cbebca14f7dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -107,13 +107,18 @@ package object debug { */ def codegenStringSeq(plan: SparkPlan): Seq[(String, String, ByteCodeStats)] = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() - plan transform { - case s: WholeStageCodegenExec => - codegenSubtrees += s - s - case s => s + + def findSubtrees(plan: SparkPlan): Unit = { + plan foreach { + case s: WholeStageCodegenExec => + codegenSubtrees += s + case s => + s.subqueries.foreach(findSubtrees) + } } - codegenSubtrees.toSeq.map { subtree => + + findSubtrees(plan) + codegenSubtrees.toSeq.sortBy(_.codegenStageId).map { subtree => val (_, source) = subtree.doCodeGen() val codeStats = try { CodeGenerator.compile(source)._2 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/CleanupDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/CleanupDynamicPruningFilters.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/CleanupDynamicPruningFilters.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/CleanupDynamicPruningFilters.scala index 84be2c9cf5a21..1cf55a2f2954d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/CleanupDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/CleanupDynamicPruningFilters.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.dynamicpruning +package org.apache.spark.sql.execution.dynamicpruning import org.apache.spark.sql.catalyst.expressions.{DynamicPruning, PredicateHelper} import org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala index 48ba8618f272e..43c6581632687 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.dynamicpruning +package org.apache.spark.sql.execution.dynamicpruning import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys @@ -86,7 +86,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { filteringPlan: LogicalPlan, joinKeys: Seq[Expression], hasBenefit: Boolean): LogicalPlan = { - val reuseEnabled = SQLConf.get.dynamicPartitionPruningReuseBroadcast + val reuseEnabled = SQLConf.get.exchangeReuseEnabled val index = joinKeys.indexOf(filteringKey) if (hasBenefit || reuseEnabled) { // insert a DynamicPruning wrapper to identify the subquery during query planning @@ -96,7 +96,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { filteringPlan, joinKeys, index, - !hasBenefit), + !hasBenefit || SQLConf.get.dynamicPartitionPruningReuseBroadcastOnly), pruningPlan) } else { // abort dynamic partition pruning diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala similarity index 94% rename from sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala index 1398dc049dd99..eb091758910cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.dynamicpruning +package org.apache.spark.sql.execution.dynamicpruning import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions @@ -36,9 +36,6 @@ import org.apache.spark.sql.internal.SQLConf case class PlanDynamicPruningFilters(sparkSession: SparkSession) extends Rule[SparkPlan] with PredicateHelper { - private def reuseBroadcast: Boolean = - SQLConf.get.dynamicPartitionPruningReuseBroadcast && SQLConf.get.exchangeReuseEnabled - /** * Identify the shape in which keys of a given plan are broadcasted. */ @@ -59,7 +56,7 @@ case class PlanDynamicPruningFilters(sparkSession: SparkSession) sparkSession, sparkSession.sessionState.planner, buildPlan) // Using `sparkPlan` is a little hacky as it is based on the assumption that this rule is // the first to be applied (apart from `InsertAdaptiveSparkPlan`). 
- val canReuseExchange = reuseBroadcast && buildKeys.nonEmpty && + val canReuseExchange = SQLConf.get.exchangeReuseEnabled && buildKeys.nonEmpty && plan.find { case BroadcastHashJoinExec(_, _, _, BuildLeft, _, left, _) => left.sameResult(sparkPlan) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala index 36f0d173cd0b0..bcdaf61c9c997 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/BroadcastExchangeExec.scala @@ -29,22 +29,52 @@ import org.apache.spark.launcher.SparkLauncher import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, BroadcastPartitioning, Partitioning} import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} import org.apache.spark.sql.execution.joins.HashedRelation import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.unsafe.map.BytesToBytesMap import org.apache.spark.util.{SparkFatalException, ThreadUtils} +/** + * Common trait for all broadcast exchange implementations to facilitate pattern matching. + */ +trait BroadcastExchangeLike extends Exchange { + + /** + * The broadcast job group ID + */ + def runId: UUID = UUID.randomUUID + + /** + * The asynchronous job that prepares the broadcast relation. + */ + def relationFuture: Future[broadcast.Broadcast[Any]] + + /** + * For registering callbacks on `relationFuture`. + * Note that calling this method may not start the execution of broadcast job. + */ + def completionFuture: scala.concurrent.Future[broadcast.Broadcast[Any]] + + /** + * Returns the runtime statistics after broadcast materialization. + */ + def runtimeStatistics: Statistics +} + /** * A [[BroadcastExchangeExec]] collects, transforms and finally broadcasts the result of * a transformed SparkPlan. */ case class BroadcastExchangeExec( mode: BroadcastMode, - child: SparkPlan) extends Exchange { + child: SparkPlan) extends BroadcastExchangeLike { + import BroadcastExchangeExec._ - private[sql] val runId: UUID = UUID.randomUUID + override val runId: UUID = UUID.randomUUID override lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), @@ -58,6 +88,11 @@ case class BroadcastExchangeExec( BroadcastExchangeExec(mode.canonicalized, child.canonicalized) } + override def runtimeStatistics: Statistics = { + val dataSize = metrics("dataSize").value + Statistics(dataSize) + } + @transient private lazy val promise = Promise[broadcast.Broadcast[Any]]() @@ -66,20 +101,16 @@ case class BroadcastExchangeExec( * Note that calling this field will not start the execution of broadcast job. */ @transient - lazy val completionFuture: scala.concurrent.Future[broadcast.Broadcast[Any]] = promise.future + override lazy val completionFuture: scala.concurrent.Future[broadcast.Broadcast[Any]] = + promise.future @transient private val timeout: Long = SQLConf.get.broadcastTimeout @transient - private[sql] lazy val relationFuture: Future[broadcast.Broadcast[Any]] = { - // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here. 
- val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - val task = new Callable[broadcast.Broadcast[Any]]() { - override def call(): broadcast.Broadcast[Any] = { - // This will run in another thread. Set the execution id so that we can connect these jobs - // with the correct execution. - SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) { + override lazy val relationFuture: Future[broadcast.Broadcast[Any]] = { + SQLExecution.withThreadLocalCaptured[broadcast.Broadcast[Any]]( + sqlContext.sparkSession, BroadcastExchangeExec.executionContext) { try { // Setup a job group here so later it may get cancelled by groupId if necessary. sparkContext.setJobGroup(runId.toString, s"broadcast exchange (runId $runId)", @@ -87,9 +118,9 @@ case class BroadcastExchangeExec( val beforeCollect = System.nanoTime() // Use executeCollect/executeCollectIterator to avoid conversion to Scala types val (numRows, input) = child.executeCollectIterator() - if (numRows >= 512000000) { + if (numRows >= MAX_BROADCAST_TABLE_ROWS) { throw new SparkException( - s"Cannot broadcast the table with 512 million or more rows: $numRows rows") + s"Cannot broadcast the table over $MAX_BROADCAST_TABLE_ROWS rows: $numRows rows") } val beforeBuild = System.nanoTime() @@ -109,7 +140,7 @@ case class BroadcastExchangeExec( } longMetric("dataSize") += dataSize - if (dataSize >= (8L << 30)) { + if (dataSize >= MAX_BROADCAST_TABLE_BYTES) { throw new SparkException( s"Cannot broadcast the table that is larger than 8GB: ${dataSize >> 30} GB") } @@ -121,9 +152,9 @@ case class BroadcastExchangeExec( val broadcasted = sparkContext.broadcast(relation) longMetric("broadcastTime") += NANOSECONDS.toMillis( System.nanoTime() - beforeBroadcast) - + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq) - promise.success(broadcasted) + promise.trySuccess(broadcasted) broadcasted } catch { // SPARK-24294: To bypass scala bug: https://github.com/scala/bug/issues/9554, we throw @@ -136,20 +167,17 @@ case class BroadcastExchangeExec( s"${SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key} to -1 or increase the spark " + s"driver memory by setting ${SparkLauncher.DRIVER_MEMORY} to a higher value.") .initCause(oe.getCause)) - promise.failure(ex) + promise.tryFailure(ex) throw ex case e if !NonFatal(e) => val ex = new SparkFatalException(e) - promise.failure(ex) + promise.tryFailure(ex) throw ex case e: Throwable => - promise.failure(e) + promise.tryFailure(e) throw e } - } - } } - BroadcastExchangeExec.executionContext.submit[broadcast.Broadcast[Any]](task) } override protected def doPrepare(): Unit = { @@ -181,6 +209,13 @@ case class BroadcastExchangeExec( } object BroadcastExchangeExec { + // Since the maximum number of keys that BytesToBytesMap supports is 1 << 29, + // and only 70% of the slots can be used before growing in HashedRelation, + // here the limitation should not be over 341 million. 
+ val MAX_BROADCAST_TABLE_ROWS = (BytesToBytesMap.MAX_CAPACITY / 1.5).toLong + + val MAX_BROADCAST_TABLE_BYTES = 8L << 30 + private[execution] val executionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("broadcast-exchange", SQLConf.get.getConf(StaticSQLConf.BROADCAST_EXCHANGE_MAX_THREAD_THRESHOLD))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index ab4176cada527..c242320635a59 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -35,12 +35,6 @@ import org.apache.spark.sql.internal.SQLConf * the input partition ordering requirements are met. */ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { - private def defaultNumPreShufflePartitions: Int = - if (conf.adaptiveExecutionEnabled && conf.reducePostShufflePartitionsEnabled) { - conf.maxNumPostShufflePartitions - } else { - conf.numShufflePartitions - } private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = { val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution @@ -57,7 +51,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { BroadcastExchangeExec(mode, child) case (child, distribution) => val numPartitions = distribution.requiredNumPartitions - .getOrElse(defaultNumPreShufflePartitions) + .getOrElse(conf.numShufflePartitions) ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child) } @@ -91,11 +85,16 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { childrenIndexes.map(children).filterNot(_.isInstanceOf[ShuffleExchangeExec]) .map(_.outputPartitioning.numPartitions) val expectedChildrenNumPartitions = if (nonShuffleChildrenNumPartitions.nonEmpty) { - // Here we pick the max number of partitions among these non-shuffle children as the - // expected number of shuffle partitions. However, if it's smaller than - // `conf.numShufflePartitions`, we pick `conf.numShufflePartitions` as the - // expected number of shuffle partitions. - math.max(nonShuffleChildrenNumPartitions.max, conf.numShufflePartitions) + if (nonShuffleChildrenNumPartitions.length == childrenIndexes.length) { + // Here we pick the max number of partitions among these non-shuffle children. + nonShuffleChildrenNumPartitions.max + } else { + // Here we pick the max number of partitions among these non-shuffle children as the + // expected number of shuffle partitions. However, if it's smaller than + // `conf.numShufflePartitions`, we pick `conf.numShufflePartitions` as the + // expected number of shuffle partitions. 
+ math.max(nonShuffleChildrenNumPartitions.max, conf.defaultNumShufflePartitions) + } } else { childrenNumPartitions.max } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala index 849ff384c130a..c4062879c2727 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala @@ -88,12 +88,11 @@ case class ReusedExchangeExec(override val output: Seq[Attribute], child: Exchan } override def verboseStringWithOperatorId(): String = { - val cdgen = ExplainUtils.getCodegenId(this) val reuse_op_str = ExplainUtils.getOpId(child) s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${cdgen} [Reuses operator id: $reuse_op_str] - |Output : ${output} - """.stripMargin + |$formattedNodeName [Reuses operator id: $reuse_op_str] + |${ExplainUtils.generateFieldString("Output", output)} + |""".stripMargin } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index ffcd6c7783354..24c736951fdc4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -30,30 +30,69 @@ import org.apache.spark.shuffle.{ShuffleWriteMetricsReporter, ShuffleWriteProces import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Divide, Literal, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering +import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{LocalShuffledRowRDD, SkewedShuffledRowRDD} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.util.MutablePair import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, RecordComparator} +/** + * Common trait for all shuffle exchange implementations to facilitate pattern matching. + */ +trait ShuffleExchangeLike extends Exchange { + + /** + * Returns the number of mappers of this shuffle. + */ + def numMappers: Int + + /** + * Returns the shuffle partition number. + */ + def numPartitions: Int + + /** + * Returns whether the shuffle partition number can be changed. + */ + def canChangeNumPartitions: Boolean + + /** + * The asynchronous job that materializes the shuffle. + */ + def mapOutputStatisticsFuture: Future[MapOutputStatistics] + + /** + * Returns the shuffle RDD with specified partition specs. + */ + def getShuffleRDD(partitionSpecs: Array[ShufflePartitionSpec]): RDD[_] + + /** + * Returns the runtime statistics after shuffle materialization. + */ + def runtimeStatistics: Statistics +} + /** * Performs a shuffle that will result in the desired partitioning. 
*/ case class ShuffleExchangeExec( override val outputPartitioning: Partitioning, child: SparkPlan, - canChangeNumPartitions: Boolean = true) extends Exchange { + noUserSpecifiedNumPartition: Boolean = true) extends ShuffleExchangeLike { + + // If users specify the num partitions via APIs like `repartition`, we shouldn't change it. + // For `SinglePartition`, it requires exactly one partition and we can't change it either. + def canChangeNumPartitions: Boolean = + noUserSpecifiedNumPartition && outputPartitioning != SinglePartition - // NOTE: coordinator can be null after serialization/deserialization, - // e.g. it can be null on the Executor side private lazy val writeMetrics = SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext) - private lazy val readMetrics = + private[sql] lazy val readMetrics = SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) override lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size") @@ -67,7 +106,7 @@ case class ShuffleExchangeExec( @transient lazy val inputRDD: RDD[InternalRow] = child.execute() // 'mapOutputStatisticsFuture' is only needed when enable AQE. - @transient lazy val mapOutputStatisticsFuture: Future[MapOutputStatistics] = { + @transient override lazy val mapOutputStatisticsFuture: Future[MapOutputStatistics] = { if (inputRDD.getNumPartitions == 0) { Future.successful(null) } else { @@ -75,6 +114,20 @@ case class ShuffleExchangeExec( } } + override def numMappers: Int = shuffleDependency.rdd.getNumPartitions + + override def numPartitions: Int = shuffleDependency.partitioner.numPartitions + + override def getShuffleRDD(partitionSpecs: Array[ShufflePartitionSpec]): RDD[InternalRow] = { + new ShuffledRowRDD(shuffleDependency, readMetrics, partitionSpecs) + } + + override def runtimeStatistics: Statistics = { + val dataSize = metrics("dataSize").value + val rowCount = metrics(SQLShuffleWriteMetricsReporter.SHUFFLE_RECORDS_WRITTEN).value + Statistics(dataSize, Some(rowCount)) + } + /** * A [[ShuffleDependency]] that will partition rows of its child based on * the partitioning scheme defined in `newPartitioning`. Those partitions of @@ -90,24 +143,6 @@ case class ShuffleExchangeExec( writeMetrics) } - def createShuffledRDD( - partitionRanges: Option[Array[(Int, Int)]]): ShuffledRowRDD = { - new ShuffledRowRDD(shuffleDependency, readMetrics, partitionRanges) - } - - def createLocalShuffleRDD( - partitionStartIndicesPerMapper: Array[Array[Int]]): LocalShuffledRowRDD = { - new LocalShuffledRowRDD(shuffleDependency, readMetrics, partitionStartIndicesPerMapper) - } - - def createSkewedShuffleRDD( - partitionIndex: Int, - startMapIndex: Int, - endMapIndex: Int): SkewedShuffledRowRDD = { - new SkewedShuffledRowRDD(shuffleDependency, - partitionIndex, startMapIndex, endMapIndex, readMetrics) - } - /** * Caches the created ShuffleRowRDD so we can reuse that. */ @@ -116,7 +151,7 @@ case class ShuffleExchangeExec( protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { // Returns the same ShuffleRowRDD if this plan is used by multiple plans. 
if (cachedShuffleRDD == null) { - cachedShuffleRDD = createShuffledRDD(None) + cachedShuffleRDD = new ShuffledRowRDD(shuffleDependency, readMetrics) } cachedShuffleRDD } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala index 29645a736548c..91016a9060303 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala @@ -72,8 +72,8 @@ case class CartesianProductExec( } else "None" s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Join condition: ${joinCondStr} + |$formattedNodeName + |${ExplainUtils.generateFieldString("Join condition", joinCondStr)} """.stripMargin } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala index 137f0b87a2f3d..0275741844bca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.CastSupport import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{ExplainUtils, RowIterator, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegralType, LongType} trait HashJoin { @@ -48,11 +50,11 @@ trait HashJoin { } else "None" s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Left keys: ${leftKeys} - |Right keys: ${rightKeys} - |Join condition: ${joinCondStr} - """.stripMargin + |$formattedNodeName + |${ExplainUtils.generateFieldString("Left keys", leftKeys)} + |${ExplainUtils.generateFieldString("Right keys", rightKeys)} + |${ExplainUtils.generateFieldString("Join condition", joinCondStr)} + |""".stripMargin } override def output: Seq[Attribute] = { @@ -80,8 +82,11 @@ trait HashJoin { } protected lazy val (buildKeys, streamedKeys) = { - require(leftKeys.map(_.dataType) == rightKeys.map(_.dataType), - "Join keys from two sides should have same types") + require(leftKeys.length == rightKeys.length && + leftKeys.map(_.dataType) + .zip(rightKeys.map(_.dataType)) + .forall(types => types._1.sameType(types._2)), + "Join keys from two sides should have same length and types") val lkeys = bindReferences(HashJoin.rewriteKeyExpr(leftKeys), left.output) val rkeys = bindReferences(HashJoin.rewriteKeyExpr(rightKeys), right.output) buildSide match { @@ -237,7 +242,10 @@ trait HashJoin { } } -object HashJoin { +object HashJoin extends CastSupport { + + override def conf: SQLConf = SQLConf.get + /** * Try to rewrite the key as LongType so we can use getLong(), if they key can fit with a long. 
* @@ -252,14 +260,14 @@ object HashJoin { } var keyExpr: Expression = if (keys.head.dataType != LongType) { - Cast(keys.head, LongType) + cast(keys.head, LongType) } else { keys.head } keys.tail.foreach { e => val bits = e.dataType.defaultSize * 8 keyExpr = BitwiseOr(ShiftLeft(keyExpr, Literal(bits)), - BitwiseAnd(Cast(e, LongType), Literal((1L << bits) - 1))) + BitwiseAnd(cast(e, LongType), Literal((1L << bits) - 1))) } keyExpr :: Nil } @@ -272,13 +280,13 @@ object HashJoin { // jump over keys that have a higher index value than the required key if (keys.size == 1) { assert(index == 0) - Cast(BoundReference(0, LongType, nullable = false), keys(index).dataType) + cast(BoundReference(0, LongType, nullable = false), keys(index).dataType) } else { val shiftedBits = keys.slice(index + 1, keys.size).map(_.dataType.defaultSize * 8).sum val mask = (1L << (keys(index).dataType.defaultSize * 8)) - 1 // build the schema for unpacking the required key - Cast(BitwiseAnd( + cast(BitwiseAnd( ShiftRightUnsigned(BoundReference(0, LongType, nullable = false), Literal(shiftedBits)), Literal(mask)), keys(index).dataType) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 4001338662d53..13180d6b20902 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -177,7 +177,7 @@ private[joins] class UnsafeHashedRelation( } override def keys(): Iterator[InternalRow] = { - val iter = binaryMap.safeIterator() + val iter = binaryMap.iterator() new Iterator[InternalRow] { val unsafeRow = new UnsafeRow(numKeys) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 6384aed6a78e0..5b5904f1575fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{PartialShuffleReaderExec, SkewedPartitionReaderExec} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.collection.BitSet @@ -42,11 +41,17 @@ case class SortMergeJoinExec( condition: Option[Expression], left: SparkPlan, right: SparkPlan, - isPartial: Boolean = false) extends BinaryExecNode with CodegenSupport { + isSkewJoin: Boolean = false) extends BinaryExecNode with CodegenSupport { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + override def nodeName: String = { + if (isSkewJoin) super.nodeName + "(skew=true)" else super.nodeName + } + + override def stringArgs: Iterator[Any] = super.stringArgs.toSeq.dropRight(1).iterator + override def simpleStringWithNodeId(): String = { val opId = ExplainUtils.getOpId(this) s"$nodeName $joinType ($opId)".trim @@ -57,11 +62,11 @@ case class SortMergeJoinExec( s"${condition.get}" } else "None" s""" - |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} - |Left keys : ${leftKeys} - |Right keys: ${rightKeys} - 
|Join condition : ${joinCondStr} - """.stripMargin + |$formattedNodeName + |${ExplainUtils.generateFieldString("Left keys", leftKeys)} + |${ExplainUtils.generateFieldString("Right keys", rightKeys)} + |${ExplainUtils.generateFieldString("Join condition", joinCondStr)} + |""".stripMargin } override def output: Seq[Attribute] = { @@ -98,7 +103,9 @@ case class SortMergeJoinExec( } override def requiredChildDistribution: Seq[Distribution] = { - if (isPartial) { + if (isSkewJoin) { + // We re-arrange the shuffle partitions to deal with skew join, and the new children + // partitioning doesn't satisfy `HashClusteredDistribution`. UnspecifiedDistribution :: UnspecifiedDistribution :: Nil } else { HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 65aabe004d75b..92d217983f4c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -116,26 +116,23 @@ object SQLMetrics { // data size total (min, med, max): // 100GB (100MB, 1GB, 10GB) val acc = new SQLMetric(SIZE_METRIC, -1) - acc.register(sc, name = Some(s"$name total (min, med, max (stageId (attemptId): taskId))"), - countFailedValues = false) + acc.register(sc, name = Some(name), countFailedValues = false) acc } def createTimingMetric(sc: SparkContext, name: String): SQLMetric = { // The final result of this metric in physical operator UI may looks like: - // duration(min, med, max): + // duration total (min, med, max): // 5s (800ms, 1s, 2s) val acc = new SQLMetric(TIMING_METRIC, -1) - acc.register(sc, name = Some(s"$name total (min, med, max (stageId (attemptId): taskId))"), - countFailedValues = false) + acc.register(sc, name = Some(name), countFailedValues = false) acc } def createNanoTimingMetric(sc: SparkContext, name: String): SQLMetric = { // Same with createTimingMetric, just normalize the unit of time to millisecond. val acc = new SQLMetric(NS_TIMING_METRIC, -1) - acc.register(sc, name = Some(s"$name total (min, med, max (stageId (attemptId): taskId))"), - countFailedValues = false) + acc.register(sc, name = Some(name), countFailedValues = false) acc } @@ -150,8 +147,7 @@ object SQLMetrics { // probe avg (min, med, max): // (1.2, 2.2, 6.3) val acc = new SQLMetric(AVERAGE_METRIC) - acc.register(sc, name = Some(s"$name (min, med, max (stageId (attemptId): taskId))"), - countFailedValues = false) + acc.register(sc, name = Some(name), countFailedValues = false) acc } @@ -164,34 +160,38 @@ object SQLMetrics { metricsType != SUM_METRIC } + private val METRICS_NAME_SUFFIX = "(min, med, max (stageId: taskId))" + /** * A function that defines how we aggregate the final accumulator results among all tasks, * and represent it in string for a SQL physical operator. 
*/ def stringValue(metricsType: String, values: Array[Long], maxMetrics: Array[Long]): String = { - // stringMetric = "(driver)" OR (stage $stageId (attempt $attemptId): task $taskId)) - val stringMetric = if (maxMetrics.isEmpty) { + // taskInfo = "(driver)" OR (stage ${stageId}.${attemptId}: task $taskId) + val taskInfo = if (maxMetrics.isEmpty) { "(driver)" } else { - s"(stage ${maxMetrics(1)} (attempt ${maxMetrics(2)}): task ${maxMetrics(3)})" + s"(stage ${maxMetrics(1)}.${maxMetrics(2)}: task ${maxMetrics(3)})" } if (metricsType == SUM_METRIC) { val numberFormat = NumberFormat.getIntegerInstance(Locale.US) numberFormat.format(values.sum) } else if (metricsType == AVERAGE_METRIC) { val validValues = values.filter(_ > 0) - val Seq(min, med, max) = { - val metric = if (validValues.isEmpty) { - val zeros = Seq.fill(3)(0L) - zeros.map(v => toNumberFormat(v)) - } else { + // When there are only 1 metrics value (or None), no need to display max/min/median. This is + // common for driver-side SQL metrics. + if (validValues.length <= 1) { + toNumberFormat(validValues.headOption.getOrElse(0)) + } else { + val Seq(min, med, max) = { Arrays.sort(validValues) - Seq(toNumberFormat(validValues(0)), toNumberFormat(validValues(validValues.length / 2)), - s"${toNumberFormat(validValues(validValues.length - 1))} $stringMetric") + Seq( + toNumberFormat(validValues(0)), + toNumberFormat(validValues(validValues.length / 2)), + toNumberFormat(validValues(validValues.length - 1))) } - metric + s"$METRICS_NAME_SUFFIX:\n($min, $med, $max $taskInfo)" } - s"\n($min, $med, $max)" } else { val strFormat: Long => String = if (metricsType == SIZE_METRIC) { Utils.bytesToString @@ -204,19 +204,21 @@ object SQLMetrics { } val validValues = values.filter(_ >= 0) - val Seq(sum, min, med, max) = { - val metric = if (validValues.isEmpty) { - val zeros = Seq.fill(4)(0L) - zeros.map(v => strFormat(v)) - } else { + // When there are only 1 metrics value (or None), no need to display max/min/median. This is + // common for driver-side SQL metrics. 
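To see the shape of the strings the rewritten stringValue produces for size/timing metrics, a rough standalone approximation follows; the byte/number formatting helpers are omitted and taskInfo is assumed to be precomputed (for example "(stage 3.0: task 42)"):

object MetricStringSketch {
  // Mirrors the single-value vs. min/med/max branches of the updated SQLMetrics.stringValue.
  def render(values: Seq[Long], taskInfo: String): String = {
    val valid = values.filter(_ >= 0).sorted
    if (valid.length <= 1) {
      valid.headOption.getOrElse(0L).toString
    } else {
      val (min, med, max) = (valid.head, valid(valid.length / 2), valid.last)
      s"total (min, med, max (stageId: taskId))\n${valid.sum} ($min, $med, $max $taskInfo)"
    }
  }

  def main(args: Array[String]): Unit =
    println(render(Seq(100L, 400L, 2500L), "(stage 3.0: task 42)"))
}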
+ if (validValues.length <= 1) { + strFormat(validValues.headOption.getOrElse(0)) + } else { + val Seq(sum, min, med, max) = { Arrays.sort(validValues) - Seq(strFormat(validValues.sum), strFormat(validValues(0)), + Seq( + strFormat(validValues.sum), + strFormat(validValues(0)), strFormat(validValues(validValues.length / 2)), - s"${strFormat(validValues(validValues.length - 1))} $stringMetric") + strFormat(validValues(validValues.length - 1))) } - metric + s"total $METRICS_NAME_SUFFIX\n$sum ($min, $med, $max $taskInfo)" } - s"\n$sum ($min, $med, $max)" } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index d05113431df41..c08db132c946f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -276,12 +276,12 @@ case class MapElementsExec( } override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - val (funcClass, methodName) = func match { + val (funcClass, funcName) = func match { case m: MapFunction[_, _] => classOf[MapFunction[_, _]] -> "call" case _ => FunctionUtils.getFunctionOneName(outputObjectType, child.output(0).dataType) } val funcObj = Literal.create(func, ObjectType(funcClass)) - val callFunc = Invoke(funcObj, methodName, outputObjectType, child.output) + val callFunc = Invoke(funcObj, funcName, outputObjectType, child.output, propagateNull = false) val result = BindReferences.bindReference(callFunc, child.output).genCode(ctx) @@ -567,7 +567,14 @@ case class FlatMapGroupsInRWithArrowExec( // binary in a batch due to the limitation of R API. See also ARROW-4512. val columnarBatchIter = runner.compute(groupedByRKey, -1) val outputProject = UnsafeProjection.create(output, output) - columnarBatchIter.flatMap(_.rowIterator().asScala).map(outputProject) + val outputTypes = StructType.fromAttributes(output).map(_.dataType) + + columnarBatchIter.flatMap { batch => + val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) + assert(outputTypes == actualDataTypes, "Invalid schema from gapply(): " + + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") + batch.rowIterator().asScala + }.map(outputProject) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala index a0f23e925d237..57ccdb194b1f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala @@ -21,7 +21,7 @@ import java.io.File import scala.collection.mutable.ArrayBuffer -import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.{ContextAwareIterator, SparkEnv, TaskContext} import org.apache.spark.api.python.ChainedPythonFunctions import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -88,6 +88,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute], inputRDD.mapPartitions { iter => val context = TaskContext.get() + val contextAwareIterator = new ContextAwareIterator(context, iter) // The queue used to buffer input rows so we can drain it to // combine input with output from Python. 
@@ -119,7 +120,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute], }) // Add rows to queue to join later with the result. - val projectedRowIter = iter.map { inputRow => + val projectedRowIter = contextAwareIterator.map { inputRow => queue.add(inputRow.asInstanceOf[UnsafeRow]) projection(inputRow) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 520afad287648..7fe3263630820 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -198,7 +198,7 @@ object EvaluatePython { case udt: UserDefinedType[_] => makeFromJava(udt.sqlType) - case other => (obj: Any) => nullSafeConvert(other)(PartialFunction.empty) + case other => (obj: Any) => nullSafeConvert(obj)(PartialFunction.empty) } private def nullSafeConvert(input: Any)(f: PartialFunction[Any, Any]): Any = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala index 2bb808119c0ae..71f51f1abc6f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInPandasExec.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ -import org.apache.spark.TaskContext +import org.apache.spark.{ContextAwareIterator, TaskContext} import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow @@ -61,16 +61,17 @@ case class MapInPandasExec( val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) val outputTypes = child.schema + val context = TaskContext.get() + val contextAwareIterator = new ContextAwareIterator(context, inputIter) + // Here we wrap it via another row so that Python sides understand it // as a DataFrame. - val wrappedIter = inputIter.map(InternalRow(_)) + val wrappedIter = contextAwareIterator.map(InternalRow(_)) // DO NOT use iter.grouped(). See BatchIterator. 
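The ContextAwareIterator wrapping introduced in these Python execution nodes exists so that downstream consumers stop pulling rows once the task has finished; conceptually it behaves like the minimal stand-in below (no Spark classes involved, and isTaskDone is a placeholder for the TaskContext completion/interruption checks):

class StopWhenTaskDoneIterator[T](underlying: Iterator[T], isTaskDone: () => Boolean)
    extends Iterator[T] {
  // Stop reporting elements as soon as the task is done, even if the source has more rows.
  override def hasNext: Boolean = !isTaskDone() && underlying.hasNext
  override def next(): T = underlying.next()
}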
val batchIter = if (batchSize > 0) new BatchIterator(wrappedIter, batchSize) else Iterator(wrappedIter) - val context = TaskContext.get() - val columnarBatchIter = new ArrowPythonRunner( chainedFunc, PythonEvalType.SQL_MAP_PANDAS_ITER_UDF, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala index 6f1b67801bb80..bcd226f95f822 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala @@ -113,10 +113,8 @@ object FrequentItems extends Logging { val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) - val originalSchema = df.schema val outputCols = cols.map { name => - val index = originalSchema.fieldIndex(name) - val originalField = originalSchema.fields(index) + val originalField = df.resolve(name) // append frequent Items to the column name for easy debugging StructField(name + "_freqItems", ArrayType(originalField.dataType, originalField.nullable)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index fffd8805a6525..5094e5eab5955 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -70,7 +70,7 @@ object StatFunctions extends Logging { require(relativeError >= 0, s"Relative Error must be non-negative but got $relativeError") val columns: Seq[Column] = cols.map { colName => - val field = df.schema(colName) + val field = df.resolve(colName) require(field.dataType.isInstanceOf[NumericType], s"Quantile calculation for column $colName with data type ${field.dataType}" + " is not supported.") @@ -154,10 +154,9 @@ object StatFunctions extends Logging { functionName: String): CovarianceCounter = { require(cols.length == 2, s"Currently $functionName calculation is supported " + "between two columns.") - cols.map(name => (name, df.schema.fields.find(_.name == name))).foreach { case (name, data) => - require(data.nonEmpty, s"Couldn't find column with name $name") - require(data.get.dataType.isInstanceOf[NumericType], s"Currently $functionName calculation " + - s"for columns with dataType ${data.get.dataType.catalogString} not supported.") + cols.map(name => (name, df.resolve(name))).foreach { case (name, data) => + require(data.dataType.isInstanceOf[NumericType], s"Currently $functionName calculation " + + s"for columns with dataType ${data.dataType.catalogString} not supported.") } val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType))) df.select(columns: _*).queryExecution.toRdd.treeAggregate(new CovarianceCounter)( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala index 905bce4d614e6..10bcfe6649802 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala @@ -28,6 +28,7 @@ import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.sql.SparkSession +import org.apache.spark.util.{SizeEstimator, Utils} /** * An abstract 
class for compactible metadata logs. It will write one log file for each batch. @@ -177,16 +178,35 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag]( * corresponding `batchId` file. It will delete expired files as well if enabled. */ private def compact(batchId: Long, logs: Array[T]): Boolean = { - val validBatches = getValidBatchesBeforeCompactionBatch(batchId, compactInterval) - val allLogs = validBatches.flatMap { id => - super.get(id).getOrElse { - throw new IllegalStateException( - s"${batchIdToPath(id)} doesn't exist when compacting batch $batchId " + - s"(compactInterval: $compactInterval)") - } - } ++ logs + val (allLogs, loadElapsedMs) = Utils.timeTakenMs { + val validBatches = getValidBatchesBeforeCompactionBatch(batchId, compactInterval) + validBatches.flatMap { id => + super.get(id).getOrElse { + throw new IllegalStateException( + s"${batchIdToPath(id)} doesn't exist when compacting batch $batchId " + + s"(compactInterval: $compactInterval)") + } + } ++ logs + } + val compactedLogs = compactLogs(allLogs) + // Return false as there is another writer. - super.add(batchId, compactLogs(allLogs).toArray) + val (writeSucceed, writeElapsedMs) = Utils.timeTakenMs { + super.add(batchId, compactedLogs.toArray) + } + + val elapsedMs = loadElapsedMs + writeElapsedMs + if (elapsedMs >= COMPACT_LATENCY_WARN_THRESHOLD_MS) { + logWarning(s"Compacting took $elapsedMs ms (load: $loadElapsedMs ms," + + s" write: $writeElapsedMs ms) for compact batch $batchId") + logWarning(s"Loaded ${allLogs.size} entries (estimated ${SizeEstimator.estimate(allLogs)} " + + s"bytes in memory), and wrote ${compactedLogs.size} entries for compact batch $batchId") + } else { + logDebug(s"Compacting took $elapsedMs ms (load: $loadElapsedMs ms," + + s" write: $writeElapsedMs ms) for compact batch $batchId") + } + + writeSucceed } /** @@ -268,6 +288,7 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag]( object CompactibleFileStreamLog { val COMPACT_FILE_SUFFIX = ".compact" + val COMPACT_LATENCY_WARN_THRESHOLD_MS = 2000 def getBatchIdFromFileName(fileName: String): Long = { fileName.stripSuffix(COMPACT_FILE_SUFFIX).toLong diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala index eac5246904ffd..7dd52c1feabf2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -21,7 +21,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark -import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS +import org.apache.spark.sql.catalyst.util.DateTimeUtils.toMillis import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval @@ -100,7 +100,7 @@ case class EventTimeWatermarkExec( child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => - eventTimeStats.add(getEventTime(row).getLong(0) / MICROS_PER_MILLIS) + eventTimeStats.add(toMillis(getEventTime(row).getLong(0))) row } } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index b679f163fc561..86a31942f3f94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -45,8 +45,7 @@ object FileStreamSink extends Logging { val hdfsPath = new Path(singlePath) val fs = hdfsPath.getFileSystem(hadoopConf) if (fs.isDirectory(hdfsPath)) { - val metadataPath = new Path(hdfsPath, metadataDir) - checkEscapedMetadataPath(fs, metadataPath, sqlConf) + val metadataPath = getMetadataLogPath(fs, hdfsPath, sqlConf) fs.exists(metadataPath) } else { false @@ -55,6 +54,12 @@ object FileStreamSink extends Logging { } } + def getMetadataLogPath(fs: FileSystem, path: Path, sqlConf: SQLConf): Path = { + val metadataDir = new Path(path, FileStreamSink.metadataDir) + FileStreamSink.checkEscapedMetadataPath(fs, metadataDir, sqlConf) + metadataDir + } + def checkEscapedMetadataPath(fs: FileSystem, metadataPath: Path, sqlConf: SQLConf): Unit = { if (sqlConf.getConf(SQLConf.STREAMING_CHECKPOINT_ESCAPED_PATH_CHECK_ENABLED) && StreamExecution.containsSpecialCharsInPath(metadataPath)) { @@ -125,14 +130,12 @@ class FileStreamSink( partitionColumnNames: Seq[String], options: Map[String, String]) extends Sink with Logging { + import FileStreamSink._ + private val hadoopConf = sparkSession.sessionState.newHadoopConf() private val basePath = new Path(path) - private val logPath = { - val metadataDir = new Path(basePath, FileStreamSink.metadataDir) - val fs = metadataDir.getFileSystem(hadoopConf) - FileStreamSink.checkEscapedMetadataPath(fs, metadataDir, sparkSession.sessionState.conf) - metadataDir - } + private val logPath = getMetadataLogPath(basePath.getFileSystem(hadoopConf), basePath, + sparkSession.sessionState.conf) private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toString) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 09ae7692ec518..bfa60cf7dfd78 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, HashPartitioning, SinglePartition} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{LeafExecNode, LocalLimitExec, QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} -import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.execution.exchange.{ShuffleExchangeExec, ShuffleExchangeLike} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.util.Utils @@ -76,7 +76,7 @@ class IncrementalExecution( * with the desired literal */ override - lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) { + lazy val optimizedPlan: LogicalPlan = executePhase(QueryPlanningTracker.OPTIMIZATION) { sparkSession.sessionState.optimizer.executeAndTrack(withCachedData, tracker) transformAllExpressions { case ts @ 
CurrentBatchTimestamp(timestamp, _, _) => @@ -118,7 +118,7 @@ class IncrementalExecution( case s: StatefulOperator => statefulOpFound = true - case e: ShuffleExchangeExec => + case e: ShuffleExchangeLike => // Don't search recursively any further as any child stateful operator as we // are only looking for stateful subplans that this plan has narrow dependencies on. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 83bc347e23ed4..f85cfbcb33d83 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -226,7 +226,8 @@ class MicroBatchExecution( } } - finishTrigger(currentBatchHasNewData) // Must be outside reportTimeTaken so it is recorded + // Must be outside reportTimeTaken so it is recorded + finishTrigger(currentBatchHasNewData, isCurrentBatchConstructed) // Signal waiting threads. Note this must be after finishTrigger() to ensure all // activities (progress generation, etc.) have completed before signaling. @@ -317,6 +318,17 @@ class MicroBatchExecution( committedOffsets ++= availableOffsets watermarkTracker.setWatermark( math.max(watermarkTracker.currentWatermark, commitMetadata.nextBatchWatermarkMs)) + } else if (latestCommittedBatchId == latestBatchId - 1) { + availableOffsets.foreach { + case (source: Source, end: Offset) => + val start = committedOffsets.get(source).map(_.asInstanceOf[Offset]) + if (start.map(_ == end).getOrElse(true)) { + source.getBatch(start, end) + } + case nonV1Tuple => + // The V2 API does not have the same edge case requiring getBatch to be called + // here, so we do nothing here. 
+ } } else if (latestCommittedBatchId < latestBatchId - 1) { logWarning(s"Batch completion log latest batch id is " + s"${latestCommittedBatchId}, which is not trailing " + @@ -563,11 +575,10 @@ class MicroBatchExecution( } val nextBatch = - new Dataset(sparkSessionToRunBatch, lastExecution, RowEncoder(lastExecution.analyzed.schema)) + new Dataset(lastExecution, RowEncoder(lastExecution.analyzed.schema)) - val batchSinkProgress: Option[StreamWriterCommitProgress] = - reportTimeTaken("addBatch") { - SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { + val batchSinkProgress: Option[StreamWriterCommitProgress] = reportTimeTaken("addBatch") { + SQLExecution.withNewExecutionId(lastExecution) { sink match { case s: Sink => s.addBatch(currentBatchId, nextBatch) case _: SupportsWrite => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index f20291e11fd70..ea1f2ce3943b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -85,8 +85,8 @@ trait ProgressReporter extends Logging { private val noDataProgressEventInterval = sparkSession.sessionState.conf.streamingNoDataProgressEventInterval - // The timestamp we report an event that has no input data - private var lastNoDataProgressEventTime = Long.MinValue + // The timestamp we report an event that has not executed anything + private var lastNoExecutionProgressEventTime = Long.MinValue private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC")) @@ -142,12 +142,19 @@ trait ProgressReporter extends Logging { logInfo(s"Streaming query made progress: $newProgress") } - /** Finalizes the query progress and adds it to list of recent status updates. */ - protected def finishTrigger(hasNewData: Boolean): Unit = { + /** + * Finalizes the query progress and adds it to list of recent status updates. + * + * @param hasNewData Whether the sources of this stream had new data for this trigger. + * @param hasExecuted Whether any batch was executed during this trigger. Streaming queries that + * perform stateful aggregations with timeouts can still run batches even + * though the sources don't have any new data. 
+ */ + protected def finishTrigger(hasNewData: Boolean, hasExecuted: Boolean): Unit = { assert(currentTriggerStartOffsets != null && currentTriggerEndOffsets != null) currentTriggerEndTimestamp = triggerClock.getTimeMillis() - val executionStats = extractExecutionStats(hasNewData) + val executionStats = extractExecutionStats(hasNewData, hasExecuted) val processingTimeMills = currentTriggerEndTimestamp - currentTriggerStartTimestamp val processingTimeSec = Math.max(1L, processingTimeMills).toDouble / MILLIS_PER_SECOND @@ -170,9 +177,12 @@ trait ProgressReporter extends Logging { ) } - val sinkProgress = SinkProgress( - sink.toString, - sinkCommitProgress.map(_.numOutputRows)) + val sinkOutput = if (hasExecuted) { + sinkCommitProgress.map(_.numOutputRows) + } else { + sinkCommitProgress.map(_ => 0L) + } + val sinkProgress = SinkProgress(sink.toString, sinkOutput) val observedMetrics = extractObservedMetrics(hasNewData, lastExecution) val newProgress = new StreamingQueryProgress( @@ -189,14 +199,14 @@ trait ProgressReporter extends Logging { sink = sinkProgress, observedMetrics = new java.util.HashMap(observedMetrics.asJava)) - if (hasNewData) { + if (hasExecuted) { // Reset noDataEventTimestamp if we processed any data - lastNoDataProgressEventTime = Long.MinValue + lastNoExecutionProgressEventTime = triggerClock.getTimeMillis() updateProgress(newProgress) } else { val now = triggerClock.getTimeMillis() - if (now - noDataProgressEventInterval >= lastNoDataProgressEventTime) { - lastNoDataProgressEventTime = now + if (now - noDataProgressEventInterval >= lastNoExecutionProgressEventTime) { + lastNoExecutionProgressEventTime = now updateProgress(newProgress) } } @@ -205,26 +215,26 @@ trait ProgressReporter extends Logging { } /** Extract statistics about stateful operators from the executed query plan. */ - private def extractStateOperatorMetrics(hasNewData: Boolean): Seq[StateOperatorProgress] = { + private def extractStateOperatorMetrics(hasExecuted: Boolean): Seq[StateOperatorProgress] = { if (lastExecution == null) return Nil - // lastExecution could belong to one of the previous triggers if `!hasNewData`. + // lastExecution could belong to one of the previous triggers if `!hasExecuted`. // Walking the plan again should be inexpensive. lastExecution.executedPlan.collect { case p if p.isInstanceOf[StateStoreWriter] => val progress = p.asInstanceOf[StateStoreWriter].getProgress() - if (hasNewData) progress else progress.copy(newNumRowsUpdated = 0) + if (hasExecuted) progress else progress.copy(newNumRowsUpdated = 0) } } /** Extracts statistics from the most recent query execution. */ - private def extractExecutionStats(hasNewData: Boolean): ExecutionStats = { + private def extractExecutionStats(hasNewData: Boolean, hasExecuted: Boolean): ExecutionStats = { val hasEventTime = logicalPlan.collect { case e: EventTimeWatermark => e }.nonEmpty val watermarkTimestamp = if (hasEventTime) Map("watermark" -> formatTimestamp(offsetSeqMetadata.batchWatermarkMs)) else Map.empty[String, String] // SPARK-19378: Still report metrics even though no data was processed while reporting progress. 
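The hasExecuted flag also drives how often progress events are emitted when nothing ran; that throttling can be sketched on its own as below, where intervalMs stands in for streamingNoDataProgressEventInterval and now for a trigger clock reading:

class NoExecutionProgressThrottle(intervalMs: Long) {
  private var lastNoExecutionEventTime = Long.MinValue

  // Emit on every executed trigger; otherwise emit at most once per interval.
  def shouldEmit(hasExecuted: Boolean, now: Long): Boolean = {
    if (hasExecuted || now - intervalMs >= lastNoExecutionEventTime) {
      lastNoExecutionEventTime = now
      true
    } else {
      false
    }
  }
}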
- val stateOperators = extractStateOperatorMetrics(hasNewData) + val stateOperators = extractStateOperatorMetrics(hasExecuted) if (!hasNewData) { return ExecutionStats(Map.empty, stateOperators, watermarkTimestamp) @@ -349,7 +359,7 @@ trait ProgressReporter extends Logging { result } - private def formatTimestamp(millis: Long): String = { + protected def formatTimestamp(millis: Long): String = { timestampFormat.format(new Date(millis)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 8b3534bc0837a..18fe38caa5e65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -44,6 +44,7 @@ import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.StreamingExplainCommand import org.apache.spark.sql.execution.datasources.v2.StreamWriterCommitProgress import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.connector.SupportsStreamingUpdate import org.apache.spark.sql.streaming._ import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.{Clock, UninterruptibleThread, Utils} @@ -307,8 +308,8 @@ abstract class StreamExecution( } // `postEvent` does not throw non fatal exception. - val submissionTime = triggerClock.getTimeMillis() - postEvent(new QueryStartedEvent(id, runId, name, submissionTime)) + val startTimestamp = triggerClock.getTimeMillis() + postEvent(new QueryStartedEvent(id, runId, name, formatTimestamp(startTimestamp))) // Unblock starting thread startLatch.countDown() @@ -451,9 +452,9 @@ abstract class StreamExecution( val stackTraceException = new SparkException("The stream thread was last executing:") stackTraceException.setStackTrace(queryExecutionThread.getStackTrace) val timeoutException = new TimeoutException( - s"Stream Execution thread failed to stop within $timeout milliseconds (specified by " + - s"${SQLConf.STREAMING_STOP_TIMEOUT.key}). See the cause on what was " + - "being executed in the streaming query thread.") + s"Stream Execution thread for stream $prettyIdString failed to stop within $timeout " + + s"milliseconds (specified by ${SQLConf.STREAMING_STOP_TIMEOUT.key}). See the cause on " + + s"what was being executed in the streaming query thread.") timeoutException.initCause(stackTraceException) throw timeoutException } @@ -629,14 +630,9 @@ abstract class StreamExecution( writeBuilder.asInstanceOf[SupportsTruncate].truncate().buildForStreaming() case Update => - // Although no v2 sinks really support Update mode now, but during tests we do want them - // to pretend to support Update mode, and treat Update mode same as Append mode. 
- if (Utils.isTesting) { - writeBuilder.buildForStreaming() - } else { - throw new IllegalArgumentException( - "Data source v2 streaming sinks does not support Update mode.") - } + require(writeBuilder.isInstanceOf[SupportsStreamingUpdate], + table.name + " does not support Update mode.") + writeBuilder.asInstanceOf[SupportsStreamingUpdate].update().buildForStreaming() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index 198e17db419a7..57e62dcf513ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -295,6 +295,10 @@ case class StreamingSymmetricHashJoinExec( postJoinFilter(joinedRow.withLeft(leftKeyValue.value).withRight(rightValue)) } } + + // NOTE: we need to make sure `outerOutputIter` is evaluated "after" exhausting all of + // elements in `innerOutputIter`, because evaluation of `innerOutputIter` may update + // the match flag which the logic for outer join is relying on. val removedRowIter = leftSideJoiner.removeOldState() val outerOutputIter = removedRowIter.filterNot { kv => stateFormatVersion match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala index 1a27fe61d9602..28171f4ca645e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala @@ -22,6 +22,7 @@ import java.util.concurrent.TimeUnit import scala.concurrent.duration.Duration import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_DAY +import org.apache.spark.sql.catalyst.util.DateTimeUtils.toMillis import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.streaming.Trigger import org.apache.spark.unsafe.types.UTF8String @@ -36,7 +37,8 @@ private object Triggers { if (cal.months != 0) { throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval") } - TimeUnit.MICROSECONDS.toMillis(cal.microseconds + cal.days * MICROS_PER_DAY) + val microsInDays = Math.multiplyExact(cal.days, MICROS_PER_DAY) + toMillis(Math.addExact(cal.microseconds, microsInDays)) } def convert(interval: Duration): Long = interval.toMillis @@ -48,17 +50,17 @@ private object Triggers { * A [[Trigger]] that processes only one batch of data in a streaming query then terminates * the query. */ -private[sql] case object OneTimeTrigger extends Trigger +case object OneTimeTrigger extends Trigger /** * A [[Trigger]] that runs a query periodically based on the processing time. If `interval` is 0, * the query will run as fast as possible. */ -private[sql] case class ProcessingTimeTrigger(intervalMs: Long) extends Trigger { +case class ProcessingTimeTrigger(intervalMs: Long) extends Trigger { Triggers.validate(intervalMs) } -private[sql] object ProcessingTimeTrigger { +object ProcessingTimeTrigger { import Triggers._ def apply(interval: String): ProcessingTimeTrigger = { @@ -82,11 +84,11 @@ private[sql] object ProcessingTimeTrigger { * A [[Trigger]] that continuously processes streaming data, asynchronously checkpointing at * the specified interval. 
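The switch to Math.multiplyExact/addExact in Triggers.convert makes interval-to-millis conversion fail loudly on overflow instead of silently wrapping; a standalone version, with MICROS_PER_DAY and the micros-to-millis step inlined here rather than taken from DateTimeUtils, looks roughly like this:

object IntervalToMillisSketch {
  private val MicrosPerDay = 24L * 60 * 60 * 1000 * 1000

  // Throws ArithmeticException on overflow rather than returning a wrapped value.
  def convert(days: Int, microseconds: Long): Long = {
    val microsInDays = Math.multiplyExact(days.toLong, MicrosPerDay)
    Math.floorDiv(Math.addExact(microseconds, microsInDays), 1000L)
  }

  def main(args: Array[String]): Unit =
    println(convert(1, 500000L)) // 86400500
}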
*/ -private[sql] case class ContinuousTrigger(intervalMs: Long) extends Trigger { +case class ContinuousTrigger(intervalMs: Long) extends Trigger { Triggers.validate(intervalMs) } -private[sql] object ContinuousTrigger { +object ContinuousTrigger { import Triggers._ def apply(interval: String): ContinuousTrigger = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index e471e6c601d16..1e64021c8105e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapabi import org.apache.spark.sql.connector.write.{LogicalWriteInfo, SupportsTruncate, WriteBuilder} import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.streaming.sources.ConsoleWrite -import org.apache.spark.sql.internal.connector.SimpleTableProvider +import org.apache.spark.sql.internal.connector.{SimpleTableProvider, SupportsStreamingUpdate} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -73,11 +73,12 @@ object ConsoleTable extends Table with SupportsWrite { } override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { - new WriteBuilder with SupportsTruncate { + new WriteBuilder with SupportsTruncate with SupportsStreamingUpdate { private val inputSchema: StructType = info.schema() - // Do nothing for truncate. Console sink is special that it just prints all the records. + // Do nothing for truncate/update. Console sink is special and it just prints all the records. 
override def truncate(): WriteBuilder = this + override def update(): WriteBuilder = this override def buildForStreaming(): StreamingWrite = { assert(inputSchema != null) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index a9b724a73a18e..a109c2171f3d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -252,7 +252,7 @@ class ContinuousExecution( updateStatusMessage("Running") reportTimeTaken("runContinuous") { - SQLExecution.withNewExecutionId(sparkSessionForQuery, lastExecution) { + SQLExecution.withNewExecutionId(lastExecution) { lastExecution.executedPlan.execute() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala index fc47c5ed3ac00..368dfae0cc95e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala @@ -173,6 +173,7 @@ class TextSocketContinuousStream( setDaemon(true) override def run(): Unit = { + val toRow = encoder.createSerializer() try { while (true) { val line = reader.readLine() @@ -187,7 +188,7 @@ class TextSocketContinuousStream( Timestamp.valueOf( TextSocketReader.DATE_FORMAT.format(Calendar.getInstance().getTime())) ) - buckets(currentOffset % numPartitions) += encoder.toRow(newData) + buckets(currentOffset % numPartitions) += toRow(newData) .copy().asInstanceOf[UnsafeRow] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index ea39c549bd072..e5b9e68d71026 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -27,7 +27,7 @@ import scala.collection.mutable.ListBuffer import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.encoders.encoderFor +import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.truncatedString @@ -57,6 +57,8 @@ abstract class MemoryStreamBase[A : Encoder](sqlContext: SQLContext) extends Spa val encoder = encoderFor[A] protected val attributes = encoder.schema.toAttributes + protected lazy val toRow: ExpressionEncoder.Serializer[A] = encoder.createSerializer() + def toDS(): Dataset[A] = { Dataset[A](sqlContext.sparkSession, logicalPlan) } @@ -176,7 +178,7 @@ case class MemoryStream[A : Encoder]( def addData(data: TraversableOnce[A]): Offset = { val objects = data.toSeq - val rows = objects.iterator.map(d => encoder.toRow(d).copy().asInstanceOf[UnsafeRow]).toArray + val rows = objects.iterator.map(d => toRow(d).copy().asInstanceOf[UnsafeRow]).toArray logDebug(s"Adding: $objects") this.synchronized { currentOffset = 
currentOffset + 1 @@ -243,7 +245,7 @@ case class MemoryStream[A : Encoder]( rows: Seq[UnsafeRow], startOrdinal: Int, endOrdinal: Int): String = { - val fromRow = encoder.resolveAndBind().fromRow _ + val fromRow = encoder.resolveAndBind().createDeserializer() s"MemoryBatch [$startOrdinal, $endOrdinal]: " + s"${rows.map(row => fromRow(row)).mkString(", ")}" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala index f94469385b281..d0cf602c7cca2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala @@ -60,7 +60,7 @@ class ContinuousMemoryStream[A : Encoder](id: Int, sqlContext: SQLContext, numPa // Distribute data evenly among partition lists. data.toSeq.zipWithIndex.map { case (item, index) => - records(index % numPartitions) += encoder.toRow(item).copy().asInstanceOf[UnsafeRow] + records(index % numPartitions) += toRow(item).copy().asInstanceOf[UnsafeRow] } // The new target offset is the offset where all records in all partitions have been processed. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala index 03c567c58d46a..6d5e7fd5c5cf3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala @@ -30,7 +30,8 @@ class ForeachBatchSink[T](batchWriter: (Dataset[T], Long) => Unit, encoder: Expr val resolvedEncoder = encoder.resolveAndBind( data.logicalPlan.output, data.sparkSession.sessionState.analyzer) - val rdd = data.queryExecution.toRdd.map[T](resolvedEncoder.fromRow)(encoder.clsTag) + val fromRow = resolvedEncoder.createDeserializer() + val rdd = data.queryExecution.toRdd.map[T](fromRow)(encoder.clsTag) val ds = data.sparkSession.createDataset(rdd)(encoder) batchWriter(ds, batchId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala index 6e4f40ad080d4..57a73c740310e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapabi import org.apache.spark.sql.connector.write.{DataWriter, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.execution.python.PythonForeachWriter +import org.apache.spark.sql.internal.connector.SupportsStreamingUpdate import org.apache.spark.sql.types.StructType /** @@ -54,12 +55,13 @@ case class ForeachWriterTable[T]( } override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { - new WriteBuilder with SupportsTruncate { + new WriteBuilder with SupportsTruncate with SupportsStreamingUpdate { private var inputSchema: StructType = info.schema() - // Do 
nothing for truncate. Foreach sink is special that it just forwards all the records to - // ForeachWriter. + // Do nothing for truncate/update. Foreach sink is special and it just forwards all the + // records to ForeachWriter. override def truncate(): WriteBuilder = this + override def update(): WriteBuilder = this override def buildForStreaming(): StreamingWrite = { new StreamingWrite { @@ -73,7 +75,7 @@ case class ForeachWriterTable[T]( val boundEnc = enc.resolveAndBind( inputSchema.toAttributes, SparkSession.getActiveSession.get.sessionState.analyzer) - boundEnc.fromRow + boundEnc.createDeserializer() case Right(func) => func } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala index 2b674070a70ad..03ebbb9f1b376 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapabi import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.execution.streaming.Sink +import org.apache.spark.sql.internal.connector.SupportsStreamingUpdate import org.apache.spark.sql.types.StructType /** @@ -53,7 +54,7 @@ class MemorySink extends Table with SupportsWrite with Logging { } override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { - new WriteBuilder with SupportsTruncate { + new WriteBuilder with SupportsTruncate with SupportsStreamingUpdate { private var needTruncate: Boolean = false private val inputSchema: StructType = info.schema() @@ -62,6 +63,9 @@ class MemorySink extends Table with SupportsWrite with Logging { this } + // The in-memory sink treats update as append. + override def update(): WriteBuilder = this + override def buildForStreaming(): StreamingWrite = { new MemoryStreamingWrite(MemorySink.this, inputSchema, needTruncate) } @@ -172,10 +176,10 @@ class MemoryDataWriter(partition: Int, schema: StructType) private val data = mutable.Buffer[Row]() - private val encoder = RowEncoder(schema).resolveAndBind() + private val fromRow = RowEncoder(schema).resolveAndBind().createDeserializer() override def write(row: InternalRow): Unit = { - data.append(encoder.fromRow(row)) + data.append(fromRow(row)) } override def commit(): MemoryWriterCommitMessage = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index 1a0a43c083879..1a5b50dcc7901 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -451,10 +451,25 @@ class SymmetricHashJoinStateManager( } private trait KeyWithIndexToValueRowConverter { + /** Defines the schema of the value row (the value side of K-V in state store). */ def valueAttributes: Seq[Attribute] + /** + * Convert the value row to (actual value, match) pair. 
+ * + * NOTE: implementations should ensure the result row is NOT reused during execution, so + * that caller can safely read the value in any time. + */ def convertValue(value: UnsafeRow): ValueAndMatchPair + /** + * Build the value row from (actual value, match) pair. This is expected to be called just + * before storing to the state store. + * + * NOTE: depending on the implementation, the result row "may" be reused during execution + * (to avoid initialization of object), so the caller should ensure that the logic doesn't + * affect by such behavior. Call copy() against the result row if needed. + */ def convertToValueRow(value: UnsafeRow, matched: Boolean): UnsafeRow } @@ -493,7 +508,7 @@ class SymmetricHashJoinStateManager( override def convertValue(value: UnsafeRow): ValueAndMatchPair = { if (value != null) { - ValueAndMatchPair(valueRowGenerator(value), + ValueAndMatchPair(valueRowGenerator(value).copy(), value.getBoolean(indexOrdinalInValueWithMatchedRow)) } else { null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala index c2270c57eb941..48d6210c83868 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} -import org.apache.spark.sql.catalyst.expressions.{AttributeSeq, CreateNamedStruct, Expression, ExprId, InSet, ListQuery, Literal, PlanExpression} +import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, Expression, ExprId, InSet, ListQuery, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf @@ -117,6 +117,7 @@ case class InSubqueryExec( private var resultBroadcast: Broadcast[Array[Any]] = null) extends ExecSubqueryExpression { @transient private var result: Array[Any] = _ + @transient private lazy val inSet = InSet(child, result.toSet) override def dataType: DataType = BooleanType override def children: Seq[Expression] = child :: Nil @@ -131,10 +132,7 @@ case class InSubqueryExec( def updateResult(): Unit = { val rows = plan.executeCollect() - result = child.dataType match { - case _: StructType => rows.toArray - case _ => rows.map(_.get(0, child.dataType)) - } + result = rows.map(_.get(0, child.dataType)) resultBroadcast = plan.sqlContext.sparkContext.broadcast(result) } @@ -149,17 +147,12 @@ case class InSubqueryExec( override def eval(input: InternalRow): Any = { prepareResult() - val v = child.eval(input) - if (v == null) { - null - } else { - result.contains(v) - } + inSet.eval(input) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { prepareResult() - InSet(child, result.toSet).doGenCode(ctx, ev) + inSet.doGenCode(ctx, ev) } override lazy val canonicalized: InSubqueryExec = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala index 91360e0e50314..e3e22783e18ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -71,6 +71,10 @@ class 
ExecutionPage(parent: SQLTab) extends WebUIPage("execution") with Logging {jobLinks(JobExecutionStatus.FAILED, "Failed Jobs:")} +
<div>
+        <input type="checkbox" id="stageId-and-taskId-checkbox"></input>
+        <span>Show the Stage ID and Task ID that corresponds to the max metric</span>
+      </div>
    val metrics = sqlStore.executionMetrics(executionId) val graph = sqlStore.planGraph(executionId) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala index a90f37a80d525..c6e7f3978469d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala @@ -42,10 +42,6 @@ class SQLAppStatusStore( store.view(classOf[SQLExecutionUIData]).asScala.toSeq } - def executionsList(offset: Int, length: Int): Seq[SQLExecutionUIData] = { - store.view(classOf[SQLExecutionUIData]).skip(offset).max(length).asScala.toSeq - } - def execution(executionId: Long): Option[SQLExecutionUIData] = { try { Some(store.read(classOf[SQLExecutionUIData], executionId)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala index 1e767c3c043c3..274a5a414ffa2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala @@ -160,29 +160,33 @@ private[ui] class SparkPlanGraphNode( val metrics: Seq[SQLPlanMetric]) { def makeDotNode(metricsValue: Map[Long, String]): String = { - val builder = new mutable.StringBuilder(name) + val builder = new mutable.StringBuilder("" + name + "") val values = for { metric <- metrics value <- metricsValue.get(metric.accumulatorId) } yield { - metric.name + ": " + value + // The value may contain ":" to extend the name, like `total (min, med, max): ...` + if (value.contains(":")) { + metric.name + " " + value + } else { + metric.name + ": " + value + } } - // If there are metrics, display each entry in a separate line. - // Note: whitespace between two "\n"s is to create an empty line between the name of - // SparkPlan and metrics. If removing it, it won't display the empty line in UI. - builder ++= "\n \n" - if (values.nonEmpty) { - builder ++= values.mkString("\n") + // If there are metrics, display each entry in a separate line. + // Note: whitespace between two "\n"s is to create an empty line between the name of + // SparkPlan and metrics. If removing it, it won't display the empty line in UI. + builder ++= "
<br><br>"
+      builder ++= values.mkString("<br>")
+      val labelStr = StringEscapeUtils.escapeJava(builder.toString().replaceAll("\n", "<br>"))
+      s""" $id [labelType="html" label="${labelStr}"];"""
+    } else {
-      // A certain level of height is needed for a rect as a node in a sub-graph
-      // to avoid layout collapse for sub-graphs.
-      builder ++= " "
+      // SPARK-30684: when there is no metrics, add empty lines to increase the height of the node,
+      // so that there won't be gaps between an edge and a small node.
+      s""" $id [labelType="html" label="<br><b>$name</b><br><br>
    "];""" } - - s""" $id [label="${StringEscapeUtils.escapeJava(builder.toString())}"];""" } } @@ -212,6 +216,7 @@ private[ui] class SparkPlanGraphCluster( } s""" | subgraph cluster${id} { + | isCluster="true"; | label="${StringEscapeUtils.escapeJava(labelStr)}"; | ${nodes.map(_.makeDotNode(metricsValue)).mkString(" \n")} | } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala index d191f3790ffa8..42fa07f4a6aee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala @@ -38,15 +38,25 @@ import org.apache.spark.sql.types.{CalendarIntervalType, DateType, IntegerType, * - Entire partition: The frame is the entire partition, i.e. * UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING. For this case, window function will take all * rows as inputs and be evaluated once. - * - Growing frame: We only add new rows into the frame, i.e. UNBOUNDED PRECEDING AND .... + * - Growing frame: We only add new rows into the frame, Examples are: + * 1. UNBOUNDED PRECEDING AND 1 PRECEDING + * 2. UNBOUNDED PRECEDING AND CURRENT ROW + * 3. UNBOUNDED PRECEDING AND 1 FOLLOWING * Every time we move to a new row to process, we add some rows to the frame. We do not remove * rows from this frame. - * - Shrinking frame: We only remove rows from the frame, i.e. ... AND UNBOUNDED FOLLOWING. + * - Shrinking frame: We only remove rows from the frame, Examples are: + * 1. 1 PRECEDING AND UNBOUNDED FOLLOWING + * 2. CURRENT ROW AND UNBOUNDED FOLLOWING + * 3. 1 FOLLOWING AND UNBOUNDED FOLLOWING * Every time we move to a new row to process, we remove some rows from the frame. We do not add * rows to this frame. * - Moving frame: Every time we move to a new row to process, we remove some rows from the frame * and we add some rows to the frame. Examples are: - * 1 PRECEDING AND CURRENT ROW and 1 FOLLOWING AND 2 FOLLOWING. + * 1. 2 PRECEDING AND 1 PRECEDING + * 2. 1 PRECEDING AND CURRENT ROW + * 3. CURRENT ROW AND 1 FOLLOWING + * 4. 1 PRECEDING AND 1 FOLLOWING + * 5. 1 FOLLOWING AND 2 FOLLOWING * - Offset frame: The frame consist of one row, which is an offset number of rows away from the * current row. Only [[OffsetWindowFunction]]s can be processed in an offset frame. * @@ -105,7 +115,7 @@ case class WindowExec( override def outputPartitioning: Partitioning = child.outputPartitioning protected override def doExecute(): RDD[InternalRow] = { - // Unwrap the expressions and factories from the map. + // Unwrap the window expressions and window frame factories from the map. val expressions = windowFrameExpressionFactoryPairs.flatMap(_._1) val factories = windowFrameExpressionFactoryPairs.map(_._2).toArray val inMemoryThreshold = sqlContext.conf.windowExecBufferInMemoryThreshold diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala index d5d11c45f8535..99c8917478865 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala @@ -114,7 +114,7 @@ abstract class WindowExecBase( /** * Collection containing an entry for each window frame to process. Each entry contains a frame's - * [[WindowExpression]]s and factory function for the WindowFrameFunction. 
+ * [[WindowExpression]]s and factory function for the [[WindowFrameFunction]]. */ protected lazy val windowFrameExpressionFactoryPairs = { type FrameKey = (String, FrameType, Expression, Expression) @@ -170,7 +170,7 @@ abstract class WindowExecBase( MutableProjection.create(expressions, schema)) } - // Create the factory + // Create the factory to produce WindowFunctionFrame. val factory = key match { // Offset Frame case ("OFFSET", _, IntegerLiteral(offset), _) => @@ -223,7 +223,7 @@ abstract class WindowExecBase( // Keep track of the number of expressions. This is a side-effect in a map... numExpressions += expressions.size - // Create the Frame Expression - Factory pair. + // Create the Window Expression - Frame Factory pair. (expressions, factory) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowFunctionFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowFunctionFrame.scala index d5f2ffa5573a9..dc1b919feefe4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowFunctionFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowFunctionFrame.scala @@ -47,14 +47,14 @@ abstract class WindowFunctionFrame { /** * The current lower window bound in the row array (inclusive). * - * This should be called after the current row is updated via [[write]] + * This should be called after the current row is updated via `write`. */ def currentLowerBound(): Int /** * The current row index of the upper window bound in the row array (exclusive) * - * This should be called after the current row is updated via [[write]] + * This should be called after the current row is updated via `write`. */ def currentUpperBound(): Int } @@ -277,6 +277,8 @@ final class UnboundedWindowFunctionFrame( while (iterator.hasNext) { processor.update(iterator.next()) } + + processor.evaluate(target) } upperBound = rows.length @@ -284,11 +286,8 @@ final class UnboundedWindowFunctionFrame( /** Write the frame columns for the current row to the given target row. */ override def write(index: Int, current: InternalRow): Unit = { - // Unfortunately we cannot assume that evaluation is deterministic. So we need to re-evaluate - // for each row. - if (processor != null) { - processor.evaluate(target) - } + // The results are the same for each row in the partition, and have been evaluated in prepare. + // Don't need to recalculate here. } override def currentLowerBound(): Int = lowerBound diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala index 878dc0d83f45a..62d04cf7f7cac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala @@ -35,6 +35,8 @@ import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression * def reduce(b: Int, a: Data): Int = b + a.i * def merge(b1: Int, b2: Int): Int = b1 + b2 * def finish(r: Int): Int = r + * def bufferEncoder: Encoder[Int] = Encoders.scalaInt + * def outputEncoder: Encoder[Int] = Encoders.scalaInt * }.toColumn() * * val ds: Dataset[Data] = ... 
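A note on the frame taxonomy documented in WindowExec.scala above: each category corresponds to a frame specification that can be written with the public Window API. The sketch below is illustrative only and uses row-based frames; the DataFrame `df` and its `dept`/`salary` columns are assumptions, not part of this patch.

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.sum

    // Growing frame: rows are only added as we advance through the partition.
    val growing = Window.partitionBy("dept").orderBy("salary")
      .rowsBetween(Window.unboundedPreceding, Window.currentRow)

    // Shrinking frame: rows are only removed as we advance.
    val shrinking = Window.partitionBy("dept").orderBy("salary")
      .rowsBetween(Window.currentRow, Window.unboundedFollowing)

    // Moving frame: rows are both added and removed on every step.
    val moving = Window.partitionBy("dept").orderBy("salary")
      .rowsBetween(-1, 1)

    df.select(
      sum("salary").over(growing),
      sum("salary").over(shrinking),
      sum("salary").over(moving))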
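The two encoder members added to the Aggregator scaladoc above (bufferEncoder, outputEncoder) are what make the example compile. A self-contained sketch of that same sum aggregator; Data comes from the scaladoc, while the object name SumOfI is chosen here for illustration:

    import org.apache.spark.sql.{Encoder, Encoders}
    import org.apache.spark.sql.expressions.Aggregator

    case class Data(i: Int)

    object SumOfI extends Aggregator[Data, Int, Int] {
      def zero: Int = 0                           // initial buffer value
      def reduce(b: Int, a: Data): Int = b + a.i  // fold one input row into the buffer
      def merge(b1: Int, b2: Int): Int = b1 + b2  // combine two partial buffers
      def finish(r: Int): Int = r                 // produce the final result
      def bufferEncoder: Encoder[Int] = Encoders.scalaInt
      def outputEncoder: Encoder[Int] = Encoders.scalaInt
    }

    // Usage, as in the scaladoc (ds is a Dataset[Data]):
    // ds.select(SumOfI.toColumn)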
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 85b2cd379ba24..6a20a46756f85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -90,10 +90,10 @@ sealed abstract class UserDefinedFunction { def asNondeterministic(): UserDefinedFunction } -private[sql] case class SparkUserDefinedFunction( +private[spark] case class SparkUserDefinedFunction( f: AnyRef, dataType: DataType, - inputSchemas: Seq[Option[ScalaReflection.Schema]], + inputEncoders: Seq[Option[ExpressionEncoder[_]]] = Nil, name: Option[String] = None, nullable: Boolean = true, deterministic: Boolean = true) extends UserDefinedFunction { @@ -104,18 +104,11 @@ private[sql] case class SparkUserDefinedFunction( } private[sql] def createScalaUDF(exprs: Seq[Expression]): ScalaUDF = { - // It's possible that some of the inputs don't have a specific type(e.g. `Any`), skip type - // check. - val inputTypes = inputSchemas.map(_.map(_.dataType).getOrElse(AnyDataType)) - // `ScalaReflection.Schema.nullable` is false iff the type is primitive. Also `Any` is not - // primitive. - val inputsPrimitive = inputSchemas.map(_.map(!_.nullable).getOrElse(false)) ScalaUDF( f, dataType, exprs, - inputsPrimitive, - inputTypes, + inputEncoders, udfName = name, nullable = nullable, udfDeterministic = deterministic) @@ -157,7 +150,8 @@ private[sql] case class UserDefinedAggregator[IN, BUF, OUT]( // This is also used by udf.register(...) when it detects a UserDefinedAggregator def scalaAggregator(exprs: Seq[Expression]): ScalaAggregator[IN, BUF, OUT] = { val iEncoder = inputEncoder.asInstanceOf[ExpressionEncoder[IN]] - ScalaAggregator(exprs, aggregator, iEncoder, nullable, deterministic) + val bEncoder = aggregator.bufferEncoder.asInstanceOf[ExpressionEncoder[BUF]] + ScalaAggregator(exprs, aggregator, iEncoder, bEncoder, nullable, deterministic) } override def withName(name: String): UserDefinedAggregator[IN, BUF, OUT] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index da26c5a2f4625..ff8ee2555276a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, ResolvedHint} +import org.apache.spark.sql.catalyst.util.TimestampFormatter import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.{Aggregator, SparkUserDefinedFunction, UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.internal.SQLConf @@ -210,6 +211,36 @@ object functions { // Aggregate functions ////////////////////////////////////////////////////////////////////////////////////////////// + /** + * @group agg_funcs + * @since 1.3.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(e: Column): Column = approx_count_distinct(e) + + /** + * @group agg_funcs + * @since 1.3.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(columnName: String): Column = 
approx_count_distinct(columnName) + + /** + * @group agg_funcs + * @since 1.3.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(e: Column, rsd: Double): Column = approx_count_distinct(e, rsd) + + /** + * @group agg_funcs + * @since 1.3.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(columnName: String, rsd: Double): Column = { + approx_count_distinct(Column(columnName), rsd) + } + /** * Aggregate function: returns the approximate number of distinct items in a group. * @@ -231,7 +262,7 @@ object functions { /** * Aggregate function: returns the approximate number of distinct items in a group. * - * @param rsd maximum estimation error allowed (default = 0.05) + * @param rsd maximum relative standard deviation allowed (default = 0.05) * * @group agg_funcs * @since 2.1.0 @@ -243,7 +274,7 @@ object functions { /** * Aggregate function: returns the approximate number of distinct items in a group. * - * @param rsd maximum estimation error allowed (default = 0.05) + * @param rsd maximum relative standard deviation allowed (default = 0.05) * * @group agg_funcs * @since 2.1.0 @@ -430,7 +461,7 @@ object functions { * @since 2.0.0 */ def first(e: Column, ignoreNulls: Boolean): Column = withAggregateFunction { - new First(e.expr, Literal(ignoreNulls)) + First(e.expr, ignoreNulls) } /** @@ -555,7 +586,7 @@ object functions { * @since 2.0.0 */ def last(e: Column, ignoreNulls: Boolean): Column = withAggregateFunction { - new Last(e.expr, Literal(ignoreNulls)) + new Last(e.expr, ignoreNulls) } /** @@ -1089,6 +1120,27 @@ object functions { */ def isnull(e: Column): Column = withExpr { IsNull(e.expr) } + /** + * A column expression that generates monotonically increasing 64-bit integers. + * + * The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. + * The current implementation puts the partition ID in the upper 31 bits, and the record number + * within each partition in the lower 33 bits. The assumption is that the data frame has + * less than 1 billion partitions, and each partition has less than 8 billion records. + * + * As an example, consider a `DataFrame` with two partitions, each with 3 records. + * This expression would return the following IDs: + * + * {{{ + * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * }}} + * + * @group normal_funcs + * @since 1.4.0 + */ + @deprecated("Use monotonically_increasing_id()", "2.0.0") + def monotonicallyIncreasingId(): Column = monotonically_increasing_id() + /** * A column expression that generates monotonically increasing 64-bit integers. * @@ -1152,7 +1204,7 @@ object functions { /** * Generate a random column with independent and identically distributed (i.i.d.) samples - * from U[0.0, 1.0]. + * uniformly distributed in [0.0, 1.0). * * @note The function is non-deterministic in general case. * @@ -1163,7 +1215,7 @@ object functions { /** * Generate a random column with independent and identically distributed (i.i.d.) samples - * from U[0.0, 1.0]. + * uniformly distributed in [0.0, 1.0). * * @note The function is non-deterministic in general case. * @@ -1231,7 +1283,7 @@ object functions { * @since 1.4.0 */ @scala.annotation.varargs - def struct(cols: Column*): Column = withExpr { CreateStruct(cols.map(_.expr)) } + def struct(cols: Column*): Column = withExpr { CreateStruct.create(cols.map(_.expr)) } /** * Creates a new struct column that composes multiple input columns. 
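On the SparkUserDefinedFunction change above: the UDF now carries ExpressionEncoders for its inputs (inputEncoders) instead of bare schemas, which is what lets typed Scala UDFs reason about the nullability of primitive arguments. A hedged usage sketch; the DataFrame `df` and column `x` are assumptions:

    import org.apache.spark.sql.functions.{col, udf}

    // Typed Scala UDF: an encoder is derived for the Int argument, so Spark knows it is a
    // non-nullable primitive rather than blindly passing null and seeing the Java default (0).
    val plusOne = udf((x: Int) => x + 1)
    df.select(plusOne(col("x")))

    // The untyped variant udf(f: AnyRef, dataType: DataType) is deprecated later in this patch
    // and throws an AnalysisException unless SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF is enabled.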
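The rsd parameter of approx_count_distinct is now documented as the maximum relative standard deviation allowed (default 0.05), and the restored approxCountDistinct overloads are thin deprecated aliases for it. A small usage sketch with an assumed user_id column:

    import org.apache.spark.sql.functions.{approx_count_distinct, col}

    // A smaller rsd trades memory and CPU for a tighter estimate; 0.05 is the default.
    df.select(approx_count_distinct(col("user_id"), 0.01))

    // Equivalent call through the deprecated camel-case alias restored above:
    // df.select(approxCountDistinct("user_id", 0.01))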
@@ -2070,6 +2122,20 @@ object functions { */ def tanh(columnName: String): Column = tanh(Column(columnName)) + /** + * @group math_funcs + * @since 1.4.0 + */ + @deprecated("Use degrees", "2.1.0") + def toDegrees(e: Column): Column = degrees(e) + + /** + * @group math_funcs + * @since 1.4.0 + */ + @deprecated("Use degrees", "2.1.0") + def toDegrees(columnName: String): Column = degrees(Column(columnName)) + /** * Converts an angle measured in radians to an approximately equivalent angle measured in degrees. * @@ -2092,6 +2158,20 @@ object functions { */ def degrees(columnName: String): Column = degrees(Column(columnName)) + /** + * @group math_funcs + * @since 1.4.0 + */ + @deprecated("Use radians", "2.1.0") + def toRadians(e: Column): Column = radians(e) + + /** + * @group math_funcs + * @since 1.4.0 + */ + @deprecated("Use radians", "2.1.0") + def toRadians(columnName: String): Column = radians(Column(columnName)) + /** * Converts an angle measured in degrees to an approximately equivalent angle measured in radians. * @@ -2459,25 +2539,25 @@ object functions { def soundex(e: Column): Column = withExpr { SoundEx(e.expr) } /** - * Splits str around matches of the given regex. + * Splits str around matches of the given pattern. * * @param str a string expression to split - * @param regex a string representing a regular expression. The regex string should be - * a Java regular expression. + * @param pattern a string representing a regular expression. The regex string should be + * a Java regular expression. * * @group string_funcs * @since 1.5.0 */ - def split(str: Column, regex: String): Column = withExpr { - StringSplit(str.expr, Literal(regex), Literal(-1)) + def split(str: Column, pattern: String): Column = withExpr { + StringSplit(str.expr, Literal(pattern), Literal(-1)) } /** - * Splits str around matches of the given regex. + * Splits str around matches of the given pattern. * * @param str a string expression to split - * @param regex a string representing a regular expression. The regex string should be - * a Java regular expression. + * @param pattern a string representing a regular expression. The regex string should be + * a Java regular expression. * @param limit an integer expression which controls the number of times the regex is applied. *
      *
    • limit greater than 0: The resulting array's length will not be more than limit, @@ -2490,8 +2570,8 @@ object functions { * @group string_funcs * @since 3.0.0 */ - def split(str: Column, regex: String, limit: Int): Column = withExpr { - StringSplit(str.expr, Literal(regex), Literal(limit)) + def split(str: Column, pattern: String, limit: Int): Column = withExpr { + StringSplit(str.expr, Literal(pattern), Literal(limit)) } /** @@ -2612,7 +2692,8 @@ object functions { } /** - * Returns the current date as a date column. + * Returns the current date at the start of query evaluation as a date column. + * All calls of current_date within the same query return the same value. * * @group datetime_funcs * @since 1.5.0 @@ -2620,7 +2701,8 @@ object functions { def current_date(): Column = withExpr { CurrentDate() } /** - * Returns the current timestamp as a timestamp column. + * Returns the current timestamp at the start of query evaluation as a timestamp column. + * All calls of current_timestamp within the same query return the same value. * * @group datetime_funcs * @since 1.5.0 @@ -2631,11 +2713,13 @@ object functions { * Converts a date/timestamp/string to a value of string in the format specified by the date * format given by the second argument. * - * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns + * See + * Datetime Patterns + * for valid date and time format patterns * * @param dateExpr A date, timestamp or string. If a string, the data must be in a format that - * can be cast to a timestamp, such as `uuuu-MM-dd` or `uuuu-MM-dd HH:mm:ss.SSSS` - * @param format A pattern `dd.MM.uuuu` would return a string like `18.03.1993` + * can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param format A pattern `dd.MM.yyyy` would return a string like `18.03.1993` * @return A string, or null if `dateExpr` was a string that could not be cast to a timestamp * @note Use specialized functions like [[year]] whenever possible as they benefit from a * specialized implementation. @@ -2872,7 +2956,7 @@ object functions { /** * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string * representing the timestamp of that moment in the current system time zone in the - * uuuu-MM-dd HH:mm:ss format. + * yyyy-MM-dd HH:mm:ss format. * * @param ut A number of a type that is castable to a long, such as string or integer. Can be * negative for timestamps before the unix epoch @@ -2881,7 +2965,7 @@ object functions { * @since 1.5.0 */ def from_unixtime(ut: Column): Column = withExpr { - FromUnixTime(ut.expr, Literal("uuuu-MM-dd HH:mm:ss")) + FromUnixTime(ut.expr, Literal(TimestampFormatter.defaultPattern)) } /** @@ -2889,7 +2973,9 @@ object functions { * representing the timestamp of that moment in the current system time zone in the given * format. * - * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns + * See + * Datetime Patterns + * for valid date and time format patterns * * @param ut A number of a type that is castable to a long, such as string or integer. 
Can be * negative for timestamps before the unix epoch @@ -2913,30 +2999,32 @@ object functions { * @since 1.5.0 */ def unix_timestamp(): Column = withExpr { - UnixTimestamp(CurrentTimestamp(), Literal("uuuu-MM-dd HH:mm:ss")) + UnixTimestamp(CurrentTimestamp(), Literal(TimestampFormatter.defaultPattern)) } /** - * Converts time string in format uuuu-MM-dd HH:mm:ss to Unix timestamp (in seconds), + * Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds), * using the default timezone and the default locale. * * @param s A date, timestamp or string. If a string, the data must be in the - * `uuuu-MM-dd HH:mm:ss` format + * `yyyy-MM-dd HH:mm:ss` format * @return A long, or null if the input was a string not of the correct format * @group datetime_funcs * @since 1.5.0 */ def unix_timestamp(s: Column): Column = withExpr { - UnixTimestamp(s.expr, Literal("uuuu-MM-dd HH:mm:ss")) + UnixTimestamp(s.expr, Literal(TimestampFormatter.defaultPattern)) } /** * Converts time string with given pattern to Unix timestamp (in seconds). * - * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns + * See + * Datetime Patterns + * for valid date and time format patterns * * @param s A date, timestamp or string. If a string, the data must be in a format that can be - * cast to a date, such as `uuuu-MM-dd` or `uuuu-MM-dd HH:mm:ss.SSSS` + * cast to a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` * @param p A date time pattern detailing the format of `s` when `s` is a string * @return A long, or null if `s` was a string that could not be cast to a date or `p` was * an invalid format @@ -2949,7 +3037,7 @@ object functions { * Converts to a timestamp by casting rules to `TimestampType`. * * @param s A date, timestamp or string. If a string, the data must be in a format that can be - * cast to a timestamp, such as `uuuu-MM-dd` or `uuuu-MM-dd HH:mm:ss.SSSS` + * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` * @return A timestamp, or null if the input was a string that could not be cast to a timestamp * @group datetime_funcs * @since 2.2.0 @@ -2961,10 +3049,12 @@ object functions { /** * Converts time string with the given pattern to timestamp. * - * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns + * See + * Datetime Patterns + * for valid date and time format patterns * * @param s A date, timestamp or string. If a string, the data must be in a format that can be - * cast to a timestamp, such as `uuuu-MM-dd` or `uuuu-MM-dd HH:mm:ss.SSSS` + * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` * @param fmt A date time pattern detailing the format of `s` when `s` is a string * @return A timestamp, or null if `s` was a string that could not be cast to a timestamp or * `fmt` was an invalid format @@ -2986,10 +3076,12 @@ object functions { /** * Converts the column into a `DateType` with a specified format * - * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns + * See + * Datetime Patterns + * for valid date and time format patterns * * @param e A date, timestamp or string. 
If a string, the data must be in a format that can be - * cast to a date, such as `uuuu-MM-dd` or `uuuu-MM-dd HH:mm:ss.SSSS` + * cast to a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` * @param fmt A date time pattern detailing the format of `e` when `e`is a string * @return A date, or null if `e` was a string that could not be cast to a date or `fmt` was an * invalid format @@ -3009,6 +3101,7 @@ object functions { * cast to a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` * @param format: 'year', 'yyyy', 'yy' to truncate by year, * or 'month', 'mon', 'mm' to truncate by month + * Other options are: 'week', 'quarter' * * @return A date, or null if `date` was a string that could not be cast to a date or `format` * was an invalid value @@ -3027,7 +3120,8 @@ object functions { * @param format: 'year', 'yyyy', 'yy' to truncate by year, * 'month', 'mon', 'mm' to truncate by month, * 'day', 'dd' to truncate by day, - * Other options are: 'second', 'minute', 'hour', 'week', 'month', 'quarter' + * Other options are: + * 'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter' * @param timestamp A date, timestamp or string. If a string, the data must be in a format that * can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` * @return A timestamp, or null if `timestamp` was a string that could not be cast to a timestamp @@ -3046,8 +3140,12 @@ object functions { * * @param ts A date, timestamp or string. If a string, the data must be in a format that can be * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` - * @param tz A string detailing the time zone that the input should be adjusted to, such as - * `Europe/London`, `PST` or `GMT+5` + * @param tz A string detailing the time zone ID that the input should be adjusted to. It should + * be in the format of either region-based zone IDs or zone offsets. Region IDs must + * have the form 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in + * the format '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are + * supported as aliases of '+00:00'. Other short names are not recommended to use + * because they can be ambiguous. * @return A timestamp, or null if `ts` was a string that could not be cast to a timestamp or * `tz` was an invalid value * @group datetime_funcs @@ -3075,8 +3173,12 @@ object functions { * * @param ts A date, timestamp or string. If a string, the data must be in a format that can be * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` - * @param tz A string detailing the time zone that the input belongs to, such as `Europe/London`, - * `PST` or `GMT+5` + * @param tz A string detailing the time zone ID that the input should be adjusted to. It should + * be in the format of either region-based zone IDs or zone offsets. Region IDs must + * have the form 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in + * the format '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are + * supported as aliases of '+00:00'. Other short names are not recommended to use + * because they can be ambiguous. * @return A timestamp, or null if `ts` was a string that could not be cast to a timestamp or * `tz` was an invalid value * @group datetime_funcs @@ -3410,6 +3512,12 @@ object functions { /** * Returns an array of elements after applying a transformation to each element * in the input array. 
+ * {{{ + * df.select(transform(col("i"), x => x + 1)) + * }}} + * + * @param column the input array column + * @param f col => transformed_col, the lambda function to transform the input column * * @group collection_funcs * @since 3.0.0 @@ -3421,6 +3529,13 @@ object functions { /** * Returns an array of elements after applying a transformation to each element * in the input array. + * {{{ + * df.select(transform(col("i"), (x, i) => x + i)) + * }}} + * + * @param column the input array column + * @param f (col, index) => transformed_col, the lambda function to filter the input column + * given the index. Indices start at 0. * * @group collection_funcs * @since 3.0.0 @@ -3431,6 +3546,12 @@ object functions { /** * Returns whether a predicate holds for one or more elements in the array. + * {{{ + * df.select(exists(col("i"), _ % 2 === 0)) + * }}} + * + * @param column the input array column + * @param f col => predicate, the Boolean predicate to check the input column * * @group collection_funcs * @since 3.0.0 @@ -3441,6 +3562,12 @@ object functions { /** * Returns whether a predicate holds for every element in the array. + * {{{ + * df.select(forall(col("i"), x => x % 2 === 0)) + * }}} + * + * @param column the input array column + * @param f col => predicate, the Boolean predicate to check the input column * * @group collection_funcs * @since 3.0.0 @@ -3453,11 +3580,10 @@ object functions { * Returns an array of elements for which a predicate holds in a given array. * {{{ * df.select(filter(col("s"), x => x % 2 === 0)) - * df.selectExpr("filter(col, x -> x % 2 == 0)") * }}} * - * @param column: the input array column - * @param f: col => predicate, the Boolean predicate to filter the input column + * @param column the input array column + * @param f col => predicate, the Boolean predicate to filter the input column * * @group collection_funcs * @since 3.0.0 @@ -3470,11 +3596,10 @@ object functions { * Returns an array of elements for which a predicate holds in a given array. * {{{ * df.select(filter(col("s"), (x, i) => i % 2 === 0)) - * df.selectExpr("filter(col, (x, i) -> i % 2 == 0)") * }}} * - * @param column: the input array column - * @param f: (col, index) => predicate, the Boolean predicate to filter the input column + * @param column the input array column + * @param f (col, index) => predicate, the Boolean predicate to filter the input column * given the index. Indices start at 0. * * @group collection_funcs @@ -3488,18 +3613,28 @@ object functions { * Applies a binary operator to an initial state and all elements in the array, * and reduces this to a single state. The final state is converted into the final result * by applying a finish function. 
+ * {{{ + * df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x, _ * 10)) + * }}} + * + * @param expr the input array column + * @param initialValue the initial value + * @param merge (combined_value, input_value) => combined_value, the merge function to merge + * an input value to the combined_value + * @param finish combined_value => final_value, the lambda function to convert the combined value + * of all inputs to final result * * @group collection_funcs * @since 3.0.0 */ def aggregate( expr: Column, - zero: Column, + initialValue: Column, merge: (Column, Column) => Column, finish: Column => Column): Column = withExpr { ArrayAggregate( expr.expr, - zero.expr, + initialValue.expr, createLambda(merge), createLambda(finish) ) @@ -3508,17 +3643,31 @@ object functions { /** * Applies a binary operator to an initial state and all elements in the array, * and reduces this to a single state. + * {{{ + * df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x)) + * }}} * + * @param expr the input array column + * @param initialValue the initial value + * @param merge (combined_value, input_value) => combined_value, the merge function to merge + * an input value to the combined_value * @group collection_funcs * @since 3.0.0 */ - def aggregate(expr: Column, zero: Column, merge: (Column, Column) => Column): Column = - aggregate(expr, zero, merge, c => c) + def aggregate(expr: Column, initialValue: Column, merge: (Column, Column) => Column): Column = + aggregate(expr, initialValue, merge, c => c) /** * Merge two given arrays, element-wise, into a single array using a function. * If one array is shorter, nulls are appended at the end to match the length of the longer * array, before applying the function. + * {{{ + * df.select(zip_with(df1("val1"), df1("val2"), (x, y) => x + y)) + * }}} + * + * @param left the left input array column + * @param right the right input array column + * @param f (lCol, rCol) => col, the lambda function to merge two input columns into one column * * @group collection_funcs * @since 3.0.0 @@ -3530,6 +3679,12 @@ object functions { /** * Applies a function to every key-value pair in a map and returns * a map with the results of those applications as the new keys for the pairs. + * {{{ + * df.select(transform_keys(col("i"), (k, v) => k + v)) + * }}} + * + * @param expr the input map column + * @param f (key, value) => new_key, the lambda function to transform the key of input map column * * @group collection_funcs * @since 3.0.0 @@ -3541,6 +3696,13 @@ object functions { /** * Applies a function to every key-value pair in a map and returns * a map with the results of those applications as the new values for the pairs. + * {{{ + * df.select(transform_values(col("i"), (k, v) => k + v)) + * }}} + * + * @param expr the input map column + * @param f (key, value) => new_value, the lambda function to transform the value of input map + * column * * @group collection_funcs * @since 3.0.0 @@ -3551,6 +3713,12 @@ object functions { /** * Returns a map whose key-value pairs satisfy a predicate. + * {{{ + * df.select(map_filter(col("m"), (k, v) => k * 10 === v)) + * }}} + * + * @param expr the input map column + * @param f (key, value) => predicate, the Boolean predicate to filter the input map column * * @group collection_funcs * @since 3.0.0 @@ -3561,6 +3729,13 @@ object functions { /** * Merge two given maps, key-wise into a single map using a function. 
+ * {{{ + * df.select(map_zip_with(df("m1"), df("m2"), (k, v1, v2) => k === v1 + v2)) + * }}} + * + * @param left the left input map column + * @param right the right input map column + * @param f (key, value1, value2) => new_value, the lambda function to merge the map values * * @group collection_funcs * @since 3.0.0 @@ -3883,6 +4058,10 @@ object functions { /** * Returns length of array or map. * + * The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or + * spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. + * With the default settings, the function returns -1 for null input. + * * @group collection_funcs * @since 1.5.0 */ @@ -4194,7 +4373,7 @@ object functions { (0 to 10).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) - val inputSchemas = (1 to x).foldRight("Nil")((i, s) => {s"Try(ScalaReflection.schemaFor(typeTag[A$i])).toOption :: $s"}) + val inputEncoders = (1 to x).foldRight("Nil")((i, s) => {s"Try(ExpressionEncoder[A$i]()).toOption :: $s"}) println(s""" |/** | * Defines a Scala closure of $x arguments as user-defined function (UDF). @@ -4207,8 +4386,8 @@ object functions { | */ |def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - | val inputSchemas = $inputSchemas - | val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + | val inputEncoders = $inputEncoders + | val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) | if (nullable) udf else udf.asNonNullable() |}""".stripMargin) } @@ -4231,7 +4410,7 @@ object functions { | */ |def udf(f: UDF$i[$extTypeArgs], returnType: DataType): UserDefinedFunction = { | val func = $funcCall - | SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill($i)(None)) + | SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill($i)(None)) |}""".stripMargin) } @@ -4313,8 +4492,8 @@ object functions { */ def udf[RT: TypeTag](f: Function0[RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4329,8 +4508,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag](f: Function1[A1, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4345,8 +4524,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag](f: Function2[A1, A2, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Nil + val udf = 
SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4361,8 +4540,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](f: Function3[A1, A2, A3, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4377,8 +4556,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](f: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4393,8 +4572,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](f: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A5])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4409,8 +4588,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](f: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A5])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A6])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: 
Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4425,8 +4604,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](f: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A5])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A6])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A7])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4441,8 +4620,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](f: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A5])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A6])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A7])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A8])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4457,8 +4636,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](f: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A5])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A6])).toOption :: 
Try(ScalaReflection.schemaFor(typeTag[A7])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A8])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A9])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4473,8 +4652,8 @@ object functions { */ def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](f: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction = { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputSchemas = Try(ScalaReflection.schemaFor(typeTag[A1])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A2])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A3])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A4])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A5])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A6])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A7])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A8])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A9])).toOption :: Try(ScalaReflection.schemaFor(typeTag[A10])).toOption :: Nil - val udf = SparkUserDefinedFunction(f, dataType, inputSchemas) + val inputEncoders = Try(ExpressionEncoder[A1]()).toOption :: Try(ExpressionEncoder[A2]()).toOption :: Try(ExpressionEncoder[A3]()).toOption :: Try(ExpressionEncoder[A4]()).toOption :: Try(ExpressionEncoder[A5]()).toOption :: Try(ExpressionEncoder[A6]()).toOption :: Try(ExpressionEncoder[A7]()).toOption :: Try(ExpressionEncoder[A8]()).toOption :: Try(ExpressionEncoder[A9]()).toOption :: Try(ExpressionEncoder[A10]()).toOption :: Nil + val udf = SparkUserDefinedFunction(f, dataType, inputEncoders) if (nullable) udf else udf.asNonNullable() } @@ -4493,7 +4672,7 @@ object functions { */ def udf(f: UDF0[_], returnType: DataType): UserDefinedFunction = { val func = () => f.asInstanceOf[UDF0[Any]].call() - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(0)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(0)(None)) } /** @@ -4507,7 +4686,7 @@ object functions { */ def udf(f: UDF1[_, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(1)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(1)(None)) } /** @@ -4521,7 +4700,7 @@ object functions { */ def udf(f: UDF2[_, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(2)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(2)(None)) } /** @@ -4535,7 +4714,7 @@ object functions { */ def udf(f: UDF3[_, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, 
inputSchemas = Seq.fill(3)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(3)(None)) } /** @@ -4549,7 +4728,7 @@ object functions { */ def udf(f: UDF4[_, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(4)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(4)(None)) } /** @@ -4563,7 +4742,7 @@ object functions { */ def udf(f: UDF5[_, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(5)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(5)(None)) } /** @@ -4577,7 +4756,7 @@ object functions { */ def udf(f: UDF6[_, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(6)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(6)(None)) } /** @@ -4591,7 +4770,7 @@ object functions { */ def udf(f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(7)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(7)(None)) } /** @@ -4605,7 +4784,7 @@ object functions { */ def udf(f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(8)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(8)(None)) } /** @@ -4619,7 +4798,7 @@ object functions { */ def udf(f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(9)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(9)(None)) } /** @@ -4633,7 +4812,7 @@ object functions { */ def udf(f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) - SparkUserDefinedFunction(func, returnType, inputSchemas = Seq.fill(10)(None)) + SparkUserDefinedFunction(func, returnType, inputEncoders = Seq.fill(10)(None)) } // scalastyle:on parameter.number @@ -4658,8 +4837,24 @@ object functions { * @group udf_funcs * @since 2.0.0 */ + @deprecated("Scala `udf` method with return type parameter is deprecated. 
" + + "Please use Scala `udf` method without return type parameter.", "3.0.0") def udf(f: AnyRef, dataType: DataType): UserDefinedFunction = { - SparkUserDefinedFunction(f, dataType, inputSchemas = Nil) + if (!SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF)) { + val errorMsg = "You're using untyped Scala UDF, which does not have the input type " + + "information. Spark may blindly pass null to the Scala closure with primitive-type " + + "argument, and the closure will see the default value of the Java type for the null " + + "argument, e.g. `udf((x: Int) => x, IntegerType)`, the result is 0 for null input. " + + "To get rid of this error, you could:\n" + + "1. use typed Scala UDF APIs(without return type parameter), e.g. `udf((x: Int) => x)`\n" + + "2. use Java UDF APIs, e.g. `udf(new UDF1[String, Integer] { " + + "override def call(s: String): Integer = s.length() }, IntegerType)`, " + + "if input types are all non primitive\n" + + s"3. set ${SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF.key} to true and " + + s"use this API with caution" + throw new AnalysisException(errorMsg) + } + SparkUserDefinedFunction(f, dataType, inputEncoders = Nil) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index eb658e2d8850e..83a7a557305e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -26,8 +26,10 @@ import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.connector.catalog.CatalogManager -import org.apache.spark.sql.execution.{ColumnarRule, QueryExecution, SparkOptimizer, SparkPlanner, SparkSqlParser} +import org.apache.spark.sql.execution.{ColumnarRule, QueryExecution, SparkOptimizer, SparkPlan, SparkPlanner, SparkSqlParser} +import org.apache.spark.sql.execution.aggregate.ResolveEncodersInScalaAgg import org.apache.spark.sql.execution.analysis.DetectAmbiguousSelfJoin +import org.apache.spark.sql.execution.command.CommandCheck import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.v2.{TableCapabilityCheck, V2SessionCatalog} import org.apache.spark.sql.streaming.StreamingQueryManager @@ -174,7 +176,9 @@ abstract class BaseSessionStateBuilder( new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: new FallBackFileSourceV2(session) +: - new ResolveSessionCatalog(catalogManager, conf, catalog.isView) +: + ResolveEncodersInScalaAgg +: + new ResolveSessionCatalog( + catalogManager, conf, catalog.isTempView, catalog.isTempFunction) +: customResolutionRules override val postHocResolutionRules: Seq[Rule[LogicalPlan]] = @@ -189,6 +193,7 @@ abstract class BaseSessionStateBuilder( PreReadCheck +: HiveOnlyCheck +: TableCapabilityCheck +: + CommandCheck(conf) +: customCheckRules } @@ -229,6 +234,9 @@ abstract class BaseSessionStateBuilder( */ protected def optimizer: Optimizer = { new SparkOptimizer(catalogManager, catalog, experimentalMethods) { + override def earlyScanPushDownRules: Seq[Rule[LogicalPlan]] = + super.earlyScanPushDownRules ++ customEarlyScanPushDownRules + override def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = super.extendedOperatorOptimizationRules ++ customOperatorOptimizationRules } @@ -244,6 
+252,14 @@ abstract class BaseSessionStateBuilder( extensions.buildOptimizerRules(session) } + /** + * Custom early scan push down rules to add to the Optimizer. Prefer overriding this instead + * of creating your own Optimizer. + * + * Note that this may NOT depend on the `optimizer` function. + */ + protected def customEarlyScanPushDownRules: Seq[Rule[LogicalPlan]] = Nil + /** * Planner that converts optimized logical plans to physical plans. * @@ -270,6 +286,10 @@ abstract class BaseSessionStateBuilder( extensions.buildColumnarRules(session) } + protected def queryStagePrepRules: Seq[Rule[SparkPlan]] = { + extensions.buildQueryStagePrepRules(session) + } + /** * Create a query execution object. */ @@ -321,7 +341,8 @@ abstract class BaseSessionStateBuilder( () => resourceLoader, createQueryExecution, createClone, - columnarRules) + columnarRules, + queryStagePrepRules) } } @@ -333,8 +354,14 @@ private[sql] trait WithTestConf { self: BaseSessionStateBuilder => override protected lazy val conf: SQLConf = { val overrideConfigurations = overrideConfs - val conf = parentState.map(_.conf.clone()).getOrElse { - new SQLConf { + parentState.map { s => + val cloned = s.conf.clone() + if (session.sparkContext.conf.get(StaticSQLConf.SQL_LEGACY_SESSION_INIT_WITH_DEFAULTS)) { + mergeSparkConf(conf, session.sparkContext.conf) + } + cloned + }.getOrElse { + val conf = new SQLConf { clear() override def clear(): Unit = { super.clear() @@ -342,8 +369,8 @@ private[sql] trait WithTestConf { self: BaseSessionStateBuilder => overrideConfigurations.foreach { case (key, value) => setConfString(key, value) } } } + mergeSparkConf(conf, session.sparkContext.conf) + conf } - mergeSparkConf(conf, session.sparkContext.conf) - conf } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 3740b56cb9cbb..7c76168b0dd9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -464,6 +464,9 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { * If this table is cached as an InMemoryRelation, drop the original cached version and make the * new version cached lazily. * + * In addition, refreshing a table also invalidate all caches that have reference to the table + * in a cascading manner. This is to prevent incorrect result from the otherwise staled caches. + * * @group cachemgmt * @since 2.0.0 */ @@ -484,14 +487,17 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { // If this table is cached as an InMemoryRelation, drop the original // cached version and make the new version cached lazily. val cache = sparkSession.sharedState.cacheManager.lookupCachedData(table) + + // uncache the logical plan. + // note this is a no-op for the table itself if it's not cached, but will invalidate all + // caches referencing this table. + sparkSession.sharedState.cacheManager.uncacheQuery(table, cascade = true) + if (cache.nonEmpty) { // save the cache name and cache level for recreation val cacheName = cache.get.cachedRepresentation.cacheBuilder.tableName val cacheLevel = cache.get.cachedRepresentation.cacheBuilder.storageLevel - // uncache the logical plan. - sparkSession.sharedState.cacheManager.uncacheQuery(table, cascade = true) - // recache with the same name and cache level. 
sparkSession.sharedState.cacheManager.cacheQuery(table, cacheName, cacheLevel) } @@ -517,10 +523,11 @@ private[sql] object CatalogImpl { data: Seq[T], sparkSession: SparkSession): Dataset[T] = { val enc = ExpressionEncoder[T]() - val encoded = data.map(d => enc.toRow(d).copy()) + val toRow = enc.createSerializer() + val encoded = data.map(d => toRow(d).copy()) val plan = new LocalRelation(enc.schema.toAttributes, encoded) val queryExecution = sparkSession.sessionState.executePlan(plan) - new Dataset[T](sparkSession, queryExecution, enc) + new Dataset[T](queryExecution, enc) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index abd1250628539..cd425b04ef311 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution._ import org.apache.spark.sql.streaming.StreamingQueryManager @@ -73,7 +74,8 @@ private[sql] class SessionState( resourceLoaderBuilder: () => SessionResourceLoader, createQueryExecution: LogicalPlan => QueryExecution, createClone: (SparkSession, SessionState) => SessionState, - val columnarRules: Seq[ColumnarRule]) { + val columnarRules: Seq[ColumnarRule], + val queryStagePrepRules: Seq[Rule[SparkPlan]]) { // The following fields are lazy to avoid creating the Hive client when creating SessionState. lazy val catalog: SessionCatalog = catalogBuilder() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index fefd72dcf1752..1922a58ce16af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -22,6 +22,7 @@ import java.util.UUID import java.util.concurrent.ConcurrentHashMap import javax.annotation.concurrent.GuardedBy +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.util.control.NonFatal @@ -41,7 +42,6 @@ import org.apache.spark.sql.streaming.ui.{StreamingQueryStatusListener, Streamin import org.apache.spark.status.ElementTrackingStore import org.apache.spark.util.Utils - /** * A class that holds all state shared across sessions in a given [[SQLContext]]. * @@ -53,47 +53,12 @@ private[sql] class SharedState( initialConfigs: scala.collection.Map[String, String]) extends Logging { - SharedState.setFsUrlStreamHandlerFactory(sparkContext.conf) - - // Load hive-site.xml into hadoopConf and determine the warehouse path we want to use, based on - // the config from both hive and Spark SQL. Finally set the warehouse config value to sparkConf. 
- val warehousePath: String = { - val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") - if (configFile != null) { - logInfo(s"loading hive config file: $configFile") - sparkContext.hadoopConfiguration.addResource(configFile) - } - - // hive.metastore.warehouse.dir only stay in hadoopConf - sparkContext.conf.remove("hive.metastore.warehouse.dir") - // Set the Hive metastore warehouse path to the one we use - val hiveWarehouseDir = sparkContext.hadoopConfiguration.get("hive.metastore.warehouse.dir") - if (hiveWarehouseDir != null && !sparkContext.conf.contains(WAREHOUSE_PATH.key)) { - // If hive.metastore.warehouse.dir is set and spark.sql.warehouse.dir is not set, - // we will respect the value of hive.metastore.warehouse.dir. - sparkContext.conf.set(WAREHOUSE_PATH.key, hiveWarehouseDir) - logInfo(s"${WAREHOUSE_PATH.key} is not set, but hive.metastore.warehouse.dir " + - s"is set. Setting ${WAREHOUSE_PATH.key} to the value of " + - s"hive.metastore.warehouse.dir ('$hiveWarehouseDir').") - hiveWarehouseDir - } else { - // If spark.sql.warehouse.dir is set, we will override hive.metastore.warehouse.dir using - // the value of spark.sql.warehouse.dir. - // When neither spark.sql.warehouse.dir nor hive.metastore.warehouse.dir is set, - // we will set hive.metastore.warehouse.dir to the default value of spark.sql.warehouse.dir. - val sparkWarehouseDir = sparkContext.conf.get(WAREHOUSE_PATH) - logInfo(s"Setting hive.metastore.warehouse.dir ('$hiveWarehouseDir') to the value of " + - s"${WAREHOUSE_PATH.key} ('$sparkWarehouseDir').") - sparkContext.hadoopConfiguration.set("hive.metastore.warehouse.dir", sparkWarehouseDir) - sparkWarehouseDir - } - } - logInfo(s"Warehouse path is '$warehousePath'.") + SharedState.setFsUrlStreamHandlerFactory(sparkContext.conf, sparkContext.hadoopConfiguration) - // These 2 variables should be initiated after `warehousePath`, because in the first place we need - // to load hive-site.xml into hadoopConf and determine the warehouse path which will be set into - // both spark conf and hadoop conf avoiding be affected by any SparkSession level options private val (conf, hadoopConf) = { + // Load hive-site.xml into hadoopConf and determine the warehouse path which will be set into + // both spark conf and hadoop conf avoiding be affected by any SparkSession level options + SharedState.loadHiveConfFile(sparkContext.conf, sparkContext.hadoopConfiguration) val confClone = sparkContext.conf.clone() val hadoopConfClone = new Configuration(sparkContext.hadoopConfiguration) // If `SparkSession` is instantiated using an existing `SparkContext` instance and no existing @@ -145,13 +110,14 @@ private[sql] class SharedState( * data to show. 
*/ lazy val streamingQueryStatusListener: Option[StreamingQueryStatusListener] = { - val sqlConf = SQLConf.get - if (sqlConf.isStreamingUIEnabled) { - val statusListener = new StreamingQueryStatusListener(sqlConf) - sparkContext.ui.foreach(new StreamingQueryTab(statusListener, _)) - Some(statusListener) - } else { - None + sparkContext.ui.flatMap { ui => + if (conf.get(STREAMING_UI_ENABLED)) { + val statusListener = new StreamingQueryStatusListener(conf) + new StreamingQueryTab(statusListener, ui) + Some(statusListener) + } else { + None + } } } @@ -165,7 +131,7 @@ private[sql] class SharedState( val defaultDbDefinition = CatalogDatabase( SessionCatalog.DEFAULT_DATABASE, "default database", - CatalogUtils.stringToURI(warehousePath), + CatalogUtils.stringToURI(conf.get(WAREHOUSE_PATH)), Map()) // Create default database if it doesn't exist if (!externalCatalog.databaseExists(SessionCatalog.DEFAULT_DATABASE)) { @@ -187,9 +153,6 @@ private[sql] class SharedState( * A manager for global temporary views. */ lazy val globalTempViewManager: GlobalTempViewManager = { - // System preserved database should not exists in metastore. However it's hard to guarantee it - // for every session, because case-sensitivity differs. Here we always lowercase it to make our - // life easier. val globalTempDB = conf.get(GLOBAL_TEMP_DATABASE) if (externalCatalog.databaseExists(globalTempDB)) { throw new SparkException( @@ -211,13 +174,13 @@ private[sql] class SharedState( object SharedState extends Logging { @volatile private var fsUrlStreamHandlerFactoryInitialized = false - private def setFsUrlStreamHandlerFactory(conf: SparkConf): Unit = { + private def setFsUrlStreamHandlerFactory(conf: SparkConf, hadoopConf: Configuration): Unit = { if (!fsUrlStreamHandlerFactoryInitialized && conf.get(DEFAULT_URL_STREAM_HANDLER_FACTORY_ENABLED)) { synchronized { if (!fsUrlStreamHandlerFactoryInitialized) { try { - URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()) + URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory(hadoopConf)) fsUrlStreamHandlerFactoryInitialized = true } catch { case NonFatal(_) => @@ -257,4 +220,52 @@ object SharedState extends Logging { throw new IllegalArgumentException(s"Error while instantiating '$className':", e) } } + + /** + * Load hive-site.xml into hadoopConf and determine the warehouse path we want to use, based on + * the config from both hive and Spark SQL. Finally set the warehouse config value to sparkConf. + */ + def loadHiveConfFile( + sparkConf: SparkConf, + hadoopConf: Configuration): Unit = { + def containsInSparkConf(key: String): Boolean = { + sparkConf.contains(key) || sparkConf.contains("spark.hadoop." + key) || + (key.startsWith("hive") && sparkConf.contains("spark." 
+ key)) + } + + val hiveWarehouseKey = "hive.metastore.warehouse.dir" + val configFile = Utils.getContextOrSparkClassLoader.getResourceAsStream("hive-site.xml") + if (configFile != null) { + logInfo(s"loading hive config file: $configFile") + val hadoopConfTemp = new Configuration() + hadoopConfTemp.clear() + hadoopConfTemp.addResource(configFile) + for (entry <- hadoopConfTemp.asScala if !containsInSparkConf(entry.getKey)) { + hadoopConf.set(entry.getKey, entry.getValue) + } + } + // hive.metastore.warehouse.dir only stay in hadoopConf + sparkConf.remove(hiveWarehouseKey) + // Set the Hive metastore warehouse path to the one we use + val hiveWarehouseDir = hadoopConf.get(hiveWarehouseKey) + val warehousePath = if (hiveWarehouseDir != null && !sparkConf.contains(WAREHOUSE_PATH.key)) { + // If hive.metastore.warehouse.dir is set and spark.sql.warehouse.dir is not set, + // we will respect the value of hive.metastore.warehouse.dir. + sparkConf.set(WAREHOUSE_PATH.key, hiveWarehouseDir) + logInfo(s"${WAREHOUSE_PATH.key} is not set, but $hiveWarehouseKey is set. Setting" + + s" ${WAREHOUSE_PATH.key} to the value of $hiveWarehouseKey ('$hiveWarehouseDir').") + hiveWarehouseDir + } else { + // If spark.sql.warehouse.dir is set, we will override hive.metastore.warehouse.dir using + // the value of spark.sql.warehouse.dir. + // When neither spark.sql.warehouse.dir nor hive.metastore.warehouse.dir is set + // we will set hive.metastore.warehouse.dir to the default value of spark.sql.warehouse.dir. + val sparkWarehouseDir = sparkConf.get(WAREHOUSE_PATH) + logInfo(s"Setting $hiveWarehouseKey ('$hiveWarehouseDir') to the value of " + + s"${WAREHOUSE_PATH.key} ('$sparkWarehouseDir').") + hadoopConf.set(hiveWarehouseKey, sparkWarehouseDir) + sparkWarehouseDir + } + logInfo(s"Warehouse path is '$warehousePath'.") + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala index 72284b5996201..72cdc68d56116 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala @@ -25,6 +25,13 @@ import org.apache.spark.sql.types._ private object MsSqlServerDialect extends JdbcDialect { + // Special JDBC types in Microsoft SQL Server. 
+ // https://github.com/microsoft/mssql-jdbc/blob/v7.2.1/src/main/java/microsoft/sql/Types.java + private object SpecificTypes { + val GEOMETRY = -157 + val GEOGRAPHY = -158 + } + override def canHandle(url: String): Boolean = url.toLowerCase(Locale.ROOT).startsWith("jdbc:sqlserver") @@ -40,6 +47,7 @@ private object MsSqlServerDialect extends JdbcDialect { sqlType match { case java.sql.Types.SMALLINT => Some(ShortType) case java.sql.Types.REAL => Some(FloatType) + case SpecificTypes.GEOMETRY | SpecificTypes.GEOGRAPHY => Some(BinaryType) case _ => None } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index c8d8a3392128e..a1ce25a0464c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -57,7 +57,7 @@ private object PostgresDialect extends JdbcDialect { case "int8" | "oid" => Some(LongType) case "float4" => Some(FloatType) case "money" | "float8" => Some(DoubleType) - case "text" | "varchar" | "char" | "cidr" | "inet" | "json" | "jsonb" | "uuid" => + case "text" | "varchar" | "char" | "bpchar" | "cidr" | "inet" | "json" | "jsonb" | "uuid" => Some(StringType) case "bytea" => Some(BinaryType) case "timestamp" | "timestamptz" | "time" | "timetz" => Some(TimestampType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala index 61875931d226e..c0397010acba3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala @@ -49,8 +49,15 @@ package object sql { * Metadata key which is used to write Spark version in the followings: * - Parquet file metadata * - ORC file metadata + * - Avro file metadata * * Note that Hive table property `spark.sql.create.version` also has Spark version. */ private[sql] val SPARK_VERSION_METADATA_KEY = "org.apache.spark.version" + + /** + * Parquet/Avro file metadata key to indicate that the file was written with legacy datetime + * values. + */ + private[sql] val SPARK_LEGACY_DATETIME = "org.apache.spark.legacyDateTime" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 0eb4776988d9f..6b30949a4dc80 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -24,6 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.annotation.Evolving import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.connector.catalog.{SupportsRead, TableProvider} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils @@ -81,8 +82,19 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * * You can set the following option(s): *
-   * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
-   * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
+   * <li>`timeZone` (default session local timezone): sets the string that indicates a time zone ID
+   * to be used to parse timestamps in the JSON/CSV datasources or partition values. The following
+   * formats of `timeZone` are supported:
+   *   <ul>
+   *     <li> Region-based zone ID: It should have the form 'area/city', such as
+   *         'America/Los_Angeles'.</li>
+   *     <li> Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00'
+   *          or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li>
+   *   </ul>
+   * Other short names like 'CST' are not recommended to use because they can be ambiguous.
+   * If it isn't set, the current value of the SQL config `spark.sql.session.timeZone` is
+   * used by default.
+   * </li>
    * </ul>
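For reference, a minimal sketch of how the two supported `timeZone` forms documented above would be passed to a streaming read. The schema, format and input path are hypothetical and not part of this patch:

  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder().master("local[*]").appName("tz-option-sketch").getOrCreate()

  // Region-based zone ID ('area/city' form).
  val byRegion = spark.readStream
    .schema("ts TIMESTAMP, value INT")          // hypothetical schema
    .option("timeZone", "America/Los_Angeles")
    .json("/tmp/input-json")                    // hypothetical path

  // Zone-offset form; 'UTC' and 'Z' are also accepted as aliases of '+00:00'.
  val byOffset = spark.readStream
    .schema("ts TIMESTAMP, value INT")
    .option("timeZone", "-08:00")
    .json("/tmp/input-json")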
      * * @since 2.0.0 @@ -118,8 +130,19 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * * You can set the following option(s): *
-   * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
-   * to be used to parse timestamps in the JSON/CSV data sources or partition values.</li>
+   * <li>`timeZone` (default session local timezone): sets the string that indicates a time zone ID
+   * to be used to parse timestamps in the JSON/CSV datasources or partition values. The following
+   * formats of `timeZone` are supported:
+   *   <ul>
+   *     <li> Region-based zone ID: It should have the form 'area/city', such as
+   *         'America/Los_Angeles'.</li>
+   *     <li> Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00'
+   *          or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li>
+   *   </ul>
+   * Other short names like 'CST' are not recommended to use because they can be ambiguous.
+   * If it isn't set, the current value of the SQL config `spark.sql.session.timeZone` is
+   * used by default.
+   * </li>
    * </ul>
      * * @since 2.0.0 @@ -134,8 +157,19 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * * You can set the following option(s): *
-   * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
-   * to be used to parse timestamps in the JSON/CSV data sources or partition values.</li>
+   * <li>`timeZone` (default session local timezone): sets the string that indicates a time zone ID
+   * to be used to parse timestamps in the JSON/CSV datasources or partition values. The following
+   * formats of `timeZone` are supported:
+   *   <ul>
+   *     <li> Region-based zone ID: It should have the form 'area/city', such as
+   *         'America/Los_Angeles'.</li>
+   *     <li> Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00'
+   *          or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li>
+   *   </ul>
+   * Other short names like 'CST' are not recommended to use because they can be ambiguous.
+   * If it isn't set, the current value of the SQL config `spark.sql.session.timeZone` is
+   * used by default.
+   * </li>
    * </ul>
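A small illustration of the fallback described above: when no per-read `timeZone` option is given, the JSON/CSV parsers fall back to the session time zone from the SQL config. The source path and schema below are hypothetical:

  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder().master("local[*]").appName("session-tz-sketch").getOrCreate()

  // No per-read `timeZone` option is set, so this session-level value applies.
  spark.conf.set("spark.sql.session.timeZone", "UTC")

  val events = spark.readStream
    .schema("ts TIMESTAMP, value INT")   // hypothetical schema
    .json("/tmp/events")                 // hypothetical path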
      * * @since 2.0.0 @@ -177,7 +211,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo case provider: TableProvider if !provider.isInstanceOf[FileDataSourceV2] => val sessionOptions = DataSourceV2Utils.extractSessionConfigs( source = provider, conf = sparkSession.sessionState.conf) - val options = sessionOptions ++ extraOptions + val options = sessionOptions ++ extraOptions.toMap val dsOptions = new CaseInsensitiveStringMap(options.asJava) val table = DataSourceV2Utils.getTableFromProvider(provider, dsOptions, userSpecifiedSchema) import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ @@ -252,12 +286,16 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo *
    * <li>`columnNameOfCorruptRecord` (default is the value specified in
    * `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
    * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
-   * <li>`dateFormat` (default `uuuu-MM-dd`): sets the string that indicates a date format.
-   * Custom date formats follow the formats at `java.time.format.DateTimeFormatter`.
+   * <li>`dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
+   * Custom date formats follow the formats at
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
    * This applies to date type.</li>
-   * <li>`timestampFormat` (default `uuuu-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
+   * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]`): sets the string that
    * indicates a timestamp format. Custom date formats follow the formats at
-   * `java.time.format.DateTimeFormatter`. This applies to timestamp type.</li>
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
+   * This applies to timestamp type.</li>
    * <li>`multiLine` (default `false`): parse one record, which may span multiple lines,
    * per file</li>
    * <li>`lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator
@@ -318,12 +356,16 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    * value.</li>
    * <li>`negativeInf` (default `-Inf`): sets the string representation of a negative infinity
    * value.</li>
-   * <li>`dateFormat` (default `uuuu-MM-dd`): sets the string that indicates a date format.
-   * Custom date formats follow the formats at `java.time.format.DateTimeFormatter`.
+   * <li>`dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
+   * Custom date formats follow the formats at
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
    * This applies to date type.</li>
-   * <li>`timestampFormat` (default `uuuu-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
+   * <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]`): sets the string that
    * indicates a timestamp format. Custom date formats follow the formats at
-   * `java.time.format.DateTimeFormatter`. This applies to timestamp type.</li>
+   * <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">
+   *   Datetime Patterns</a>.
+   * This applies to timestamp type.</li>
    * <li>`maxColumns` (default `20480`): defines a hard limit of how many columns
    * a record can have.</li>
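A hedged sketch of passing the `dateFormat`/`timestampFormat` options documented in this list to a streaming CSV read. The patterns shown are simply the newly documented defaults spelled out explicitly; the schema and input path are hypothetical and not part of this patch:

  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder().master("local[*]").appName("datetime-pattern-sketch").getOrCreate()

  val csvStream = spark.readStream
    .schema("d DATE, ts TIMESTAMP, value INT")                     // hypothetical schema
    .option("header", "true")
    .option("dateFormat", "yyyy-MM-dd")                            // documented default
    .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]") // documented default
    .csv("/tmp/input-csv")                                         // hypothetical path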
    • `maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed @@ -479,5 +521,5 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo private var userSpecifiedSchema: Option[StructType] = None - private var extraOptions = new scala.collection.mutable.HashMap[String, String] + private var extraOptions = CaseInsensitiveMap[String](Map.empty) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 1c21a30dd5bd6..07ab4008ca63f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -26,6 +26,7 @@ import org.apache.spark.annotation.Evolving import org.apache.spark.api.java.function.VoidFunction2 import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.connector.catalog.{SupportsWrite, TableProvider} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils @@ -161,8 +162,19 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { * * You can set the following option(s): *
-   * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
-   * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
+   * <li>`timeZone` (default session local timezone): sets the string that indicates a time zone ID
+   * to be used to format timestamps in the JSON/CSV datasources or partition values. The following
+   * formats of `timeZone` are supported:
+   *   <ul>
+   *     <li> Region-based zone ID: It should have the form 'area/city', such as
+   *         'America/Los_Angeles'.</li>
+   *     <li> Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00'
+   *          or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li>
+   *   </ul>
+   * Other short names like 'CST' are not recommended to use because they can be ambiguous.
+   * If it isn't set, the current value of the SQL config `spark.sql.session.timeZone` is
+   * used by default.
+   * </li>
    * </ul>
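As a rough usage sketch for the writer-side option above (formatting, rather than parsing, timestamps). The source, sink format, checkpoint location and output path are all hypothetical placeholders:

  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder().master("local[*]").appName("write-tz-sketch").getOrCreate()

  // Hypothetical streaming source; only here so the write below has something to format.
  val events = spark.readStream.schema("ts TIMESTAMP, value INT").json("/tmp/input-json")

  val query = events.writeStream
    .format("json")
    .option("timeZone", "+01:00")                      // zone-offset form; 'UTC'/'Z' also work
    .option("checkpointLocation", "/tmp/checkpoints")  // hypothetical path
    .start("/tmp/output-json")                         // hypothetical path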
      * * @since 2.0.0 @@ -198,8 +210,19 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { * * You can set the following option(s): *
-   * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
-   * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
+   * <li>`timeZone` (default session local timezone): sets the string that indicates a time zone ID
+   * to be used to format timestamps in the JSON/CSV datasources or partition values. The following
+   * formats of `timeZone` are supported:
+   *   <ul>
+   *     <li> Region-based zone ID: It should have the form 'area/city', such as
+   *         'America/Los_Angeles'.</li>
+   *     <li> Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00'
+   *          or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li>
+   *   </ul>
+   * Other short names like 'CST' are not recommended to use because they can be ambiguous.
+   * If it isn't set, the current value of the SQL config `spark.sql.session.timeZone` is
+   * used by default.
+   * </li>
    * </ul>
      * * @since 2.0.0 @@ -214,8 +237,19 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { * * You can set the following option(s): *
-   * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
-   * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
+   * <li>`timeZone` (default session local timezone): sets the string that indicates a time zone ID
+   * to be used to format timestamps in the JSON/CSV datasources or partition values. The following
+   * formats of `timeZone` are supported:
+   *   <ul>
+   *     <li> Region-based zone ID: It should have the form 'area/city', such as
+   *         'America/Los_Angeles'.</li>
+   *     <li> Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00'
+   *          or '+01:00'. Also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li>
+   *   </ul>
+   * Other short names like 'CST' are not recommended to use because they can be ambiguous.
+   * If it isn't set, the current value of the SQL config `spark.sql.session.timeZone` is
+   * used by default.
+   * </li>
    * </ul>
      * * @since 2.0.0 @@ -316,7 +350,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { val provider = cls.getConstructor().newInstance().asInstanceOf[TableProvider] val sessionOptions = DataSourceV2Utils.extractSessionConfigs( source = provider, conf = df.sparkSession.sessionState.conf) - val options = sessionOptions ++ extraOptions + val options = sessionOptions ++ extraOptions.toMap val dsOptions = new CaseInsensitiveStringMap(options.asJava) val table = DataSourceV2Utils.getTableFromProvider( provider, dsOptions, userSpecifiedSchema = None) @@ -439,7 +473,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { private var trigger: Trigger = Trigger.ProcessingTime(0L) - private var extraOptions = new scala.collection.mutable.HashMap[String, String] + private var extraOptions = CaseInsensitiveMap[String](Map.empty) private var foreachWriter: ForeachWriter[T] = null diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala index dd842cd1a3e99..7ae38c71a005f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala @@ -82,7 +82,7 @@ object StreamingQueryListener { * @param id A unique query id that persists across restarts. See `StreamingQuery.id()`. * @param runId A query id that is unique for every start/restart. See `StreamingQuery.runId()`. * @param name User-specified name of the query, null if not specified. - * @param submissionTime The timestamp to start a query. + * @param timestamp The timestamp to start a query. * @since 2.1.0 */ @Evolving @@ -90,7 +90,7 @@ object StreamingQueryListener { val id: UUID, val runId: UUID, val name: String, - val submissionTime: Long) extends Event + val timestamp: String) extends Event /** * Event representing any progress updates in a query. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 4d0d8ffd959c6..0fe2d0be966d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -58,8 +58,17 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo private val activeQueriesSharedLock = sparkSession.sharedState.activeQueriesLock private val awaitTerminationLock = new Object + /** + * Track the last terminated query and remember the last failure since the creation of the + * context, or since `resetTerminated()` was called. There are three possible values: + * + * - null: no query has been been terminated. + * - None: some queries have been terminated and no one has failed. + * - Some(StreamingQueryException): Some queries have been terminated and at least one query has + * failed. The exception is the exception of the last failed query. 
+ */ @GuardedBy("awaitTerminationLock") - private var lastTerminatedQuery: StreamingQuery = null + private var lastTerminatedQueryException: Option[StreamingQueryException] = null try { sparkSession.sparkContext.conf.get(STREAMING_QUERY_LISTENERS).foreach { classNames => @@ -125,11 +134,11 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo @throws[StreamingQueryException] def awaitAnyTermination(): Unit = { awaitTerminationLock.synchronized { - while (lastTerminatedQuery == null) { + while (lastTerminatedQueryException == null) { awaitTerminationLock.wait(10) } - if (lastTerminatedQuery != null && lastTerminatedQuery.exception.nonEmpty) { - throw lastTerminatedQuery.exception.get + if (lastTerminatedQueryException != null && lastTerminatedQueryException.nonEmpty) { + throw lastTerminatedQueryException.get } } } @@ -164,13 +173,13 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo } awaitTerminationLock.synchronized { - while (!isTimedout && lastTerminatedQuery == null) { + while (!isTimedout && lastTerminatedQueryException == null) { awaitTerminationLock.wait(10) } - if (lastTerminatedQuery != null && lastTerminatedQuery.exception.nonEmpty) { - throw lastTerminatedQuery.exception.get + if (lastTerminatedQueryException != null && lastTerminatedQueryException.nonEmpty) { + throw lastTerminatedQueryException.get } - lastTerminatedQuery != null + lastTerminatedQueryException != null } } @@ -182,7 +191,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo */ def resetTerminated(): Unit = { awaitTerminationLock.synchronized { - lastTerminatedQuery = null + lastTerminatedQueryException = null } } @@ -422,8 +431,8 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo private[sql] def notifyQueryTermination(terminatedQuery: StreamingQuery): Unit = { unregisterTerminatedStream(terminatedQuery) awaitTerminationLock.synchronized { - if (lastTerminatedQuery == null || terminatedQuery.exception.nonEmpty) { - lastTerminatedQuery = terminatedQuery + if (lastTerminatedQueryException == null || terminatedQuery.exception.nonEmpty) { + lastTerminatedQueryException = terminatedQuery.exception } awaitTerminationLock.notifyAll() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala index 650f64fe1688c..43b93a3654f4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala @@ -17,22 +17,18 @@ package org.apache.spark.sql.streaming.ui -import java.text.SimpleDateFormat import javax.servlet.http.HttpServletRequest import scala.xml.Node -import org.apache.commons.lang3.StringEscapeUtils +import org.apache.commons.text.StringEscapeUtils import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone import org.apache.spark.sql.streaming.ui.UIUtils._ import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} private[ui] class StreamingQueryPage(parent: StreamingQueryTab) extends WebUIPage("") with Logging { - val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") - df.setTimeZone(getTimeZone("UTC")) override def render(request: HttpServletRequest): Seq[Node] = { val content = generateStreamingQueryTable(request) @@ -61,12 +57,12 @@ private[ui] class StreamingQueryPage(parent: 
StreamingQueryTab) val name = UIUtils.getQueryName(query) val status = UIUtils.getQueryStatus(query) val duration = if (queryActive) { - SparkUIUtils.formatDurationVerbose(System.currentTimeMillis() - query.submissionTime) + System.currentTimeMillis() - query.startTimestamp } else { withNoProgress(query, { val endTimeMs = query.lastProgress.timestamp - SparkUIUtils.formatDurationVerbose(df.parse(endTimeMs).getTime - query.submissionTime) - }, "-") + parseProgressTimestamp(endTimeMs) - query.startTimestamp + }, 0) } @@ -74,8 +70,10 @@ private[ui] class StreamingQueryPage(parent: StreamingQueryTab) {status} {query.id} {query.runId} - {SparkUIUtils.formatDate(query.submissionTime)} - {duration} + {SparkUIUtils.formatDate(query.startTimestamp)} + + {SparkUIUtils.formatDurationVerbose(duration)} + {withNoProgress(query, { (query.recentProgress.map(p => withNumberInvalid(p.inputRowsPerSecond)).sum / query.recentProgress.length).formatted("%.2f") }, "NaN")} @@ -94,29 +92,38 @@ private[ui] class StreamingQueryPage(parent: StreamingQueryTab) .partition(_.isActive) val activeQueryTables = if (activeQueries.nonEmpty) { val headerRow = Seq( - "Name", "Status", "Id", "Run ID", "Submitted Time", "Duration", "Avg Input /sec", + "Name", "Status", "Id", "Run ID", "Start Time", "Duration", "Avg Input /sec", "Avg Process /sec", "Lastest Batch") + val headerCss = Seq("", "", "", "", "", "sorttable_numeric", "sorttable_numeric", + "sorttable_numeric", "") + // header classes size must be equal to header row size + assert(headerRow.size == headerCss.size) + Some(SparkUIUtils.listingTable(headerRow, generateDataRow(request, queryActive = true), - activeQueries, true, None, Seq(null), false)) + activeQueries, true, Some("activeQueries-table"), headerCss, false)) } else { None } val inactiveQueryTables = if (inactiveQueries.nonEmpty) { val headerRow = Seq( - "Name", "Status", "Id", "Run ID", "Submitted Time", "Duration", "Avg Input /sec", + "Name", "Status", "Id", "Run ID", "Start Time", "Duration", "Avg Input /sec", "Avg Process /sec", "Lastest Batch", "Error") + val headerCss = Seq("", "", "", "", "", "sorttable_numeric", "sorttable_numeric", + "sorttable_numeric", "", "") + assert(headerRow.size == headerCss.size) + Some(SparkUIUtils.listingTable(headerRow, generateDataRow(request, queryActive = false), - inactiveQueries, true, None, Seq(null), false)) + inactiveQueries, true, Some("completedQueries-table"), headerCss, false)) } else { None } // scalastyle:off val content = -
      diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala index 56672ce328bff..227e5e5af3983 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala @@ -19,21 +19,17 @@ package org.apache.spark.sql.streaming.ui import java.{util => ju} import java.lang.{Long => JLong} -import java.text.SimpleDateFormat import java.util.UUID import javax.servlet.http.HttpServletRequest import scala.xml.{Node, Unparsed} import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone import org.apache.spark.sql.streaming.ui.UIUtils._ import org.apache.spark.ui.{GraphUIData, JsCollector, UIUtils => SparkUIUtils, WebUIPage} private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) extends WebUIPage("statistics") with Logging { - val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") - df.setTimeZone(getTimeZone("UTC")) def generateLoadResources(request: HttpServletRequest): Seq[Node] = { // scalastyle:off @@ -70,11 +66,30 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) } - def generateVar(values: Array[(Long, ju.Map[String, JLong])]): Seq[Node] = { + def generateTimeTipStrings(values: Array[(Long, Long)]): Seq[Node] = { + val js = "var timeTipStrings = {};\n" + values.map { case (batchId, time) => + val formattedTime = SparkUIUtils.formatBatchTime(time, 1, showYYYYMMSS = false) + s"timeTipStrings[$time] = 'batch $batchId ($formattedTime)';" + }.mkString("\n") + + + } + + def generateFormattedTimeTipStrings(values: Array[(Long, Long)]): Seq[Node] = { + val js = "var formattedTimeTipStrings = {};\n" + values.map { case (batchId, time) => + val formattedTime = SparkUIUtils.formatBatchTime(time, 1, showYYYYMMSS = false) + s"""formattedTimeTipStrings["$formattedTime"] = 'batch $batchId ($formattedTime)';""" + }.mkString("\n") + + + } + + def generateTimeToValues(values: Array[(Long, ju.Map[String, JLong])]): Seq[Node] = { val durationDataPadding = SparkUIUtils.durationDataPadding(values) - val js = "var timeToValues = {};\n" + durationDataPadding.map { case (x, y) => + val js = "var formattedTimeToValues = {};\n" + durationDataPadding.map { case (x, y) => val s = y.toSeq.sortBy(_._1).map(e => s""""${e._2}"""").mkString("[", ",", "]") - s"""timeToValues["${SparkUIUtils.formatBatchTime(x, 1, showYYYYMMSS = false)}"] = $s;""" + val formattedTime = SparkUIUtils.formatBatchTime(x, 1, showYYYYMMSS = false) + s"""formattedTimeToValues["$formattedTime"] = $s;""" }.mkString("\n") @@ -82,13 +97,13 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) def generateBasicInfo(query: StreamingQueryUIData): Seq[Node] = { val duration = if (query.isActive) { - SparkUIUtils.formatDurationVerbose(System.currentTimeMillis() - query.submissionTime) + SparkUIUtils.formatDurationVerbose(System.currentTimeMillis() - query.startTimestamp) } else { withNoProgress(query, { val end = query.lastProgress.timestamp val start = query.recentProgress.head.timestamp SparkUIUtils.formatDurationVerbose( - df.parse(end).getTime - df.parse(start).getTime) + parseProgressTimestamp(end) - parseProgressTimestamp(start)) }, "-") } @@ -100,7 +115,7 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) since - 
{SparkUIUtils.formatDate(query.submissionTime)} + {SparkUIUtils.formatDate(query.startTimestamp)} ({numBatches} completed batches) @@ -112,12 +127,14 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) } def generateStatTable(query: StreamingQueryUIData): Seq[Node] = { - val batchTimes = withNoProgress(query, - query.recentProgress.map(p => df.parse(p.timestamp).getTime), Array.empty[Long]) + val batchToTimestamps = withNoProgress(query, + query.recentProgress.map(p => (p.batchId, parseProgressTimestamp(p.timestamp))), + Array.empty[(Long, Long)]) + val batchTimes = batchToTimestamps.map(_._2) val minBatchTime = - withNoProgress(query, df.parse(query.recentProgress.head.timestamp).getTime, 0L) + withNoProgress(query, parseProgressTimestamp(query.recentProgress.head.timestamp), 0L) val maxBatchTime = - withNoProgress(query, df.parse(query.lastProgress.timestamp).getTime, 0L) + withNoProgress(query, parseProgressTimestamp(query.lastProgress.timestamp), 0L) val maxRecordRate = withNoProgress(query, query.recentProgress.map(_.inputRowsPerSecond).max, 0L) val minRecordRate = 0L @@ -131,22 +148,26 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) val minBatchDuration = 0L val inputRateData = withNoProgress(query, - query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + query.recentProgress.map(p => (parseProgressTimestamp(p.timestamp), withNumberInvalid { p.inputRowsPerSecond })), Array.empty[(Long, Double)]) val processRateData = withNoProgress(query, - query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + query.recentProgress.map(p => (parseProgressTimestamp(p.timestamp), withNumberInvalid { p.processedRowsPerSecond })), Array.empty[(Long, Double)]) val inputRowsData = withNoProgress(query, - query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + query.recentProgress.map(p => (parseProgressTimestamp(p.timestamp), withNumberInvalid { p.numInputRows })), Array.empty[(Long, Double)]) val batchDurations = withNoProgress(query, - query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + query.recentProgress.map(p => (parseProgressTimestamp(p.timestamp), withNumberInvalid { p.batchDuration })), Array.empty[(Long, Double)]) - val operationDurationData = withNoProgress(query, query.recentProgress.map { p => - val durationMs = p.durationMs - // remove "triggerExecution" as it count the other operation duration. - durationMs.remove("triggerExecution") - (df.parse(p.timestamp).getTime, durationMs)}, Array.empty[(Long, ju.Map[String, JLong])]) + val operationDurationData = withNoProgress( + query, + query.recentProgress.map { p => + val durationMs = p.durationMs + // remove "triggerExecution" as it count the other operation duration. + durationMs.remove("triggerExecution") + (parseProgressTimestamp(p.timestamp), durationMs) + }, + Array.empty[(Long, ju.Map[String, JLong])]) val jsCollector = new JsCollector val graphUIDataForInputRate = @@ -208,14 +229,15 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) 0L, "ms") - val table = - // scalastyle:off + val table = if (query.lastProgress != null) { + // scalastyle:off - + + @@ -264,8 +286,16 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab)
      TimelinesHistograms
      Histograms
      - // scalastyle:on + } else { +
      + No visualization information available. +
      + // scalastyle:on + } - generateVar(operationDurationData) ++ generateTimeMap(batchTimes) ++ table ++ jsCollector.toHtml + generateTimeToValues(operationDurationData) ++ + generateFormattedTimeTipStrings(batchToTimestamps) ++ + generateTimeMap(batchTimes) ++ generateTimeTipStrings(batchToTimestamps) ++ + table ++ jsCollector.toHtml } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala index db085dbe87ec4..e331083b30024 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala @@ -17,26 +17,23 @@ package org.apache.spark.sql.streaming.ui -import java.text.SimpleDateFormat import java.util.UUID import java.util.concurrent.ConcurrentHashMap import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.SparkConf +import org.apache.spark.sql.internal.StaticSQLConf import org.apache.spark.sql.streaming.{StreamingQueryListener, StreamingQueryProgress} +import org.apache.spark.sql.streaming.ui.UIUtils.parseProgressTimestamp /** * A customized StreamingQueryListener used in structured streaming UI, which contains all * UI data for both active and inactive query. * TODO: Add support for history server. */ -private[sql] class StreamingQueryStatusListener(sqlConf: SQLConf) extends StreamingQueryListener { - - private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 - timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC")) +private[sql] class StreamingQueryStatusListener(conf: SparkConf) extends StreamingQueryListener { /** * We use runId as the key here instead of id in active query status map, @@ -45,16 +42,18 @@ private[sql] class StreamingQueryStatusListener(sqlConf: SQLConf) extends Stream private[ui] val activeQueryStatus = new ConcurrentHashMap[UUID, StreamingQueryUIData]() private[ui] val inactiveQueryStatus = new mutable.Queue[StreamingQueryUIData]() - private val streamingProgressRetention = sqlConf.streamingProgressRetention - private val inactiveQueryStatusRetention = sqlConf.streamingUIInactiveQueryRetention + private val streamingProgressRetention = + conf.get(StaticSQLConf.STREAMING_UI_RETAINED_PROGRESS_UPDATES) + private val inactiveQueryStatusRetention = conf.get(StaticSQLConf.STREAMING_UI_RETAINED_QUERIES) override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = { + val startTimestamp = parseProgressTimestamp(event.timestamp) activeQueryStatus.putIfAbsent(event.runId, - new StreamingQueryUIData(event.name, event.id, event.runId, event.submissionTime)) + new StreamingQueryUIData(event.name, event.id, event.runId, startTimestamp)) } override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { - val batchTimestamp = timestampFormat.parse(event.progress.timestamp).getTime + val batchTimestamp = parseProgressTimestamp(event.progress.timestamp) val queryStatus = activeQueryStatus.getOrDefault( event.progress.runId, new StreamingQueryUIData(event.progress.name, event.progress.id, event.progress.runId, @@ -87,7 +86,7 @@ private[ui] class StreamingQueryUIData( val name: String, val id: UUID, val runId: UUID, - val submissionTime: Long) { + val startTimestamp: 
Long) { /** Holds the most recent query progress updates. */ private val progressBuffer = new mutable.Queue[StreamingQueryProgress]() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala index f909cfd97514e..bb097ffc06912 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala @@ -34,6 +34,6 @@ private[sql] class StreamingQueryTab( parent.addStaticHandler(StreamingQueryTab.STATIC_RESOURCE_DIR, "/static/sql") } -object StreamingQueryTab { +private[sql] object StreamingQueryTab { private val STATIC_RESOURCE_DIR = "org/apache/spark/sql/execution/ui/static" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala index 57b9dec81f28a..cdad5ed9942b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql.streaming.ui +import java.text.SimpleDateFormat +import java.util.Locale + +import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone + private[ui] object UIUtils { /** @@ -57,4 +62,16 @@ private[ui] object UIUtils { query.exception.map(_ => "FAILED").getOrElse("FINISHED") } } + + private val progressTimestampFormat = new ThreadLocal[SimpleDateFormat]() { + override def initialValue(): SimpleDateFormat = { + val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 + format.setTimeZone(getTimeZone("UTC")) + format + } + } + + def parseProgressTimestamp(timestamp: String): Long = { + progressTimestampFormat.get.parse(timestamp).getTime + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala index 01f81825f6bfd..0b5951ec2ac97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala @@ -23,7 +23,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.{QueryExecution, QueryExecutionException} import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.util.{ListenerBus, Utils} @@ -55,12 +55,13 @@ trait QueryExecutionListener { * @param funcName the name of the action that triggered this query. * @param qe the QueryExecution object that carries detail information like logical plan, * physical plan, etc. - * @param error the error that failed this query. - * + * @param exception the exception that failed this query. If `java.lang.Error` is thrown during + * execution, it will be wrapped with an `Exception` and it can be accessed by + * `exception.getCause`. * @note This can be invoked by multiple different threads. 
*/ @DeveloperApi - def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit + def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit } @@ -140,7 +141,14 @@ private[sql] class ExecutionListenerBus(session: SparkSession) val funcName = event.executionName.get event.executionFailure match { case Some(ex) => - listener.onFailure(funcName, event.qe, ex) + val exception = ex match { + case e: Exception => e + case other: Throwable => + val message = "Hit an error when executing a query" + + (if (other.getMessage == null) "" else s": ${other.getMessage}") + new QueryExecutionException(message, other) + } + listener.onFailure(funcName, event.qe, exception) case _ => listener.onSuccess(funcName, event.qe, event.duration) } diff --git a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/SqlResource.scala b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/SqlResource.scala deleted file mode 100644 index 346e07f2bef15..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/SqlResource.scala +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.status.api.v1.sql - -import java.util.Date -import javax.ws.rs._ -import javax.ws.rs.core.MediaType - -import org.apache.spark.JobExecutionStatus -import org.apache.spark.sql.execution.ui.{SQLAppStatusStore, SQLExecutionUIData, SQLPlanMetric} -import org.apache.spark.status.api.v1.{BaseAppResource, NotFoundException} - -@Produces(Array(MediaType.APPLICATION_JSON)) -private[v1] class SqlResource extends BaseAppResource { - - @GET - def sqlList( - @DefaultValue("false") @QueryParam("details") details: Boolean, - @DefaultValue("0") @QueryParam("offset") offset: Int, - @DefaultValue("20") @QueryParam("length") length: Int): Seq[ExecutionData] = { - withUI { ui => - val sqlStore = new SQLAppStatusStore(ui.store.store) - sqlStore.executionsList(offset, length).map(prepareExecutionData(_, details)) - } - } - - @GET - @Path("{executionId:\\d+}") - def sql( - @PathParam("executionId") execId: Long, - @DefaultValue("false") @QueryParam("details") details: Boolean): ExecutionData = { - withUI { ui => - val sqlStore = new SQLAppStatusStore(ui.store.store) - sqlStore - .execution(execId) - .map(prepareExecutionData(_, details)) - .getOrElse(throw new NotFoundException("unknown id: " + execId)) - } - } - - private def printableMetrics( - metrics: Seq[SQLPlanMetric], - metricValues: Map[Long, String]): Seq[Metrics] = { - metrics.map(metric => - Metrics(metric.name, metricValues.get(metric.accumulatorId).getOrElse(""))) - } - - private def prepareExecutionData(exec: SQLExecutionUIData, details: Boolean): ExecutionData = { - var running = Seq[Int]() - var completed = Seq[Int]() - var failed = Seq[Int]() - - exec.jobs.foreach { - case (id, JobExecutionStatus.RUNNING) => - running = running :+ id - case (id, JobExecutionStatus.SUCCEEDED) => - completed = completed :+ id - case (id, JobExecutionStatus.FAILED) => - failed = failed :+ id - case _ => - } - - val status = if (exec.jobs.size == completed.size) { - "COMPLETED" - } else if (failed.nonEmpty) { - "FAILED" - } else { - "RUNNING" - } - - val duration = exec.completionTime.getOrElse(new Date()).getTime - exec.submissionTime - val planDetails = if (details) exec.physicalPlanDescription else "" - val metrics = if (details) printableMetrics(exec.metrics, exec.metricValues) else Seq.empty - new ExecutionData( - exec.executionId, - status, - exec.description, - planDetails, - metrics, - new Date(exec.submissionTime), - duration, - running, - completed, - failed) - } -} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java index 5603cb988b8e7..af0a22b036030 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java @@ -18,6 +18,8 @@ package test.org.apache.spark.sql; import java.io.Serializable; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; import java.time.Instant; import java.time.LocalDate; import java.util.*; @@ -210,6 +212,17 @@ private static Row createRecordSpark22000Row(Long index) { return new GenericRow(values); } + private static String timestampToString(Timestamp ts) { + String timestampString = String.valueOf(ts); + String formatted = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(ts); + + if (timestampString.length() > 19 && !timestampString.substring(19).equals(".0")) { + return formatted + timestampString.substring(19); + } else { + return 
formatted; + } + } + private static RecordSpark22000 createRecordSpark22000(Row recordRow) { RecordSpark22000 record = new RecordSpark22000(); record.setShortField(String.valueOf(recordRow.getShort(0))); @@ -219,7 +232,7 @@ private static RecordSpark22000 createRecordSpark22000(Row recordRow) { record.setDoubleField(String.valueOf(recordRow.getDouble(4))); record.setStringField(recordRow.getString(5)); record.setBooleanField(String.valueOf(recordRow.getBoolean(6))); - record.setTimestampField(String.valueOf(recordRow.getTimestamp(7))); + record.setTimestampField(timestampToString(recordRow.getTimestamp(7))); // This would figure out that null value will not become "null". record.setNullIntField(null); return record; diff --git a/sql/core/src/test/resources/hive-site.xml b/sql/core/src/test/resources/hive-site.xml index 17297b3e22a7e..4bf6189b73ca9 100644 --- a/sql/core/src/test/resources/hive-site.xml +++ b/sql/core/src/test/resources/hive-site.xml @@ -23,4 +23,9 @@ true Internal marker for test. + + hadoop.tmp.dir + /tmp/hive_one + default is /tmp/hadoop-${user.name} and will be overridden + diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md new file mode 100644 index 0000000000000..b84abe5afc1ef --- /dev/null +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -0,0 +1,336 @@ + +## Summary + - Number of queries: 328 + - Number of expressions that missing example: 34 + - Expressions missing examples: and,string,tinyint,double,smallint,date,decimal,boolean,float,binary,bigint,int,timestamp,struct,cume_dist,dense_rank,input_file_block_length,input_file_block_start,input_file_name,lag,lead,monotonically_increasing_id,ntile,!,not,or,percent_rank,rank,row_number,spark_partition_id,version,window,positive,count_min_sketch +## Schema of Built-in Functions +| Class name | Function name or alias | Query example | Output schema | +| ---------- | ---------------------- | ------------- | ------------- | +| org.apache.spark.sql.catalyst.expressions.Abs | abs | SELECT abs(-1) | struct | +| org.apache.spark.sql.catalyst.expressions.Acos | acos | SELECT acos(1) | struct | +| org.apache.spark.sql.catalyst.expressions.Acosh | acosh | SELECT acosh(1) | struct | +| org.apache.spark.sql.catalyst.expressions.Add | + | SELECT 1 + 2 | struct<(1 + 2):int> | +| org.apache.spark.sql.catalyst.expressions.AddMonths | add_months | SELECT add_months('2016-08-31', 1) | struct | +| org.apache.spark.sql.catalyst.expressions.And | and | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.ArrayAggregate | aggregate | SELECT aggregate(array(1, 2, 3), 0, (acc, x) -> acc + x) | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayContains | array_contains | SELECT array_contains(array(1, 2, 3), 2) | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayDistinct | array_distinct | SELECT array_distinct(array(1, 2, 3, null, 3)) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArrayExcept | array_except | SELECT array_except(array(1, 2, 3), array(1, 3, 5)) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArrayExists | exists | SELECT exists(array(1, 2, 3), x -> x % 2 == 0) | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayFilter | filter | SELECT filter(array(1, 2, 3), x -> x % 2 == 1) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArrayForAll | forall | SELECT forall(array(1, 2, 3), x -> x % 2 == 0) | struct | +| 
org.apache.spark.sql.catalyst.expressions.ArrayIntersect | array_intersect | SELECT array_intersect(array(1, 2, 3), array(1, 3, 5)) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArrayJoin | array_join | SELECT array_join(array('hello', 'world'), ' ') | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayMax | array_max | SELECT array_max(array(1, 20, null, 3)) | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayMin | array_min | SELECT array_min(array(1, 20, null, 3)) | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayPosition | array_position | SELECT array_position(array(3, 2, 1), 1) | struct | +| org.apache.spark.sql.catalyst.expressions.ArrayRemove | array_remove | SELECT array_remove(array(1, 2, 3, null, 3), 3) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArrayRepeat | array_repeat | SELECT array_repeat('123', 2) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArraySort | array_sort | SELECT array_sort(array(5, 6, 1), (left, right) -> case when left < right then -1 when left > right then 1 else 0 end) | struct namedlambdavariable()) THEN 1 ELSE 0 END, namedlambdavariable(), namedlambdavariable())):array> | +| org.apache.spark.sql.catalyst.expressions.ArrayTransform | transform | SELECT transform(array(1, 2, 3), x -> x + 1) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArrayUnion | array_union | SELECT array_union(array(1, 2, 3), array(1, 3, 5)) | struct> | +| org.apache.spark.sql.catalyst.expressions.ArraysOverlap | arrays_overlap | SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5)) | struct | +| org.apache.spark.sql.catalyst.expressions.ArraysZip | arrays_zip | SELECT arrays_zip(array(1, 2, 3), array(2, 3, 4)) | struct>> | +| org.apache.spark.sql.catalyst.expressions.Ascii | ascii | SELECT ascii('222') | struct | +| org.apache.spark.sql.catalyst.expressions.Asin | asin | SELECT asin(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Asinh | asinh | SELECT asinh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.AssertTrue | assert_true | SELECT assert_true(0 < 1) | struct | +| org.apache.spark.sql.catalyst.expressions.Atan | atan | SELECT atan(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Atan2 | atan2 | SELECT atan2(0, 0) | struct | +| org.apache.spark.sql.catalyst.expressions.Atanh | atanh | SELECT atanh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.BRound | bround | SELECT bround(2.5, 0) | struct | +| org.apache.spark.sql.catalyst.expressions.Base64 | base64 | SELECT base64('Spark SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.Bin | bin | SELECT bin(13) | struct | +| org.apache.spark.sql.catalyst.expressions.BitLength | bit_length | SELECT bit_length('Spark SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.BitwiseAnd | & | SELECT 3 & 5 | struct<(3 & 5):int> | +| org.apache.spark.sql.catalyst.expressions.BitwiseCount | bit_count | SELECT bit_count(0) | struct | +| org.apache.spark.sql.catalyst.expressions.BitwiseNot | ~ | SELECT ~ 0 | struct<~0:int> | +| org.apache.spark.sql.catalyst.expressions.BitwiseOr | | | SELECT 3 | 5 | struct<(3 | 5):int> | +| org.apache.spark.sql.catalyst.expressions.BitwiseXor | ^ | SELECT 3 ^ 5 | struct<(3 ^ 5):int> | +| org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection | java_method | SELECT java_method('java.util.UUID', 'randomUUID') | struct | +| org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection | reflect | SELECT reflect('java.util.UUID', 'randomUUID') | struct | +| 
org.apache.spark.sql.catalyst.expressions.CaseWhen | when | SELECT CASE WHEN 1 > 0 THEN 1 WHEN 2 > 0 THEN 2.0 ELSE 1.2 END | struct 0) THEN CAST(1 AS DECIMAL(11,1)) WHEN (2 > 0) THEN CAST(2.0 AS DECIMAL(11,1)) ELSE CAST(1.2 AS DECIMAL(11,1)) END:decimal(11,1)> | +| org.apache.spark.sql.catalyst.expressions.Cast | string | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | cast | SELECT cast('10' as int) | struct | +| org.apache.spark.sql.catalyst.expressions.Cast | tinyint | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | double | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | smallint | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | date | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | decimal | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | boolean | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | float | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | binary | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | bigint | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | int | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cast | timestamp | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Cbrt | cbrt | SELECT cbrt(27.0) | struct | +| org.apache.spark.sql.catalyst.expressions.Ceil | ceil | SELECT ceil(-0.1) | struct | +| org.apache.spark.sql.catalyst.expressions.Ceil | ceiling | SELECT ceiling(-0.1) | struct | +| org.apache.spark.sql.catalyst.expressions.Chr | char | SELECT char(65) | struct | +| org.apache.spark.sql.catalyst.expressions.Chr | chr | SELECT chr(65) | struct | +| org.apache.spark.sql.catalyst.expressions.Coalesce | coalesce | SELECT coalesce(NULL, 1, NULL) | struct | +| org.apache.spark.sql.catalyst.expressions.Concat | concat | SELECT concat('Spark', 'SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.ConcatWs | concat_ws | SELECT concat_ws(' ', 'Spark', 'SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.Conv | conv | SELECT conv('100', 2, 10) | struct | +| org.apache.spark.sql.catalyst.expressions.Cos | cos | SELECT cos(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Cosh | cosh | SELECT cosh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Cot | cot | SELECT cot(1) | struct | +| org.apache.spark.sql.catalyst.expressions.Crc32 | crc32 | SELECT crc32('Spark') | struct | +| org.apache.spark.sql.catalyst.expressions.CreateArray | array | SELECT array(1, 2, 3) | struct> | +| org.apache.spark.sql.catalyst.expressions.CreateMap | map | SELECT map(1.0, '2', 3.0, '4') | struct> | +| org.apache.spark.sql.catalyst.expressions.CreateNamedStruct | named_struct | SELECT named_struct("a", 1, "b", 2, "c", 3) | struct> | +| org.apache.spark.sql.catalyst.expressions.CreateNamedStruct | struct | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.CsvToStructs | from_csv | SELECT from_csv('1, 0.8', 'a INT, b DOUBLE') | struct> | +| org.apache.spark.sql.catalyst.expressions.Cube | cube | SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY cube(name, age) | struct | +| org.apache.spark.sql.catalyst.expressions.CumeDist | cume_dist | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.CurrentDatabase | current_database | SELECT current_database() | struct | +| org.apache.spark.sql.catalyst.expressions.CurrentDate | current_date | SELECT current_date() | struct | +| org.apache.spark.sql.catalyst.expressions.CurrentTimestamp | 
current_timestamp | SELECT current_timestamp() | struct | +| org.apache.spark.sql.catalyst.expressions.DateAdd | date_add | SELECT date_add('2016-07-30', 1) | struct | +| org.apache.spark.sql.catalyst.expressions.DateDiff | datediff | SELECT datediff('2009-07-31', '2009-07-30') | struct | +| org.apache.spark.sql.catalyst.expressions.DateFormatClass | date_format | SELECT date_format('2016-04-08', 'y') | struct | +| org.apache.spark.sql.catalyst.expressions.DatePart | date_part | SELECT date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456') | struct | +| org.apache.spark.sql.catalyst.expressions.DateSub | date_sub | SELECT date_sub('2016-07-30', 1) | struct | +| org.apache.spark.sql.catalyst.expressions.DayOfMonth | day | SELECT day('2009-07-30') | struct | +| org.apache.spark.sql.catalyst.expressions.DayOfMonth | dayofmonth | SELECT dayofmonth('2009-07-30') | struct | +| org.apache.spark.sql.catalyst.expressions.DayOfWeek | dayofweek | SELECT dayofweek('2009-07-30') | struct | +| org.apache.spark.sql.catalyst.expressions.DayOfYear | dayofyear | SELECT dayofyear('2016-04-09') | struct | +| org.apache.spark.sql.catalyst.expressions.Decode | decode | SELECT decode(encode('abc', 'utf-8'), 'utf-8') | struct | +| org.apache.spark.sql.catalyst.expressions.DenseRank | dense_rank | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Divide | / | SELECT 3 / 2 | struct<(CAST(3 AS DOUBLE) / CAST(2 AS DOUBLE)):double> | +| org.apache.spark.sql.catalyst.expressions.ElementAt | element_at | SELECT element_at(array(1, 2, 3), 2) | struct | +| org.apache.spark.sql.catalyst.expressions.Elt | elt | SELECT elt(1, 'scala', 'java') | struct | +| org.apache.spark.sql.catalyst.expressions.Encode | encode | SELECT encode('abc', 'utf-8') | struct | +| org.apache.spark.sql.catalyst.expressions.EqualNullSafe | <=> | SELECT 2 <=> 2 | struct<(2 <=> 2):boolean> | +| org.apache.spark.sql.catalyst.expressions.EqualTo | = | SELECT 2 = 2 | struct<(2 = 2):boolean> | +| org.apache.spark.sql.catalyst.expressions.EqualTo | == | SELECT 2 == 2 | struct<(2 = 2):boolean> | +| org.apache.spark.sql.catalyst.expressions.EulerNumber | e | SELECT e() | struct | +| org.apache.spark.sql.catalyst.expressions.Exp | exp | SELECT exp(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Explode | explode | SELECT explode(array(10, 20)) | struct | +| org.apache.spark.sql.catalyst.expressions.Explode | explode_outer | SELECT explode_outer(array(10, 20)) | struct | +| org.apache.spark.sql.catalyst.expressions.Expm1 | expm1 | SELECT expm1(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Extract | extract | SELECT extract(YEAR FROM TIMESTAMP '2019-08-12 01:00:00.123456') | struct | +| org.apache.spark.sql.catalyst.expressions.Factorial | factorial | SELECT factorial(5) | struct | +| org.apache.spark.sql.catalyst.expressions.FindInSet | find_in_set | SELECT find_in_set('ab','abc,b,ab,c,def') | struct | +| org.apache.spark.sql.catalyst.expressions.Flatten | flatten | SELECT flatten(array(array(1, 2), array(3, 4))) | struct> | +| org.apache.spark.sql.catalyst.expressions.Floor | floor | SELECT floor(-0.1) | struct | +| org.apache.spark.sql.catalyst.expressions.FormatNumber | format_number | SELECT format_number(12332.123456, 4) | struct | +| org.apache.spark.sql.catalyst.expressions.FormatString | printf | SELECT printf("Hello World %d %s", 100, "days") | struct | +| org.apache.spark.sql.catalyst.expressions.FormatString | format_string | SELECT format_string("Hello World %d %s", 100, "days") | struct | +| 
org.apache.spark.sql.catalyst.expressions.FromUTCTimestamp | from_utc_timestamp | SELECT from_utc_timestamp('2016-08-31', 'Asia/Seoul') | struct | +| org.apache.spark.sql.catalyst.expressions.FromUnixTime | from_unixtime | SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss') | struct | +| org.apache.spark.sql.catalyst.expressions.GetJsonObject | get_json_object | SELECT get_json_object('{"a":"b"}', '$.a') | struct | +| org.apache.spark.sql.catalyst.expressions.GreaterThan | > | SELECT 2 > 1 | struct<(2 > 1):boolean> | +| org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual | >= | SELECT 2 >= 1 | struct<(2 >= 1):boolean> | +| org.apache.spark.sql.catalyst.expressions.Greatest | greatest | SELECT greatest(10, 9, 2, 4, 3) | struct | +| org.apache.spark.sql.catalyst.expressions.Grouping | grouping | SELECT name, grouping(name), sum(age) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY cube(name) | struct | +| org.apache.spark.sql.catalyst.expressions.GroupingID | grouping_id | SELECT name, grouping_id(), sum(age), avg(height) FROM VALUES (2, 'Alice', 165), (5, 'Bob', 180) people(age, name, height) GROUP BY cube(name, height) | struct | +| org.apache.spark.sql.catalyst.expressions.Hex | hex | SELECT hex(17) | struct | +| org.apache.spark.sql.catalyst.expressions.Hour | hour | SELECT hour('2009-07-30 12:58:59') | struct | +| org.apache.spark.sql.catalyst.expressions.Hypot | hypot | SELECT hypot(3, 4) | struct | +| org.apache.spark.sql.catalyst.expressions.If | if | SELECT if(1 < 2, 'a', 'b') | struct<(IF((1 < 2), a, b)):string> | +| org.apache.spark.sql.catalyst.expressions.IfNull | ifnull | SELECT ifnull(NULL, array('2')) | struct> | +| org.apache.spark.sql.catalyst.expressions.In | in | SELECT 1 in(1, 2, 3) | struct<(1 IN (1, 2, 3)):boolean> | +| org.apache.spark.sql.catalyst.expressions.InitCap | initcap | SELECT initcap('sPark sql') | struct | +| org.apache.spark.sql.catalyst.expressions.Inline | inline | SELECT inline(array(struct(1, 'a'), struct(2, 'b'))) | struct | +| org.apache.spark.sql.catalyst.expressions.Inline | inline_outer | SELECT inline_outer(array(struct(1, 'a'), struct(2, 'b'))) | struct | +| org.apache.spark.sql.catalyst.expressions.InputFileBlockLength | input_file_block_length | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.InputFileBlockStart | input_file_block_start | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.InputFileName | input_file_name | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.IntegralDivide | div | SELECT 3 div 2 | struct<(CAST(3 AS BIGINT) div CAST(2 AS BIGINT)):bigint> | +| org.apache.spark.sql.catalyst.expressions.IsNaN | isnan | SELECT isnan(cast('NaN' as double)) | struct | +| org.apache.spark.sql.catalyst.expressions.IsNotNull | isnotnull | SELECT isnotnull(1) | struct<(1 IS NOT NULL):boolean> | +| org.apache.spark.sql.catalyst.expressions.IsNull | isnull | SELECT isnull(1) | struct<(1 IS NULL):boolean> | +| org.apache.spark.sql.catalyst.expressions.JsonToStructs | from_json | SELECT from_json('{"a":1, "b":0.8}', 'a INT, b DOUBLE') | struct> | +| org.apache.spark.sql.catalyst.expressions.JsonTuple | json_tuple | SELECT json_tuple('{"a":1, "b":2}', 'a', 'b') | struct | +| org.apache.spark.sql.catalyst.expressions.Lag | lag | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.LastDay | last_day | SELECT last_day('2009-01-12') | struct | +| org.apache.spark.sql.catalyst.expressions.Lead | lead | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Least | least | SELECT least(10, 9, 2, 4, 3) | 
struct | +| org.apache.spark.sql.catalyst.expressions.Left | left | SELECT left('Spark SQL', 3) | struct | +| org.apache.spark.sql.catalyst.expressions.Length | character_length | SELECT character_length('Spark SQL ') | struct | +| org.apache.spark.sql.catalyst.expressions.Length | char_length | SELECT char_length('Spark SQL ') | struct | +| org.apache.spark.sql.catalyst.expressions.Length | length | SELECT length('Spark SQL ') | struct | +| org.apache.spark.sql.catalyst.expressions.LessThan | < | SELECT 1 < 2 | struct<(1 < 2):boolean> | +| org.apache.spark.sql.catalyst.expressions.LessThanOrEqual | <= | SELECT 2 <= 2 | struct<(2 <= 2):boolean> | +| org.apache.spark.sql.catalyst.expressions.Levenshtein | levenshtein | SELECT levenshtein('kitten', 'sitting') | struct | +| org.apache.spark.sql.catalyst.expressions.Like | like | SELECT like('Spark', '_park') | struct | +| org.apache.spark.sql.catalyst.expressions.Log | ln | SELECT ln(1) | struct | +| org.apache.spark.sql.catalyst.expressions.Log10 | log10 | SELECT log10(10) | struct | +| org.apache.spark.sql.catalyst.expressions.Log1p | log1p | SELECT log1p(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Log2 | log2 | SELECT log2(2) | struct | +| org.apache.spark.sql.catalyst.expressions.Logarithm | log | SELECT log(10, 100) | struct | +| org.apache.spark.sql.catalyst.expressions.Lower | lcase | SELECT lcase('SparkSql') | struct | +| org.apache.spark.sql.catalyst.expressions.Lower | lower | SELECT lower('SparkSql') | struct | +| org.apache.spark.sql.catalyst.expressions.MakeDate | make_date | SELECT make_date(2013, 7, 15) | struct | +| org.apache.spark.sql.catalyst.expressions.MakeInterval | make_interval | SELECT make_interval(100, 11, 1, 1, 12, 30, 01.001001) | struct | +| org.apache.spark.sql.catalyst.expressions.MakeTimestamp | make_timestamp | SELECT make_timestamp(2014, 12, 28, 6, 30, 45.887) | struct | +| org.apache.spark.sql.catalyst.expressions.MapConcat | map_concat | SELECT map_concat(map(1, 'a', 2, 'b'), map(3, 'c')) | struct> | +| org.apache.spark.sql.catalyst.expressions.MapEntries | map_entries | SELECT map_entries(map(1, 'a', 2, 'b')) | struct>> | +| org.apache.spark.sql.catalyst.expressions.MapFilter | map_filter | SELECT map_filter(map(1, 0, 2, 2, 3, -1), (k, v) -> k > v) | struct namedlambdavariable()), namedlambdavariable(), namedlambdavariable())):map> | +| org.apache.spark.sql.catalyst.expressions.MapFromArrays | map_from_arrays | SELECT map_from_arrays(array(1.0, 3.0), array('2', '4')) | struct> | +| org.apache.spark.sql.catalyst.expressions.MapFromEntries | map_from_entries | SELECT map_from_entries(array(struct(1, 'a'), struct(2, 'b'))) | struct> | +| org.apache.spark.sql.catalyst.expressions.MapKeys | map_keys | SELECT map_keys(map(1, 'a', 2, 'b')) | struct> | +| org.apache.spark.sql.catalyst.expressions.MapValues | map_values | SELECT map_values(map(1, 'a', 2, 'b')) | struct> | +| org.apache.spark.sql.catalyst.expressions.MapZipWith | map_zip_with | SELECT map_zip_with(map(1, 'a', 2, 'b'), map(1, 'x', 2, 'y'), (k, v1, v2) -> concat(v1, v2)) | struct> | +| org.apache.spark.sql.catalyst.expressions.Md5 | md5 | SELECT md5('Spark') | struct | +| org.apache.spark.sql.catalyst.expressions.Minute | minute | SELECT minute('2009-07-30 12:58:59') | struct | +| org.apache.spark.sql.catalyst.expressions.MonotonicallyIncreasingID | monotonically_increasing_id | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Month | month | SELECT month('2016-07-30') | struct | +| 
org.apache.spark.sql.catalyst.expressions.MonthsBetween | months_between | SELECT months_between('1997-02-28 10:30:00', '1996-10-30') | struct | +| org.apache.spark.sql.catalyst.expressions.Multiply | * | SELECT 2 * 3 | struct<(2 * 3):int> | +| org.apache.spark.sql.catalyst.expressions.Murmur3Hash | hash | SELECT hash('Spark', array(123), 2) | struct | +| org.apache.spark.sql.catalyst.expressions.NTile | ntile | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.NaNvl | nanvl | SELECT nanvl(cast('NaN' as double), 123) | struct | +| org.apache.spark.sql.catalyst.expressions.NextDay | next_day | SELECT next_day('2015-01-14', 'TU') | struct | +| org.apache.spark.sql.catalyst.expressions.Not | ! | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Not | not | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Now | now | SELECT now() | struct | +| org.apache.spark.sql.catalyst.expressions.NullIf | nullif | SELECT nullif(2, 2) | struct | +| org.apache.spark.sql.catalyst.expressions.Nvl | nvl | SELECT nvl(NULL, array('2')) | struct> | +| org.apache.spark.sql.catalyst.expressions.Nvl2 | nvl2 | SELECT nvl2(NULL, 2, 1) | struct | +| org.apache.spark.sql.catalyst.expressions.OctetLength | octet_length | SELECT octet_length('Spark SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.Or | or | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Overlay | overlay | SELECT overlay('Spark SQL' PLACING '_' FROM 6) | struct | +| org.apache.spark.sql.catalyst.expressions.ParseToDate | to_date | SELECT to_date('2009-07-30 04:17:52') | struct | +| org.apache.spark.sql.catalyst.expressions.ParseToTimestamp | to_timestamp | SELECT to_timestamp('2016-12-31 00:12:00') | struct | +| org.apache.spark.sql.catalyst.expressions.ParseUrl | parse_url | SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST') | struct | +| org.apache.spark.sql.catalyst.expressions.PercentRank | percent_rank | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Pi | pi | SELECT pi() | struct | +| org.apache.spark.sql.catalyst.expressions.Pmod | pmod | SELECT pmod(10, 3) | struct | +| org.apache.spark.sql.catalyst.expressions.PosExplode | posexplode_outer | SELECT posexplode_outer(array(10,20)) | struct | +| org.apache.spark.sql.catalyst.expressions.PosExplode | posexplode | SELECT posexplode(array(10,20)) | struct | +| org.apache.spark.sql.catalyst.expressions.Pow | pow | SELECT pow(2, 3) | struct | +| org.apache.spark.sql.catalyst.expressions.Pow | power | SELECT power(2, 3) | struct | +| org.apache.spark.sql.catalyst.expressions.Quarter | quarter | SELECT quarter('2016-08-31') | struct | +| org.apache.spark.sql.catalyst.expressions.RLike | rlike | SELECT '%SystemDrive%\Users\John' rlike '%SystemDrive%\\Users.*' | struct<%SystemDrive%UsersJohn RLIKE %SystemDrive%\Users.*:boolean> | +| org.apache.spark.sql.catalyst.expressions.Rank | rank | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) | struct | +| org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct | +| org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 | struct<(CAST(CAST(2 AS DECIMAL(1,0)) AS DECIMAL(2,1)) % CAST(1.8 AS DECIMAL(2,1))):decimal(2,1)> | +| org.apache.spark.sql.catalyst.expressions.Remainder | mod | SELECT 2 % 1.8 | struct<(CAST(CAST(2 AS DECIMAL(1,0)) AS DECIMAL(2,1)) % CAST(1.8 AS DECIMAL(2,1))):decimal(2,1)> | +| 
org.apache.spark.sql.catalyst.expressions.Reverse | reverse | SELECT reverse('Spark SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.Right | right | SELECT right('Spark SQL', 3) | struct | +| org.apache.spark.sql.catalyst.expressions.Rint | rint | SELECT rint(12.3456) | struct | +| org.apache.spark.sql.catalyst.expressions.Rollup | rollup | SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY rollup(name, age) | struct | +| org.apache.spark.sql.catalyst.expressions.Round | round | SELECT round(2.5, 0) | struct | +| org.apache.spark.sql.catalyst.expressions.RowNumber | row_number | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.SchemaOfCsv | schema_of_csv | SELECT schema_of_csv('1,abc') | struct | +| org.apache.spark.sql.catalyst.expressions.SchemaOfJson | schema_of_json | SELECT schema_of_json('[{"col":0}]') | struct | +| org.apache.spark.sql.catalyst.expressions.Second | second | SELECT second('2009-07-30 12:58:59') | struct | +| org.apache.spark.sql.catalyst.expressions.Sentences | sentences | SELECT sentences('Hi there! Good morning.') | struct>> | +| org.apache.spark.sql.catalyst.expressions.Sequence | sequence | SELECT sequence(1, 5) | struct> | +| org.apache.spark.sql.catalyst.expressions.Sha1 | sha1 | SELECT sha1('Spark') | struct | +| org.apache.spark.sql.catalyst.expressions.Sha1 | sha | SELECT sha('Spark') | struct | +| org.apache.spark.sql.catalyst.expressions.Sha2 | sha2 | SELECT sha2('Spark', 256) | struct | +| org.apache.spark.sql.catalyst.expressions.ShiftLeft | shiftleft | SELECT shiftleft(2, 1) | struct | +| org.apache.spark.sql.catalyst.expressions.ShiftRight | shiftright | SELECT shiftright(4, 1) | struct | +| org.apache.spark.sql.catalyst.expressions.ShiftRightUnsigned | shiftrightunsigned | SELECT shiftrightunsigned(4, 1) | struct | +| org.apache.spark.sql.catalyst.expressions.Shuffle | shuffle | SELECT shuffle(array(1, 20, 3, 5)) | struct> | +| org.apache.spark.sql.catalyst.expressions.Signum | signum | SELECT signum(40) | struct | +| org.apache.spark.sql.catalyst.expressions.Signum | sign | SELECT sign(40) | struct | +| org.apache.spark.sql.catalyst.expressions.Sin | sin | SELECT sin(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Sinh | sinh | SELECT sinh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Size | size | SELECT size(array('b', 'd', 'c', 'a')) | struct | +| org.apache.spark.sql.catalyst.expressions.Size | cardinality | SELECT cardinality(array('b', 'd', 'c', 'a')) | struct | +| org.apache.spark.sql.catalyst.expressions.Slice | slice | SELECT slice(array(1, 2, 3, 4), 2, 2) | struct> | +| org.apache.spark.sql.catalyst.expressions.SortArray | sort_array | SELECT sort_array(array('b', 'd', null, 'c', 'a'), true) | struct> | +| org.apache.spark.sql.catalyst.expressions.SoundEx | soundex | SELECT soundex('Miller') | struct | +| org.apache.spark.sql.catalyst.expressions.SparkPartitionID | spark_partition_id | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.SparkVersion | version | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Sqrt | sqrt | SELECT sqrt(4) | struct | +| org.apache.spark.sql.catalyst.expressions.Stack | stack | SELECT stack(2, 1, 2, 3) | struct | +| org.apache.spark.sql.catalyst.expressions.StringInstr | instr | SELECT instr('SparkSQL', 'SQL') | struct | +| org.apache.spark.sql.catalyst.expressions.StringLPad | lpad | SELECT lpad('hi', 5, '??') | struct | +| org.apache.spark.sql.catalyst.expressions.StringLocate | position | SELECT position('bar', 
'foobarbar') | struct | +| org.apache.spark.sql.catalyst.expressions.StringLocate | locate | SELECT locate('bar', 'foobarbar') | struct | +| org.apache.spark.sql.catalyst.expressions.StringRPad | rpad | SELECT rpad('hi', 5, '??') | struct | +| org.apache.spark.sql.catalyst.expressions.StringRepeat | repeat | SELECT repeat('123', 2) | struct | +| org.apache.spark.sql.catalyst.expressions.StringReplace | replace | SELECT replace('ABCabc', 'abc', 'DEF') | struct | +| org.apache.spark.sql.catalyst.expressions.StringSpace | space | SELECT concat(space(2), '1') | struct | +| org.apache.spark.sql.catalyst.expressions.StringSplit | split | SELECT split('oneAtwoBthreeC', '[ABC]') | struct> | +| org.apache.spark.sql.catalyst.expressions.StringToMap | str_to_map | SELECT str_to_map('a:1,b:2,c:3', ',', ':') | struct> | +| org.apache.spark.sql.catalyst.expressions.StringTranslate | translate | SELECT translate('AaBbCc', 'abc', '123') | struct | +| org.apache.spark.sql.catalyst.expressions.StringTrim | trim | SELECT trim(' SparkSQL ') | struct | +| org.apache.spark.sql.catalyst.expressions.StringTrimLeft | ltrim | SELECT ltrim(' SparkSQL ') | struct | +| org.apache.spark.sql.catalyst.expressions.StringTrimRight | rtrim | SELECT rtrim(' SparkSQL ') | struct | +| org.apache.spark.sql.catalyst.expressions.StructsToCsv | to_csv | SELECT to_csv(named_struct('a', 1, 'b', 2)) | struct | +| org.apache.spark.sql.catalyst.expressions.StructsToJson | to_json | SELECT to_json(named_struct('a', 1, 'b', 2)) | struct | +| org.apache.spark.sql.catalyst.expressions.Substring | substr | SELECT substr('Spark SQL', 5) | struct | +| org.apache.spark.sql.catalyst.expressions.Substring | substring | SELECT substring('Spark SQL', 5) | struct | +| org.apache.spark.sql.catalyst.expressions.SubstringIndex | substring_index | SELECT substring_index('www.apache.org', '.', 2) | struct | +| org.apache.spark.sql.catalyst.expressions.Subtract | - | SELECT 2 - 1 | struct<(2 - 1):int> | +| org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct | +| org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct | +| org.apache.spark.sql.catalyst.expressions.TimeWindow | window | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.ToDegrees | degrees | SELECT degrees(3.141592653589793) | struct | +| org.apache.spark.sql.catalyst.expressions.ToRadians | radians | SELECT radians(180) | struct | +| org.apache.spark.sql.catalyst.expressions.ToUTCTimestamp | to_utc_timestamp | SELECT to_utc_timestamp('2016-08-31', 'Asia/Seoul') | struct | +| org.apache.spark.sql.catalyst.expressions.ToUnixTimestamp | to_unix_timestamp | SELECT to_unix_timestamp('2016-04-08', 'yyyy-MM-dd') | struct | +| org.apache.spark.sql.catalyst.expressions.TransformKeys | transform_keys | SELECT transform_keys(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> k + 1) | struct> | +| org.apache.spark.sql.catalyst.expressions.TransformValues | transform_values | SELECT transform_values(map_from_arrays(array(1, 2, 3), array(1, 2, 3)), (k, v) -> v + 1) | struct> | +| org.apache.spark.sql.catalyst.expressions.TruncDate | trunc | SELECT trunc('2019-08-04', 'week') | struct | +| org.apache.spark.sql.catalyst.expressions.TruncTimestamp | date_trunc | SELECT date_trunc('YEAR', '2015-03-05T09:32:05.359') | struct | +| org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct | +| org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct | +| 
org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct | +| org.apache.spark.sql.catalyst.expressions.UnaryPositive | positive | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.Unhex | unhex | SELECT decode(unhex('537061726B2053514C'), 'UTF-8') | struct | +| org.apache.spark.sql.catalyst.expressions.UnixTimestamp | unix_timestamp | SELECT unix_timestamp() | struct | +| org.apache.spark.sql.catalyst.expressions.Upper | ucase | SELECT ucase('SparkSql') | struct | +| org.apache.spark.sql.catalyst.expressions.Upper | upper | SELECT upper('SparkSql') | struct | +| org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct | +| org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct | +| org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct | +| org.apache.spark.sql.catalyst.expressions.XxHash64 | xxhash64 | SELECT xxhash64('Spark', array(123), 2) | struct | +| org.apache.spark.sql.catalyst.expressions.Year | year | SELECT year('2016-07-30') | struct | +| org.apache.spark.sql.catalyst.expressions.ZipWith | zip_with | SELECT zip_with(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x)) | struct>> | +| org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile | approx_percentile | SELECT approx_percentile(col, array(0.5, 0.4, 0.1), 100) FROM VALUES (0), (1), (2), (10) AS tab(col) | struct> | +| org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile | percentile_approx | SELECT percentile_approx(col, array(0.5, 0.4, 0.1), 100) FROM VALUES (0), (1), (2), (10) AS tab(col) | struct> | +| org.apache.spark.sql.catalyst.expressions.aggregate.Average | avg | SELECT avg(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Average | mean | SELECT mean(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BitAndAgg | bit_and | SELECT bit_and(col) FROM VALUES (3), (5) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BitOrAgg | bit_or | SELECT bit_or(col) FROM VALUES (3), (5) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BitXorAgg | bit_xor | SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BoolAnd | every | SELECT every(col) FROM VALUES (true), (true), (true) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BoolAnd | bool_and | SELECT bool_and(col) FROM VALUES (true), (true), (true) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BoolOr | bool_or | SELECT bool_or(col) FROM VALUES (true), (false), (false) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BoolOr | some | SELECT some(col) FROM VALUES (true), (false), (false) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.BoolOr | any | SELECT any(col) FROM VALUES (true), (false), (false) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.CollectList | collect_list | SELECT collect_list(col) FROM VALUES (1), (2), (1) AS tab(col) | struct> | +| org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet | collect_set | SELECT collect_set(col) FROM VALUES (1), (2), (1) AS tab(col) | struct> | +| org.apache.spark.sql.catalyst.expressions.aggregate.Corr | corr | SELECT corr(c1, c2) 
FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Count | count | SELECT count(*) FROM VALUES (NULL), (5), (5), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.CountIf | count_if | SELECT count_if(col % 2 = 0) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.CountMinSketchAgg | count_min_sketch | N/A | N/A | +| org.apache.spark.sql.catalyst.expressions.aggregate.CovPopulation | covar_pop | SELECT covar_pop(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.CovSample | covar_samp | SELECT covar_samp(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.First | first_value | SELECT first_value(col) FROM VALUES (10), (5), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.First | first | SELECT first(col) FROM VALUES (10), (5), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.HyperLogLogPlusPlus | approx_count_distinct | SELECT approx_count_distinct(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Kurtosis | kurtosis | SELECT kurtosis(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Last | last_value | SELECT last_value(col) FROM VALUES (10), (5), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Last | last | SELECT last(col) FROM VALUES (10), (5), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Max | max | SELECT max(col) FROM VALUES (10), (50), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.MaxBy | max_by | SELECT max_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Min | min | SELECT min(col) FROM VALUES (10), (-1), (20) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.MinBy | min_by | SELECT min_by(x, y) FROM VALUES (('a', 10)), (('b', 50)), (('c', 20)) AS tab(x, y) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Percentile | percentile | SELECT percentile(col, 0.3) FROM VALUES (0), (10) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Skewness | skewness | SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop | stddev_pop | SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev_samp | SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | stddev | SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | std | SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.Sum | sum | SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.VariancePop | var_pop | SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| 
org.apache.spark.sql.catalyst.expressions.aggregate.VarianceSamp | var_samp | SELECT var_samp(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.aggregate.VarianceSamp | variance | SELECT variance(col) FROM VALUES (1), (2), (3) AS tab(col) | struct | +| org.apache.spark.sql.catalyst.expressions.xml.XPathBoolean | xpath_boolean | SELECT xpath_boolean('1','a/b') | struct1, a/b):boolean> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathDouble | xpath_number | SELECT xpath_number('12', 'sum(a/b)') | struct12, sum(a/b)):double> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathDouble | xpath_double | SELECT xpath_double('12', 'sum(a/b)') | struct12, sum(a/b)):double> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathFloat | xpath_float | SELECT xpath_float('12', 'sum(a/b)') | struct12, sum(a/b)):float> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathInt | xpath_int | SELECT xpath_int('12', 'sum(a/b)') | struct12, sum(a/b)):int> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathList | xpath | SELECT xpath('b1b2b3c1c2','a/b/text()') | structb1b2b3c1c2, a/b/text()):array> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathLong | xpath_long | SELECT xpath_long('12', 'sum(a/b)') | struct12, sum(a/b)):bigint> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathShort | xpath_short | SELECT xpath_short('12', 'sum(a/b)') | struct12, sum(a/b)):smallint> | +| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('bcc','a/c') | structbcc, a/c):string> | \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/datetime.sql new file mode 100644 index 0000000000000..58ecf80637883 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/datetime.sql @@ -0,0 +1 @@ +--IMPORT datetime.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/string-functions.sql new file mode 100644 index 0000000000000..dd28e9b97fb20 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/string-functions.sql @@ -0,0 +1 @@ +--IMPORT string-functions.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/cast.sql b/sql/core/src/test/resources/sql-tests/inputs/cast.sql index 972ebdd01f61e..81c741a5ca8e9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cast.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cast.sql @@ -70,6 +70,11 @@ select cast(' 1' as bigint); select cast(' 1' as float); select cast(' 1 ' as DOUBLE); select cast('1.0 ' as DEC); +select cast('1中文' as tinyint); +select cast('1中文' as smallint); +select cast('1中文' as INT); +select cast('中文1' as bigint); +select cast('1中文' as bigint); -- trim string before cast to boolean select cast('\t\t true \n\r ' as boolean); diff --git a/sql/core/src/test/resources/sql-tests/inputs/change-column.sql b/sql/core/src/test/resources/sql-tests/inputs/change-column.sql index dd2fc660b53e3..2b57891cfcbc5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/change-column.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/change-column.sql @@ -15,29 +15,34 @@ ALTER TABLE test_change CHANGE a TYPE STRING; DESC test_change; -- Change column position (not supported yet) -ALTER TABLE test_change CHANGE a TYPE INT AFTER b; -ALTER TABLE test_change CHANGE b TYPE STRING FIRST; +ALTER TABLE test_change CHANGE a AFTER b; +ALTER TABLE 
test_change CHANGE b FIRST; DESC test_change; -- Change column comment -ALTER TABLE test_change CHANGE a TYPE INT COMMENT 'this is column a'; -ALTER TABLE test_change CHANGE b TYPE STRING COMMENT '#*02?`'; -ALTER TABLE test_change CHANGE c TYPE INT COMMENT ''; +ALTER TABLE test_change CHANGE a COMMENT 'this is column a'; +ALTER TABLE test_change CHANGE b COMMENT '#*02?`'; +ALTER TABLE test_change CHANGE c COMMENT ''; DESC test_change; -- Don't change anything. -ALTER TABLE test_change CHANGE a TYPE INT COMMENT 'this is column a'; +ALTER TABLE test_change CHANGE a TYPE INT; +ALTER TABLE test_change CHANGE a COMMENT 'this is column a'; DESC test_change; -- Change a invalid column ALTER TABLE test_change CHANGE invalid_col TYPE INT; DESC test_change; +-- Check case insensitivity. +ALTER TABLE test_change CHANGE A COMMENT 'case insensitivity'; +DESC test_change; + -- Change column can't apply to a temporary/global_temporary view CREATE TEMPORARY VIEW temp_view(a, b) AS SELECT 1, "one"; -ALTER TABLE temp_view CHANGE a TYPE INT COMMENT 'this is column a'; +ALTER TABLE temp_view CHANGE a TYPE INT; CREATE GLOBAL TEMPORARY VIEW global_temp_view(a, b) AS SELECT 1, "one"; -ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT COMMENT 'this is column a'; +ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT; -- DROP TEST TABLE DROP TABLE test_change; diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-legacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-legacy.sql index 2f2606d44d910..29dee1a3afd38 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte-legacy.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-legacy.sql @@ -1,115 +1,2 @@ -create temporary view t as select * from values 0, 1, 2 as t(id); -create temporary view t2 as select * from values 0, 1 as t(id); - --- CTE legacy substitution -SET spark.sql.legacy.ctePrecedence.enabled=true; - --- CTE in CTE definition -WITH t as ( - WITH t2 AS (SELECT 1) - SELECT * FROM t2 -) -SELECT * FROM t; - --- CTE in subquery -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 1) - SELECT * FROM t -); - --- CTE in subquery expression -SELECT ( - WITH t AS (SELECT 1) - SELECT * FROM t -); - --- CTE in CTE definition shadows outer -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -SELECT * FROM t2; - --- CTE in CTE definition shadows outer 2 -WITH - t(c) AS (SELECT 1), - t2 AS ( - SELECT ( - SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) - ) - ) -SELECT * FROM t2; - --- CTE in CTE definition shadows outer 3 -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2), - t2 AS ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) - SELECT * FROM t2 - ) -SELECT * FROM t2; - --- CTE in subquery shadows outer -WITH t(c) AS (SELECT 1) -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t -); - --- CTE in subquery shadows outer 2 -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) -); - --- CTE in subquery shadows outer 3 -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 3) - SELECT * FROM t - ) -); - --- CTE in subquery expression shadows outer -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t -); - --- CTE in subquery expression shadows outer 2 -WITH t AS (SELECT 1) -SELECT ( - SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -); - --- CTE in subquery expression shadows outer 3 -WITH t AS (SELECT 1) 
-SELECT ( - WITH t AS (SELECT 2) - SELECT ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) -); - --- Clean up -DROP VIEW IF EXISTS t; -DROP VIEW IF EXISTS t2; +--SET spark.sql.legacy.ctePrecedencePolicy = legacy +--IMPORT cte-nested.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql new file mode 100644 index 0000000000000..3b64b5daa82db --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql @@ -0,0 +1,138 @@ +-- CTE in CTE definition +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * FROM t; + +-- CTE in subquery +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +); + +-- CTE in subquery expression +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +); + +-- CTE in CTE definition shadows outer +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2; + +-- CTE in CTE definition shadows outer 2 +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2; + +-- CTE in CTE definition shadows outer 3 +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2; + +-- CTE in subquery shadows outer +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +); + +-- CTE in subquery shadows outer 2 +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +); + +-- CTE in subquery shadows outer 3 +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +); + +-- CTE in subquery expression shadows outer +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +); + +-- CTE in subquery expression shadows outer 2 +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +); + +-- CTE in subquery expression shadows outer 3 +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +); + +-- CTE in subquery expression shadows outer 4 +WITH t(c) AS (SELECT 1) +SELECT * FROM t +WHERE c IN ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +); + +-- forward name conflict is not a real conflict +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t; + +-- case insensitive name conflicts: in other CTE relations +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t; + +-- case insensitive name conflicts: in subquery expressions +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql new file mode 100644 index 0000000000000..d3b4539da2f3e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql @@ -0,0 +1,2 @@ +--SET spark.sql.legacy.ctePrecedencePolicy = corrected +--IMPORT cte-nested.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql b/sql/core/src/test/resources/sql-tests/inputs/cte.sql index d0e145c35a9fe..ec2f4808fcfc9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte.sql +++ 
b/sql/core/src/test/resources/sql-tests/inputs/cte.sql @@ -49,112 +49,6 @@ WITH t(x) AS (SELECT 2) SELECT * FROM t; --- CTE in CTE definition -WITH t as ( - WITH t2 AS (SELECT 1) - SELECT * FROM t2 -) -SELECT * FROM t; - --- CTE in subquery -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 1) - SELECT * FROM t -); - --- CTE in subquery expression -SELECT ( - WITH t AS (SELECT 1) - SELECT * FROM t -); - --- CTE in CTE definition shadows outer -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -SELECT * FROM t2; - --- CTE in CTE definition shadows outer 2 -WITH - t(c) AS (SELECT 1), - t2 AS ( - SELECT ( - SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) - ) - ) -SELECT * FROM t2; - --- CTE in CTE definition shadows outer 3 -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2), - t2 AS ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) - SELECT * FROM t2 - ) -SELECT * FROM t2; - --- CTE in subquery shadows outer -WITH t(c) AS (SELECT 1) -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t -); - --- CTE in subquery shadows outer 2 -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) -); - --- CTE in subquery shadows outer 3 -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 3) - SELECT * FROM t - ) -); - --- CTE in subquery expression shadows outer -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t -); - --- CTE in subquery expression shadows outer 2 -WITH t AS (SELECT 1) -SELECT ( - SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -); - --- CTE in subquery expression shadows outer 3 -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) -); - -- Clean up DROP VIEW IF EXISTS t; DROP VIEW IF EXISTS t2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/date_part.sql b/sql/core/src/test/resources/sql-tests/inputs/date_part.sql deleted file mode 100644 index a63cdafb745a0..0000000000000 --- a/sql/core/src/test/resources/sql-tests/inputs/date_part.sql +++ /dev/null @@ -1,145 +0,0 @@ -CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c; - -select date_part('millennium', c) from t; -select date_part('millennia', c) from t; -select date_part('mil', c) from t; -select date_part('mils', c) from t; - -select date_part('century', c) from t; -select date_part('centuries', c) from t; -select date_part('c', c) from t; -select date_part('cent', c) from t; - -select date_part('decade', c) from t; -select date_part('decades', c) from t; -select date_part('dec', c) from t; -select date_part('decs', c) from t; - -select date_part('year', c) from t; -select date_part('y', c) from t; -select date_part('years', c) from t; -select date_part('yr', c) from t; -select date_part('yrs', c) from t; - -select date_part('quarter', c) from t; -select date_part('qtr', c) from t; - -select date_part('month', c) from t; -select date_part('mon', c) from t; -select date_part('mons', c) from t; -select date_part('months', c) from t; - -select date_part('week', c) from t; -select date_part('w', c) from t; -select date_part('weeks', c) from t; - -select date_part('day', c) from t; -select date_part('d', c) from t; -select date_part('days', c) from t; - -select date_part('dayofweek', c) from t; - -select date_part('dow', c) from t; - -select date_part('isodow', c) from t; - -select date_part('doy', c) from t; - -select 
date_part('hour', c) from t; -select date_part('h', c) from t; -select date_part('hours', c) from t; -select date_part('hr', c) from t; -select date_part('hrs', c) from t; - -select date_part('minute', c) from t; -select date_part('m', c) from t; -select date_part('min', c) from t; -select date_part('mins', c) from t; -select date_part('minutes', c) from t; - -select date_part('second', c) from t; -select date_part('s', c) from t; -select date_part('sec', c) from t; -select date_part('seconds', c) from t; -select date_part('secs', c) from t; - -select date_part('not_supported', c) from t; - -select date_part(c, c) from t; - -select date_part(null, c) from t; - -CREATE TEMPORARY VIEW t2 AS select interval 1010 year 9 month 8 day 7 hour 6 minute 5 second 4 millisecond 3 microsecond as c; - -select date_part('millennium', c) from t2; -select date_part('millennia', c) from t2; -select date_part('mil', c) from t2; -select date_part('mils', c) from t2; - -select date_part('century', c) from t2; -select date_part('centuries', c) from t2; -select date_part('c', c) from t2; -select date_part('cent', c) from t2; - -select date_part('decade', c) from t2; -select date_part('decades', c) from t2; -select date_part('dec', c) from t2; -select date_part('decs', c) from t2; - -select date_part('year', c) from t2; -select date_part('y', c) from t2; -select date_part('years', c) from t2; -select date_part('yr', c) from t2; -select date_part('yrs', c) from t2; - -select date_part('quarter', c) from t2; -select date_part('qtr', c) from t2; - -select date_part('month', c) from t2; -select date_part('mon', c) from t2; -select date_part('mons', c) from t2; -select date_part('months', c) from t2; - -select date_part('day', c) from t2; -select date_part('d', c) from t2; -select date_part('days', c) from t2; - -select date_part('hour', c) from t2; -select date_part('h', c) from t2; -select date_part('hours', c) from t2; -select date_part('hr', c) from t2; -select date_part('hrs', c) from t2; - -select date_part('minute', c) from t2; -select date_part('m', c) from t2; -select date_part('min', c) from t2; -select date_part('mins', c) from t2; -select date_part('minutes', c) from t2; - -select date_part('second', c) from t2; -select date_part('s', c) from t2; -select date_part('sec', c) from t2; -select date_part('seconds', c) from t2; -select date_part('secs', c) from t2; - -select date_part('milliseconds', c) from t2; -select date_part('msec', c) from t2; -select date_part('msecs', c) from t2; -select date_part('millisecon', c) from t2; -select date_part('mseconds', c) from t2; -select date_part('ms', c) from t2; - -select date_part('microseconds', c) from t2; -select date_part('usec', c) from t2; -select date_part('usecs', c) from t2; -select date_part('useconds', c) from t2; -select date_part('microsecon', c) from t2; -select date_part('us', c) from t2; - -select date_part('epoch', c) from t2; - -select date_part('not_supported', c) from t2; - -select date_part(c, c) from t2; - -select date_part(null, c) from t2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting-invalid.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting-invalid.sql new file mode 100644 index 0000000000000..11bba00e91abf --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting-invalid.sql @@ -0,0 +1,53 @@ +--- TESTS FOR DATETIME FORMATTING FUNCTIONS WITH INVALID PATTERNS --- + +-- separating this from datetime-formatting.sql, because the text form +-- for patterns with 5 
letters in SimpleDateFormat varies from different JDKs +select date_format('2018-11-17 13:33:33.333', 'GGGGG'); +-- pattern letter count can not be greater than 6 +select date_format('2018-11-17 13:33:33.333', 'yyyyyyy'); +-- q/L in JDK 8 will fail when the count is more than 2 +select date_format('2018-11-17 13:33:33.333', 'qqqqq'); +select date_format('2018-11-17 13:33:33.333', 'QQQQQ'); +select date_format('2018-11-17 13:33:33.333', 'MMMMM'); +select date_format('2018-11-17 13:33:33.333', 'LLLLL'); + +select date_format('2018-11-17 13:33:33.333', 'EEEEE'); +select date_format('2018-11-17 13:33:33.333', 'FF'); +select date_format('2018-11-17 13:33:33.333', 'ddd'); +-- DD is invalid if the day-of-year exceeds 100, but it becomes valid in Java 11 +-- select date_format('2018-11-17 13:33:33.333', 'DD'); +select date_format('2018-11-17 13:33:33.333', 'DDDD'); +select date_format('2018-11-17 13:33:33.333', 'HHH'); +select date_format('2018-11-17 13:33:33.333', 'hhh'); +select date_format('2018-11-17 13:33:33.333', 'kkk'); +select date_format('2018-11-17 13:33:33.333', 'KKK'); +select date_format('2018-11-17 13:33:33.333', 'mmm'); +select date_format('2018-11-17 13:33:33.333', 'sss'); +select date_format('2018-11-17 13:33:33.333', 'SSSSSSSSSS'); +select date_format('2018-11-17 13:33:33.333', 'aa'); +select date_format('2018-11-17 13:33:33.333', 'V'); +select date_format('2018-11-17 13:33:33.333', 'zzzzz'); +select date_format('2018-11-17 13:33:33.333', 'XXXXXX'); +select date_format('2018-11-17 13:33:33.333', 'ZZZZZZ'); +select date_format('2018-11-17 13:33:33.333', 'OO'); +select date_format('2018-11-17 13:33:33.333', 'xxxxxx'); + +select date_format('2018-11-17 13:33:33.333', 'A'); +select date_format('2018-11-17 13:33:33.333', 'n'); +select date_format('2018-11-17 13:33:33.333', 'N'); +select date_format('2018-11-17 13:33:33.333', 'p'); + +-- disabled week-based patterns +select date_format('2018-11-17 13:33:33.333', 'Y'); +select date_format('2018-11-17 13:33:33.333', 'w'); +select date_format('2018-11-17 13:33:33.333', 'W'); +select date_format('2018-11-17 13:33:33.333', 'u'); +select date_format('2018-11-17 13:33:33.333', 'e'); +select date_format('2018-11-17 13:33:33.333', 'c'); + +-- others +select date_format('2018-11-17 13:33:33.333', 'B'); +select date_format('2018-11-17 13:33:33.333', 'C'); +select date_format('2018-11-17 13:33:33.333', 'I'); + + diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting-legacy.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting-legacy.sql new file mode 100644 index 0000000000000..19cab61a7ee56 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting-legacy.sql @@ -0,0 +1,2 @@ +--SET spark.sql.legacy.timeParserPolicy=LEGACY +--IMPORT datetime-formatting.sql \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting.sql new file mode 100644 index 0000000000000..2d70326f4f3c0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-formatting.sql @@ -0,0 +1,68 @@ +--- TESTS FOR DATETIME FORMATTING FUNCTIONS --- + +create temporary view v as select col from values + (timestamp '1582-06-01 11:33:33.123UTC+080000'), + (timestamp '1970-01-01 00:00:00.000Europe/Paris'), + (timestamp '1970-12-31 23:59:59.999Asia/Srednekolymsk'), + (timestamp '1996-04-01 00:33:33.123Australia/Darwin'), + (timestamp '2018-11-17 13:33:33.123Z'), + (timestamp '2020-01-01 
01:33:33.123Asia/Shanghai'), + (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col); + +select col, date_format(col, 'G GG GGG GGGG') from v; + +select col, date_format(col, 'y yy yyy yyyy yyyyy yyyyyy') from v; + +select col, date_format(col, 'q qq') from v; + +select col, date_format(col, 'Q QQ QQQ QQQQ') from v; + +select col, date_format(col, 'M MM MMM MMMM') from v; + +select col, date_format(col, 'L LL') from v; + +select col, date_format(col, 'E EE EEE EEEE') from v; + +select col, date_format(col, 'F') from v; + +select col, date_format(col, 'd dd') from v; + +select col, date_format(col, 'DD') from v where col = timestamp '2100-01-01 01:33:33.123America/Los_Angeles'; +select col, date_format(col, 'D DDD') from v; + +select col, date_format(col, 'H HH') from v; + +select col, date_format(col, 'h hh') from v; + +select col, date_format(col, 'k kk') from v; + +select col, date_format(col, 'K KK') from v; + +select col, date_format(col, 'm mm') from v; + +select col, date_format(col, 's ss') from v; + +select col, date_format(col, 'S SS SSS SSSS SSSSS SSSSSS SSSSSSS SSSSSSSS SSSSSSSSS') from v; + +select col, date_format(col, 'a') from v; + +select col, date_format(col, 'VV') from v; + +select col, date_format(col, 'z zz zzz zzzz') from v; + +select col, date_format(col, 'X XX XXX') from v; +select col, date_format(col, 'XXXX XXXXX') from v; + +select col, date_format(col, 'Z ZZ ZZZ ZZZZ ZZZZZ') from v; + +select col, date_format(col, 'O OOOO') from v; + +select col, date_format(col, 'x xx xxx xxxx xxxx xxxxx') from v; + +-- optional pattern, but the results won't be optional for formatting +select col, date_format(col, '[yyyy-MM-dd HH:mm:ss]') from v; + +-- literals +select col, date_format(col, "姚123'GyYqQMLwWuEFDdhHmsSaVzZxXOV'") from v; +select col, date_format(col, "''") from v; +select col, date_format(col, '') from v; diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-legacy.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-legacy.sql new file mode 100644 index 0000000000000..e573f8a6b4be9 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-legacy.sql @@ -0,0 +1,2 @@ +--SET spark.sql.legacy.timeParserPolicy=LEGACY +--IMPORT datetime.sql \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-invalid.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-invalid.sql new file mode 100644 index 0000000000000..a1c02eaa3b0a0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-invalid.sql @@ -0,0 +1,29 @@ +--- TESTS FOR DATETIME PARSING FUNCTIONS WITH INVALID VALUES --- + +-- parsing invalid value with pattern 'y' +select to_timestamp('294248', 'y'); -- out of year value range [0, 294247] +select to_timestamp('1', 'yy'); -- the number of digits must be 2 for 'yy'. +select to_timestamp('-12', 'yy'); -- out of year value range [0, 99] for reduced two digit form +select to_timestamp('123', 'yy'); -- the number of digits must be 2 for 'yy'. 
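The two-line variant input files added throughout this patch (datetime-legacy.sql just above, datetime-parsing-legacy.sql below, and cte-nonlegacy.sql and explain-aqe.sql elsewhere in the diff) follow the test harness's --SET/--IMPORT convention: the first directive pins a configuration value and the second re-runs an existing input file under it. As a minimal sketch of that convention only, a hypothetical further variant could look like the lines below; the file name and the EXCEPTION policy value are chosen purely for illustration and are not part of this patch.
-- hypothetical variant input file, e.g. sql-tests/inputs/datetime-parsing-exception.sql (illustration only)
--SET spark.sql.legacy.timeParserPolicy=EXCEPTION
--IMPORT datetime-parsing.sql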
+select to_timestamp('1', 'yyy'); -- the number of digits must be in [3, 6] for 'yyy'
+
+select to_timestamp('1234567', 'yyyyyyy'); -- the length of 'y' pattern must be less than 7
+
+-- parsing invalid values with pattern 'D'
+select to_timestamp('366', 'D');
+select to_timestamp('9', 'DD');
+-- in Java 8 this case is invalid, but valid in Java 11; disabled for Jenkins
+-- select to_timestamp('100', 'DD');
+select to_timestamp('366', 'DD');
+select to_timestamp('9', 'DDD');
+select to_timestamp('99', 'DDD');
+select to_timestamp('30-365', 'dd-DDD');
+select to_timestamp('11-365', 'MM-DDD');
+select to_timestamp('2019-366', 'yyyy-DDD');
+select to_timestamp('12-30-365', 'MM-dd-DDD');
+select to_timestamp('2020-01-365', 'yyyy-dd-DDD');
+select to_timestamp('2020-10-350', 'yyyy-MM-DDD');
+select to_timestamp('2020-11-31-366', 'yyyy-MM-dd-DDD');
+-- add a special case to test csv, because the legacy formatter it uses is lenient, so Spark should
+-- throw SparkUpgradeException
+select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD')) diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-legacy.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-legacy.sql new file mode 100644 index 0000000000000..ee1afe502ab79 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-legacy.sql @@ -0,0 +1,2 @@
+--SET spark.sql.legacy.timeParserPolicy=LEGACY
+--IMPORT datetime-parsing.sql
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql new file mode 100644 index 0000000000000..e058bd675c375 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql @@ -0,0 +1,44 @@
+--- TESTS FOR DATETIME PARSING FUNCTIONS ---
+
+-- parsing with pattern 'y'.
+-- the range of valid year is [-290307, 294247],
+-- but in particular, some Thrift clients use java.sql.Timestamp to parse timestamps, which allows
+-- only positive year values less than or equal to 9999. So the cases below only use [1, 9999] to pass
+-- ThriftServerQueryTestSuite
+select to_timestamp('1', 'y');
+select to_timestamp('009999', 'y');
+
+-- reduced two-digit form is used, the range of valid year is 20-[01, 99]
+select to_timestamp('00', 'yy');
+select to_timestamp('99', 'yy');
+
+-- the range of valid year is [-290307, 294247], the number of digits must be in [3, 6] for 'yyy'
+select to_timestamp('001', 'yyy');
+select to_timestamp('009999', 'yyy');
+
+-- the range of valid year is [-9999, 9999], the number of digits must be 4 for 'yyyy'.
+select to_timestamp('0001', 'yyyy');
+select to_timestamp('9999', 'yyyy');
+
+-- the range of valid year is [-99999, 99999], the number of digits must be 5 for 'yyyyy'.
+select to_timestamp('00001', 'yyyyy');
+select to_timestamp('09999', 'yyyyy');
+
+-- the range of valid year is [-290307, 294247], the number of digits must be 6 for 'yyyyyy'.
+select to_timestamp('000001', 'yyyyyy'); +select to_timestamp('009999', 'yyyyyy'); + +-- parsing with pattern 'D' +select to_timestamp('9', 'D'); +select to_timestamp('300', 'D'); +select to_timestamp('09', 'DD'); +select to_timestamp('99', 'DD'); +select to_timestamp('009', 'DDD'); +select to_timestamp('365', 'DDD'); +select to_timestamp('31-365', 'dd-DDD'); +select to_timestamp('12-365', 'MM-DDD'); +select to_timestamp('2020-365', 'yyyy-DDD'); +select to_timestamp('12-31-365', 'MM-dd-DDD'); +select to_timestamp('2020-30-365', 'yyyy-dd-DDD'); +select to_timestamp('2020-12-350', 'yyyy-MM-DDD'); +select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index b14778b91510e..ae5831c61913c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -33,6 +33,8 @@ select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01'); select date '2019-01-01\t'; select timestamp '2019-01-01\t'; +select date '2020-01-01中文'; +select timestamp '2019-01-01中文'; -- time add/sub select timestamp'2011-11-11 11:11:11' + interval '2' day; @@ -58,20 +60,91 @@ select date_add('2011-11-11', 1L); select date_add('2011-11-11', 1.0); select date_add('2011-11-11', 1E1); select date_add('2011-11-11', '1'); +select date_add('2011-11-11', '1.2'); select date_add(date'2011-11-11', 1); select date_add(timestamp'2011-11-11', 1); select date_sub(date'2011-11-11', 1); +select date_sub(date'2011-11-11', '1'); +select date_sub(date'2011-11-11', '1.2'); select date_sub(timestamp'2011-11-11', 1); select date_sub(null, 1); select date_sub(date'2011-11-11', null); select date'2011-11-11' + 1E1; +select date'2011-11-11' + '1'; select null + date '2001-09-28'; select date '2001-09-28' + 7Y; select 7S + date '2001-09-28'; select date '2001-10-01' - 7; +select date '2001-10-01' - '7'; select date '2001-09-28' + null; select date '2001-09-28' - null; +-- date add/sub with non-literal string column +create temp view v as select '1' str; +select date_add('2011-11-11', str) from v; +select date_sub('2011-11-11', str) from v; + -- subtract dates select null - date '2019-10-06'; select date '2001-10-01' - date '2001-09-28'; + +-- variable-length second fraction tests +select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.0', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.1', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.123UTC', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.12345CST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.123456PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +-- second fraction exceeded max variable length +select to_timestamp('2019-10-06 10:11:12.1234567PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +-- special cases +select to_timestamp('123456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('223456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]'); +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.[SSSSSS]'); +select to_timestamp('2019-10-06 10:11:12.123', 
'yyyy-MM-dd HH:mm:ss[.SSSSSS]'); +select to_timestamp('2019-10-06 10:11:12', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]'); +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]'); +select to_timestamp('2019-10-06 10:11', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]'); +select to_timestamp("2019-10-06S10:11:12.12345", "yyyy-MM-dd'S'HH:mm:ss.SSSSSS"); +select to_timestamp("12.12342019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm"); +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm"); +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyy-MM-dd'S'HH:mm"); +select to_timestamp("12.1234019-10-06S10:11", "ss.SSSSy-MM-dd'S'HH:mm"); + +select to_timestamp("2019-10-06S", "yyyy-MM-dd'S'"); +select to_timestamp("S2019-10-06", "'S'yyyy-MM-dd"); + +select to_timestamp("2019-10-06T10:11:12'12", "yyyy-MM-dd'T'HH:mm:ss''SSSS"); -- middle +select to_timestamp("2019-10-06T10:11:12'", "yyyy-MM-dd'T'HH:mm:ss''"); -- tail +select to_timestamp("'2019-10-06T10:11:12", "''yyyy-MM-dd'T'HH:mm:ss"); -- head +select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss"); -- head but as single quote + +-- missing fields +select to_timestamp("16", "dd"); +select to_timestamp("02-29", "MM-dd"); +select to_date("16", "dd"); +select to_date("02-29", "MM-dd"); +select to_timestamp("2019 40", "yyyy mm"); +select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss"); + +-- Unsupported narrow text style +select date_format(date '2020-05-23', 'GGGGG'); +select date_format(date '2020-05-23', 'MMMMM'); +select date_format(date '2020-05-23', 'LLLLL'); +select date_format(timestamp '2020-05-23', 'EEEEE'); +select date_format(timestamp '2020-05-23', 'uuuuu'); +select date_format('2020-05-23', 'QQQQQ'); +select date_format('2020-05-23', 'qqqqq'); +select to_timestamp('2019-10-06 A', 'yyyy-MM-dd GGGGG'); +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE'); +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE'); +select unix_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE'); +select from_unixtime(12345, 'MMMMM'); +select from_unixtime(54321, 'QQQQQ'); +select from_unixtime(23456, 'aaaaa'); +select from_json('{"time":"26/October/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')); +select from_json('{"date":"26/October/2015"}', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')); +select from_csv('26/October/2015', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')); +select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')); diff --git a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql index 821cb473751eb..d55e398329b76 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql @@ -52,7 +52,7 @@ DROP TABLE desc_complex_col_table; --Test case insensitive -CREATE TABLE customer(CName STRING); +CREATE TABLE customer(CName STRING) USING PARQUET; INSERT INTO customer VALUES('Maria'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/except.sql b/sql/core/src/test/resources/sql-tests/inputs/except.sql index 1d579e65f3473..ffdf1f4f3d24d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/except.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/except.sql @@ -55,3 +55,22 @@ FROM t1 WHERE t1.v >= (SELECT min(t2.v) FROM t2 WHERE t2.k = t1.k); + +-- SPARK-32638: corrects references when adding aliases in WidenSetOperationTypes +CREATE OR REPLACE 
TEMPORARY VIEW t3 AS VALUES (decimal(1)) tbl(v); +SELECT t.v FROM ( + SELECT v FROM t3 + EXCEPT + SELECT v + v AS v FROM t3 +) t; + +SELECT SUM(t.v) FROM ( + SELECT v FROM t3 + EXCEPT + SELECT v + v AS v FROM t3 +) t; + +-- Clean-up +DROP VIEW IF EXISTS t1; +DROP VIEW IF EXISTS t2; +DROP VIEW IF EXISTS t3; diff --git a/sql/core/src/test/resources/sql-tests/inputs/explain-aqe.sql b/sql/core/src/test/resources/sql-tests/inputs/explain-aqe.sql new file mode 100644 index 0000000000000..f4afa2b77a9d7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/explain-aqe.sql @@ -0,0 +1,3 @@ +--IMPORT explain.sql + +--SET spark.sql.adaptive.enabled=true diff --git a/sql/core/src/test/resources/sql-tests/inputs/explain.sql b/sql/core/src/test/resources/sql-tests/inputs/explain.sql index d5253e3daddb0..80bf258704c70 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/explain.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/explain.sql @@ -5,6 +5,7 @@ CREATE table explain_temp1 (key int, val int) USING PARQUET; CREATE table explain_temp2 (key int, val int) USING PARQUET; CREATE table explain_temp3 (key int, val int) USING PARQUET; +CREATE table explain_temp4 (key int, val string) USING PARQUET; SET spark.sql.codegen.wholeStage = true; @@ -61,7 +62,7 @@ EXPLAIN FORMATTED FROM explain_temp2 WHERE val > 0) OR - key = (SELECT max(key) + key = (SELECT avg(key) FROM explain_temp3 WHERE val > 0); @@ -93,7 +94,27 @@ EXPLAIN FORMATTED CREATE VIEW explain_view AS SELECT key, val FROM explain_temp1; +-- HashAggregate +EXPLAIN FORMATTED + SELECT + COUNT(val) + SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1; + +-- ObjectHashAggregate +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key; + +-- SortAggregate +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key; + -- cleanup DROP TABLE explain_temp1; DROP TABLE explain_temp2; DROP TABLE explain_temp3; +DROP TABLE explain_temp4; diff --git a/sql/core/src/test/resources/sql-tests/inputs/extract.sql b/sql/core/src/test/resources/sql-tests/inputs/extract.sql index 2d6cad8838704..abb9e82c9ef2e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/extract.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/extract.sql @@ -1,84 +1,125 @@ -CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c; +CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c, interval 10 year 20 month 30 day 40 hour 50 minute 6.7890 second as i; -select extract(millennium from c) from t; -select extract(millennia from c) from t; -select extract(mil from c) from t; -select extract(mils from c) from t; +select extract(year from c), extract(year from i) from t; +select extract(y from c), extract(y from i) from t; +select extract(years from c), extract(years from i) from t; +select extract(yr from c), extract(yr from i) from t; +select extract(yrs from c), extract(yrs from i) from t; -select extract(century from c) from t; -select extract(centuries from c) from t; -select extract(c from c) from t; -select extract(cent from c) from t; - -select extract(decade from c) from t; -select extract(decades from c) from t; -select extract(dec from c) from t; -select extract(decs from c) from t; - -select extract(year from c) from t; -select extract(y from c) from t; -select extract(years from c) from t; -select extract(yr from c) from t; -select extract(yrs from c) from t; - -select extract(isoyear from c) from t; +select extract(yearofweek from c) from t; 
select extract(quarter from c) from t; select extract(qtr from c) from t; -select extract(month from c) from t; -select extract(mon from c) from t; -select extract(mons from c) from t; -select extract(months from c) from t; +select extract(month from c), extract(month from i) from t; +select extract(mon from c), extract(mon from i) from t; +select extract(mons from c), extract(mons from i) from t; +select extract(months from c), extract(months from i) from t; select extract(week from c) from t; select extract(w from c) from t; select extract(weeks from c) from t; -select extract(day from c) from t; -select extract(d from c) from t; -select extract(days from c) from t; +select extract(day from c), extract(day from i) from t; +select extract(d from c), extract(d from i) from t; +select extract(days from c), extract(days from i) from t; select extract(dayofweek from c) from t; - select extract(dow from c) from t; -select extract(isodow from c) from t; +select extract(dayofweek_iso from c) from t; +select extract(dow_iso from c) from t; select extract(doy from c) from t; -select extract(hour from c) from t; -select extract(h from c) from t; -select extract(hours from c) from t; -select extract(hr from c) from t; -select extract(hrs from c) from t; - -select extract(minute from c) from t; -select extract(m from c) from t; -select extract(min from c) from t; -select extract(mins from c) from t; -select extract(minutes from c) from t; - -select extract(second from c) from t; -select extract(s from c) from t; -select extract(sec from c) from t; -select extract(seconds from c) from t; -select extract(secs from c) from t; - -select extract(milliseconds from c) from t; -select extract(msec from c) from t; -select extract(msecs from c) from t; -select extract(millisecon from c) from t; -select extract(mseconds from c) from t; -select extract(ms from c) from t; - -select extract(microseconds from c) from t; -select extract(usec from c) from t; -select extract(usecs from c) from t; -select extract(useconds from c) from t; -select extract(microsecon from c) from t; -select extract(us from c) from t; - -select extract(epoch from c) from t; +select extract(hour from c), extract(hour from i) from t; +select extract(h from c), extract(h from i) from t; +select extract(hours from c), extract(hours from i) from t; +select extract(hr from c), extract(hr from i) from t; +select extract(hrs from c), extract(hrs from i) from t; + +select extract(minute from c), extract(minute from i) from t; +select extract(m from c), extract(m from i) from t; +select extract(min from c), extract(min from i) from t; +select extract(mins from c), extract(mins from i) from t; +select extract(minutes from c), extract(minutes from i) from t; + +select extract(second from c), extract(second from i) from t; +select extract(s from c), extract(s from i) from t; +select extract(sec from c), extract(sec from i) from t; +select extract(seconds from c), extract(seconds from i) from t; +select extract(secs from c), extract(secs from i) from t; select extract(not_supported from c) from t; +select extract(not_supported from i) from t; + +select date_part('year', c), date_part('year', i) from t; +select date_part('y', c), date_part('y', i) from t; +select date_part('years', c), date_part('years', i) from t; +select date_part('yr', c), date_part('yr', i) from t; +select date_part('yrs', c), date_part('yrs', i) from t; + +select date_part('yearofweek', c) from t; + +select date_part('quarter', c) from t; +select date_part('qtr', c) from t; + 
+select date_part('month', c), date_part('month', i) from t; +select date_part('mon', c), date_part('mon', i) from t; +select date_part('mons', c), date_part('mons', i) from t; +select date_part('months', c), date_part('months', i) from t; + +select date_part('week', c) from t; +select date_part('w', c) from t; +select date_part('weeks', c) from t; + +select date_part('day', c), date_part('day', i) from t; +select date_part('d', c), date_part('d', i) from t; +select date_part('days', c), date_part('days', i) from t; + +select date_part('dayofweek', c) from t; +select date_part('dow', c) from t; + +select date_part('dayofweek_iso', c) from t; +select date_part('dow_iso', c) from t; + +select date_part('doy', c) from t; + +select date_part('hour', c), date_part('hour', i) from t; +select date_part('h', c), date_part('h', i) from t; +select date_part('hours', c), date_part('hours', i) from t; +select date_part('hr', c), date_part('hr', i) from t; +select date_part('hrs', c), date_part('hrs', i) from t; + +select date_part('minute', c), date_part('minute', i) from t; +select date_part('m', c), date_part('m', i) from t; +select date_part('min', c), date_part('min', i) from t; +select date_part('mins', c), date_part('mins', i) from t; +select date_part('minutes', c), date_part('minutes', i) from t; + +select date_part('second', c), date_part('second', i) from t; +select date_part('s', c), date_part('s', i) from t; +select date_part('sec', c), date_part('sec', i) from t; +select date_part('seconds', c), date_part('seconds', i) from t; +select date_part('secs', c), date_part('secs', i) from t; + +select date_part('not_supported', c) from t; +select date_part(c, c) from t; +select date_part(null, c) from t; + +select date_part(i, i) from t; +select date_part(null, i) from t; + +-- In SPARK-31476, we've supported extract('field', source), too +select extract('year', c) from t; +select extract('quarter', c) from t; +select extract('month', c) from t; +select extract('week', c) from t; +select extract('day', c) from t; +select extract('days', c) from t; +select extract('dayofweek', c) from t; +select extract('dow', c) from t; +select extract('doy', c) from t; +select extract('hour', c) from t; +select extract('minute', c) from t; +select extract('second', c) from t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index fedf03d774e42..3f5f5568b0720 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -86,6 +86,16 @@ SELECT 1 FROM range(10) HAVING MAX(id) > 0; SELECT id FROM range(10) HAVING id > 0; +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true; + +SELECT 1 FROM range(10) HAVING true; + +SELECT 1 FROM range(10) HAVING MAX(id) > 0; + +SELECT id FROM range(10) HAVING id > 0; + +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false; + -- Test data CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES (1, true), (1, false), diff --git a/sql/core/src/test/resources/sql-tests/inputs/having.sql b/sql/core/src/test/resources/sql-tests/inputs/having.sql index 868a911e787f6..2799b1a94d085 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/having.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/having.sql @@ -16,3 +16,17 @@ SELECT MIN(t.v) FROM (SELECT * FROM hav WHERE v > 0) t HAVING(COUNT(1) > 0); -- SPARK-20329: make sure we handle timezones correctly SELECT a + b FROM VALUES (1L, 2), (3L, 4) AS 
T(a, b) GROUP BY a + b HAVING a + b > 1; + +-- SPARK-31519: Cast in having aggregate expressions returns the wrong result +SELECT SUM(a) AS b, CAST('2020-01-01' AS DATE) AS fake FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY b HAVING b > 10; + +-- SPARK-31663: Grouping sets with having clause returns the wrong result +SELECT SUM(a) AS b FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY GROUPING SETS ((b), (a, b)) HAVING b > 10; +SELECT SUM(a) AS b FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY CUBE(a, b) HAVING b > 10; +SELECT SUM(a) AS b FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY ROLLUP(a, b) HAVING b > 10; + +-- SPARK-33131: Grouping sets with having clause can not resolve qualified col name. +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY GROUPING SETS(t.c1) HAVING t.c1 = 1; +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY CUBE(t.c1) HAVING t.c1 = 1; +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY ROLLUP(t.c1) HAVING t.c1 = 1; +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY t.c1 HAVING t.c1 = 1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql index cfa06aea82b04..73dfa91827b08 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql @@ -92,3 +92,6 @@ select transform_values(ys, (k, v) -> k + v) as v from nested; -- use non reversed keywords: all is non reversed only if !ansi select transform(ys, all -> all * all) as v from values (array(32, 97)) as t(ys); select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t(ys); + +-- SPARK-32819: Aggregate on nested string arrays +select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))); diff --git a/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql b/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql index b0b2244048caa..077caa5dd44a0 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/intersect-all.sql @@ -155,6 +155,21 @@ SELECT * FROM tab2; -- Restore the property SET spark.sql.legacy.setopsPrecedence.enabled = false; +-- SPARK-32638: corrects references when adding aliases in WidenSetOperationTypes +CREATE OR REPLACE TEMPORARY VIEW tab3 AS VALUES (decimal(1)), (decimal(2)) tbl3(v); +SELECT t.v FROM ( + SELECT v FROM tab3 + INTERSECT + SELECT v + v AS v FROM tab3 +) t; + +SELECT SUM(t.v) FROM ( + SELECT v FROM tab3 + INTERSECT + SELECT v + v AS v FROM tab3 +) t; + -- Clean-up DROP VIEW IF EXISTS tab1; DROP VIEW IF EXISTS tab2; +DROP VIEW IF EXISTS tab3; diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql index fb6c485f619ae..8f6cf0504baf2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -4,6 +4,10 @@ select 3 * (timestamp'2019-10-15 10:11:12.001002' - date'2019-10-15'); select interval 4 month 2 weeks 3 microseconds * 1.5; select (timestamp'2019-10-15' - timestamp'2019-10-14') / 1.5; +select interval 2147483647 month * 2; +select interval 2147483647 month / 0.5; +select interval 2147483647 day * 2; +select interval 2147483647 day / 0.5; -- interval operation with null and zero case select interval '2 seconds' / 0; @@ -25,6 +29,8 @@ select make_interval(1, 2, 3, 4); select make_interval(1, 2, 3, 
4, 5); select make_interval(1, 2, 3, 4, 5, 6); select make_interval(1, 2, 3, 4, 5, 6, 7.008009); +select make_interval(1, 2, 3, 4, 0, 0, 123456789012.123456); +select make_interval(0, 0, 0, 0, 0, 0, 1234567890123456789); -- cast string to intervals select cast('1 second' as interval); @@ -49,6 +55,7 @@ select interval '15:40:32.99899999' hour to second; select interval '40:32.99899999' minute to second; select interval '40:32' minute to second; select interval 30 day day; +select interval 30 days days; -- invalid day-time string intervals select interval '20 15:40:32.99899999' day to hour; @@ -67,6 +74,7 @@ select map(1, interval 1 day, 2, interval 3 week); -- typed interval expression select interval 'interval 3 year 1 hour'; select interval '3 year 1 hour'; +SELECT interval '1 year 3 months 2 weeks 2 days 1 hour 3 minutes 2 seconds 100 millisecond 200 microseconds'; -- malformed interval literal select interval; @@ -83,70 +91,9 @@ select interval '12:11:10' hour to second '1' year; select interval (-30) day; select interval (a + 1) day; select interval 30 day day day; - --- sum interval values --- null -select sum(cast(null as interval)); - --- empty set -select sum(cast(v as interval)) from VALUES ('1 seconds') t(v) where 1=0; - --- basic interval sum -select sum(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v); -select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v); -select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v); -select sum(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v); - --- group by -select - i, - sum(cast(v as interval)) -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -group by i; - --- having -select - sum(cast(v as interval)) as sv -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -having sv is not null; - --- window -SELECT - i, - sum(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) -FROM VALUES(1, '1 seconds'), (1, '2 seconds'), (2, NULL), (2, NULL) t(i,v); - --- average with interval type --- null -select avg(cast(v as interval)) from VALUES (null) t(v); - --- empty set -select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) where 1=0; - --- basic interval avg -select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v); -select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v); -select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v); -select avg(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v); - --- group by -select - i, - avg(cast(v as interval)) -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -group by i; - --- having -select - avg(cast(v as interval)) as sv -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -having sv is not null; - --- window -SELECT - i, - avg(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) -FROM VALUES (1,'1 seconds'), (1,'2 seconds'), (2,NULL), (2,NULL) t(i,v); +select interval (-30) days; +select interval (a + 1) days; +select interval 30 days days days; -- Interval year-month arithmetic @@ -213,6 +160,11 @@ select interval 'interval \t 1\tday'; select interval 'interval\t1\tday'; select interval '1\t' day; select interval '1 ' day; +select interval '2-2\t' year to month; +select 
interval '-\t2-2\t' year to month; +select interval '\n0 12:34:46.789\t' day to second; +select interval '\n-\t10\t 12:34:46.789\t' day to second; +select interval '中文 interval 1 day'; -- interval overflow if (ansi) exception else NULL select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); @@ -222,7 +174,25 @@ select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); -- interval support for csv and json functions -SELECT from_csv('1, 1 day', 'a INT, b interval'); -SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)); -SELECT from_json('{"a":"1 days"}', 'a interval'); -SELECT to_json(map('a', interval 25 month 100 day 130 minute)); +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval'); +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval'); + +select interval '+'; +select interval '+.'; +select interval '1'; +select interval '1.2'; +select interval '- 2'; +select interval '1 day -'; +select interval '1 day 1'; + +select interval '1 day 2' day; +select interval 'interval 1' day; +select interval '-\t 1' day; diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index 6c14eee2e4e61..5bd78f5f6af3a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -48,6 +48,21 @@ select from_json('[null, {"a":2}]', 'array>'); select from_json('[{"a": 1}, {"b":2}]', 'array>'); select from_json('[{"a": 1}, 2]', 'array>'); +-- from_json - datetime type +select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp'); +select from_json( + '{"d": "12/15 2012", "t": "12/15 2012 15:15:15"}', + 'd date, t timestamp', + map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss')); +select from_json( + '{"d": "02-29"}', + 'd date', + map('dateFormat', 'MM-dd')); +select from_json( + '{"t": "02-29"}', + 't timestamp', + map('timestampFormat', 'MM-dd')); + -- to_json - array type select to_json(array('1', '2', '3')); select to_json(array(array(1, 2, 3), array(4))); diff --git a/sql/core/src/test/resources/sql-tests/inputs/operator-div.sql b/sql/core/src/test/resources/sql-tests/inputs/operator-div.sql deleted file mode 100644 index 67b2d39aacd18..0000000000000 --- a/sql/core/src/test/resources/sql-tests/inputs/operator-div.sql +++ /dev/null @@ -1,21 +0,0 @@ -set spark.sql.legacy.integralDivide.returnBigint=true; - -select 5 div 2; -select 5 div 0; -select 5 div null; -select null div 5; -select cast(51 as decimal(10, 0)) div cast(2 as decimal(2, 0)); -select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)); -select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)); -select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)); - -set spark.sql.legacy.integralDivide.returnBigint=false; - -select 5 div 2; -select 5 div 0; -select 5 div null; -select null div 5; -select cast(51 as 
decimal(10, 0)) div cast(2 as decimal(2, 0)); -select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)); -select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)); -select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)); diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql index ba14789d48db6..20bf0eb15c5b2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql @@ -22,6 +22,16 @@ select 5 / 0; select 5 / null; select null / 5; +-- integral div +select 5 div 2; +select 5 div 0; +select 5 div null; +select null div 5; +select cast(51 as decimal(10, 0)) div cast(2 as decimal(2, 0)); +select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)); +select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)); +select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)); + -- other arithmetics select 1 + 2; select 1 - 2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql index 6725ce45e72a5..1a454179ef79f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql @@ -11,17 +11,19 @@ SELECT /* embedded single line */ 'embedded' AS `second`; SELECT /* both embedded and trailing single line */ 'both' AS third; -- trailing single line SELECT 'before multi-line' AS fourth; +--QUERY-DELIMITER-START -- [SPARK-28880] ANSI SQL: Bracketed comments /* This is an example of SQL which should not execute: * select 'multi-line'; */ SELECT 'after multi-line' AS fifth; +--QUERY-DELIMITER-END -- [SPARK-28880] ANSI SQL: Bracketed comments -- -- Nested comments -- - +--QUERY-DELIMITER-START /* SELECT 'trailing' as x1; -- inside block comment */ @@ -44,5 +46,5 @@ Hoo boy. Still two deep... Now just one deep... */ 'deeply nested example' AS sixth; - +--QUERY-DELIMITER-END /* and this is the end of the file */ diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql index 39e708478e298..21ffd85f7d01f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql @@ -41,7 +41,7 @@ DROP TABLE emp; -- These views are left around mainly to exercise special cases in pg_dump. 
-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark -CREATE TABLE view_base_table (key int /* PRIMARY KEY */, data varchar(20)); +CREATE TABLE view_base_table (key int /* PRIMARY KEY */, data varchar(20)) USING PARQUET; -- CREATE VIEW key_dependent_view AS SELECT * FROM view_base_table GROUP BY key; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql index 0bab2f884d976..69851080847b9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql @@ -230,49 +230,49 @@ SELECT date 'tomorrow' - date 'yesterday' AS `Two days`; -- -- epoch -- -SELECT EXTRACT(EPOCH FROM DATE '1970-01-01'); -- 0 -SELECT EXTRACT(EPOCH FROM TIMESTAMP '1970-01-01'); -- 0 +-- SELECT EXTRACT(EPOCH FROM DATE '1970-01-01'); -- 0 +-- SELECT EXTRACT(EPOCH FROM TIMESTAMP '1970-01-01'); -- 0 -- SELECT EXTRACT(EPOCH FROM TIMESTAMPTZ '1970-01-01+00'); -- 0 -- -- century -- -SELECT EXTRACT(CENTURY FROM TO_DATE('0101-12-31 BC', 'yyyy-MM-dd G')); -- -2 -SELECT EXTRACT(CENTURY FROM TO_DATE('0100-12-31 BC', 'yyyy-MM-dd G')); -- -1 -SELECT EXTRACT(CENTURY FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')); -- -1 -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01'); -- 1 -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD'); -- 1 -SELECT EXTRACT(CENTURY FROM DATE '1900-12-31'); -- 19 -SELECT EXTRACT(CENTURY FROM DATE '1901-01-01'); -- 20 -SELECT EXTRACT(CENTURY FROM DATE '2000-12-31'); -- 20 -SELECT EXTRACT(CENTURY FROM DATE '2001-01-01'); -- 21 -SELECT EXTRACT(CENTURY FROM CURRENT_DATE)>=21 AS True; -- true +-- SELECT EXTRACT(CENTURY FROM TO_DATE('0101-12-31 BC', 'yyyy-MM-dd G')); -- -2 +-- SELECT EXTRACT(CENTURY FROM TO_DATE('0100-12-31 BC', 'yyyy-MM-dd G')); -- -1 +-- SELECT EXTRACT(CENTURY FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')); -- -1 +-- SELECT EXTRACT(CENTURY FROM DATE '0001-01-01'); -- 1 +-- SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD'); -- 1 +-- SELECT EXTRACT(CENTURY FROM DATE '1900-12-31'); -- 19 +-- SELECT EXTRACT(CENTURY FROM DATE '1901-01-01'); -- 20 +-- SELECT EXTRACT(CENTURY FROM DATE '2000-12-31'); -- 20 +-- SELECT EXTRACT(CENTURY FROM DATE '2001-01-01'); -- 21 +-- SELECT EXTRACT(CENTURY FROM CURRENT_DATE)>=21 AS True; -- true -- -- millennium -- -SELECT EXTRACT(MILLENNIUM FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')); -- -1 -SELECT EXTRACT(MILLENNIUM FROM DATE '0001-01-01 AD'); -- 1 -SELECT EXTRACT(MILLENNIUM FROM DATE '1000-12-31'); -- 1 -SELECT EXTRACT(MILLENNIUM FROM DATE '1001-01-01'); -- 2 -SELECT EXTRACT(MILLENNIUM FROM DATE '2000-12-31'); -- 2 -SELECT EXTRACT(MILLENNIUM FROM DATE '2001-01-01'); -- 3 +-- SELECT EXTRACT(MILLENNIUM FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')); -- -1 +-- SELECT EXTRACT(MILLENNIUM FROM DATE '0001-01-01 AD'); -- 1 +-- SELECT EXTRACT(MILLENNIUM FROM DATE '1000-12-31'); -- 1 +-- SELECT EXTRACT(MILLENNIUM FROM DATE '1001-01-01'); -- 2 +-- SELECT EXTRACT(MILLENNIUM FROM DATE '2000-12-31'); -- 2 +-- SELECT EXTRACT(MILLENNIUM FROM DATE '2001-01-01'); -- 3 -- next test to be fixed on the turn of the next millennium;-) -SELECT EXTRACT(MILLENNIUM FROM CURRENT_DATE); -- 3 +-- SELECT EXTRACT(MILLENNIUM FROM CURRENT_DATE); -- 3 -- -- decade -- -SELECT EXTRACT(DECADE FROM DATE '1994-12-25'); -- 199 -SELECT EXTRACT(DECADE FROM DATE '0010-01-01'); -- 1 -SELECT EXTRACT(DECADE FROM DATE '0009-12-31'); -- 0 -SELECT EXTRACT(DECADE FROM TO_DATE('0001-01-01 BC', 'yyyy-MM-dd G')); -- 0 -SELECT 
EXTRACT(DECADE FROM TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')); -- -1 -SELECT EXTRACT(DECADE FROM TO_DATE('0011-01-01 BC', 'yyyy-MM-dd G')); -- -1 -SELECT EXTRACT(DECADE FROM TO_DATE('0012-12-31 BC', 'yyyy-MM-dd G')); -- -2 +-- SELECT EXTRACT(DECADE FROM DATE '1994-12-25'); -- 199 +-- SELECT EXTRACT(DECADE FROM DATE '0010-01-01'); -- 1 +-- SELECT EXTRACT(DECADE FROM DATE '0009-12-31'); -- 0 +-- SELECT EXTRACT(DECADE FROM TO_DATE('0001-01-01 BC', 'yyyy-MM-dd G')); -- 0 +-- SELECT EXTRACT(DECADE FROM TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')); -- -1 +-- SELECT EXTRACT(DECADE FROM TO_DATE('0011-01-01 BC', 'yyyy-MM-dd G')); -- -1 +-- SELECT EXTRACT(DECADE FROM TO_DATE('0012-12-31 BC', 'yyyy-MM-dd G')); -- -2 -- -- some other types: -- -- on a timestamp. -SELECT EXTRACT(CENTURY FROM NOW())>=21 AS True; -- true -SELECT EXTRACT(CENTURY FROM TIMESTAMP '1970-03-20 04:30:00.00000'); -- 20 +-- SELECT EXTRACT(CENTURY FROM NOW())>=21 AS True; -- true +-- SELECT EXTRACT(CENTURY FROM TIMESTAMP '1970-03-20 04:30:00.00000'); -- 20 -- on an interval -- SELECT EXTRACT(CENTURY FROM INTERVAL '100 y'); -- 1 -- SELECT EXTRACT(CENTURY FROM INTERVAL '99 y'); -- 0 @@ -280,16 +280,16 @@ SELECT EXTRACT(CENTURY FROM TIMESTAMP '1970-03-20 04:30:00.00000'); -- 20 -- SELECT EXTRACT(CENTURY FROM INTERVAL '-100 y'); -- -1 -- -- test trunc function! -SELECT DATE_TRUNC('MILLENNIUM', TIMESTAMP '1970-03-20 04:30:00.00000'); -- 1001 -SELECT DATE_TRUNC('MILLENNIUM', DATE '1970-03-20'); -- 1001-01-01 -SELECT DATE_TRUNC('CENTURY', TIMESTAMP '1970-03-20 04:30:00.00000'); -- 1901 -SELECT DATE_TRUNC('CENTURY', DATE '1970-03-20'); -- 1901 -SELECT DATE_TRUNC('CENTURY', DATE '2004-08-10'); -- 2001-01-01 -SELECT DATE_TRUNC('CENTURY', DATE '0002-02-04'); -- 0001-01-01 -SELECT DATE_TRUNC('CENTURY', TO_DATE('0055-08-10 BC', 'yyyy-MM-dd G')); -- 0100-01-01 BC -SELECT DATE_TRUNC('DECADE', DATE '1993-12-25'); -- 1990-01-01 -SELECT DATE_TRUNC('DECADE', DATE '0004-12-25'); -- 0001-01-01 BC -SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')); -- 0011-01-01 BC +-- SELECT DATE_TRUNC('MILLENNIUM', TIMESTAMP '1970-03-20 04:30:00.00000'); -- 1001 +-- SELECT DATE_TRUNC('MILLENNIUM', DATE '1970-03-20'); -- 1001-01-01 +-- SELECT DATE_TRUNC('CENTURY', TIMESTAMP '1970-03-20 04:30:00.00000'); -- 1901 +-- SELECT DATE_TRUNC('CENTURY', DATE '1970-03-20'); -- 1901 +-- SELECT DATE_TRUNC('CENTURY', DATE '2004-08-10'); -- 2001-01-01 +-- SELECT DATE_TRUNC('CENTURY', DATE '0002-02-04'); -- 0001-01-01 +-- SELECT DATE_TRUNC('CENTURY', TO_DATE('0055-08-10 BC', 'yyyy-MM-dd G')); -- 0100-01-01 BC +-- SELECT DATE_TRUNC('DECADE', DATE '1993-12-25'); -- 1990-01-01 +-- SELECT DATE_TRUNC('DECADE', DATE '0004-12-25'); -- 0001-01-01 BC +-- SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')); -- 0011-01-01 BC -- [SPARK-29006] Support special date/timestamp values `infinity`/`-infinity` -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql index 541ff0bdad745..e2a94404395bc 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql @@ -630,7 +630,8 @@ SELECT rpad('hi', -5, 'xy'); SELECT rpad('hello', 2); SELECT rpad('hi', 5, ''); -SELECT ltrim('zzzytrim', 'xyz'); +-- skip this test because PostgreSQL has different parameter order compares to SparkSQL +-- SELECT ltrim('zzzytrim', 'xyz'); SELECT translate('', '14', 'ax'); SELECT translate('12345', '14', 
'ax'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql index bf69da295a960..ade29cc41358e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql @@ -200,15 +200,15 @@ SELECT '' AS `54`, d1 as `timestamp`, date_part( 'minute', d1) AS `minute`, date_part( 'second', d1) AS `second` FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; -SELECT '' AS `54`, d1 as `timestamp`, - date_part( 'quarter', d1) AS quarter, date_part( 'msec', d1) AS msec, - date_part( 'usec', d1) AS usec - FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; - -SELECT '' AS `54`, d1 as `timestamp`, - date_part( 'isoyear', d1) AS isoyear, date_part( 'week', d1) AS week, - date_part( 'dow', d1) AS dow - FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; +-- SELECT '' AS `54`, d1 as `timestamp`, +-- date_part( 'quarter', d1) AS quarter, date_part( 'msec', d1) AS msec, +-- date_part( 'usec', d1) AS usec +-- FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; + +-- SELECT '' AS `54`, d1 as `timestamp`, +-- date_part( 'isoyear', d1) AS isoyear, date_part( 'week', d1) AS week, +-- date_part( 'dow', d1) AS dow +-- FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; -- [SPARK-28137] Data Type Formatting Functions -- TO_CHAR() diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql index 087d7a5befd19..6e95aca7aff62 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql @@ -146,7 +146,7 @@ SELECT count(*) OVER (PARTITION BY four) FROM (SELECT * FROM tenk1 WHERE FALSE)s -- mixture of agg/wfunc in the same window -- SELECT sum(salary) OVER w, rank() OVER w FROM empsalary WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); --- Cannot safely cast 'enroll_date': StringType to DateType; +-- Cannot safely cast 'enroll_date': string to date; -- SELECT empno, depname, salary, bonus, depadj, MIN(bonus) OVER (ORDER BY empno), MAX(depadj) OVER () FROM( -- SELECT *, -- CASE WHEN enroll_date < '2008-01-01' THEN 2008 - extract(year FROM enroll_date) END * 500 AS bonus, diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql index 395149e48d5c8..ba1acc9f56b4a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql @@ -15,6 +15,18 @@ CREATE TABLE empsalary ( enroll_date date ) USING parquet; +INSERT INTO empsalary VALUES + ('develop', 10, 5200, date '2007-08-01'), + ('sales', 1, 5000, date '2006-10-01'), + ('personnel', 5, 3500, date '2007-12-10'), + ('sales', 4, 4800, date '2007-08-08'), + ('personnel', 2, 3900, date '2006-12-23'), + ('develop', 7, 4200, date '2008-01-01'), + ('develop', 9, 4500, date '2008-01-01'), + ('sales', 3, 4800, date '2007-08-01'), + ('develop', 8, 6000, date '2006-10-01'), + ('develop', 11, 5200, date '2007-08-15'); + -- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp -- CREATE TEMP VIEW v_window AS -- SELECT i, min(i) over (order by i range between '1 day' preceding and '10 days' 
following) as min_i @@ -99,9 +111,10 @@ FROM tenk1 WHERE unique1 < 10; -- nth_value(salary, 1) over(order by salary range between 1000 preceding and 1000 following), -- salary from empsalary; -select last(salary) over(order by salary range between 1000 preceding and 1000 following), -lag(salary) over(order by salary range between 1000 preceding and 1000 following), -salary from empsalary; +-- [SPARK-30734] AnalysisException that window RangeFrame not match RowFrame +-- select last(salary) over(order by salary range between 1000 preceding and 1000 following), +-- lag(salary) over(order by salary range between 1000 preceding and 1000 following), +-- salary from empsalary; -- [SPARK-27951] ANSI SQL: NTH_VALUE function -- select first_value(salary) over(order by salary range between 1000 following and 3000 following diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql index cd3b74b3aa03f..f4b8454da0d82 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql @@ -42,7 +42,7 @@ create table datetimes ( f_timestamp timestamp ) using parquet; --- Spark cannot safely cast StringType to TimestampType +-- Spark cannot safely cast string to timestamp -- [SPARK-29636] Spark can't parse '11:00 BST' or '2000-10-19 10:23:54+01' signatures to timestamp insert into datetimes values (1, timestamp '11:00', cast ('11:00 BST' as timestamp), cast ('1 year' as timestamp), cast ('2000-10-19 10:23:54+01' as timestamp), timestamp '2000-10-19 10:23:54'), diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql new file mode 100644 index 0000000000000..8a531be30d896 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -0,0 +1,11 @@ +-- regexp_extract +SELECT regexp_extract('1a 2b 14m', '\\d+'); +SELECT regexp_extract('1a 2b 14m', '\\d+', 0); +SELECT regexp_extract('1a 2b 14m', '\\d+', 1); +SELECT regexp_extract('1a 2b 14m', '\\d+', 2); +SELECT regexp_extract('1a 2b 14m', '\\d+', -1); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)'); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', -1); diff --git a/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql b/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql index dc77f87d9743a..00b46d1951fcf 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql @@ -74,6 +74,9 @@ CREATE VIEW view_SPARK_30302 (aaa, bbb) AS SELECT a, b FROM tbl; SHOW CREATE TABLE view_SPARK_30302 AS SERDE; + +SHOW CREATE TABLE view_SPARK_30302; + DROP VIEW view_SPARK_30302; @@ -83,6 +86,9 @@ COMMENT 'This is a comment with \'quoted text\' for view' AS SELECT a, b FROM tbl; SHOW CREATE TABLE view_SPARK_30302 AS SERDE; + +SHOW CREATE TABLE view_SPARK_30302; + DROP VIEW view_SPARK_30302; @@ -92,13 +98,9 @@ TBLPROPERTIES ('a' = '1', 'b' = '2') AS SELECT a, b FROM tbl; SHOW CREATE TABLE view_SPARK_30302 AS SERDE; -DROP VIEW view_SPARK_30302; - --- SHOW CREATE TABLE does not support view -CREATE VIEW view_SPARK_30302 (aaa, bbb) -AS SELECT a, b FROM tbl; SHOW CREATE TABLE 
view_SPARK_30302; + DROP VIEW view_SPARK_30302; DROP TABLE tbl; diff --git a/sql/core/src/test/resources/sql-tests/inputs/show-tables.sql b/sql/core/src/test/resources/sql-tests/inputs/show-tables.sql index 3c77c9977d80f..8f46c93ee3233 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/show-tables.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/show-tables.sql @@ -15,6 +15,7 @@ SHOW TABLES IN showdb; SHOW TABLES 'show_t*'; SHOW TABLES LIKE 'show_t1*|show_t2*'; SHOW TABLES IN showdb 'show_t*'; +SHOW TABLES IN showdb LIKE 'show_t*'; -- SHOW TABLE EXTENDED SHOW TABLE EXTENDED LIKE 'show_t*'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/show-tblproperties.sql b/sql/core/src/test/resources/sql-tests/inputs/show-tblproperties.sql new file mode 100644 index 0000000000000..2861b2b43a113 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/show-tblproperties.sql @@ -0,0 +1,26 @@ +-- create a table with properties +CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet +TBLPROPERTIES('p1'='v1', 'p2'='v2'); + +SHOW TBLPROPERTIES tbl; +SHOW TBLPROPERTIES tbl("p1"); +SHOW TBLPROPERTIES tbl("p3"); + +DROP TABLE tbl; + +-- create a view with properties +CREATE VIEW view TBLPROPERTIES('p1'='v1', 'p2'='v2') AS SELECT 1 AS c1; + +SHOW TBLPROPERTIES view; +SHOW TBLPROPERTIES view("p1"); +SHOW TBLPROPERTIES view("p3"); + +DROP VIEW view; + +-- create a temporary view with properties +CREATE TEMPORARY VIEW tv TBLPROPERTIES('p1'='v1') AS SELECT 1 AS c1; + +-- Properties for a temporary view should be empty +SHOW TBLPROPERTIES tv; + +DROP VIEW tv; diff --git a/sql/core/src/test/resources/sql-tests/inputs/show-views.sql b/sql/core/src/test/resources/sql-tests/inputs/show-views.sql new file mode 100644 index 0000000000000..bdf9ef4aa6780 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/show-views.sql @@ -0,0 +1,28 @@ +-- Test data. 
+CREATE DATABASE showdb; +USE showdb; +CREATE TABLE tbl(a STRING, b INT, c STRING, d STRING) USING parquet; +CREATE VIEW view_1 AS SELECT * FROM tbl; +CREATE VIEW view_2 AS SELECT * FROM tbl WHERE c='a'; +CREATE GLOBAL TEMP VIEW view_3 AS SELECT 1 as col1; +CREATE TEMPORARY VIEW view_4(e INT) USING parquet; + +-- SHOW VIEWS +SHOW VIEWS; +SHOW VIEWS FROM showdb; +SHOW VIEWS IN showdb; +SHOW VIEWS IN global_temp; + +-- SHOW VIEWS WITH wildcard match +SHOW VIEWS 'view_*'; +SHOW VIEWS LIKE 'view_1*|view_2*'; +SHOW VIEWS IN showdb 'view_*'; +SHOW VIEWS IN showdb LIKE 'view_*'; +-- Error when database not exists +SHOW VIEWS IN wrongdb LIKE 'view_*'; + +-- Clean Up +DROP VIEW global_temp.view_3; +DROP VIEW view_4; +USE default; +DROP DATABASE showdb CASCADE; diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 59987b9792e25..f5ed2036dc8ac 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -39,12 +39,17 @@ SELECT substring('Spark SQL' from 5); SELECT substring('Spark SQL' from -3); SELECT substring('Spark SQL' from 5 for 1); --- trim/ltrim/rtrim -SELECT trim('yxTomxx', 'xyz'), trim(BOTH 'xyz' FROM 'yxTomxx'), trim('xyz' FROM 'yxTomxx'); -SELECT trim('xxxbarxxx', 'x'), trim(BOTH 'x' FROM 'xxxbarxxx'), trim('x' FROM 'xxxbarxxx'); -SELECT ltrim('zzzytest', 'xyz'), trim(LEADING 'xyz' FROM 'zzzytest'); -SELECT ltrim('zzzytestxyz', 'xyz'), trim(LEADING 'xyz' FROM 'zzzytestxyz'); -SELECT ltrim('xyxXxyLAST WORD', 'xy'), trim(LEADING 'xy' FROM 'xyxXxyLAST WORD'); -SELECT rtrim('testxxzx', 'xyz'), trim(TRAILING 'xyz' FROM 'testxxzx'); -SELECT rtrim('xyztestxxzx', 'xyz'), trim(TRAILING 'xyz' FROM 'xyztestxxzx'); -SELECT rtrim('TURNERyxXxy', 'xy'), trim(TRAILING 'xy' FROM 'TURNERyxXxy'); +-- trim +SELECT trim(" xyz "), ltrim(" xyz "), rtrim(" xyz "); +SELECT trim(BOTH 'xyz' FROM 'yxTomxx'), trim('xyz' FROM 'yxTomxx'); +SELECT trim(BOTH 'x' FROM 'xxxbarxxx'), trim('x' FROM 'xxxbarxxx'); +SELECT trim(LEADING 'xyz' FROM 'zzzytest'); +SELECT trim(LEADING 'xyz' FROM 'zzzytestxyz'); +SELECT trim(LEADING 'xy' FROM 'xyxXxyLAST WORD'); +SELECT trim(TRAILING 'xyz' FROM 'testxxzx'); +SELECT trim(TRAILING 'xyz' FROM 'xyztestxxzx'); +SELECT trim(TRAILING 'xy' FROM 'TURNERyxXxy'); + +-- Check lpad/rpad with invalid length parameter +SELECT lpad('hi', 'invalid_length'); +SELECT rpad('hi', 'invalid_length'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql index 481b5e8cc7700..0a16f118f0455 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql @@ -72,7 +72,7 @@ SELECT Count(DISTINCT( t1a )), FROM t1 WHERE t1d IN (SELECT t2d FROM t2 - ORDER BY t2c + ORDER BY t2c, t2d LIMIT 2) GROUP BY t1b ORDER BY t1b DESC NULLS FIRST @@ -93,7 +93,7 @@ SELECT Count(DISTINCT( t1a )), FROM t1 WHERE t1d NOT IN (SELECT t2d FROM t2 - ORDER BY t2b DESC nulls first + ORDER BY t2b DESC nulls first, t2d LIMIT 1) GROUP BY t1b ORDER BY t1b NULLS last diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql index e6fe1078b0d24..77bcfe7e10cbc 100644 --- 
a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql @@ -8,6 +8,9 @@ -- -- This test file was converted from postgreSQL/join.sql. +-- Disable BroadcastHashJoin optimization to avoid changing result order when we enable AQE +--SET spark.sql.autoBroadcastJoinThreshold = -1 + CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) AS v(f1); diff --git a/sql/core/src/test/resources/sql-tests/inputs/union.sql b/sql/core/src/test/resources/sql-tests/inputs/union.sql index 6da1b9b49b226..8a5b6c50fc1e3 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/union.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/union.sql @@ -45,10 +45,24 @@ SELECT array(1, 2), 'str' UNION ALL SELECT array(1, 2, 3, NULL), 1; +-- SPARK-32638: corrects references when adding aliases in WidenSetOperationTypes +CREATE OR REPLACE TEMPORARY VIEW t3 AS VALUES (decimal(1)) tbl(v); +SELECT t.v FROM ( + SELECT v FROM t3 + UNION ALL + SELECT v + v AS v FROM t3 +) t; + +SELECT SUM(t.v) FROM ( + SELECT v FROM t3 + UNION + SELECT v + v AS v FROM t3 +) t; -- Clean-up DROP VIEW IF EXISTS t1; DROP VIEW IF EXISTS t2; +DROP VIEW IF EXISTS t3; DROP VIEW IF EXISTS p1; DROP VIEW IF EXISTS p2; DROP VIEW IF EXISTS p3; diff --git a/sql/core/src/test/resources/sql-tests/inputs/window.sql b/sql/core/src/test/resources/sql-tests/inputs/window.sql index e25a252418301..3d05dfda6c3fa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/window.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/window.sql @@ -120,3 +120,8 @@ SELECT cate, sum(val) OVER (w) FROM testData WHERE val is not null WINDOW w AS (PARTITION BY cate ORDER BY val); + +-- with filter predicate +SELECT val, cate, +count(val) FILTER (WHERE val > 1) OVER(PARTITION BY cate) +FROM testData ORDER BY cate, val; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out new file mode 100644 index 0000000000000..7e5568dc5cb36 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -0,0 +1,937 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 108 + + +-- !query +select current_date = current_date(), current_timestamp = current_timestamp() +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'current_date'(line 1, pos 22) + +== SQL == +select current_date = current_date(), current_timestamp = current_timestamp() +----------------------^^^ + + +-- !query +select to_date(null), to_date('2016-12-31'), to_date('2016-12-31', 'yyyy-MM-dd') +-- !query schema +struct +-- !query output +NULL 2016-12-31 2016-12-31 + + +-- !query +select to_timestamp(null), to_timestamp('2016-12-31 00:12:00'), to_timestamp('2016-12-31', 'yyyy-MM-dd') +-- !query schema +struct +-- !query output +NULL 2016-12-31 00:12:00 2016-12-31 00:00:00 + + +-- !query +select dayofweek('2007-02-03'), dayofweek('2009-07-30'), dayofweek('2017-05-27'), dayofweek(null), dayofweek('1582-10-15 13:10:15') +-- !query schema +struct +-- !query output +7 5 7 NULL 6 + + +-- !query +create temporary view ttf1 as select * from values + (1, 2), + (2, 3) + as ttf1(current_date, current_timestamp) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable 
alternative at input 'current_date'(line 4, pos 10) + +== SQL == +create temporary view ttf1 as select * from values + (1, 2), + (2, 3) + as ttf1(current_date, current_timestamp) +----------^^^ + + +-- !query +select current_date, current_timestamp from ttf1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: ttf1; line 1 pos 44 + + +-- !query +create temporary view ttf2 as select * from values + (1, 2), + (2, 3) + as ttf2(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +select current_date = current_date(), current_timestamp = current_timestamp(), a, b from ttf2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'current_date'(line 1, pos 22) + +== SQL == +select current_date = current_date(), current_timestamp = current_timestamp(), a, b from ttf2 +----------------------^^^ + + +-- !query +select a, b from ttf2 order by a, current_date +-- !query schema +struct +-- !query output +1 2 +2 3 + + +-- !query +select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), weekday(null), weekday('1582-10-15 13:10:15') +-- !query schema +struct +-- !query output +5 3 5 NULL 4 + + +-- !query +select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01') +-- !query schema +struct +-- !query output +1500 1 1 + + +-- !query +select date '2019-01-01\t' +-- !query schema +struct +-- !query output +2019-01-01 + + +-- !query +select timestamp '2019-01-01\t' +-- !query schema +struct +-- !query output +2019-01-01 00:00:00 + + +-- !query +select date '2020-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7) + +== SQL == +select date '2020-01-01中文' +-------^^^ + + +-- !query +select timestamp '2019-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7) + +== SQL == +select timestamp '2019-01-01中文' +-------^^^ + + +-- !query +select timestamp'2011-11-11 11:11:11' + interval '2' day +-- !query schema +struct +-- !query output +2011-11-13 11:11:11 + + +-- !query +select timestamp'2011-11-11 11:11:11' - interval '2' day +-- !query schema +struct +-- !query output +2011-11-09 11:11:11 + + +-- !query +select date'2011-11-11 11:11:11' + interval '2' second +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +requirement failed: Cannot add hours, minutes or seconds, milliseconds, microseconds to a date + + +-- !query +select date'2011-11-11 11:11:11' - interval '2' second +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +requirement failed: Cannot add hours, minutes or seconds, milliseconds, microseconds to a date + + +-- !query +select '2011-11-11' - interval '2' day +-- !query schema +struct +-- !query output +2011-11-09 00:00:00 + + +-- !query +select '2011-11-11 11:11:11' - interval '2' second +-- !query schema +struct +-- !query output +2011-11-11 11:11:09 + + +-- !query +select '1' - interval '2' second +-- !query schema +struct +-- !query output +NULL + + +-- !query +select 1 - interval '2' second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '1 - INTERVAL '2 seconds'' due to data type mismatch: argument 1 requires timestamp type, however, '1' is of int type.; line 1 pos 7 + + 
+-- !query +select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678' +-- !query schema +struct +-- !query output +2078 hours 48 minutes 47.654322 seconds + + +-- !query +select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01' +-- !query schema +struct +-- !query output +-2078 hours -48 minutes -47.654322 seconds + + +-- !query +select timestamp'2019-10-06 10:11:12.345678' - null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select null - timestamp'2019-10-06 10:11:12.345678' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date_add('2011-11-11', 1Y) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1S) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1L) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 1L)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '1L' is of bigint type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', 1.0) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 1.0BD)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '1.0BD' is of decimal(2,1) type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', 1E1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', '1') +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', '1.2') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The second argument of 'date_add' function needs to be an integer.; + + +-- !query +select date_add(date'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add(timestamp'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_sub(date'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(date'2011-11-11', '1') +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(date'2011-11-11', '1.2') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The second argument of 'date_sub' function needs to be an integer.; + + +-- !query +select date_sub(timestamp'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(null, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date_sub(date'2011-11-11', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date'2011-11-11' + 1E1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(DATE '2011-11-11', 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 + + +-- !query +select date'2011-11-11' + '1' +-- !query schema +struct<> +-- 
!query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(DATE '2011-11-11', CAST('1' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('1' AS DOUBLE)' is of double type.; line 1 pos 7 + + +-- !query +select null + date '2001-09-28' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-09-28' + 7Y +-- !query schema +struct +-- !query output +2001-10-05 + + +-- !query +select 7S + date '2001-09-28' +-- !query schema +struct +-- !query output +2001-10-05 + + +-- !query +select date '2001-10-01' - 7 +-- !query schema +struct +-- !query output +2001-09-24 + + +-- !query +select date '2001-10-01' - '7' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_sub(DATE '2001-10-01', CAST('7' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('7' AS DOUBLE)' is of double type.; line 1 pos 7 + + +-- !query +select date '2001-09-28' + null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-09-28' - null +-- !query schema +struct +-- !query output +NULL + + +-- !query +create temp view v as select '1' str +-- !query schema +struct<> +-- !query output + + + +-- !query +select date_add('2011-11-11', str) from v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), v.`str`)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'v.`str`' is of string type.; line 1 pos 7 + + +-- !query +select date_sub('2011-11-11', str) from v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_sub(CAST('2011-11-11' AS DATE), v.`str`)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'v.`str`' is of string type.; line 1 pos 7 + + +-- !query +select null - date '2019-10-06' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-10-01' - date '2001-09-28' +-- !query schema +struct +-- !query output +3 days + + +-- !query +select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.0', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123UTC', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 03:11:12.123 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12345CST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 08:11:12.12345 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123456PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.123456 + + +-- !query +select 
to_timestamp('2019-10-06 10:11:12.1234567PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('123456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.123456 + + +-- !query +select to_timestamp('223456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.[SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.123 + + +-- !query +select to_timestamp('2019-10-06 10:11:12', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12 + + +-- !query +select to_timestamp('2019-10-06 10:11', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:00 + + +-- !query +select to_timestamp("2019-10-06S10:11:12.12345", "yyyy-MM-dd'S'HH:mm:ss.SSSSSS") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12345 + + +-- !query +select to_timestamp("12.12342019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1234019-10-06S10:11", "ss.SSSSy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +0019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp("2019-10-06S", "yyyy-MM-dd'S'") +-- !query schema +struct +-- !query output +2019-10-06 00:00:00 + + +-- !query +select to_timestamp("S2019-10-06", "'S'yyyy-MM-dd") +-- !query schema +struct +-- !query output +2019-10-06 00:00:00 + + +-- !query +select to_timestamp("2019-10-06T10:11:12'12", "yyyy-MM-dd'T'HH:mm:ss''SSSS") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12 + + +-- !query +select to_timestamp("2019-10-06T10:11:12'", "yyyy-MM-dd'T'HH:mm:ss''") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("'2019-10-06T10:11:12", "''yyyy-MM-dd'T'HH:mm:ss") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 00:00:00 + + +-- !query +select to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 + + +-- !query +select to_date("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019 40", "yyyy mm") +-- !query schema +struct +-- !query output +2019-01-01 00:40:00 + + +-- !query +select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss") +-- 
!query schema +struct +-- !query output +2019-01-01 10:10:10 + + +-- !query +select date_format(date '2020-05-23', 'GGGGG') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'GGGGG' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(date '2020-05-23', 'MMMMM') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMMMM' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(date '2020-05-23', 'LLLLL') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'LLLLL' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(timestamp '2020-05-23', 'EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(timestamp '2020-05-23', 'uuuuu') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'uuuuu' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2020-05-23', 'QQQQQ') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: Q + + +-- !query +select date_format('2020-05-23', 'qqqqq') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: q + + +-- !query +select to_timestamp('2019-10-06 A', 'yyyy-MM-dd GGGGG') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyy-MM-dd GGGGG' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd MM yyyy EEEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd MM yyyy EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select unix_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd MM yyyy EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_unixtime(12345, 'MMMMM') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMMMM' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_unixtime(54321, 'QQQQQ') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select from_unixtime(23456, 'aaaaa') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'aaaaa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_json('{"time":"26/October/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_json('{"date":"26/October/2015"}', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_csv('26/October/2015', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out index 7bef1bad4507e..6d26fae73b11c 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 29 +-- Number of queries: 30 -- !query @@ -282,3 +282,11 @@ no viable alternative at input 'all'(line 1, pos 22) == SQL == select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t(ys) ----------------------^^^ + + +-- !query +select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))) +-- !query schema +struct>> +-- !query output +[[""]] diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index ab6130da869c4..2cee0c8886935 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 101 +-- Number of queries: 107 -- !query @@ -26,6 +26,42 @@ struct +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select interval 2147483647 month / 0.5 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select interval 2147483647 day * 2 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select 
interval 2147483647 day / 0.5 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + -- !query select interval '2 seconds' / 0 -- !query schema @@ -142,11 +178,28 @@ struct -- !query select make_interval(1, 2, 3, 4, 5, 6, 7.008009) -- !query schema -struct +struct -- !query output 1 years 2 months 25 days 5 hours 6 minutes 7.008009 seconds +-- !query +select make_interval(1, 2, 3, 4, 0, 0, 123456789012.123456) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 34293552 hours 30 minutes 12.123456 seconds + + +-- !query +select make_interval(0, 0, 0, 0, 0, 0, 1234567890123456789) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Decimal(expanded,1234567890123456789,20,0}) cannot be represented as Decimal(18, 6). + + -- !query select cast('1 second' as interval) -- !query schema @@ -302,15 +355,17 @@ struct -- !query select interval 30 day day -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +30 days -no viable alternative at input 'day'(line 1, pos 23) -== SQL == -select interval 30 day day ------------------------^^^ +-- !query +select interval 30 days days +-- !query schema +struct +-- !query output +30 days -- !query @@ -320,7 +375,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 20 15:40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '20 15:40:32.99899999' day to hour @@ -334,7 +389,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 20 15:40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '20 15:40:32.99899999' day to minute @@ -348,7 +403,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 15:40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '15:40:32.99899999' hour to minute @@ -362,7 +417,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '15:40.99899999' hour to second @@ -376,7 +431,7 @@ struct<> -- !query output 
org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '15:40' hour to second @@ -390,7 +445,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 20 40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 20 40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '20 40:32.99899999' minute to second @@ -435,6 +490,14 @@ struct 3 years 1 hours +-- !query +SELECT interval '1 year 3 months 2 weeks 2 days 1 hour 3 minutes 2 seconds 100 millisecond 200 microseconds' +-- !query schema +struct +-- !query output +1 years 3 months 16 days 1 hours 3 minutes 2.1002 seconds + + -- !query select interval -- !query schema @@ -594,13 +657,8 @@ select interval (-30) day -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -no viable alternative at input 'day'(line 1, pos 22) - -== SQL == -select interval (-30) day -----------------------^^^ +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 -- !query @@ -608,13 +666,8 @@ select interval (a + 1) day -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -no viable alternative at input 'day'(line 1, pos 24) - -== SQL == -select interval (a + 1) day -------------------------^^^ +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. 
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 -- !query @@ -624,185 +677,43 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 23) +extraneous input 'day' expecting {, ';'}(line 1, pos 27) == SQL == select interval 30 day day day ------------------------^^^ +---------------------------^^^ -- !query -select sum(cast(null as interval)) +select interval (-30) days -- !query schema -struct --- !query output -NULL - - --- !query -select sum(cast(v as interval)) from VALUES ('1 seconds') t(v) where 1=0 --- !query schema -struct --- !query output -NULL - - --- !query -select sum(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -3 seconds - - --- !query -select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -1 seconds - - --- !query -select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) --- !query schema -struct --- !query output --3 seconds - - --- !query -select sum(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output --7 days 2 seconds - - --- !query -select - i, - sum(cast(v as interval)) -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -group by i --- !query schema -struct --- !query output -1 -2 days -2 2 seconds -3 NULL - - --- !query -select - sum(cast(v as interval)) as sv -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -having sv is not null --- !query schema -struct --- !query output --2 days 2 seconds - - --- !query -SELECT - i, - sum(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) -FROM VALUES(1, '1 seconds'), (1, '2 seconds'), (2, NULL), (2, NULL) t(i,v) --- !query schema -struct --- !query output -1 2 seconds -1 3 seconds -2 NULL -2 NULL - - --- !query -select avg(cast(v as interval)) from VALUES (null) t(v) --- !query schema -struct --- !query output -NULL - - --- !query -select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) where 1=0 --- !query schema -struct --- !query output -NULL - - --- !query -select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -1.5 seconds - - --- !query -select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -0.5 seconds - - --- !query -select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) --- !query schema -struct +struct<> -- !query output --1.5 seconds +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 -- !query -select avg(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) +select interval (a + 1) days -- !query schema -struct +struct<> -- !query output --3 days -11 hours -59 minutes -59 seconds +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. 
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 -- !query -select - i, - avg(cast(v as interval)) -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -group by i +select interval 30 days days days -- !query schema -struct --- !query output -1 -1 days -2 2 seconds -3 NULL - - --- !query -select - avg(cast(v as interval)) as sv -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -having sv is not null --- !query schema -struct +struct<> -- !query output --15 hours -59 minutes -59.333333 seconds +org.apache.spark.sql.catalyst.parser.ParseException +extraneous input 'days' expecting {, ';'}(line 1, pos 29) --- !query -SELECT - i, - avg(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) -FROM VALUES (1,'1 seconds'), (1,'2 seconds'), (2,NULL), (2,NULL) t(i,v) --- !query schema -struct --- !query output -1 1.5 seconds -1 2 seconds -2 NULL -2 NULL +== SQL == +select interval 30 days days days +-----------------------------^^^ -- !query @@ -827,7 +738,7 @@ select interval '2-2' year to month + dateval from interval_arithmetic -- !query schema -struct +struct -- !query output 2012-01-01 2009-11-01 2014-03-01 2014-03-01 2009-11-01 2009-11-01 2014-03-01 @@ -870,9 +781,10 @@ select interval '99 11:22:33.123456789' day to second + dateval from interval_arithmetic -- !query schema -struct +struct<> -- !query output -2012-01-01 2011-09-23 2012-04-09 2012-04-09 2011-09-23 2011-09-23 2012-04-09 +java.lang.IllegalArgumentException +requirement failed: Cannot add hours, minutes or seconds, milliseconds, microseconds to a date -- !query @@ -942,6 +854,65 @@ struct 1 days +-- !query +select interval '2-2\t' year to month +-- !query schema +struct +-- !query output +2 years 2 months + + +-- !query +select interval '-\t2-2\t' year to month +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Interval string does not match year-month format of 'y-m': - 2-2 (line 1, pos 16) + +== SQL == +select interval '-\t2-2\t' year to month +----------------^^^ + + +-- !query +select interval '\n0 12:34:46.789\t' day to second +-- !query schema +struct +-- !query output +12 hours 34 minutes 46.789 seconds + + +-- !query +select interval '\n-\t10\t 12:34:46.789\t' day to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': +- 10 12:34:46.789 , set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '\n-\t10\t 12:34:46.789\t' day to second +----------------^^^ + + +-- !query +select interval '中文 interval 1 day' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7) + +== SQL == +select interval '中文 interval 1 day' +-------^^^ + + -- !query select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) -- !query schema @@ -988,32 +959,158 @@ integer overflow -- !query -SELECT from_csv('1, 1 day', 'a INT, b interval') +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 
minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval') +-- !query schema +struct,to_csv(from_csv(1, 1 day)):string,to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes')):string,from_csv(to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes'))):struct> +-- !query output +{"a":1,"b":1 days} 1,1 days 2 years 8 months,1 hours 10 minutes {"a":2 years 8 months,"b":1 hours 10 minutes} + + +-- !query +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval') +-- !query schema +struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> +-- !query output +{"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} + + +-- !query +select interval '+' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +(line 1, pos 7) + +== SQL == +select interval '+' +-------^^^ + + +-- !query +select interval '+.' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +.(line 1, pos 7) + +== SQL == +select interval '+.' +-------^^^ + + +-- !query +select interval '1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1(line 1, pos 7) + +== SQL == +select interval '1' +-------^^^ + + +-- !query +select interval '1.2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1.2(line 1, pos 7) + +== SQL == +select interval '1.2' +-------^^^ + + +-- !query +select interval '- 2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: - 2(line 1, pos 7) + +== SQL == +select interval '- 2' +-------^^^ + + +-- !query +select interval '1 day -' -- !query schema -struct> +struct<> -- !query output -{"a":1,"b":1 days} +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day -(line 1, pos 7) + +== SQL == +select interval '1 day -' +-------^^^ -- !query -SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)) +select interval '1 day 1' -- !query schema -struct +struct<> -- !query output -2 years 8 months,1 hours 10 minutes +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day 1(line 1, pos 7) + +== SQL == +select interval '1 day 1' +-------^^^ + + +-- !query +select interval '1 day 2' day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only use numbers in the interval value part for multiple unit value pairs interval form, but got invalid value: 1 day 2(line 1, pos 16) + +== SQL == +select interval '1 day 2' day +----------------^^^ -- !query -SELECT from_json('{"a":"1 days"}', 'a interval') +select interval 'interval 1' day -- !query schema -struct> +struct<> -- !query output -{"a":1 days} 
+org.apache.spark.sql.catalyst.parser.ParseException + +Can only use numbers in the interval value part for multiple unit value pairs interval form, but got invalid value: interval 1(line 1, pos 16) + +== SQL == +select interval 'interval 1' day +----------------^^^ -- !query -SELECT to_json(map('a', interval 25 month 100 day 130 minute)) +select interval '-\t 1' day -- !query schema -struct +struct -- !query output -{"a":"2 years 1 months 100 days 2 hours 10 minutes"} +-1 days diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out new file mode 100644 index 0000000000000..dfdf183233b42 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -0,0 +1,296 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 36 + + +-- !query +select concat_ws() +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +requirement failed: concat_ws requires at least one argument.; line 1 pos 7 + + +-- !query +select format_string() +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +requirement failed: format_string() should take at least 1 argument; line 1 pos 7 + + +-- !query +select 'a' || 'b' || 'c' +-- !query schema +struct +-- !query output +abc + + +-- !query +select replace('abc', 'b', '123') +-- !query schema +struct +-- !query output +a123c + + +-- !query +select replace('abc', 'b') +-- !query schema +struct +-- !query output +ac + + +-- !query +select length(uuid()), (uuid() <> uuid()) +-- !query schema +struct +-- !query output +36 true + + +-- !query +select position('bar' in 'foobarbar'), position(null, 'foobarbar'), position('aaads', null) +-- !query schema +struct +-- !query output +4 NULL NULL + + +-- !query +select left("abcd", 2), left("abcd", 5), left("abcd", '2'), left("abcd", null) +-- !query schema +struct +-- !query output +ab abcd ab NULL + + +-- !query +select left(null, -2), left("abcd", -2), left("abcd", 0), left("abcd", 'a') +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: a + + +-- !query +select right("abcd", 2), right("abcd", 5), right("abcd", '2'), right("abcd", null) +-- !query schema +struct +-- !query output +cd abcd cd NULL + + +-- !query +select right(null, -2), right("abcd", -2), right("abcd", 0), right("abcd", 'a') +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: a + + +-- !query +SELECT split('aa1cc2ee3', '[1-9]+') +-- !query schema +struct> +-- !query output +["aa","cc","ee",""] + + +-- !query +SELECT split('aa1cc2ee3', '[1-9]+', 2) +-- !query schema +struct> +-- !query output +["aa","cc2ee3"] + + +-- !query +SELECT substr('Spark SQL', 5) +-- !query schema +struct +-- !query output +k SQL + + +-- !query +SELECT substr('Spark SQL', -3) +-- !query schema +struct +-- !query output +SQL + + +-- !query +SELECT substr('Spark SQL', 5, 1) +-- !query schema +struct +-- !query output +k + + +-- !query +SELECT substr('Spark SQL' from 5) +-- !query schema +struct +-- !query output +k SQL + + +-- !query +SELECT substr('Spark SQL' from -3) +-- !query schema +struct +-- !query output +SQL + + +-- !query +SELECT substr('Spark SQL' from 5 for 1) +-- !query schema +struct +-- !query output +k + + +-- !query +SELECT substring('Spark SQL', 5) +-- !query schema +struct +-- !query output +k SQL + + +-- !query 
+SELECT substring('Spark SQL', -3) +-- !query schema +struct +-- !query output +SQL + + +-- !query +SELECT substring('Spark SQL', 5, 1) +-- !query schema +struct +-- !query output +k + + +-- !query +SELECT substring('Spark SQL' from 5) +-- !query schema +struct +-- !query output +k SQL + + +-- !query +SELECT substring('Spark SQL' from -3) +-- !query schema +struct +-- !query output +SQL + + +-- !query +SELECT substring('Spark SQL' from 5 for 1) +-- !query schema +struct +-- !query output +k + + +-- !query +SELECT trim(" xyz "), ltrim(" xyz "), rtrim(" xyz ") +-- !query schema +struct +-- !query output +xyz xyz xyz + + +-- !query +SELECT trim(BOTH 'xyz' FROM 'yxTomxx'), trim('xyz' FROM 'yxTomxx') +-- !query schema +struct +-- !query output +Tom Tom + + +-- !query +SELECT trim(BOTH 'x' FROM 'xxxbarxxx'), trim('x' FROM 'xxxbarxxx') +-- !query schema +struct +-- !query output +bar bar + + +-- !query +SELECT trim(LEADING 'xyz' FROM 'zzzytest') +-- !query schema +struct +-- !query output +test + + +-- !query +SELECT trim(LEADING 'xyz' FROM 'zzzytestxyz') +-- !query schema +struct +-- !query output +testxyz + + +-- !query +SELECT trim(LEADING 'xy' FROM 'xyxXxyLAST WORD') +-- !query schema +struct +-- !query output +XxyLAST WORD + + +-- !query +SELECT trim(TRAILING 'xyz' FROM 'testxxzx') +-- !query schema +struct +-- !query output +test + + +-- !query +SELECT trim(TRAILING 'xyz' FROM 'xyztestxxzx') +-- !query schema +struct +-- !query output +xyztest + + +-- !query +SELECT trim(TRAILING 'xy' FROM 'TURNERyxXxy') +-- !query schema +struct +-- !query output +TURNERyxX + + +-- !query +SELECT lpad('hi', 'invalid_length') +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: invalid_length + + +-- !query +SELECT rpad('hi', 'invalid_length') +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: invalid_length diff --git a/sql/core/src/test/resources/sql-tests/results/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/cast.sql.out index 35b4c0e79720b..d4872ca03199b 100644 --- a/sql/core/src/test/resources/sql-tests/results/cast.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cast.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 46 +-- Number of queries: 51 -- !query @@ -353,6 +353,46 @@ struct 1 +-- !query +select cast('1中文' as tinyint) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('1中文' as smallint) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('1中文' as INT) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('中文1' as bigint) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('1中文' as bigint) +-- !query schema +struct +-- !query output +NULL + + -- !query select cast('\t\t true \n\r ' as boolean) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out index 5bb00e028c4b7..b1a32ad1f63e9 100644 --- a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 25 +-- Number of queries: 28 -- !query @@ -27,7 +27,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -Operation not allowed: ALTER 
TABLE table CHANGE COLUMN requires a TYPE or a COMMENT or a FIRST/AFTER(line 1, pos 0) +Operation not allowed: ALTER TABLE table CHANGE COLUMN requires a TYPE, a SET/DROP, a COMMENT, or a FIRST/AFTER(line 1, pos 0) == SQL == ALTER TABLE test_change CHANGE a @@ -83,7 +83,7 @@ c int -- !query -ALTER TABLE test_change CHANGE a TYPE INT AFTER b +ALTER TABLE test_change CHANGE a AFTER b -- !query schema struct<> -- !query output @@ -92,7 +92,7 @@ ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables.; -- !query -ALTER TABLE test_change CHANGE b TYPE STRING FIRST +ALTER TABLE test_change CHANGE b FIRST -- !query schema struct<> -- !query output @@ -111,7 +111,7 @@ c int -- !query -ALTER TABLE test_change CHANGE a TYPE INT COMMENT 'this is column a' +ALTER TABLE test_change CHANGE a COMMENT 'this is column a' -- !query schema struct<> -- !query output @@ -119,7 +119,7 @@ struct<> -- !query -ALTER TABLE test_change CHANGE b TYPE STRING COMMENT '#*02?`' +ALTER TABLE test_change CHANGE b COMMENT '#*02?`' -- !query schema struct<> -- !query output @@ -127,7 +127,7 @@ struct<> -- !query -ALTER TABLE test_change CHANGE c TYPE INT COMMENT '' +ALTER TABLE test_change CHANGE c COMMENT '' -- !query schema struct<> -- !query output @@ -145,7 +145,15 @@ c int -- !query -ALTER TABLE test_change CHANGE a TYPE INT COMMENT 'this is column a' +ALTER TABLE test_change CHANGE a TYPE INT +-- !query schema +struct<> +-- !query output + + + +-- !query +ALTER TABLE test_change CHANGE a COMMENT 'this is column a' -- !query schema struct<> -- !query output @@ -181,6 +189,24 @@ b string #*02?` c int +-- !query +ALTER TABLE test_change CHANGE A COMMENT 'case insensitivity' +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC test_change +-- !query schema +struct +-- !query output +a int case insensitivity +b string #*02?` +c int + + -- !query CREATE TEMPORARY VIEW temp_view(a, b) AS SELECT 1, "one" -- !query schema @@ -190,7 +216,7 @@ struct<> -- !query -ALTER TABLE temp_view CHANGE a TYPE INT COMMENT 'this is column a' +ALTER TABLE temp_view CHANGE a TYPE INT -- !query schema struct<> -- !query output @@ -207,7 +233,7 @@ struct<> -- !query -ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT COMMENT 'this is column a' +ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT -- !query schema struct<> -- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out index f34b75a379aae..04ddfe0ac128c 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out @@ -72,7 +72,7 @@ SELECT i1 FROM t1, mydb1.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: mydb1.t1.i1, mydb1.t1.i1.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: spark_catalog.mydb1.t1.i1, spark_catalog.mydb1.t1.i1.; line 1 pos 7 -- !query @@ -81,7 +81,7 @@ SELECT t1.i1 FROM t1, mydb1.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: mydb1.t1.i1, mydb1.t1.i1.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: spark_catalog.mydb1.t1.i1, spark_catalog.mydb1.t1.i1.; line 1 pos 7 -- !query @@ -90,7 +90,7 @@ SELECT mydb1.t1.i1 FROM t1, mydb1.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'mydb1.t1.i1' is ambiguous, could be: 
mydb1.t1.i1, mydb1.t1.i1.; line 1 pos 7 +Reference 'mydb1.t1.i1' is ambiguous, could be: spark_catalog.mydb1.t1.i1, spark_catalog.mydb1.t1.i1.; line 1 pos 7 -- !query @@ -99,7 +99,7 @@ SELECT i1 FROM t1, mydb2.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: mydb1.t1.i1, mydb2.t1.i1.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: spark_catalog.mydb1.t1.i1, spark_catalog.mydb2.t1.i1.; line 1 pos 7 -- !query @@ -108,7 +108,7 @@ SELECT t1.i1 FROM t1, mydb2.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: mydb1.t1.i1, mydb2.t1.i1.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: spark_catalog.mydb1.t1.i1, spark_catalog.mydb2.t1.i1.; line 1 pos 7 -- !query @@ -125,7 +125,7 @@ SELECT i1 FROM t1, mydb1.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: mydb2.t1.i1, mydb1.t1.i1.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: spark_catalog.mydb2.t1.i1, spark_catalog.mydb1.t1.i1.; line 1 pos 7 -- !query @@ -134,7 +134,7 @@ SELECT t1.i1 FROM t1, mydb1.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: mydb2.t1.i1, mydb1.t1.i1.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: spark_catalog.mydb2.t1.i1, spark_catalog.mydb1.t1.i1.; line 1 pos 7 -- !query @@ -143,7 +143,7 @@ SELECT i1 FROM t1, mydb2.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'i1' is ambiguous, could be: mydb2.t1.i1, mydb2.t1.i1.; line 1 pos 7 +Reference 'i1' is ambiguous, could be: spark_catalog.mydb2.t1.i1, spark_catalog.mydb2.t1.i1.; line 1 pos 7 -- !query @@ -152,7 +152,7 @@ SELECT t1.i1 FROM t1, mydb2.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 't1.i1' is ambiguous, could be: mydb2.t1.i1, mydb2.t1.i1.; line 1 pos 7 +Reference 't1.i1' is ambiguous, could be: spark_catalog.mydb2.t1.i1, spark_catalog.mydb2.t1.i1.; line 1 pos 7 -- !query @@ -161,7 +161,7 @@ SELECT db1.t1.i1 FROM t1, mydb2.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`db1.t1.i1`' given input columns: [mydb2.t1.i1, mydb2.t1.i1]; line 1 pos 7 +cannot resolve '`db1.t1.i1`' given input columns: [spark_catalog.mydb2.t1.i1, spark_catalog.mydb2.t1.i1]; line 1 pos 7 -- !query @@ -186,7 +186,7 @@ SELECT mydb1.t1 FROM t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1`' given input columns: [mydb1.t1.i1]; line 1 pos 7 +cannot resolve '`mydb1.t1`' given input columns: [spark_catalog.mydb1.t1.i1]; line 1 pos 7 -- !query @@ -204,7 +204,7 @@ SELECT t1 FROM mydb1.t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`t1`' given input columns: [mydb1.t1.i1]; line 1 pos 7 +cannot resolve '`t1`' given input columns: [spark_catalog.mydb1.t1.i1]; line 1 pos 7 -- !query @@ -221,7 +221,7 @@ SELECT mydb1.t1.i1 FROM t1 struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`mydb1.t1.i1`' given input columns: [mydb2.t1.i1]; line 1 pos 7 +cannot resolve '`mydb1.t1.i1`' given input columns: [spark_catalog.mydb2.t1.i1]; line 1 pos 7 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out index a9709c4a79793..4d0e5ea829d3f 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out @@ -1,29 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 17 - - --- !query -create temporary view t as select * from values 0, 1, 2 as t(id) --- !query schema -struct<> --- !query output - - - --- !query -create temporary view t2 as select * from values 0, 1 as t(id) --- !query schema -struct<> --- !query output - - - --- !query -SET spark.sql.legacy.ctePrecedence.enabled=true --- !query schema -struct --- !query output -spark.sql.legacy.ctePrecedence.enabled true +-- Number of queries: 16 -- !query @@ -193,16 +169,53 @@ struct -- !query -DROP VIEW IF EXISTS t +WITH t(c) AS (SELECT 1) +SELECT * FROM t +WHERE c IN ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) -- !query schema -struct<> +struct -- !query output +1 +-- !query +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + -- !query -DROP VIEW IF EXISTS t2 +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t -- !query schema -struct<> +struct<1:int> -- !query output +1 + +-- !query +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +) +-- !query schema +struct +-- !query output +1 diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out new file mode 100644 index 0000000000000..2f736c7b4978f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out @@ -0,0 +1,229 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 16 + + +-- !query +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2 +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. 
See more details in SPARK-28228.; + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT * FROM t +WHERE c IN ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. 
See more details in SPARK-28228.; diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out new file mode 100644 index 0000000000000..74394ee3ffc89 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -0,0 +1,221 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 16 + + +-- !query +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2 +-- !query schema +struct<2:int> +-- !query output +2 + + +-- !query +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2 +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2 +-- !query schema +struct<3:int> +-- !query output +3 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT * FROM t +WHERE c IN ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output + + + +-- !query +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t +-- !query schema +struct<2:int> +-- !query output +2 + + +-- !query +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +) +-- !query schema +struct +-- !query output +2 diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index 2d87781193c25..b8f666586ce45 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 27 +-- Number of queries: 15 -- !query @@ -161,172 +161,6 @@ WITH SELECT * FROM t 
--- !query -WITH t as ( - WITH t2 AS (SELECT 1) - SELECT * FROM t2 -) -SELECT * FROM t --- !query schema -struct<1:int> --- !query output -1 - - --- !query -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 1) - SELECT * FROM t -) --- !query schema -struct --- !query output -1 - - --- !query -SELECT ( - WITH t AS (SELECT 1) - SELECT * FROM t -) --- !query schema -struct --- !query output -1 - - --- !query -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -SELECT * FROM t2 --- !query schema -struct<2:int> --- !query output -2 - - --- !query -WITH - t(c) AS (SELECT 1), - t2 AS ( - SELECT ( - SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) - ) - ) -SELECT * FROM t2 --- !query schema -struct --- !query output -2 - - --- !query -WITH - t AS (SELECT 1), - t2 AS ( - WITH t AS (SELECT 2), - t2 AS ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) - SELECT * FROM t2 - ) -SELECT * FROM t2 --- !query schema -struct<3:int> --- !query output -3 - - --- !query -WITH t(c) AS (SELECT 1) -SELECT max(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t -) --- !query schema -struct --- !query output -2 - - --- !query -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 2) - SELECT * FROM t - ) -) --- !query schema -struct --- !query output -2 - - --- !query -WITH t(c) AS (SELECT 1) -SELECT sum(c) FROM ( - WITH t(c) AS (SELECT 2) - SELECT max(c) AS c FROM ( - WITH t(c) AS (SELECT 3) - SELECT * FROM t - ) -) --- !query schema -struct --- !query output -3 - - --- !query -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t -) --- !query schema -struct --- !query output -2 - - --- !query -WITH t AS (SELECT 1) -SELECT ( - SELECT ( - WITH t AS (SELECT 2) - SELECT * FROM t - ) -) --- !query schema -struct --- !query output -2 - - --- !query -WITH t AS (SELECT 1) -SELECT ( - WITH t AS (SELECT 2) - SELECT ( - WITH t AS (SELECT 3) - SELECT * FROM t - ) -) --- !query schema -struct --- !query output -3 - - -- !query DROP VIEW IF EXISTS t -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/date_part.sql.out b/sql/core/src/test/resources/sql-tests/results/date_part.sql.out deleted file mode 100644 index b4cceedffd98b..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/date_part.sql.out +++ /dev/null @@ -1,886 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 110 - - --- !query -CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c --- !query schema -struct<> --- !query output - - - --- !query -select date_part('millennium', c) from t --- !query schema -struct --- !query output -3 - - --- !query -select date_part('millennia', c) from t --- !query schema -struct --- !query output -3 - - --- !query -select date_part('mil', c) from t --- !query schema -struct --- !query output -3 - - --- !query -select date_part('mils', c) from t --- !query schema -struct --- !query output -3 - - --- !query -select date_part('century', c) from t --- !query schema -struct --- !query output -21 - - --- !query -select date_part('centuries', c) from t --- !query schema -struct --- !query output -21 - - --- !query -select date_part('c', c) from t --- !query schema -struct --- !query output -21 - - --- !query -select date_part('cent', c) from t --- !query schema -struct --- !query output -21 - - --- !query -select date_part('decade', c) from t --- !query schema -struct --- !query output -201 - - --- !query -select date_part('decades', c) from t --- !query 
schema -struct --- !query output -201 - - --- !query -select date_part('dec', c) from t --- !query schema -struct --- !query output -201 - - --- !query -select date_part('decs', c) from t --- !query schema -struct --- !query output -201 - - --- !query -select date_part('year', c) from t --- !query schema -struct --- !query output -2011 - - --- !query -select date_part('y', c) from t --- !query schema -struct --- !query output -2011 - - --- !query -select date_part('years', c) from t --- !query schema -struct --- !query output -2011 - - --- !query -select date_part('yr', c) from t --- !query schema -struct --- !query output -2011 - - --- !query -select date_part('yrs', c) from t --- !query schema -struct --- !query output -2011 - - --- !query -select date_part('quarter', c) from t --- !query schema -struct --- !query output -2 - - --- !query -select date_part('qtr', c) from t --- !query schema -struct --- !query output -2 - - --- !query -select date_part('month', c) from t --- !query schema -struct --- !query output -5 - - --- !query -select date_part('mon', c) from t --- !query schema -struct --- !query output -5 - - --- !query -select date_part('mons', c) from t --- !query schema -struct --- !query output -5 - - --- !query -select date_part('months', c) from t --- !query schema -struct --- !query output -5 - - --- !query -select date_part('week', c) from t --- !query schema -struct --- !query output -18 - - --- !query -select date_part('w', c) from t --- !query schema -struct --- !query output -18 - - --- !query -select date_part('weeks', c) from t --- !query schema -struct --- !query output -18 - - --- !query -select date_part('day', c) from t --- !query schema -struct --- !query output -6 - - --- !query -select date_part('d', c) from t --- !query schema -struct --- !query output -6 - - --- !query -select date_part('days', c) from t --- !query schema -struct --- !query output -6 - - --- !query -select date_part('dayofweek', c) from t --- !query schema -struct --- !query output -6 - - --- !query -select date_part('dow', c) from t --- !query schema -struct --- !query output -5 - - --- !query -select date_part('isodow', c) from t --- !query schema -struct --- !query output -5 - - --- !query -select date_part('doy', c) from t --- !query schema -struct --- !query output -126 - - --- !query -select date_part('hour', c) from t --- !query schema -struct --- !query output -7 - - --- !query -select date_part('h', c) from t --- !query schema -struct --- !query output -7 - - --- !query -select date_part('hours', c) from t --- !query schema -struct --- !query output -7 - - --- !query -select date_part('hr', c) from t --- !query schema -struct --- !query output -7 - - --- !query -select date_part('hrs', c) from t --- !query schema -struct --- !query output -7 - - --- !query -select date_part('minute', c) from t --- !query schema -struct --- !query output -8 - - --- !query -select date_part('m', c) from t --- !query schema -struct --- !query output -8 - - --- !query -select date_part('min', c) from t --- !query schema -struct --- !query output -8 - - --- !query -select date_part('mins', c) from t --- !query schema -struct --- !query output -8 - - --- !query -select date_part('minutes', c) from t --- !query schema -struct --- !query output -8 - - --- !query -select date_part('second', c) from t --- !query schema -struct --- !query output -9.123456 - - --- !query -select date_part('s', c) from t --- !query schema -struct --- !query output -9.123456 - - --- !query -select date_part('sec', c) from t --- 
!query schema -struct --- !query output -9.123456 - - --- !query -select date_part('seconds', c) from t --- !query schema -struct --- !query output -9.123456 - - --- !query -select date_part('secs', c) from t --- !query schema -struct --- !query output -9.123456 - - --- !query -select date_part('not_supported', c) from t --- !query schema -struct<> --- !query output -org.apache.spark.sql.AnalysisException -Literals of type 'not_supported' are currently not supported for the string type.;; line 1 pos 7 - - --- !query -select date_part(c, c) from t --- !query schema -struct<> --- !query output -org.apache.spark.sql.AnalysisException -The field parameter needs to be a foldable string value.;; line 1 pos 7 - - --- !query -select date_part(null, c) from t --- !query schema -struct --- !query output -NULL - - --- !query -CREATE TEMPORARY VIEW t2 AS select interval 1010 year 9 month 8 day 7 hour 6 minute 5 second 4 millisecond 3 microsecond as c --- !query schema -struct<> --- !query output - - - --- !query -select date_part('millennium', c) from t2 --- !query schema -struct --- !query output -1 - - --- !query -select date_part('millennia', c) from t2 --- !query schema -struct --- !query output -1 - - --- !query -select date_part('mil', c) from t2 --- !query schema -struct --- !query output -1 - - --- !query -select date_part('mils', c) from t2 --- !query schema -struct --- !query output -1 - - --- !query -select date_part('century', c) from t2 --- !query schema -struct --- !query output -10 - - --- !query -select date_part('centuries', c) from t2 --- !query schema -struct --- !query output -10 - - --- !query -select date_part('c', c) from t2 --- !query schema -struct --- !query output -10 - - --- !query -select date_part('cent', c) from t2 --- !query schema -struct --- !query output -10 - - --- !query -select date_part('decade', c) from t2 --- !query schema -struct --- !query output -101 - - --- !query -select date_part('decades', c) from t2 --- !query schema -struct --- !query output -101 - - --- !query -select date_part('dec', c) from t2 --- !query schema -struct --- !query output -101 - - --- !query -select date_part('decs', c) from t2 --- !query schema -struct --- !query output -101 - - --- !query -select date_part('year', c) from t2 --- !query schema -struct --- !query output -1010 - - --- !query -select date_part('y', c) from t2 --- !query schema -struct --- !query output -1010 - - --- !query -select date_part('years', c) from t2 --- !query schema -struct --- !query output -1010 - - --- !query -select date_part('yr', c) from t2 --- !query schema -struct --- !query output -1010 - - --- !query -select date_part('yrs', c) from t2 --- !query schema -struct --- !query output -1010 - - --- !query -select date_part('quarter', c) from t2 --- !query schema -struct --- !query output -4 - - --- !query -select date_part('qtr', c) from t2 --- !query schema -struct --- !query output -4 - - --- !query -select date_part('month', c) from t2 --- !query schema -struct --- !query output -9 - - --- !query -select date_part('mon', c) from t2 --- !query schema -struct --- !query output -9 - - --- !query -select date_part('mons', c) from t2 --- !query schema -struct --- !query output -9 - - --- !query -select date_part('months', c) from t2 --- !query schema -struct --- !query output -9 - - --- !query -select date_part('day', c) from t2 --- !query schema -struct --- !query output -8 - - --- !query -select date_part('d', c) from t2 --- !query schema -struct --- !query output -8 - - --- !query -select 
date_part('days', c) from t2 --- !query schema -struct --- !query output -8 - - --- !query -select date_part('hour', c) from t2 --- !query schema -struct --- !query output -7 - - --- !query -select date_part('h', c) from t2 --- !query schema -struct --- !query output -7 - - --- !query -select date_part('hours', c) from t2 --- !query schema -struct --- !query output -7 - - --- !query -select date_part('hr', c) from t2 --- !query schema -struct --- !query output -7 - - --- !query -select date_part('hrs', c) from t2 --- !query schema -struct --- !query output -7 - - --- !query -select date_part('minute', c) from t2 --- !query schema -struct --- !query output -6 - - --- !query -select date_part('m', c) from t2 --- !query schema -struct --- !query output -6 - - --- !query -select date_part('min', c) from t2 --- !query schema -struct --- !query output -6 - - --- !query -select date_part('mins', c) from t2 --- !query schema -struct --- !query output -6 - - --- !query -select date_part('minutes', c) from t2 --- !query schema -struct --- !query output -6 - - --- !query -select date_part('second', c) from t2 --- !query schema -struct --- !query output -5.004003 - - --- !query -select date_part('s', c) from t2 --- !query schema -struct --- !query output -5.004003 - - --- !query -select date_part('sec', c) from t2 --- !query schema -struct --- !query output -5.004003 - - --- !query -select date_part('seconds', c) from t2 --- !query schema -struct --- !query output -5.004003 - - --- !query -select date_part('secs', c) from t2 --- !query schema -struct --- !query output -5.004003 - - --- !query -select date_part('milliseconds', c) from t2 --- !query schema -struct --- !query output -5004.003 - - --- !query -select date_part('msec', c) from t2 --- !query schema -struct --- !query output -5004.003 - - --- !query -select date_part('msecs', c) from t2 --- !query schema -struct --- !query output -5004.003 - - --- !query -select date_part('millisecon', c) from t2 --- !query schema -struct --- !query output -5004.003 - - --- !query -select date_part('mseconds', c) from t2 --- !query schema -struct --- !query output -5004.003 - - --- !query -select date_part('ms', c) from t2 --- !query schema -struct --- !query output -5004.003 - - --- !query -select date_part('microseconds', c) from t2 --- !query schema -struct --- !query output -5004003 - - --- !query -select date_part('usec', c) from t2 --- !query schema -struct --- !query output -5004003 - - --- !query -select date_part('usecs', c) from t2 --- !query schema -struct --- !query output -5004003 - - --- !query -select date_part('useconds', c) from t2 --- !query schema -struct --- !query output -5004003 - - --- !query -select date_part('microsecon', c) from t2 --- !query schema -struct --- !query output -5004003 - - --- !query -select date_part('us', c) from t2 --- !query schema -struct --- !query output -5004003 - - --- !query -select date_part('epoch', c) from t2 --- !query schema -struct --- !query output -31897220765.004003 - - --- !query -select date_part('not_supported', c) from t2 --- !query schema -struct<> --- !query output -org.apache.spark.sql.AnalysisException -Literals of type 'not_supported' are currently not supported for the interval type.;; line 1 pos 7 - - --- !query -select date_part(c, c) from t2 --- !query schema -struct<> --- !query output -org.apache.spark.sql.AnalysisException -The field parameter needs to be a foldable string value.;; line 1 pos 7 - - --- !query -select date_part(null, c) from t2 --- !query schema -struct --- !query 
output -NULL diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-formatting-invalid.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-formatting-invalid.sql.out new file mode 100644 index 0000000000000..18d1a10068794 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/datetime-formatting-invalid.sql.out @@ -0,0 +1,335 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 37 + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'GGGGG') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'GGGGG' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'yyyyyyy') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'qqqqq') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: q + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'QQQQQ') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: Q + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'MMMMM') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMMMM' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'LLLLL') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'LLLLL' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'FF') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'FF' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'ddd') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'ddd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'DDDD') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'DDDD' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'HHH') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'HHH' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'hhh') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'hhh' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'kkk') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'kkk' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'KKK') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'KKK' pattern in the DateTimeFormatter. 
1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'mmm') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'mmm' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'sss') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'sss' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'SSSSSSSSSS') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'SSSSSSSSSS' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'aa') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'V') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Pattern letter count must be 2: V + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'zzzzz') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'zzzzz' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'XXXXXX') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: X + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'ZZZZZZ') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'ZZZZZZ' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'OO') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Pattern letter count must be 1 or 4: O + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'xxxxxx') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: x + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'A') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character: A + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'n') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character: n + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'N') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character: N + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'p') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character: p + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'Y') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'Y' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'w') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'w' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'W') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'W' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'u') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'u' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'e') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +All week-based patterns are unsupported since Spark 3.0, detected: e, Please use the SQL function EXTRACT instead + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'c') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +All week-based patterns are unsupported since Spark 3.0, detected: c, Please use the SQL function EXTRACT instead + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'B') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Unknown pattern letter: B + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'C') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Unknown pattern letter: C + + +-- !query +select date_format('2018-11-17 13:33:33.333', 'I') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Unknown pattern letter: I diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-formatting-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-formatting-legacy.sql.out new file mode 100644 index 0000000000000..b37922b20807d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/datetime-formatting-legacy.sql.out @@ -0,0 +1,401 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 31 + + +-- !query +create temporary view v as select col from values + (timestamp '1582-06-01 11:33:33.123UTC+080000'), + (timestamp '1970-01-01 00:00:00.000Europe/Paris'), + (timestamp '1970-12-31 23:59:59.999Asia/Srednekolymsk'), + (timestamp '1996-04-01 00:33:33.123Australia/Darwin'), + (timestamp '2018-11-17 13:33:33.123Z'), + (timestamp '2020-01-01 01:33:33.123Asia/Shanghai'), + (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col) +-- !query schema +struct<> +-- !query output + + + +-- !query +select col, date_format(col, 'G GG GGG GGGG') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 AD AD AD AD +1969-12-31 15:00:00 AD AD AD AD +1970-12-31 04:59:59.999 AD AD AD AD +1996-03-31 07:03:33.123 AD AD AD AD +2018-11-17 05:33:33.123 AD AD AD AD +2019-12-31 09:33:33.123 AD AD AD AD +2100-01-01 01:33:33.123 AD AD AD AD + + +-- !query +select col, date_format(col, 'y yy yyy yyyy yyyyy yyyyyy') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 1582 82 1582 1582 01582 001582 +1969-12-31 15:00:00 1969 69 1969 1969 01969 001969 +1970-12-31 04:59:59.999 1970 70 1970 1970 01970 001970 +1996-03-31 07:03:33.123 1996 96 1996 1996 01996 001996 +2018-11-17 05:33:33.123 2018 18 2018 2018 02018 002018 +2019-12-31 09:33:33.123 2019 19 2019 2019 02019 002019 +2100-01-01 01:33:33.123 2100 00 2100 2100 02100 002100 + + +-- !query +select col, date_format(col, 'q qq') from v +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'q' + + +-- !query +select col, date_format(col, 'Q QQ QQQ QQQQ') from v +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'Q' + + +-- !query +select col, date_format(col, 'M MM MMM MMMM') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 5 05 May May +1969-12-31 15:00:00 12 12 Dec December +1970-12-31 
04:59:59.999 12 12 Dec December +1996-03-31 07:03:33.123 3 03 Mar March +2018-11-17 05:33:33.123 11 11 Nov November +2019-12-31 09:33:33.123 12 12 Dec December +2100-01-01 01:33:33.123 1 01 Jan January + + +-- !query +select col, date_format(col, 'L LL') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 5 05 +1969-12-31 15:00:00 12 12 +1970-12-31 04:59:59.999 12 12 +1996-03-31 07:03:33.123 3 03 +2018-11-17 05:33:33.123 11 11 +2019-12-31 09:33:33.123 12 12 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'E EE EEE EEEE') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 Thu Thu Thu Thursday +1969-12-31 15:00:00 Wed Wed Wed Wednesday +1970-12-31 04:59:59.999 Thu Thu Thu Thursday +1996-03-31 07:03:33.123 Sun Sun Sun Sunday +2018-11-17 05:33:33.123 Sat Sat Sat Saturday +2019-12-31 09:33:33.123 Tue Tue Tue Tuesday +2100-01-01 01:33:33.123 Fri Fri Fri Friday + + +-- !query +select col, date_format(col, 'F') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 5 +1969-12-31 15:00:00 5 +1970-12-31 04:59:59.999 5 +1996-03-31 07:03:33.123 5 +2018-11-17 05:33:33.123 3 +2019-12-31 09:33:33.123 5 +2100-01-01 01:33:33.123 1 + + +-- !query +select col, date_format(col, 'd dd') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 31 31 +1969-12-31 15:00:00 31 31 +1970-12-31 04:59:59.999 31 31 +1996-03-31 07:03:33.123 31 31 +2018-11-17 05:33:33.123 17 17 +2019-12-31 09:33:33.123 31 31 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'DD') from v where col = timestamp '2100-01-01 01:33:33.123America/Los_Angeles' +-- !query schema +struct +-- !query output +2100-01-01 01:33:33.123 01 + + +-- !query +select col, date_format(col, 'D DDD') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 151 151 +1969-12-31 15:00:00 365 365 +1970-12-31 04:59:59.999 365 365 +1996-03-31 07:03:33.123 91 091 +2018-11-17 05:33:33.123 321 321 +2019-12-31 09:33:33.123 365 365 +2100-01-01 01:33:33.123 1 001 + + +-- !query +select col, date_format(col, 'H HH') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 19 19 +1969-12-31 15:00:00 15 15 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'h hh') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 7 07 +1969-12-31 15:00:00 3 03 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'k kk') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 19 19 +1969-12-31 15:00:00 15 15 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'K KK') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 7 07 +1969-12-31 15:00:00 3 03 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'm mm') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 40 40 +1969-12-31 15:00:00 0 00 +1970-12-31 04:59:59.999 59 59 +1996-03-31 07:03:33.123 3 03 +2018-11-17 
05:33:33.123 33 33 +2019-12-31 09:33:33.123 33 33 +2100-01-01 01:33:33.123 33 33 + + +-- !query +select col, date_format(col, 's ss') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 35 35 +1969-12-31 15:00:00 0 00 +1970-12-31 04:59:59.999 59 59 +1996-03-31 07:03:33.123 33 33 +2018-11-17 05:33:33.123 33 33 +2019-12-31 09:33:33.123 33 33 +2100-01-01 01:33:33.123 33 33 + + +-- !query +select col, date_format(col, 'S SS SSS SSSS SSSSS SSSSSS SSSSSSS SSSSSSSS SSSSSSSSS') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 123 123 123 0123 00123 000123 0000123 00000123 000000123 +1969-12-31 15:00:00 0 00 000 0000 00000 000000 0000000 00000000 000000000 +1970-12-31 04:59:59.999 999 999 999 0999 00999 000999 0000999 00000999 000000999 +1996-03-31 07:03:33.123 123 123 123 0123 00123 000123 0000123 00000123 000000123 +2018-11-17 05:33:33.123 123 123 123 0123 00123 000123 0000123 00000123 000000123 +2019-12-31 09:33:33.123 123 123 123 0123 00123 000123 0000123 00000123 000000123 +2100-01-01 01:33:33.123 123 123 123 0123 00123 000123 0000123 00000123 000000123 + + +-- !query +select col, date_format(col, 'a') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 PM +1969-12-31 15:00:00 PM +1970-12-31 04:59:59.999 AM +1996-03-31 07:03:33.123 AM +2018-11-17 05:33:33.123 AM +2019-12-31 09:33:33.123 AM +2100-01-01 01:33:33.123 AM + + +-- !query +select col, date_format(col, 'VV') from v +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'V' + + +-- !query +select col, date_format(col, 'z zz zzz zzzz') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 PST PST PST Pacific Standard Time +1969-12-31 15:00:00 PST PST PST Pacific Standard Time +1970-12-31 04:59:59.999 PST PST PST Pacific Standard Time +1996-03-31 07:03:33.123 PST PST PST Pacific Standard Time +2018-11-17 05:33:33.123 PST PST PST Pacific Standard Time +2019-12-31 09:33:33.123 PST PST PST Pacific Standard Time +2100-01-01 01:33:33.123 PST PST PST Pacific Standard Time + + +-- !query +select col, date_format(col, 'X XX XXX') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 -08 -0800 -08:00 +1969-12-31 15:00:00 -08 -0800 -08:00 +1970-12-31 04:59:59.999 -08 -0800 -08:00 +1996-03-31 07:03:33.123 -08 -0800 -08:00 +2018-11-17 05:33:33.123 -08 -0800 -08:00 +2019-12-31 09:33:33.123 -08 -0800 -08:00 +2100-01-01 01:33:33.123 -08 -0800 -08:00 + + +-- !query +select col, date_format(col, 'XXXX XXXXX') from v +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +invalid ISO 8601 format: length=4 + + +-- !query +select col, date_format(col, 'Z ZZ ZZZ ZZZZ ZZZZZ') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 -0800 -0800 -0800 -0800 -0800 +1969-12-31 15:00:00 -0800 -0800 -0800 -0800 -0800 +1970-12-31 04:59:59.999 -0800 -0800 -0800 -0800 -0800 +1996-03-31 07:03:33.123 -0800 -0800 -0800 -0800 -0800 +2018-11-17 05:33:33.123 -0800 -0800 -0800 -0800 -0800 +2019-12-31 09:33:33.123 -0800 -0800 -0800 -0800 -0800 +2100-01-01 01:33:33.123 -0800 -0800 -0800 -0800 -0800 + + +-- !query +select col, date_format(col, 'O OOOO') from v +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'O' + + +-- !query +select col, date_format(col, 'x xx xxx xxxx xxxx xxxxx') from v +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'x' + 
+ +-- !query +select col, date_format(col, '[yyyy-MM-dd HH:mm:ss]') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 [1582-05-31 19:40:35] +1969-12-31 15:00:00 [1969-12-31 15:00:00] +1970-12-31 04:59:59.999 [1970-12-31 04:59:59] +1996-03-31 07:03:33.123 [1996-03-31 07:03:33] +2018-11-17 05:33:33.123 [2018-11-17 05:33:33] +2019-12-31 09:33:33.123 [2019-12-31 09:33:33] +2100-01-01 01:33:33.123 [2100-01-01 01:33:33] + + +-- !query +select col, date_format(col, "姚123'GyYqQMLwWuEFDdhHmsSaVzZxXOV'") from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +1969-12-31 15:00:00 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +1970-12-31 04:59:59.999 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +1996-03-31 07:03:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +2018-11-17 05:33:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +2019-12-31 09:33:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +2100-01-01 01:33:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV + + +-- !query +select col, date_format(col, "''") from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 ' +1969-12-31 15:00:00 ' +1970-12-31 04:59:59.999 ' +1996-03-31 07:03:33.123 ' +2018-11-17 05:33:33.123 ' +2019-12-31 09:33:33.123 ' +2100-01-01 01:33:33.123 ' + + +-- !query +select col, date_format(col, '') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 +1969-12-31 15:00:00 +1970-12-31 04:59:59.999 +1996-03-31 07:03:33.123 +2018-11-17 05:33:33.123 +2019-12-31 09:33:33.123 +2100-01-01 01:33:33.123 diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-formatting.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-formatting.sql.out new file mode 100644 index 0000000000000..5bed88e168f1e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/datetime-formatting.sql.out @@ -0,0 +1,431 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 31 + + +-- !query +create temporary view v as select col from values + (timestamp '1582-06-01 11:33:33.123UTC+080000'), + (timestamp '1970-01-01 00:00:00.000Europe/Paris'), + (timestamp '1970-12-31 23:59:59.999Asia/Srednekolymsk'), + (timestamp '1996-04-01 00:33:33.123Australia/Darwin'), + (timestamp '2018-11-17 13:33:33.123Z'), + (timestamp '2020-01-01 01:33:33.123Asia/Shanghai'), + (timestamp '2100-01-01 01:33:33.123America/Los_Angeles') t(col) +-- !query schema +struct<> +-- !query output + + + +-- !query +select col, date_format(col, 'G GG GGG GGGG') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 AD AD AD Anno Domini +1969-12-31 15:00:00 AD AD AD Anno Domini +1970-12-31 04:59:59.999 AD AD AD Anno Domini +1996-03-31 07:03:33.123 AD AD AD Anno Domini +2018-11-17 05:33:33.123 AD AD AD Anno Domini +2019-12-31 09:33:33.123 AD AD AD Anno Domini +2100-01-01 01:33:33.123 AD AD AD Anno Domini + + +-- !query +select col, date_format(col, 'y yy yyy yyyy yyyyy yyyyyy') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 1582 82 1582 1582 01582 001582 +1969-12-31 15:00:00 1969 69 1969 1969 01969 001969 +1970-12-31 04:59:59.999 1970 70 1970 1970 01970 001970 +1996-03-31 07:03:33.123 1996 96 1996 1996 01996 001996 +2018-11-17 05:33:33.123 2018 18 2018 2018 02018 002018 +2019-12-31 09:33:33.123 2019 19 2019 2019 02019 002019 +2100-01-01 01:33:33.123 2100 00 2100 2100 02100 002100 + + +-- !query +select col, date_format(col, 'q qq') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 2 02 +1969-12-31 15:00:00 4 04 +1970-12-31 
04:59:59.999 4 04 +1996-03-31 07:03:33.123 1 01 +2018-11-17 05:33:33.123 4 04 +2019-12-31 09:33:33.123 4 04 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'Q QQ QQQ QQQQ') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 2 02 Q2 2nd quarter +1969-12-31 15:00:00 4 04 Q4 4th quarter +1970-12-31 04:59:59.999 4 04 Q4 4th quarter +1996-03-31 07:03:33.123 1 01 Q1 1st quarter +2018-11-17 05:33:33.123 4 04 Q4 4th quarter +2019-12-31 09:33:33.123 4 04 Q4 4th quarter +2100-01-01 01:33:33.123 1 01 Q1 1st quarter + + +-- !query +select col, date_format(col, 'M MM MMM MMMM') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 5 05 May May +1969-12-31 15:00:00 12 12 Dec December +1970-12-31 04:59:59.999 12 12 Dec December +1996-03-31 07:03:33.123 3 03 Mar March +2018-11-17 05:33:33.123 11 11 Nov November +2019-12-31 09:33:33.123 12 12 Dec December +2100-01-01 01:33:33.123 1 01 Jan January + + +-- !query +select col, date_format(col, 'L LL') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 5 05 +1969-12-31 15:00:00 12 12 +1970-12-31 04:59:59.999 12 12 +1996-03-31 07:03:33.123 3 03 +2018-11-17 05:33:33.123 11 11 +2019-12-31 09:33:33.123 12 12 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'E EE EEE EEEE') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 Mon Mon Mon Monday +1969-12-31 15:00:00 Wed Wed Wed Wednesday +1970-12-31 04:59:59.999 Thu Thu Thu Thursday +1996-03-31 07:03:33.123 Sun Sun Sun Sunday +2018-11-17 05:33:33.123 Sat Sat Sat Saturday +2019-12-31 09:33:33.123 Tue Tue Tue Tuesday +2100-01-01 01:33:33.123 Fri Fri Fri Friday + + +-- !query +select col, date_format(col, 'F') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 3 +1969-12-31 15:00:00 3 +1970-12-31 04:59:59.999 3 +1996-03-31 07:03:33.123 3 +2018-11-17 05:33:33.123 3 +2019-12-31 09:33:33.123 3 +2100-01-01 01:33:33.123 1 + + +-- !query +select col, date_format(col, 'd dd') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 31 31 +1969-12-31 15:00:00 31 31 +1970-12-31 04:59:59.999 31 31 +1996-03-31 07:03:33.123 31 31 +2018-11-17 05:33:33.123 17 17 +2019-12-31 09:33:33.123 31 31 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'DD') from v where col = timestamp '2100-01-01 01:33:33.123America/Los_Angeles' +-- !query schema +struct +-- !query output +2100-01-01 01:33:33.123 01 + + +-- !query +select col, date_format(col, 'D DDD') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 151 151 +1969-12-31 15:00:00 365 365 +1970-12-31 04:59:59.999 365 365 +1996-03-31 07:03:33.123 91 091 +2018-11-17 05:33:33.123 321 321 +2019-12-31 09:33:33.123 365 365 +2100-01-01 01:33:33.123 1 001 + + +-- !query +select col, date_format(col, 'H HH') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 19 19 +1969-12-31 15:00:00 15 15 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'h hh') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 7 07 +1969-12-31 15:00:00 3 03 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'k kk') from v +-- !query schema +struct +-- !query 
output +1582-05-31 19:40:35.123 19 19 +1969-12-31 15:00:00 15 15 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'K KK') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 7 07 +1969-12-31 15:00:00 3 03 +1970-12-31 04:59:59.999 4 04 +1996-03-31 07:03:33.123 7 07 +2018-11-17 05:33:33.123 5 05 +2019-12-31 09:33:33.123 9 09 +2100-01-01 01:33:33.123 1 01 + + +-- !query +select col, date_format(col, 'm mm') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 40 40 +1969-12-31 15:00:00 0 00 +1970-12-31 04:59:59.999 59 59 +1996-03-31 07:03:33.123 3 03 +2018-11-17 05:33:33.123 33 33 +2019-12-31 09:33:33.123 33 33 +2100-01-01 01:33:33.123 33 33 + + +-- !query +select col, date_format(col, 's ss') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 35 35 +1969-12-31 15:00:00 0 00 +1970-12-31 04:59:59.999 59 59 +1996-03-31 07:03:33.123 33 33 +2018-11-17 05:33:33.123 33 33 +2019-12-31 09:33:33.123 33 33 +2100-01-01 01:33:33.123 33 33 + + +-- !query +select col, date_format(col, 'S SS SSS SSSS SSSSS SSSSSS SSSSSSS SSSSSSSS SSSSSSSSS') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 1 12 123 1230 12300 123000 1230000 12300000 123000000 +1969-12-31 15:00:00 0 00 000 0000 00000 000000 0000000 00000000 000000000 +1970-12-31 04:59:59.999 9 99 999 9990 99900 999000 9990000 99900000 999000000 +1996-03-31 07:03:33.123 1 12 123 1230 12300 123000 1230000 12300000 123000000 +2018-11-17 05:33:33.123 1 12 123 1230 12300 123000 1230000 12300000 123000000 +2019-12-31 09:33:33.123 1 12 123 1230 12300 123000 1230000 12300000 123000000 +2100-01-01 01:33:33.123 1 12 123 1230 12300 123000 1230000 12300000 123000000 + + +-- !query +select col, date_format(col, 'a') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 PM +1969-12-31 15:00:00 PM +1970-12-31 04:59:59.999 AM +1996-03-31 07:03:33.123 AM +2018-11-17 05:33:33.123 AM +2019-12-31 09:33:33.123 AM +2100-01-01 01:33:33.123 AM + + +-- !query +select col, date_format(col, 'VV') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 America/Los_Angeles +1969-12-31 15:00:00 America/Los_Angeles +1970-12-31 04:59:59.999 America/Los_Angeles +1996-03-31 07:03:33.123 America/Los_Angeles +2018-11-17 05:33:33.123 America/Los_Angeles +2019-12-31 09:33:33.123 America/Los_Angeles +2100-01-01 01:33:33.123 America/Los_Angeles + + +-- !query +select col, date_format(col, 'z zz zzz zzzz') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 PST PST PST Pacific Standard Time +1969-12-31 15:00:00 PST PST PST Pacific Standard Time +1970-12-31 04:59:59.999 PST PST PST Pacific Standard Time +1996-03-31 07:03:33.123 PST PST PST Pacific Standard Time +2018-11-17 05:33:33.123 PST PST PST Pacific Standard Time +2019-12-31 09:33:33.123 PST PST PST Pacific Standard Time +2100-01-01 01:33:33.123 PST PST PST Pacific Standard Time + + +-- !query +select col, date_format(col, 'X XX XXX') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 -0752 -0752 -07:52 +1969-12-31 15:00:00 -08 -0800 -08:00 +1970-12-31 04:59:59.999 -08 -0800 -08:00 +1996-03-31 07:03:33.123 -08 -0800 -08:00 +2018-11-17 05:33:33.123 -08 -0800 -08:00 +2019-12-31 09:33:33.123 -08 -0800 -08:00 +2100-01-01 01:33:33.123 -08 -0800 -08:00 + + +-- !query +select col, date_format(col, 'XXXX XXXXX') from v 
+-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 -075258 -07:52:58 +1969-12-31 15:00:00 -0800 -08:00 +1970-12-31 04:59:59.999 -0800 -08:00 +1996-03-31 07:03:33.123 -0800 -08:00 +2018-11-17 05:33:33.123 -0800 -08:00 +2019-12-31 09:33:33.123 -0800 -08:00 +2100-01-01 01:33:33.123 -0800 -08:00 + + +-- !query +select col, date_format(col, 'Z ZZ ZZZ ZZZZ ZZZZZ') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 -0752 -0752 -0752 GMT-07:52:58 -07:52:58 +1969-12-31 15:00:00 -0800 -0800 -0800 GMT-08:00 -08:00 +1970-12-31 04:59:59.999 -0800 -0800 -0800 GMT-08:00 -08:00 +1996-03-31 07:03:33.123 -0800 -0800 -0800 GMT-08:00 -08:00 +2018-11-17 05:33:33.123 -0800 -0800 -0800 GMT-08:00 -08:00 +2019-12-31 09:33:33.123 -0800 -0800 -0800 GMT-08:00 -08:00 +2100-01-01 01:33:33.123 -0800 -0800 -0800 GMT-08:00 -08:00 + + +-- !query +select col, date_format(col, 'O OOOO') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 GMT-7:52:58 GMT-07:52:58 +1969-12-31 15:00:00 GMT-8 GMT-08:00 +1970-12-31 04:59:59.999 GMT-8 GMT-08:00 +1996-03-31 07:03:33.123 GMT-8 GMT-08:00 +2018-11-17 05:33:33.123 GMT-8 GMT-08:00 +2019-12-31 09:33:33.123 GMT-8 GMT-08:00 +2100-01-01 01:33:33.123 GMT-8 GMT-08:00 + + +-- !query +select col, date_format(col, 'x xx xxx xxxx xxxx xxxxx') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 -0752 -0752 -07:52 -075258 -075258 -07:52:58 +1969-12-31 15:00:00 -08 -0800 -08:00 -0800 -0800 -08:00 +1970-12-31 04:59:59.999 -08 -0800 -08:00 -0800 -0800 -08:00 +1996-03-31 07:03:33.123 -08 -0800 -08:00 -0800 -0800 -08:00 +2018-11-17 05:33:33.123 -08 -0800 -08:00 -0800 -0800 -08:00 +2019-12-31 09:33:33.123 -08 -0800 -08:00 -0800 -0800 -08:00 +2100-01-01 01:33:33.123 -08 -0800 -08:00 -0800 -0800 -08:00 + + +-- !query +select col, date_format(col, '[yyyy-MM-dd HH:mm:ss]') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 1582-05-31 19:40:35 +1969-12-31 15:00:00 1969-12-31 15:00:00 +1970-12-31 04:59:59.999 1970-12-31 04:59:59 +1996-03-31 07:03:33.123 1996-03-31 07:03:33 +2018-11-17 05:33:33.123 2018-11-17 05:33:33 +2019-12-31 09:33:33.123 2019-12-31 09:33:33 +2100-01-01 01:33:33.123 2100-01-01 01:33:33 + + +-- !query +select col, date_format(col, "姚123'GyYqQMLwWuEFDdhHmsSaVzZxXOV'") from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +1969-12-31 15:00:00 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +1970-12-31 04:59:59.999 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +1996-03-31 07:03:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +2018-11-17 05:33:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +2019-12-31 09:33:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV +2100-01-01 01:33:33.123 姚123GyYqQMLwWuEFDdhHmsSaVzZxXOV + + +-- !query +select col, date_format(col, "''") from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 ' +1969-12-31 15:00:00 ' +1970-12-31 04:59:59.999 ' +1996-03-31 07:03:33.123 ' +2018-11-17 05:33:33.123 ' +2019-12-31 09:33:33.123 ' +2100-01-01 01:33:33.123 ' + + +-- !query +select col, date_format(col, '') from v +-- !query schema +struct +-- !query output +1582-05-31 19:40:35.123 +1969-12-31 15:00:00 +1970-12-31 04:59:59.999 +1996-03-31 07:03:33.123 +2018-11-17 05:33:33.123 +2019-12-31 09:33:33.123 +2100-01-01 01:33:33.123 diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out new file mode 100644 index 0000000000000..fbbdb5f3feb48 --- /dev/null +++ 
b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -0,0 +1,900 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 108 + + +-- !query +select current_date = current_date(), current_timestamp = current_timestamp() +-- !query schema +struct<(current_date() = current_date()):boolean,(current_timestamp() = current_timestamp()):boolean> +-- !query output +true true + + +-- !query +select to_date(null), to_date('2016-12-31'), to_date('2016-12-31', 'yyyy-MM-dd') +-- !query schema +struct +-- !query output +NULL 2016-12-31 2016-12-31 + + +-- !query +select to_timestamp(null), to_timestamp('2016-12-31 00:12:00'), to_timestamp('2016-12-31', 'yyyy-MM-dd') +-- !query schema +struct +-- !query output +NULL 2016-12-31 00:12:00 2016-12-31 00:00:00 + + +-- !query +select dayofweek('2007-02-03'), dayofweek('2009-07-30'), dayofweek('2017-05-27'), dayofweek(null), dayofweek('1582-10-15 13:10:15') +-- !query schema +struct +-- !query output +7 5 7 NULL 6 + + +-- !query +create temporary view ttf1 as select * from values + (1, 2), + (2, 3) + as ttf1(current_date, current_timestamp) +-- !query schema +struct<> +-- !query output + + + +-- !query +select current_date, current_timestamp from ttf1 +-- !query schema +struct +-- !query output +1 2 +2 3 + + +-- !query +create temporary view ttf2 as select * from values + (1, 2), + (2, 3) + as ttf2(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +select current_date = current_date(), current_timestamp = current_timestamp(), a, b from ttf2 +-- !query schema +struct<(current_date() = current_date()):boolean,(current_timestamp() = current_timestamp()):boolean,a:int,b:int> +-- !query output +true true 1 2 +true true 2 3 + + +-- !query +select a, b from ttf2 order by a, current_date +-- !query schema +struct +-- !query output +1 2 +2 3 + + +-- !query +select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), weekday(null), weekday('1582-10-15 13:10:15') +-- !query schema +struct +-- !query output +5 3 5 NULL 4 + + +-- !query +select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01') +-- !query schema +struct +-- !query output +1500 1 1 + + +-- !query +select date '2019-01-01\t' +-- !query schema +struct +-- !query output +2019-01-01 + + +-- !query +select timestamp '2019-01-01\t' +-- !query schema +struct +-- !query output +2019-01-01 00:00:00 + + +-- !query +select date '2020-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7) + +== SQL == +select date '2020-01-01中文' +-------^^^ + + +-- !query +select timestamp '2019-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7) + +== SQL == +select timestamp '2019-01-01中文' +-------^^^ + + +-- !query +select timestamp'2011-11-11 11:11:11' + interval '2' day +-- !query schema +struct +-- !query output +2011-11-13 11:11:11 + + +-- !query +select timestamp'2011-11-11 11:11:11' - interval '2' day +-- !query schema +struct +-- !query output +2011-11-09 11:11:11 + + +-- !query +select date'2011-11-11 11:11:11' + interval '2' second +-- !query schema +struct +-- !query output +2011-11-11 + + +-- !query +select date'2011-11-11 11:11:11' - interval '2' second +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select '2011-11-11' - interval '2' day +-- !query schema +struct +-- !query output 
+2011-11-09 00:00:00 + + +-- !query +select '2011-11-11 11:11:11' - interval '2' second +-- !query schema +struct +-- !query output +2011-11-11 11:11:09 + + +-- !query +select '1' - interval '2' second +-- !query schema +struct +-- !query output +NULL + + +-- !query +select 1 - interval '2' second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '1 - INTERVAL '2 seconds'' due to data type mismatch: argument 1 requires timestamp type, however, '1' is of int type.; line 1 pos 7 + + +-- !query +select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678' +-- !query schema +struct +-- !query output +2078 hours 48 minutes 47.654322 seconds + + +-- !query +select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01' +-- !query schema +struct +-- !query output +-2078 hours -48 minutes -47.654322 seconds + + +-- !query +select timestamp'2019-10-06 10:11:12.345678' - null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select null - timestamp'2019-10-06 10:11:12.345678' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date_add('2011-11-11', 1Y) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1S) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1L) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 1L)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '1L' is of bigint type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', 1.0) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 1.0BD)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '1.0BD' is of decimal(2,1) type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', 1E1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', '1') +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', '1.2') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The second argument of 'date_add' function needs to be an integer.; + + +-- !query +select date_add(date'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add(timestamp'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_sub(date'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(date'2011-11-11', '1') +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(date'2011-11-11', '1.2') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The second argument of 'date_sub' function needs to be an integer.; + + +-- !query +select date_sub(timestamp'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(null, 1) +-- !query schema 
+struct +-- !query output +NULL + + +-- !query +select date_sub(date'2011-11-11', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date'2011-11-11' + 1E1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(DATE '2011-11-11', 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 + + +-- !query +select date'2011-11-11' + '1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(DATE '2011-11-11', CAST('1' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('1' AS DOUBLE)' is of double type.; line 1 pos 7 + + +-- !query +select null + date '2001-09-28' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-09-28' + 7Y +-- !query schema +struct +-- !query output +2001-10-05 + + +-- !query +select 7S + date '2001-09-28' +-- !query schema +struct +-- !query output +2001-10-05 + + +-- !query +select date '2001-10-01' - 7 +-- !query schema +struct +-- !query output +2001-09-24 + + +-- !query +select date '2001-10-01' - '7' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_sub(DATE '2001-10-01', CAST('7' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('7' AS DOUBLE)' is of double type.; line 1 pos 7 + + +-- !query +select date '2001-09-28' + null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-09-28' - null +-- !query schema +struct +-- !query output +NULL + + +-- !query +create temp view v as select '1' str +-- !query schema +struct<> +-- !query output + + + +-- !query +select date_add('2011-11-11', str) from v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), v.`str`)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'v.`str`' is of string type.; line 1 pos 7 + + +-- !query +select date_sub('2011-11-11', str) from v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_sub(CAST('2011-11-11' AS DATE), v.`str`)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'v.`str`' is of string type.; line 1 pos 7 + + +-- !query +select null - date '2019-10-06' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-10-01' - date '2001-09-28' +-- !query schema +struct +-- !query output +3 days + + +-- !query +select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.0', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123UTC', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd 
HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12345CST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123456PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234567PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('123456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('223456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.[SSSSSS]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019-10-06S10:11:12.12345", "yyyy-MM-dd'S'HH:mm:ss.SSSSSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.12342019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1234019-10-06S10:11", "ss.SSSSy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019-10-06S", "yyyy-MM-dd'S'") +-- !query schema +struct +-- !query output +2019-10-06 00:00:00 + + +-- !query +select to_timestamp("S2019-10-06", "'S'yyyy-MM-dd") +-- !query schema +struct +-- !query output +2019-10-06 00:00:00 + + +-- !query +select to_timestamp("2019-10-06T10:11:12'12", "yyyy-MM-dd'T'HH:mm:ss''SSSS") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.012 + + +-- !query +select to_timestamp("2019-10-06T10:11:12'", "yyyy-MM-dd'T'HH:mm:ss''") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("'2019-10-06T10:11:12", "''yyyy-MM-dd'T'HH:mm:ss") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 00:00:00 + + +-- !query +select to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 + + +-- !query +select to_date("02-29", "MM-dd") +-- !query 
schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019 40", "yyyy mm") +-- !query schema +struct +-- !query output +2019-01-01 00:40:00 + + +-- !query +select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss") +-- !query schema +struct +-- !query output +2019-01-01 10:10:10 + + +-- !query +select date_format(date '2020-05-23', 'GGGGG') +-- !query schema +struct +-- !query output +AD + + +-- !query +select date_format(date '2020-05-23', 'MMMMM') +-- !query schema +struct +-- !query output +May + + +-- !query +select date_format(date '2020-05-23', 'LLLLL') +-- !query schema +struct +-- !query output +May + + +-- !query +select date_format(timestamp '2020-05-23', 'EEEEE') +-- !query schema +struct +-- !query output +Saturday + + +-- !query +select date_format(timestamp '2020-05-23', 'uuuuu') +-- !query schema +struct +-- !query output +00006 + + +-- !query +select date_format('2020-05-23', 'QQQQQ') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'Q' + + +-- !query +select date_format('2020-05-23', 'qqqqq') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Illegal pattern character 'q' + + +-- !query +select to_timestamp('2019-10-06 A', 'yyyy-MM-dd GGGGG') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE') +-- !query schema +struct +-- !query output +2020-05-22 00:00:00 + + +-- !query +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE') +-- !query schema +struct +-- !query output +2020-05-22 00:00:00 + + +-- !query +select unix_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE') +-- !query schema +struct +-- !query output +1590130800 + + +-- !query +select from_unixtime(12345, 'MMMMM') +-- !query schema +struct +-- !query output +December + + +-- !query +select from_unixtime(54321, 'QQQQQ') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select from_unixtime(23456, 'aaaaa') +-- !query schema +struct +-- !query output +PM + + +-- !query +select from_json('{"time":"26/October/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct> +-- !query output +{"time":2015-10-26 00:00:00} + + +-- !query +select from_json('{"date":"26/October/2015"}', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct> +-- !query output +{"date":2015-10-26} + + +-- !query +select from_csv('26/October/2015', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct> +-- !query output +{"time":2015-10-26 00:00:00} + + +-- !query +select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct> +-- !query output +{"date":2015-10-26} diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out new file mode 100644 index 0000000000000..76008d9c97dc6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out @@ -0,0 +1,162 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 19 + + +-- !query +select to_timestamp('294248', 'y') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('1', 'yy') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '1' in the new parser. 
You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select to_timestamp('-12', 'yy') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('123', 'yy') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '123' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select to_timestamp('1', 'yyy') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '1' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select to_timestamp('1234567', 'yyyyyyy') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_timestamp('366', 'D') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('9', 'DD') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '9' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select to_timestamp('366', 'DD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('9', 'DDD') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '9' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select to_timestamp('99', 'DDD') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '99' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. 
+ + +-- !query +select to_timestamp('30-365', 'dd-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('11-365', 'MM-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-366', 'yyyy-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('12-30-365', 'MM-dd-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2020-01-365', 'yyyy-dd-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2020-10-350', 'yyyy-MM-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2020-11-31-366', 'yyyy-MM-dd-DDD') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2018-366' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-legacy.sql.out new file mode 100644 index 0000000000000..258afe43b99ee --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-legacy.sql.out @@ -0,0 +1,202 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 25 + + +-- !query +select to_timestamp('1', 'y') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('009999', 'y') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('00', 'yy') +-- !query schema +struct +-- !query output +2000-01-01 00:00:00 + + +-- !query +select to_timestamp('99', 'yy') +-- !query schema +struct +-- !query output +1999-01-01 00:00:00 + + +-- !query +select to_timestamp('001', 'yyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('009999', 'yyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('0001', 'yyyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('9999', 'yyyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('00001', 'yyyyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('09999', 'yyyyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('000001', 'yyyyyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('009999', 'yyyyyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('9', 'D') +-- !query schema +struct +-- !query output +1970-01-09 00:00:00 + + +-- !query +select to_timestamp('300', 'D') +-- !query schema +struct +-- !query output +1970-10-27 00:00:00 + + +-- !query +select to_timestamp('09', 'DD') +-- !query schema +struct +-- !query output +1970-01-09 00:00:00 + + +-- !query +select to_timestamp('99', 'DD') +-- !query schema +struct +-- !query output +1970-04-09 00:00:00 + + +-- !query +select to_timestamp('009', 'DDD') 
+-- !query schema +struct +-- !query output +1970-01-09 00:00:00 + + +-- !query +select to_timestamp('365', 'DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('31-365', 'dd-DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('12-365', 'MM-DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('2020-365', 'yyyy-DDD') +-- !query schema +struct +-- !query output +2020-12-30 00:00:00 + + +-- !query +select to_timestamp('12-31-365', 'MM-dd-DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('2020-30-365', 'yyyy-dd-DDD') +-- !query schema +struct +-- !query output +2020-12-30 00:00:00 + + +-- !query +select to_timestamp('2020-12-350', 'yyyy-MM-DDD') +-- !query schema +struct +-- !query output +2020-12-15 00:00:00 + + +-- !query +select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD') +-- !query schema +struct +-- !query output +2020-12-31 00:00:00 diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing.sql.out new file mode 100644 index 0000000000000..f030141d288ec --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing.sql.out @@ -0,0 +1,202 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 25 + + +-- !query +select to_timestamp('1', 'y') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('009999', 'y') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('00', 'yy') +-- !query schema +struct +-- !query output +2000-01-01 00:00:00 + + +-- !query +select to_timestamp('99', 'yy') +-- !query schema +struct +-- !query output +2099-01-01 00:00:00 + + +-- !query +select to_timestamp('001', 'yyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('009999', 'yyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('0001', 'yyyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('9999', 'yyyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('00001', 'yyyyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('09999', 'yyyyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('000001', 'yyyyyy') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +select to_timestamp('009999', 'yyyyyy') +-- !query schema +struct +-- !query output +9999-01-01 00:00:00 + + +-- !query +select to_timestamp('9', 'D') +-- !query schema +struct +-- !query output +1970-01-09 00:00:00 + + +-- !query +select to_timestamp('300', 'D') +-- !query schema +struct +-- !query output +1970-10-27 00:00:00 + + +-- !query +select to_timestamp('09', 'DD') +-- !query schema +struct +-- !query output +1970-01-09 00:00:00 + + +-- !query +select to_timestamp('99', 'DD') +-- !query schema +struct +-- !query output +1970-04-09 00:00:00 + + +-- !query +select to_timestamp('009', 'DDD') +-- !query schema +struct +-- !query output +1970-01-09 00:00:00 + + +-- !query +select to_timestamp('365', 'DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + 
+-- !query +select to_timestamp('31-365', 'dd-DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('12-365', 'MM-DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('2020-365', 'yyyy-DDD') +-- !query schema +struct +-- !query output +2020-12-30 00:00:00 + + +-- !query +select to_timestamp('12-31-365', 'MM-dd-DDD') +-- !query schema +struct +-- !query output +1970-12-31 00:00:00 + + +-- !query +select to_timestamp('2020-30-365', 'yyyy-dd-DDD') +-- !query schema +struct +-- !query output +2020-12-30 00:00:00 + + +-- !query +select to_timestamp('2020-12-350', 'yyyy-MM-DDD') +-- !query schema +struct +-- !query output +2020-12-15 00:00:00 + + +-- !query +select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD') +-- !query schema +struct +-- !query output +2020-12-31 00:00:00 diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index a7b098d79a706..d7e960a9a3db3 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 47 +-- Number of queries: 108 -- !query @@ -115,6 +115,34 @@ struct 2019-01-01 00:00:00 +-- !query +select date '2020-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7) + +== SQL == +select date '2020-01-01中文' +-------^^^ + + +-- !query +select timestamp '2019-01-01中文' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7) + +== SQL == +select timestamp '2019-01-01中文' +-------^^^ + + -- !query select timestamp'2011-11-11 11:11:11' + interval '2' day -- !query schema @@ -134,7 +162,7 @@ struct +struct -- !query output 2011-11-11 @@ -142,7 +170,7 @@ struct +struct -- !query output 2011-11-10 @@ -266,10 +294,18 @@ cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 10.0D)' due to data type mi -- !query select date_add('2011-11-11', '1') -- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', '1.2') +-- !query schema struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'date_add(CAST('2011-11-11' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 +The second argument of 'date_add' function needs to be an integer.; -- !query @@ -296,6 +332,23 @@ struct 2011-11-10 +-- !query +select date_sub(date'2011-11-11', '1') +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(date'2011-11-11', '1.2') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The second argument of 'date_sub' function needs to be an integer.; + + -- !query select date_sub(timestamp'2011-11-11', 1) -- !query schema @@ -329,6 +382,15 @@ org.apache.spark.sql.AnalysisException cannot resolve 'date_add(DATE '2011-11-11', 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 +-- !query +select date'2011-11-11' + '1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 
'date_add(DATE '2011-11-11', CAST('1' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('1' AS DOUBLE)' is of double type.; line 1 pos 7 + + -- !query select null + date '2001-09-28' -- !query schema @@ -361,6 +423,15 @@ struct 2001-09-24 +-- !query +select date '2001-10-01' - '7' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_sub(DATE '2001-10-01', CAST('7' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('7' AS DOUBLE)' is of double type.; line 1 pos 7 + + -- !query select date '2001-09-28' + null -- !query schema @@ -377,6 +448,32 @@ struct NULL +-- !query +create temp view v as select '1' str +-- !query schema +struct<> +-- !query output + + + +-- !query +select date_add('2011-11-11', str) from v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), v.`str`)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'v.`str`' is of string type.; line 1 pos 7 + + +-- !query +select date_sub('2011-11-11', str) from v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_sub(CAST('2011-11-11' AS DATE), v.`str`)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'v.`str`' is of string type.; line 1 pos 7 + + -- !query select null - date '2019-10-06' -- !query schema @@ -391,3 +488,428 @@ select date '2001-10-01' - date '2001-09-28' struct -- !query output 3 days + + +-- !query +select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('2019-10-06 10:11:12.0', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123UTC', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 03:11:12.123 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12345CST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 08:11:12.12345 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123456PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.123456 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.1234567PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp('123456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.123456 + + +-- !query +select to_timestamp('223456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select 
to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.[SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.123', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.123 + + +-- !query +select to_timestamp('2019-10-06 10:11:12', 'yyyy-MM-dd HH:mm:ss[.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp('2019-10-06 10:11:12.12', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12 + + +-- !query +select to_timestamp('2019-10-06 10:11', 'yyyy-MM-dd HH:mm[:ss.SSSSSS]') +-- !query schema +struct +-- !query output +2019-10-06 10:11:00 + + +-- !query +select to_timestamp("2019-10-06S10:11:12.12345", "yyyy-MM-dd'S'HH:mm:ss.SSSSSS") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12345 + + +-- !query +select to_timestamp("12.12342019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyyyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("12.1234019-10-06S10:11", "ss.SSSSy-MM-dd'S'HH:mm") +-- !query schema +struct +-- !query output +0019-10-06 10:11:12.1234 + + +-- !query +select to_timestamp("2019-10-06S", "yyyy-MM-dd'S'") +-- !query schema +struct +-- !query output +2019-10-06 00:00:00 + + +-- !query +select to_timestamp("S2019-10-06", "'S'yyyy-MM-dd") +-- !query schema +struct +-- !query output +2019-10-06 00:00:00 + + +-- !query +select to_timestamp("2019-10-06T10:11:12'12", "yyyy-MM-dd'T'HH:mm:ss''SSSS") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12.12 + + +-- !query +select to_timestamp("2019-10-06T10:11:12'", "yyyy-MM-dd'T'HH:mm:ss''") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("'2019-10-06T10:11:12", "''yyyy-MM-dd'T'HH:mm:ss") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss") +-- !query schema +struct +-- !query output +2019-10-06 10:11:12 + + +-- !query +select to_timestamp("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 00:00:00 + + +-- !query +select to_timestamp("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("16", "dd") +-- !query schema +struct +-- !query output +1970-01-16 + + +-- !query +select to_date("02-29", "MM-dd") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2019 40", "yyyy mm") +-- !query schema +struct +-- !query output +2019-01-01 00:40:00 + + +-- !query +select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss") +-- !query schema +struct +-- !query output +2019-01-01 10:10:10 + + +-- !query +select date_format(date '2020-05-23', 'GGGGG') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'GGGGG' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(date '2020-05-23', 'MMMMM') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMMMM' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(date '2020-05-23', 'LLLLL') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'LLLLL' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(timestamp '2020-05-23', 'EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format(timestamp '2020-05-23', 'uuuuu') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'uuuuu' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select date_format('2020-05-23', 'QQQQQ') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: Q + + +-- !query +select date_format('2020-05-23', 'qqqqq') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Too many pattern letters: q + + +-- !query +select to_timestamp('2019-10-06 A', 'yyyy-MM-dd GGGGG') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyy-MM-dd GGGGG' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd MM yyyy EEEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd MM yyyy EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select unix_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEE') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd MM yyyy EEEEE' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_unixtime(12345, 'MMMMM') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'MMMMM' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_unixtime(54321, 'QQQQQ') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select from_unixtime(23456, 'aaaaa') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'aaaaa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_json('{"time":"26/October/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_json('{"date":"26/October/2015"}', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_csv('26/October/2015', 'time Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html diff --git a/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out index 6b16aba268f50..2199fc0312d25 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out @@ -112,7 +112,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -mismatched input 'desc_temp1' expecting {, '.'}(line 1, pos 21) +mismatched input 'desc_temp1' expecting {, ';'}(line 1, pos 21) == SQL == DESCRIBE INSERT INTO desc_temp1 values (1, 'val1') @@ -126,7 +126,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -mismatched input 'desc_temp1' expecting {, '.'}(line 1, pos 21) +mismatched input 'desc_temp1' expecting {, ';'}(line 1, pos 21) == SQL == DESCRIBE INSERT INTO desc_temp1 SELECT * FROM desc_temp2 diff --git a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out index ae9240ec588da..c6d3d45879eb1 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out @@ -267,7 +267,7 @@ struct<> -- !query -CREATE TABLE customer(CName STRING) +CREATE TABLE customer(CName STRING) USING PARQUET -- !query schema struct<> -- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 697e006544acf..a7de033e3a1ac 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -520,7 +520,7 @@ struct -- !query output == Physical Plan == Execute DescribeTableCommand - +- DescribeTableCommand `t`, false + +- DescribeTableCommand `default`.`t`, false -- !query @@ -530,7 +530,7 @@ struct -- !query output == Physical Plan == Execute DescribeTableCommand - +- DescribeTableCommand `t`, true + +- DescribeTableCommand `default`.`t`, true -- !query @@ -544,14 +544,14 @@ struct == Analyzed Logical Plan == col_name: string, data_type: string, comment: string -DescribeTableCommand `t`, false 
+DescribeTableCommand `default`.`t`, false == Optimized Logical Plan == -DescribeTableCommand `t`, false +DescribeTableCommand `default`.`t`, false == Physical Plan == Execute DescribeTableCommand - +- DescribeTableCommand `t`, false + +- DescribeTableCommand `default`.`t`, false -- !query @@ -561,7 +561,7 @@ struct -- !query output == Physical Plan == Execute DescribeColumnCommand - +- DescribeColumnCommand `t`, [b], false + +- DescribeColumnCommand `default`.`t`, [b], false -- !query @@ -571,7 +571,7 @@ struct -- !query output == Physical Plan == Execute DescribeTableCommand - +- DescribeTableCommand `t`, Map(c -> Us, d -> 2), false + +- DescribeTableCommand `default`.`t`, Map(c -> Us, d -> 2), false -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/except.sql.out b/sql/core/src/test/resources/sql-tests/results/except.sql.out index 62d695219d01d..061b122eac7cf 100644 --- a/sql/core/src/test/resources/sql-tests/results/except.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/except.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 9 +-- Number of queries: 15 -- !query @@ -103,3 +103,59 @@ WHERE t1.v >= (SELECT min(t2.v) struct -- !query output two + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW t3 AS VALUES (decimal(1)) tbl(v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t.v FROM ( + SELECT v FROM t3 + EXCEPT + SELECT v + v AS v FROM t3 +) t +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT SUM(t.v) FROM ( + SELECT v FROM t3 + EXCEPT + SELECT v + v AS v FROM t3 +) t +-- !query schema +struct +-- !query output +1 + + +-- !query +DROP VIEW IF EXISTS t1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t3 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out new file mode 100644 index 0000000000000..36757863ffcb5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out @@ -0,0 +1,860 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 23 + + +-- !query +CREATE table explain_temp1 (key int, val int) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE table explain_temp2 (key int, val int) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE table explain_temp3 (key int, val int) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE table explain_temp4 (key int, val string) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query +SET spark.sql.codegen.wholeStage = true +-- !query schema +struct +-- !query output +spark.sql.codegen.wholeStage true + + +-- !query +EXPLAIN FORMATTED + SELECT key, max(val) + FROM explain_temp1 + WHERE key > 0 + GROUP BY key + ORDER BY key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (9) ++- Sort (8) + +- Exchange (7) + +- HashAggregate (6) + +- Exchange (5) + +- HashAggregate (4) + +- Project (3) + +- Filter (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct + 
+(2) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 0)) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) HashAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(5) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(6) HashAggregate +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [2]: [key#x, max(val#x)#x AS max(val)#x] + +(7) Exchange +Input [2]: [key#x, max(val)#x] +Arguments: rangepartitioning(key#x ASC NULLS FIRST, 4), true, [id=#x] + +(8) Sort +Input [2]: [key#x, max(val)#x] +Arguments: [key#x ASC NULLS FIRST], true, 0 + +(9) AdaptiveSparkPlan +Output [2]: [key#x, max(val)#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT key, max(val) + FROM explain_temp1 + WHERE key > 0 + GROUP BY key + HAVING max(val) > 0 +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (9) ++- Project (8) + +- Filter (7) + +- HashAggregate (6) + +- Exchange (5) + +- HashAggregate (4) + +- Project (3) + +- Filter (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 0)) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) HashAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(5) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(6) HashAggregate +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [3]: [key#x, max(val#x)#x AS max(val)#x, max(val#x)#x AS max(val#x)#x] + +(7) Filter +Input [3]: [key#x, max(val)#x, max(val#x)#x] +Condition : (isnotnull(max(val#x)#x) AND (max(val#x)#x > 0)) + +(8) Project +Output [2]: [key#x, max(val)#x] +Input [3]: [key#x, max(val)#x, max(val#x)#x] + +(9) AdaptiveSparkPlan +Output [2]: [key#x, max(val)#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT key, val FROM explain_temp1 WHERE key > 0 + UNION + SELECT key, val FROM explain_temp1 WHERE key > 0 +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (11) ++- HashAggregate (10) + +- Exchange (9) + +- HashAggregate (8) + +- Union (7) + :- Project (3) + : +- Filter (2) + : +- Scan parquet default.explain_temp1 (1) + +- Project (6) + +- Filter (5) + +- Scan parquet default.explain_temp1 (4) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 0)) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: 
struct + +(5) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 0)) + +(6) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(7) Union + +(8) HashAggregate +Input [2]: [key#x, val#x] +Keys [2]: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results [2]: [key#x, val#x] + +(9) Exchange +Input [2]: [key#x, val#x] +Arguments: hashpartitioning(key#x, val#x, 4), true, [id=#x] + +(10) HashAggregate +Input [2]: [key#x, val#x] +Keys [2]: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results [2]: [key#x, val#x] + +(11) AdaptiveSparkPlan +Output [2]: [key#x, val#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT * + FROM explain_temp1 a, + explain_temp2 b + WHERE a.key = b.key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (9) ++- BroadcastHashJoin Inner BuildRight (8) + :- Project (3) + : +- Filter (2) + : +- Scan parquet default.explain_temp1 (1) + +- BroadcastExchange (7) + +- Project (6) + +- Filter (5) + +- Scan parquet default.explain_temp2 (4) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key)] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : isnotnull(key#x) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) Scan parquet default.explain_temp2 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp2] +PushedFilters: [IsNotNull(key)] +ReadSchema: struct + +(5) Filter +Input [2]: [key#x, val#x] +Condition : isnotnull(key#x) + +(6) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(7) BroadcastExchange +Input [2]: [key#x, val#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + +(8) BroadcastHashJoin +Left keys [1]: [key#x] +Right keys [1]: [key#x] +Join condition: None + +(9) AdaptiveSparkPlan +Output [4]: [key#x, val#x, key#x, val#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT * + FROM explain_temp1 a + LEFT OUTER JOIN explain_temp2 b + ON a.key = b.key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (7) ++- BroadcastHashJoin LeftOuter BuildRight (6) + :- Scan parquet default.explain_temp1 (1) + +- BroadcastExchange (5) + +- Project (4) + +- Filter (3) + +- Scan parquet default.explain_temp2 (2) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct + +(2) Scan parquet default.explain_temp2 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp2] +PushedFilters: [IsNotNull(key)] +ReadSchema: struct + +(3) Filter +Input [2]: [key#x, val#x] +Condition : isnotnull(key#x) + +(4) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(5) BroadcastExchange +Input [2]: [key#x, val#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + +(6) BroadcastHashJoin +Left keys [1]: [key#x] +Right keys [1]: [key#x] +Join condition: None + +(7) AdaptiveSparkPlan +Output [4]: [key#x, val#x, key#x, val#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT * + FROM explain_temp1 + WHERE key = (SELECT max(key) + FROM explain_temp2 + WHERE key = (SELECT max(key) + FROM explain_temp3 
+ WHERE val > 0) + AND val = 2) + AND val > 3 +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (4) ++- Project (3) + +- Filter (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), IsNotNull(val), GreaterThan(val,3)] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : (((isnotnull(key#x) AND isnotnull(val#x)) AND (key#x = Subquery subquery#x, [id=#x])) AND (val#x > 3)) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) AdaptiveSparkPlan +Output [2]: [key#x, val#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT * + FROM explain_temp1 + WHERE key = (SELECT max(key) + FROM explain_temp2 + WHERE val > 0) + OR + key = (SELECT avg(key) + FROM explain_temp3 + WHERE val > 0) +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (3) ++- Filter (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : ((key#x = Subquery subquery#x, [id=#x]) OR (cast(key#x as double) = Subquery subquery#x, [id=#x])) + +(3) AdaptiveSparkPlan +Output [2]: [key#x, val#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT (SELECT Avg(key) FROM explain_temp1) + (SELECT Avg(key) FROM explain_temp1) + FROM explain_temp1 +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (3) ++- Project (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output: [] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct<> + +(2) Project +Output [1]: [(Subquery subquery#x, [id=#x] + Subquery subquery#x, [id=#x]) AS (scalarsubquery() + scalarsubquery())#x] +Input: [] + +(3) AdaptiveSparkPlan +Output [1]: [(scalarsubquery() + scalarsubquery())#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + WITH cte1 AS ( + SELECT * + FROM explain_temp1 + WHERE key > 10 + ) + SELECT * FROM cte1 a, cte1 b WHERE a.key = b.key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (9) ++- BroadcastHashJoin Inner BuildRight (8) + :- Project (3) + : +- Filter (2) + : +- Scan parquet default.explain_temp1 (1) + +- BroadcastExchange (7) + +- Project (6) + +- Filter (5) + +- Scan parquet default.explain_temp1 (4) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 10)) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct + +(5) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 10)) + +(6) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(7) BroadcastExchange +Input [2]: [key#x, val#x] +Arguments: 
HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + +(8) BroadcastHashJoin +Left keys [1]: [key#x] +Right keys [1]: [key#x] +Join condition: None + +(9) AdaptiveSparkPlan +Output [4]: [key#x, val#x, key#x, val#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + WITH cte1 AS ( + SELECT key, max(val) + FROM explain_temp1 + WHERE key > 10 + GROUP BY key + ) + SELECT * FROM cte1 a, cte1 b WHERE a.key = b.key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (15) ++- BroadcastHashJoin Inner BuildRight (14) + :- HashAggregate (6) + : +- Exchange (5) + : +- HashAggregate (4) + : +- Project (3) + : +- Filter (2) + : +- Scan parquet default.explain_temp1 (1) + +- BroadcastExchange (13) + +- HashAggregate (12) + +- Exchange (11) + +- HashAggregate (10) + +- Project (9) + +- Filter (8) + +- Scan parquet default.explain_temp1 (7) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct + +(2) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 10)) + +(3) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(4) HashAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(5) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(6) HashAggregate +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [2]: [key#x, max(val#x)#x AS max(val)#x] + +(7) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct + +(8) Filter +Input [2]: [key#x, val#x] +Condition : (isnotnull(key#x) AND (key#x > 10)) + +(9) Project +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(10) HashAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(11) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(12) HashAggregate +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [2]: [key#x, max(val#x)#x AS max(val)#x] + +(13) BroadcastExchange +Input [2]: [key#x, max(val)#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + +(14) BroadcastHashJoin +Left keys [1]: [key#x] +Right keys [1]: [key#x] +Join condition: None + +(15) AdaptiveSparkPlan +Output [4]: [key#x, max(val)#x, key#x, max(val)#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + CREATE VIEW explain_view AS + SELECT key, val FROM explain_temp1 +-- !query schema +struct +-- !query output +== Physical Plan == +Execute CreateViewCommand (1) + +- CreateViewCommand (2) + +- Project (4) + +- UnresolvedRelation (3) + + +(1) Execute CreateViewCommand +Output: [] + +(2) CreateViewCommand +Arguments: `default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView + +(3) UnresolvedRelation +Arguments: [explain_temp1] + +(4) Project +Arguments: ['key, 'val] + + +-- !query +EXPLAIN FORMATTED + SELECT + COUNT(val) + 
SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1 +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (5) ++- HashAggregate (4) + +- Exchange (3) + +- HashAggregate (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct + +(2) HashAggregate +Input [2]: [key#x, val#x] +Keys: [] +Functions [3]: [partial_count(val#x), partial_sum(cast(key#x as bigint)), partial_count(key#x) FILTER (WHERE (val#x > 1))] +Aggregate Attributes [3]: [count#xL, sum#xL, count#xL] +Results [3]: [count#xL, sum#xL, count#xL] + +(3) Exchange +Input [3]: [count#xL, sum#xL, count#xL] +Arguments: SinglePartition, true, [id=#x] + +(4) HashAggregate +Input [3]: [count#xL, sum#xL, count#xL] +Keys: [] +Functions [3]: [count(val#x), sum(cast(key#x as bigint)), count(key#x)] +Aggregate Attributes [3]: [count(val#x)#xL, sum(cast(key#x as bigint))#xL, count(key#x)#xL] +Results [2]: [(count(val#x)#xL + sum(cast(key#x as bigint))#xL) AS TOTAL#xL, count(key#x)#xL AS count(key) FILTER (WHERE (val > 1))#xL] + +(5) AdaptiveSparkPlan +Output [2]: [TOTAL#xL, count(key) FILTER (WHERE (val > 1))#xL] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (5) ++- ObjectHashAggregate (4) + +- Exchange (3) + +- ObjectHashAggregate (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) ObjectHashAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_collect_set(val#x, 0, 0)] +Aggregate Attributes [1]: [buf#x] +Results [2]: [key#x, buf#x] + +(3) Exchange +Input [2]: [key#x, buf#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(4) ObjectHashAggregate +Input [2]: [key#x, buf#x] +Keys [1]: [key#x] +Functions [1]: [collect_set(val#x, 0, 0)] +Aggregate Attributes [1]: [collect_set(val#x, 0, 0)#x] +Results [2]: [key#x, sort_array(collect_set(val#x, 0, 0)#x, true)[0] AS sort_array(collect_set(val), true)[0]#x] + +(5) AdaptiveSparkPlan +Output [2]: [key#x, sort_array(collect_set(val), true)[0]#x] +Arguments: isFinalPlan=false + + +-- !query +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +AdaptiveSparkPlan (7) ++- SortAggregate (6) + +- Sort (5) + +- Exchange (4) + +- SortAggregate (3) + +- Sort (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) Sort +Input [2]: [key#x, val#x] +Arguments: [key#x ASC NULLS FIRST], false, 0 + +(3) SortAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_min(val#x)] +Aggregate Attributes [1]: [min#x] +Results [2]: [key#x, min#x] + +(4) Exchange +Input [2]: [key#x, min#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(5) Sort +Input [2]: [key#x, min#x] +Arguments: [key#x ASC NULLS FIRST], false, 0 + +(6) SortAggregate +Input [2]: [key#x, min#x] +Keys [1]: [key#x] +Functions [1]: [min(val#x)] +Aggregate 
Attributes [1]: [min(val#x)#x] +Results [2]: [key#x, min(val#x)#x AS min(val)#x] + +(7) AdaptiveSparkPlan +Output [2]: [key#x, min(val)#x] +Arguments: isFinalPlan=false + + +-- !query +DROP TABLE explain_temp1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE explain_temp2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE explain_temp3 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE explain_temp4 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/explain.sql.out b/sql/core/src/test/resources/sql-tests/results/explain.sql.out index 756c14f28a657..2b07dac0e5d0a 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 18 +-- Number of queries: 23 -- !query @@ -26,6 +26,14 @@ struct<> +-- !query +CREATE table explain_temp4 (key int, val string) USING PARQUET +-- !query schema +struct<> +-- !query output + + + -- !query SET spark.sql.codegen.wholeStage = true -- !query schema @@ -56,38 +64,49 @@ struct +- Scan parquet default.explain_temp1 (1) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,0)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 0)) - + (4) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + (5) HashAggregate [codegen id : 1] -Input: [key#x, val#x] - -(6) Exchange -Input: [key#x, max#x] - +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(6) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + (7) HashAggregate [codegen id : 2] -Input: [key#x, max#x] - -(8) Exchange -Input: [key#x, max(val)#x] - +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [2]: [key#x, max(val#x)#x AS max(val)#x] + +(8) Exchange +Input [2]: [key#x, max(val)#x] +Arguments: rangepartitioning(key#x ASC NULLS FIRST, 4), true, [id=#x] + (9) Sort [codegen id : 3] -Input: [key#x, max(val)#x] +Input [2]: [key#x, max(val)#x] +Arguments: [key#x ASC NULLS FIRST], true, 0 -- !query @@ -112,40 +131,49 @@ struct +- Scan parquet default.explain_temp1 (1) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,0)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 0)) - + (4) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + (5) HashAggregate [codegen id : 1] -Input: [key#x, 
val#x] - -(6) Exchange -Input: [key#x, max#x] - +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(6) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + (7) HashAggregate [codegen id : 2] -Input: [key#x, max#x] - +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [3]: [key#x, max(val#x)#x AS max(val)#x, max(val#x)#x AS max(val#x)#x] + (8) Filter [codegen id : 2] -Input : [key#x, max(val)#x, max(val#x)#x] +Input [3]: [key#x, max(val)#x, max(val#x)#x] Condition : (isnotnull(max(val#x)#x) AND (max(val#x)#x > 0)) - + (9) Project [codegen id : 2] -Output : [key#x, max(val)#x] -Input : [key#x, max(val)#x, max(val#x)#x] +Output [2]: [key#x, max(val)#x] +Input [3]: [key#x, max(val)#x, max(val#x)#x] -- !query @@ -171,52 +199,61 @@ struct +- Scan parquet default.explain_temp1 (5) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,0)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 0)) - + (4) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(5) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(5) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,0)] ReadSchema: struct - + (6) ColumnarToRow [codegen id : 2] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (7) Filter [codegen id : 2] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 0)) - + (8) Project [codegen id : 2] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(9) Union - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(9) Union + (10) HashAggregate [codegen id : 3] -Input: [key#x, val#x] - -(11) Exchange -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] +Keys [2]: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results [2]: [key#x, val#x] + +(11) Exchange +Input [2]: [key#x, val#x] +Arguments: hashpartitioning(key#x, val#x, 4), true, [id=#x] + (12) HashAggregate [codegen id : 4] -Input: [key#x, val#x] +Input [2]: [key#x, val#x] +Keys [2]: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results [2]: [key#x, val#x] -- !query @@ -241,48 +278,49 @@ struct +- Scan parquet default.explain_temp2 (5) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 2] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 2] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : isnotnull(key#x) - + (4) Project [codegen id : 2] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(5) Scan parquet default.explain_temp2 -Output: [key#x, val#x] +Output [2]: [key#x, val#x] 
+Input [2]: [key#x, val#x] + +(5) Scan parquet default.explain_temp2 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp2] PushedFilters: [IsNotNull(key)] ReadSchema: struct - + (6) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (7) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : isnotnull(key#x) - + (8) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(9) BroadcastExchange -Input: [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(9) BroadcastExchange +Input [2]: [key#x, val#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + (10) BroadcastHashJoin [codegen id : 2] -Left keys: List(key#x) -Right keys: List(key#x) +Left keys [1]: [key#x] +Right keys [1]: [key#x] Join condition: None @@ -306,39 +344,40 @@ struct +- Scan parquet default.explain_temp2 (3) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 2] -Input: [key#x, val#x] - -(3) Scan parquet default.explain_temp2 -Output: [key#x, val#x] +Input [2]: [key#x, val#x] + +(3) Scan parquet default.explain_temp2 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp2] PushedFilters: [IsNotNull(key)] ReadSchema: struct - + (4) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (5) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : isnotnull(key#x) - + (6) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(7) BroadcastExchange -Input: [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(7) BroadcastExchange +Input [2]: [key#x, val#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + (8) BroadcastHashJoin [codegen id : 2] -Left keys: List(key#x) -Right keys: List(key#x) +Left keys [1]: [key#x] +Right keys [1]: [key#x] Join condition: None @@ -363,24 +402,24 @@ struct +- Scan parquet default.explain_temp1 (1) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), IsNotNull(val), GreaterThan(val,3)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (((isnotnull(key#x) AND isnotnull(val#x)) AND (key#x = Subquery scalar-subquery#x, [id=#x])) AND (val#x > 3)) - + (4) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + ===== Subqueries ===== Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery#x, [id=#x] @@ -393,33 +432,42 @@ Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery +- Scan parquet default.explain_temp2 (5) -(5) Scan parquet default.explain_temp2 -Output: [key#x, val#x] +(5) Scan parquet default.explain_temp2 +Output [2]: [key#x, val#x] Batched: true Location [not included in 
comparison]/{warehouse_dir}/explain_temp2] PushedFilters: [IsNotNull(key), IsNotNull(val), EqualTo(val,2)] ReadSchema: struct - + (6) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (7) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (((isnotnull(key#x) AND isnotnull(val#x)) AND (key#x = Subquery scalar-subquery#x, [id=#x])) AND (val#x = 2)) - + (8) Project [codegen id : 1] -Output : [key#x] -Input : [key#x, val#x] - +Output [1]: [key#x] +Input [2]: [key#x, val#x] + (9) HashAggregate [codegen id : 1] -Input: [key#x] - -(10) Exchange -Input: [max#x] - +Input [1]: [key#x] +Keys: [] +Functions [1]: [partial_max(key#x)] +Aggregate Attributes [1]: [max#x] +Results [1]: [max#x] + +(10) Exchange +Input [1]: [max#x] +Arguments: SinglePartition, true, [id=#x] + (11) HashAggregate [codegen id : 2] -Input: [max#x] - +Input [1]: [max#x] +Keys: [] +Functions [1]: [max(key#x)] +Aggregate Attributes [1]: [max(key#x)#x] +Results [1]: [max(key#x)#x AS max(key)#x] + Subquery:2 Hosting operator id = 7 Hosting Expression = Subquery scalar-subquery#x, [id=#x] * HashAggregate (18) +- Exchange (17) @@ -430,32 +478,41 @@ Subquery:2 Hosting operator id = 7 Hosting Expression = Subquery scalar-subquery +- Scan parquet default.explain_temp3 (12) -(12) Scan parquet default.explain_temp3 -Output: [key#x, val#x] +(12) Scan parquet default.explain_temp3 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp3] PushedFilters: [IsNotNull(val), GreaterThan(val,0)] ReadSchema: struct - + (13) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (14) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(val#x) AND (val#x > 0)) - + (15) Project [codegen id : 1] -Output : [key#x] -Input : [key#x, val#x] - +Output [1]: [key#x] +Input [2]: [key#x, val#x] + (16) HashAggregate [codegen id : 1] -Input: [key#x] - -(17) Exchange -Input: [max#x] - +Input [1]: [key#x] +Keys: [] +Functions [1]: [partial_max(key#x)] +Aggregate Attributes [1]: [max#x] +Results [1]: [max#x] + +(17) Exchange +Input [1]: [max#x] +Arguments: SinglePartition, true, [id=#x] + (18) HashAggregate [codegen id : 2] -Input: [max#x] +Input [1]: [max#x] +Keys: [] +Functions [1]: [max(key#x)] +Aggregate Attributes [1]: [max(key#x)#x] +Results [1]: [max(key#x)#x AS max(key)#x] -- !query @@ -466,7 +523,7 @@ EXPLAIN FORMATTED FROM explain_temp2 WHERE val > 0) OR - key = (SELECT max(key) + key = (SELECT avg(key) FROM explain_temp3 WHERE val > 0) -- !query schema @@ -478,19 +535,19 @@ struct +- Scan parquet default.explain_temp1 (1) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 1] -Input : [key#x, val#x] -Condition : ((key#x = Subquery scalar-subquery#x, [id=#x]) OR (key#x = Subquery scalar-subquery#x, [id=#x])) - +Input [2]: [key#x, val#x] +Condition : ((key#x = Subquery scalar-subquery#x, [id=#x]) OR (cast(key#x as double) = Subquery scalar-subquery#x, [id=#x])) + ===== Subqueries ===== Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery#x, [id=#x] @@ -503,33 +560,42 @@ Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery 
scalar-subquery +- Scan parquet default.explain_temp2 (4) -(4) Scan parquet default.explain_temp2 -Output: [key#x, val#x] +(4) Scan parquet default.explain_temp2 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp2] PushedFilters: [IsNotNull(val), GreaterThan(val,0)] ReadSchema: struct - + (5) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (6) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(val#x) AND (val#x > 0)) - + (7) Project [codegen id : 1] -Output : [key#x] -Input : [key#x, val#x] - +Output [1]: [key#x] +Input [2]: [key#x, val#x] + (8) HashAggregate [codegen id : 1] -Input: [key#x] - -(9) Exchange -Input: [max#x] - +Input [1]: [key#x] +Keys: [] +Functions [1]: [partial_max(key#x)] +Aggregate Attributes [1]: [max#x] +Results [1]: [max#x] + +(9) Exchange +Input [1]: [max#x] +Arguments: SinglePartition, true, [id=#x] + (10) HashAggregate [codegen id : 2] -Input: [max#x] - +Input [1]: [max#x] +Keys: [] +Functions [1]: [max(key#x)] +Aggregate Attributes [1]: [max(key#x)#x] +Results [1]: [max(key#x)#x AS max(key)#x] + Subquery:2 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery#x, [id=#x] * HashAggregate (17) +- Exchange (16) @@ -540,32 +606,41 @@ Subquery:2 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery +- Scan parquet default.explain_temp3 (11) -(11) Scan parquet default.explain_temp3 -Output: [key#x, val#x] +(11) Scan parquet default.explain_temp3 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp3] PushedFilters: [IsNotNull(val), GreaterThan(val,0)] ReadSchema: struct - + (12) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (13) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(val#x) AND (val#x > 0)) - + (14) Project [codegen id : 1] -Output : [key#x] -Input : [key#x, val#x] - +Output [1]: [key#x] +Input [2]: [key#x, val#x] + (15) HashAggregate [codegen id : 1] -Input: [key#x] - -(16) Exchange -Input: [max#x] - +Input [1]: [key#x] +Keys: [] +Functions [1]: [partial_avg(cast(key#x as bigint))] +Aggregate Attributes [2]: [sum#x, count#xL] +Results [2]: [sum#x, count#xL] + +(16) Exchange +Input [2]: [sum#x, count#xL] +Arguments: SinglePartition, true, [id=#x] + (17) HashAggregate [codegen id : 2] -Input: [max#x] +Input [2]: [sum#x, count#xL] +Keys: [] +Functions [1]: [avg(cast(key#x as bigint))] +Aggregate Attributes [1]: [avg(cast(key#x as bigint))#x] +Results [1]: [avg(cast(key#x as bigint))#x AS avg(key)#x] -- !query @@ -581,19 +656,19 @@ struct +- Scan parquet default.explain_temp1 (1) -(1) Scan parquet default.explain_temp1 +(1) Scan parquet default.explain_temp1 Output: [] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] ReadSchema: struct<> - + (2) ColumnarToRow [codegen id : 1] Input: [] - + (3) Project [codegen id : 1] -Output : [(Subquery scalar-subquery#x, [id=#x] + ReusedSubquery Subquery scalar-subquery#x, [id=#x]) AS (scalarsubquery() + scalarsubquery())#x] -Input : [] - +Output [1]: [(Subquery scalar-subquery#x, [id=#x] + ReusedSubquery Subquery scalar-subquery#x, [id=#x]) AS (scalarsubquery() + scalarsubquery())#x] +Input: [] + ===== Subqueries ===== Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery#x, [id=#x] @@ -604,24 +679,33 @@ Subquery:1 Hosting operator id = 3 Hosting 
Expression = Subquery scalar-subquery +- Scan parquet default.explain_temp1 (4) -(4) Scan parquet default.explain_temp1 -Output: [key#x] +(4) Scan parquet default.explain_temp1 +Output [1]: [key#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] ReadSchema: struct - + (5) ColumnarToRow [codegen id : 1] -Input: [key#x] - +Input [1]: [key#x] + (6) HashAggregate [codegen id : 1] -Input: [key#x] - -(7) Exchange -Input: [sum#x, count#xL] - +Input [1]: [key#x] +Keys: [] +Functions [1]: [partial_avg(cast(key#x as bigint))] +Aggregate Attributes [2]: [sum#x, count#xL] +Results [2]: [sum#x, count#xL] + +(7) Exchange +Input [2]: [sum#x, count#xL] +Arguments: SinglePartition, true, [id=#x] + (8) HashAggregate [codegen id : 2] -Input: [sum#x, count#xL] - +Input [2]: [sum#x, count#xL] +Keys: [] +Functions [1]: [avg(cast(key#x as bigint))] +Aggregate Attributes [1]: [avg(cast(key#x as bigint))#x] +Results [1]: [avg(cast(key#x as bigint))#x AS avg(key)#x] + Subquery:2 Hosting operator id = 3 Hosting Expression = ReusedSubquery Subquery scalar-subquery#x, [id=#x] @@ -649,48 +733,49 @@ struct +- Scan parquet default.explain_temp1 (5) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,10)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 2] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 2] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 10)) - + (4) Project [codegen id : 2] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(5) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(5) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,10)] ReadSchema: struct - + (6) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (7) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 10)) - + (8) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - -(9) BroadcastExchange -Input: [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + +(9) BroadcastExchange +Input [2]: [key#x, val#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + (10) BroadcastHashJoin [codegen id : 2] -Left keys: List(key#x) -Right keys: List(key#x) +Left keys [1]: [key#x] +Right keys [1]: [key#x] Join condition: None @@ -720,45 +805,59 @@ struct +- ReusedExchange (8) -(1) Scan parquet default.explain_temp1 -Output: [key#x, val#x] +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] Batched: true Location [not included in comparison]/{warehouse_dir}/explain_temp1] PushedFilters: [IsNotNull(key), GreaterThan(key,10)] ReadSchema: struct - + (2) ColumnarToRow [codegen id : 1] -Input: [key#x, val#x] - +Input [2]: [key#x, val#x] + (3) Filter [codegen id : 1] -Input : [key#x, val#x] +Input [2]: [key#x, val#x] Condition : (isnotnull(key#x) AND (key#x > 10)) - + (4) Project [codegen id : 1] -Output : [key#x, val#x] -Input : [key#x, val#x] - +Output [2]: [key#x, val#x] +Input [2]: [key#x, val#x] + (5) HashAggregate [codegen id : 1] 
-Input: [key#x, val#x] - -(6) Exchange -Input: [key#x, max#x] - +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_max(val#x)] +Aggregate Attributes [1]: [max#x] +Results [2]: [key#x, max#x] + +(6) Exchange +Input [2]: [key#x, max#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + (7) HashAggregate [codegen id : 4] -Input: [key#x, max#x] - -(8) ReusedExchange [Reuses operator id: 6] -Output : ArrayBuffer(key#x, max#x) - +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [2]: [key#x, max(val#x)#x AS max(val)#x] + +(8) ReusedExchange [Reuses operator id: 6] +Output [2]: [key#x, max#x] + (9) HashAggregate [codegen id : 3] -Input: [key#x, max#x] - -(10) BroadcastExchange -Input: [key#x, max(val)#x] - +Input [2]: [key#x, max#x] +Keys [1]: [key#x] +Functions [1]: [max(val#x)] +Aggregate Attributes [1]: [max(val#x)#x] +Results [2]: [key#x, max(val#x)#x AS max(val)#x] + +(10) BroadcastExchange +Input [2]: [key#x, max(val)#x] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#x] + (11) BroadcastHashJoin [codegen id : 4] -Left keys: List(key#x) -Right keys: List(key#x) +Left keys [1]: [key#x] +Right keys [1]: [key#x] Join condition: None @@ -776,14 +875,160 @@ Execute CreateViewCommand (1) +- UnresolvedRelation (3) -(1) Execute CreateViewCommand +(1) Execute CreateViewCommand Output: [] - -(2) CreateViewCommand - -(3) UnresolvedRelation - + +(2) CreateViewCommand +Arguments: `default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView + +(3) UnresolvedRelation +Arguments: [explain_temp1] + (4) Project +Arguments: ['key, 'val] + + +-- !query +EXPLAIN FORMATTED + SELECT + COUNT(val) + SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1 +-- !query schema +struct +-- !query output +== Physical Plan == +* HashAggregate (5) ++- Exchange (4) + +- HashAggregate (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input [2]: [key#x, val#x] + +(3) HashAggregate +Input [2]: [key#x, val#x] +Keys: [] +Functions [3]: [partial_count(val#x), partial_sum(cast(key#x as bigint)), partial_count(key#x) FILTER (WHERE (val#x > 1))] +Aggregate Attributes [3]: [count#xL, sum#xL, count#xL] +Results [3]: [count#xL, sum#xL, count#xL] + +(4) Exchange +Input [3]: [count#xL, sum#xL, count#xL] +Arguments: SinglePartition, true, [id=#x] + +(5) HashAggregate [codegen id : 2] +Input [3]: [count#xL, sum#xL, count#xL] +Keys: [] +Functions [3]: [count(val#x), sum(cast(key#x as bigint)), count(key#x)] +Aggregate Attributes [3]: [count(val#x)#xL, sum(cast(key#x as bigint))#xL, count(key#x)#xL] +Results [2]: [(count(val#x)#xL + sum(cast(key#x as bigint))#xL) AS TOTAL#xL, count(key#x)#xL AS count(key) FILTER (WHERE (val > 1))#xL] + + +-- !query +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +ObjectHashAggregate (5) ++- Exchange (4) + +- ObjectHashAggregate (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct 
+ +(2) ColumnarToRow [codegen id : 1] +Input [2]: [key#x, val#x] + +(3) ObjectHashAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_collect_set(val#x, 0, 0)] +Aggregate Attributes [1]: [buf#x] +Results [2]: [key#x, buf#x] + +(4) Exchange +Input [2]: [key#x, buf#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(5) ObjectHashAggregate +Input [2]: [key#x, buf#x] +Keys [1]: [key#x] +Functions [1]: [collect_set(val#x, 0, 0)] +Aggregate Attributes [1]: [collect_set(val#x, 0, 0)#x] +Results [2]: [key#x, sort_array(collect_set(val#x, 0, 0)#x, true)[0] AS sort_array(collect_set(val), true)[0]#x] + + +-- !query +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +SortAggregate (7) ++- * Sort (6) + +- Exchange (5) + +- SortAggregate (4) + +- * Sort (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output [2]: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input [2]: [key#x, val#x] + +(3) Sort [codegen id : 1] +Input [2]: [key#x, val#x] +Arguments: [key#x ASC NULLS FIRST], false, 0 + +(4) SortAggregate +Input [2]: [key#x, val#x] +Keys [1]: [key#x] +Functions [1]: [partial_min(val#x)] +Aggregate Attributes [1]: [min#x] +Results [2]: [key#x, min#x] + +(5) Exchange +Input [2]: [key#x, min#x] +Arguments: hashpartitioning(key#x, 4), true, [id=#x] + +(6) Sort [codegen id : 2] +Input [2]: [key#x, min#x] +Arguments: [key#x ASC NULLS FIRST], false, 0 + +(7) SortAggregate +Input [2]: [key#x, min#x] +Keys [1]: [key#x] +Functions [1]: [min(val#x)] +Aggregate Attributes [1]: [min(val#x)#x] +Results [2]: [key#x, min(val#x)#x AS min(val)#x] -- !query @@ -808,3 +1053,11 @@ DROP TABLE explain_temp3 struct<> -- !query output + + +-- !query +DROP TABLE explain_temp4 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/extract.sql.out b/sql/core/src/test/resources/sql-tests/results/extract.sql.out index 583459f9037b8..29cbefdb38541 100644 --- a/sql/core/src/test/resources/sql-tests/results/extract.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/extract.sql.out @@ -1,9 +1,9 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 64 +-- Number of queries: 96 -- !query -CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c +CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c, interval 10 year 20 month 30 day 40 hour 50 minute 6.7890 second as i -- !query schema struct<> -- !query output @@ -11,510 +11,765 @@ struct<> -- !query -select extract(millennium from c) from t +select extract(year from c), extract(year from i) from t -- !query schema -struct +struct -- !query output -3 +2011 11 -- !query -select extract(millennia from c) from t +select extract(y from c), extract(y from i) from t -- !query schema -struct +struct -- !query output -3 +2011 11 -- !query -select extract(mil from c) from t +select extract(years from c), extract(years from i) from t -- !query schema -struct +struct -- !query output -3 +2011 11 -- !query -select extract(mils from c) from t +select extract(yr from c), extract(yr from i) from t -- !query schema -struct +struct -- !query output -3 +2011 11 -- !query -select extract(century from c) from t +select extract(yrs from c), extract(yrs from i) from t -- !query schema -struct 
+struct -- !query output -21 +2011 11 -- !query -select extract(centuries from c) from t +select extract(yearofweek from c) from t -- !query schema -struct +struct -- !query output -21 +2011 -- !query -select extract(c from c) from t +select extract(quarter from c) from t -- !query schema -struct +struct -- !query output -21 +2 -- !query -select extract(cent from c) from t +select extract(qtr from c) from t -- !query schema -struct +struct -- !query output -21 +2 -- !query -select extract(decade from c) from t +select extract(month from c), extract(month from i) from t -- !query schema -struct +struct -- !query output -201 +5 8 -- !query -select extract(decades from c) from t +select extract(mon from c), extract(mon from i) from t -- !query schema -struct +struct -- !query output -201 +5 8 -- !query -select extract(dec from c) from t +select extract(mons from c), extract(mons from i) from t -- !query schema -struct +struct -- !query output -201 +5 8 -- !query -select extract(decs from c) from t +select extract(months from c), extract(months from i) from t -- !query schema -struct +struct -- !query output -201 +5 8 -- !query -select extract(year from c) from t +select extract(week from c) from t -- !query schema -struct +struct -- !query output -2011 +18 -- !query -select extract(y from c) from t +select extract(w from c) from t -- !query schema -struct +struct -- !query output -2011 +18 -- !query -select extract(years from c) from t +select extract(weeks from c) from t -- !query schema -struct +struct -- !query output -2011 +18 -- !query -select extract(yr from c) from t +select extract(day from c), extract(day from i) from t -- !query schema -struct +struct -- !query output -2011 +6 31 -- !query -select extract(yrs from c) from t +select extract(d from c), extract(d from i) from t -- !query schema -struct +struct -- !query output -2011 +6 31 + + +-- !query +select extract(days from c), extract(days from i) from t +-- !query schema +struct +-- !query output +6 31 + + +-- !query +select extract(dayofweek from c) from t +-- !query schema +struct +-- !query output +6 + + +-- !query +select extract(dow from c) from t +-- !query schema +struct +-- !query output +6 + + +-- !query +select extract(dayofweek_iso from c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select extract(dow_iso from c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select extract(doy from c) from t +-- !query schema +struct +-- !query output +126 + + +-- !query +select extract(hour from c), extract(hour from i) from t +-- !query schema +struct +-- !query output +7 16 + + +-- !query +select extract(h from c), extract(h from i) from t +-- !query schema +struct +-- !query output +7 16 + + +-- !query +select extract(hours from c), extract(hours from i) from t +-- !query schema +struct +-- !query output +7 16 + + +-- !query +select extract(hr from c), extract(hr from i) from t +-- !query schema +struct +-- !query output +7 16 + + +-- !query +select extract(hrs from c), extract(hrs from i) from t +-- !query schema +struct +-- !query output +7 16 + + +-- !query +select extract(minute from c), extract(minute from i) from t +-- !query schema +struct +-- !query output +8 50 + + +-- !query +select extract(m from c), extract(m from i) from t +-- !query schema +struct +-- !query output +8 50 + + +-- !query +select extract(min from c), extract(min from i) from t +-- !query schema +struct +-- !query output +8 50 + + +-- !query +select extract(mins from c), extract(mins from i) from t +-- 
!query schema +struct +-- !query output +8 50 + + +-- !query +select extract(minutes from c), extract(minutes from i) from t +-- !query schema +struct +-- !query output +8 50 + + +-- !query +select extract(second from c), extract(second from i) from t +-- !query schema +struct +-- !query output +9.123456 6.789000 + + +-- !query +select extract(s from c), extract(s from i) from t +-- !query schema +struct +-- !query output +9.123456 6.789000 + + +-- !query +select extract(sec from c), extract(sec from i) from t +-- !query schema +struct +-- !query output +9.123456 6.789000 + + +-- !query +select extract(seconds from c), extract(seconds from i) from t +-- !query schema +struct +-- !query output +9.123456 6.789000 + + +-- !query +select extract(secs from c), extract(secs from i) from t +-- !query schema +struct +-- !query output +9.123456 6.789000 + + +-- !query +select extract(not_supported from c) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Literals of type 'not_supported' are currently not supported for the string type.;; line 1 pos 7 + + +-- !query +select extract(not_supported from i) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Literals of type 'not_supported' are currently not supported for the interval type.;; line 1 pos 7 + + +-- !query +select date_part('year', c), date_part('year', i) from t +-- !query schema +struct +-- !query output +2011 11 + + +-- !query +select date_part('y', c), date_part('y', i) from t +-- !query schema +struct +-- !query output +2011 11 -- !query -select extract(isoyear from c) from t +select date_part('years', c), date_part('years', i) from t -- !query schema -struct +struct +-- !query output +2011 11 + + +-- !query +select date_part('yr', c), date_part('yr', i) from t +-- !query schema +struct +-- !query output +2011 11 + + +-- !query +select date_part('yrs', c), date_part('yrs', i) from t +-- !query schema +struct +-- !query output +2011 11 + + +-- !query +select date_part('yearofweek', c) from t +-- !query schema +struct -- !query output 2011 -- !query -select extract(quarter from c) from t +select date_part('quarter', c) from t -- !query schema -struct +struct -- !query output 2 -- !query -select extract(qtr from c) from t +select date_part('qtr', c) from t -- !query schema -struct +struct -- !query output 2 -- !query -select extract(month from c) from t +select date_part('month', c), date_part('month', i) from t -- !query schema -struct +struct -- !query output -5 +5 8 -- !query -select extract(mon from c) from t +select date_part('mon', c), date_part('mon', i) from t -- !query schema -struct +struct -- !query output -5 +5 8 -- !query -select extract(mons from c) from t +select date_part('mons', c), date_part('mons', i) from t -- !query schema -struct +struct -- !query output -5 +5 8 -- !query -select extract(months from c) from t +select date_part('months', c), date_part('months', i) from t -- !query schema -struct +struct -- !query output -5 +5 8 -- !query -select extract(week from c) from t +select date_part('week', c) from t -- !query schema -struct +struct -- !query output 18 -- !query -select extract(w from c) from t +select date_part('w', c) from t -- !query schema -struct +struct -- !query output 18 -- !query -select extract(weeks from c) from t +select date_part('weeks', c) from t -- !query schema -struct +struct -- !query output 18 -- !query -select extract(day from c) from t +select date_part('day', c), date_part('day', i) from t -- !query 
schema -struct +struct -- !query output -6 +6 31 -- !query -select extract(d from c) from t +select date_part('d', c), date_part('d', i) from t -- !query schema -struct +struct -- !query output -6 +6 31 -- !query -select extract(days from c) from t +select date_part('days', c), date_part('days', i) from t -- !query schema -struct +struct +-- !query output +6 31 + + +-- !query +select date_part('dayofweek', c) from t +-- !query schema +struct -- !query output 6 -- !query -select extract(dayofweek from c) from t +select date_part('dow', c) from t -- !query schema -struct +struct -- !query output 6 -- !query -select extract(dow from c) from t +select date_part('dayofweek_iso', c) from t -- !query schema -struct +struct -- !query output 5 -- !query -select extract(isodow from c) from t +select date_part('dow_iso', c) from t -- !query schema -struct +struct -- !query output 5 -- !query -select extract(doy from c) from t +select date_part('doy', c) from t -- !query schema -struct +struct -- !query output 126 -- !query -select extract(hour from c) from t +select date_part('hour', c), date_part('hour', i) from t -- !query schema -struct +struct -- !query output -7 +7 16 -- !query -select extract(h from c) from t +select date_part('h', c), date_part('h', i) from t -- !query schema -struct +struct -- !query output -7 +7 16 -- !query -select extract(hours from c) from t +select date_part('hours', c), date_part('hours', i) from t -- !query schema -struct +struct -- !query output -7 +7 16 -- !query -select extract(hr from c) from t +select date_part('hr', c), date_part('hr', i) from t -- !query schema -struct +struct -- !query output -7 +7 16 -- !query -select extract(hrs from c) from t +select date_part('hrs', c), date_part('hrs', i) from t -- !query schema -struct +struct -- !query output -7 +7 16 -- !query -select extract(minute from c) from t +select date_part('minute', c), date_part('minute', i) from t -- !query schema -struct +struct -- !query output -8 +8 50 -- !query -select extract(m from c) from t +select date_part('m', c), date_part('m', i) from t -- !query schema -struct +struct -- !query output -8 +8 50 -- !query -select extract(min from c) from t +select date_part('min', c), date_part('min', i) from t -- !query schema -struct +struct -- !query output -8 +8 50 -- !query -select extract(mins from c) from t +select date_part('mins', c), date_part('mins', i) from t -- !query schema -struct +struct -- !query output -8 +8 50 -- !query -select extract(minutes from c) from t +select date_part('minutes', c), date_part('minutes', i) from t -- !query schema -struct +struct -- !query output -8 +8 50 -- !query -select extract(second from c) from t +select date_part('second', c), date_part('second', i) from t -- !query schema -struct +struct -- !query output -9.123456 +9.123456 6.789000 -- !query -select extract(s from c) from t +select date_part('s', c), date_part('s', i) from t -- !query schema -struct +struct -- !query output -9.123456 +9.123456 6.789000 -- !query -select extract(sec from c) from t +select date_part('sec', c), date_part('sec', i) from t -- !query schema -struct +struct -- !query output -9.123456 +9.123456 6.789000 -- !query -select extract(seconds from c) from t +select date_part('seconds', c), date_part('seconds', i) from t -- !query schema -struct +struct -- !query output -9.123456 +9.123456 6.789000 -- !query -select extract(secs from c) from t +select date_part('secs', c), date_part('secs', i) from t -- !query schema -struct +struct -- !query output -9.123456 +9.123456 6.789000 
+ + +-- !query +select date_part('not_supported', c) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Literals of type 'not_supported' are currently not supported for the string type.;; line 1 pos 7 -- !query -select extract(milliseconds from c) from t +select date_part(c, c) from t -- !query schema -struct +struct<> -- !query output -9123.456 +org.apache.spark.sql.AnalysisException +The field parameter needs to be a foldable string value.;; line 1 pos 7 -- !query -select extract(msec from c) from t +select date_part(null, c) from t -- !query schema -struct +struct -- !query output -9123.456 +NULL -- !query -select extract(msecs from c) from t +select date_part(i, i) from t -- !query schema -struct +struct<> -- !query output -9123.456 +org.apache.spark.sql.AnalysisException +The field parameter needs to be a foldable string value.;; line 1 pos 7 -- !query -select extract(millisecon from c) from t +select date_part(null, i) from t -- !query schema -struct +struct -- !query output -9123.456 +NULL -- !query -select extract(mseconds from c) from t +select extract('year', c) from t -- !query schema -struct +struct -- !query output -9123.456 +2011 -- !query -select extract(ms from c) from t +select extract('quarter', c) from t -- !query schema -struct +struct -- !query output -9123.456 +2 -- !query -select extract(microseconds from c) from t +select extract('month', c) from t -- !query schema -struct +struct -- !query output -9123456 +5 -- !query -select extract(usec from c) from t +select extract('week', c) from t -- !query schema -struct +struct -- !query output -9123456 +18 -- !query -select extract(usecs from c) from t +select extract('day', c) from t -- !query schema -struct +struct -- !query output -9123456 +6 -- !query -select extract(useconds from c) from t +select extract('days', c) from t -- !query schema -struct +struct -- !query output -9123456 +6 -- !query -select extract(microsecon from c) from t +select extract('dayofweek', c) from t -- !query schema -struct +struct -- !query output -9123456 +6 -- !query -select extract(us from c) from t +select extract('dow', c) from t -- !query schema -struct +struct -- !query output -9123456 +6 -- !query -select extract(epoch from c) from t +select extract('doy', c) from t -- !query schema -struct +struct -- !query output -1304665689.123456 +126 -- !query -select extract(not_supported from c) from t +select extract('hour', c) from t -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.catalyst.parser.ParseException +7 -Literals of type 'not_supported' are currently not supported.(line 1, pos 7) -== SQL == -select extract(not_supported from c) from t --------^^^ +-- !query +select extract('minute', c) from t +-- !query schema +struct +-- !query output +8 + + +-- !query +select extract('second', c) from t +-- !query schema +struct +-- !query output +9.123456 diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out index a032678e90fe8..9ff7f711ea296 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out @@ -272,7 +272,7 @@ struct= 0)):bigint> -- !query SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 GROUP BY 1 -- !query schema -struct= 1)):struct> +struct= 1)):struct> -- !query output @@ -369,13 +369,13 @@ org.apache.spark.sql.AnalysisException IN/EXISTS 
predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, avg(salary#x) AS avg(salary)#x, avg(salary#x) FILTER (WHERE exists#x [dept_id#x]) AS avg(salary) FILTER (WHERE exists(dept_id))#x] : +- Project [state#x] : +- Filter (dept_id#x = outer(dept_id#x)) -: +- SubqueryAlias `dept` +: +- SubqueryAlias dept : +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias `DEPT` +: +- SubqueryAlias DEPT : +- LocalRelation [dept_id#x, dept_name#x, state#x] -+- SubqueryAlias `emp` ++- SubqueryAlias emp +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias `EMP` + +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; @@ -395,13 +395,13 @@ org.apache.spark.sql.AnalysisException IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, sum(salary#x) AS sum(salary)#x, sum(salary#x) FILTER (WHERE NOT exists#x [dept_id#x]) AS sum(salary) FILTER (WHERE (NOT exists(dept_id)))#x] : +- Project [state#x] : +- Filter (dept_id#x = outer(dept_id#x)) -: +- SubqueryAlias `dept` +: +- SubqueryAlias dept : +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias `DEPT` +: +- SubqueryAlias DEPT : +- LocalRelation [dept_id#x, dept_name#x, state#x] -+- SubqueryAlias `emp` ++- SubqueryAlias emp +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias `EMP` + +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; @@ -420,13 +420,13 @@ org.apache.spark.sql.AnalysisException IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, avg(salary#x) AS avg(salary)#x, avg(salary#x) FILTER (WHERE dept_id#x IN (list#x [])) AS avg(salary) FILTER (WHERE (dept_id IN (listquery())))#x] : +- Distinct : +- Project [dept_id#x] -: +- SubqueryAlias `dept` +: +- SubqueryAlias dept : +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias `DEPT` +: +- SubqueryAlias DEPT : +- LocalRelation [dept_id#x, dept_name#x, state#x] -+- SubqueryAlias `emp` ++- SubqueryAlias emp +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias `EMP` + +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; @@ -445,13 +445,13 @@ org.apache.spark.sql.AnalysisException IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, sum(salary#x) AS sum(salary)#x, sum(salary#x) FILTER (WHERE NOT dept_id#x IN (list#x [])) AS sum(salary) FILTER (WHERE (NOT (dept_id IN (listquery()))))#x] : +- Distinct : +- Project [dept_id#x] -: +- SubqueryAlias `dept` +: +- SubqueryAlias dept : +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias `DEPT` +: +- SubqueryAlias DEPT : +- LocalRelation [dept_id#x, dept_name#x, state#x] -+- SubqueryAlias `emp` ++- SubqueryAlias emp +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias `EMP` + +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 7bfdd0ad53a95..e5b705895340e 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- 
Number of queries: 56 +-- Number of queries: 61 -- !query @@ -87,7 +87,7 @@ struct -- !query SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1 -- !query schema -struct> +struct> -- !query output @@ -277,6 +277,67 @@ org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and '`id`' is not an aggregate function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or first_value) if you don't care which value you get.; +-- !query +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true +-- !query schema +struct +-- !query output +spark.sql.legacy.parser.havingWithoutGroupByAsWhere true + + +-- !query +SELECT 1 FROM range(10) HAVING true +-- !query schema +struct<1:int> +-- !query output +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 + + +-- !query +SELECT 1 FROM range(10) HAVING MAX(id) > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +Aggregate/Window/Generate expressions are not valid in where clause of the query. +Expression in where clause: [(max(`id`) > CAST(0 AS BIGINT))] +Invalid expressions: [max(`id`)]; + + +-- !query +SELECT id FROM range(10) HAVING id > 0 +-- !query schema +struct +-- !query output +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query +SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=false +-- !query schema +struct +-- !query output +spark.sql.legacy.parser.havingWithoutGroupByAsWhere false + + -- !query CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES (1, true), (1, false), diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out index 8eeabb34b4fab..0a5fe7a727241 100644 --- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out @@ -138,7 +138,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -extraneous input 'ROLLUP' expecting (line 1, pos 53) +extraneous input 'ROLLUP' expecting {, ';'}(line 1, pos 53) == SQL == SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP @@ -152,7 +152,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -extraneous input 'CUBE' expecting (line 1, pos 53) +extraneous input 'CUBE' expecting {, ';'}(line 1, pos 53) == SQL == SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE diff --git a/sql/core/src/test/resources/sql-tests/results/having.sql.out b/sql/core/src/test/resources/sql-tests/results/having.sql.out index 5bd185d7b815d..6508143e6f9fe 100644 --- a/sql/core/src/test/resources/sql-tests/results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/having.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 5 +-- Number of queries: 9 -- !query @@ -47,3 +47,69 @@ struct<(a + CAST(b AS BIGINT)):bigint> -- !query output 3 7 + + +-- !query +SELECT SUM(a) AS b, CAST('2020-01-01' AS DATE) AS fake FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY b HAVING b > 10 +-- !query schema +struct +-- !query output +2 2020-01-01 + + +-- !query +SELECT SUM(a) AS b FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY GROUPING SETS ((b), (a, b)) HAVING b > 10 +-- !query schema +struct +-- !query output +2 +2 + + +-- !query +SELECT SUM(a) AS b FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY CUBE(a, b) HAVING b > 10 +-- !query schema +struct +-- !query output +2 +2 + + +-- !query +SELECT SUM(a) AS b FROM VALUES (1, 10), (2, 20) AS T(a, b) GROUP BY ROLLUP(a, b) 
HAVING b > 10 +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY GROUPING SETS(t.c1) HAVING t.c1 = 1 +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY CUBE(t.c1) HAVING t.c1 = 1 +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY ROLLUP(t.c1) HAVING t.c1 = 1 +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT c1 FROM VALUES (1, 2) as t(c1, c2) GROUP BY t.c1 HAVING t.c1 = 1 +-- !query schema +struct +-- !query output +1 diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index d35d0d5d944bb..7b31b5690998c 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 29 +-- Number of queries: 30 -- !query @@ -270,3 +270,11 @@ select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t( struct> -- !query output [32,98] + + +-- !query +select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array(x))) +-- !query schema +struct>> +-- !query output +[[""]] diff --git a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out index 4762082dc3be2..b99f63393cc4d 100644 --- a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 22 +-- Number of queries: 26 -- !query @@ -291,6 +291,38 @@ struct spark.sql.legacy.setopsPrecedence.enabled false +-- !query +CREATE OR REPLACE TEMPORARY VIEW tab3 AS VALUES (decimal(1)), (decimal(2)) tbl3(v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t.v FROM ( + SELECT v FROM tab3 + INTERSECT + SELECT v + v AS v FROM tab3 +) t +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT SUM(t.v) FROM ( + SELECT v FROM tab3 + INTERSECT + SELECT v + v AS v FROM tab3 +) t +-- !query schema +struct +-- !query output +2 + + -- !query DROP VIEW IF EXISTS tab1 -- !query schema @@ -305,3 +337,11 @@ DROP VIEW IF EXISTS tab2 struct<> -- !query output + + +-- !query +DROP VIEW IF EXISTS tab3 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out index 8f523a35f3c19..3068e8ede70f9 100644 --- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 101 +-- Number of queries: 107 -- !query @@ -26,13 +26,44 @@ struct +-- !query output +178956970 years 7 months + + +-- !query +select interval 2147483647 month / 0.5 +-- !query schema +struct +-- !query output +178956970 years 7 months + + +-- !query +select interval 2147483647 day * 2 +-- !query schema +struct +-- !query output +2147483647 days 2562047788 hours 54.775807 seconds + + +-- !query +select interval 2147483647 day / 0.5 +-- !query schema +struct +-- !query output +2147483647 days 2562047788 hours 
54.775807 seconds + + -- !query select interval '2 seconds' / 0 -- !query schema -struct<> +struct -- !query output -java.lang.ArithmeticException -divide by zero +NULL -- !query @@ -142,11 +173,27 @@ struct -- !query select make_interval(1, 2, 3, 4, 5, 6, 7.008009) -- !query schema -struct +struct -- !query output 1 years 2 months 25 days 5 hours 6 minutes 7.008009 seconds +-- !query +select make_interval(1, 2, 3, 4, 0, 0, 123456789012.123456) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 34293552 hours 30 minutes 12.123456 seconds + + +-- !query +select make_interval(0, 0, 0, 0, 0, 0, 1234567890123456789) +-- !query schema +struct +-- !query output +NULL + + -- !query select cast('1 second' as interval) -- !query schema @@ -307,6 +354,14 @@ struct 30 days +-- !query +select interval 30 days days +-- !query schema +struct +-- !query output +30 days + + -- !query select interval '20 15:40:32.99899999' day to hour -- !query schema @@ -314,7 +369,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 20 15:40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '20 15:40:32.99899999' day to hour @@ -328,7 +383,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 20 15:40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '20 15:40:32.99899999' day to minute @@ -342,7 +397,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 15:40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '15:40:32.99899999' hour to minute @@ -356,7 +411,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '15:40.99899999' hour to second @@ -370,7 +425,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40, set 
spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '15:40' hour to second @@ -384,7 +439,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 20 40:32.99899999(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 20 40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == select interval '20 40:32.99899999' minute to second @@ -429,6 +484,14 @@ struct 3 years 1 hours +-- !query +SELECT interval '1 year 3 months 2 weeks 2 days 1 hour 3 minutes 2 seconds 100 millisecond 200 microseconds' +-- !query schema +struct +-- !query output +1 years 3 months 16 days 1 hours 3 minutes 2.1002 seconds + + -- !query select interval -- !query schema @@ -608,7 +671,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -extraneous input 'day' expecting (line 1, pos 27) +extraneous input 'day' expecting {, ';'}(line 1, pos 27) == SQL == select interval 30 day day day @@ -616,177 +679,35 @@ select interval 30 day day day -- !query -select sum(cast(null as interval)) +select interval (-30) days -- !query schema -struct --- !query output -NULL - - --- !query -select sum(cast(v as interval)) from VALUES ('1 seconds') t(v) where 1=0 --- !query schema -struct --- !query output -NULL - - --- !query -select sum(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -3 seconds - - --- !query -select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -1 seconds - - --- !query -select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) --- !query schema -struct --- !query output --3 seconds - - --- !query -select sum(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output --7 days 2 seconds - - --- !query -select - i, - sum(cast(v as interval)) -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -group by i --- !query schema -struct --- !query output -1 -2 days -2 2 seconds -3 NULL - - --- !query -select - sum(cast(v as interval)) as sv -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -having sv is not null --- !query schema -struct --- !query output --2 days 2 seconds - - --- !query -SELECT - i, - sum(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) -FROM VALUES(1, '1 seconds'), (1, '2 seconds'), (2, NULL), (2, NULL) t(i,v) --- !query schema -struct --- !query output -1 2 seconds -1 3 seconds -2 NULL -2 NULL - - --- !query -select avg(cast(v as interval)) from VALUES (null) t(v) --- !query schema -struct --- !query output -NULL - - --- !query -select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) where 1=0 --- !query schema -struct --- !query output -NULL - - --- !query -select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query output -1.5 seconds - - --- !query -select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) --- !query schema -struct --- !query 
output -0.5 seconds - - --- !query -select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) --- !query schema -struct --- !query output --1.5 seconds - - --- !query -select avg(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) --- !query schema -struct +struct<> -- !query output --3 days -11 hours -59 minutes -59 seconds +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 -- !query -select - i, - avg(cast(v as interval)) -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -group by i +select interval (a + 1) days -- !query schema -struct +struct<> -- !query output -1 -1 days -2 2 seconds -3 NULL +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 -- !query -select - avg(cast(v as interval)) as sv -from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) -having sv is not null +select interval 30 days days days -- !query schema -struct +struct<> -- !query output --15 hours -59 minutes -59.333333 seconds +org.apache.spark.sql.catalyst.parser.ParseException +extraneous input 'days' expecting {, ';'}(line 1, pos 29) --- !query -SELECT - i, - avg(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) -FROM VALUES (1,'1 seconds'), (1,'2 seconds'), (2,NULL), (2,NULL) t(i,v) --- !query schema -struct --- !query output -1 1.5 seconds -1 2 seconds -2 NULL -2 NULL +== SQL == +select interval 30 days days days +-----------------------------^^^ -- !query @@ -811,7 +732,7 @@ select interval '2-2' year to month + dateval from interval_arithmetic -- !query schema -struct +struct -- !query output 2012-01-01 2009-11-01 2014-03-01 2014-03-01 2009-11-01 2009-11-01 2014-03-01 @@ -854,7 +775,7 @@ select interval '99 11:22:33.123456789' day to second + dateval from interval_arithmetic -- !query schema -struct +struct -- !query output 2012-01-01 2011-09-23 2012-04-09 2012-04-09 2011-09-23 2011-09-23 2012-04-09 @@ -926,6 +847,65 @@ struct 1 days +-- !query +select interval '2-2\t' year to month +-- !query schema +struct +-- !query output +2 years 2 months + + +-- !query +select interval '-\t2-2\t' year to month +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Interval string does not match year-month format of 'y-m': - 2-2 (line 1, pos 16) + +== SQL == +select interval '-\t2-2\t' year to month +----------------^^^ + + +-- !query +select interval '\n0 12:34:46.789\t' day to second +-- !query schema +struct +-- !query output +12 hours 34 minutes 46.789 seconds + + +-- !query +select interval '\n-\t10\t 12:34:46.789\t' day to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': +- 10 12:34:46.789 , set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '\n-\t10\t 12:34:46.789\t' day to second +----------------^^^ + + +-- !query +select interval '中文 interval 1 day' +-- !query schema +struct<> +-- !query output 
+org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7) + +== SQL == +select interval '中文 interval 1 day' +-------^^^ + + -- !query select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) -- !query schema @@ -953,48 +933,172 @@ struct<(b + INTERVAL '1 months'):interval> -- !query select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) -- !query schema -struct<> +struct -- !query output -java.lang.ArithmeticException -integer overflow +-178956970 years -8 months -- !query select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) -- !query schema +struct +-- !query output +-178956970 years -8 months + + +-- !query +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval') +-- !query schema +struct,to_csv(from_csv(1, 1 day)):string,to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes')):string,from_csv(to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes'))):struct> +-- !query output +{"a":1,"b":1 days} 1,1 days 2 years 8 months,1 hours 10 minutes {"a":2 years 8 months,"b":1 hours 10 minutes} + + +-- !query +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval') +-- !query schema +struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> +-- !query output +{"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} + + +-- !query +select interval '+' +-- !query schema struct<> -- !query output -java.lang.ArithmeticException -integer overflow +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +(line 1, pos 7) + +== SQL == +select interval '+' +-------^^^ -- !query -SELECT from_csv('1, 1 day', 'a INT, b interval') +select interval '+.' -- !query schema -struct> +struct<> -- !query output -{"a":1,"b":1 days} +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: +.(line 1, pos 7) + +== SQL == +select interval '+.' 
+-------^^^ -- !query -SELECT to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)) +select interval '1' -- !query schema -struct +struct<> -- !query output -2 years 8 months,1 hours 10 minutes +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1(line 1, pos 7) + +== SQL == +select interval '1' +-------^^^ -- !query -SELECT from_json('{"a":"1 days"}', 'a interval') +select interval '1.2' -- !query schema -struct> +struct<> -- !query output -{"a":1 days} +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1.2(line 1, pos 7) + +== SQL == +select interval '1.2' +-------^^^ + + +-- !query +select interval '- 2' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: - 2(line 1, pos 7) + +== SQL == +select interval '- 2' +-------^^^ + + +-- !query +select interval '1 day -' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day -(line 1, pos 7) + +== SQL == +select interval '1 day -' +-------^^^ + + +-- !query +select interval '1 day 1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the INTERVAL value: 1 day 1(line 1, pos 7) + +== SQL == +select interval '1 day 1' +-------^^^ + + +-- !query +select interval '1 day 2' day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only use numbers in the interval value part for multiple unit value pairs interval form, but got invalid value: 1 day 2(line 1, pos 16) + +== SQL == +select interval '1 day 2' day +----------------^^^ + + +-- !query +select interval 'interval 1' day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only use numbers in the interval value part for multiple unit value pairs interval form, but got invalid value: interval 1(line 1, pos 16) + +== SQL == +select interval 'interval 1' day +----------------^^^ -- !query -SELECT to_json(map('a', interval 25 month 100 day 130 minute)) +select interval '-\t 1' day -- !query schema -struct +struct -- !query output -{"a":"2 years 1 months 100 days 2 hours 10 minutes"} +-1 days diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 21a3531caf732..665c79c4753bc 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 42 +-- Number of queries: 46 -- !query @@ -288,6 +288,49 @@ struct>> NULL +-- !query +select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp') +-- !query schema +struct> +-- !query output +{"d":2012-12-15,"t":2012-12-15 15:15:15} + + +-- !query +select from_json( + '{"d": "12/15 2012", "t": "12/15 2012 15:15:15"}', + 'd date, t timestamp', + map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss')) +-- !query schema +struct> +-- !query output +{"d":2012-12-15,"t":2012-12-15 15:15:15} + + +-- !query +select from_json( + '{"d": "02-29"}', + 'd date', + map('dateFormat', 'MM-dd')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of 
Spark 3.0: Fail to parse '02-29' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + +-- !query +select from_json( + '{"t": "02-29"}', + 't timestamp', + map('timestampFormat', 'MM-dd')) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to parse '02-29' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string. + + -- !query select to_json(array('1', '2', '3')) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/limit.sql.out b/sql/core/src/test/resources/sql-tests/results/limit.sql.out index 281326e22a97a..074e7a6d28c47 100644 --- a/sql/core/src/test/resources/sql-tests/results/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/limit.sql.out @@ -88,7 +88,7 @@ SELECT * FROM testdata LIMIT key > 3 struct<> -- !query output org.apache.spark.sql.AnalysisException -The limit expression must evaluate to a constant value, but got (testdata.`key` > 3); +The limit expression must evaluate to a constant value, but got (spark_catalog.default.testdata.`key` > 3); -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out b/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out deleted file mode 100644 index 3f933f4c0e449..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out +++ /dev/null @@ -1,146 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 18 - - --- !query -set spark.sql.legacy.integralDivide.returnBigint=true --- !query schema -struct --- !query output -spark.sql.legacy.integralDivide.returnBigint true - - --- !query -select 5 div 2 --- !query schema -struct<(5 div 2):bigint> --- !query output -2 - - --- !query -select 5 div 0 --- !query schema -struct<(5 div 0):bigint> --- !query output -NULL - - --- !query -select 5 div null --- !query schema -struct<(5 div CAST(NULL AS INT)):bigint> --- !query output -NULL - - --- !query -select null div 5 --- !query schema -struct<(CAST(NULL AS INT) div 5):bigint> --- !query output -NULL - - --- !query -select cast(51 as decimal(10, 0)) div cast(2 as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(51 AS DECIMAL(10,0)) AS DECIMAL(10,0)) div CAST(CAST(2 AS DECIMAL(2,0)) AS DECIMAL(10,0))):bigint> --- !query output -25 - - --- !query -select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(0 AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> --- !query output -NULL - - --- !query -select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(NULL AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> --- !query output -NULL - - --- !query -select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(NULL AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(5 AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> --- !query output -NULL - - --- !query -set spark.sql.legacy.integralDivide.returnBigint=false --- !query schema -struct --- !query output -spark.sql.legacy.integralDivide.returnBigint false - - --- !query -select 5 div 2 --- !query schema -struct<(5 div 2):int> --- 
!query output -2 - - --- !query -select 5 div 0 --- !query schema -struct<(5 div 0):int> --- !query output -NULL - - --- !query -select 5 div null --- !query schema -struct<(5 div CAST(NULL AS INT)):int> --- !query output -NULL - - --- !query -select null div 5 --- !query schema -struct<(CAST(NULL AS INT) div 5):int> --- !query output -NULL - - --- !query -select cast(51 as decimal(10, 0)) div cast(2 as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(51 AS DECIMAL(10,0)) AS DECIMAL(10,0)) div CAST(CAST(2 AS DECIMAL(2,0)) AS DECIMAL(10,0))):decimal(10,0)> --- !query output -25 - - --- !query -select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(0 AS DECIMAL(2,0)) AS DECIMAL(2,0))):decimal(1,0)> --- !query output -NULL - - --- !query -select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(NULL AS DECIMAL(2,0)) AS DECIMAL(2,0))):decimal(1,0)> --- !query output -NULL - - --- !query -select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)) --- !query schema -struct<(CAST(CAST(NULL AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(5 AS DECIMAL(2,0)) AS DECIMAL(2,0))):decimal(1,0)> --- !query output -NULL diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index 548281014afd7..9accc57d0bf60 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 49 +-- Number of queries: 57 -- !query @@ -154,6 +154,70 @@ struct<(CAST(NULL AS DOUBLE) / CAST(5 AS DOUBLE)):double> NULL +-- !query +select 5 div 2 +-- !query schema +struct<(CAST(5 AS BIGINT) div CAST(2 AS BIGINT)):bigint> +-- !query output +2 + + +-- !query +select 5 div 0 +-- !query schema +struct<(CAST(5 AS BIGINT) div CAST(0 AS BIGINT)):bigint> +-- !query output +NULL + + +-- !query +select 5 div null +-- !query schema +struct<(CAST(5 AS BIGINT) div CAST(NULL AS BIGINT)):bigint> +-- !query output +NULL + + +-- !query +select null div 5 +-- !query schema +struct<(CAST(NULL AS BIGINT) div CAST(5 AS BIGINT)):bigint> +-- !query output +NULL + + +-- !query +select cast(51 as decimal(10, 0)) div cast(2 as decimal(2, 0)) +-- !query schema +struct<(CAST(CAST(51 AS DECIMAL(10,0)) AS DECIMAL(10,0)) div CAST(CAST(2 AS DECIMAL(2,0)) AS DECIMAL(10,0))):bigint> +-- !query output +25 + + +-- !query +select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)) +-- !query schema +struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(0 AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> +-- !query output +NULL + + +-- !query +select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)) +-- !query schema +struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(NULL AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> +-- !query output +NULL + + +-- !query +select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)) +-- !query schema +struct<(CAST(CAST(NULL AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(5 AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> +-- !query output +NULL + + -- !query select 1 + 2 -- !query schema @@ -229,7 +293,7 @@ struct -- !query select ceiling(0) -- !query schema -struct +struct -- !query output 0 @@ -237,7 +301,7 @@ struct -- !query select ceiling(1) -- !query schema 
-struct +struct -- !query output 1 @@ -253,7 +317,7 @@ struct -- !query select ceiling(1234567890123456) -- !query schema -struct +struct -- !query output 1234567890123456 @@ -269,7 +333,7 @@ struct -- !query select ceiling(-0.10) -- !query schema -struct +struct -- !query output 0 @@ -325,7 +389,7 @@ true -- !query select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null) -- !query schema -struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double> +struct -- !query output 1 NULL 0 NULL NULL NULL @@ -341,7 +405,7 @@ struct -- !query select CHAR_LENGTH('abc') -- !query schema -struct +struct -- !query output 3 @@ -349,7 +413,7 @@ struct -- !query select CHARACTER_LENGTH('abc') -- !query schema -struct +struct -- !query output 3 @@ -373,7 +437,7 @@ struct -- !query select positive('-1.11'), positive(-1.11), negative('-1.11'), negative(-1.11) -- !query schema -struct<(+ CAST(-1.11 AS DOUBLE)):double,(+ -1.11):decimal(3,2),(- CAST(-1.11 AS DOUBLE)):double,(- -1.11):decimal(3,2)> +struct<(+ CAST(-1.11 AS DOUBLE)):double,(+ -1.11):decimal(3,2),negative(CAST(-1.11 AS DOUBLE)):double,negative(-1.11):decimal(3,2)> -- !query output -1.11 -1.11 1.11 1.11 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out index 5efb58c7fc1b0..f7bba96738eab 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out @@ -381,8 +381,8 @@ struct<> org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. 
-Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(b.`four` AS BIGINT))] -Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; +Expression in where clause: [(sum(DISTINCT CAST((outer(a.`four`) + b.`four`) AS BIGINT)) = CAST(b.`four` AS BIGINT))] +Invalid expressions: [sum(DISTINCT CAST((outer(a.`four`) + b.`four`) AS BIGINT))]; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out index 4ea49013a62d1..637c5561bd940 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 7 -- !query @@ -36,129 +36,32 @@ before multi-line -- !query /* This is an example of SQL which should not execute: - * select 'multi-line' --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* This is an example of SQL which should not execute: -^^^ - * select 'multi-line' - - --- !query -*/ + * select 'multi-line'; + */ SELECT 'after multi-line' AS fifth -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -extraneous input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -*/ -^^^ -SELECT 'after multi-line' AS fifth +after multi-line -- !query /* -SELECT 'trailing' as x1 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* -^^^ -SELECT 'trailing' as x1 - - --- !query +SELECT 'trailing' as x1; -- inside block comment */ /* This block comment surrounds a query which itself has a block comment... 
-SELECT /* embedded single line */ 'embedded' AS x2 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -*/ -^^^ - -/* This block comment surrounds a query which itself has a block comment... -SELECT /* embedded single line */ 'embedded' AS x2 - - --- !query +SELECT /* embedded single line */ 'embedded' AS x2; */ SELECT -- continued after the following block comments... /* Deeply nested comment. This includes a single apostrophe to make sure we aren't decoding this part as a string. -SELECT 'deep nest' AS n1 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -extraneous input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -*/ -^^^ - -SELECT -- continued after the following block comments... -/* Deeply nested comment. - This includes a single apostrophe to make sure we aren't decoding this part as a string. -SELECT 'deep nest' AS n1 - - --- !query +SELECT 'deep nest' AS n1; /* Second level of nesting... -SELECT 'deeper nest' as n2 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* Second level of nesting... -^^^ -SELECT 'deeper nest' as n2 - - --- !query +SELECT 'deeper nest' as n2; /* Third level of nesting... -SELECT 'deepest nest' as n3 --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input '/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) - -== SQL == -/* Third level of nesting... -^^^ -SELECT 'deepest nest' as n3 - - --- !query +SELECT 'deepest nest' as n3; */ Hoo boy. Still two deep... 
*/ @@ -170,11 +73,27 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -mismatched input '*/' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 0) +mismatched input ''embedded'' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 6, pos 34) == SQL == +/* +SELECT 'trailing' as x1; -- inside block comment +*/ + +/* This block comment surrounds a query which itself has a block comment... +SELECT /* embedded single line */ 'embedded' AS x2; +----------------------------------^^^ +*/ + +SELECT -- continued after the following block comments... +/* Deeply nested comment. + This includes a single apostrophe to make sure we aren't decoding this part as a string. +SELECT 'deep nest' AS n1; +/* Second level of nesting... +SELECT 'deeper nest' as n2; +/* Third level of nesting... +SELECT 'deepest nest' as n3; */ -^^^ Hoo boy. Still two deep... */ Now just one deep... diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out index 436b33ce43980..ae1cb2f171704 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -42,7 +42,7 @@ struct<> -- !query -CREATE TABLE view_base_table (key int /* PRIMARY KEY */, data varchar(20)) +CREATE TABLE view_base_table (key int /* PRIMARY KEY */, data varchar(20)) USING PARQUET -- !query schema struct<> -- !query output @@ -56,7 +56,7 @@ CREATE VIEW key_dependent_view AS struct<> -- !query output org.apache.spark.sql.AnalysisException -expression 'default.view_base_table.`data`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; +expression 'spark_catalog.default.view_base_table.`data`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; -- !query @@ -266,7 +266,7 @@ CREATE VIEW v1_temp AS SELECT * FROM temp_table struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v1_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v1_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -371,7 +371,7 @@ CREATE VIEW v4_temp AS struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v4_temp` by referencing a temporary view temp_table. 
Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v4_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -383,7 +383,7 @@ CREATE VIEW v5_temp AS struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v5_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v5_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -542,7 +542,7 @@ CREATE VIEW v6_temp AS SELECT * FROM base_table WHERE id IN (SELECT id FROM temp struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v6_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v6_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -551,7 +551,7 @@ CREATE VIEW v7_temp AS SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM tem struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v7_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v7_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -560,7 +560,7 @@ CREATE VIEW v8_temp AS SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM temp struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v8_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v8_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -569,7 +569,7 @@ CREATE VIEW v9_temp AS SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `v9_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `temp_view_test`.`v9_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -678,7 +678,7 @@ CREATE VIEW temporal1 AS SELECT * FROM t1 CROSS JOIN tt struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `temporal1` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `testviewschm2`.`temporal1` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -719,7 +719,7 @@ CREATE VIEW temporal2 AS SELECT * FROM t1 INNER JOIN tt ON t1.num = tt.num2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `temporal2` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `testviewschm2`.`temporal2` by referencing a temporary view tt. 
Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -760,7 +760,7 @@ CREATE VIEW temporal3 AS SELECT * FROM t1 LEFT JOIN tt ON t1.num = tt.num2 struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `temporal3` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `testviewschm2`.`temporal3` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -801,7 +801,7 @@ CREATE VIEW temporal4 AS SELECT * FROM t1 LEFT JOIN tt ON t1.num = tt.num2 AND t struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `temporal4` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `testviewschm2`.`temporal4` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; -- !query @@ -810,7 +810,7 @@ CREATE VIEW temporal5 AS SELECT * FROM t1 WHERE num IN (SELECT num FROM t1 WHERE struct<> -- !query output org.apache.spark.sql.AnalysisException -Not allowed to create a permanent view `temporal5` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; +Not allowed to create a permanent view `testviewschm2`.`temporal5` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out index fd5dc42632176..151fa1e28d725 100755 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 98 +-- Number of queries: 60 -- !query @@ -571,310 +571,6 @@ struct 2 days --- !query -SELECT EXTRACT(EPOCH FROM DATE '1970-01-01') --- !query schema -struct --- !query output -0.000000 - - --- !query -SELECT EXTRACT(EPOCH FROM TIMESTAMP '1970-01-01') --- !query schema -struct --- !query output -0.000000 - - --- !query -SELECT EXTRACT(CENTURY FROM TO_DATE('0101-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --2 - - --- !query -SELECT EXTRACT(CENTURY FROM TO_DATE('0100-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --1 - - --- !query -SELECT EXTRACT(CENTURY FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --1 - - --- !query -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01') --- !query schema -struct --- !query output -1 - - --- !query -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD') --- !query schema -struct --- !query output -1 - - --- !query -SELECT EXTRACT(CENTURY FROM DATE '1900-12-31') --- !query schema -struct --- !query output -19 - - --- !query -SELECT EXTRACT(CENTURY FROM DATE '1901-01-01') --- !query schema -struct --- !query output -20 - - --- !query -SELECT EXTRACT(CENTURY FROM DATE '2000-12-31') --- !query schema -struct --- !query output -20 - - --- !query -SELECT EXTRACT(CENTURY FROM DATE '2001-01-01') --- !query schema -struct --- !query output -21 - - --- !query -SELECT EXTRACT(CENTURY FROM CURRENT_DATE)>=21 AS True --- !query schema -struct --- !query output -true - - --- !query -SELECT EXTRACT(MILLENNIUM FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query 
output --1 - - --- !query -SELECT EXTRACT(MILLENNIUM FROM DATE '0001-01-01 AD') --- !query schema -struct --- !query output -1 - - --- !query -SELECT EXTRACT(MILLENNIUM FROM DATE '1000-12-31') --- !query schema -struct --- !query output -1 - - --- !query -SELECT EXTRACT(MILLENNIUM FROM DATE '1001-01-01') --- !query schema -struct --- !query output -2 - - --- !query -SELECT EXTRACT(MILLENNIUM FROM DATE '2000-12-31') --- !query schema -struct --- !query output -2 - - --- !query -SELECT EXTRACT(MILLENNIUM FROM DATE '2001-01-01') --- !query schema -struct --- !query output -3 - - --- !query -SELECT EXTRACT(MILLENNIUM FROM CURRENT_DATE) --- !query schema -struct --- !query output -3 - - --- !query -SELECT EXTRACT(DECADE FROM DATE '1994-12-25') --- !query schema -struct --- !query output -199 - - --- !query -SELECT EXTRACT(DECADE FROM DATE '0010-01-01') --- !query schema -struct --- !query output -1 - - --- !query -SELECT EXTRACT(DECADE FROM DATE '0009-12-31') --- !query schema -struct --- !query output -0 - - --- !query -SELECT EXTRACT(DECADE FROM TO_DATE('0001-01-01 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output -0 - - --- !query -SELECT EXTRACT(DECADE FROM TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --1 - - --- !query -SELECT EXTRACT(DECADE FROM TO_DATE('0011-01-01 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --1 - - --- !query -SELECT EXTRACT(DECADE FROM TO_DATE('0012-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --2 - - --- !query -SELECT EXTRACT(CENTURY FROM NOW())>=21 AS True --- !query schema -struct --- !query output -true - - --- !query -SELECT EXTRACT(CENTURY FROM TIMESTAMP '1970-03-20 04:30:00.00000') --- !query schema -struct --- !query output -20 - - --- !query -SELECT DATE_TRUNC('MILLENNIUM', TIMESTAMP '1970-03-20 04:30:00.00000') --- !query schema -struct --- !query output -1001-01-01 00:07:02 - - --- !query -SELECT DATE_TRUNC('MILLENNIUM', DATE '1970-03-20') --- !query schema -struct --- !query output -1001-01-01 00:07:02 - - --- !query -SELECT DATE_TRUNC('CENTURY', TIMESTAMP '1970-03-20 04:30:00.00000') --- !query schema -struct --- !query output -1901-01-01 00:00:00 - - --- !query -SELECT DATE_TRUNC('CENTURY', DATE '1970-03-20') --- !query schema -struct --- !query output -1901-01-01 00:00:00 - - --- !query -SELECT DATE_TRUNC('CENTURY', DATE '2004-08-10') --- !query schema -struct --- !query output -2001-01-01 00:00:00 - - --- !query -SELECT DATE_TRUNC('CENTURY', DATE '0002-02-04') --- !query schema -struct --- !query output -0001-01-01 00:07:02 - - --- !query -SELECT DATE_TRUNC('CENTURY', TO_DATE('0055-08-10 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --0099-01-01 00:07:02 - - --- !query -SELECT DATE_TRUNC('DECADE', DATE '1993-12-25') --- !query schema -struct --- !query output -1990-01-01 00:00:00 - - --- !query -SELECT DATE_TRUNC('DECADE', DATE '0004-12-25') --- !query schema -struct --- !query output -0000-01-01 00:07:02 - - --- !query -SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) --- !query schema -struct --- !query output --0010-01-01 00:07:02 - - -- !query select make_date(2013, 7, 15) -- !query schema @@ -888,7 +584,7 @@ select make_date(-44, 3, 15) -- !query schema struct -- !query output --0044-03-15 +0045-03-15 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out index ba913789d5623..fe8375c5eab8f 
100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out @@ -322,9 +322,10 @@ struct -- !query SELECT int(float('2147483647')) -- !query schema -struct +struct<> -- !query output -2147483647 +java.lang.ArithmeticException +Casting 2.14748365E9 to int causes overflow -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out index 1046d0ec86bbd..63ad74aac32ec 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out @@ -64,7 +64,7 @@ struct<> -- !query select col1, col2, char_length(col3) from inserttest -- !query schema -struct +struct -- !query output 30 50 10000 NULL 3 7 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out index 4bd846d3ff923..62d47410aab65 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out @@ -105,7 +105,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 1 2:03(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03' day to hour @@ -119,7 +119,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 1 2:03:04(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03:04' day to hour @@ -141,7 +141,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 1 2:03:04(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03:04' day to minute @@ -155,7 +155,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03' day to second @@ -177,7 +177,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 1 2:03(line 1, pos 
16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03' hour to minute @@ -191,7 +191,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 1 2:03:04(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03:04' hour to minute @@ -205,7 +205,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03' hour to second @@ -219,7 +219,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03:04(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03:04' hour to second @@ -233,7 +233,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03' minute to second @@ -247,7 +247,7 @@ struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException -requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03:04(line 1, pos 16) +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) == SQL == SELECT interval '1 2:03:04' minute to second diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out index 5332dfff9f101..20f4f6b1f213f 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out @@ -536,7 +536,7 @@ SELECT '' AS `xxx`, i, k, t struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'i' is ambiguous, could be: default.j1_tbl.i, default.j2_tbl.i.; line 1 pos 20 +Reference 'i' is ambiguous, 
could be: spark_catalog.default.j1_tbl.i, spark_catalog.default.j2_tbl.i.; line 1 pos 20 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out index bdb605e406b8a..7b7aeb4ec7934 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out @@ -4404,7 +4404,7 @@ struct<> -- !query SELECT a, ceil(a), ceiling(a), floor(a), round(a) FROM ceil_floor_round -- !query schema -struct +struct -- !query output -0.000001000000000000 0 0 -1 0 -5.499999000000000000 -5 -5 -6 -5 @@ -4494,7 +4494,7 @@ struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) / CA -- !query select div(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000) -- !query schema -struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,0)> +struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):bigint> -- !query output 0 @@ -4510,7 +4510,7 @@ struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CA -- !query select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) -- !query schema -struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,0)> +struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):bigint> -- !query output -9 @@ -4526,7 +4526,7 @@ struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % -- !query select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000)*1000000000000000000000 + mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) -- !query schema -struct<(CAST((CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0)) * CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0)) + CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0))):decimal(38,0)> +struct<(CAST((CAST(CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(20,0)) AS DECIMAL(22,0)) * CAST(1000000000000000000000 AS DECIMAL(22,0))) AS DECIMAL(38,0)) + CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0))):decimal(38,0)> -- !query output -9999999999999999999999 @@ -4542,7 +4542,7 @@ struct<(CAST(70.0 AS DECIMAL(3,1)) % CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3, -- !query select div (70.0,70) -- !query schema -struct<(CAST(70.0 AS DECIMAL(3,1)) div CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(2,0)> +struct<(CAST(70.0 AS DECIMAL(3,1)) div CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):bigint> -- !query output 1 @@ -4654,7 +4654,7 @@ struct -- !query select ln(1.2345678e-28) -- !query schema -struct +struct -- !query output -64.26166165451762 @@ -4662,7 +4662,7 @@ struct -- !query select ln(0.0456789) -- !query schema -struct +struct -- !query output -3.0861187944847437 @@ -4670,7 +4670,7 @@ struct -- !query select 
ln(0.99949452) -- !query schema -struct +struct -- !query output -5.056077980832118E-4 @@ -4678,7 +4678,7 @@ struct -- !query select ln(1.00049687395) -- !query schema -struct +struct -- !query output 4.967505490136803E-4 @@ -4686,7 +4686,7 @@ struct -- !query select ln(1234.567890123456789) -- !query schema -struct +struct -- !query output 7.11847630129779 @@ -4694,7 +4694,7 @@ struct -- !query select ln(5.80397490724e5) -- !query schema -struct +struct -- !query output 13.271468476626518 @@ -4702,7 +4702,7 @@ struct -- !query select ln(9.342536355e34) -- !query schema -struct +struct -- !query output 80.52247093552418 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out index cbf4cfa58cdb9..d8d33d92a7cc4 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out @@ -143,7 +143,7 @@ SELECT a FROM test_having HAVING min(a) < max(a) struct<> -- !query output org.apache.spark.sql.AnalysisException -grouping expressions sequence is empty, and 'default.test_having.`a`' is not an aggregate function. Wrap '(min(default.test_having.`a`) AS `min(a#x)`, max(default.test_having.`a`) AS `max(a#x)`)' in windowing function(s) or wrap 'default.test_having.`a`' in first() (or first_value) if you don't care which value you get.; +grouping expressions sequence is empty, and 'spark_catalog.default.test_having.`a`' is not an aggregate function. Wrap '(min(spark_catalog.default.test_having.`a`) AS `min(a#x)`, max(spark_catalog.default.test_having.`a`) AS `max(a#x)`)' in windowing function(s) or wrap 'spark_catalog.default.test_having.`a`' in first() (or first_value) if you don't care which value you get.; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out index c30eea8ab689d..e8a3a9b9731a6 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 122 +-- Number of queries: 121 -- !query @@ -627,7 +627,7 @@ struct<> -- !query SELECT substr(f1, 99995) from toasttest -- !query schema -struct +struct -- !query output 567890 567890 @@ -638,7 +638,7 @@ struct -- !query SELECT substr(f1, 99995, 10) from toasttest -- !query schema -struct +struct -- !query output 567890 567890 @@ -910,14 +910,6 @@ struct hi --- !query -SELECT ltrim('zzzytrim', 'xyz') --- !query schema -struct --- !query output -trim - - -- !query SELECT translate('', '14', 'ax') -- !query schema @@ -985,7 +977,7 @@ struct -- !query SELECT trim(binary('\\000') from binary('\\000Tom\\000')) -- !query schema -struct +struct -- !query output Tom diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out index 75ea3f3c42932..5b0b636ff0c29 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 32 +-- Number of queries: 30 -- !query @@ -265,36 +265,6 @@ 
struct<54:string,timestamp:timestamp,year:int,month:int,day:int,hour:int,minute: 2001-09-22 18:19:20 2001 9 22 18 19 20.000000 --- !query -SELECT '' AS `54`, d1 as `timestamp`, - date_part( 'quarter', d1) AS quarter, date_part( 'msec', d1) AS msec, - date_part( 'usec', d1) AS usec - FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01' --- !query schema -struct<54:string,timestamp:timestamp,quarter:int,msec:decimal(8,3),usec:int> --- !query output - 1969-12-31 16:00:00 4 0.000 0 - 1997-01-02 00:00:00 1 0.000 0 - 1997-01-02 03:04:05 1 5000.000 5000000 - 1997-02-10 17:32:01 1 1000.000 1000000 - 2001-09-22 18:19:20 3 20000.000 20000000 - - --- !query -SELECT '' AS `54`, d1 as `timestamp`, - date_part( 'isoyear', d1) AS isoyear, date_part( 'week', d1) AS week, - date_part( 'dow', d1) AS dow - FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01' --- !query schema -struct<54:string,timestamp:timestamp,isoyear:int,week:int,dow:int> --- !query output - 1969-12-31 16:00:00 1970 1 3 - 1997-01-02 00:00:00 1997 1 4 - 1997-01-02 03:04:05 1997 1 4 - 1997-02-10 17:32:01 1997 7 1 - 2001-09-22 18:19:20 2001 38 6 - - -- !query SELECT make_timestamp(2014,12,28,6,30,45.887) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out index 2b1de87a6be5e..76567b689445a 100755 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out @@ -270,7 +270,7 @@ struct +struct -- !query output 0 0 0 0 0 0 @@ -287,7 +287,7 @@ struct +struct -- !query output 0 4 0 1 1 1 @@ -306,7 +306,7 @@ SELECT last(ten) OVER (PARTITION BY four), ten, four FROM (SELECT * FROM tenk1 WHERE unique2 < 10 ORDER BY four, ten)s ORDER BY four, ten -- !query schema -struct +struct -- !query output 4 0 0 4 0 0 @@ -476,7 +476,7 @@ sum(ten) over (partition by four order by ten), last(ten) over (partition by four order by ten) FROM (select distinct ten, four from tenk1) ss -- !query schema -struct +struct -- !query output 0 0 0 0 0 2 2 2 @@ -506,7 +506,7 @@ sum(ten) over (partition by four order by ten range between unbounded preceding last(ten) over (partition by four order by ten range between unbounded preceding and current row) FROM (select distinct ten, four from tenk1) ss -- !query schema -struct +struct -- !query output 0 0 0 0 0 2 2 2 @@ -536,7 +536,7 @@ sum(ten) over (partition by four order by ten range between unbounded preceding last(ten) over (partition by four order by ten range between unbounded preceding and unbounded following) FROM (select distinct ten, four from tenk1) ss -- !query schema -struct +struct -- !query output 0 0 20 8 0 2 20 8 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out index 0015740a0638e..ccddf9db172a6 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out @@ -15,6 +15,24 @@ struct<> +-- !query +INSERT INTO empsalary VALUES + ('develop', 10, 5200, date '2007-08-01'), + ('sales', 1, 5000, date '2006-10-01'), + ('personnel', 5, 3500, date '2007-12-10'), + ('sales', 4, 4800, date '2007-08-08'), + ('personnel', 2, 3900, date '2006-12-23'), + ('develop', 7, 4200, date '2008-01-01'), + ('develop', 9, 4500, date 
'2008-01-01'), + ('sales', 3, 4800, date '2007-08-01'), + ('develop', 8, 6000, date '2006-10-01'), + ('develop', 11, 5200, date '2007-08-15') +-- !query schema +struct<> +-- !query output + + + -- !query SELECT sum(unique1) over (order by four range between 2 preceding and 1 preceding), unique1, four @@ -72,17 +90,6 @@ struct --- !query output -org.apache.spark.sql.AnalysisException -Window Frame specifiedwindowframe(RangeFrame, -1000, 1000) must match the required frame specifiedwindowframe(RowFrame, -1, -1); - - -- !query select ss.id, ss.y, first(ss.y) over w, @@ -94,7 +101,7 @@ from window w as (order by ss.id asc nulls first range between 2 preceding and 2 following) -- !query schema -struct +struct -- !query output 1 1 1 3 2 2 1 4 @@ -116,7 +123,7 @@ from window w as (order by ss.id asc nulls last range between 2 preceding and 2 following) -- !query schema -struct +struct -- !query output 1 1 1 3 2 2 1 4 @@ -138,7 +145,7 @@ from window w as (order by ss.id desc nulls first range between 2 preceding and 2 following) -- !query schema -struct +struct -- !query output 1 1 3 1 2 2 4 1 @@ -160,7 +167,7 @@ from window w as (order by ss.id desc nulls last range between 2 preceding and 2 following) -- !query schema -struct +struct -- !query output 1 1 3 1 2 2 4 1 @@ -175,7 +182,7 @@ NULL 43 42 43 select x.id, last(x.id) over (order by x.id range between current row and 2147450884 following) from range(32764, 32767) x -- !query schema -struct +struct -- !query output 32764 32766 32765 32766 @@ -186,7 +193,7 @@ struct +struct -- !query output -32766 -32766 @@ -195,7 +202,7 @@ struct +struct -- !query output 2147483644 2147483646 2147483645 2147483646 @@ -206,7 +213,7 @@ struct +struct -- !query output -2147483646 -2147483646 @@ -265,7 +272,7 @@ from numerics window w as (order by f_float4 range between 1 preceding and 1 following) -- !query schema -struct +struct -- !query output 1 -3.0 1 1 2 -1.0 2 3 @@ -282,7 +289,7 @@ from numerics window w as (order by f_float4 range between 1 preceding and 1.1 following) -- !query schema -struct +struct -- !query output 1 -3.0 1 1 2 -1.0 2 3 @@ -299,7 +306,7 @@ from numerics window w as (order by f_float4 range between 'inf' preceding and 'inf' following) -- !query schema -struct +struct -- !query output 1 -3.0 1 7 2 -1.0 1 7 @@ -316,7 +323,7 @@ from numerics window w as (order by f_float4 range between 1.1 preceding and 'NaN' following) -- !query schema -struct +struct -- !query output 1 -3.0 1 7 2 -1.0 2 7 @@ -333,7 +340,7 @@ from numerics window w as (order by f_float8 range between 1 preceding and 1 following) -- !query schema -struct +struct -- !query output 1 -3.0 1 1 2 -1.0 2 3 @@ -350,7 +357,7 @@ from numerics window w as (order by f_float8 range between 1 preceding and 1.1 following) -- !query schema -struct +struct -- !query output 1 -3.0 1 1 2 -1.0 2 3 @@ -367,7 +374,7 @@ from numerics window w as (order by f_float8 range between 'inf' preceding and 'inf' following) -- !query schema -struct +struct -- !query output 1 -3.0 1 7 2 -1.0 1 7 @@ -384,7 +391,7 @@ from numerics window w as (order by f_float8 range between 1.1 preceding and 'NaN' following) -- !query schema -struct +struct -- !query output 1 -3.0 1 7 2 -1.0 2 7 @@ -401,7 +408,7 @@ from numerics window w as (order by f_numeric range between 1 preceding and 1 following) -- !query schema -struct +struct -- !query output 1 -3 1 1 2 -1 2 3 @@ -418,7 +425,7 @@ from numerics window w as (order by f_numeric range between 1 preceding and 1.1 following) -- !query schema -struct +struct -- !query 
output 1 -3 1 1 2 -1 2 3 @@ -435,7 +442,7 @@ from numerics window w as (order by f_numeric range between 1 preceding and 1.1 following) -- !query schema -struct +struct -- !query output 1 -3 1 1 2 -1 2 3 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out index 5a52358fe1c53..08eba6797b01d 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out @@ -244,7 +244,7 @@ from t1 where f1 = f2 struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(PARTITION BY default.t1.`f1` RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 24 +cannot resolve '(PARTITION BY spark_catalog.default.t1.`f1` RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 24 -- !query @@ -294,7 +294,7 @@ SELECT * FROM empsalary WHERE row_number() OVER (ORDER BY salary) < 10 struct<> -- !query output org.apache.spark.sql.AnalysisException -It is not allowed to use window functions inside WHERE and HAVING clauses; +It is not allowed to use window functions inside WHERE clause; -- !query @@ -306,7 +306,7 @@ org.apache.spark.sql.AnalysisException The query operator `Join` contains one or more unsupported expression types Aggregate, Window or Generate. -Invalid expressions: [row_number() OVER (ORDER BY default.empsalary.`salary` ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)]; +Invalid expressions: [row_number() OVER (ORDER BY spark_catalog.default.empsalary.`salary` ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)]; -- !query @@ -341,7 +341,7 @@ SELECT * FROM empsalary WHERE (rank() OVER (ORDER BY random())) > 10 struct<> -- !query output org.apache.spark.sql.AnalysisException -It is not allowed to use window functions inside WHERE and HAVING clauses; +It is not allowed to use window functions inside WHERE clause; -- !query @@ -350,7 +350,7 @@ SELECT * FROM empsalary WHERE rank() OVER (ORDER BY random()) struct<> -- !query output org.apache.spark.sql.AnalysisException -It is not allowed to use window functions inside WHERE and HAVING clauses; +It is not allowed to use window functions inside WHERE clause; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out new file mode 100644 index 0000000000000..64aa6053d8d70 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -0,0 +1,87 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 10 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 0) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 1) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 2) +-- !query schema +struct<> +-- !query output 
+java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 2 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', -1) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +The specified group index cannot be less than zero + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0) +-- !query schema +struct +-- !query output +1a + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2) +-- !query schema +struct +-- !query output +a + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', -1) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +The specified group index cannot be less than zero \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out index e8ee07171651d..88fef8908638e 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out @@ -15,7 +15,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -44,7 +44,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -75,7 +75,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -105,7 +105,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -135,7 +135,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `b` STRING, `c` INT, `a` INT) @@ -165,7 +165,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -197,7 +197,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -227,7 +227,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` INT, `b` STRING, `c` INT) @@ -257,7 +257,7 @@ SHOW CREATE TABLE tbl -- !query schema struct -- !query output -CREATE TABLE `tbl` ( +CREATE TABLE `default`.`tbl` ( `a` FLOAT, `b` DECIMAL(10,0), `c` DECIMAL(10,0), @@ -295,7 +295,18 @@ SHOW CREATE TABLE view_SPARK_30302 AS SERDE -- !query schema struct -- !query output -CREATE VIEW `view_SPARK_30302`( +CREATE VIEW `default`.`view_SPARK_30302`( + `aaa`, + `bbb`) +AS SELECT a, b FROM tbl + + +-- !query +SHOW CREATE TABLE view_SPARK_30302 +-- !query schema +struct +-- !query output +CREATE VIEW `default`.`view_SPARK_30302` ( `aaa`, `bbb`) AS SELECT a, b FROM tbl @@ -324,7 +335,19 @@ SHOW CREATE TABLE view_SPARK_30302 AS SERDE -- !query schema struct -- !query output -CREATE VIEW `view_SPARK_30302`( +CREATE VIEW `default`.`view_SPARK_30302`( + `aaa` COMMENT 'comment with \'quoted text\' for aaa', + `bbb`) +COMMENT 'This is a comment 
with \'quoted text\' for view' +AS SELECT a, b FROM tbl + + +-- !query +SHOW CREATE TABLE view_SPARK_30302 +-- !query schema +struct +-- !query output +CREATE VIEW `default`.`view_SPARK_30302` ( `aaa` COMMENT 'comment with \'quoted text\' for aaa', `bbb`) COMMENT 'This is a comment with \'quoted text\' for view' @@ -354,7 +377,7 @@ SHOW CREATE TABLE view_SPARK_30302 AS SERDE -- !query schema struct -- !query output -CREATE VIEW `view_SPARK_30302`( +CREATE VIEW `default`.`view_SPARK_30302`( `aaa`, `bbb`) TBLPROPERTIES ( @@ -363,30 +386,18 @@ TBLPROPERTIES ( AS SELECT a, b FROM tbl --- !query -DROP VIEW view_SPARK_30302 --- !query schema -struct<> --- !query output - - - --- !query -CREATE VIEW view_SPARK_30302 (aaa, bbb) -AS SELECT a, b FROM tbl --- !query schema -struct<> --- !query output - - - -- !query SHOW CREATE TABLE view_SPARK_30302 -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.AnalysisException -Hive view isn't supported by SHOW CREATE TABLE; +CREATE VIEW `default`.`view_SPARK_30302` ( + `aaa`, + `bbb`) +TBLPROPERTIES ( + 'a' = '1', + 'b' = '2') +AS SELECT a, b FROM tbl -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index 501e185b07f7a..60c5e6d5642b7 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 27 -- !query @@ -63,15 +63,9 @@ SHOW TABLES -- !query schema struct -- !query output -aggtest -arraydata -mapdata -onek show_t1 show_t2 show_t3 -tenk1 -testdata -- !query @@ -79,15 +73,9 @@ SHOW TABLES IN showdb -- !query schema struct -- !query output -aggtest -arraydata -mapdata -onek show_t1 show_t2 show_t3 -tenk1 -testdata -- !query @@ -119,6 +107,16 @@ show_t2 show_t3 +-- !query +SHOW TABLES IN showdb LIKE 'show_t*' +-- !query schema +struct +-- !query output +show_t1 +show_t2 +show_t3 + + -- !query SHOW TABLE EXTENDED LIKE 'show_t*' -- !query schema @@ -226,7 +224,7 @@ SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(a='Us', d=1) struct<> -- !query output org.apache.spark.sql.AnalysisException -Partition spec is invalid. 
The spec (a, d) must match the partition spec (c, d) defined in table '`showdb`.`show_t1`'; +a is not a valid partition column in table `showdb`.`show_t1`.; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out new file mode 100644 index 0000000000000..6984b34c365ec --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out @@ -0,0 +1,114 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 13 + + +-- !query +CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet +TBLPROPERTIES('p1'='v1', 'p2'='v2') +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW TBLPROPERTIES tbl +-- !query schema +struct +-- !query output +p1 v1 +p2 v2 + + +-- !query +SHOW TBLPROPERTIES tbl("p1") +-- !query schema +struct +-- !query output +v1 + + +-- !query +SHOW TBLPROPERTIES tbl("p3") +-- !query schema +struct +-- !query output +Table default.tbl does not have property: p3 + + +-- !query +DROP TABLE tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view TBLPROPERTIES('p1'='v1', 'p2'='v2') AS SELECT 1 AS c1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW TBLPROPERTIES view +-- !query schema +struct +-- !query output +p1 v1 +p2 v2 +view.catalogAndNamespace.numParts 2 +view.catalogAndNamespace.part.0 spark_catalog +view.catalogAndNamespace.part.1 default +view.query.out.col.0 c1 +view.query.out.numCols 1 + + +-- !query +SHOW TBLPROPERTIES view("p1") +-- !query schema +struct +-- !query output +v1 + + +-- !query +SHOW TBLPROPERTIES view("p3") +-- !query schema +struct +-- !query output +Table default.view does not have property: p3 + + +-- !query +DROP VIEW view +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW tv TBLPROPERTIES('p1'='v1') AS SELECT 1 AS c1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW TBLPROPERTIES tv +-- !query schema +struct +-- !query output + + + +-- !query +DROP VIEW tv +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/show-views.sql.out b/sql/core/src/test/resources/sql-tests/results/show-views.sql.out new file mode 100644 index 0000000000000..d88790d8b5ec8 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/show-views.sql.out @@ -0,0 +1,177 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 20 + + +-- !query +CREATE DATABASE showdb +-- !query schema +struct<> +-- !query output + + + +-- !query +USE showdb +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tbl(a STRING, b INT, c STRING, d STRING) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view_1 AS SELECT * FROM tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view_2 AS SELECT * FROM tbl WHERE c='a' +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE GLOBAL TEMP VIEW view_3 AS SELECT 1 as col1 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW view_4(e INT) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW VIEWS +-- !query schema +struct +-- !query output +view_1 +view_2 +view_4 + + +-- !query +SHOW VIEWS FROM showdb +-- !query schema +struct +-- !query output +view_1 +view_2 +view_4 + + +-- !query +SHOW VIEWS IN showdb +-- !query schema +struct +-- !query output 
+view_1 +view_2 +view_4 + + +-- !query +SHOW VIEWS IN global_temp +-- !query schema +struct +-- !query output +view_3 +view_4 + + +-- !query +SHOW VIEWS 'view_*' +-- !query schema +struct +-- !query output +view_1 +view_2 +view_4 + + +-- !query +SHOW VIEWS LIKE 'view_1*|view_2*' +-- !query schema +struct +-- !query output +view_1 +view_2 + + +-- !query +SHOW VIEWS IN showdb 'view_*' +-- !query schema +struct +-- !query output +view_1 +view_2 +view_4 + + +-- !query +SHOW VIEWS IN showdb LIKE 'view_*' +-- !query schema +struct +-- !query output +view_1 +view_2 +view_4 + + +-- !query +SHOW VIEWS IN wrongdb LIKE 'view_*' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException +Database 'wrongdb' not found; + + +-- !query +DROP VIEW global_temp.view_3 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW view_4 +-- !query schema +struct<> +-- !query output + + + +-- !query +USE default +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP DATABASE showdb CASCADE +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 33d1b25aee483..c9aad35bec98a 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 33 +-- Number of queries: 36 -- !query @@ -55,7 +55,7 @@ struct -- !query select position('bar' in 'foobarbar'), position(null, 'foobarbar'), position('aaads', null) -- !query schema -struct +struct -- !query output 4 NULL NULL @@ -111,7 +111,7 @@ struct> -- !query SELECT substr('Spark SQL', 5) -- !query schema -struct +struct -- !query output k SQL @@ -119,7 +119,7 @@ k SQL -- !query SELECT substr('Spark SQL', -3) -- !query schema -struct +struct -- !query output SQL @@ -127,7 +127,7 @@ SQL -- !query SELECT substr('Spark SQL', 5, 1) -- !query schema -struct +struct -- !query output k @@ -205,64 +205,88 @@ k -- !query -SELECT trim('yxTomxx', 'xyz'), trim(BOTH 'xyz' FROM 'yxTomxx'), trim('xyz' FROM 'yxTomxx') +SELECT trim(" xyz "), ltrim(" xyz "), rtrim(" xyz ") +-- !query schema +struct +-- !query output +xyz xyz xyz + + +-- !query +SELECT trim(BOTH 'xyz' FROM 'yxTomxx'), trim('xyz' FROM 'yxTomxx') +-- !query schema +struct +-- !query output +Tom Tom + + +-- !query +SELECT trim(BOTH 'x' FROM 'xxxbarxxx'), trim('x' FROM 'xxxbarxxx') +-- !query schema +struct +-- !query output +bar bar + + +-- !query +SELECT trim(LEADING 'xyz' FROM 'zzzytest') -- !query schema -struct +struct -- !query output -Tom Tom Tom +test -- !query -SELECT trim('xxxbarxxx', 'x'), trim(BOTH 'x' FROM 'xxxbarxxx'), trim('x' FROM 'xxxbarxxx') +SELECT trim(LEADING 'xyz' FROM 'zzzytestxyz') -- !query schema -struct +struct -- !query output -bar bar bar +testxyz -- !query -SELECT ltrim('zzzytest', 'xyz'), trim(LEADING 'xyz' FROM 'zzzytest') +SELECT trim(LEADING 'xy' FROM 'xyxXxyLAST WORD') -- !query schema -struct +struct -- !query output -test test +XxyLAST WORD -- !query -SELECT ltrim('zzzytestxyz', 'xyz'), trim(LEADING 'xyz' FROM 'zzzytestxyz') +SELECT trim(TRAILING 'xyz' FROM 'testxxzx') -- !query schema -struct +struct -- !query output -testxyz testxyz +test -- !query -SELECT ltrim('xyxXxyLAST WORD', 'xy'), trim(LEADING 'xy' FROM 'xyxXxyLAST WORD') +SELECT trim(TRAILING 'xyz' FROM 'xyztestxxzx') -- 
!query schema -struct +struct -- !query output -XxyLAST WORD XxyLAST WORD +xyztest -- !query -SELECT rtrim('testxxzx', 'xyz'), trim(TRAILING 'xyz' FROM 'testxxzx') +SELECT trim(TRAILING 'xy' FROM 'TURNERyxXxy') -- !query schema -struct +struct -- !query output -test test +TURNERyxX -- !query -SELECT rtrim('xyztestxxzx', 'xyz'), trim(TRAILING 'xyz' FROM 'xyztestxxzx') +SELECT lpad('hi', 'invalid_length') -- !query schema -struct +struct -- !query output -xyztest xyztest +NULL -- !query -SELECT rtrim('TURNERyxXxy', 'xy'), trim(TRAILING 'xy' FROM 'TURNERyxXxy') +SELECT rpad('hi', 'invalid_length') -- !query schema -struct +struct -- !query output -TURNERyxX TURNERyxX +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/struct.sql.out b/sql/core/src/test/resources/sql-tests/results/struct.sql.out index f294c5213d319..3b610edc47169 100644 --- a/sql/core/src/test/resources/sql-tests/results/struct.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/struct.sql.out @@ -83,7 +83,7 @@ struct -- !query SELECT ID, STRUCT(ST.C as STC, ST.D as STD).STD FROM tbl_x -- !query schema -struct +struct -- !query output 1 delta 2 eta diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out index 1c335445114c7..e24538b9138ba 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out @@ -103,7 +103,7 @@ SELECT Count(DISTINCT( t1a )), FROM t1 WHERE t1d IN (SELECT t2d FROM t2 - ORDER BY t2c + ORDER BY t2c, t2d LIMIT 2) GROUP BY t1b ORDER BY t1b DESC NULLS FIRST @@ -136,7 +136,7 @@ SELECT Count(DISTINCT( t1a )), FROM t1 WHERE t1d NOT IN (SELECT t2d FROM t2 - ORDER BY t2b DESC nulls first + ORDER BY t2b DESC nulls first, t2d LIMIT 1) GROUP BY t1b ORDER BY t1b NULLS last diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out index 1599634ff9efb..d703d4e9112e9 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out @@ -109,9 +109,9 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses: -Aggregate [min(outer(t2a#x)) AS min(outer())#x] -+- SubqueryAlias `t3` +Aggregate [min(outer(t2a#x)) AS min(outer(t2.`t2a`))#x] ++- SubqueryAlias t3 +- Project [t3a#x, t3b#x, t3c#x] - +- SubqueryAlias `t3` + +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x] ; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out index d5c27ade8e152..7b1fcddfdac7d 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out @@ -118,7 +118,7 @@ struct +struct -- !query output 2017-12-13 @@ -231,7 +231,7 @@ struct +struct -- !query output 2017-12-13 @@ -344,6 +344,6 @@ struct +struct -- !query output 2017-12-09 diff --git 
a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out index f841adf89612e..e47decbd33920 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out @@ -285,7 +285,7 @@ struct -- !query SELECT day( '1996-01-10') FROM t -- !query schema -struct +struct -- !query output 10 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out index ed7ab5a342c12..d046ff249379f 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out @@ -85,7 +85,7 @@ FROM various_maps struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'map_zip_with(various_maps.`decimal_map1`, various_maps.`decimal_map2`, lambdafunction(named_struct(NamePlaceholder(), k, NamePlaceholder(), v1, NamePlaceholder(), v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,0), decimal(36,35)].; line 1 pos 7 +cannot resolve 'map_zip_with(various_maps.`decimal_map1`, various_maps.`decimal_map2`, lambdafunction(struct(k, v1, v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,0), decimal(36,35)].; line 1 pos 7 -- !query @@ -113,7 +113,7 @@ FROM various_maps struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'map_zip_with(various_maps.`decimal_map2`, various_maps.`int_map`, lambdafunction(named_struct(NamePlaceholder(), k, NamePlaceholder(), v1, NamePlaceholder(), v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,35), int].; line 1 pos 7 +cannot resolve 'map_zip_with(various_maps.`decimal_map2`, various_maps.`int_map`, lambdafunction(struct(k, v1, v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,35), int].; line 1 pos 7 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out index 31353bdedc69f..b8c190beeae19 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out @@ -107,7 +107,7 @@ SELECT '1' + cast('2017-12-11 09:30:00' as date) FROM t struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('1' AS DOUBLE)' 
is of double type.; line 1 pos 7 -- !query @@ -698,7 +698,7 @@ SELECT cast('2017-12-11 09:30:00' as date) + '1' FROM t struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('1' AS DOUBLE)' is of double type.; line 1 pos 7 -- !query @@ -790,7 +790,7 @@ SELECT cast('2017-12-11 09:30:00' as date) - '1' FROM t struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 +cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST('1' AS DOUBLE)' is of double type.; line 1 pos 7 -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out index 7b419c6702586..b4c052585ef7c 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -136,15 +136,16 @@ NULL -- !query select to_timestamp('2018-01-01', a) from t -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html -- !query select to_unix_timestamp(a) from t -- !query schema -struct +struct -- !query output NULL @@ -152,15 +153,16 @@ NULL -- !query select to_unix_timestamp('2018-01-01', a) from t -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html -- !query select unix_timestamp(a) from t -- !query schema -struct +struct -- !query output NULL @@ -168,15 +170,16 @@ NULL -- !query select unix_timestamp('2018-01-01', a) from t -- !query schema -struct +struct<> -- !query output -NULL +org.apache.spark.SparkUpgradeException +You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'aa' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html -- !query select from_unixtime(a) from t -- !query schema -struct +struct -- !query output NULL diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out index d65c56774eafd..76637bf578e6f 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out @@ -77,7 +77,7 @@ struct -- !query SELECT stddev_pop(udf(b)) FROM aggtest -- !query schema -struct +struct -- !query output 131.10703231895047 @@ -85,7 +85,7 @@ struct -- !query SELECT udf(stddev_samp(b)) FROM aggtest -- !query schema -struct +struct -- !query output 151.38936080399804 @@ -93,7 +93,7 @@ struct +struct -- !query output 17189.053923482323 @@ -101,7 +101,7 @@ struct -- !query SELECT udf(var_samp(b)) FROM aggtest -- !query schema -struct +struct -- !query output 22918.738564643096 @@ -109,7 +109,7 @@ struct -- !query SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest -- !query schema -struct +struct -- !query output 131.18117242958306 @@ -117,7 +117,7 @@ struct +struct -- !query output 151.47497042966097 @@ -125,7 +125,7 @@ struct +struct -- !query output 17208.5 @@ -133,7 +133,7 @@ struct +struct -- !query output 22944.666666666668 @@ -141,7 +141,7 @@ struct +struct -- !query output 0.0 NaN @@ -149,7 +149,7 @@ struct +struct -- !query output 0.0 NaN @@ -157,7 +157,7 @@ struct +struct -- !query output NULL @@ -165,7 +165,7 @@ NULL -- !query select sum(udf(CAST(null AS long))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -173,7 +173,7 @@ NULL -- !query select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -181,7 +181,7 @@ NULL -- !query select sum(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -189,7 +189,7 @@ NULL -- !query select avg(udf(CAST(null AS int))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -197,7 +197,7 @@ NULL -- !query select avg(udf(CAST(null AS long))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -205,7 +205,7 @@ NULL -- !query select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -213,7 +213,7 @@ NULL -- !query select avg(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query schema -struct +struct -- !query output NULL @@ -221,7 +221,7 @@ NULL -- !query select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query schema -struct +struct -- !query output NaN @@ -229,7 +229,7 @@ NaN -- !query select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query schema -struct +struct -- !query output NaN @@ -238,7 +238,7 @@ NaN SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('1')) v(x) -- !query schema -struct +struct -- !query output Infinity NaN @@ -247,7 +247,7 @@ Infinity NaN SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('Infinity')) v(x) -- !query schema -struct +struct -- !query output Infinity NaN @@ -256,7 +256,7 @@ Infinity NaN SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('-Infinity'), ('Infinity')) v(x) 
-- !query schema -struct +struct -- !query output NaN NaN @@ -265,7 +265,7 @@ NaN NaN SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x) -- !query schema -struct +struct -- !query output 1.00000005E8 2.5 @@ -274,7 +274,7 @@ struct +struct -- !query output 7.000000000006E12 1.0 @@ -282,7 +282,7 @@ struct +struct -- !query output 653.6289553875104 871.5052738500139 @@ -290,7 +290,7 @@ struct +struct -- !query output 0.1396345165178734 @@ -315,7 +315,7 @@ struct select ten, udf(count(*)), sum(udf(four)) from onek group by ten order by ten -- !query schema -struct +struct -- !query output 0 100 100 1 100 200 @@ -333,7 +333,7 @@ struct +struct -- !query output 0 100 2 1 100 4 @@ -352,7 +352,7 @@ select ten, udf(sum(distinct four)) from onek a group by ten having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four) -- !query schema -struct +struct -- !query output 0 2 2 2 @@ -372,8 +372,8 @@ struct<> org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. -Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(CAST(udf(cast(four as string)) AS INT) AS BIGINT))] -Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; +Expression in where clause: [(sum(DISTINCT CAST((outer(a.`four`) + b.`four`) AS BIGINT)) = CAST(CAST(udf(ansi_cast(four as string)) AS INT) AS BIGINT))] +Invalid expressions: [sum(DISTINCT CAST((outer(a.`four`) + b.`four`) AS BIGINT))]; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out index c10fe9b51dd72..d4941d0a0b768 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out @@ -161,7 +161,7 @@ true true false NULL false true -- !query select min(udf(unique1)) from tenk1 -- !query schema -struct +struct -- !query output 0 @@ -169,7 +169,7 @@ struct -- !query select udf(max(unique1)) from tenk1 -- !query schema -struct +struct -- !query output 9999 @@ -217,7 +217,7 @@ struct -- !query select distinct max(udf(unique2)) from tenk1 -- !query schema -struct +struct -- !query output 9999 @@ -241,7 +241,7 @@ struct -- !query select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1 -- !query schema -struct +struct -- !query output 9999 @@ -249,7 +249,7 @@ struct +struct -- !query output 9999 3 9999 2 @@ -259,6 +259,6 @@ struct -- !query select udf(max(100)) from tenk1 -- !query schema -struct +struct -- !query output 100 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out index 04c4f54b02a3e..6c733e916d734 100755 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out @@ -176,7 +176,7 @@ struct -- !query SELECT CASE WHEN udf(1=0) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END -- !query schema -struct +struct -- !query output 1.0 @@ -184,7 +184,7 @@ struct +struct -- !query output 1.0 @@ -192,7 +192,7 @@ struct 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl -- !query schema -struct 100) THEN CAST(udf(cast((cast(1 as double) / cast(0 as double)) as 
string)) AS DOUBLE) ELSE CAST(CAST(udf(cast(0 as string)) AS INT) AS DOUBLE) END:double> +struct 100) THEN CAST(udf(ansi_cast((ansi_cast(1 as double) / ansi_cast(0 as double)) as string)) AS DOUBLE) ELSE CAST(CAST(udf(ansi_cast(0 as string)) AS INT) AS DOUBLE) END:double> -- !query output 0.0 0.0 @@ -203,7 +203,7 @@ struct 100) THEN CAST(udf(cast((cast(1 as double) / cast(0 as dou -- !query SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END -- !query schema -struct +struct -- !query output 1 @@ -294,7 +294,7 @@ struct SELECT udf(COALESCE(a.f, b.i, b.j)) FROM CASE_TBL a, CASE2_TBL b -- !query schema -struct +struct -- !query output -30.3 -30.3 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out index f113aee6d3b51..188b57ffd58d5 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out @@ -243,7 +243,7 @@ struct<> SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t) FROM J1_TBL AS tx -- !query schema -struct +struct -- !query output 0 NULL zero 1 4 one @@ -262,7 +262,7 @@ struct +struct -- !query output 0 NULL zero 1 4 one @@ -281,7 +281,7 @@ struct +struct -- !query output 0 NULL zero 1 4 one @@ -300,7 +300,7 @@ struct +struct -- !query output 0 NULL zero 1 4 one @@ -319,7 +319,7 @@ struct +struct -- !query output 0 NULL zero 0 NULL 0 NULL zero 1 -1 @@ -536,14 +536,14 @@ SELECT udf('') AS `xxx`, udf(i) AS i, udf(k), udf(t) AS t struct<> -- !query output org.apache.spark.sql.AnalysisException -Reference 'i' is ambiguous, could be: default.j1_tbl.i, default.j2_tbl.i.; line 1 pos 29 +Reference 'i' is ambiguous, could be: spark_catalog.default.j1_tbl.i, spark_catalog.default.j2_tbl.i.; line 1 pos 29 -- !query SELECT udf('') AS `xxx`, udf(t1.i) AS i, udf(k), udf(t) FROM J1_TBL t1 CROSS JOIN J2_TBL t2 -- !query schema -struct +struct -- !query output 0 -1 zero 0 -3 zero @@ -651,7 +651,7 @@ SELECT udf(udf('')) AS `xxx`, udf(udf(ii)) AS ii, udf(udf(tt)) AS tt, udf(udf(kk FROM (J1_TBL CROSS JOIN J2_TBL) AS tx (ii, jj, tt, ii2, kk) -- !query schema -struct +struct -- !query output 0 zero -1 0 zero -3 @@ -758,7 +758,7 @@ struct +struct -- !query output 0 NULL zero 0 NULL 0 NULL 0 NULL zero 0 NULL 1 -1 @@ -1657,7 +1657,7 @@ struct +struct -- !query output 0 NULL zero NULL 1 4 one -1 @@ -1672,7 +1672,7 @@ struct +struct -- !query output 0 NULL zero NULL 1 4 one -1 @@ -1703,7 +1703,7 @@ struct SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) FROM J1_TBL NATURAL JOIN J2_TBL -- !query schema -struct +struct -- !query output 0 NULL zero NULL 1 4 one -1 @@ -1718,7 +1718,7 @@ struct +struct -- !query output 0 NULL zero NULL 1 4 one -1 @@ -1733,7 +1733,7 @@ struct +struct -- !query output 0 NULL zero NULL 2 3 two 2 @@ -1744,7 +1744,7 @@ struct +struct -- !query output 0 NULL zero 0 NULL 1 4 one 1 -1 @@ -1759,7 +1759,7 @@ struct +struct -- !query output 0 NULL zero NULL 0 2 3 two 2 2 @@ -1770,7 +1770,7 @@ struct +struct -- !query output 0 NULL zero 2 2 0 NULL zero 2 4 @@ -1788,7 +1788,7 @@ SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) FROM J1_TBL LEFT OUTER JOIN J2_TBL USING (i) ORDER BY udf(udf(i)), udf(k), udf(t) -- !query schema -struct +struct -- !query output NULL NULL null NULL NULL 0 zero NULL @@ -1810,7 +1810,7 @@ SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) FROM J1_TBL LEFT JOIN J2_TBL USING (i) ORDER BY udf(i), udf(udf(k)), udf(t) -- 
!query schema -struct +struct -- !query output NULL NULL null NULL NULL 0 zero NULL @@ -1831,7 +1831,7 @@ struct +struct -- !query output 0 NULL zero NULL 1 4 one -1 @@ -1848,7 +1848,7 @@ struct +struct -- !query output 0 NULL zero NULL 1 4 one -1 @@ -1866,7 +1866,7 @@ SELECT udf('') AS `xxx`, udf(i), udf(j), udf(udf(t)), udf(k) FROM J1_TBL FULL OUTER JOIN J2_TBL USING (i) ORDER BY udf(udf(i)), udf(k), udf(t) -- !query schema -struct +struct -- !query output NULL NULL NULL NULL NULL NULL null NULL @@ -1890,7 +1890,7 @@ SELECT udf('') AS `xxx`, udf(i), udf(j), t, udf(udf(k)) FROM J1_TBL FULL JOIN J2_TBL USING (i) ORDER BY udf(udf(i)), udf(k), udf(udf(t)) -- !query schema -struct +struct -- !query output NULL NULL NULL NULL NULL NULL null NULL @@ -1913,7 +1913,7 @@ struct +struct -- !query output @@ -1922,7 +1922,7 @@ struct +struct -- !query output 1 4 one -1 @@ -2052,7 +2052,7 @@ FULL JOIN (SELECT * FROM t3) s3 USING (name) -- !query schema -struct +struct -- !query output bb 12 13 cc 22 23 @@ -2138,7 +2138,7 @@ NATURAL FULL JOIN (SELECT name, udf(udf(n)) as s3_n FROM t3) as s3 ) ss2 -- !query schema -struct +struct -- !query output bb 11 12 13 cc NULL 22 23 @@ -2171,7 +2171,7 @@ FULL JOIN (SELECT name, 2 as s2_n FROM t2) as s2 ON (udf(udf(s1_n)) = udf(s2_n)) -- !query schema -struct +struct -- !query output NULL NULL bb 2 NULL NULL cc 2 @@ -2202,7 +2202,7 @@ struct<> -- !query select udf(udf(x1)), udf(x2) from x -- !query schema -struct +struct -- !query output 1 11 2 22 @@ -2214,7 +2214,7 @@ struct +struct -- !query output 1 111 2 222 @@ -2336,7 +2336,7 @@ select udf(udf(count(*))) from tenk1 a where udf(udf(unique1)) in (select udf(unique1) from tenk1 b join tenk1 c using (unique1) where udf(udf(b.unique2)) = udf(42)) -- !query schema -struct +struct -- !query output 1 @@ -2347,7 +2347,7 @@ select udf(count(*)) from tenk1 x where udf(x.unique1) = 0 and udf(x.unique1) in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=udf(udf(bb.f1))) -- !query schema -struct +struct -- !query output 1 @@ -2358,7 +2358,7 @@ select udf(udf(count(*))) from tenk1 x where udf(x.unique1) = 0 and udf(udf(x.unique1)) in (select udf(aa.f1) from int4_tbl aa,float8_tbl bb where udf(aa.f1)=udf(udf(bb.f1))) -- !query schema -struct +struct -- !query output 1 @@ -2388,7 +2388,7 @@ from tenk1 t5 where udf(t4.thousand) = udf(t5.unique1) and udf(udf(ss.x1)) = t4.tenthous and udf(ss.x2) = udf(udf(t5.stringu1)) -- !query schema -struct +struct -- !query output 1000 @@ -2400,7 +2400,7 @@ select udf(a.f1), udf(b.f1), udf(t.thousand), udf(t.tenthous) from (select udf(sum(udf(f1))) as f1 from int4_tbl i4b) b where b.f1 = udf(t.thousand) and udf(a.f1) = udf(b.f1) and udf((udf(a.f1)+udf(b.f1)+999)) = udf(udf(t.tenthous)) -- !query schema -struct +struct -- !query output @@ -2441,7 +2441,7 @@ select udf(count(*)) from (select * from tenk1 y order by udf(y.unique2)) y on udf(x.thousand) = y.unique2 and x.twothousand = udf(y.hundred) and x.fivethous = y.unique2 -- !query schema -struct +struct -- !query output 10000 @@ -2530,7 +2530,7 @@ struct select udf(count(*)) from tenk1 a, tenk1 b where udf(a.hundred) = b.thousand and udf(udf((b.fivethous % 10)) < 10) -- !query schema -struct +struct -- !query output 100000 @@ -2727,7 +2727,7 @@ from tenk1 a left join tenk1 b on a.unique2 = udf(b.tenthous) where udf(a.unique1) = 42 and ((udf(b.unique2) is null and udf(a.ten) = 2) or udf(udf(b.hundred)) = udf(udf(3))) -- !query schema -struct +struct -- !query output @@ -2761,7 +2761,7 @@ select udf(t1.q2), udf(count(t2.*)) from 
int8_tbl t1 left join int8_tbl t2 on (udf(udf(t1.q2)) = t2.q1) group by udf(t1.q2) order by 1 -- !query schema -struct +struct -- !query output -4567890123456789 0 123 2 @@ -2774,7 +2774,7 @@ select udf(udf(t1.q2)), udf(count(t2.*)) from int8_tbl t1 left join (select * from int8_tbl) t2 on (udf(udf(t1.q2)) = udf(t2.q1)) group by udf(udf(t1.q2)) order by 1 -- !query schema -struct +struct -- !query output -4567890123456789 0 123 2 @@ -2789,7 +2789,7 @@ from int8_tbl t1 left join on (udf(t1.q2) = udf(t2.q1)) group by t1.q2 order by 1 -- !query schema -struct +struct -- !query output -4567890123456789 0 123 2 @@ -2838,7 +2838,7 @@ from c left join on (udf(udf(c.a)) = udf(ss.code)) order by c.name -- !query schema -struct +struct -- !query output A p 2 -1 B q 0 -1 @@ -2884,7 +2884,7 @@ LEFT JOIN ) sub2 ON sub1.key1 = udf(udf(sub2.key3)) -- !query schema -struct +struct -- !query output 1 1 1 1 @@ -2898,7 +2898,7 @@ SELECT udf(qq), udf(udf(unique1)) USING (qq) INNER JOIN tenk1 c ON udf(qq) = udf(unique2) -- !query schema -struct +struct -- !query output 123 4596 123 4596 @@ -2948,7 +2948,7 @@ from nt3 as nt3 on udf(ss2.id) = nt3.nt2_id where udf(nt3.id) = 1 and udf(ss2.b3) -- !query schema -struct +struct -- !query output 1 @@ -3008,7 +3008,7 @@ select udf(count(*)) from left join tenk1 c on udf(a.unique2) = udf(b.unique1) and udf(c.thousand) = udf(udf(a.thousand)) join int4_tbl on udf(b.thousand) = f1 -- !query schema -struct +struct -- !query output 10 @@ -3021,7 +3021,7 @@ select udf(b.unique1) from right join int4_tbl i2 on udf(udf(i2.f1)) = udf(b.tenthous) order by udf(1) -- !query schema -struct +struct -- !query output NULL NULL @@ -3039,7 +3039,7 @@ select * from where udf(fault) = udf(122) order by udf(fault) -- !query schema -struct +struct -- !query output NULL 123 122 @@ -3049,7 +3049,7 @@ select udf(q1), udf(unique2), udf(thousand), udf(hundred) from int8_tbl a left join tenk1 b on udf(q1) = udf(unique2) where udf(coalesce(thousand,123)) = udf(q1) and udf(q1) = udf(udf(coalesce(hundred,123))) -- !query schema -struct +struct -- !query output @@ -3059,7 +3059,7 @@ select udf(f1), udf(unique2), case when udf(udf(unique2)) is null then udf(f1) e from int4_tbl a left join tenk1 b on udf(f1) = udf(udf(unique2)) where (case when udf(unique2) is null then udf(f1) else 0 end) = 0 -- !query schema -struct +struct -- !query output 0 0 0 @@ -3069,7 +3069,7 @@ select udf(a.unique1), udf(b.unique1), udf(c.unique1), udf(coalesce(b.twothousan from tenk1 a left join tenk1 b on udf(b.thousand) = a.unique1 left join tenk1 c on udf(c.unique2) = udf(coalesce(b.twothousand, a.twothousand)) where a.unique2 < udf(10) and udf(udf(coalesce(b.twothousand, a.twothousand))) = udf(44) -- !query schema -struct +struct -- !query output @@ -3107,7 +3107,7 @@ select udf(a.q2), udf(b.q1) from int8_tbl a left join int8_tbl b on udf(a.q2) = coalesce(b.q1, 1) where udf(udf(coalesce(b.q1, 1)) > 0) -- !query schema -struct +struct -- !query output -4567890123456789 NULL 123 123 @@ -3237,7 +3237,7 @@ SELECT * FROM FROM int8_tbl LEFT JOIN innertab ON udf(udf(q2)) = id) ss2 ON true -- !query schema -struct +struct -- !query output 1 123 456 123 1 123 4567890123456789 123 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out index 68113afdfae30..50b6e60086747 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out @@ -94,7 +94,7 @@ struct<> SELECT udf(b), udf(c) FROM test_having GROUP BY b, c HAVING udf(count(*)) = 1 ORDER BY udf(b), udf(c) -- !query schema -struct +struct -- !query output 1 XXXX 3 bbbb @@ -104,7 +104,7 @@ struct +struct -- !query output 3 BBBB 3 bbbb @@ -115,7 +115,7 @@ SELECT udf(c), max(udf(a)) FROM test_having GROUP BY c HAVING udf(count(*)) > 2 OR udf(min(a)) = udf(max(a)) ORDER BY c -- !query schema -struct +struct -- !query output XXXX 0 bbbb 5 @@ -124,7 +124,7 @@ bbbb 5 -- !query SELECT udf(udf(min(udf(a)))), udf(udf(max(udf(a)))) FROM test_having HAVING udf(udf(min(udf(a)))) = udf(udf(max(udf(a)))) -- !query schema -struct +struct -- !query output @@ -132,7 +132,7 @@ struct +struct -- !query output 0 9 @@ -143,7 +143,7 @@ SELECT udf(a) FROM test_having HAVING udf(min(a)) < udf(max(a)) struct<> -- !query output org.apache.spark.sql.AnalysisException -grouping expressions sequence is empty, and 'default.test_having.`a`' is not an aggregate function. Wrap '(min(default.test_having.`a`) AS `min(a#x)`, max(default.test_having.`a`) AS `max(a#x)`)' in windowing function(s) or wrap 'default.test_having.`a`' in first() (or first_value) if you don't care which value you get.; +grouping expressions sequence is empty, and 'spark_catalog.default.test_having.`a`' is not an aggregate function. Wrap '(min(spark_catalog.default.test_having.`a`) AS `min(a#x)`, max(spark_catalog.default.test_having.`a`) AS `max(a#x)`)' in windowing function(s) or wrap 'spark_catalog.default.test_having.`a`' in first() (or first_value) if you don't care which value you get.; -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out index 11cb682ee1494..66e6c20a2f6f2 100755 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out @@ -95,7 +95,7 @@ SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c) ORDER BY udf(c) -- !query schema -struct +struct -- !query output ABAB 2 BBBB 2 @@ -109,7 +109,7 @@ cccc 2 SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c) ORDER BY udf(c) -- !query schema -struct +struct -- !query output 2 2 @@ -125,13 +125,13 @@ SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b) struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 75 +cannot resolve '`b`' given input columns: [CAST(udf(ansi_cast(count(1) as string)) AS BIGINT)]; line 1 pos 75 -- !query SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) -- !query schema -struct +struct -- !query output 1 2 @@ -143,7 +143,7 @@ struct SELECT udf(test_missing_target.b), udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) -- !query schema -struct +struct -- !query output 1 1 2 2 @@ -154,7 +154,7 @@ struct +struct -- !query output XXXX ABAB @@ -171,7 +171,7 @@ CCCC -- !query SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) desc -- !query schema -struct +struct -- !query output 4 3 @@ -182,7 +182,7 @@ struct -- !query SELECT udf(count(*)) FROM test_missing_target ORDER BY udf(1) desc -- !query schema -struct 
+struct -- !query output 10 @@ -190,7 +190,7 @@ struct -- !query SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 1 ORDER BY 1 -- !query schema -struct +struct -- !query output ABAB 2 BBBB 2 @@ -224,7 +224,7 @@ Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14 SELECT udf(a), udf(a) FROM test_missing_target ORDER BY udf(a) -- !query schema -struct +struct -- !query output 0 0 1 1 @@ -242,7 +242,7 @@ struct +struct -- !query output 0.0 0.0 0.5 0.5 @@ -260,7 +260,7 @@ struct +struct -- !query output 0.0 0.0 0.5 0.5 @@ -279,7 +279,7 @@ SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b) ORDER BY udf(x.b) -- !query schema -struct +struct -- !query output 1 1 2 2 @@ -292,7 +292,7 @@ SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b) ORDER BY udf(x.b) -- !query schema -struct +struct -- !query output 1 2 @@ -305,7 +305,7 @@ SELECT udf(a%2), udf(count(udf(b))) FROM test_missing_target GROUP BY udf(test_missing_target.a%2) ORDER BY udf(test_missing_target.a%2) -- !query schema -struct +struct -- !query output 0 5 1 5 @@ -316,7 +316,7 @@ SELECT udf(count(c)) FROM test_missing_target GROUP BY udf(lower(test_missing_target.c)) ORDER BY udf(lower(test_missing_target.c)) -- !query schema -struct +struct -- !query output 2 3 @@ -330,13 +330,13 @@ SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY udf(a) ORDER BY udf( struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 80 +cannot resolve '`b`' given input columns: [CAST(udf(ansi_cast(count(ansi_cast(udf(ansi_cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 80 -- !query SELECT udf(count(b)) FROM test_missing_target GROUP BY udf(b/2) ORDER BY udf(b/2) -- !query schema -struct +struct -- !query output 1 2 @@ -348,7 +348,7 @@ struct SELECT udf(lower(test_missing_target.c)), udf(count(udf(c))) FROM test_missing_target GROUP BY udf(lower(c)) ORDER BY udf(lower(c)) -- !query schema -struct +struct -- !query output abab 2 bbbb 3 @@ -359,7 +359,7 @@ xxxx 1 -- !query SELECT udf(a) FROM test_missing_target ORDER BY udf(upper(udf(d))) -- !query schema -struct +struct -- !query output 0 1 @@ -377,7 +377,7 @@ struct SELECT udf(count(b)) FROM test_missing_target GROUP BY udf((b + 1) / 2) ORDER BY udf((b + 1) / 2) desc -- !query schema -struct +struct -- !query output 4 3 @@ -402,7 +402,7 @@ test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b/2) ORDER BY udf(x.b/2) -- !query schema -struct +struct -- !query output 0.5 1 1.0 2 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out index 6403406413db9..da5256f5c0453 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out @@ -87,7 +87,7 @@ struct> +struct> -- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/union.sql.out b/sql/core/src/test/resources/sql-tests/results/union.sql.out index 44002406836a4..ce3c761bc5d2d 100644 --- a/sql/core/src/test/resources/sql-tests/results/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/union.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 16 +-- Number 
of queries: 20 -- !query @@ -126,6 +126,39 @@ struct,str:string> [1,2] str +-- !query +CREATE OR REPLACE TEMPORARY VIEW t3 AS VALUES (decimal(1)) tbl(v) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t.v FROM ( + SELECT v FROM t3 + UNION ALL + SELECT v + v AS v FROM t3 +) t +-- !query schema +struct +-- !query output +1 +2 + + +-- !query +SELECT SUM(t.v) FROM ( + SELECT v FROM t3 + UNION + SELECT v + v AS v FROM t3 +) t +-- !query schema +struct +-- !query output +3 + + -- !query DROP VIEW IF EXISTS t1 -- !query schema @@ -142,6 +175,14 @@ struct<> +-- !query +DROP VIEW IF EXISTS t3 +-- !query schema +struct<> +-- !query output + + + -- !query DROP VIEW IF EXISTS p1 -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out index f795374735f59..625088f90ced9 100644 --- a/sql/core/src/test/resources/sql-tests/results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 23 +-- Number of queries: 24 -- !query @@ -380,3 +380,14 @@ a 4 b 1 b 3 b 6 + + +-- !query +SELECT val, cate, +count(val) FILTER (WHERE val > 1) OVER(PARTITION BY cate) +FROM testData ORDER BY cate, val +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +window aggregate function with filter predicate is not supported yet.; diff --git a/sql/core/src/test/resources/test-data/TestStringDictionary.testRowIndex.orc b/sql/core/src/test/resources/test-data/TestStringDictionary.testRowIndex.orc new file mode 100644 index 0000000000000..cba483d1f070f Binary files /dev/null and b/sql/core/src/test/resources/test-data/TestStringDictionary.testRowIndex.orc differ diff --git a/sql/core/src/test/resources/test-data/bad_after_good.csv b/sql/core/src/test/resources/test-data/bad_after_good.csv index 4621a7d23714d..1a7c2651a11a7 100644 --- a/sql/core/src/test/resources/test-data/bad_after_good.csv +++ b/sql/core/src/test/resources/test-data/bad_after_good.csv @@ -1,2 +1,2 @@ "good record",1999-08-01 -"bad record",1999-088-01 +"bad record",1999-088_01 diff --git a/sql/core/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc b/sql/core/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc new file mode 100644 index 0000000000000..ebe01743b2e20 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_date_v2_4.snappy.orc differ diff --git a/sql/core/src/test/resources/test-data/before_1582_date_v2_4_5.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_date_v2_4_5.snappy.parquet new file mode 100644 index 0000000000000..edd61c9b9fec8 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_date_v2_4_5.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_date_v2_4_6.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_date_v2_4_6.snappy.parquet new file mode 100644 index 0000000000000..01f4887f5e994 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_date_v2_4_6.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_dict_v2_4_5.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_dict_v2_4_5.snappy.parquet new file mode 100644 index 0000000000000..c7e8d3926f63a Binary files /dev/null and 
b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_dict_v2_4_5.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_dict_v2_4_6.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_dict_v2_4_6.snappy.parquet new file mode 100644 index 0000000000000..939e2b8088eb0 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_dict_v2_4_6.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_plain_v2_4_5.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_plain_v2_4_5.snappy.parquet new file mode 100644 index 0000000000000..88a94ac482052 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_plain_v2_4_5.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_plain_v2_4_6.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_plain_v2_4_6.snappy.parquet new file mode 100644 index 0000000000000..68bfa33aac13f Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_int96_plain_v2_4_6.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_micros_v2_4_5.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_micros_v2_4_5.snappy.parquet new file mode 100644 index 0000000000000..62e6048354dc1 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_micros_v2_4_5.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_micros_v2_4_6.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_micros_v2_4_6.snappy.parquet new file mode 100644 index 0000000000000..d7fdaa3e67212 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_micros_v2_4_6.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_millis_v2_4_5.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_millis_v2_4_5.snappy.parquet new file mode 100644 index 0000000000000..a7cef9e60f134 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_millis_v2_4_5.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_timestamp_millis_v2_4_6.snappy.parquet b/sql/core/src/test/resources/test-data/before_1582_timestamp_millis_v2_4_6.snappy.parquet new file mode 100644 index 0000000000000..4c213f4540a73 Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_timestamp_millis_v2_4_6.snappy.parquet differ diff --git a/sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc b/sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc new file mode 100644 index 0000000000000..af9ef040270ac Binary files /dev/null and b/sql/core/src/test/resources/test-data/before_1582_ts_v2_4.snappy.orc differ diff --git a/sql/core/src/test/resources/test-data/percentile_approx-input.csv.bz2 b/sql/core/src/test/resources/test-data/percentile_approx-input.csv.bz2 new file mode 100644 index 0000000000000..f85e2896b3a89 Binary files /dev/null and b/sql/core/src/test/resources/test-data/percentile_approx-input.csv.bz2 differ diff --git a/sql/core/src/test/resources/test-data/value-malformed.csv b/sql/core/src/test/resources/test-data/value-malformed.csv index 
8945ed73d2e83..6e6f08fca6df8 100644 --- a/sql/core/src/test/resources/test-data/value-malformed.csv +++ b/sql/core/src/test/resources/test-data/value-malformed.csv @@ -1,2 +1,2 @@ -0,2013-111-11 12:13:14 +0,2013-111_11 12:13:14 1,1983-08-04 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index 2b4abed645910..4991e397eb11c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -150,7 +150,7 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession (1 to 1000).toDF("col").createOrReplaceTempView(table) checkAnswer( spark.sql(s"SELECT percentile_approx(col, array(0.25 + 0.25D), 200 + 800) FROM $table"), - Row(Seq(499)) + Row(Seq(500)) ) } } @@ -296,4 +296,23 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession buffer.quantileSummaries assert(buffer.isCompressed) } + + test("SPARK-32908: maximum target error in percentile_approx") { + withTempView(table) { + spark.read + .schema("col int") + .csv(testFile("test-data/percentile_approx-input.csv.bz2")) + .repartition(1) + .createOrReplaceTempView(table) + checkAnswer( + spark.sql( + s"""SELECT + | percentile_approx(col, 0.77, 1000), + | percentile_approx(col, 0.77, 10000), + | percentile_approx(col, 0.77, 100000), + | percentile_approx(col, 0.77, 1000000) + |FROM $table""".stripMargin), + Row(18, 17, 17, 17)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala index 07afd4195c3d4..174e7345d56b6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala @@ -50,11 +50,17 @@ abstract class BenchmarkQueryTest extends QueryTest with SharedSparkSession { protected def checkGeneratedCode(plan: SparkPlan, checkMethodCodeSize: Boolean = true): Unit = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() - plan foreach { - case s: WholeStageCodegenExec => - codegenSubtrees += s - case _ => + + def findSubtrees(plan: SparkPlan): Unit = { + plan foreach { + case s: WholeStageCodegenExec => + codegenSubtrees += s + case s => + s.subqueries.foreach(findSubtrees) + } } + + findSubtrees(plan) codegenSubtrees.toSeq.foreach { subtree => val code = subtree.doCodeGen()._2 val (_, ByteCodeStats(maxMethodCodeSize, _, _)) = try { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index cd2c681dd7e0e..dc16418d50199 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -17,9 +17,13 @@ package org.apache.spark.sql +import java.io.File + import scala.collection.mutable.HashSet import scala.concurrent.duration._ +import org.apache.commons.io.FileUtils + import org.apache.spark.CleanerListener import org.apache.spark.executor.DataReadMethod._ import org.apache.spark.executor.DataReadMethod.DataReadMethod @@ -141,27 +145,31 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } test("uncaching temp table") { - testData.select("key").createOrReplaceTempView("tempTable1") - 
testData.select("key").createOrReplaceTempView("tempTable2") - spark.catalog.cacheTable("tempTable1") + withTempView("tempTable1", "tempTable2") { + testData.select("key").createOrReplaceTempView("tempTable1") + testData.select("key").createOrReplaceTempView("tempTable2") + spark.catalog.cacheTable("tempTable1") - assertCached(sql("SELECT COUNT(*) FROM tempTable1")) - assertCached(sql("SELECT COUNT(*) FROM tempTable2")) + assertCached(sql("SELECT COUNT(*) FROM tempTable1")) + assertCached(sql("SELECT COUNT(*) FROM tempTable2")) - // Is this valid? - uncacheTable("tempTable2") + // Is this valid? + uncacheTable("tempTable2") - // Should this be cached? - assertCached(sql("SELECT COUNT(*) FROM tempTable1"), 0) + // Should this be cached? + assertCached(sql("SELECT COUNT(*) FROM tempTable1"), 0) + } } test("too big for memory") { - val data = "*" * 1000 - sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF() - .createOrReplaceTempView("bigData") - spark.table("bigData").persist(StorageLevel.MEMORY_AND_DISK) - assert(spark.table("bigData").count() === 200000L) - spark.table("bigData").unpersist(blocking = true) + withTempView("bigData") { + val data = "*" * 1000 + sparkContext.parallelize(1 to 200000, 1).map(_ => BigData(data)).toDF() + .createOrReplaceTempView("bigData") + spark.table("bigData").persist(StorageLevel.MEMORY_AND_DISK) + assert(spark.table("bigData").count() === 200000L) + spark.table("bigData").unpersist(blocking = true) + } } test("calling .cache() should use in-memory columnar caching") { @@ -195,7 +203,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } test("SPARK-1669: cacheTable should be idempotent") { - assume(!spark.table("testData").logicalPlan.isInstanceOf[InMemoryRelation]) + assert(!spark.table("testData").logicalPlan.isInstanceOf[InMemoryRelation]) spark.catalog.cacheTable("testData") assertCached(spark.table("testData")) @@ -225,12 +233,14 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } test("SELECT star from cached table") { - sql("SELECT * FROM testData").createOrReplaceTempView("selectStar") - spark.catalog.cacheTable("selectStar") - checkAnswer( - sql("SELECT * FROM selectStar WHERE key = 1"), - Seq(Row(1, "1"))) - uncacheTable("selectStar") + withTempView("selectStar") { + sql("SELECT * FROM testData").createOrReplaceTempView("selectStar") + spark.catalog.cacheTable("selectStar") + checkAnswer( + sql("SELECT * FROM selectStar WHERE key = 1"), + Seq(Row(1, "1"))) + uncacheTable("selectStar") + } } test("Self-join cached") { @@ -375,102 +385,112 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } test("Drops temporary table") { - testData.select("key").createOrReplaceTempView("t1") - spark.table("t1") - spark.catalog.dropTempView("t1") - intercept[AnalysisException](spark.table("t1")) + withTempView("t1") { + testData.select("key").createOrReplaceTempView("t1") + spark.table("t1") + spark.catalog.dropTempView("t1") + intercept[AnalysisException](spark.table("t1")) + } } test("Drops cached temporary table") { - testData.select("key").createOrReplaceTempView("t1") - testData.select("key").createOrReplaceTempView("t2") - spark.catalog.cacheTable("t1") + withTempView("t1", "t2") { + testData.select("key").createOrReplaceTempView("t1") + testData.select("key").createOrReplaceTempView("t2") + spark.catalog.cacheTable("t1") - assert(spark.catalog.isCached("t1")) - assert(spark.catalog.isCached("t2")) + assert(spark.catalog.isCached("t1")) + assert(spark.catalog.isCached("t2")) - 
spark.catalog.dropTempView("t1") - intercept[AnalysisException](spark.table("t1")) - assert(!spark.catalog.isCached("t2")) + spark.catalog.dropTempView("t1") + intercept[AnalysisException](spark.table("t1")) + assert(!spark.catalog.isCached("t2")) + } } test("Clear all cache") { - sql("SELECT key FROM testData LIMIT 10").createOrReplaceTempView("t1") - sql("SELECT key FROM testData LIMIT 5").createOrReplaceTempView("t2") - spark.catalog.cacheTable("t1") - spark.catalog.cacheTable("t2") - spark.catalog.clearCache() - assert(spark.sharedState.cacheManager.isEmpty) + withTempView("t1", "t2") { + sql("SELECT key FROM testData LIMIT 10").createOrReplaceTempView("t1") + sql("SELECT key FROM testData LIMIT 5").createOrReplaceTempView("t2") + spark.catalog.cacheTable("t1") + spark.catalog.cacheTable("t2") + spark.catalog.clearCache() + assert(spark.sharedState.cacheManager.isEmpty) - sql("SELECT key FROM testData LIMIT 10").createOrReplaceTempView("t1") - sql("SELECT key FROM testData LIMIT 5").createOrReplaceTempView("t2") - spark.catalog.cacheTable("t1") - spark.catalog.cacheTable("t2") - sql("Clear CACHE") - assert(spark.sharedState.cacheManager.isEmpty) + sql("SELECT key FROM testData LIMIT 10").createOrReplaceTempView("t1") + sql("SELECT key FROM testData LIMIT 5").createOrReplaceTempView("t2") + spark.catalog.cacheTable("t1") + spark.catalog.cacheTable("t2") + sql("Clear CACHE") + assert(spark.sharedState.cacheManager.isEmpty) + } } test("Ensure accumulators to be cleared after GC when uncacheTable") { - sql("SELECT key FROM testData LIMIT 10").createOrReplaceTempView("t1") - sql("SELECT key FROM testData LIMIT 5").createOrReplaceTempView("t2") + withTempView("t1", "t2") { + sql("SELECT key FROM testData LIMIT 10").createOrReplaceTempView("t1") + sql("SELECT key FROM testData LIMIT 5").createOrReplaceTempView("t2") - spark.catalog.cacheTable("t1") - spark.catalog.cacheTable("t2") + spark.catalog.cacheTable("t1") + spark.catalog.cacheTable("t2") - sql("SELECT * FROM t1").count() - sql("SELECT * FROM t2").count() - sql("SELECT * FROM t1").count() - sql("SELECT * FROM t2").count() + sql("SELECT * FROM t1").count() + sql("SELECT * FROM t2").count() + sql("SELECT * FROM t1").count() + sql("SELECT * FROM t2").count() + + val toBeCleanedAccIds = new HashSet[Long] + + val accId1 = spark.table("t1").queryExecution.withCachedData.collect { + case i: InMemoryRelation => i.cacheBuilder.sizeInBytesStats.id + }.head + toBeCleanedAccIds += accId1 + + val accId2 = spark.table("t1").queryExecution.withCachedData.collect { + case i: InMemoryRelation => i.cacheBuilder.sizeInBytesStats.id + }.head + toBeCleanedAccIds += accId2 + + val cleanerListener = new CleanerListener { + def rddCleaned(rddId: Int): Unit = {} + def shuffleCleaned(shuffleId: Int): Unit = {} + def broadcastCleaned(broadcastId: Long): Unit = {} + def accumCleaned(accId: Long): Unit = { + toBeCleanedAccIds.synchronized { toBeCleanedAccIds -= accId } + } + def checkpointCleaned(rddId: Long): Unit = {} + } + spark.sparkContext.cleaner.get.attachListener(cleanerListener) - val toBeCleanedAccIds = new HashSet[Long] + uncacheTable("t1") + uncacheTable("t2") - val accId1 = spark.table("t1").queryExecution.withCachedData.collect { - case i: InMemoryRelation => i.cacheBuilder.sizeInBytesStats.id - }.head - toBeCleanedAccIds += accId1 + System.gc() - val accId2 = spark.table("t1").queryExecution.withCachedData.collect { - case i: InMemoryRelation => i.cacheBuilder.sizeInBytesStats.id - }.head - toBeCleanedAccIds += accId2 - - val cleanerListener = new 
CleanerListener { - def rddCleaned(rddId: Int): Unit = {} - def shuffleCleaned(shuffleId: Int): Unit = {} - def broadcastCleaned(broadcastId: Long): Unit = {} - def accumCleaned(accId: Long): Unit = { - toBeCleanedAccIds.synchronized { toBeCleanedAccIds -= accId } + eventually(timeout(10.seconds)) { + assert(toBeCleanedAccIds.synchronized { toBeCleanedAccIds.isEmpty }, + "batchStats accumulators should be cleared after GC when uncacheTable") } - def checkpointCleaned(rddId: Long): Unit = {} - } - spark.sparkContext.cleaner.get.attachListener(cleanerListener) - - uncacheTable("t1") - uncacheTable("t2") - - System.gc() - eventually(timeout(10.seconds)) { - assert(toBeCleanedAccIds.synchronized { toBeCleanedAccIds.isEmpty }, - "batchStats accumulators should be cleared after GC when uncacheTable") + assert(AccumulatorContext.get(accId1).isEmpty) + assert(AccumulatorContext.get(accId2).isEmpty) } - - assert(AccumulatorContext.get(accId1).isEmpty) - assert(AccumulatorContext.get(accId2).isEmpty) } test("SPARK-10327 Cache Table is not working while subquery has alias in its project list") { - sparkContext.parallelize((1, 1) :: (2, 2) :: Nil) - .toDF("key", "value").selectExpr("key", "value", "key+1").createOrReplaceTempView("abc") - spark.catalog.cacheTable("abc") - - val sparkPlan = sql( - """select a.key, b.key, c.key from - |abc a join abc b on a.key=b.key - |join abc c on a.key=c.key""".stripMargin).queryExecution.sparkPlan - - assert(sparkPlan.collect { case e: InMemoryTableScanExec => e }.size === 3) - assert(sparkPlan.collect { case e: RDDScanExec => e }.size === 0) + withTempView("abc") { + sparkContext.parallelize((1, 1) :: (2, 2) :: Nil) + .toDF("key", "value").selectExpr("key", "value", "key+1").createOrReplaceTempView("abc") + spark.catalog.cacheTable("abc") + + val sparkPlan = sql( + """select a.key, b.key, c.key from + |abc a join abc b on a.key=b.key + |join abc c on a.key=c.key""".stripMargin).queryExecution.sparkPlan + + assert(sparkPlan.collect { case e: InMemoryTableScanExec => e }.size === 3) + assert(sparkPlan.collect { case e: RDDScanExec => e }.size === 0) + } } /** @@ -628,26 +648,30 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } test("SPARK-15870 DataFrame can't execute after uncacheTable") { - val selectStar = sql("SELECT * FROM testData WHERE key = 1") - selectStar.createOrReplaceTempView("selectStar") + withTempView("selectStar") { + val selectStar = sql("SELECT * FROM testData WHERE key = 1") + selectStar.createOrReplaceTempView("selectStar") - spark.catalog.cacheTable("selectStar") - checkAnswer( - selectStar, - Seq(Row(1, "1"))) + spark.catalog.cacheTable("selectStar") + checkAnswer( + selectStar, + Seq(Row(1, "1"))) - uncacheTable("selectStar") - checkAnswer( - selectStar, - Seq(Row(1, "1"))) + uncacheTable("selectStar") + checkAnswer( + selectStar, + Seq(Row(1, "1"))) + } } test("SPARK-15915 Logical plans should use canonicalized plan when override sameResult") { - val localRelation = Seq(1, 2, 3).toDF() - localRelation.createOrReplaceTempView("localRelation") + withTempView("localRelation") { + val localRelation = Seq(1, 2, 3).toDF() + localRelation.createOrReplaceTempView("localRelation") - spark.catalog.cacheTable("localRelation") - assert(getNumInMemoryRelations(localRelation) == 1) + spark.catalog.cacheTable("localRelation") + assert(getNumInMemoryRelations(localRelation) == 1) + } } test("SPARK-19093 Caching in side subquery") { @@ -1122,4 +1146,201 @@ class CachedTableSuite extends QueryTest with SQLTestUtils 
assert(!spark.catalog.isCached("t1")) } } + + test("SPARK-30494 Fix the leak of cached data when replace an existing view") { + withTempView("tempView") { + spark.catalog.clearCache() + sql("create or replace temporary view tempView as select 1") + sql("cache table tempView") + assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 1")).isDefined) + sql("create or replace temporary view tempView as select 1, 2") + assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 1")).isEmpty) + sql("cache table tempView") + assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 1, 2")).isDefined) + } + + withGlobalTempView("tempGlobalTempView") { + spark.catalog.clearCache() + sql("create or replace global temporary view tempGlobalTempView as select 1") + sql("cache table global_temp.tempGlobalTempView") + assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 1")).isDefined) + sql("create or replace global temporary view tempGlobalTempView as select 1, 2") + assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 1")).isEmpty) + sql("cache table global_temp.tempGlobalTempView") + assert(spark.sharedState.cacheManager.lookupCachedData(sql("select 1, 2")).isDefined) + } + + withView("view1") { + spark.catalog.clearCache() + sql("create or replace view view1 as select 1") + sql("cache table view1") + sql("create or replace view view1 as select 1, 2") + sql("cache table view1") + // the cached plan of persisted view likes below, + // we cannot use the same assertion of temp view. + // SubqueryAlias + // | + // + View + // | + // + Project[1 AS 1] + spark.sharedState.cacheManager.uncacheQuery(spark.table("view1"), cascade = false) + // make sure there is no cached data leak + assert(spark.sharedState.cacheManager.isEmpty) + } + } + + test("SPARK-33228: Don't uncache data when replacing an existing view having the same plan") { + withTempView("tempView") { + spark.catalog.clearCache() + val df = spark.range(1).selectExpr("id a", "id b") + df.cache() + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) + df.createOrReplaceTempView("tempView") + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) + df.createOrReplaceTempView("tempView") + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) + } + + withTempView("tempGlobalTempView") { + spark.catalog.clearCache() + val df = spark.range(1).selectExpr("id a", "id b") + df.cache() + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) + df.createOrReplaceGlobalTempView("tempGlobalTempView") + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) + df.createOrReplaceGlobalTempView("tempGlobalTempView") + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) + } + } + + test("SPARK-33290: REFRESH TABLE should invalidate all caches referencing the table") { + withTable("t") { + withTempPath { path => + withTempView("tempView1", "tempView2") { + Seq((1 -> "a")).toDF("i", "j").write.parquet(path.getCanonicalPath) + sql(s"CREATE TABLE t USING parquet LOCATION '${path.toURI}'") + sql("CREATE TEMPORARY VIEW tempView1 AS SELECT * FROM t") + sql("CACHE TABLE tempView2 AS SELECT i FROM tempView1") + checkAnswer(sql("SELECT * FROM tempView1"), Seq(Row(1, "a"))) + checkAnswer(sql("SELECT * FROM tempView2"), Seq(Row(1))) + + Utils.deleteRecursively(path) + sql("REFRESH TABLE tempView1") + checkAnswer(sql("SELECT * FROM tempView1"), Seq.empty) + checkAnswer(sql("SELECT * FROM tempView2"), Seq.empty) + } + } + } + } 
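For reference, the cache-invalidation semantics asserted by the SPARK-33290 test above ("REFRESH TABLE should invalidate all caches referencing the table") can be reproduced outside the test harness with a short standalone program. The sketch below is illustrative only and is not part of the patch; the table/view names (t_demo, v_demo), the scratch path, and the local-mode session are assumptions made for the example.

  import org.apache.spark.sql.SparkSession

  object RefreshTableDemo {
    def main(args: Array[String]): Unit = {
      // Assumptions: local mode, a hypothetical scratch path, demo names t_demo / v_demo.
      val spark = SparkSession.builder()
        .master("local[*]")
        .appName("refresh-table-demo")
        .getOrCreate()
      import spark.implicits._

      val path = "/tmp/refresh_table_demo"  // hypothetical location
      Seq((1, "a")).toDF("i", "j").write.mode("overwrite").parquet(path)

      // External table over the parquet files, plus a cached temporary view on top of it.
      spark.sql(s"CREATE TABLE t_demo (i INT, j STRING) USING parquet LOCATION '$path'")
      spark.sql("CACHE TABLE v_demo AS SELECT i FROM t_demo")
      spark.sql("SELECT * FROM v_demo").show()   // served from the cache

      // Change the underlying files, then refresh the source table. Per the behaviour the
      // test asserts, REFRESH TABLE also invalidates caches that reference t_demo, so the
      // dependent view reflects the new data instead of returning stale cached rows.
      Seq((2, "b")).toDF("i", "j").write.mode("overwrite").parquet(path)
      spark.sql("REFRESH TABLE t_demo")
      spark.sql("SELECT * FROM v_demo").show()

      spark.stop()
    }
  }
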
+ + test("SPARK-33290: querying temporary view after REFRESH TABLE fails with FNFE") { + withTable("t") { + withTempPath { path => + withTempView("tempView1") { + Seq((1 -> "a")).toDF("i", "j").write.parquet(path.getCanonicalPath) + sql(s"CREATE TABLE t USING parquet LOCATION '${path.toURI}'") + sql("CREATE TEMPORARY VIEW tempView1 AS SELECT * FROM t") + checkAnswer(sql("SELECT * FROM tempView1"), Seq(Row(1, "a"))) + + Utils.deleteRecursively(path) + sql("REFRESH TABLE t") + checkAnswer(sql("SELECT * FROM t"), Seq.empty) + val exception = intercept[Exception] { + checkAnswer(sql("SELECT * FROM tempView1"), Seq.empty) + } + assert(exception.getMessage.contains("FileNotFoundException")) + assert(exception.getMessage.contains("REFRESH TABLE")) + } + } + } + } + + test("SPARK-33786: Cache's storage level should be respected when a table name is altered.") { + withTable("old", "new") { + withTempPath { path => + def getStorageLevel(tableName: String): StorageLevel = { + val table = spark.table(tableName) + val cachedData = spark.sharedState.cacheManager.lookupCachedData(table).get + cachedData.cachedRepresentation.cacheBuilder.storageLevel + } + Seq(1 -> "a").toDF("i", "j").write.parquet(path.getCanonicalPath) + sql(s"CREATE TABLE old USING parquet LOCATION '${path.toURI}'") + sql("CACHE TABLE old OPTIONS('storageLevel' 'MEMORY_ONLY')") + val oldStorageLevel = getStorageLevel("old") + + sql("ALTER TABLE old RENAME TO new") + val newStorageLevel = getStorageLevel("new") + assert(oldStorageLevel === newStorageLevel) + } + } + } + + test("SPARK-33950: refresh cache after partition dropping") { + withTable("t") { + sql(s"CREATE TABLE t (id int, part int) USING parquet PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + sql("INSERT INTO t PARTITION (part=1) SELECT 1") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(1, 1))) + sql("ALTER TABLE t DROP PARTITION (part=0)") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(1, 1))) + } + } + + test("SPARK-34011: refresh cache after partition renaming") { + withTable("t") { + sql("CREATE TABLE t (id int, part int) USING parquet PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + sql("INSERT INTO t PARTITION (part=1) SELECT 1") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + QueryTest.checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(1, 1))) + sql("ALTER TABLE t PARTITION (part=0) RENAME TO PARTITION (part=2)") + assert(spark.catalog.isCached("t")) + QueryTest.checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 2), Row(1, 1))) + } + } + + private def testCacheRefreshing(cmd: String => DataFrame): Unit = { + withTable("t") { + sql("CREATE TABLE t (id int, part int) USING parquet PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0))) + + // Create new partition (part = 1) in the filesystem + val information = sql("SHOW TABLE EXTENDED LIKE 't' PARTITION (part = 0)") + .select("information") + .first().getString(0) + val part0Loc = information + .split("\\r?\\n") + .filter(_.startsWith("Location:")) + .head + .replace("Location: file:", "") + val part1Loc = part0Loc.replace("part=0", "part=1") + FileUtils.copyDirectory(new File(part0Loc), new 
File(part1Loc)) + + cmd(part1Loc) + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(0, 1))) + } + } + + test("SPARK-34055: refresh cache in partition adding") { + testCacheRefreshing { location => + sql(s"ALTER TABLE t ADD PARTITION (part=1) LOCATION '$location'") + } + } + + test("SPARK-34027: refresh cache in partitions recovering") { + testCacheRefreshing { _ => + sql("ALTER TABLE t RECOVER PARTITIONS") + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index a9ee25b10dc02..fa06484a73d95 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.sql.Date +import java.sql.{Date, Timestamp} import java.util.Locale import scala.collection.JavaConverters._ @@ -26,12 +26,13 @@ import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat} import org.scalatest.Matchers._ -import org.apache.spark.sql.catalyst.expressions.{In, InSet, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{InSet, Literal, NamedExpression} import org.apache.spark.sql.execution.ProjectExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String class ColumnExpressionSuite extends QueryTest with SharedSparkSession { import testImplicits._ @@ -259,26 +260,28 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { } test("nanvl") { - val testData = spark.createDataFrame(sparkContext.parallelize( - Row(null, 3.0, Double.NaN, Double.PositiveInfinity, 1.0f, 4) :: Nil), - StructType(Seq(StructField("a", DoubleType), StructField("b", DoubleType), - StructField("c", DoubleType), StructField("d", DoubleType), - StructField("e", FloatType), StructField("f", IntegerType)))) + withTempView("t") { + val testData = spark.createDataFrame(sparkContext.parallelize( + Row(null, 3.0, Double.NaN, Double.PositiveInfinity, 1.0f, 4) :: Nil), + StructType(Seq(StructField("a", DoubleType), StructField("b", DoubleType), + StructField("c", DoubleType), StructField("d", DoubleType), + StructField("e", FloatType), StructField("f", IntegerType)))) - checkAnswer( - testData.select( - nanvl($"a", lit(5)), nanvl($"b", lit(10)), nanvl(lit(10), $"b"), - nanvl($"c", lit(null).cast(DoubleType)), nanvl($"d", lit(10)), - nanvl($"b", $"e"), nanvl($"e", $"f")), - Row(null, 3.0, 10.0, null, Double.PositiveInfinity, 3.0, 1.0) - ) - testData.createOrReplaceTempView("t") - checkAnswer( - sql( - "select nanvl(a, 5), nanvl(b, 10), nanvl(10, b), nanvl(c, null), nanvl(d, 10), " + - " nanvl(b, e), nanvl(e, f) from t"), - Row(null, 3.0, 10.0, null, Double.PositiveInfinity, 3.0, 1.0) - ) + checkAnswer( + testData.select( + nanvl($"a", lit(5)), nanvl($"b", lit(10)), nanvl(lit(10), $"b"), + nanvl($"c", lit(null).cast(DoubleType)), nanvl($"d", lit(10)), + nanvl($"b", $"e"), nanvl($"e", $"f")), + Row(null, 3.0, 10.0, null, Double.PositiveInfinity, 3.0, 1.0) + ) + testData.createOrReplaceTempView("t") + checkAnswer( + sql( + "select nanvl(a, 5), nanvl(b, 10), nanvl(10, b), nanvl(c, null), nanvl(d, 10), " + + " nanvl(b, e), nanvl(e, f) from t"), + Row(null, 3.0, 10.0, null, 
Double.PositiveInfinity, 3.0, 1.0) + ) + } } test("===") { @@ -453,35 +456,81 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { } test("isInCollection: Scala Collection") { - val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") + Seq(0, 1, 10).foreach { optThreshold => + Seq(0, 1, 10).foreach { switchThreshold => + withSQLConf( + SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> optThreshold.toString, + SQLConf.OPTIMIZER_INSET_SWITCH_THRESHOLD.key -> switchThreshold.toString) { + val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") + // Test with different types of collections + checkAnswer(df.filter($"a".isInCollection(Seq(3, 1))), + df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) + checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).toSet)), + df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) + checkAnswer(df.filter($"a".isInCollection(Seq(3, 2).toArray)), + df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) + checkAnswer(df.filter($"a".isInCollection(Seq(3, 1).toList)), + df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) + + val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") - Seq(1, 2).foreach { conf => - withSQLConf(SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> conf.toString) { - if (conf <= 1) { - assert($"a".isInCollection(Seq(3, 1)).expr.isInstanceOf[InSet], "Expect expr to be InSet") - } else { - assert($"a".isInCollection(Seq(3, 1)).expr.isInstanceOf[In], "Expect expr to be In") + val e = intercept[AnalysisException] { + df2.filter($"a".isInCollection(Seq($"b"))) + } + Seq("cannot resolve", "due to data type mismatch: Arguments must be same type but were") + .foreach { s => + assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) + } } + } + } + } - // Test with different types of collections - checkAnswer(df.filter($"a".isInCollection(Seq(3, 1))), - df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) - checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).toSet)), - df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".isInCollection(Seq(3, 2).toArray)), - df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".isInCollection(Seq(3, 1).toList)), - df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) - - val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") - - val e = intercept[AnalysisException] { - df2.filter($"a".isInCollection(Seq($"b"))) + test("SPARK-31553: isInCollection - collection element types") { + val expected = Seq(Row(true), Row(false)) + Seq(0, 1, 10).foreach { optThreshold => + Seq(0, 1, 10).foreach { switchThreshold => + withSQLConf( + SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> optThreshold.toString, + SQLConf.OPTIMIZER_INSET_SWITCH_THRESHOLD.key -> switchThreshold.toString) { + checkAnswer(Seq(0).toDS.select($"value".isInCollection(Seq(null))), Seq(Row(null))) + checkAnswer( + Seq(true).toDS.select($"value".isInCollection(Seq(true, false))), + Seq(Row(true))) + checkAnswer( + Seq(0.toByte, 1.toByte).toDS.select($"value".isInCollection(Seq(0.toByte, 2.toByte))), + expected) + checkAnswer( + Seq(0.toShort, 1.toShort).toDS + .select($"value".isInCollection(Seq(0.toShort, 2.toShort))), + expected) + checkAnswer(Seq(0, 1).toDS.select($"value".isInCollection(Seq(0, 2))), expected) + checkAnswer(Seq(0L, 1L).toDS.select($"value".isInCollection(Seq(0L, 2L))), 
expected) + checkAnswer(Seq(0.0f, 1.0f).toDS + .select($"value".isInCollection(Seq(0.0f, 2.0f))), expected) + checkAnswer(Seq(0.0D, 1.0D).toDS + .select($"value".isInCollection(Seq(0.0D, 2.0D))), expected) + checkAnswer( + Seq(BigDecimal(0), BigDecimal(2)).toDS + .select($"value".isInCollection(Seq(BigDecimal(0), BigDecimal(1)))), + expected) + checkAnswer( + Seq("abc", "def").toDS.select($"value".isInCollection(Seq("abc", "xyz"))), + expected) + checkAnswer( + Seq(Date.valueOf("2020-04-29"), Date.valueOf("2020-05-01")).toDS + .select($"value".isInCollection( + Seq(Date.valueOf("2020-04-29"), Date.valueOf("2020-04-30")))), + expected) + checkAnswer( + Seq(new Timestamp(0), new Timestamp(2)).toDS + .select($"value".isInCollection(Seq(new Timestamp(0), new Timestamp(1)))), + expected) + checkAnswer( + Seq(Array("a", "b"), Array("c", "d")).toDS + .select($"value".isInCollection(Seq(Array("a", "b"), Array("x", "z")))), + expected) } - Seq("cannot resolve", - "due to data type mismatch: Arguments must be same type but were").foreach { s => - assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) - } } } } @@ -869,4 +918,9 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { df.select(typedLit(("a", 2, 1.0))), Row(Row("a", 2, 1.0)) :: Nil) } + + test("SPARK-31563: sql of InSet for UTF8String collection") { + val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString)) + assert(inSet.sql === "('a' IN ('a', 'b'))") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala index 6b503334f9f23..bdcf7230e3211 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala @@ -17,11 +17,15 @@ package org.apache.spark.sql +import scala.collection.JavaConverters._ + import org.apache.spark.sql.catalyst.expressions.CreateNamedStruct import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{ArrayType, StructType} class ComplexTypesSuite extends QueryTest with SharedSparkSession { + import testImplicits._ override def beforeAll(): Unit = { super.beforeAll() @@ -106,4 +110,11 @@ class ComplexTypesSuite extends QueryTest with SharedSparkSession { checkAnswer(df1, Row(10, 12) :: Row(11, 13) :: Nil) checkNamedStruct(df.queryExecution.optimizedPlan, expectedCount = 0) } + + test("SPARK-32167: get field from an array of struct") { + val innerStruct = new StructType().add("i", "int", nullable = true) + val schema = new StructType().add("arr", ArrayType(innerStruct, containsNull = false)) + val df = spark.createDataFrame(List(Row(Seq(Row(1), Row(null)))).asJava, schema) + checkAnswer(df.select($"arr".getField("i")), Row(Seq(1, null))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala index c3dbbb325d842..36989efbe870d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import org.apache.commons.math3.stat.inference.ChiSquareTest +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -27,7 
+28,8 @@ class ConfigBehaviorSuite extends QueryTest with SharedSparkSession { import testImplicits._ - test("SPARK-22160 spark.sql.execution.rangeExchange.sampleSizePerPartition") { + test("SPARK-22160 spark.sql.execution.rangeExchange.sampleSizePerPartition", + DisableAdaptiveExecution("Post shuffle partition number can be different")) { // In this test, we run a sort and compute the histogram for partition size post shuffle. // With a high sample count, the partition size should be more evenly distributed, and has a // low chi-sq test value. @@ -53,11 +55,8 @@ class ConfigBehaviorSuite extends QueryTest with SharedSparkSession { dist) } - // When enable AQE, the post partition number is changed. // And the ChiSquareTest result is also need updated. So disable AQE. - withSQLConf( - SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString, - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString) { // The default chi-sq value should be low assert(computeChiSquareTest() < 100) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala index 61f0e138cc358..b9e0d509eeece 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala @@ -59,10 +59,22 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession { val df2 = df .select(from_csv($"value", schemaWithCorrField1, Map( "mode" -> "Permissive", "columnNameOfCorruptRecord" -> columnNameOfCorruptRecord))) - - checkAnswer(df2, Seq( - Row(Row(0, null, "0,2013-111-11 12:13:14")), - Row(Row(1, java.sql.Date.valueOf("1983-08-04"), null)))) + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "corrected") { + checkAnswer(df2, Seq( + Row(Row(0, null, "0,2013-111-11 12:13:14")), + Row(Row(1, java.sql.Date.valueOf("1983-08-04"), null)))) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "legacy") { + checkAnswer(df2, Seq( + Row(Row(0, java.sql.Date.valueOf("2022-03-11"), null)), + Row(Row(1, java.sql.Date.valueOf("1983-08-04"), null)))) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "exception") { + val msg = intercept[SparkException] { + df2.collect() + }.getCause.getMessage + assert(msg.contains("Fail to parse")) + } } test("schema_of_csv - infers schemas") { @@ -200,4 +212,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession { assert(readback(0).getAs[Row](0).getAs[Date](0).getTime >= 0) } } + + test("optional datetime parser does not affect csv time formatting") { + val s = "2015-08-26 12:34:46" + def toDF(p: String): DataFrame = sql( + s""" + |SELECT + | to_csv( + | named_struct('time', timestamp'$s'), map('timestampFormat', "$p") + | ) + | """.stripMargin) + checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index dc1767a6852f6..2cb7790028b33 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -530,6 +530,22 @@ class DataFrameAggregateSuite extends QueryTest ) } + test("SPARK-31500: collect_set() of BinaryType returns duplicate elements") { + val bytesTest1 = "test1".getBytes + val bytesTest2 = "test2".getBytes + 
val df = Seq(bytesTest1, bytesTest1, bytesTest2).toDF("a") + checkAnswer(df.select(size(collect_set($"a"))), Row(2) :: Nil) + + val a = "aa".getBytes + val b = "bb".getBytes + val c = "cc".getBytes + val d = "dd".getBytes + val df1 = Seq((a, b), (a, b), (c, d)) + .toDF("x", "y") + .select(struct($"x", $"y").as("a")) + checkAnswer(df1.select(size(collect_set($"a"))), Row(2) :: Nil) + } + test("collect_set functions cannot have maps") { val df = Seq((1, 3, 0), (2, 3, 0), (3, 4, 1)) .toDF("a", "x", "y") @@ -615,34 +631,33 @@ class DataFrameAggregateSuite extends QueryTest Seq((true, true), (true, false), (false, true), (false, false))) { withSQLConf( (SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, wholeStage.toString), - (SQLConf.USE_OBJECT_HASH_AGG.key, useObjectHashAgg.toString), - (SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false")) { - // When enable AQE, the WholeStageCodegenExec is added during QueryStageExec. + (SQLConf.USE_OBJECT_HASH_AGG.key, useObjectHashAgg.toString)) { val df = Seq(("1", 1), ("1", 2), ("2", 3), ("2", 4)).toDF("x", "y") // test case for HashAggregate val hashAggDF = df.groupBy("x").agg(c, sum("y")) + hashAggDF.collect() val hashAggPlan = hashAggDF.queryExecution.executedPlan if (wholeStage) { - assert(hashAggPlan.find { + assert(find(hashAggPlan) { case WholeStageCodegenExec(_: HashAggregateExec) => true case _ => false }.isDefined) } else { - assert(hashAggPlan.isInstanceOf[HashAggregateExec]) + assert(stripAQEPlan(hashAggPlan).isInstanceOf[HashAggregateExec]) } - hashAggDF.collect() // test case for ObjectHashAggregate and SortAggregate val objHashAggOrSortAggDF = df.groupBy("x").agg(c, collect_list("y")) - val objHashAggOrSortAggPlan = objHashAggOrSortAggDF.queryExecution.executedPlan + objHashAggOrSortAggDF.collect() + val objHashAggOrSortAggPlan = + stripAQEPlan(objHashAggOrSortAggDF.queryExecution.executedPlan) if (useObjectHashAgg) { assert(objHashAggOrSortAggPlan.isInstanceOf[ObjectHashAggregateExec]) } else { assert(objHashAggOrSortAggPlan.isInstanceOf[SortAggregateExec]) } - objHashAggOrSortAggDF.collect() } } } @@ -959,16 +974,85 @@ class DataFrameAggregateSuite extends QueryTest } } - test("calendar interval agg support hash aggregate") { - val df1 = Seq((1, "1 day"), (2, "2 day"), (3, "3 day"), (3, null)).toDF("a", "b") - val df2 = df1.select(avg($"b" cast CalendarIntervalType)) - checkAnswer(df2, Row(new CalendarInterval(0, 2, 0)) :: Nil) - assert(find(df2.queryExecution.executedPlan)(_.isInstanceOf[HashAggregateExec]).isDefined) - val df3 = df1.groupBy($"a").agg(avg($"b" cast CalendarIntervalType)) - checkAnswer(df3, - Row(1, new CalendarInterval(0, 1, 0)) :: - Row(2, new CalendarInterval(0, 2, 0)) :: - Row(3, new CalendarInterval(0, 3, 0)) :: Nil) - assert(find(df3.queryExecution.executedPlan)(_.isInstanceOf[HashAggregateExec]).isDefined) + Seq(true, false).foreach { value => + test(s"SPARK-31620: agg with subquery (whole-stage-codegen = $value)") { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> value.toString) { + withTempView("t1", "t2") { + sql("create temporary view t1 as select * from values (1, 2) as t1(a, b)") + sql("create temporary view t2 as select * from values (3, 4) as t2(c, d)") + + // test without grouping keys + checkAnswer(sql("select sum(if(c > (select a from t1), d, 0)) as csum from t2"), + Row(4) :: Nil) + + // test with grouping keys + checkAnswer(sql("select c, sum(if(c > (select a from t1), d, 0)) as csum from " + + "t2 group by c"), Row(3, 4) :: Nil) + + // test with distinct + checkAnswer(sql("select avg(distinct(d)), 
sum(distinct(if(c > (select a from t1)," + + " d, 0))) as csum from t2 group by c"), Row(4, 4) :: Nil) + + // test subquery with agg + checkAnswer(sql("select sum(distinct(if(c > (select sum(distinct(a)) from t1)," + + " d, 0))) as csum from t2 group by c"), Row(4) :: Nil) + + // test SortAggregateExec + var df = sql("select max(if(c > (select a from t1), 'str1', 'str2')) as csum from t2") + assert(df.queryExecution.executedPlan + .find { case _: SortAggregateExec => true }.isDefined) + checkAnswer(df, Row("str1") :: Nil) + + // test ObjectHashAggregateExec + df = sql("select collect_list(d), sum(if(c > (select a from t1), d, 0)) as csum from t2") + assert(df.queryExecution.executedPlan + .find { case _: ObjectHashAggregateExec => true }.isDefined) + checkAnswer(df, Row(Array(4), 4) :: Nil) + } + } + } + } + + test("SPARK-32038: NormalizeFloatingNumbers should work on distinct aggregate") { + withTempView("view") { + val nan1 = java.lang.Float.intBitsToFloat(0x7f800001) + val nan2 = java.lang.Float.intBitsToFloat(0x7fffffff) + + Seq(("mithunr", Float.NaN), + ("mithunr", nan1), + ("mithunr", nan2), + ("abellina", 1.0f), + ("abellina", 2.0f)).toDF("uid", "score").createOrReplaceTempView("view") + + val df = spark.sql("select uid, count(distinct score) from view group by 1 order by 1 asc") + checkAnswer(df, Row("abellina", 2) :: Row("mithunr", 1) :: Nil) + } + } + + test("SPARK-32136: NormalizeFloatingNumbers should work on null struct") { + val df = Seq( + A(None), + A(Some(B(None))), + A(Some(B(Some(1.0))))).toDF + val groupBy = df.groupBy("b").agg(count("*")) + checkAnswer(groupBy, Row(null, 1) :: Row(Row(null), 1) :: Row(Row(1.0), 1) :: Nil) + } + + test("SPARK-32344: Unevaluable's set to FIRST/LAST ignoreNullsExpr in distinct aggregates") { + val queryTemplate = (agg: String) => + s"SELECT $agg(DISTINCT v) FROM (SELECT v FROM VALUES 1, 2, 3 t(v) ORDER BY v)" + checkAnswer(sql(queryTemplate("FIRST")), Row(1)) + checkAnswer(sql(queryTemplate("LAST")), Row(3)) + } + + test("SPARK-32906: struct field names should not change after normalizing floats") { + val df = Seq(Tuple1(Tuple2(-0.0d, Double.NaN)), Tuple1(Tuple2(0.0d, Double.NaN))).toDF("k") + val aggs = df.distinct().queryExecution.sparkPlan.collect { case a: HashAggregateExec => a } + assert(aggs.length == 2) + assert(aggs.head.output.map(_.dataType.simpleString).head === + aggs.last.output.map(_.dataType.simpleString).head) } } + +case class B(c: Option[Double]) +case class A(b: Option[B]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 7fce03658fc16..ac98d3f0c7095 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -23,11 +23,13 @@ import java.util.TimeZone import scala.util.Random +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation import org.apache.spark.sql.catalyst.util.DateTimeTestUtils +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.UTC import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -489,6 +491,12 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession { withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") { testSizeOfArray(sizeOfNull = null) } + // size(null) should return null under ansi mode. + withSQLConf( + SQLConf.LEGACY_SIZE_OF_NULL.key -> "true", + SQLConf.ANSI_ENABLED.key -> "true") { + testSizeOfArray(sizeOfNull = null) + } } test("dataframe arrays_zip function") { @@ -568,6 +576,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") { testSizeOfMap(sizeOfNull = null) } + // size(null) should return null under ansi mode. + withSQLConf( + SQLConf.LEGACY_SIZE_OF_NULL.key -> "true", + SQLConf.ANSI_ENABLED.key -> "true") { + testSizeOfMap(sizeOfNull = null) + } } test("map_keys/map_values function") { @@ -651,8 +665,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(null) ) - checkAnswer(df1.selectExpr("map_concat(map1, map2)"), expected1a) - checkAnswer(df1.select(map_concat($"map1", $"map2")), expected1a) + intercept[SparkException](df1.selectExpr("map_concat(map1, map2)").collect()) + intercept[SparkException](df1.select(map_concat($"map1", $"map2")).collect()) + withSQLConf(SQLConf.MAP_KEY_DEDUP_POLICY.key -> SQLConf.MapKeyDedupPolicy.LAST_WIN.toString) { + checkAnswer(df1.selectExpr("map_concat(map1, map2)"), expected1a) + checkAnswer(df1.select(map_concat($"map1", $"map2")), expected1a) + } val expected1b = Seq( Row(Map(1 -> 100, 2 -> 200)), @@ -1011,7 +1029,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Timestamp.valueOf("2018-01-01 12:00:00"), Timestamp.valueOf("2018-01-02 00:00:00"))))) - DateTimeTestUtils.withDefaultTimeZone(TimeZone.getTimeZone("UTC")) { + DateTimeTestUtils.withDefaultTimeZone(UTC) { checkAnswer( spark.sql("select sequence(" + " cast('2018-01-01' as date)" + @@ -1515,6 +1533,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { assert(e.getMessage.contains("string, binary or array")) } + test("SPARK-31227: Non-nullable null type should not coerce to nullable type in concat") { + val actual = spark.range(1).selectExpr("concat(array(), array(1)) as arr") + val expected = spark.range(1).selectExpr("array(1) as arr") + checkAnswer(actual, expected) + assert(actual.schema === expected.schema) + } + test("flatten function") { // Test cases with a primitive type val intDF = Seq( @@ -3068,11 +3093,19 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(dfExample2.select(transform_keys(col("j"), (k, v) => k + v)), Seq(Row(Map(2.0 -> 1.0, 3.4 -> 1.4, 4.7 -> 1.7)))) - checkAnswer(dfExample3.selectExpr("transform_keys(x, (k, v) -> k % 2 = 0 OR v)"), - Seq(Row(Map(true -> true, true -> false)))) + intercept[SparkException] { + dfExample3.selectExpr("transform_keys(x, (k, v) -> k % 2 = 0 OR v)").collect() + } + intercept[SparkException] { + dfExample3.select(transform_keys(col("x"), (k, v) => k % 2 === 0 || v)).collect() + } + withSQLConf(SQLConf.MAP_KEY_DEDUP_POLICY.key -> SQLConf.MapKeyDedupPolicy.LAST_WIN.toString) { + checkAnswer(dfExample3.selectExpr("transform_keys(x, (k, v) -> k % 2 = 0 OR v)"), + Seq(Row(Map(true -> true, true -> false)))) - checkAnswer(dfExample3.select(transform_keys(col("x"), (k, v) => k % 2 === 0 || v)), - Seq(Row(Map(true -> true, true -> false)))) + checkAnswer(dfExample3.select(transform_keys(col("x"), (k, v) => k % 2 === 0 || v)), + Seq(Row(Map(true -> true, true -> false)))) + } checkAnswer(dfExample3.selectExpr("transform_keys(x, (k, v) -> if(v, 2 * k, 3 * 
k))"), Seq(Row(Map(50 -> true, 78 -> false)))) @@ -3499,16 +3532,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } - test("SPARK-21281 use string types by default if array and map have no argument") { - val ds = spark.range(1) - var expectedSchema = new StructType() - .add("x", ArrayType(StringType, containsNull = false), nullable = false) - assert(ds.select(array().as("x")).schema == expectedSchema) - expectedSchema = new StructType() - .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false) - assert(ds.select(map().as("x")).schema == expectedSchema) - } - test("SPARK-21281 fails if functions have no argument") { val df = Seq(1).toDF("a") @@ -3563,19 +3586,40 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Seq(Row(1))) } - test("the like function with the escape parameter") { - val df = Seq(("abc", "a_c", "!")).toDF("str", "pattern", "escape") - checkAnswer(df.selectExpr("like(str, pattern, '@')"), Row(true)) + test("SPARK-29462: Empty array of NullType for array function with no arguments") { + Seq((true, StringType), (false, NullType)).foreach { + case (arrayDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + arrayDefaultToString.toString) { + val schema = spark.range(1).select(array()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType]) + val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType + assert(actualType === expectedType) + } + } + } - val longEscapeError = intercept[AnalysisException] { - df.selectExpr("like(str, pattern, '@%')").collect() - }.getMessage - assert(longEscapeError.contains("The 'escape' parameter must be a string literal of one char")) + test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") { + Seq((true, StringType), (false, NullType)).foreach { + case (mapDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + mapDefaultToString.toString) { + val schema = spark.range(1).select(map()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType]) + val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType + val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType + assert(actualKeyType === expectedType) + assert(actualValueType === expectedType) + } + } + } - val nonFoldableError = intercept[AnalysisException] { - df.selectExpr("like(str, pattern, escape)").collect() - }.getMessage - assert(nonFoldableError.contains("The 'escape' parameter must be a string literal")) + test("SPARK-26071: convert map to array and use as map key") { + val df = Seq(Map(1 -> "a")).toDF("m") + intercept[AnalysisException](df.select(map($"m", lit(1)))) + checkAnswer( + df.select(map(map_entries($"m"), lit(1))), + Row(Map(Seq(Row(1, "a")) -> 1))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index c7545bcad8962..a49f95f1ed134 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -17,14 +17,21 @@ package org.apache.spark.sql +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.catalyst.TableIdentifier import 
org.apache.spark.sql.catalyst.plans.{Inner, InnerLike, LeftOuter, RightOuter} -import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, Filter, HintInfo, Join, JoinHint, LogicalPlan, Project} +import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ class DataFrameJoinSuite extends QueryTest with SharedSparkSession @@ -322,4 +329,132 @@ class DataFrameJoinSuite extends QueryTest } } } + + test("Supports multi-part names for broadcast hint resolution") { + val (table1Name, table2Name) = ("t1", "t2") + + withTempDatabase { dbName => + withTable(table1Name, table2Name) { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + spark.range(50).write.saveAsTable(s"$dbName.$table1Name") + spark.range(100).write.saveAsTable(s"$dbName.$table2Name") + + def checkIfHintApplied(df: DataFrame): Unit = { + val sparkPlan = df.queryExecution.executedPlan + val broadcastHashJoins = sparkPlan.collect { case p: BroadcastHashJoinExec => p } + assert(broadcastHashJoins.size == 1) + val broadcastExchanges = broadcastHashJoins.head.collect { + case p: BroadcastExchangeExec => p + } + assert(broadcastExchanges.size == 1) + val tables = broadcastExchanges.head.collect { + case FileSourceScanExec(_, _, _, _, _, _, Some(tableIdent)) => tableIdent + } + assert(tables.size == 1) + assert(tables.head === TableIdentifier(table1Name, Some(dbName))) + } + + def checkIfHintNotApplied(df: DataFrame): Unit = { + val sparkPlan = df.queryExecution.executedPlan + val broadcastHashJoins = sparkPlan.collect { case p: BroadcastHashJoinExec => p } + assert(broadcastHashJoins.isEmpty) + } + + def sqlTemplate(tableName: String, hintTableName: String): DataFrame = { + sql(s"SELECT /*+ BROADCASTJOIN($hintTableName) */ * " + + s"FROM $tableName, $dbName.$table2Name " + + s"WHERE $tableName.id = $table2Name.id") + } + + def dfTemplate(tableName: String, hintTableName: String): DataFrame = { + spark.table(tableName).join(spark.table(s"$dbName.$table2Name"), "id") + .hint("broadcast", hintTableName) + } + + sql(s"USE $dbName") + + checkIfHintApplied(sqlTemplate(table1Name, table1Name)) + checkIfHintApplied(sqlTemplate(s"$dbName.$table1Name", s"$dbName.$table1Name")) + checkIfHintApplied(sqlTemplate(s"$dbName.$table1Name", table1Name)) + checkIfHintNotApplied(sqlTemplate(table1Name, s"$dbName.$table1Name")) + + checkIfHintApplied(dfTemplate(table1Name, table1Name)) + checkIfHintApplied(dfTemplate(s"$dbName.$table1Name", s"$dbName.$table1Name")) + checkIfHintApplied(dfTemplate(s"$dbName.$table1Name", table1Name)) + checkIfHintApplied(dfTemplate(table1Name, s"$dbName.$table1Name")) + checkIfHintApplied(dfTemplate(table1Name, + s"${CatalogManager.SESSION_CATALOG_NAME}.$dbName.$table1Name")) + + withView("tv") { + sql(s"CREATE VIEW tv AS SELECT * FROM $dbName.$table1Name") + checkIfHintApplied(sqlTemplate("tv", "tv")) + checkIfHintNotApplied(sqlTemplate("tv", s"$dbName.tv")) + + checkIfHintApplied(dfTemplate("tv", "tv")) + 
checkIfHintApplied(dfTemplate("tv", s"$dbName.tv")) + } + } + } + } + } + + test("The same table name exists in two databases for broadcast hint resolution") { + val (db1Name, db2Name) = ("db1", "db2") + + withDatabase(db1Name, db2Name) { + withTable("t") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + sql(s"CREATE DATABASE $db1Name") + sql(s"CREATE DATABASE $db2Name") + spark.range(1).write.saveAsTable(s"$db1Name.t") + spark.range(1).write.saveAsTable(s"$db2Name.t") + + // Checks if a broadcast hint applied in both sides + val statement = s"SELECT /*+ BROADCASTJOIN(t) */ * FROM $db1Name.t, $db2Name.t " + + s"WHERE $db1Name.t.id = $db2Name.t.id" + sql(statement).queryExecution.optimizedPlan match { + case Join(_, _, _, _, JoinHint(Some(HintInfo(Some(BROADCAST))), + Some(HintInfo(Some(BROADCAST))))) => + case _ => fail("broadcast hint not found in both tables") + } + } + } + } + } + + test("SPARK-32693: Compare two dataframes with same schema except nullable property") { + val schema1 = StructType( + StructField("a", IntegerType, false) :: + StructField("b", IntegerType, false) :: + StructField("c", IntegerType, false) :: Nil) + val rowSeq1: List[Row] = List(Row(10, 1, 1), Row(10, 50, 2)) + val df1 = spark.createDataFrame(rowSeq1.asJava, schema1) + + val schema2 = StructType( + StructField("a", IntegerType) :: + StructField("b", IntegerType) :: + StructField("c", IntegerType) :: Nil) + val rowSeq2: List[Row] = List(Row(10, 1, 1)) + val df2 = spark.createDataFrame(rowSeq2.asJava, schema2) + + checkAnswer(df1.except(df2), Row(10, 50, 2)) + + val schema3 = StructType( + StructField("a", IntegerType, false) :: + StructField("b", IntegerType, false) :: + StructField("c", IntegerType, false) :: + StructField("d", schema1, false) :: Nil) + val rowSeq3: List[Row] = List(Row(10, 1, 1, Row(10, 1, 1)), Row(10, 50, 2, Row(10, 50, 2))) + val df3 = spark.createDataFrame(rowSeq3.asJava, schema3) + + val schema4 = StructType( + StructField("a", IntegerType) :: + StructField("b", IntegerType) :: + StructField("b", IntegerType) :: + StructField("d", schema2) :: Nil) + val rowSeq4: List[Row] = List(Row(10, 1, 1, Row(10, 1, 1))) + val df4 = spark.createDataFrame(rowSeq4.asJava, schema4) + + checkAnswer(df3.except(df4), Row(10, 50, 2, Row(10, 50, 2))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala index fb1ca69b6f73f..091877f7cac37 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala @@ -45,6 +45,16 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { ).toDF("int", "long", "short", "byte", "float", "double") } + def createDFWithNestedColumns: DataFrame = { + val schema = new StructType() + .add("c1", new StructType() + .add("c1-1", StringType) + .add("c1-2", StringType)) + val data = Seq(Row(Row(null, "a2")), Row(Row("b1", "b2")), Row(null)) + spark.createDataFrame( + spark.sparkContext.parallelize(data), schema) + } + test("drop") { val input = createDF() val rows = input.collect() @@ -275,33 +285,35 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { assert(message.contains("Reference 'f2' is ambiguous")) } - test("fill/drop with col(*)") { + test("fill with col(*)") { val df = createDF() // If columns are specified with "*", they are ignored. 
checkAnswer(df.na.fill("new name", Seq("*")), df.collect()) - checkAnswer(df.na.drop("any", Seq("*")), df.collect()) } - test("fill/drop with nested columns") { - val schema = new StructType() - .add("c1", new StructType() - .add("c1-1", StringType) - .add("c1-2", StringType)) + test("drop with col(*)") { + val df = createDF() + val exception = intercept[AnalysisException] { + df.na.drop("any", Seq("*")) + } + assert(exception.getMessage.contains("Cannot resolve column name \"*\"")) + } - val data = Seq( - Row(Row(null, "a2")), - Row(Row("b1", "b2")), - Row(null)) + test("fill with nested columns") { + val df = createDFWithNestedColumns - val df = spark.createDataFrame( - spark.sparkContext.parallelize(data), schema) + // Nested columns are ignored for fill(). + checkAnswer(df.na.fill("a1", Seq("c1.c1-1")), df) + } - checkAnswer(df.select("c1.c1-1"), - Row(null) :: Row("b1") :: Row(null) :: Nil) + test("drop with nested columns") { + val df = createDFWithNestedColumns - // Nested columns are ignored for fill() and drop(). - checkAnswer(df.na.fill("a1", Seq("c1.c1-1")), data) - checkAnswer(df.na.drop("any", Seq("c1.c1-1")), data) + // Rows with the specified nested columns whose null values are dropped. + assert(df.count == 3) + checkAnswer( + df.na.drop("any", Seq("c1.c1-1")), + Seq(Row(Row("b1", "b2")))) } test("replace") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala index 51c6a835d58d6..51d861ec69b23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala @@ -258,7 +258,7 @@ class DataFramePivotSuite extends QueryTest with SharedSparkSession { val ts = "2012-12-31 16:00:10.011" val tsWithZone = "2013-01-01 00:00:10.011" - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "GMT") { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { val df = Seq(java.sql.Timestamp.valueOf(ts)).toDF("a").groupBy("a").pivot("a").count() val expected = StructType( StructField("a", TimestampType) :: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala index 250ec7dc0ba5a..3b3b54f75da57 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{count, sum} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -202,4 +203,20 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { assertAmbiguousSelfJoin(df1.join(df4).join(df2).select(df2("id"))) } } + + test("SPARK-28344: don't fail if there is no ambiguous self join") { + withSQLConf( + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true") { + val df = Seq(1, 1, 2, 2).toDF("a") + val w = Window.partitionBy(df("a")) + checkAnswer( + df.select(df("a").alias("x"), sum(df("a")).over(w)), + Seq((1, 2), (1, 2), (2, 4), (2, 4)).map(Row.fromTuple)) + + val joined = df.join(spark.range(1)).select($"a") + checkAnswer( + joined.select(joined("a").alias("x"), sum(joined("a")).over(w)), + Seq((1, 2), (1, 2), (2, 4), (2, 4)).map(Row.fromTuple)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 394bad751b5ce..19601726fa393 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -126,6 +126,32 @@ class DataFrameStatSuite extends QueryTest with SharedSparkSession { assert(math.abs(corr3 - 0.95723391394758572) < 1e-12) } + test("SPARK-30532 stat functions to understand fully-qualified column name") { + val df1 = spark.sparkContext.parallelize(0 to 10).toDF("num").as("table1") + val df2 = spark.sparkContext.parallelize(0 to 10).toDF("num").as("table2") + val dfx = df2.crossJoin(df1) + + assert(dfx.stat.corr("table1.num", "table2.num") != 0.0) + assert(dfx.stat.cov("table1.num", "table2.num") != 0.0) + assert(dfx.stat.approxQuantile("table1.num", Array(0.1), 0.0).length == 1) + assert(dfx.stat.approxQuantile("table2.num", Array(0.1), 0.0).length == 1) + assert(dfx.stat.freqItems(Array("table1.num", "table2.num")).collect()(0).length == 2) + + // this should throw "Reference 'num' is ambiguous" + intercept[AnalysisException] { + dfx.stat.freqItems(Array("num")) + } + intercept[AnalysisException] { + dfx.stat.approxQuantile("num", Array(0.1), 0.0) + } + intercept[AnalysisException] { + dfx.stat.cov("num", "num") + } + intercept[AnalysisException] { + dfx.stat.corr("num", "num") + } + } + test("covariance") { val df = Seq.tabulate(10)(i => (i, 2.0 * i, toLetter(i))).toDF("singles", "doubles", "letters") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d2d58a83ded5d..7c410e862e10c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -23,6 +23,7 @@ import java.sql.{Date, Timestamp} import java.util.UUID import java.util.concurrent.atomic.AtomicLong +import scala.reflect.runtime.universe.TypeTag import scala.util.Random import org.scalatest.Matchers._ @@ -30,10 +31,12 @@ import org.scalatest.Matchers._ import org.apache.spark.SparkException import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions.Uuid import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, OneRowRelation, Union} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, OneRowRelation} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.FakeV2Provider import org.apache.spark.sql.execution.{FilterExec, QueryExecution, WholeStageCodegenExec} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.HashAggregateExec @@ -41,8 +44,9 @@ import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExc import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSparkSession} -import org.apache.spark.sql.test.SQLTestData.{DecimalData, NullStrings, TestData2} +import org.apache.spark.sql.test.SQLTestData.{DecimalData, TestData2} import org.apache.spark.sql.types._ +import 
org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom @@ -109,6 +113,31 @@ class DataFrameSuite extends QueryTest dfAlias.col("t2.c") } + test("simple explode") { + val df = Seq(Tuple1("a b c"), Tuple1("d e")).toDF("words") + + checkAnswer( + df.explode("words", "word") { word: String => word.split(" ").toSeq }.select('word), + Row("a") :: Row("b") :: Row("c") :: Row("d") ::Row("e") :: Nil + ) + } + + test("explode") { + val df = Seq((1, "a b c"), (2, "a b"), (3, "a")).toDF("number", "letters") + val df2 = + df.explode('letters) { + case Row(letters: String) => letters.split(" ").map(Tuple1(_)).toSeq + } + + checkAnswer( + df2 + .select('_1 as 'letter, 'number) + .groupBy('letter) + .agg(countDistinct('number)), + Row("a", 3) :: Row("b", 2) :: Row("c", 1) :: Nil + ) + } + test("Star Expansion - CreateStruct and CreateArray") { val structDf = testData2.select("a", "b").as("record") // CreateStruct and CreateArray in aggregateExpressions @@ -185,6 +214,27 @@ class DataFrameSuite extends QueryTest } } + test("Star Expansion - ds.explode should fail with a meaningful message if it takes a star") { + val df = Seq(("1", "1,2"), ("2", "4"), ("3", "7,8,9")).toDF("prefix", "csv") + val e = intercept[AnalysisException] { + df.explode($"*") { case Row(prefix: String, csv: String) => + csv.split(",").map(v => Tuple1(prefix + ":" + v)).toSeq + }.queryExecution.assertAnalyzed() + } + assert(e.getMessage.contains("Invalid usage of '*' in explode/json_tuple/UDTF")) + + checkAnswer( + df.explode('prefix, 'csv) { case Row(prefix: String, csv: String) => + csv.split(",").map(v => Tuple1(prefix + ":" + v)).toSeq + }, + Row("1", "1,2", "1:1") :: + Row("1", "1,2", "1:2") :: + Row("2", "4", "2:4") :: + Row("3", "7,8,9", "3:7") :: + Row("3", "7,8,9", "3:8") :: + Row("3", "7,8,9", "3:9") :: Nil) + } + test("Star Expansion - explode should fail with a meaningful message if it takes a star") { val df = Seq(("1,2"), ("4"), ("7,8,9")).toDF("csv") val e = intercept[AnalysisException] { @@ -196,7 +246,7 @@ class DataFrameSuite extends QueryTest test("explode on output of array-valued function") { val df = Seq(("1,2"), ("4"), ("7,8,9")).toDF("csv") checkAnswer( - df.select(explode(split($"csv", ","))), + df.select(explode(split($"csv", pattern = ","))), Row("1") :: Row("2") :: Row("4") :: Row("7") :: Row("8") :: Row("9") :: Nil) } @@ -330,7 +380,7 @@ class DataFrameSuite extends QueryTest testData.select("key").coalesce(1).select("key"), testData.select("key").collect().toSeq) - assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 1) + assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 0) } test("convert $\"attribute name\" into unresolved attribute") { @@ -1167,7 +1217,7 @@ class DataFrameSuite extends QueryTest |""".stripMargin assert(df.showString(1, truncate = 0) === expectedAnswer) - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "GMT") { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { val expectedAnswer = """+----------+-------------------+ ||d |ts | @@ -1188,7 +1238,7 @@ class DataFrameSuite extends QueryTest " ts | 2016-12-01 00:00:00 \n" assert(df.showString(1, truncate = 0, vertical = true) === expectedAnswer) - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "GMT") { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { val expectedAnswer = "-RECORD 0------------------\n" + " d | 2016-12-01 \n" + @@ -1335,47 +1385,48 @@ class DataFrameSuite extends QueryTest test("SPARK-6941: Better 
error message for inserting into RDD-based Table") { withTempDir { dir => + withTempView("parquet_base", "json_base", "rdd_base", "indirect_ds", "one_row") { + val tempParquetFile = new File(dir, "tmp_parquet") + val tempJsonFile = new File(dir, "tmp_json") + + val df = Seq(Tuple1(1)).toDF() + val insertion = Seq(Tuple1(2)).toDF("col") + + // pass case: parquet table (HadoopFsRelation) + df.write.mode(SaveMode.Overwrite).parquet(tempParquetFile.getCanonicalPath) + val pdf = spark.read.parquet(tempParquetFile.getCanonicalPath) + pdf.createOrReplaceTempView("parquet_base") + + insertion.write.insertInto("parquet_base") + + // pass case: json table (InsertableRelation) + df.write.mode(SaveMode.Overwrite).json(tempJsonFile.getCanonicalPath) + val jdf = spark.read.json(tempJsonFile.getCanonicalPath) + jdf.createOrReplaceTempView("json_base") + insertion.write.mode(SaveMode.Overwrite).insertInto("json_base") + + // error cases: insert into an RDD + df.createOrReplaceTempView("rdd_base") + val e1 = intercept[AnalysisException] { + insertion.write.insertInto("rdd_base") + } + assert(e1.getMessage.contains("Inserting into an RDD-based table is not allowed.")) - val tempParquetFile = new File(dir, "tmp_parquet") - val tempJsonFile = new File(dir, "tmp_json") - - val df = Seq(Tuple1(1)).toDF() - val insertion = Seq(Tuple1(2)).toDF("col") - - // pass case: parquet table (HadoopFsRelation) - df.write.mode(SaveMode.Overwrite).parquet(tempParquetFile.getCanonicalPath) - val pdf = spark.read.parquet(tempParquetFile.getCanonicalPath) - pdf.createOrReplaceTempView("parquet_base") - - insertion.write.insertInto("parquet_base") - - // pass case: json table (InsertableRelation) - df.write.mode(SaveMode.Overwrite).json(tempJsonFile.getCanonicalPath) - val jdf = spark.read.json(tempJsonFile.getCanonicalPath) - jdf.createOrReplaceTempView("json_base") - insertion.write.mode(SaveMode.Overwrite).insertInto("json_base") - - // error cases: insert into an RDD - df.createOrReplaceTempView("rdd_base") - val e1 = intercept[AnalysisException] { - insertion.write.insertInto("rdd_base") - } - assert(e1.getMessage.contains("Inserting into an RDD-based table is not allowed.")) - - // error case: insert into a logical plan that is not a LeafNode - val indirectDS = pdf.select("_1").filter($"_1" > 5) - indirectDS.createOrReplaceTempView("indirect_ds") - val e2 = intercept[AnalysisException] { - insertion.write.insertInto("indirect_ds") - } - assert(e2.getMessage.contains("Inserting into an RDD-based table is not allowed.")) + // error case: insert into a logical plan that is not a LeafNode + val indirectDS = pdf.select("_1").filter($"_1" > 5) + indirectDS.createOrReplaceTempView("indirect_ds") + val e2 = intercept[AnalysisException] { + insertion.write.insertInto("indirect_ds") + } + assert(e2.getMessage.contains("Inserting into an RDD-based table is not allowed.")) - // error case: insert into an OneRowRelation - Dataset.ofRows(spark, OneRowRelation()).createOrReplaceTempView("one_row") - val e3 = intercept[AnalysisException] { - insertion.write.insertInto("one_row") + // error case: insert into an OneRowRelation + Dataset.ofRows(spark, OneRowRelation()).createOrReplaceTempView("one_row") + val e3 = intercept[AnalysisException] { + insertion.write.insertInto("one_row") + } + assert(e3.getMessage.contains("Inserting into an RDD-based table is not allowed.")) } - assert(e3.getMessage.contains("Inserting into an RDD-based table is not allowed.")) } } @@ -1747,13 +1798,17 @@ class DataFrameSuite extends QueryTest val df = 
Seq("foo", "bar").map(Tuple1.apply).toDF("col") // invalid table names Seq("11111", "t~", "#$@sum", "table!#").foreach { name => - val m = intercept[AnalysisException](df.createOrReplaceTempView(name)).getMessage - assert(m.contains(s"Invalid view name: $name")) + withTempView(name) { + val m = intercept[AnalysisException](df.createOrReplaceTempView(name)).getMessage + assert(m.contains(s"Invalid view name: $name")) + } } // valid table names Seq("table1", "`11111`", "`t~`", "`#$@sum`", "`table!#`").foreach { name => - df.createOrReplaceTempView(name) + withTempView(name) { + df.createOrReplaceTempView(name) + } } } @@ -2298,6 +2353,133 @@ class DataFrameSuite extends QueryTest fail("emptyDataFrame should be foldable") } } + + test("SPARK-30811: CTE should not cause stack overflow when " + + "it refers to non-existent table with same name") { + val e = intercept[AnalysisException] { + sql("WITH t AS (SELECT 1 FROM nonexist.t) SELECT * FROM t") + } + assert(e.getMessage.contains("Table or view not found:")) + } + + test("SPARK-32680: Don't analyze CTAS with unresolved query") { + val v2Source = classOf[FakeV2Provider].getName + val e = intercept[AnalysisException] { + sql(s"CREATE TABLE t USING $v2Source AS SELECT * from nonexist") + } + assert(e.getMessage.contains("Table or view not found:")) + } + + test("CalendarInterval reflection support") { + val df = Seq((1, new CalendarInterval(1, 2, 3))).toDF("a", "b") + checkAnswer(df.selectExpr("b"), Row(new CalendarInterval(1, 2, 3))) + } + + test("SPARK-31552: array encoder with different types") { + // primitives + val booleans = Array(true, false) + checkAnswer(Seq(booleans).toDF(), Row(booleans)) + + val bytes = Array(1.toByte, 2.toByte) + checkAnswer(Seq(bytes).toDF(), Row(bytes)) + val shorts = Array(1.toShort, 2.toShort) + checkAnswer(Seq(shorts).toDF(), Row(shorts)) + val ints = Array(1, 2) + checkAnswer(Seq(ints).toDF(), Row(ints)) + val longs = Array(1L, 2L) + checkAnswer(Seq(longs).toDF(), Row(longs)) + + val floats = Array(1.0F, 2.0F) + checkAnswer(Seq(floats).toDF(), Row(floats)) + val doubles = Array(1.0D, 2.0D) + checkAnswer(Seq(doubles).toDF(), Row(doubles)) + + val strings = Array("2020-04-24", "2020-04-25") + checkAnswer(Seq(strings).toDF(), Row(strings)) + + // tuples + val decOne = Decimal(1, 38, 18) + val decTwo = Decimal(2, 38, 18) + val tuple1 = (1, 2.2, "3.33", decOne, Date.valueOf("2012-11-22")) + val tuple2 = (2, 3.3, "4.44", decTwo, Date.valueOf("2022-11-22")) + checkAnswer(Seq(Array(tuple1, tuple2)).toDF(), Seq(Seq(tuple1, tuple2)).toDF()) + + // case classes + val gbks = Array(GroupByKey(1, 2), GroupByKey(4, 5)) + checkAnswer(Seq(gbks).toDF(), Row(Array(Row(1, 2), Row(4, 5)))) + + // We can move this implicit def to [[SQLImplicits]] when we eventually make fully + // support for array encoder like Seq and Set + // For now cases below, decimal/datetime/interval/binary/nested types, etc, + // are not supported by array + implicit def newArrayEncoder[T <: Array[_] : TypeTag]: Encoder[T] = ExpressionEncoder() + + // decimals + val decSpark = Array(decOne, decTwo) + val decScala = decSpark.map(_.toBigDecimal) + val decJava = decSpark.map(_.toJavaBigDecimal) + checkAnswer(Seq(decSpark).toDF(), Row(decJava)) + checkAnswer(Seq(decScala).toDF(), Row(decJava)) + checkAnswer(Seq(decJava).toDF(), Row(decJava)) + + // datetimes and intervals + val dates = strings.map(Date.valueOf) + checkAnswer(Seq(dates).toDF(), Row(dates)) + val localDates = dates.map(d => DateTimeUtils.daysToLocalDate(DateTimeUtils.fromJavaDate(d))) + 
checkAnswer(Seq(localDates).toDF(), Row(dates)) + + val timestamps = + Array(Timestamp.valueOf("2020-04-24 12:34:56"), Timestamp.valueOf("2020-04-24 11:22:33")) + checkAnswer(Seq(timestamps).toDF(), Row(timestamps)) + val instants = + timestamps.map(t => DateTimeUtils.microsToInstant(DateTimeUtils.fromJavaTimestamp(t))) + checkAnswer(Seq(instants).toDF(), Row(timestamps)) + + val intervals = Array(new CalendarInterval(1, 2, 3), new CalendarInterval(4, 5, 6)) + checkAnswer(Seq(intervals).toDF(), Row(intervals)) + + // binary + val bins = Array(Array(1.toByte), Array(2.toByte), Array(3.toByte), Array(4.toByte)) + checkAnswer(Seq(bins).toDF(), Row(bins)) + + // nested + val nestedIntArray = Array(Array(1), Array(2)) + checkAnswer(Seq(nestedIntArray).toDF(), Row(nestedIntArray.map(wrapIntArray))) + val nestedDecArray = Array(decSpark) + checkAnswer(Seq(nestedDecArray).toDF(), Row(Array(wrapRefArray(decJava)))) + } + + test("SPARK-31750: eliminate UpCast if child's dataType is DecimalType") { + withTempPath { f => + sql("select cast(1 as decimal(38, 0)) as d") + .write.mode("overwrite") + .parquet(f.getAbsolutePath) + + val df = spark.read.parquet(f.getAbsolutePath).as[BigDecimal] + assert(df.schema === new StructType().add(StructField("d", DecimalType(38, 0)))) + } + } + + test("SPARK-32764: -0.0 and 0.0 should be equal") { + val df = Seq(0.0 -> -0.0).toDF("pos", "neg") + checkAnswer(df.select($"pos" > $"neg"), Row(false)) + } + + test("SPARK-32635: Replace references with foldables coming only from the node's children") { + val a = Seq("1").toDF("col1").withColumn("col2", lit("1")) + val b = Seq("2").toDF("col1").withColumn("col2", lit("2")) + val aub = a.union(b) + val c = aub.filter($"col1" === "2").cache() + val d = Seq("2").toDF("col4") + val r = d.join(aub, $"col2" === $"col4").select("col4") + val l = c.select("col2") + val df = l.join(r, $"col2" === $"col4", "LeftOuter") + checkAnswer(df, Row("2", "2")) + } + + test("SPARK-32761: aggregating multiple distinct CONSTANT columns") { + checkAnswer(sql("select count(distinct 2), count(distinct 2,3)"), Row(1, 1)) + } } case class GroupByKey(a: Int, b: Int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index d398657ec0b6e..09ce43e4e2b27 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -61,26 +61,28 @@ class DataFrameWindowFunctionsSuite extends QueryTest } test("rank functions in unspecific window") { - val df = Seq((1, "1"), (2, "2"), (1, "2"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - max("key").over(Window.partitionBy("value").orderBy("key")), - min("key").over(Window.partitionBy("value").orderBy("key")), - mean("key").over(Window.partitionBy("value").orderBy("key")), - count("key").over(Window.partitionBy("value").orderBy("key")), - sum("key").over(Window.partitionBy("value").orderBy("key")), - ntile(2).over(Window.partitionBy("value").orderBy("key")), - row_number().over(Window.partitionBy("value").orderBy("key")), - dense_rank().over(Window.partitionBy("value").orderBy("key")), - rank().over(Window.partitionBy("value").orderBy("key")), - cume_dist().over(Window.partitionBy("value").orderBy("key")), - percent_rank().over(Window.partitionBy("value").orderBy("key"))), - Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 
1.0d, 0.0d) :: - Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 1.0d / 3.0d, 0.0d) :: - Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 1, 2, 2, 2, 1.0d, 0.5d) :: - Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 2, 3, 2, 2, 1.0d, 0.5d) :: Nil) + withTempView("window_table") { + val df = Seq((1, "1"), (2, "2"), (1, "2"), (2, "2")).toDF("key", "value") + df.createOrReplaceTempView("window_table") + checkAnswer( + df.select( + $"key", + max("key").over(Window.partitionBy("value").orderBy("key")), + min("key").over(Window.partitionBy("value").orderBy("key")), + mean("key").over(Window.partitionBy("value").orderBy("key")), + count("key").over(Window.partitionBy("value").orderBy("key")), + sum("key").over(Window.partitionBy("value").orderBy("key")), + ntile(2).over(Window.partitionBy("value").orderBy("key")), + row_number().over(Window.partitionBy("value").orderBy("key")), + dense_rank().over(Window.partitionBy("value").orderBy("key")), + rank().over(Window.partitionBy("value").orderBy("key")), + cume_dist().over(Window.partitionBy("value").orderBy("key")), + percent_rank().over(Window.partitionBy("value").orderBy("key"))), + Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 1.0d, 0.0d) :: + Row(1, 1, 1, 1.0d, 1, 1, 1, 1, 1, 1, 1.0d / 3.0d, 0.0d) :: + Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 1, 2, 2, 2, 1.0d, 0.5d) :: + Row(2, 2, 1, 5.0d / 3.0d, 3, 5, 2, 3, 2, 2, 1.0d, 0.5d) :: Nil) + } } test("window function should fail if order by clause is not specified") { @@ -348,15 +350,17 @@ class DataFrameWindowFunctionsSuite extends QueryTest } test("SPARK-16195 empty over spec") { - val df = Seq(("a", 1), ("a", 1), ("a", 2), ("b", 2)). - toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select($"key", $"value", sum($"value").over(), avg($"value").over()), - Seq(Row("a", 1, 6, 1.5), Row("a", 1, 6, 1.5), Row("a", 2, 6, 1.5), Row("b", 2, 6, 1.5))) - checkAnswer( - sql("select key, value, sum(value) over(), avg(value) over() from window_table"), - Seq(Row("a", 1, 6, 1.5), Row("a", 1, 6, 1.5), Row("a", 2, 6, 1.5), Row("b", 2, 6, 1.5))) + withTempView("window_table") { + val df = Seq(("a", 1), ("a", 1), ("a", 2), ("b", 2)). 
+ toDF("key", "value") + df.createOrReplaceTempView("window_table") + checkAnswer( + df.select($"key", $"value", sum($"value").over(), avg($"value").over()), + Seq(Row("a", 1, 6, 1.5), Row("a", 1, 6, 1.5), Row("a", 2, 6, 1.5), Row("b", 2, 6, 1.5))) + checkAnswer( + sql("select key, value, sum(value) over(), avg(value) over() from window_table"), + Seq(Row("a", 1, 6, 1.5), Row("a", 1, 6, 1.5), Row("a", 2, 6, 1.5), Row("b", 2, 6, 1.5))) + } } test("window function with udaf") { @@ -548,37 +552,41 @@ class DataFrameWindowFunctionsSuite extends QueryTest } test("aggregation and rows between with unbounded + predicate pushdown") { - val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - val selectList = Seq($"key", $"value", - last("key").over( - Window.partitionBy($"value").orderBy($"key").rowsBetween(0, Long.MaxValue)), - last("key").over( - Window.partitionBy($"value").orderBy($"key").rowsBetween(Long.MinValue, 0)), - last("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 1))) - - checkAnswer( - df.select(selectList: _*).where($"value" < "3"), - Seq(Row(1, "1", 1, 1, 1), Row(2, "2", 3, 2, 3), Row(3, "2", 3, 3, 3))) + withTempView("window_table") { + val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") + df.createOrReplaceTempView("window_table") + val selectList = Seq($"key", $"value", + last("key").over( + Window.partitionBy($"value").orderBy($"key").rowsBetween(0, Long.MaxValue)), + last("key").over( + Window.partitionBy($"value").orderBy($"key").rowsBetween(Long.MinValue, 0)), + last("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 1))) + + checkAnswer( + df.select(selectList: _*).where($"value" < "3"), + Seq(Row(1, "1", 1, 1, 1), Row(2, "2", 3, 2, 3), Row(3, "2", 3, 3, 3))) + } } test("aggregation and range between with unbounded + predicate pushdown") { - val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - val selectList = Seq($"key", $"value", - last("value").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(-2, -1)).equalTo("2") - .as("last_v"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(Long.MinValue, 1)) - .as("avg_key1"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(0, Long.MaxValue)) - .as("avg_key2"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(-1, 1)) - .as("avg_key3")) - - checkAnswer( - df.select(selectList: _*).where($"value" < 2), - Seq(Row(3, "1", null, 3.0, 4.0, 3.0), Row(5, "1", false, 4.0, 5.0, 5.0))) + withTempView("window_table") { + val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value") + df.createOrReplaceTempView("window_table") + val selectList = Seq($"key", $"value", + last("value").over( + Window.partitionBy($"value").orderBy($"key").rangeBetween(-2, -1)).equalTo("2") + .as("last_v"), + avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(Long.MinValue, 1)) + .as("avg_key1"), + avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(0, Long.MaxValue)) + .as("avg_key2"), + avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(-1, 1)) + .as("avg_key3")) + + checkAnswer( + df.select(selectList: _*).where($"value" < 2), + Seq(Row(3, "1", null, 3.0, 4.0, 3.0), Row(5, "1", false, 4.0, 5.0, 5.0))) + } } test("Window 
spill with less than the inMemoryThreshold") { @@ -665,40 +673,46 @@ class DataFrameWindowFunctionsSuite extends QueryTest } test("SPARK-24575: Window functions inside WHERE and HAVING clauses") { - def checkAnalysisError(df: => DataFrame): Unit = { + def checkAnalysisError(df: => DataFrame, clause: String): Unit = { val thrownException = the[AnalysisException] thrownBy { df.queryExecution.analyzed } - assert(thrownException.message.contains("window functions inside WHERE and HAVING clauses")) + assert(thrownException.message.contains(s"window functions inside $clause clause")) } - checkAnalysisError(testData2.select("a").where(rank().over(Window.orderBy($"b")) === 1)) - checkAnalysisError(testData2.where($"b" === 2 && rank().over(Window.orderBy($"b")) === 1)) + checkAnalysisError( + testData2.select("a").where(rank().over(Window.orderBy($"b")) === 1), "WHERE") + checkAnalysisError( + testData2.where($"b" === 2 && rank().over(Window.orderBy($"b")) === 1), "WHERE") checkAnalysisError( testData2.groupBy($"a") .agg(avg($"b").as("avgb")) - .where($"a" > $"avgb" && rank().over(Window.orderBy($"a")) === 1)) + .where($"a" > $"avgb" && rank().over(Window.orderBy($"a")) === 1), "WHERE") checkAnalysisError( testData2.groupBy($"a") .agg(max($"b").as("maxb"), sum($"b").as("sumb")) - .where(rank().over(Window.orderBy($"a")) === 1)) + .where(rank().over(Window.orderBy($"a")) === 1), "WHERE") checkAnalysisError( testData2.groupBy($"a") .agg(max($"b").as("maxb"), sum($"b").as("sumb")) - .where($"sumb" === 5 && rank().over(Window.orderBy($"a")) === 1)) + .where($"sumb" === 5 && rank().over(Window.orderBy($"a")) === 1), "WHERE") - checkAnalysisError(sql("SELECT a FROM testData2 WHERE RANK() OVER(ORDER BY b) = 1")) - checkAnalysisError(sql("SELECT * FROM testData2 WHERE b = 2 AND RANK() OVER(ORDER BY b) = 1")) + checkAnalysisError(sql("SELECT a FROM testData2 WHERE RANK() OVER(ORDER BY b) = 1"), "WHERE") + checkAnalysisError( + sql("SELECT * FROM testData2 WHERE b = 2 AND RANK() OVER(ORDER BY b) = 1"), "WHERE") checkAnalysisError( - sql("SELECT * FROM testData2 GROUP BY a HAVING a > AVG(b) AND RANK() OVER(ORDER BY a) = 1")) + sql("SELECT * FROM testData2 GROUP BY a HAVING a > AVG(b) AND RANK() OVER(ORDER BY a) = 1"), + "HAVING") checkAnalysisError( - sql("SELECT a, MAX(b), SUM(b) FROM testData2 GROUP BY a HAVING RANK() OVER(ORDER BY a) = 1")) + sql("SELECT a, MAX(b), SUM(b) FROM testData2 GROUP BY a HAVING RANK() OVER(ORDER BY a) = 1"), + "HAVING") checkAnalysisError( sql( s"""SELECT a, MAX(b) |FROM testData2 |GROUP BY a - |HAVING SUM(b) = 5 AND RANK() OVER(ORDER BY a) = 1""".stripMargin)) + |HAVING SUM(b) = 5 AND RANK() OVER(ORDER BY a) = 1""".stripMargin), + "HAVING") } test("window functions in multiple selects") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala index d49dc58e93ddb..ff5c6242987de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala @@ -17,20 +17,23 @@ package org.apache.spark.sql +import java.sql.Timestamp + import scala.collection.JavaConverters._ import org.scalatest.BeforeAndAfter -import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException} -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} -import 
org.apache.spark.sql.connector.InMemoryTableCatalog +import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NamedRelation, NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, V2WriteCommand} +import org.apache.spark.sql.connector.{InMemoryTable, InMemoryTableCatalog} import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} import org.apache.spark.sql.connector.expressions.{BucketTransform, DaysTransform, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, YearsTransform} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, LongType, MapType, StringType, StructField, StructType, TimestampType} import org.apache.spark.sql.util.QueryExecutionListener +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with BeforeAndAfter { @@ -67,7 +70,7 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo plan = qe.analyzed } - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} } spark.listenerManager.register(listener) @@ -97,6 +100,86 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo assert(v2.catalog.exists(_ == catalogPlugin)) } + case class FakeV2WriteCommand(table: NamedRelation, query: LogicalPlan) extends V2WriteCommand + + test("SPARK-33136 output resolved on complex types for V2 write commands") { + val tableCatalog = catalog("testcat") + + def assertTypeCompatibility(name: String, fromType: DataType, toType: DataType): Unit = { + val fromTableName = s"from_table_$name" + tableCatalog.createTable( + Identifier.of(Array(), fromTableName), + StructType(Array(StructField("col", fromType))), + Array.empty, + new java.util.HashMap[String, String]()) + + val toTable = tableCatalog.createTable( + Identifier.of(Array(), s"to_table_$name"), + StructType(Array(StructField("col", toType))), + Array.empty, + new java.util.HashMap[String, String]()) + + val df = spark.table(s"testcat.$fromTableName") + + val relation = DataSourceV2Relation.create(toTable, Some(tableCatalog), None) + val writeCommand = FakeV2WriteCommand(relation, df.queryExecution.analyzed) + + assert(writeCommand.outputResolved, s"Unable to write from type $fromType to type $toType.") + } + + // The major difference between `from` and `to` is that `from` is a complex type + // with non-nullable, whereas `to` is same data type with flipping nullable. 
+ + // nested struct type + val fromStructType = StructType(Array( + StructField("s", StringType), + StructField("i_nonnull", IntegerType, nullable = false), + StructField("st", StructType(Array( + StructField("l", LongType), + StructField("s_nonnull", StringType, nullable = false)))))) + + val toStructType = StructType(Array( + StructField("s", StringType), + StructField("i_nonnull", IntegerType), + StructField("st", StructType(Array( + StructField("l", LongType), + StructField("s_nonnull", StringType)))))) + + assertTypeCompatibility("struct", fromStructType, toStructType) + + // array type + assertTypeCompatibility("array", ArrayType(LongType, containsNull = false), + ArrayType(LongType, containsNull = true)) + + // array type with struct type + val fromArrayWithStructType = ArrayType( + StructType(Array(StructField("s", StringType, nullable = false))), + containsNull = false) + + val toArrayWithStructType = ArrayType( + StructType(Array(StructField("s", StringType))), + containsNull = true) + + assertTypeCompatibility("array_struct", fromArrayWithStructType, toArrayWithStructType) + + // map type + assertTypeCompatibility("map", MapType(IntegerType, StringType, valueContainsNull = false), + MapType(IntegerType, StringType, valueContainsNull = true)) + + // map type with struct type + val fromMapWithStructType = MapType( + IntegerType, + StructType(Array(StructField("s", StringType, nullable = false))), + valueContainsNull = false) + + val toMapWithStructType = MapType( + IntegerType, + StructType(Array(StructField("s", StringType))), + valueContainsNull = true) + + assertTypeCompatibility("map_struct", fromMapWithStructType, toMapWithStructType) + } + test("Append: basic append") { spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") @@ -332,7 +415,6 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo spark.table("source") .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) .writeTo("testcat.table_name") - .tableProperty("allow-unsupported-transforms", "true") .partitionedBy(years($"ts")) .create() @@ -346,7 +428,6 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo spark.table("source") .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) .writeTo("testcat.table_name") - .tableProperty("allow-unsupported-transforms", "true") .partitionedBy(months($"ts")) .create() @@ -360,7 +441,6 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo spark.table("source") .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) .writeTo("testcat.table_name") - .tableProperty("allow-unsupported-transforms", "true") .partitionedBy(days($"ts")) .create() @@ -374,7 +454,6 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo spark.table("source") .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) .writeTo("testcat.table_name") - .tableProperty("allow-unsupported-transforms", "true") .partitionedBy(hours($"ts")) .create() @@ -387,7 +466,6 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo test("Create: partitioned by bucket(4, id)") { spark.table("source") .writeTo("testcat.table_name") - .tableProperty("allow-unsupported-transforms", "true") .partitionedBy(bucket(4, $"id")) .create() @@ -550,4 +628,82 @@ class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with Befo assert(replaced.partitioning.isEmpty) assert(replaced.properties === 
defaultOwnership.asJava) } + + test("SPARK-30289 Create: partitioned by nested column") { + val schema = new StructType().add("ts", new StructType() + .add("created", TimestampType) + .add("modified", TimestampType) + .add("timezone", StringType)) + + val data = Seq( + Row(Row(Timestamp.valueOf("2019-06-01 10:00:00"), Timestamp.valueOf("2019-09-02 07:00:00"), + "America/Los_Angeles")), + Row(Row(Timestamp.valueOf("2019-08-26 18:00:00"), Timestamp.valueOf("2019-09-26 18:00:00"), + "America/Los_Angeles")), + Row(Row(Timestamp.valueOf("2018-11-23 18:00:00"), Timestamp.valueOf("2018-12-22 18:00:00"), + "America/New_York"))) + val df = spark.createDataFrame(spark.sparkContext.parallelize(data, 1), schema) + + df.writeTo("testcat.table_name") + .partitionedBy($"ts.timezone") + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + .asInstanceOf[InMemoryTable] + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(IdentityTransform(FieldReference(Array("ts", "timezone"))))) + checkAnswer(spark.table(table.name), data) + assert(table.dataMap.toArray.length == 2) + assert(table.dataMap(Seq(UTF8String.fromString("America/Los_Angeles"))).rows.size == 2) + assert(table.dataMap(Seq(UTF8String.fromString("America/New_York"))).rows.size == 1) + + // TODO: `DataSourceV2Strategy` can not translate nested fields into source filter yet + // so the following sql will fail. + // sql("DELETE FROM testcat.table_name WHERE ts.timezone = \"America/Los_Angeles\"") + } + + test("SPARK-30289 Create: partitioned by multiple transforms on nested columns") { + spark.table("source") + .withColumn("ts", struct( + lit("2019-06-01 10:00:00.000000").cast("timestamp") as "created", + lit("2019-09-02 07:00:00.000000").cast("timestamp") as "modified", + lit("America/Los_Angeles") as "timezone")) + .writeTo("testcat.table_name") + .partitionedBy( + years($"ts.created"), months($"ts.created"), days($"ts.created"), hours($"ts.created"), + years($"ts.modified"), months($"ts.modified"), days($"ts.modified"), hours($"ts.modified") + ) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq( + YearsTransform(FieldReference(Array("ts", "created"))), + MonthsTransform(FieldReference(Array("ts", "created"))), + DaysTransform(FieldReference(Array("ts", "created"))), + HoursTransform(FieldReference(Array("ts", "created"))), + YearsTransform(FieldReference(Array("ts", "modified"))), + MonthsTransform(FieldReference(Array("ts", "modified"))), + DaysTransform(FieldReference(Array("ts", "modified"))), + HoursTransform(FieldReference(Array("ts", "modified"))))) + } + + test("SPARK-30289 Create: partitioned by bucket(4, ts.timezone)") { + spark.table("source") + .withColumn("ts", struct( + lit("2019-06-01 10:00:00.000000").cast("timestamp") as "created", + lit("2019-09-02 07:00:00.000000").cast("timestamp") as "modified", + lit("America/Los_Angeles") as "timezone")) + .writeTo("testcat.table_name") + .partitionedBy(bucket(4, $"ts.timezone")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(BucketTransform(LiteralValue(4, IntegerType), + Seq(FieldReference(Seq("ts", "timezone")))))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala index 6ffe133ee652b..a22abd505ca00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala @@ -219,6 +219,15 @@ case class OptionBooleanIntAggregator(colName: String) def OptionalBoolIntEncoder: Encoder[Option[(Boolean, Int)]] = ExpressionEncoder() } +case class FooAgg(s: Int) extends Aggregator[Row, Int, Int] { + def zero: Int = s + def reduce(b: Int, r: Row): Int = b + r.getAs[Int](0) + def merge(b1: Int, b2: Int): Int = b1 + b2 + def finish(b: Int): Int = b + def bufferEncoder: Encoder[Int] = Encoders.scalaInt + def outputEncoder: Encoder[Int] = Encoders.scalaInt +} + class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { import testImplicits._ @@ -394,4 +403,19 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { checkAnswer(group, Row("bob", Row(true, 3)) :: Nil) checkDataset(group.as[OptionBooleanIntData], OptionBooleanIntData("bob", Some((true, 3)))) } + + test("SPARK-30590: untyped select should not accept typed column that needs input type") { + val df = Seq((1, 2, 3, 4, 5, 6)).toDF("a", "b", "c", "d", "e", "f") + val fooAgg = (i: Int) => FooAgg(i).toColumn.name(s"foo_agg_$i") + + val agg1 = df.select(fooAgg(1), fooAgg(2), fooAgg(3), fooAgg(4), fooAgg(5)) + checkDataset(agg1, (3, 5, 7, 9, 11)) + + // Passes typed columns to untyped `Dataset.select` API. + val err = intercept[AnalysisException] { + df.select(fooAgg(1), fooAgg(2), fooAgg(3), fooAgg(4), fooAgg(5), fooAgg(6)) + }.getMessage + assert(err.contains("cannot be passed in untyped `select` API. " + + "Use the typed `Dataset.select` API instead.")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala index 0ac99905f35f4..5b8c80b471bb4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala @@ -18,12 +18,13 @@ package org.apache.spark.sql import org.apache.spark.metrics.source.CodegenMetrics +import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, Expression} -import org.apache.spark.sql.catalyst.expressions.objects.ExternalMapToCatalyst import org.apache.spark.sql.catalyst.plans.logical.SerializeFromObject -import org.apache.spark.sql.functions.expr +import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StructType class DatasetOptimizationSuite extends QueryTest with SharedSparkSession { import testImplicits._ @@ -195,4 +196,12 @@ class DatasetOptimizationSuite extends QueryTest with SharedSparkSession { checkCodegenCache(() => Seq(Seq(Map("abc" -> 1))).toDS()) } } + + test("SPARK-32652: Pruned nested serializers: RowEncoder") { + val df = Seq(("a", 1), ("b", 2), ("c", 3)).toDF("i", "j") + val encoder = RowEncoder(new StructType().add("s", df.schema)) + val query = df.map(row => Row(row))(encoder).select("s.i") + testSerializer(query, Seq(Seq("i"))) + checkAnswer(query, Seq(Row("a"), Row("b"), Row("c"))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSerializerRegistratorSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSerializerRegistratorSuite.scala index 43de2663b1236..b20d050f2fc4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSerializerRegistratorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSerializerRegistratorSuite.scala @@ -34,7 +34,7 @@ class DatasetSerializerRegistratorSuite extends QueryTest with SharedSparkSessio override protected def sparkConf: SparkConf = { // Make sure we use the KryoRegistrator - super.sparkConf.set(KRYO_USER_REGISTRATORS, TestRegistrator().getClass.getCanonicalName) + super.sparkConf.set(KRYO_USER_REGISTRATORS, Seq(TestRegistrator().getClass.getCanonicalName)) } test("Kryo registrator") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 233d67898f909..a1e813227c9c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1222,14 +1222,6 @@ class DatasetSuite extends QueryTest assert(result == Set(ClassData("a", 1) -> null, ClassData("b", 2) -> ClassData("x", 2))) } - test("better error message when use java reserved keyword as field name") { - val e = intercept[UnsupportedOperationException] { - Seq(InvalidInJava(1)).toDS() - } - assert(e.getMessage.contains( - "`abstract` is a reserved keyword and cannot be used as field name")) - } - test("Dataset should support flat input object to be null") { checkDataset(Seq("a", null).toDS(), "a", null) } @@ -1899,6 +1891,26 @@ class DatasetSuite extends QueryTest val e = intercept[AnalysisException](spark.range(1).tail(-1)) e.getMessage.contains("tail expression must be equal to or greater than 0") } + + test("SparkSession.active should be the same instance after dataset operations") { + val active = SparkSession.getActiveSession.get + val clone = active.cloneSession() + val ds = new Dataset(clone, spark.range(10).queryExecution.logical, Encoders.INT) + + ds.queryExecution.analyzed + + assert(active eq SparkSession.getActiveSession.get) + } + + test("SPARK-31854: Invoke in MapElementsExec should not propagate null") { + Seq("true", "false").foreach { wholeStage => + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> wholeStage) { + val ds = Seq(1.asInstanceOf[Integer], null.asInstanceOf[Integer]).toDS() + val expectedAnswer = Seq[(Integer, Integer)]((1, 1), (null, null)) + checkDataset(ds.map(v => (v, v)), expectedAnswer: _*) + } + } + } } object AssertExecutionId { @@ -1939,8 +1951,6 @@ case class ClassNullableData(a: String, b: Integer) case class NestedStruct(f: ClassData) case class DeepNestedStruct(f: NestedStruct) -case class InvalidInJava(`abstract`: Int) - /** * A class used to test serialization using encoders. This class throws exceptions when using * Java serialization -- so the only way it can be "serialized" is through our encoders. 
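A recurring change in the suites above and below is wrapping test bodies in the SQLTestUtils scoping helpers (withTempView, withSQLConf, withTempDatabase, withTable) so that temporary views and session configs are cleaned up even when an assertion fails. As a rough, illustrative sketch of that pattern (the suite name and query below are hypothetical and not part of this patch):

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical example suite; mirrors the scoping pattern applied throughout the patch.
class ScopedFixtureExampleSuite extends QueryTest with SharedSparkSession {

  test("temp view and conf changes are scoped") {
    // withTempView drops the view and withSQLConf restores the previous value
    // when the block exits, even if the body throws.
    withTempView("tmp") {
      withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "1") {
        spark.range(3).toDF("id").createOrReplaceTempView("tmp")
        checkAnswer(spark.sql("SELECT count(*) FROM tmp"), Row(3L))
      }
    }
  }
}

Scoping the fixtures this way keeps one failing test from leaking a view or a config value into the tests that run after it, which is why the patch converts the bare createOrReplaceTempView and conf-setting calls in these suites to the wrapped form.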
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index d7d8c2c52d12b..247f8153dfde5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -19,11 +19,13 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat -import java.time.Instant -import java.util.Locale +import java.time.{Instant, LocalDateTime} +import java.util.{Locale, TimeZone} import java.util.concurrent.TimeUnit -import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} +import org.apache.spark.{SparkException, SparkUpgradeException} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{CEST, LA} +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -96,15 +98,19 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { } test("date format") { - val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") - - checkAnswer( - df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), - Row("2015", "2015", "2013")) - - checkAnswer( - df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"), - Row("2015", "2015", "2013")) + Seq("legacy", "corrected").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { + val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") + + checkAnswer( + df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), + Row("2015", "2015", "2013")) + + checkAnswer( + df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"), + Row("2015", "2015", "2013")) + } + } } test("year") { @@ -373,6 +379,13 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { Seq(Row(Date.valueOf("2015-07-30")), Row(Date.valueOf("2015-07-30")))) } + def checkExceptionMessage(df: DataFrame): Unit = { + val message = intercept[SparkException] { + df.collect() + }.getCause.getMessage + assert(message.contains("Fail to parse")) + } + test("function to_date") { val d1 = Date.valueOf("2015-07-22") val d2 = Date.valueOf("2015-07-01") @@ -418,9 +431,15 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { df.select(to_date(col("d"), "yyyy-MM-dd")), Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2015-07-01")), Row(Date.valueOf("2014-12-31")))) - checkAnswer( - df.select(to_date(col("s"), "yyyy-MM-dd")), - Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) + val confKey = SQLConf.LEGACY_TIME_PARSER_POLICY.key + withSQLConf(confKey -> "corrected") { + checkAnswer( + df.select(to_date(col("s"), "yyyy-MM-dd")), + Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) + } + withSQLConf(confKey -> "exception") { + checkExceptionMessage(df.select(to_date(col("s"), "yyyy-MM-dd"))) + } // now switch format checkAnswer( @@ -431,9 +450,9 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer( df.select(to_date(col("s"), "yyyy-hh-MM")), Seq(Row(null), Row(null), Row(null))) - checkAnswer( - df.select(to_date(col("s"), "yyyy-dd-aa")), - Seq(Row(null), Row(null), Row(null))) + val e = intercept[SparkUpgradeException](df.select(to_date(col("s"), "yyyy-dd-aa")).collect()) + 
assert(e.getCause.isInstanceOf[IllegalArgumentException]) + assert(e.getMessage.contains("You may get a different result due to the upgrading of Spark")) // february val x1 = "2016-02-29" @@ -455,10 +474,6 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer( df.selectExpr("trunc(t, 'Month')"), Seq(Row(Date.valueOf("2015-07-01")), Row(Date.valueOf("2014-12-01")))) - - checkAnswer( - df.selectExpr("trunc(t, 'decade')"), - Seq(Row(Date.valueOf("2010-01-01")), Row(Date.valueOf("2010-01-01")))) } test("function date_trunc") { @@ -510,185 +525,215 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { df.selectExpr("date_trunc('MILLISECOND', t)"), Seq(Row(Timestamp.valueOf("2015-07-22 10:01:40.123")), Row(Timestamp.valueOf("2014-12-31 05:29:06.123")))) + } + test("unsupported fmt fields for trunc/date_trunc results null") { + Seq("INVALID", "decade", "century", "millennium", "whatever", null).foreach { f => checkAnswer( - df.selectExpr("date_trunc('DECADE', t)"), - Seq(Row(Timestamp.valueOf("2010-01-01 00:00:00")), - Row(Timestamp.valueOf("2010-01-01 00:00:00")))) - - Seq("century", "millennium").foreach { level => - checkAnswer( - df.selectExpr(s"date_trunc('$level', t)"), - Seq(Row(Timestamp.valueOf("2001-01-01 00:00:00")), - Row(Timestamp.valueOf("2001-01-01 00:00:00")))) + Seq(Date.valueOf("2014-12-31")) + .toDF("dt") + .selectExpr(s"date_trunc('$f', dt)", "trunc(dt, '$f')"), + Row(null, null)) } } test("from_unixtime") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) - val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2, Locale.US) - val fmt3 = "yy-MM-dd HH-mm-ss" - val sdf3 = new SimpleDateFormat(fmt3, Locale.US) - val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") - checkAnswer( - df.select(from_unixtime(col("a"))), - Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) - checkAnswer( - df.select(from_unixtime(col("a"), fmt2)), - Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) - checkAnswer( - df.select(from_unixtime(col("a"), fmt3)), - Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr("from_unixtime(a)"), - Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr(s"from_unixtime(a, '$fmt2')"), - Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr(s"from_unixtime(a, '$fmt3')"), - Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + Seq("corrected", "legacy").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) + val fmt3 = "yy-MM-dd HH-mm-ss" + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) + val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") + checkAnswer( + df.select(from_unixtime(col("a"))), + Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) + checkAnswer( + df.select(from_unixtime(col("a"), fmt2)), + Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) + checkAnswer( + df.select(from_unixtime(col("a"), 
fmt3)), + Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr("from_unixtime(a)"), + Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr(s"from_unixtime(a, '$fmt2')"), + Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr(s"from_unixtime(a, '$fmt3')"), + Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + } + } } private def secs(millis: Long): Long = TimeUnit.MILLISECONDS.toSeconds(millis) test("unix_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - checkAnswer(df.select(unix_timestamp(col("ts"))), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("ss"))), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - - val x1 = "2015-07-24 10:00:00" - val x2 = "2015-25-07 02:02:02" - val x3 = "2015-07-24 25:02:02" - val x4 = "2015-24-07 26:02:02" - val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") - val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") - - val df1 = Seq(x1, x2, x3, x4).toDF("x") - checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( - Row(null), Row(secs(ts2.getTime)), Row(null), Row(null))) - checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( - Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - - // invalid format - checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq( - Row(null), Row(null), Row(null), Row(null))) - - // february - val y1 = "2016-02-29" - val y2 = "2017-02-29" - val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") - val df2 = Seq(y1, y2).toDF("y") - checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( - Row(secs(ts5.getTime)), Row(null))) - - val now = sql("select unix_timestamp()").collect().head.getLong(0) - checkAnswer( - sql(s"select cast ($now as timestamp)"), - Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now)))) + Seq("corrected", "legacy").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) 
{ + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + checkAnswer(df.select(unix_timestamp(col("ts"))), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("ss"))), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( + Row(null), Row(secs(ts2.getTime)), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) + + // invalid format + val invalid = df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')") + if (legacyParserPolicy == "legacy") { + checkAnswer(invalid, + Seq(Row(null), Row(null), Row(null), Row(null))) + } else { + val e = intercept[SparkUpgradeException](invalid.collect()) + assert(e.getCause.isInstanceOf[IllegalArgumentException]) + assert( + e.getMessage.contains("You may get a different result due to the upgrading of Spark")) + } + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(secs(ts5.getTime)), Row(null))) + + val now = sql("select unix_timestamp()").collect().head.getLong(0) + checkAnswer( + sql(s"select cast ($now as timestamp)"), + Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now)))) + } + } } test("to_unix_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - 
checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - - val x1 = "2015-07-24 10:00:00" - val x2 = "2015-25-07 02:02:02" - val x3 = "2015-07-24 25:02:02" - val x4 = "2015-24-07 26:02:02" - val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") - val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") - - val df1 = Seq(x1, x2, x3, x4).toDF("x") - checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( - Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - - // february - val y1 = "2016-02-29" - val y2 = "2017-02-29" - val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") - val df2 = Seq(y1, y2).toDF("y") - checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( - Row(secs(ts5.getTime)), Row(null))) - - // invalid format - checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( - Row(null), Row(null), Row(null), Row(null))) + Seq("corrected", "legacy").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(secs(ts5.getTime)), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) + } + } } test("to_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = 
Date.valueOf("2015-07-25") - val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") - val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5") - val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6") - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - - checkAnswer(df.select(to_timestamp(col("ss"))), - df.select(unix_timestamp(col("ss")).cast("timestamp"))) - checkAnswer(df.select(to_timestamp(col("ss"))), Seq( - Row(ts1), Row(ts2))) - checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( - Row(ts1m), Row(ts2m))) - checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( - Row(ts1), Row(ts2))) - checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( - Row(ts_date1), Row(ts_date2))) + Seq("legacy", "corrected").foreach { legacyParserPolicy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") + val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5") + val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6") + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + + checkAnswer(df.select(to_timestamp(col("ss"))), + df.select(unix_timestamp(col("ss")).cast("timestamp"))) + checkAnswer(df.select(to_timestamp(col("ss"))), Seq( + Row(ts1), Row(ts2))) + if (legacyParserPolicy == "legacy") { + // In Spark 2.4 and earlier, to_timestamp() parses in seconds precision and cuts off + // the fractional part of seconds. The behavior was changed by SPARK-27438. 
+ val legacyFmt = "yyyy/MM/dd HH:mm:ss" + checkAnswer(df.select(to_timestamp(col("s"), legacyFmt)), Seq( + Row(ts1), Row(ts2))) + } else { + checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( + Row(ts1m), Row(ts2m))) + } + checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( + Row(ts1), Row(ts2))) + checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( + Row(ts_date1), Row(ts_date2))) + } + } } test("datediff") { @@ -719,12 +764,12 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") ).toDF("a", "b") checkAnswer( - df.select(from_utc_timestamp(col("a"), "PST")), + df.select(from_utc_timestamp(col("a"), LA.getId)), Seq( Row(Timestamp.valueOf("2015-07-23 17:00:00")), Row(Timestamp.valueOf("2015-07-24 17:00:00")))) checkAnswer( - df.select(from_utc_timestamp(col("b"), "PST")), + df.select(from_utc_timestamp(col("b"), LA.getId)), Seq( Row(Timestamp.valueOf("2015-07-23 17:00:00")), Row(Timestamp.valueOf("2015-07-24 17:00:00")))) @@ -732,8 +777,8 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { test("from_utc_timestamp with column zone") { val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "CET"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "PST") + (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", CEST.getId), + (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", LA.getId) ).toDF("a", "b", "c") checkAnswer( df.select(from_utc_timestamp(col("a"), col("c"))), @@ -762,12 +807,12 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") ).toDF("a", "b") checkAnswer( - df.select(to_utc_timestamp(col("a"), "PST")), + df.select(to_utc_timestamp(col("a"), LA.getId)), Seq( Row(Timestamp.valueOf("2015-07-24 07:00:00")), Row(Timestamp.valueOf("2015-07-25 07:00:00")))) checkAnswer( - df.select(to_utc_timestamp(col("b"), "PST")), + df.select(to_utc_timestamp(col("b"), LA.getId)), Seq( Row(Timestamp.valueOf("2015-07-24 07:00:00")), Row(Timestamp.valueOf("2015-07-25 07:00:00")))) @@ -775,8 +820,8 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { test("to_utc_timestamp with column zone") { val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "PST"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "CET") + (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", LA.getId), + (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", CEST.getId) ).toDF("a", "b", "c") checkAnswer( df.select(to_utc_timestamp(col("a"), col("c"))), @@ -789,4 +834,71 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { Row(Timestamp.valueOf("2015-07-24 07:00:00")), Row(Timestamp.valueOf("2015-07-24 22:00:00")))) } + + test("SPARK-30668: use legacy timestamp parser in to_timestamp") { + val confKey = SQLConf.LEGACY_TIME_PARSER_POLICY.key + val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts") + withSQLConf(confKey -> "legacy") { + val expected = Timestamp.valueOf("2020-01-27 20:06:11.847") + checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")), + Row(expected)) + } + withSQLConf(confKey -> "corrected") { + checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")), Row(null)) + } + withSQLConf(confKey -> "exception") { + checkExceptionMessage(df.select(to_timestamp(col("ts"), 
"yyyy-MM-dd'T'HH:mm:ss.SSSz"))) + } + } + + test("SPARK-30752: convert time zones on a daylight saving day") { + val systemTz = LA.getId + val sessionTz = "UTC" + val fromTz = "Asia/Hong_Kong" + val fromTs = "2019-11-03T12:00:00" // daylight saving date in America/Los_Angeles + val utsTs = "2019-11-03T04:00:00" + val defaultTz = TimeZone.getDefault + try { + TimeZone.setDefault(DateTimeUtils.getTimeZone(systemTz)) + withSQLConf( + SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true", + SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionTz) { + val expected = LocalDateTime.parse(utsTs) + .atZone(DateTimeUtils.getZoneId(sessionTz)) + .toInstant + val df = Seq(fromTs).toDF("localTs") + checkAnswer( + df.select(to_utc_timestamp(col("localTs"), fromTz)), + Row(expected)) + } + } finally { + TimeZone.setDefault(defaultTz) + } + } + + test("SPARK-30766: date_trunc of old timestamps to hours and days") { + def checkTrunc(level: String, expected: String): Unit = { + val df = Seq("0010-01-01 01:02:03.123456") + .toDF() + .select($"value".cast("timestamp").as("ts")) + .select(date_trunc(level, $"ts").cast("string")) + checkAnswer(df, Row(expected)) + } + + checkTrunc("HOUR", "0010-01-01 01:00:00") + checkTrunc("DAY", "0010-01-01 00:00:00") + } + + test("SPARK-30793: truncate timestamps before the epoch to seconds and minutes") { + def checkTrunc(level: String, expected: String): Unit = { + val df = Seq("1961-04-12 00:01:02.345") + .toDF() + .select($"value".cast("timestamp").as("ts")) + .select(date_trunc(level, $"ts").cast("string")) + checkAnswer(df, Row(expected)) + } + + checkTrunc("SECOND", "1961-04-12 00:01:02") + checkTrunc("MINUTE", "1961-04-12 00:01:00") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DeprecatedAPISuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DeprecatedAPISuite.scala new file mode 100644 index 0000000000000..25b8849d61248 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DeprecatedAPISuite.scala @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} + +class DeprecatedAPISuite extends QueryTest with SharedSparkSession { + import MathFunctionsTestData.DoubleData + import testImplicits._ + + private lazy val doubleData = (1 to 10).map(i => DoubleData(i * 0.2 - 1, i * -0.2 + 1)).toDF() + + private def testOneToOneMathFunction[ + @specialized(Int, Long, Float, Double) T, + @specialized(Int, Long, Float, Double) U]( + c: Column => Column, + f: T => U): Unit = { + checkAnswer( + doubleData.select(c('a)), + (1 to 10).map(n => Row(f((n * 0.2 - 1).asInstanceOf[T]))) + ) + + checkAnswer( + doubleData.select(c('b)), + (1 to 10).map(n => Row(f((-n * 0.2 + 1).asInstanceOf[T]))) + ) + + checkAnswer( + doubleData.select(c(lit(null))), + (1 to 10).map(_ => Row(null)) + ) + } + + test("functions.toDegrees") { + testOneToOneMathFunction(toDegrees, math.toDegrees) + withView("t") { + val df = Seq(0, 1, 1.5).toDF("a") + df.createOrReplaceTempView("t") + + checkAnswer( + sql("SELECT degrees(0), degrees(1), degrees(1.5)"), + Seq(0).toDF().select(toDegrees(lit(0)), toDegrees(lit(1)), toDegrees(lit(1.5))) + ) + checkAnswer( + sql("SELECT degrees(a) FROM t"), + df.select(toDegrees("a")) + ) + } + } + + test("functions.toRadians") { + testOneToOneMathFunction(toRadians, math.toRadians) + withView("t") { + val df = Seq(0, 1, 1.5).toDF("a") + df.createOrReplaceTempView("t") + + checkAnswer( + sql("SELECT radians(0), radians(1), radians(1.5)"), + Seq(0).toDF().select(toRadians(lit(0)), toRadians(lit(1)), toRadians(lit(1.5))) + ) + checkAnswer( + sql("SELECT radians(a) FROM t"), + df.select(toRadians("a")) + ) + } + } + + test("functions.approxCountDistinct") { + withView("t") { + val df = Seq(0, 1, 2).toDF("a") + df.createOrReplaceTempView("t") + checkAnswer( + sql("SELECT approx_count_distinct(a) FROM t"), + df.select(approxCountDistinct("a"))) + } + } + + test("functions.monotonicallyIncreasingId") { + // Make sure we have 2 partitions, each with 2 records. 
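The expected rows in the monotonicallyIncreasingId test below follow from the documented bit layout of monotonically_increasing_id: the partition index occupies the upper 31 bits and a per-partition record counter the lower 33 bits, so partition 1 starts at 1L << 33. A standalone arithmetic sketch (not part of the patch) of that layout:

    object MonotonicIdLayoutSketch {
      // Reconstructs the id the function is documented to produce for a given
      // partition index and record position within that partition.
      def expectedId(partitionIndex: Long, recordIndex: Long): Long =
        (partitionIndex << 33) + recordIndex

      def main(args: Array[String]): Unit = {
        assert(expectedId(0L, 0L) == 0L && expectedId(0L, 1L) == 1L)
        assert(expectedId(1L, 0L) == 8589934592L)   // 1L << 33
        assert(expectedId(1L, 1L) == 8589934593L)   // (1L << 33) + 1
        println(Seq(0L, 1L).map(expectedId(1L, _)).mkString(", "))
      }
    }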
+ val df = sparkContext.parallelize(Seq[Int](), 2).mapPartitions { _ => + Iterator(Tuple1(1), Tuple1(2)) + }.toDF("a") + checkAnswer( + df.select(monotonicallyIncreasingId(), expr("monotonically_increasing_id()")), + Row(0L, 0L) :: + Row(1L, 1L) :: + Row((1L << 33) + 0L, (1L << 33) + 0L) :: + Row((1L << 33) + 1L, (1L << 33) + 1L) :: Nil + ) + } + + test("Column.!==") { + val nullData = Seq( + (Some(1), Some(1)), (Some(1), Some(2)), (Some(1), None), (None, None)).toDF("a", "b") + checkAnswer( + nullData.filter($"b" !== 1), + Row(1, 2) :: Nil) + + checkAnswer(nullData.filter($"b" !== null), Nil) + + checkAnswer( + nullData.filter($"a" !== $"b"), + Row(1, 2) :: Nil) + } + + test("Dataset.registerTempTable") { + withTempView("t") { + Seq(1).toDF().registerTempTable("t") + assert(spark.catalog.tableExists("t")) + } + } + + test("SQLContext.setActive/clearActive") { + val sc = spark.sparkContext + val sqlContext = new SQLContext(sc) + SQLContext.setActive(sqlContext) + assert(SparkSession.getActiveSession === Some(spark)) + SQLContext.clearActive() + assert(SparkSession.getActiveSession === None) + } + + test("SQLContext.applySchema") { + val rowRdd = sparkContext.parallelize(Seq(Row("Jack", 20), Row("Marry", 18))) + val schema = StructType(StructField("name", StringType, false) :: + StructField("age", IntegerType, true) :: Nil) + val sqlContext = spark.sqlContext + checkAnswer(sqlContext.applySchema(rowRdd, schema), Row("Jack", 20) :: Row("Marry", 18) :: Nil) + checkAnswer(sqlContext.applySchema(rowRdd.toJavaRDD(), schema), + Row("Jack", 20) :: Row("Marry", 18) :: Nil) + } + + test("SQLContext.parquetFile") { + val sqlContext = spark.sqlContext + withTempDir { dir => + val parquetFile = s"${dir.toString}/${System.currentTimeMillis()}" + val expectDF = spark.range(10).toDF() + expectDF.write.parquet(parquetFile) + val parquetDF = sqlContext.parquetFile(parquetFile) + checkAnswer(parquetDF, expectDF) + } + } + + test("SQLContext.jsonFile") { + val sqlContext = spark.sqlContext + withTempDir { dir => + val jsonFile = s"${dir.toString}/${System.currentTimeMillis()}" + val expectDF = spark.range(10).toDF() + expectDF.write.json(jsonFile) + var jsonDF = sqlContext.jsonFile(jsonFile) + checkAnswer(jsonDF, expectDF) + assert(jsonDF.schema === expectDF.schema.asNullable) + + var schema = expectDF.schema + jsonDF = sqlContext.jsonFile(jsonFile, schema) + checkAnswer(jsonDF, expectDF) + assert(jsonDF.schema === schema.asNullable) + + jsonDF = sqlContext.jsonFile(jsonFile, 0.9) + checkAnswer(jsonDF, expectDF) + + val jsonRDD = sparkContext.parallelize(Seq("{\"name\":\"Jack\",\"age\":20}", + "{\"name\":\"Marry\",\"age\":18}")) + jsonDF = sqlContext.jsonRDD(jsonRDD) + checkAnswer(jsonDF, Row(18, "Marry") :: Row(20, "Jack") :: Nil) + jsonDF = sqlContext.jsonRDD(jsonRDD.toJavaRDD()) + checkAnswer(jsonDF, Row(18, "Marry") :: Row(20, "Jack") :: Nil) + + schema = StructType(StructField("name", StringType, false) :: + StructField("age", IntegerType, false) :: Nil) + jsonDF = sqlContext.jsonRDD(jsonRDD, schema) + checkAnswer(jsonDF, Row("Jack", 20) :: Row("Marry", 18) :: Nil) + jsonDF = sqlContext.jsonRDD(jsonRDD.toJavaRDD(), schema) + checkAnswer(jsonDF, Row("Jack", 20) :: Row("Marry", 18) :: Nil) + + + jsonDF = sqlContext.jsonRDD(jsonRDD, 0.9) + checkAnswer(jsonDF, Row(18, "Marry") :: Row(20, "Jack") :: Nil) + jsonDF = sqlContext.jsonRDD(jsonRDD.toJavaRDD(), 0.9) + checkAnswer(jsonDF, Row(18, "Marry") :: Row(20, "Jack") :: Nil) + } + } + + test("SQLContext.load") { + withTempDir { dir => + val path = 
s"${dir.toString}/${System.currentTimeMillis()}" + val expectDF = spark.range(10).toDF() + expectDF.write.parquet(path) + val sqlContext = spark.sqlContext + + var loadDF = sqlContext.load(path) + checkAnswer(loadDF, expectDF) + + loadDF = sqlContext.load(path, "parquet") + checkAnswer(loadDF, expectDF) + + loadDF = sqlContext.load("parquet", Map("path" -> path)) + checkAnswer(loadDF, expectDF) + + loadDF = sqlContext.load("parquet", expectDF.schema, Map("path" -> path)) + checkAnswer(loadDF, expectDF) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala index e1f9bcc4e008d..94163315b34f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala @@ -20,9 +20,10 @@ package org.apache.spark.sql import org.scalatest.GivenWhenThen import org.apache.spark.sql.catalyst.expressions.{DynamicPruningExpression, Expression} +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode._ import org.apache.spark.sql.catalyst.plans.ExistenceJoin import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper} import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingQueryWrapper} @@ -33,7 +34,7 @@ import org.apache.spark.sql.test.SharedSparkSession /** * Test suite for the filtering ratio policy used to trigger dynamic partition pruning (DPP). 
*/ -class DynamicPartitionPruningSuite +abstract class DynamicPartitionPruningSuiteBase extends QueryTest with SharedSparkSession with GivenWhenThen @@ -43,9 +44,14 @@ class DynamicPartitionPruningSuite import testImplicits._ + val adaptiveExecutionOn: Boolean + override def beforeAll(): Unit = { super.beforeAll() + spark.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, adaptiveExecutionOn) + spark.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY, true) + val factData = Seq[(Int, Int, Int, Int)]( (1000, 1, 1, 10), (1010, 2, 1, 10), @@ -153,6 +159,8 @@ class DynamicPartitionPruningSuite sql("DROP TABLE IF EXISTS fact_stats") sql("DROP TABLE IF EXISTS dim_stats") } finally { + spark.sessionState.conf.unsetConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED) + spark.sessionState.conf.unsetConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY) super.afterAll() } } @@ -195,6 +203,11 @@ class DynamicPartitionPruningSuite fail(s"Invalid child node found in\n$s") } } + + val isMainQueryAdaptive = plan.isInstanceOf[AdaptiveSparkPlanExec] + subqueriesAll(plan).filterNot(subqueryBroadcast.contains).foreach { s => + assert(s.find(_.isInstanceOf[AdaptiveSparkPlanExec]).isDefined == isMainQueryAdaptive) + } } /** @@ -239,30 +252,17 @@ class DynamicPartitionPruningSuite */ test("simple inner join triggers DPP with mock-up tables") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { - withTable("df1", "df2") { - spark.range(1000) - .select(col("id"), col("id").as("k")) - .write - .partitionBy("k") - .format(tableFormat) - .mode("overwrite") - .saveAsTable("df1") - - spark.range(100) - .select(col("id"), col("id").as("k")) - .write - .partitionBy("k") - .format(tableFormat) - .mode("overwrite") - .saveAsTable("df2") - - val df = sql("SELECT df1.id, df2.k FROM df1 JOIN df2 ON df1.k = df2.k AND df2.id < 2") + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { + val df = sql( + """ + |SELECT f.date_id, f.store_id FROM fact_sk f + |JOIN dim_store s ON f.store_id = s.store_id AND s.country = 'NL' + """.stripMargin) - checkPartitionPruningPredicate(df, true, false) + checkPartitionPruningPredicate(df, true, false) - checkAnswer(df, Row(0, 0) :: Row(1, 1) :: Nil) - } + checkAnswer(df, Row(1000, 1) :: Row(1010, 2) :: Row(1020, 2) :: Nil) } } @@ -271,7 +271,8 @@ class DynamicPartitionPruningSuite */ test("self-join on a partitioned table should not trigger DPP") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("fact") { sql( s""" @@ -302,8 +303,11 @@ class DynamicPartitionPruningSuite */ test("static scan metrics") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("fact", "dim") { + val numPartitions = 10 + spark.range(10) .map { x => Tuple3(x, x + 1, 0) } .toDF("did", "d1", "d2") @@ -313,7 +317,7 @@ class DynamicPartitionPruningSuite .saveAsTable("dim") spark.range(100) - .map { x => Tuple2(x, x % 10) } + .map { x => Tuple2(x, x % numPartitions) } .toDF("f1", "fid") .write.partitionBy("fid") 
.format(tableFormat) @@ -340,6 +344,8 @@ class DynamicPartitionPruningSuite assert(!scan1.metrics.contains("staticFilesSize")) val allFilesNum = scan1.metrics("numFiles").value val allFilesSize = scan1.metrics("filesSize").value + assert(scan1.metrics("numPartitions").value === numPartitions) + assert(scan1.metrics("pruningTime").value === -1) // No dynamic partition pruning, so no static metrics // Only files from fid = 5 partition are scanned @@ -352,6 +358,8 @@ class DynamicPartitionPruningSuite val partFilesSize = scan2.metrics("filesSize").value assert(0 < partFilesNum && partFilesNum < allFilesNum) assert(0 < partFilesSize && partFilesSize < allFilesSize) + assert(scan2.metrics("numPartitions").value === 1) + assert(scan2.metrics("pruningTime").value === -1) // Dynamic partition pruning is used // Static metrics are as-if reading the whole fact table @@ -363,6 +371,8 @@ class DynamicPartitionPruningSuite assert(scan3.metrics("staticFilesSize").value == allFilesSize) assert(scan3.metrics("numFiles").value == partFilesNum) assert(scan3.metrics("filesSize").value == partFilesSize) + assert(scan3.metrics("numPartitions").value === 1) + assert(scan3.metrics("pruningTime").value !== -1) } } } @@ -370,7 +380,8 @@ class DynamicPartitionPruningSuite test("DPP should not be rewritten as an existential join") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "1.5", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( s""" |SELECT * FROM product p WHERE p.store_id NOT IN @@ -395,7 +406,7 @@ class DynamicPartitionPruningSuite */ test("DPP triggers only for certain types of query") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false") { Given("dynamic partition pruning disabled") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "false") { val df = sql( @@ -433,7 +444,8 @@ class DynamicPartitionPruningSuite } Given("left-semi join with partition column on the left side") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT * FROM fact_sk f @@ -457,7 +469,8 @@ class DynamicPartitionPruningSuite } Given("right outer join with partition column on the left side") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT * FROM fact_sk f RIGHT OUTER JOIN dim_store s @@ -474,7 +487,8 @@ class DynamicPartitionPruningSuite */ test("filtering ratio policy fallback") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { Given("no stats and selective predicate") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true") { @@ -543,7 +557,8 @@ class DynamicPartitionPruningSuite */ test("filtering ratio policy with stats when the broadcast pruning is disabled") { withSQLConf( - 
SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { Given("disabling the use of stats in the DPP heuristic") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false") { @@ -613,10 +628,7 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins with non-deterministic probe part") { Given("alias with simple join condition, and non-deterministic query") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -630,10 +642,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -651,10 +660,7 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins with aliases") { Given("alias with simple join condition, using attribute names only") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -674,10 +680,7 @@ class DynamicPartitionPruningSuite } Given("alias with expr as join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -697,10 +700,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -722,10 +722,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid_d as pid, f.sid_d as sid FROM @@ -754,10 +751,8 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins") { Given("disable 
broadcast pruning and disable subquery duplication") withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f @@ -777,9 +772,10 @@ class DynamicPartitionPruningSuite Given("disable reuse broadcast results and enable subquery duplication") withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false", + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0.5") { + SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0.5", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f @@ -798,52 +794,47 @@ class DynamicPartitionPruningSuite } Given("enable reuse broadcast results and disable query duplication") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { - val df = sql( - """ - |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f - |JOIN dim_stats s - |ON f.store_id = s.store_id WHERE s.country = 'DE' - """.stripMargin) + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) - checkPartitionPruningPredicate(df, false, true) + checkPartitionPruningPredicate(df, false, true) - checkAnswer(df, - Row(1030, 2, 10, 3) :: - Row(1040, 2, 50, 3) :: - Row(1050, 2, 50, 3) :: - Row(1060, 2, 50, 3) :: Nil - ) + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) } Given("disable broadcast hash join and disable query duplication") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { - val df = sql( - """ - |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f - |JOIN dim_stats s - |ON f.store_id = s.store_id WHERE s.country = 'DE' - """.stripMargin) + withSQLConf( + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) - checkPartitionPruningPredicate(df, false, false) + checkPartitionPruningPredicate(df, false, false) - checkAnswer(df, - Row(1030, 2, 10, 3) :: - Row(1040, 2, 50, 3) :: - Row(1050, 2, 50, 3) :: - Row(1060, 2, 50, 3) :: Nil - ) + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) } 
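Nearly every branch in this suite is wrapped in withSQLConf, which scopes SQL conf overrides to a block and restores the previous values afterwards. A rough standalone sketch (not part of the patch) of that shape using the public RuntimeConfig API; the suites themselves use the helper from Spark's shared test utilities:

    import org.apache.spark.sql.SparkSession

    object WithSQLConfSketch {
      // Set the given SQL confs while `body` runs, then put back what was visible before.
      def withSQLConf(spark: SparkSession)(pairs: (String, String)*)(body: => Unit): Unit = {
        val conf = spark.conf
        val previous = pairs.map { case (key, _) => key -> conf.getOption(key) }
        pairs.foreach { case (key, value) => conf.set(key, value) }
        try {
          body
        } finally {
          previous.foreach {
            case (key, Some(value)) => conf.set(key, value)
            case (key, None) => conf.unset(key)
          }
        }
      }

      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[1]").getOrCreate()
        withSQLConf(spark)("spark.sql.exchange.reuse" -> "false") {
          assert(spark.conf.get("spark.sql.exchange.reuse") == "false")
        }
        spark.stop()
      }
    }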
Given("disable broadcast hash join and enable query duplication") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true") { val df = sql( @@ -865,9 +856,7 @@ class DynamicPartitionPruningSuite } test("broadcast a single key in a HashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -925,9 +914,7 @@ class DynamicPartitionPruningSuite } test("broadcast multiple keys in a LongHashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -962,9 +949,7 @@ class DynamicPartitionPruningSuite } test("broadcast multiple keys in an UnsafeHashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -999,9 +984,7 @@ class DynamicPartitionPruningSuite } test("different broadcast subqueries with identical children") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -1073,7 +1056,7 @@ class DynamicPartitionPruningSuite } test("avoid reordering broadcast join keys to match input hash partitioning") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { withTable("large", "dimTwo", "dimThree") { spark.range(100).select( @@ -1123,9 +1106,7 @@ class DynamicPartitionPruningSuite * duplicated partitioning keys, also used to uniquely identify the dynamic pruning filters. 
*/ test("dynamic partition pruning ambiguity issue across nested joins") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("store", "date", "item") { spark.range(500) .select((($"id" + 30) % 50).as("ss_item_sk"), @@ -1163,11 +1144,9 @@ class DynamicPartitionPruningSuite } test("cleanup any DPP filter that isn't pushed down due to expression id clashes") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { - spark.range(1000).select($"id".as("A"), $"id".as("AA")) + spark.range(20).select($"id".as("A"), $"id".as("AA")) .write.partitionBy("A").format(tableFormat).mode("overwrite").saveAsTable("fact") spark.range(10).select($"id".as("B"), $"id".as("BB")) .write.format(tableFormat).mode("overwrite").saveAsTable("dim") @@ -1186,10 +1165,7 @@ class DynamicPartitionPruningSuite } test("cleanup any DPP filter that isn't pushed down due to non-determinism") { - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -1204,10 +1180,7 @@ class DynamicPartitionPruningSuite } test("join key with multiple references on the filtering plan") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0", - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { // when enable AQE, the reusedExchange is inserted when executed. 
withTable("fact", "dim") { spark.range(100).select( @@ -1240,9 +1213,7 @@ class DynamicPartitionPruningSuite } test("Make sure dynamic pruning works on uncorrelated queries") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT d.store_id, @@ -1258,18 +1229,39 @@ class DynamicPartitionPruningSuite val plan = df.queryExecution.executedPlan val countSubqueryBroadcasts = - plan.collectInPlanAndSubqueries({ case _: SubqueryBroadcastExec => 1 }).sum + plan.collectWithSubqueries({ case _: SubqueryBroadcastExec => 1 }).sum assert(countSubqueryBroadcasts == 2) } } + test("SPARK-32509: Unused Dynamic Pruning filter shouldn't affect " + + "canonicalization and exchange reuse") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df = sql( + """ WITH view1 as ( + | SELECT f.store_id FROM fact_stats f WHERE f.units_sold = 70 + | ) + | + | SELECT * FROM view1 v1 join view1 v2 WHERE v1.store_id = v2.store_id + """.stripMargin) + + checkPartitionPruningPredicate(df, false, false) + val reuseExchangeNodes = df.queryExecution.executedPlan.collect { + case se: ReusedExchangeExec => se + } + assert(reuseExchangeNodes.size == 1, "Expected plan to contain 1 ReusedExchangeExec " + + s"nodes. Found ${reuseExchangeNodes.size}") + + checkAnswer(df, Row(15, 15) :: Nil) + } + } + } + test("Plan broadcast pruning only when the broadcast can be reused") { Given("dynamic pruning filter on the build side") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.store_id, f.product_id, f.units_sold FROM fact_np f @@ -1288,10 +1280,7 @@ class DynamicPartitionPruningSuite } Given("dynamic pruning filter on the probe side") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT /*+ BROADCAST(f)*/ @@ -1310,4 +1299,44 @@ class DynamicPartitionPruningSuite ) } } + + test("SPARK-32659: Fix the data issue when pruning DPP on non-atomic type") { + Seq(NO_CODEGEN, CODEGEN_ONLY).foreach { mode => + Seq(true, false).foreach { pruning => + withSQLConf( + SQLConf.CODEGEN_FACTORY_MODE.key -> mode.toString, + SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> s"$pruning") { + Seq("struct", "array").foreach { dataType => + val df = sql( + s""" + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON $dataType(f.store_id) = $dataType(s.store_id) WHERE s.country = 'DE' + """.stripMargin) + + if (pruning) { + checkPartitionPruningPredicate(df, false, true) + } else { + checkPartitionPruningPredicate(df, false, false) + } + + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) + } + } + } + } + } +} + 
+class DynamicPartitionPruningSuiteAEOff extends DynamicPartitionPruningSuiteBase { + override val adaptiveExecutionOn: Boolean = false +} + +class DynamicPartitionPruningSuiteAEOn extends DynamicPartitionPruningSuiteBase { + override val adaptiveExecutionOn: Boolean = true } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index d9f4d6d5132ae..158d9392c0f5c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -18,27 +18,15 @@ package org.apache.spark.sql import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.adaptive.{DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType -class ExplainSuite extends QueryTest with SharedSparkSession { - import testImplicits._ - - var originalValue: String = _ - protected override def beforeAll(): Unit = { - super.beforeAll() - originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - } - - protected override def afterAll(): Unit = { - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) - super.afterAll() - } +trait ExplainSuiteHelper extends QueryTest with SharedSparkSession { - private def getNormalizedExplain(df: DataFrame, mode: ExplainMode): String = { + protected def getNormalizedExplain(df: DataFrame, mode: ExplainMode): String = { val output = new java.io.ByteArrayOutputStream() Console.withOut(output) { df.explain(mode.name) @@ -49,7 +37,7 @@ class ExplainSuite extends QueryTest with SharedSparkSession { /** * Get the explain from a DataFrame and run the specified action on it. */ - private def withNormalizedExplain(df: DataFrame, mode: ExplainMode)(f: String => Unit) = { + protected def withNormalizedExplain(df: DataFrame, mode: ExplainMode)(f: String => Unit) = { f(getNormalizedExplain(df, mode)) } @@ -57,7 +45,7 @@ class ExplainSuite extends QueryTest with SharedSparkSession { * Get the explain by running the sql. The explain mode should be part of the * sql text itself. */ - private def withNormalizedExplain(queryText: String)(f: String => Unit) = { + protected def withNormalizedExplain(queryText: String)(f: String => Unit) = { val output = new java.io.ByteArrayOutputStream() Console.withOut(output) { sql(queryText).show(false) @@ -69,7 +57,7 @@ class ExplainSuite extends QueryTest with SharedSparkSession { /** * Runs the plan and makes sure the plans contains all of the keywords. 
*/ - private def checkKeywordsExistsInExplain( + protected def checkKeywordsExistsInExplain( df: DataFrame, mode: ExplainMode, keywords: String*): Unit = { withNormalizedExplain(df, mode) { normalizedOutput => for (key <- keywords) { @@ -78,9 +66,13 @@ class ExplainSuite extends QueryTest with SharedSparkSession { } } - private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = { + protected def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = { checkKeywordsExistsInExplain(df, ExtendedMode, keywords: _*) } +} + +class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite { + import testImplicits._ test("SPARK-23034 show rdd names in RDD scan nodes (Dataset)") { val rddWithName = spark.sparkContext.parallelize(Row(1, "abc") :: Nil).setName("testRdd") @@ -116,8 +108,8 @@ class ExplainSuite extends QueryTest with SharedSparkSession { // plan should show the rewritten aggregate expression. val df = sql("SELECT k, every(v), some(v), any(v) FROM test_agg GROUP BY k") checkKeywordsExistsInExplain(df, - "Aggregate [k#x], [k#x, min(v#x) AS every(v)#x, max(v#x) AS some(v)#x, " + - "max(v#x) AS any(v)#x]") + "Aggregate [k#x], [k#x, every(v#x) AS every(v)#x, some(v#x) AS some(v)#x, " + + "any(v#x) AS any(v)#x]") } } @@ -236,10 +228,27 @@ class ExplainSuite extends QueryTest with SharedSparkSession { } } + test("SPARK-33853: explain codegen - check presence of subquery") { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + withTempView("df") { + val df1 = spark.range(1, 100) + df1.createTempView("df") + + val sqlText = "EXPLAIN CODEGEN SELECT (SELECT min(id) FROM df)" + val expectedText = "Found 3 WholeStageCodegen subtrees." + + withNormalizedExplain(sqlText) { normalizedOutput => + assert(normalizedOutput.contains(expectedText)) + } + } + } + } + test("explain formatted - check presence of subquery in case of DPP") { withTable("df1", "df2") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("df1", "df2") { spark.range(1000).select(col("id"), col("id").as("k")) .write @@ -340,6 +349,67 @@ class ExplainSuite extends QueryTest with SharedSparkSession { }.getMessage assert(errMsg.contains("Unknown explain mode: unknown")) } + + test("SPARK-31504: Output fields in formatted Explain should have determined order") { + withTempPath { path => + spark.range(10).selectExpr("id as a", "id as b", "id as c", "id as d", "id as e") + .write.mode("overwrite").parquet(path.getAbsolutePath) + val df1 = spark.read.parquet(path.getAbsolutePath) + val df2 = spark.read.parquet(path.getAbsolutePath) + assert(getNormalizedExplain(df1, FormattedMode) === getNormalizedExplain(df2, FormattedMode)) + } + } +} + +class ExplainSuiteAE extends ExplainSuiteHelper with EnableAdaptiveExecutionSuite { + import testImplicits._ + + test("Explain formatted") { + val df1 = Seq((1, 2), (2, 3)).toDF("k", "v1") + val df2 = Seq((2, 3), (1, 1)).toDF("k", "v2") + val testDf = df1.join(df2, "k").groupBy("k").agg(count("v1"), sum("v1"), avg("v2")) + // trigger the final plan for AQE + testDf.collect() + // == Physical Plan == + // AdaptiveSparkPlan (14) + // +- * HashAggregate (13) + // +- CustomShuffleReader (12) + // +- ShuffleQueryStage (11) + // +- Exchange (10) + // +- * HashAggregate (9) + // +- * Project (8) + // +- * BroadcastHashJoin 
Inner BuildRight (7) + // :- * Project (2) + // : +- * LocalTableScan (1) + // +- BroadcastQueryStage (6) + // +- BroadcastExchange (5) + // +- * Project (4) + // +- * LocalTableScan (3) + checkKeywordsExistsInExplain( + testDf, + FormattedMode, + s""" + |(6) BroadcastQueryStage + |Output [2]: [k#x, v2#x] + |Arguments: 0 + |""".stripMargin, + s""" + |(11) ShuffleQueryStage + |Output [5]: [k#x, count#xL, sum#xL, sum#x, count#xL] + |Arguments: 1 + |""".stripMargin, + s""" + |(12) CustomShuffleReader + |Input [5]: [k#x, count#xL, sum#xL, sum#x, count#xL] + |Arguments: coalesced + |""".stripMargin, + s""" + |(14) AdaptiveSparkPlan + |Output [4]: [k#x, count(v1)#xL, sum(v1)#xL, avg(v2)#x] + |Arguments: isFinalPlan=true + |""".stripMargin + ) + } } case class ExplainSingleData(id: Int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala new file mode 100644 index 0000000000000..81c09d169efb2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExpressionsSchemaSuite.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.io.File + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.tags.ExtendedSQLTest + +// scalastyle:off line.size.limit +/** + * End-to-end test cases for SQL schemas of expression examples. + * The golden result file is "spark/sql/core/src/test/resources/sql-functions/sql-expression-schema.md". + * + * To run the entire test suite: + * {{{ + * build/sbt "sql/test-only *ExpressionsSchemaSuite" + * }}} + * + * To re-generate golden files for entire suite, run: + * {{{ + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/test-only *ExpressionsSchemaSuite" + * }}} + * + * For example: + * {{{ + * ... + * @ExpressionDescription( + * usage = "_FUNC_(str, n) - Returns the string which repeats the given string value n times.", + * examples = """ + * Examples: + * > SELECT _FUNC_('123', 2); + * 123123 + * """, + * since = "1.5.0") + * case class StringRepeat(str: Expression, times: Expression) + * ... + * }}} + * + * The format for golden result files look roughly like: + * {{{ + * ... + * | org.apache.spark.sql.catalyst.expressions.StringRepeat | repeat | SELECT repeat('123', 2) | struct | + * ... 
+ * }}} + */ +// scalastyle:on line.size.limit +@ExtendedSQLTest +class ExpressionsSchemaSuite extends QueryTest with SharedSparkSession { + + private val regenerateGoldenFiles: Boolean = System.getenv("SPARK_GENERATE_GOLDEN_FILES") == "1" + + private val baseResourcePath = { + // We use a path based on Spark home for 2 reasons: + // 1. Maven can't get correct resource directory when resources in other jars. + // 2. We test subclasses in the hive-thriftserver module. + java.nio.file.Paths.get(sparkHome, + "sql", "core", "src", "test", "resources", "sql-functions").toFile + } + + private val resultFile = new File(baseResourcePath, "sql-expression-schema.md") + + private val ignoreSet = Set( + // Output name with a random seed + "org.apache.spark.sql.catalyst.expressions.Rand", + "org.apache.spark.sql.catalyst.expressions.Randn") + + /** A single SQL query's SQL and schema. */ + protected case class QueryOutput( + className: String, + funcName: String, + sql: String = "N/A", + schema: String = "N/A") { + override def toString: String = { + s"| $className | $funcName | $sql | $schema |" + } + } + + test("Check schemas for expression examples") { + val exampleRe = """^(.+);\n(?s)(.+)$""".r + val funInfos = spark.sessionState.functionRegistry.listFunction().map { funcId => + spark.sessionState.catalog.lookupFunctionInfo(funcId) + } + + val classFunsMap = funInfos.groupBy(_.getClassName).toSeq.sortBy(_._1) + val outputBuffer = new ArrayBuffer[String] + val outputs = new ArrayBuffer[QueryOutput] + val missingExamples = new ArrayBuffer[String] + + classFunsMap.filterNot(v => ignoreSet.contains(v._1)).foreach { kv => + val className = kv._1 + kv._2.foreach { funInfo => + val example = funInfo.getExamples + val funcName = funInfo.getName.replaceAll("\\|", "|") + if (example == "") { + val queryOutput = QueryOutput(className, funcName) + outputBuffer += queryOutput.toString + outputs += queryOutput + missingExamples += funcName + } + + // If expression exists 'Examples' segment, the first element is 'Examples'. Because + // this test case is only used to print aliases of expressions for double checking. + // Therefore, we only need to output the first SQL and its corresponding schema. 
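
The test above pulls each expression's `Examples` text apart with `exampleRe` and a split on " > ", keeping only the first query that is not a SET command. A self-contained sketch of that parsing step, using the same regex on a made-up examples block (the object name and sample text are illustrative, not part of the suite):

object ExampleParsingSketch {
  def main(args: Array[String]): Unit = {
    // Same regex as the suite: everything before the first ';' is the query,
    // everything after the newline (DOTALL from that point on) is the expected output.
    val exampleRe = """^(.+);\n(?s)(.+)$""".r
    val examples =
      """
    Examples:
      > SELECT repeat('123', 2);
       123123
      > SET spark.sql.ansi.enabled=false;
       spark.sql.ansi.enabled false
      """
    // Drop the "Examples:" header, skip SET commands, keep only the first query.
    examples.split(" > ").tail.filterNot(_.trim.startsWith("SET")).take(1).foreach {
      case exampleRe(sql, output) =>
        println(s"query:  $sql")
        println(s"output: ${output.trim}")
      case other =>
        println(s"unrecognized example fragment: $other")
    }
  }
}
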
+ // Note: We need to filter out the commands that set the parameters, such as: + // SET spark.sql.parser.escapedStringLiterals=true + example.split(" > ").tail.filterNot(_.trim.startsWith("SET")).take(1).foreach { + case exampleRe(sql, _) => + val df = spark.sql(sql) + val escapedSql = sql.replaceAll("\\|", "|") + val schema = df.schema.catalogString.replaceAll("\\|", "|") + val queryOutput = QueryOutput(className, funcName, escapedSql, schema) + outputBuffer += queryOutput.toString + outputs += queryOutput + case _ => + } + } + } + + val header = Seq( + s"", + "## Summary", + s" - Number of queries: ${outputs.size}", + s" - Number of expressions that missing example: ${missingExamples.size}", + s" - Expressions missing examples: ${missingExamples.mkString(",")}", + "## Schema of Built-in Functions", + "| Class name | Function name or alias | Query example | Output schema |", + "| ---------- | ---------------------- | ------------- | ------------- |" + ) + + if (regenerateGoldenFiles) { + val goldenOutput = (header ++ outputBuffer).mkString("\n") + val parent = resultFile.getParentFile + if (!parent.exists()) { + assert(parent.mkdirs(), "Could not create directory: " + parent) + } + stringToFile(resultFile, goldenOutput) + } + + val outputSize = outputs.size + val headerSize = header.size + val expectedOutputs: Seq[QueryOutput] = { + val expectedGoldenOutput = fileToString(resultFile) + val lines = expectedGoldenOutput.split("\n") + val expectedSize = lines.size + + assert(expectedSize == outputSize + headerSize, + s"Expected $expectedSize blocks in result file but got " + + s"${outputSize + headerSize}. Try regenerate the result files.") + + Seq.tabulate(outputSize) { i => + val segments = lines(i + headerSize).split('|') + QueryOutput( + className = segments(1).trim, + funcName = segments(2).trim, + sql = segments(3).trim, + schema = segments(4).trim) + } + } + + // Compare results. 
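
The block above follows the usual golden-file round trip: regenerate the checked-in result when SPARK_GENERATE_GOLDEN_FILES=1, otherwise read it back and compare line by line. A minimal, generic sketch of that pattern, with a hypothetical object name and plain java.nio in place of the suite's fileToString/stringToFile helpers:

import java.io.File
import java.nio.charset.StandardCharsets
import java.nio.file.Files

object GoldenFileSketch {
  private val regenerate = sys.env.get("SPARK_GENERATE_GOLDEN_FILES").contains("1")

  def checkOrRegenerate(goldenFile: File, actual: Seq[String]): Unit = {
    if (regenerate) {
      // Overwrite the golden file with the freshly computed lines.
      Files.write(goldenFile.toPath,
        actual.mkString("\n").getBytes(StandardCharsets.UTF_8))
    } else {
      // Otherwise compare against the previously committed golden output.
      val expected = new String(
        Files.readAllBytes(goldenFile.toPath), StandardCharsets.UTF_8).split("\n").toSeq
      assert(expected.size == actual.size,
        s"expected ${expected.size} lines but got ${actual.size}; regenerate the golden file")
      expected.zip(actual).foreach { case (e, a) =>
        assert(e == a, s"golden file mismatch: '$e' vs '$a'")
      }
    }
  }
}
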
+ assert(expectedOutputs.size == outputSize, + "The number of queries not equals the number of expected queries.") + + outputs.zip(expectedOutputs).foreach { case (output, expected) => + assert(expected.sql == output.sql, "SQL query did not match") + assert(expected.schema == output.schema, s"Schema did not match for query ${expected.sql}") + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index c870958128483..e9bff64d72fc3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql import java.io.{File, FileNotFoundException} +import java.net.URI import java.nio.file.{Files, StandardOpenOption} import java.util.Locale import scala.collection.mutable -import org.apache.hadoop.fs.Path +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{LocalFileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} @@ -631,13 +633,15 @@ class FileBasedDataSourceSuite extends QueryTest assert(fileList.toSet === expectedFileList.toSet) - val fileList2 = spark.read.format("binaryFile") - .option("recursiveFileLookup", true) - .option("pathGlobFilter", "*.bin") - .load(dataPath) - .select("path").collect().map(_.getString(0)) + withClue("SPARK-32368: 'recursiveFileLookup' and 'pathGlobFilter' can be case insensitive") { + val fileList2 = spark.read.format("binaryFile") + .option("RecuRsivefileLookup", true) + .option("PaThglobFilter", "*.bin") + .load(dataPath) + .select("path").collect().map(_.getString(0)) - assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet) + assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet) + } } } @@ -842,6 +846,62 @@ class FileBasedDataSourceSuite extends QueryTest } } } + + test("SPARK-31935: Hadoop file system config should be effective in data source options") { + Seq("parquet", "").foreach { format => + withSQLConf( + SQLConf.USE_V1_SOURCE_LIST.key -> format, + "fs.file.impl" -> classOf[FakeFileSystemRequiringDSOption].getName, + "fs.file.impl.disable.cache" -> "true") { + withTempDir { dir => + val path = "file:" + dir.getCanonicalPath.stripPrefix("file:") + spark.range(10).write.option("ds_option", "value").mode("overwrite").parquet(path) + checkAnswer( + spark.read.option("ds_option", "value").parquet(path), spark.range(10).toDF()) + } + } + } + } + + test("SPARK-31116: Select nested schema with case insensitive mode") { + // This test case failed at only Parquet. ORC is added for test coverage parity. 
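
The SPARK-31935 test above checks that Hadoop FileSystem settings supplied as per-read/per-write data source options reach the underlying FileSystem. A rough, self-contained sketch of the same idea outside the test harness; the class name, option key, and paths are made up, and it assumes local mode with this branch's option-propagation behavior in place:

import java.net.URI
import java.nio.file.Files
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.LocalFileSystem
import org.apache.spark.sql.SparkSession

// Hypothetical FileSystem that fails fast unless the per-write option reached its config.
class OptionCheckingFileSystem extends LocalFileSystem {
  override def initialize(name: URI, conf: Configuration): Unit = {
    super.initialize(name, conf)
    require(conf.get("custom.option", "") == "expected", "custom.option was not propagated")
  }
}

object DsOptionPropagationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]").appName("ds-option-sketch").getOrCreate()
    // Route "file:" URIs through the checking FileSystem for this SQL session only.
    spark.conf.set("fs.file.impl", classOf[OptionCheckingFileSystem].getName)
    spark.conf.set("fs.file.impl.disable.cache", "true")
    val dir = Files.createTempDirectory("ds-option-sketch").toFile.getCanonicalPath
    // The write succeeds only if "custom.option" is propagated into the Hadoop conf.
    spark.range(5).write
      .option("custom.option", "expected")
      .mode("overwrite")
      .parquet(s"file:$dir")
    spark.stop()
  }
}
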
+ Seq("orc", "parquet").foreach { format => + Seq("true", "false").foreach { nestedSchemaPruningEnabled => + withSQLConf( + SQLConf.CASE_SENSITIVE.key -> "false", + SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> nestedSchemaPruningEnabled) { + withTempPath { dir => + val path = dir.getCanonicalPath + + // Prepare values for testing nested parquet data + spark + .range(1L) + .selectExpr("NAMED_STRUCT('lowercase', id, 'camelCase', id + 1) AS StructColumn") + .write + .format(format) + .save(path) + + val exactSchema = "StructColumn struct" + + checkAnswer(spark.read.schema(exactSchema).format(format).load(path), Row(Row(0, 1))) + + // In case insensitive manner, parquet's column cases are ignored + val innerColumnCaseInsensitiveSchema = + "StructColumn struct" + checkAnswer( + spark.read.schema(innerColumnCaseInsensitiveSchema).format(format).load(path), + Row(Row(0, 1))) + + val rootColumnCaseInsensitiveSchema = + "structColumn struct" + checkAnswer( + spark.read.schema(rootColumnCaseInsensitiveSchema).format(format).load(path), + Row(Row(0, 1))) + } + } + } + } + } } object TestingUDT { @@ -872,3 +932,10 @@ object TestingUDT { override def userClass: Class[NullData] = classOf[NullData] } } + +class FakeFileSystemRequiringDSOption extends LocalFileSystem { + override def initialize(name: URI, conf: Configuration): Unit = { + super.initialize(name, conf) + require(conf.get("ds_option", "") == "value") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala index 96a0eb3e32e9b..8f449037a5e01 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala @@ -343,6 +343,19 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { assert(msg2.contains("Only one generator allowed per aggregate clause")) } } + + test("SPARK-30998: Unsupported nested inner generators") { + val errMsg = intercept[AnalysisException] { + sql("SELECT array(array(1, 2), array(3)) v").select(explode(explode($"v"))).collect + }.getMessage + assert(errMsg.contains("Generators are not supported when it's nested in expressions, " + + "but got: explode(explode(v))")) + } + + test("SPARK-30997: generators in aggregate expressions for dataframe") { + val df = Seq(1, 2, 3).toDF("v") + checkAnswer(df.select(explode(array(min($"v"), max($"v")))), Row(1) :: Row(3) :: Nil) + } } case class EmptyGenerator() extends Generator { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index 51150a1b38b49..80346b350c142 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -75,14 +75,7 @@ object IntegratedUDFTestUtils extends SQLHelper { import scala.sys.process._ private lazy val pythonPath = sys.env.getOrElse("PYTHONPATH", "") - private lazy val sparkHome = if (sys.props.contains(Tests.IS_TESTING.key)) { - assert(sys.props.contains("spark.test.home") || - sys.env.contains("SPARK_HOME"), "spark.test.home or SPARK_HOME is not set.") - sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) - } else { - assert(sys.env.contains("SPARK_HOME"), "SPARK_HOME is not set.") - sys.env("SPARK_HOME") - } + // Note that we will directly refer pyspark's source, not the zip from a regular build. 
// It is possible the test is being ran without the build. private lazy val sourcePath = Paths.get(sparkHome, "python").toAbsolutePath @@ -204,7 +197,7 @@ object IntegratedUDFTestUtils extends SQLHelper { lazy val pythonExec: String = { val pythonExec = sys.env.getOrElse( - "PYSPARK_DRIVER_PYTHON", sys.env.getOrElse("PYSPARK_PYTHON", "python3.6")) + "PYSPARK_DRIVER_PYTHON", sys.env.getOrElse("PYSPARK_PYTHON", "python3")) if (TestUtils.testCommandAvailable(pythonExec)) { pythonExec } else { @@ -337,7 +330,7 @@ object IntegratedUDFTestUtils extends SQLHelper { input.toString }, StringType, - inputSchemas = Seq.fill(1)(None), + inputEncoders = Seq.fill(1)(None), name = Some(name)) { override def apply(exprs: Column*): Column = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala index f68c416941266..71f7a708ad681 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala @@ -570,4 +570,31 @@ class JoinHintSuite extends PlanTest with SharedSparkSession with AdaptiveSparkP assert(joinHints == expectedHints) } } + + test("SPARK-32220: Non Cartesian Product Join Result Correct with SHUFFLE_REPLICATE_NL hint") { + withTempView("t1", "t2") { + Seq((1, "4"), (2, "2")).toDF("key", "value").createTempView("t1") + Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2") + val df1 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key = t2.key") + val df2 = sql("SELECT * from t1 join t2 ON t1.key = t2.key") + assert(df1.collect().size == df2.collect().size) + + val df3 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2") + val df4 = sql("SELECT * from t1 join t2") + assert(df3.collect().size == df4.collect().size) + + val df5 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < t2.key") + val df6 = sql("SELECT * from t1 join t2 ON t1.key < t2.key") + assert(df5.collect().size == df6.collect().size) + + val df7 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t1.key < 2") + val df8 = sql("SELECT * from t1 join t2 ON t1.key < 2") + assert(df7.collect().size == df8.collect().size) + + + val df9 = sql("SELECT /*+ shuffle_replicate_nl(t1) */ * from t1 join t2 ON t2.key < 2") + val df10 = sql("SELECT * from t1 join t2 ON t2.key < 2") + assert(df9.collect().size == df10.collect().size) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index f45bd950040ce..fe6775cc7f9b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -401,94 +401,96 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan } test("full outer join") { - upperCaseData.where('N <= 4).createOrReplaceTempView("`left`") - upperCaseData.where('N >= 3).createOrReplaceTempView("`right`") + withTempView("`left`", "`right`") { + upperCaseData.where('N <= 4).createOrReplaceTempView("`left`") + upperCaseData.where('N >= 3).createOrReplaceTempView("`right`") - val left = UnresolvedRelation(TableIdentifier("left")) - val right = UnresolvedRelation(TableIdentifier("right")) + val left = UnresolvedRelation(TableIdentifier("left")) + val right = UnresolvedRelation(TableIdentifier("right")) - checkAnswer( - left.join(right, $"left.N" === $"right.N", "full"), - Row(1, "A", 
null, null) :: - Row(2, "B", null, null) :: - Row(3, "C", 3, "C") :: - Row(4, "D", 4, "D") :: - Row(null, null, 5, "E") :: - Row(null, null, 6, "F") :: Nil) + checkAnswer( + left.join(right, $"left.N" === $"right.N", "full"), + Row(1, "A", null, null) :: + Row(2, "B", null, null) :: + Row(3, "C", 3, "C") :: + Row(4, "D", 4, "D") :: + Row(null, null, 5, "E") :: + Row(null, null, 6, "F") :: Nil) - checkAnswer( - left.join(right, ($"left.N" === $"right.N") && ($"left.N" =!= 3), "full"), - Row(1, "A", null, null) :: - Row(2, "B", null, null) :: - Row(3, "C", null, null) :: - Row(null, null, 3, "C") :: - Row(4, "D", 4, "D") :: - Row(null, null, 5, "E") :: - Row(null, null, 6, "F") :: Nil) + checkAnswer( + left.join(right, ($"left.N" === $"right.N") && ($"left.N" =!= 3), "full"), + Row(1, "A", null, null) :: + Row(2, "B", null, null) :: + Row(3, "C", null, null) :: + Row(null, null, 3, "C") :: + Row(4, "D", 4, "D") :: + Row(null, null, 5, "E") :: + Row(null, null, 6, "F") :: Nil) - checkAnswer( - left.join(right, ($"left.N" === $"right.N") && ($"right.N" =!= 3), "full"), - Row(1, "A", null, null) :: - Row(2, "B", null, null) :: - Row(3, "C", null, null) :: - Row(null, null, 3, "C") :: - Row(4, "D", 4, "D") :: - Row(null, null, 5, "E") :: - Row(null, null, 6, "F") :: Nil) - - // Make sure we are UnknownPartitioning as the outputPartitioning for the outer join - // operator. - checkAnswer( - sql( - """ - |SELECT l.a, count(*) - |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N) - |GROUP BY l.a - """. - stripMargin), - Row(null, 10)) + checkAnswer( + left.join(right, ($"left.N" === $"right.N") && ($"right.N" =!= 3), "full"), + Row(1, "A", null, null) :: + Row(2, "B", null, null) :: + Row(3, "C", null, null) :: + Row(null, null, 3, "C") :: + Row(4, "D", 4, "D") :: + Row(null, null, 5, "E") :: + Row(null, null, 6, "F") :: Nil) - checkAnswer( - sql( - """ - |SELECT r.N, count(*) + // Make sure we are UnknownPartitioning as the outputPartitioning for the outer join + // operator. + checkAnswer( + sql( + """ + |SELECT l.a, count(*) |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N) - |GROUP BY r.N - """.stripMargin), - Row + |GROUP BY l.a + """. + stripMargin), + Row(null, 10)) + + checkAnswer( + sql( + """ + |SELECT r.N, count(*) + |FROM allNulls l FULL OUTER JOIN upperCaseData r ON (l.a = r.N) + |GROUP BY r.N + """.stripMargin), + Row (1, 1) :: - Row(2, 1) :: - Row(3, 1) :: - Row(4, 1) :: - Row(5, 1) :: - Row(6, 1) :: - Row(null, 4) :: Nil) + Row(2, 1) :: + Row(3, 1) :: + Row(4, 1) :: + Row(5, 1) :: + Row(6, 1) :: + Row(null, 4) :: Nil) - checkAnswer( - sql( - """ - |SELECT l.N, count(*) - |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a) - |GROUP BY l.N - """.stripMargin), - Row(1 - , 1) :: - Row(2, 1) :: - Row(3, 1) :: - Row(4, 1) :: - Row(5, 1) :: - Row(6, 1) :: - Row(null, 4) :: Nil) + checkAnswer( + sql( + """ + |SELECT l.N, count(*) + |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a) + |GROUP BY l.N + """.stripMargin), + Row(1 + , 1) :: + Row(2, 1) :: + Row(3, 1) :: + Row(4, 1) :: + Row(5, 1) :: + Row(6, 1) :: + Row(null, 4) :: Nil) - checkAnswer( - sql( - """ - |SELECT r.a, count(*) - |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a) - |GROUP BY r.a - """. - stripMargin), - Row(null, 10)) + checkAnswer( + sql( + """ + |SELECT r.a, count(*) + |FROM upperCaseData l FULL OUTER JOIN allNulls r ON (l.N = r.a) + |GROUP BY r.a + """. 
+ stripMargin), + Row(null, 10)) + } } test("broadcasted existence join operator selection") { @@ -614,63 +616,65 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan } test("cross join detection") { - testData.createOrReplaceTempView("A") - testData.createOrReplaceTempView("B") - testData2.createOrReplaceTempView("C") - testData3.createOrReplaceTempView("D") - upperCaseData.where('N >= 3).createOrReplaceTempView("`right`") - val cartesianQueries = Seq( - /** The following should error out since there is no explicit cross join */ - "SELECT * FROM testData inner join testData2", - "SELECT * FROM testData left outer join testData2", - "SELECT * FROM testData right outer join testData2", - "SELECT * FROM testData full outer join testData2", - "SELECT * FROM testData, testData2", - "SELECT * FROM testData, testData2 where testData.key = 1 and testData2.a = 22", - /** The following should fail because after reordering there are cartesian products */ - "select * from (A join B on (A.key = B.key)) join D on (A.key=D.a) join C", - "select * from ((A join B on (A.key = B.key)) join C) join D on (A.key = D.a)", - /** Cartesian product involving C, which is not involved in a CROSS join */ - "select * from ((A join B on (A.key = B.key)) cross join D) join C on (A.key = D.a)"); - - def checkCartesianDetection(query: String): Unit = { - val e = intercept[Exception] { - checkAnswer(sql(query), Nil); + withTempView("A", "B", "C", "D") { + testData.createOrReplaceTempView("A") + testData.createOrReplaceTempView("B") + testData2.createOrReplaceTempView("C") + testData3.createOrReplaceTempView("D") + upperCaseData.where('N >= 3).createOrReplaceTempView("`right`") + val cartesianQueries = Seq( + /** The following should error out since there is no explicit cross join */ + "SELECT * FROM testData inner join testData2", + "SELECT * FROM testData left outer join testData2", + "SELECT * FROM testData right outer join testData2", + "SELECT * FROM testData full outer join testData2", + "SELECT * FROM testData, testData2", + "SELECT * FROM testData, testData2 where testData.key = 1 and testData2.a = 22", + /** The following should fail because after reordering there are cartesian products */ + "select * from (A join B on (A.key = B.key)) join D on (A.key=D.a) join C", + "select * from ((A join B on (A.key = B.key)) join C) join D on (A.key = D.a)", + /** Cartesian product involving C, which is not involved in a CROSS join */ + "select * from ((A join B on (A.key = B.key)) cross join D) join C on (A.key = D.a)"); + + def checkCartesianDetection(query: String): Unit = { + val e = intercept[Exception] { + checkAnswer(sql(query), Nil); + } + assert(e.getMessage.contains("Detected implicit cartesian product")) } - assert(e.getMessage.contains("Detected implicit cartesian product")) - } - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { - cartesianQueries.foreach(checkCartesianDetection) - } + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { + cartesianQueries.foreach(checkCartesianDetection) + } - // Check that left_semi, left_anti, existence joins without conditions do not throw - // an exception if cross joins are disabled - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { - checkAnswer( - sql("SELECT * FROM testData3 LEFT SEMI JOIN testData2"), - Row(1, null) :: Row (2, 2) :: Nil) - checkAnswer( - sql("SELECT * FROM testData3 LEFT ANTI JOIN testData2"), - Nil) - checkAnswer( - sql( - """ - |SELECT a FROM testData3 - |WHERE - | EXISTS (SELECT * FROM testData) - 
|OR - | EXISTS (SELECT * FROM testData2)""".stripMargin), - Row(1) :: Row(2) :: Nil) - checkAnswer( - sql( - """ - |SELECT key FROM testData - |WHERE - | key IN (SELECT a FROM testData2) - |OR - | key IN (SELECT a FROM testData3)""".stripMargin), - Row(1) :: Row(2) :: Row(3) :: Nil) + // Check that left_semi, left_anti, existence joins without conditions do not throw + // an exception if cross joins are disabled + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "false") { + checkAnswer( + sql("SELECT * FROM testData3 LEFT SEMI JOIN testData2"), + Row(1, null) :: Row (2, 2) :: Nil) + checkAnswer( + sql("SELECT * FROM testData3 LEFT ANTI JOIN testData2"), + Nil) + checkAnswer( + sql( + """ + |SELECT a FROM testData3 + |WHERE + | EXISTS (SELECT * FROM testData) + |OR + | EXISTS (SELECT * FROM testData2)""".stripMargin), + Row(1) :: Row(2) :: Nil) + checkAnswer( + sql( + """ + |SELECT key FROM testData + |WHERE + | key IN (SELECT a FROM testData2) + |OR + | key IN (SELECT a FROM testData3)""".stripMargin), + Row(1) :: Row(2) :: Row(3) :: Nil) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index fd1e9e309558e..5e3931cecf1c5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -429,57 +429,69 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { } test("from_json - array of arrays") { - val jsonDF = Seq("[[1], [2, 3], [4, 5, 6]]").toDF("a") - val schema = new ArrayType(ArrayType(IntegerType, false), false) - jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = Seq("[[1], [2, 3], [4, 5, 6]]").toDF("a") + val schema = new ArrayType(ArrayType(IntegerType, false), false) + jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select json[0][0], json[1][1], json[2][2] from jsonTable"), - Seq(Row(1, 3, 6))) + checkAnswer( + sql("select json[0][0], json[1][1], json[2][2] from jsonTable"), + Seq(Row(1, 3, 6))) + } } test("from_json - array of arrays - malformed row") { - val jsonDF = Seq("[[1], [2, 3], 4, 5, 6]]").toDF("a") - val schema = new ArrayType(ArrayType(IntegerType, false), false) - jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = Seq("[[1], [2, 3], 4, 5, 6]]").toDF("a") + val schema = new ArrayType(ArrayType(IntegerType, false), false) + jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") - checkAnswer(sql("select json[0] from jsonTable"), Seq(Row(null))) + checkAnswer(sql("select json[0] from jsonTable"), Seq(Row(null))) + } } test("from_json - array of structs") { - val jsonDF = Seq("""[{"a":1}, {"a":2}, {"a":3}]""").toDF("a") - val schema = new ArrayType(new StructType().add("a", IntegerType), false) - jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = Seq("""[{"a":1}, {"a":2}, {"a":3}]""").toDF("a") + val schema = new ArrayType(new StructType().add("a", IntegerType), false) + jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select json[0], json[1], json[2] from jsonTable"), - Seq(Row(Row(1), Row(2), Row(3)))) + checkAnswer( + sql("select json[0], json[1], json[2] from 
jsonTable"), + Seq(Row(Row(1), Row(2), Row(3)))) + } } test("from_json - array of structs - malformed row") { - val jsonDF = Seq("""[{"a":1}, {"a:2}, {"a":3}]""").toDF("a") - val schema = new ArrayType(new StructType().add("a", IntegerType), false) - jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = Seq("""[{"a":1}, {"a:2}, {"a":3}]""").toDF("a") + val schema = new ArrayType(new StructType().add("a", IntegerType), false) + jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") - checkAnswer(sql("select json[0], json[1]from jsonTable"), Seq(Row(null, null))) + checkAnswer(sql("select json[0], json[1]from jsonTable"), Seq(Row(null, null))) + } } test("from_json - array of maps") { - val jsonDF = Seq("""[{"a":1}, {"b":2}]""").toDF("a") - val schema = new ArrayType(MapType(StringType, IntegerType, false), false) - jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = Seq("""[{"a":1}, {"b":2}]""").toDF("a") + val schema = new ArrayType(MapType(StringType, IntegerType, false), false) + jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") - checkAnswer( - sql("""select json[0], json[1] from jsonTable"""), - Seq(Row(Map("a" -> 1), Map("b" -> 2)))) + checkAnswer( + sql("""select json[0], json[1] from jsonTable"""), + Seq(Row(Map("a" -> 1), Map("b" -> 2)))) + } } test("from_json - array of maps - malformed row") { - val jsonDF = Seq("""[{"a":1} "b":2}]""").toDF("a") - val schema = new ArrayType(MapType(StringType, IntegerType, false), false) - jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = Seq("""[{"a":1} "b":2}]""").toDF("a") + val schema = new ArrayType(MapType(StringType, IntegerType, false), false) + jsonDF.select(from_json($"a", schema) as "json").createOrReplaceTempView("jsonTable") - checkAnswer(sql("""select json[0] from jsonTable"""), Seq(Row(null))) + checkAnswer(sql("""select json[0] from jsonTable"""), Seq(Row(null))) + } } test("to_json - array of primitive types") { @@ -653,4 +665,65 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { assert(json_tuple_result === len) } } + + test("SPARK-31065: schema_of_json - null and empty strings as strings") { + Seq("""{"id": null}""", """{"id": ""}""").foreach { input => + checkAnswer( + spark.range(1).select(schema_of_json(input)), + Seq(Row("struct"))) + } + } + + test("SPARK-31065: schema_of_json - 'dropFieldIfAllNull' option") { + val options = Map("dropFieldIfAllNull" -> "true") + // Structs + checkAnswer( + spark.range(1).select( + schema_of_json( + lit("""{"id": "a", "drop": {"drop": null}}"""), + options.asJava)), + Seq(Row("struct"))) + + // Array of structs + checkAnswer( + spark.range(1).select( + schema_of_json( + lit("""[{"id": "a", "drop": {"drop": null}}]"""), + options.asJava)), + Seq(Row("array>"))) + + // Other types are not affected. 
+ checkAnswer( + spark.range(1).select( + schema_of_json( + lit("""null"""), + options.asJava)), + Seq(Row("string"))) + } + + test("optional datetime parser does not affect json time formatting") { + val s = "2015-08-26 12:34:46" + def toDF(p: String): DataFrame = sql( + s""" + |SELECT + | to_json( + | named_struct('time', timestamp'$s'), map('timestampFormat', "$p") + | ) + | """.stripMargin) + checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]")) + } + + test("SPARK-33134: return partial results only for root JSON objects") { + val st = new StructType() + .add("c1", LongType) + .add("c2", ArrayType(new StructType().add("c3", LongType).add("c4", StringType))) + val df1 = Seq("""{"c2": [19], "c1": 123456}""").toDF("c0") + checkAnswer(df1.select(from_json($"c0", st)), Row(Row(123456, null))) + val df2 = Seq("""{"data": {"c2": [19], "c1": 123456}}""").toDF("c0") + checkAnswer(df2.select(from_json($"c0", new StructType().add("data", st))), Row(Row(null))) + val df3 = Seq("""[{"c2": [19], "c1": 123456}]""").toDF("c0") + checkAnswer(df3.select(from_json($"c0", ArrayType(st))), Row(null)) + val df4 = Seq("""{"c2": [19]}""").toDF("c0") + checkAnswer(df4.select(from_json($"c0", MapType(StringType, st))), Row(null)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala index 5ab06b1ebebf6..16edf35bb99f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala @@ -34,9 +34,11 @@ class MiscFunctionsSuite extends QueryTest with SharedSparkSession { } test("version") { + val df = sql("SELECT version()") checkAnswer( - Seq("").toDF("a").selectExpr("version()"), + df, Row(SPARK_VERSION_SHORT + " " + SPARK_REVISION)) + assert(df.schema.fieldNames === Seq("version()")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 4a21ae9242039..e52d2262a6bf8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.util.{Locale, TimeZone} +import java.util.TimeZone import scala.collection.JavaConverters._ @@ -35,11 +35,6 @@ abstract class QueryTest extends PlanTest { protected def spark: SparkSession - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) - /** * Runs the plan and makes sure the answer contains all of the keywords. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala index aab2ae4afc7f5..a1799829932b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala @@ -24,14 +24,32 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType} +@deprecated("This suite is deprecated to silent compiler deprecation warnings", "2.0.0") class SQLContextSuite extends SparkFunSuite with SharedSparkContext { object DummyRule extends Rule[LogicalPlan] { def apply(p: LogicalPlan): LogicalPlan = p } + test("getOrCreate instantiates SQLContext") { + val sqlContext = SQLContext.getOrCreate(sc) + assert(sqlContext != null, "SQLContext.getOrCreate returned null") + assert(SQLContext.getOrCreate(sc).eq(sqlContext), + "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate") + } + + test("getOrCreate return the original SQLContext") { + val sqlContext = SQLContext.getOrCreate(sc) + val newSession = sqlContext.newSession() + assert(SQLContext.getOrCreate(sc).eq(sqlContext), + "SQLContext.getOrCreate after explicitly created SQLContext did not return the context") + SparkSession.setActiveSession(newSession.sparkSession) + assert(SQLContext.getOrCreate(sc).eq(newSession), + "SQLContext.getOrCreate after explicitly setActive() did not return the active context") + } + test("Sessions of SQLContext") { - val sqlContext = SparkSession.builder().sparkContext(sc).getOrCreate().sqlContext + val sqlContext = SQLContext.getOrCreate(sc) val session1 = sqlContext.newSession() val session2 = sqlContext.newSession() @@ -59,13 +77,13 @@ class SQLContextSuite extends SparkFunSuite with SharedSparkContext { } test("Catalyst optimization passes are modifiable at runtime") { - val sqlContext = SparkSession.builder().sparkContext(sc).getOrCreate().sqlContext + val sqlContext = SQLContext.getOrCreate(sc) sqlContext.experimental.extraOptimizations = Seq(DummyRule) assert(sqlContext.sessionState.optimizer.batches.flatMap(_.rules).contains(DummyRule)) } test("get all tables") { - val sqlContext = SparkSession.builder().sparkContext(sc).getOrCreate().sqlContext + val sqlContext = SQLContext.getOrCreate(sc) val df = sqlContext.range(10) df.createOrReplaceTempView("listtablessuitetable") assert( @@ -82,7 +100,7 @@ class SQLContextSuite extends SparkFunSuite with SharedSparkContext { } test("getting all tables with a database name has no impact on returned table names") { - val sqlContext = SparkSession.builder().sparkContext(sc).getOrCreate().sqlContext + val sqlContext = SQLContext.getOrCreate(sc) val df = sqlContext.range(10) df.createOrReplaceTempView("listtablessuitetable") assert( @@ -99,7 +117,7 @@ class SQLContextSuite extends SparkFunSuite with SharedSparkContext { } test("query the returned DataFrame of tables") { - val sqlContext = SparkSession.builder().sparkContext(sc).getOrCreate().sqlContext + val sqlContext = SQLContext.getOrCreate(sc) val df = sqlContext.range(10) df.createOrReplaceTempView("listtablessuitetable") @@ -109,7 +127,7 @@ class SQLContextSuite extends SparkFunSuite with SharedSparkContext { StructField("isTemporary", BooleanType, false) :: Nil) Seq(sqlContext.tables(), sqlContext.sql("SHOW TABLes")).foreach { - tableDF => + case tableDF => assert(expectedSchema === tableDF.schema) 
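
The new getOrCreate tests above pin down two properties: the SQLContext returned for a given SparkContext is cached, and newSession() provides isolated session state on top of the same context. A small sketch of that contract using only the (deprecated) public API; the object and view names are made up:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SqlContextGetOrCreateSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("sqlcontext-sketch"))
    val ctx1 = SQLContext.getOrCreate(sc)
    val ctx2 = SQLContext.getOrCreate(sc)
    assert(ctx1 eq ctx2)                 // cached per SparkContext
    val isolated = ctx1.newSession()
    assert(!(isolated eq ctx1))          // separate temp views, UDFs and conf
    isolated.range(3).createOrReplaceTempView("t_sketch")
    assert(!ctx1.tableNames().contains("t_sketch"))  // not visible to the original session
    sc.stop()
  }
}
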
tableDF.createOrReplaceTempView("tables") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index a6dae9a269740..8039c9b6f04b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -22,15 +22,13 @@ import java.net.{MalformedURLException, URL} import java.sql.{Date, Timestamp} import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.parallel.immutable.ParVector - import org.apache.spark.{AccumulatorSuite, SparkException} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, Partial} -import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation +import org.apache.spark.sql.catalyst.optimizer.{ConvertToLocalRelation, NestedColumnAliasingSuite} +import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.catalyst.util.StringUtils -import org.apache.spark.sql.execution.HiveResult.hiveResultString import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec @@ -52,13 +50,15 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark setupTestData() test("SPARK-8010: promote numeric to string") { - val df = Seq((1, 1)).toDF("key", "value") - df.createOrReplaceTempView("src") - val queryCaseWhen = sql("select case when true then 1.0 else '1' end from src ") - val queryCoalesce = sql("select coalesce(null, 1, '1') from src ") + withTempView("src") { + val df = Seq((1, 1)).toDF("key", "value") + df.createOrReplaceTempView("src") + val queryCaseWhen = sql("select case when true then 1.0 else '1' end from src ") + val queryCoalesce = sql("select coalesce(null, 1, '1') from src ") - checkAnswer(queryCaseWhen, Row("1.0") :: Nil) - checkAnswer(queryCoalesce, Row("1") :: Nil) + checkAnswer(queryCaseWhen, Row("1.0") :: Nil) + checkAnswer(queryCoalesce, Row("1") :: Nil) + } } test("show functions") { @@ -125,107 +125,36 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } } - test("using _FUNC_ instead of function names in examples") { - val exampleRe = "(>.*;)".r - val setStmtRe = "(?i)^(>\\s+set\\s+).+".r - val ignoreSet = Set( - // Examples for CaseWhen show simpler syntax: - // `CASE WHEN ... THEN ... WHEN ... THEN ... END` - "org.apache.spark.sql.catalyst.expressions.CaseWhen", - // _FUNC_ is replaced by `locate` but `locate(... 
IN ...)` is not supported - "org.apache.spark.sql.catalyst.expressions.StringLocate", - // _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)` - "org.apache.spark.sql.catalyst.expressions.Remainder", - // Examples demonstrate alternative names, see SPARK-20749 - "org.apache.spark.sql.catalyst.expressions.Length") - spark.sessionState.functionRegistry.listFunction().foreach { funcId => - val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) - val className = info.getClassName - withClue(s"Expression class '$className'") { - val exprExamples = info.getOriginalExamples - if (!exprExamples.isEmpty && !ignoreSet.contains(className)) { - assert(exampleRe.findAllIn(exprExamples).toIterable - .filter(setStmtRe.findFirstIn(_).isEmpty) // Ignore SET commands - .forall(_.contains("_FUNC_"))) - } - } - } - } - - test("check outputs of expression examples") { - def unindentAndTrim(s: String): String = { - s.replaceAll("\n\\s+", "\n").trim - } - val beginSqlStmtRe = " > ".r - val endSqlStmtRe = ";\n".r - def checkExampleSyntax(example: String): Unit = { - val beginStmtNum = beginSqlStmtRe.findAllIn(example).length - val endStmtNum = endSqlStmtRe.findAllIn(example).length - assert(beginStmtNum === endStmtNum, - "The number of ` > ` does not match to the number of `;`") - } - val exampleRe = """^(.+);\n(?s)(.+)$""".r - val ignoreSet = Set( - // One of examples shows getting the current timestamp - "org.apache.spark.sql.catalyst.expressions.UnixTimestamp", - // Random output without a seed - "org.apache.spark.sql.catalyst.expressions.Rand", - "org.apache.spark.sql.catalyst.expressions.Randn", - "org.apache.spark.sql.catalyst.expressions.Shuffle", - "org.apache.spark.sql.catalyst.expressions.Uuid", - // The example calls methods that return unstable results. - "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection") - - val parFuncs = new ParVector(spark.sessionState.functionRegistry.listFunction().toVector) - parFuncs.foreach { funcId => - // Examples can change settings. We clone the session to prevent tests clashing. 
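
The removed example-output test clones the session because example queries may SET configs, and concurrently running examples must not clash. A minimal sketch of that isolation idea using the public newSession() (cloneSession, which the removed test uses, additionally copies the current session state); names are illustrative:

import org.apache.spark.sql.SparkSession

object SessionIsolationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]").appName("session-isolation-sketch").getOrCreate()
    val original = spark.conf.get("spark.sql.shuffle.partitions")
    val isolated = spark.newSession()
    // Conf changes made in the isolated session do not leak back to the parent.
    isolated.conf.set("spark.sql.shuffle.partitions", "7")
    assert(isolated.conf.get("spark.sql.shuffle.partitions") == "7")
    assert(spark.conf.get("spark.sql.shuffle.partitions") == original)
    spark.stop()
  }
}
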
- val clonedSpark = spark.cloneSession() - val info = clonedSpark.sessionState.catalog.lookupFunctionInfo(funcId) - val className = info.getClassName - if (!ignoreSet.contains(className)) { - withClue(s"Function '${info.getName}', Expression class '$className'") { - val example = info.getExamples - checkExampleSyntax(example) - example.split(" > ").toList.foreach(_ match { - case exampleRe(sql, output) => - val df = clonedSpark.sql(sql) - val actual = unindentAndTrim( - hiveResultString(df.queryExecution.executedPlan).mkString("\n")) - val expected = unindentAndTrim(output) - assert(actual === expected) - case _ => - }) - } - } - } - } - test("SPARK-6743: no columns from cache") { - Seq( - (83, 0, 38), - (26, 0, 79), - (43, 81, 24) - ).toDF("a", "b", "c").createOrReplaceTempView("cachedData") + withTempView("cachedData") { + Seq( + (83, 0, 38), + (26, 0, 79), + (43, 81, 24) + ).toDF("a", "b", "c").createOrReplaceTempView("cachedData") - spark.catalog.cacheTable("cachedData") - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { - checkAnswer( - sql("SELECT t1.b FROM cachedData, cachedData t1 GROUP BY t1.b"), - Row(0) :: Row(81) :: Nil) + spark.catalog.cacheTable("cachedData") + withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + checkAnswer( + sql("SELECT t1.b FROM cachedData, cachedData t1 GROUP BY t1.b"), + Row(0) :: Row(81) :: Nil) + } } } test("self join with aliases") { - Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str").createOrReplaceTempView("df") + withTempView("df") { + Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str").createOrReplaceTempView("df") - checkAnswer( - sql( - """ - |SELECT x.str, COUNT(*) - |FROM df x JOIN df y ON x.str = y.str - |GROUP BY x.str + checkAnswer( + sql( + """ + |SELECT x.str, COUNT(*) + |FROM df x JOIN df y ON x.str = y.str + |GROUP BY x.str """.stripMargin), - Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil) + Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil) + } } test("support table.star") { @@ -239,6 +168,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("self join with alias in agg") { + withTempView("df") { Seq(1, 2, 3) .map(i => (i, i.toString)) .toDF("int", "str") @@ -246,14 +176,15 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark .agg($"str", count("str").as("strCount")) .createOrReplaceTempView("df") - checkAnswer( - sql( - """ - |SELECT x.str, SUM(x.strCount) - |FROM df x JOIN df y ON x.str = y.str - |GROUP BY x.str + checkAnswer( + sql( + """ + |SELECT x.str, SUM(x.strCount) + |FROM df x JOIN df y ON x.str = y.str + |GROUP BY x.str """.stripMargin), - Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil) + Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil) + } } test("SPARK-8668 expr function") { @@ -300,41 +231,47 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("grouping on nested fields") { - spark.read - .json(Seq("""{"nested": {"attribute": 1}, "value": 2}""").toDS()) - .createOrReplaceTempView("rows") + withTempView("rows") { + spark.read + .json(Seq("""{"nested": {"attribute": 1}, "value": 2}""").toDS()) + .createOrReplaceTempView("rows") - checkAnswer( - sql( - """ - |select attribute, sum(cnt) - |from ( - | select nested.attribute, count(*) as cnt - | from rows - | group by nested.attribute) a - |group by attribute + checkAnswer( + sql( + """ + |select attribute, sum(cnt) + |from ( + | select nested.attribute, count(*) as cnt + | from rows + | group by nested.attribute) a + |group by 
attribute """.stripMargin), - Row(1, 1) :: Nil) + Row(1, 1) :: Nil) + } } test("SPARK-6201 IN type conversion") { - spark.read - .json(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}").toDS()) - .createOrReplaceTempView("d") + withTempView("d") { + spark.read + .json(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}").toDS()) + .createOrReplaceTempView("d") - checkAnswer( - sql("select * from d where d.a in (1,2)"), - Seq(Row("1"), Row("2"))) + checkAnswer( + sql("select * from d where d.a in (1,2)"), + Seq(Row("1"), Row("2"))) + } } test("SPARK-11226 Skip empty line in json file") { - spark.read - .json(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}", "").toDS()) - .createOrReplaceTempView("d") + withTempView("d") { + spark.read + .json(Seq("{\"a\": \"1\"}}", "{\"a\": \"2\"}}", "{\"a\": \"3\"}}", "").toDS()) + .createOrReplaceTempView("d") - checkAnswer( - sql("select count(1) from d"), - Seq(Row(3))) + checkAnswer( + sql("select count(1) from d"), + Seq(Row(3))) + } } test("SPARK-8828 sum should return null if all input values are null") { @@ -496,40 +433,42 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("SPARK-3173 Timestamp support in the parser") { - (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time").createOrReplaceTempView("timestamps") + withTempView("timestamps") { + (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time").createOrReplaceTempView("timestamps") - checkAnswer(sql( - "SELECT time FROM timestamps WHERE time='1969-12-31 16:00:00.0'"), - Row(Timestamp.valueOf("1969-12-31 16:00:00"))) + checkAnswer(sql( + "SELECT time FROM timestamps WHERE time='1969-12-31 16:00:00.0'"), + Row(Timestamp.valueOf("1969-12-31 16:00:00"))) - checkAnswer(sql( - "SELECT time FROM timestamps WHERE time=CAST('1969-12-31 16:00:00.001' AS TIMESTAMP)"), - Row(Timestamp.valueOf("1969-12-31 16:00:00.001"))) + checkAnswer(sql( + "SELECT time FROM timestamps WHERE time=CAST('1969-12-31 16:00:00.001' AS TIMESTAMP)"), + Row(Timestamp.valueOf("1969-12-31 16:00:00.001"))) - checkAnswer(sql( - "SELECT time FROM timestamps WHERE time='1969-12-31 16:00:00.001'"), - Row(Timestamp.valueOf("1969-12-31 16:00:00.001"))) + checkAnswer(sql( + "SELECT time FROM timestamps WHERE time='1969-12-31 16:00:00.001'"), + Row(Timestamp.valueOf("1969-12-31 16:00:00.001"))) - checkAnswer(sql( - "SELECT time FROM timestamps WHERE '1969-12-31 16:00:00.001'=time"), - Row(Timestamp.valueOf("1969-12-31 16:00:00.001"))) + checkAnswer(sql( + "SELECT time FROM timestamps WHERE '1969-12-31 16:00:00.001'=time"), + Row(Timestamp.valueOf("1969-12-31 16:00:00.001"))) - checkAnswer(sql( - """SELECT time FROM timestamps WHERE time<'1969-12-31 16:00:00.003' + checkAnswer(sql( + """SELECT time FROM timestamps WHERE time<'1969-12-31 16:00:00.003' AND time>'1969-12-31 16:00:00.001'"""), - Row(Timestamp.valueOf("1969-12-31 16:00:00.002"))) + Row(Timestamp.valueOf("1969-12-31 16:00:00.002"))) - checkAnswer(sql( - """ - |SELECT time FROM timestamps - |WHERE time IN ('1969-12-31 16:00:00.001','1969-12-31 16:00:00.002') + checkAnswer(sql( + """ + |SELECT time FROM timestamps + |WHERE time IN ('1969-12-31 16:00:00.001','1969-12-31 16:00:00.002') """.stripMargin), - Seq(Row(Timestamp.valueOf("1969-12-31 16:00:00.001")), - Row(Timestamp.valueOf("1969-12-31 16:00:00.002")))) + Seq(Row(Timestamp.valueOf("1969-12-31 16:00:00.001")), + Row(Timestamp.valueOf("1969-12-31 16:00:00.002")))) - checkAnswer(sql( - "SELECT time FROM timestamps WHERE time='123'"), - Nil) + 
checkAnswer(sql( + "SELECT time FROM timestamps WHERE time='123'"), + Nil) + } } test("left semi greater than predicate") { @@ -858,20 +797,22 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("SPARK-3349 partitioning after limit") { - sql("SELECT DISTINCT n FROM lowerCaseData ORDER BY n DESC") - .limit(2) - .createOrReplaceTempView("subset1") - sql("SELECT DISTINCT n FROM lowerCaseData ORDER BY n ASC") - .limit(2) - .createOrReplaceTempView("subset2") - checkAnswer( - sql("SELECT * FROM lowerCaseData INNER JOIN subset1 ON subset1.n = lowerCaseData.n"), - Row(3, "c", 3) :: - Row(4, "d", 4) :: Nil) - checkAnswer( - sql("SELECT * FROM lowerCaseData INNER JOIN subset2 ON subset2.n = lowerCaseData.n"), - Row(1, "a", 1) :: - Row(2, "b", 2) :: Nil) + withTempView("subset1", "subset2") { + sql("SELECT DISTINCT n FROM lowerCaseData ORDER BY n DESC") + .limit(2) + .createOrReplaceTempView("subset1") + sql("SELECT DISTINCT n FROM lowerCaseData ORDER BY n ASC") + .limit(2) + .createOrReplaceTempView("subset2") + checkAnswer( + sql("SELECT * FROM lowerCaseData INNER JOIN subset1 ON subset1.n = lowerCaseData.n"), + Row(3, "c", 3) :: + Row(4, "d", 4) :: Nil) + checkAnswer( + sql("SELECT * FROM lowerCaseData INNER JOIN subset2 ON subset2.n = lowerCaseData.n"), + Row(1, "a", 1) :: + Row(2, "b", 2) :: Nil) + } } test("mixed-case keywords") { @@ -1114,84 +1055,87 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("apply schema") { - val schema1 = StructType( - StructField("f1", IntegerType, false) :: - StructField("f2", StringType, false) :: - StructField("f3", BooleanType, false) :: - StructField("f4", IntegerType, true) :: Nil) - - val rowRDD1 = unparsedStrings.map { r => - val values = r.split(",").map(_.trim) - val v4 = try values(3).toInt catch { - case _: NumberFormatException => null + withTempView("applySchema1", "applySchema2", "applySchema3") { + val schema1 = StructType( + StructField("f1", IntegerType, false) :: + StructField("f2", StringType, false) :: + StructField("f3", BooleanType, false) :: + StructField("f4", IntegerType, true) :: Nil) + + val rowRDD1 = unparsedStrings.map { r => + val values = r.split(",").map(_.trim) + val v4 = try values(3).toInt catch { + case _: NumberFormatException => null + } + Row(values(0).toInt, values(1), values(2).toBoolean, v4) } - Row(values(0).toInt, values(1), values(2).toBoolean, v4) - } - val df1 = spark.createDataFrame(rowRDD1, schema1) - df1.createOrReplaceTempView("applySchema1") - checkAnswer( - sql("SELECT * FROM applySchema1"), - Row(1, "A1", true, null) :: - Row(2, "B2", false, null) :: - Row(3, "C3", true, null) :: - Row(4, "D4", true, 2147483644) :: Nil) - - checkAnswer( - sql("SELECT f1, f4 FROM applySchema1"), - Row(1, null) :: - Row(2, null) :: - Row(3, null) :: - Row(4, 2147483644) :: Nil) - - val schema2 = StructType( - StructField("f1", StructType( - StructField("f11", IntegerType, false) :: - StructField("f12", BooleanType, false) :: Nil), false) :: - StructField("f2", MapType(StringType, IntegerType, true), false) :: Nil) + val df1 = spark.createDataFrame(rowRDD1, schema1) + df1.createOrReplaceTempView("applySchema1") + checkAnswer( + sql("SELECT * FROM applySchema1"), + Row(1, "A1", true, null) :: + Row(2, "B2", false, null) :: + Row(3, "C3", true, null) :: + Row(4, "D4", true, 2147483644) :: Nil) - val rowRDD2 = unparsedStrings.map { r => - val values = r.split(",").map(_.trim) - val v4 = try values(3).toInt catch { - case _: NumberFormatException => null 
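
Most of the changes in this file follow one pattern: wrap each test body in withTempView so the views it registers are dropped even when an assertion fails, keeping later tests independent. A stripped-down sketch of such a helper outside the test framework; the object and view names are made up, and the real helper comes from SQLTestUtils:

import org.apache.spark.sql.SparkSession

object WithTempViewSketch {
  // Run `body` and always drop the named temp views afterwards, pass or fail.
  def withTempView(spark: SparkSession, viewNames: String*)(body: => Unit): Unit = {
    try body finally viewNames.foreach(spark.catalog.dropTempView)
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]").appName("with-temp-view-sketch").getOrCreate()
    import spark.implicits._
    withTempView(spark, "src") {
      Seq((1, "a"), (2, "b")).toDF("key", "value").createOrReplaceTempView("src")
      assert(spark.sql("SELECT count(*) FROM src").head().getLong(0) == 2L)
    }
    assert(!spark.catalog.tableExists("src"))  // cleaned up after the block
    spark.stop()
  }
}
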
+ checkAnswer( + sql("SELECT f1, f4 FROM applySchema1"), + Row(1, null) :: + Row(2, null) :: + Row(3, null) :: + Row(4, 2147483644) :: Nil) + + val schema2 = StructType( + StructField("f1", StructType( + StructField("f11", IntegerType, false) :: + StructField("f12", BooleanType, false) :: Nil), false) :: + StructField("f2", MapType(StringType, IntegerType, true), false) :: Nil) + + val rowRDD2 = unparsedStrings.map { r => + val values = r.split(",").map(_.trim) + val v4 = try values(3).toInt catch { + case _: NumberFormatException => null + } + Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) - } - - val df2 = spark.createDataFrame(rowRDD2, schema2) - df2.createOrReplaceTempView("applySchema2") - checkAnswer( - sql("SELECT * FROM applySchema2"), - Row(Row(1, true), Map("A1" -> null)) :: - Row(Row(2, false), Map("B2" -> null)) :: - Row(Row(3, true), Map("C3" -> null)) :: - Row(Row(4, true), Map("D4" -> 2147483644)) :: Nil) - checkAnswer( - sql("SELECT f1.f11, f2['D4'] FROM applySchema2"), - Row(1, null) :: - Row(2, null) :: - Row(3, null) :: - Row(4, 2147483644) :: Nil) + val df2 = spark.createDataFrame(rowRDD2, schema2) + df2.createOrReplaceTempView("applySchema2") + checkAnswer( + sql("SELECT * FROM applySchema2"), + Row(Row(1, true), Map("A1" -> null)) :: + Row(Row(2, false), Map("B2" -> null)) :: + Row(Row(3, true), Map("C3" -> null)) :: + Row(Row(4, true), Map("D4" -> 2147483644)) :: Nil) - // The value of a MapType column can be a mutable map. - val rowRDD3 = unparsedStrings.map { r => - val values = r.split(",").map(_.trim) - val v4 = try values(3).toInt catch { - case _: NumberFormatException => null + checkAnswer( + sql("SELECT f1.f11, f2['D4'] FROM applySchema2"), + Row(1, null) :: + Row(2, null) :: + Row(3, null) :: + Row(4, 2147483644) :: Nil) + + // The value of a MapType column can be a mutable map. 
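
As the comment above notes, a MapType column accepts a mutable map as its value. A self-contained sketch of the createDataFrame-with-explicit-schema pattern this test exercises, mirroring its nested struct and map schema; the object and view names are made up:

import scala.collection.mutable
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

object ApplySchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]").appName("apply-schema-sketch").getOrCreate()
    val schema = StructType(
      StructField("f1", StructType(
        StructField("f11", IntegerType, nullable = false) ::
        StructField("f12", BooleanType, nullable = false) :: Nil), nullable = false) ::
      StructField("f2", MapType(StringType, IntegerType, valueContainsNull = true),
        nullable = false) :: Nil)
    // Map values may come from a mutable map; nulls are fine since valueContainsNull = true.
    val rows = spark.sparkContext.parallelize(Seq(
      Row(Row(1, true), mutable.Map("A1" -> null)),
      Row(Row(4, true), mutable.Map("D4" -> 2147483644))))
    spark.createDataFrame(rows, schema).createOrReplaceTempView("applySchemaSketch")
    spark.sql("SELECT f1.f11, f2['D4'] FROM applySchemaSketch").show()
    spark.catalog.dropTempView("applySchemaSketch")
    spark.stop()
  }
}
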
+ val rowRDD3 = unparsedStrings.map { r => + val values = r.split(",").map(_.trim) + val v4 = try values(3).toInt catch { + case _: NumberFormatException => null + } + Row(Row(values(0).toInt, values(2).toBoolean), + scala.collection.mutable.Map(values(1) -> v4)) } - Row(Row(values(0).toInt, values(2).toBoolean), scala.collection.mutable.Map(values(1) -> v4)) - } - val df3 = spark.createDataFrame(rowRDD3, schema2) - df3.createOrReplaceTempView("applySchema3") + val df3 = spark.createDataFrame(rowRDD3, schema2) + df3.createOrReplaceTempView("applySchema3") - checkAnswer( - sql("SELECT f1.f11, f2['D4'] FROM applySchema3"), - Row(1, null) :: - Row(2, null) :: - Row(3, null) :: - Row(4, 2147483644) :: Nil) + checkAnswer( + sql("SELECT f1.f11, f2['D4'] FROM applySchema3"), + Row(1, null) :: + Row(2, null) :: + Row(3, null) :: + Row(4, 2147483644) :: Nil) + } } test("SPARK-3423 BETWEEN") { @@ -1243,28 +1187,30 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("metadata is propagated correctly") { - val person: DataFrame = sql("SELECT * FROM person") - val schema = person.schema - val docKey = "doc" - val docValue = "first name" - val metadata = new MetadataBuilder() - .putString(docKey, docValue) - .build() - val schemaWithMeta = new StructType(Array( - schema("id"), schema("name").copy(metadata = metadata), schema("age"))) - val personWithMeta = spark.createDataFrame(person.rdd, schemaWithMeta) - def validateMetadata(rdd: DataFrame): Unit = { - assert(rdd.schema("name").metadata.getString(docKey) == docValue) - } - personWithMeta.createOrReplaceTempView("personWithMeta") - validateMetadata(personWithMeta.select($"name")) - validateMetadata(personWithMeta.select($"name")) - validateMetadata(personWithMeta.select($"id", $"name")) - validateMetadata(sql("SELECT * FROM personWithMeta")) - validateMetadata(sql("SELECT id, name FROM personWithMeta")) - validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON id = personId")) - validateMetadata(sql( - "SELECT name, salary FROM personWithMeta JOIN salary ON id = personId")) + withTempView("personWithMeta") { + val person: DataFrame = sql("SELECT * FROM person") + val schema = person.schema + val docKey = "doc" + val docValue = "first name" + val metadata = new MetadataBuilder() + .putString(docKey, docValue) + .build() + val schemaWithMeta = new StructType(Array( + schema("id"), schema("name").copy(metadata = metadata), schema("age"))) + val personWithMeta = spark.createDataFrame(person.rdd, schemaWithMeta) + def validateMetadata(rdd: DataFrame): Unit = { + assert(rdd.schema("name").metadata.getString(docKey) == docValue) + } + personWithMeta.createOrReplaceTempView("personWithMeta") + validateMetadata(personWithMeta.select($"name")) + validateMetadata(personWithMeta.select($"name")) + validateMetadata(personWithMeta.select($"id", $"name")) + validateMetadata(sql("SELECT * FROM personWithMeta")) + validateMetadata(sql("SELECT id, name FROM personWithMeta")) + validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON id = personId")) + validateMetadata(sql( + "SELECT name, salary FROM personWithMeta JOIN salary ON id = personId")) + } } test("SPARK-3371 Renaming a function expression with group by gives error") { @@ -1306,10 +1252,12 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("SPARK-3483 Special chars in column names") { - val data = Seq("""{"key?number1": "value1", "key.number2": "value2"}""").toDS() - 
spark.read.json(data).createOrReplaceTempView("records") - withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { - sql("SELECT `key?number1`, `key.number2` FROM records") + withTempView("records") { + val data = Seq("""{"key?number1": "value1", "key.number2": "value2"}""").toDS() + spark.read.json(data).createOrReplaceTempView("records") + withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { + sql("SELECT `key?number1`, `key.number2` FROM records") + } } } @@ -1376,138 +1324,152 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("Supporting relational operator '<=>' in Spark SQL") { - val nullCheckData1 = TestData(1, "1") :: TestData(2, null) :: Nil - val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) - rdd1.toDF().createOrReplaceTempView("nulldata1") - val nullCheckData2 = TestData(1, "1") :: TestData(2, null) :: Nil - val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i))) - rdd2.toDF().createOrReplaceTempView("nulldata2") - checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " + - "nulldata2 on nulldata1.value <=> nulldata2.value"), + withTempView("nulldata1", "nulldata2") { + val nullCheckData1 = TestData(1, "1") :: TestData(2, null) :: Nil + val rdd1 = sparkContext.parallelize((0 to 1).map(i => nullCheckData1(i))) + rdd1.toDF().createOrReplaceTempView("nulldata1") + val nullCheckData2 = TestData(1, "1") :: TestData(2, null) :: Nil + val rdd2 = sparkContext.parallelize((0 to 1).map(i => nullCheckData2(i))) + rdd2.toDF().createOrReplaceTempView("nulldata2") + checkAnswer(sql("SELECT nulldata1.key FROM nulldata1 join " + + "nulldata2 on nulldata1.value <=> nulldata2.value"), (1 to 2).map(i => Row(i))) + } } test("Multi-column COUNT(DISTINCT ...)") { - val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil - val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) - rdd.toDF().createOrReplaceTempView("distinctData") - checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2)) + withTempView("distinctData") { + val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil + val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) + rdd.toDF().createOrReplaceTempView("distinctData") + checkAnswer(sql("SELECT COUNT(DISTINCT key,value) FROM distinctData"), Row(2)) + } } test("SPARK-4699 case sensitivity SQL query") { - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { - val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil - val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) - rdd.toDF().createOrReplaceTempView("testTable1") - checkAnswer(sql("SELECT VALUE FROM TESTTABLE1 where KEY = 1"), Row("val_1")) + withTempView("testTable1") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + val data = TestData(1, "val_1") :: TestData(2, "val_2") :: Nil + val rdd = sparkContext.parallelize((0 to 1).map(i => data(i))) + rdd.toDF().createOrReplaceTempView("testTable1") + checkAnswer(sql("SELECT VALUE FROM TESTTABLE1 where KEY = 1"), Row("val_1")) + } } } test("SPARK-6145: ORDER BY test for nested fields") { - spark.read - .json(Seq("""{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""").toDS()) - .createOrReplaceTempView("nestedOrder") + withTempView("nestedOrder") { + spark.read + .json(Seq("""{"a": {"b": 1, "a": {"a": 1}}, "c": [{"d": 1}]}""").toDS()) + .createOrReplaceTempView("nestedOrder") - checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY a.b"), Row(1)) - checkAnswer(sql("SELECT a.b FROM nestedOrder ORDER 
BY a.b"), Row(1)) - checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY a.a.a"), Row(1)) - checkAnswer(sql("SELECT a.a.a FROM nestedOrder ORDER BY a.a.a"), Row(1)) - checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY c[0].d"), Row(1)) - checkAnswer(sql("SELECT c[0].d FROM nestedOrder ORDER BY c[0].d"), Row(1)) + checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY a.b"), Row(1)) + checkAnswer(sql("SELECT a.b FROM nestedOrder ORDER BY a.b"), Row(1)) + checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY a.a.a"), Row(1)) + checkAnswer(sql("SELECT a.a.a FROM nestedOrder ORDER BY a.a.a"), Row(1)) + checkAnswer(sql("SELECT 1 FROM nestedOrder ORDER BY c[0].d"), Row(1)) + checkAnswer(sql("SELECT c[0].d FROM nestedOrder ORDER BY c[0].d"), Row(1)) + } } test("SPARK-6145: special cases") { - spark.read - .json(Seq("""{"a": {"b": [1]}, "b": [{"a": 1}], "_c0": {"a": 1}}""").toDS()) - .createOrReplaceTempView("t") + withTempView("t") { + spark.read + .json(Seq("""{"a": {"b": [1]}, "b": [{"a": 1}], "_c0": {"a": 1}}""").toDS()) + .createOrReplaceTempView("t") - checkAnswer(sql("SELECT a.b[0] FROM t ORDER BY _c0.a"), Row(1)) - checkAnswer(sql("SELECT b[0].a FROM t ORDER BY _c0.a"), Row(1)) + checkAnswer(sql("SELECT a.b[0] FROM t ORDER BY _c0.a"), Row(1)) + checkAnswer(sql("SELECT b[0].a FROM t ORDER BY _c0.a"), Row(1)) + } } test("SPARK-6898: complete support for special chars in column names") { - spark.read - .json(Seq("""{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""").toDS()) - .createOrReplaceTempView("t") + withTempView("t") { + spark.read + .json(Seq("""{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""").toDS()) + .createOrReplaceTempView("t") - withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { - checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1)) + withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { + checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1)) + } } } test("SPARK-6583 order by aggregated function") { - Seq("1" -> 3, "1" -> 4, "2" -> 7, "2" -> 8, "3" -> 5, "3" -> 6, "4" -> 1, "4" -> 2) - .toDF("a", "b").createOrReplaceTempView("orderByData") + withTempView("orderByData") { + Seq("1" -> 3, "1" -> 4, "2" -> 7, "2" -> 8, "3" -> 5, "3" -> 6, "4" -> 1, "4" -> 2) + .toDF("a", "b").createOrReplaceTempView("orderByData") - checkAnswer( - sql( - """ - |SELECT a - |FROM orderByData - |GROUP BY a - |ORDER BY sum(b) + checkAnswer( + sql( + """ + |SELECT a + |FROM orderByData + |GROUP BY a + |ORDER BY sum(b) """.stripMargin), - Row("4") :: Row("1") :: Row("3") :: Row("2") :: Nil) + Row("4") :: Row("1") :: Row("3") :: Row("2") :: Nil) - checkAnswer( - sql( - """ - |SELECT sum(b) - |FROM orderByData - |GROUP BY a - |ORDER BY sum(b) + checkAnswer( + sql( + """ + |SELECT sum(b) + |FROM orderByData + |GROUP BY a + |ORDER BY sum(b) """.stripMargin), - Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) + Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) - checkAnswer( - sql( - """ - |SELECT sum(b) - |FROM orderByData - |GROUP BY a - |ORDER BY sum(b), max(b) + checkAnswer( + sql( + """ + |SELECT sum(b) + |FROM orderByData + |GROUP BY a + |ORDER BY sum(b), max(b) """.stripMargin), - Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) + Row(3) :: Row(7) :: Row(11) :: Row(15) :: Nil) - checkAnswer( - sql( - """ - |SELECT a, sum(b) - |FROM orderByData - |GROUP BY a - |ORDER BY sum(b) + checkAnswer( + sql( + """ + |SELECT a, sum(b) + |FROM orderByData + |GROUP BY a + |ORDER 
BY sum(b) """.stripMargin), - Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil) + Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil) - checkAnswer( - sql( - """ + checkAnswer( + sql( + """ |SELECT a, sum(b) |FROM orderByData |GROUP BY a |ORDER BY sum(b) + 1 """.stripMargin), - Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil) + Row("4", 3) :: Row("1", 7) :: Row("3", 11) :: Row("2", 15) :: Nil) - checkAnswer( - sql( - """ + checkAnswer( + sql( + """ |SELECT count(*) |FROM orderByData |GROUP BY a |ORDER BY count(*) """.stripMargin), - Row(2) :: Row(2) :: Row(2) :: Row(2) :: Nil) + Row(2) :: Row(2) :: Row(2) :: Row(2) :: Nil) - checkAnswer( - sql( - """ + checkAnswer( + sql( + """ |SELECT a |FROM orderByData |GROUP BY a |ORDER BY a, count(*), sum(b) """.stripMargin), - Row("1") :: Row("2") :: Row("3") :: Row("4") :: Nil) + Row("1") :: Row("2") :: Row("3") :: Row("4") :: Nil) + } } test("SPARK-7952: fix the equality check between boolean and numeric types") { @@ -1819,137 +1781,141 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("Struct Star Expansion") { - val structDf = testData2.select("a", "b").as("record") + withTempView("structTable", "nestedStructTable", "specialCharacterTable", "nameConflict") { + val structDf = testData2.select("a", "b").as("record") - checkAnswer( - structDf.select($"record.a", $"record.b"), - Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) + checkAnswer( + structDf.select($"record.a", $"record.b"), + Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) - checkAnswer( - structDf.select($"record.*"), - Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) + checkAnswer( + structDf.select($"record.*"), + Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) - checkAnswer( - structDf.select($"record.*", $"record.*"), - Row(1, 1, 1, 1) :: Row(1, 2, 1, 2) :: Row(2, 1, 2, 1) :: Row(2, 2, 2, 2) :: - Row(3, 1, 3, 1) :: Row(3, 2, 3, 2) :: Nil) + checkAnswer( + structDf.select($"record.*", $"record.*"), + Row(1, 1, 1, 1) :: Row(1, 2, 1, 2) :: Row(2, 1, 2, 1) :: Row(2, 2, 2, 2) :: + Row(3, 1, 3, 1) :: Row(3, 2, 3, 2) :: Nil) - checkAnswer( - sql("select struct(a, b) as r1, struct(b, a) as r2 from testData2").select($"r1.*", $"r2.*"), - Row(1, 1, 1, 1) :: Row(1, 2, 2, 1) :: Row(2, 1, 1, 2) :: Row(2, 2, 2, 2) :: - Row(3, 1, 1, 3) :: Row(3, 2, 2, 3) :: Nil) + checkAnswer( + sql("select struct(a, b) as r1, struct(b, a) as r2 from testData2") + .select($"r1.*", $"r2.*"), + Row(1, 1, 1, 1) :: Row(1, 2, 2, 1) :: Row(2, 1, 1, 2) :: Row(2, 2, 2, 2) :: + Row(3, 1, 1, 3) :: Row(3, 2, 2, 3) :: Nil) - // Try with a temporary view - sql("select struct(a, b) as record from testData2").createOrReplaceTempView("structTable") - checkAnswer( - sql("SELECT record.* FROM structTable"), - Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) + // Try with a temporary view + sql("select struct(a, b) as record from testData2").createOrReplaceTempView("structTable") + checkAnswer( + sql("SELECT record.* FROM structTable"), + Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) - checkAnswer(sql( - """ - | SELECT min(struct(record.*)) FROM - | (select struct(a,b) as record from testData2) tmp + checkAnswer(sql( + """ + | SELECT min(struct(record.*)) FROM + | (select struct(a,b) as record from testData2) tmp """.stripMargin), - 
Row(Row(1, 1)) :: Nil) + Row(Row(1, 1)) :: Nil) - // Try with an alias on the select list - checkAnswer(sql( - """ - | SELECT max(struct(record.*)) as r FROM - | (select struct(a,b) as record from testData2) tmp + // Try with an alias on the select list + checkAnswer(sql( + """ + | SELECT max(struct(record.*)) as r FROM + | (select struct(a,b) as record from testData2) tmp """.stripMargin).select($"r.*"), - Row(3, 2) :: Nil) + Row(3, 2) :: Nil) - // With GROUP BY - checkAnswer(sql( - """ - | SELECT min(struct(record.*)) FROM - | (select a as a, struct(a,b) as record from testData2) tmp - | GROUP BY a + // With GROUP BY + checkAnswer(sql( + """ + | SELECT min(struct(record.*)) FROM + | (select a as a, struct(a,b) as record from testData2) tmp + | GROUP BY a """.stripMargin), - Row(Row(1, 1)) :: Row(Row(2, 1)) :: Row(Row(3, 1)) :: Nil) + Row(Row(1, 1)) :: Row(Row(2, 1)) :: Row(Row(3, 1)) :: Nil) - // With GROUP BY and alias - checkAnswer(sql( - """ - | SELECT max(struct(record.*)) as r FROM - | (select a as a, struct(a,b) as record from testData2) tmp - | GROUP BY a + // With GROUP BY and alias + checkAnswer(sql( + """ + | SELECT max(struct(record.*)) as r FROM + | (select a as a, struct(a,b) as record from testData2) tmp + | GROUP BY a """.stripMargin).select($"r.*"), - Row(1, 2) :: Row(2, 2) :: Row(3, 2) :: Nil) + Row(1, 2) :: Row(2, 2) :: Row(3, 2) :: Nil) - // With GROUP BY and alias and additional fields in the struct - checkAnswer(sql( - """ - | SELECT max(struct(a, record.*, b)) as r FROM - | (select a as a, b as b, struct(a,b) as record from testData2) tmp - | GROUP BY a + // With GROUP BY and alias and additional fields in the struct + checkAnswer(sql( + """ + | SELECT max(struct(a, record.*, b)) as r FROM + | (select a as a, b as b, struct(a,b) as record from testData2) tmp + | GROUP BY a """.stripMargin).select($"r.*"), - Row(1, 1, 2, 2) :: Row(2, 2, 2, 2) :: Row(3, 3, 2, 2) :: Nil) + Row(1, 1, 2, 2) :: Row(2, 2, 2, 2) :: Row(3, 3, 2, 2) :: Nil) - // Create a data set that contains nested structs. - val nestedStructData = sql( - """ - | SELECT struct(r1, r2) as record FROM - | (SELECT struct(a, b) as r1, struct(b, a) as r2 FROM testData2) tmp + // Create a data set that contains nested structs. 
+ val nestedStructData = sql( + """ + | SELECT struct(r1, r2) as record FROM + | (SELECT struct(a, b) as r1, struct(b, a) as r2 FROM testData2) tmp """.stripMargin) - checkAnswer(nestedStructData.select($"record.*"), - Row(Row(1, 1), Row(1, 1)) :: Row(Row(1, 2), Row(2, 1)) :: Row(Row(2, 1), Row(1, 2)) :: - Row(Row(2, 2), Row(2, 2)) :: Row(Row(3, 1), Row(1, 3)) :: Row(Row(3, 2), Row(2, 3)) :: Nil) - checkAnswer(nestedStructData.select($"record.r1"), - Row(Row(1, 1)) :: Row(Row(1, 2)) :: Row(Row(2, 1)) :: Row(Row(2, 2)) :: - Row(Row(3, 1)) :: Row(Row(3, 2)) :: Nil) - checkAnswer( - nestedStructData.select($"record.r1.*"), - Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) - - // Try with a temporary view - withTempView("nestedStructTable") { - nestedStructData.createOrReplaceTempView("nestedStructTable") - checkAnswer( - sql("SELECT record.* FROM nestedStructTable"), - nestedStructData.select($"record.*")) + checkAnswer(nestedStructData.select($"record.*"), + Row(Row(1, 1), Row(1, 1)) :: Row(Row(1, 2), Row(2, 1)) :: Row(Row(2, 1), Row(1, 2)) :: + Row(Row(2, 2), Row(2, 2)) :: Row(Row(3, 1), Row(1, 3)) :: Row(Row(3, 2), Row(2, 3)) :: + Nil) + checkAnswer(nestedStructData.select($"record.r1"), + Row(Row(1, 1)) :: Row(Row(1, 2)) :: Row(Row(2, 1)) :: Row(Row(2, 2)) :: + Row(Row(3, 1)) :: Row(Row(3, 2)) :: Nil) checkAnswer( - sql("SELECT record.r1 FROM nestedStructTable"), - nestedStructData.select($"record.r1")) - checkAnswer( - sql("SELECT record.r1.* FROM nestedStructTable"), - nestedStructData.select($"record.r1.*")) - - // Try resolving something not there. - assert(intercept[AnalysisException](sql("SELECT abc.* FROM nestedStructTable")) - .getMessage.contains("cannot resolve")) - } + nestedStructData.select($"record.r1.*"), + Row(1, 1) :: Row(1, 2) :: Row(2, 1) :: Row(2, 2) :: Row(3, 1) :: Row(3, 2) :: Nil) - // Create paths with unusual characters - withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { - val specialCharacterPath = sql( - """ - | SELECT struct(`col$.a_`, `a.b.c.`) as `r&&b.c` FROM - | (SELECT struct(a, b) as `col$.a_`, struct(b, a) as `a.b.c.` FROM testData2) tmp - """.stripMargin) - withTempView("specialCharacterTable") { - specialCharacterPath.createOrReplaceTempView("specialCharacterTable") + // Try with a temporary view + withTempView("nestedStructTable") { + nestedStructData.createOrReplaceTempView("nestedStructTable") checkAnswer( - specialCharacterPath.select($"`r&&b.c`.*"), + sql("SELECT record.* FROM nestedStructTable"), nestedStructData.select($"record.*")) checkAnswer( - sql( - "SELECT `r&&b.c`.`col$.a_` FROM specialCharacterTable"), - nestedStructData.select($"record.r1")) - checkAnswer( - sql("SELECT `r&&b.c`.`a.b.c.` FROM specialCharacterTable"), - nestedStructData.select($"record.r2")) + sql("SELECT record.r1 FROM nestedStructTable"), + nestedStructData.select($"record.r1")) checkAnswer( - sql("SELECT `r&&b.c`.`col$.a_`.* FROM specialCharacterTable"), + sql("SELECT record.r1.* FROM nestedStructTable"), nestedStructData.select($"record.r1.*")) + + // Try resolving something not there. + assert(intercept[AnalysisException](sql("SELECT abc.* FROM nestedStructTable")) + .getMessage.contains("cannot resolve")) } - } - // Try star expanding a scalar. This should fail. 
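For illustration only (not part of the patch): the wrapping above relies on the suite's withTempView helper to guarantee that temporary views are dropped even when an assertion fails. A minimal sketch of that pattern, assuming a SharedSparkSession-style suite with `spark` in scope; the real SQLTestUtils helper is a bit more defensive (it also tolerates views that were never created):

  def withTempView(viewNames: String*)(f: => Unit): Unit = {
    // Run the test body, then drop every named view, whether the body passed or failed.
    try f finally viewNames.foreach { name => spark.catalog.dropTempView(name) }
  }

  // Usage mirrors the rewritten tests above:
  withTempView("records") {
    spark.range(3).createOrReplaceTempView("records")
    assert(spark.sql("SELECT count(*) FROM records").head().getLong(0) == 3)
  }
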
- assert(intercept[AnalysisException](sql("select a.* from testData2")).getMessage.contains( - "Can only star expand struct data types.")) + // Create paths with unusual characters + withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { + val specialCharacterPath = sql( + """ + | SELECT struct(`col$.a_`, `a.b.c.`) as `r&&b.c` FROM + | (SELECT struct(a, b) as `col$.a_`, struct(b, a) as `a.b.c.` FROM testData2) tmp + """.stripMargin) + withTempView("specialCharacterTable") { + specialCharacterPath.createOrReplaceTempView("specialCharacterTable") + checkAnswer( + specialCharacterPath.select($"`r&&b.c`.*"), + nestedStructData.select($"record.*")) + checkAnswer( + sql( + "SELECT `r&&b.c`.`col$.a_` FROM specialCharacterTable"), + nestedStructData.select($"record.r1")) + checkAnswer( + sql("SELECT `r&&b.c`.`a.b.c.` FROM specialCharacterTable"), + nestedStructData.select($"record.r2")) + checkAnswer( + sql("SELECT `r&&b.c`.`col$.a_`.* FROM specialCharacterTable"), + nestedStructData.select($"record.r1.*")) + } + } + + // Try star expanding a scalar. This should fail. + assert(intercept[AnalysisException](sql("select a.* from testData2")).getMessage.contains( + "Can only star expand struct data types.")) + } } test("Struct Star Expansion - Name conflict") { @@ -2121,6 +2087,26 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } } + test("SPARK-27619: Throw analysis exception when hash and xxhash64 is used on MapType") { + Seq("hash", "xxhash64").foreach { + case hashExpression => + intercept[AnalysisException] { + spark.createDataset(Map(1 -> 10, 2 -> 20) :: Nil).selectExpr(s"$hashExpression(*)") + } + } + } + + test(s"SPARK-27619: When ${SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE.key} is true, hash can be " + + "used on Maptype") { + Seq("hash", "xxhash64").foreach { + case hashExpression => + withSQLConf(SQLConf.LEGACY_ALLOW_HASH_ON_MAPTYPE.key -> "true") { + val df = spark.createDataset(Map() :: Nil) + checkAnswer(df.selectExpr(s"$hashExpression(*)"), sql(s"SELECT $hashExpression(map())")) + } + } + } + test("xxhash64 function") { val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j") withTempView("tbl") { @@ -2776,7 +2762,9 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark sql("SELECT * FROM t, S WHERE c = C") }.message assert( - m.contains("cannot resolve '(default.t.`c` = default.S.`C`)' due to data type mismatch")) + m.contains( + "cannot resolve '(spark_catalog.default.t.`c` = spark_catalog.default.S.`C`)' " + + "due to data type mismatch")) } } } @@ -3184,38 +3172,40 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("string date comparison") { - spark.range(1).selectExpr("date '2000-01-01' as d").createOrReplaceTempView("t1") - val result = Date.valueOf("2000-01-01") - checkAnswer(sql("select * from t1 where d < '2000'"), Nil) - checkAnswer(sql("select * from t1 where d < '2001'"), Row(result)) - checkAnswer(sql("select * from t1 where d < '2000-01'"), Nil) - checkAnswer(sql("select * from t1 where d < '2000-01-01'"), Nil) - checkAnswer(sql("select * from t1 where d < '2000-1-1'"), Nil) - checkAnswer(sql("select * from t1 where d <= '2000-1-1'"), Row(result)) - checkAnswer(sql("select * from t1 where d <= '1999-12-30'"), Nil) - checkAnswer(sql("select * from t1 where d = '2000-1-1'"), Row(result)) - checkAnswer(sql("select * from t1 where d = '2000-01-01'"), Row(result)) - checkAnswer(sql("select * from t1 where d = '2000-1-02'"), Nil) - checkAnswer(sql("select * from t1 where d 
> '2000-01-01'"), Nil) - checkAnswer(sql("select * from t1 where d > '1999'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000-1'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000-1-1'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000-1-01'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000-01-1'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000-01-01'"), Row(result)) - checkAnswer(sql("select * from t1 where d >= '2000-01-02'"), Nil) - checkAnswer(sql("select * from t1 where '2000' >= d"), Row(result)) - checkAnswer(sql("select * from t1 where d > '2000-13'"), Nil) - - withSQLConf(SQLConf.LEGACY_CAST_DATETIME_TO_STRING.key -> "true") { + withTempView("t1") { + spark.range(1).selectExpr("date '2000-01-01' as d").createOrReplaceTempView("t1") + val result = Date.valueOf("2000-01-01") checkAnswer(sql("select * from t1 where d < '2000'"), Nil) checkAnswer(sql("select * from t1 where d < '2001'"), Row(result)) - checkAnswer(sql("select * from t1 where d < '2000-1-1'"), Row(result)) - checkAnswer(sql("select * from t1 where d <= '1999'"), Nil) + checkAnswer(sql("select * from t1 where d < '2000-01'"), Nil) + checkAnswer(sql("select * from t1 where d < '2000-01-01'"), Nil) + checkAnswer(sql("select * from t1 where d < '2000-1-1'"), Nil) + checkAnswer(sql("select * from t1 where d <= '2000-1-1'"), Row(result)) + checkAnswer(sql("select * from t1 where d <= '1999-12-30'"), Nil) + checkAnswer(sql("select * from t1 where d = '2000-1-1'"), Row(result)) + checkAnswer(sql("select * from t1 where d = '2000-01-01'"), Row(result)) + checkAnswer(sql("select * from t1 where d = '2000-1-02'"), Nil) + checkAnswer(sql("select * from t1 where d > '2000-01-01'"), Nil) + checkAnswer(sql("select * from t1 where d > '1999'"), Row(result)) checkAnswer(sql("select * from t1 where d >= '2000'"), Row(result)) - checkAnswer(sql("select * from t1 where d > '1999-13'"), Row(result)) - checkAnswer(sql("select to_date('2000-01-01') > '1'"), Row(true)) + checkAnswer(sql("select * from t1 where d >= '2000-1'"), Row(result)) + checkAnswer(sql("select * from t1 where d >= '2000-1-1'"), Row(result)) + checkAnswer(sql("select * from t1 where d >= '2000-1-01'"), Row(result)) + checkAnswer(sql("select * from t1 where d >= '2000-01-1'"), Row(result)) + checkAnswer(sql("select * from t1 where d >= '2000-01-01'"), Row(result)) + checkAnswer(sql("select * from t1 where d >= '2000-01-02'"), Nil) + checkAnswer(sql("select * from t1 where '2000' >= d"), Row(result)) + checkAnswer(sql("select * from t1 where d > '2000-13'"), Nil) + + withSQLConf(SQLConf.LEGACY_CAST_DATETIME_TO_STRING.key -> "true") { + checkAnswer(sql("select * from t1 where d < '2000'"), Nil) + checkAnswer(sql("select * from t1 where d < '2001'"), Row(result)) + checkAnswer(sql("select * from t1 where d < '2000-1-1'"), Row(result)) + checkAnswer(sql("select * from t1 where d <= '1999'"), Nil) + checkAnswer(sql("select * from t1 where d >= '2000'"), Row(result)) + checkAnswer(sql("select * from t1 where d > '1999-13'"), Row(result)) + checkAnswer(sql("select to_date('2000-01-01') > '1'"), Row(true)) + } } } @@ -3265,28 +3255,30 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark test("SPARK-28156: self-join should not miss cached view") { withTable("table1") { withView("table1_vw") { - val df = Seq.tabulate(5) { x => (x, x + 1, x + 2, x + 3) }.toDF("a", "b", "c", "d") - 
df.write.mode("overwrite").format("orc").saveAsTable("table1") - sql("drop view if exists table1_vw") - sql("create view table1_vw as select * from table1") - - val cachedView = sql("select a, b, c, d from table1_vw") - - cachedView.createOrReplaceTempView("cachedview") - cachedView.persist() - - val queryDf = sql( - s"""select leftside.a, leftside.b - |from cachedview leftside - |join cachedview rightside - |on leftside.a = rightside.a + withTempView("cachedview") { + val df = Seq.tabulate(5) { x => (x, x + 1, x + 2, x + 3) }.toDF("a", "b", "c", "d") + df.write.mode("overwrite").format("orc").saveAsTable("table1") + sql("drop view if exists table1_vw") + sql("create view table1_vw as select * from table1") + + val cachedView = sql("select a, b, c, d from table1_vw") + + cachedView.createOrReplaceTempView("cachedview") + cachedView.persist() + + val queryDf = sql( + s"""select leftside.a, leftside.b + |from cachedview leftside + |join cachedview rightside + |on leftside.a = rightside.a """.stripMargin) - val inMemoryTableScan = collect(queryDf.queryExecution.executedPlan) { - case i: InMemoryTableScanExec => i + val inMemoryTableScan = collect(queryDf.queryExecution.executedPlan) { + case i: InMemoryTableScanExec => i + } + assert(inMemoryTableScan.size == 2) + checkAnswer(queryDf, Row(0, 1) :: Row(1, 2) :: Row(2, 3) :: Row(3, 4) :: Row(4, 5) :: Nil) } - assert(inMemoryTableScan.size == 2) - checkAnswer(queryDf, Row(0, 1) :: Row(1, 2) :: Row(2, 3) :: Row(3, 4) :: Row(4, 5) :: Nil) } } @@ -3383,6 +3375,215 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark checkAnswer(df, Row(1)) } } + + test("SPARK-26218: Fix the corner case when casting float to Integer") { + withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + intercept[ArithmeticException]( + sql("SELECT CAST(CAST(2147483648 as FLOAT) as Integer)").collect() + ) + intercept[ArithmeticException]( + sql("SELECT CAST(CAST(2147483648 as DOUBLE) as Integer)").collect() + ) + } + } + + test("SPARK-30870: Column pruning shouldn't alias a nested column for the whole structure") { + withTable("t") { + val df = sql( + """ + |SELECT value + |FROM VALUES array(named_struct('field', named_struct('a', 1, 'b', 2))) AS (value) + """.stripMargin) + df.write.format("parquet").saveAsTable("t") + + val df2 = spark.table("t") + .limit(100) + .select(size(col("value.field"))) + val projects = df2.queryExecution.optimizedPlan.collect { + case p: Project => p + } + assert(projects.length == 1) + val aliases = NestedColumnAliasingSuite.collectGeneratedAliases(projects(0)) + assert(aliases.length == 0) + } + } + + test("SPARK-30955: Exclude Generate output when aliasing in nested column pruning") { + val df1 = sql( + """ + |SELECT explodedvalue.* + |FROM VALUES array(named_struct('nested', named_struct('a', 1, 'b', 2))) AS (value) + |LATERAL VIEW explode(value) AS explodedvalue + """.stripMargin) + checkAnswer(df1, Row(Row(1, 2)) :: Nil) + + val df2 = sql( + """ + |SELECT explodedvalue.nested.a + |FROM VALUES array(named_struct('nested', named_struct('a', 1, 'b', 2))) AS (value) + |LATERAL VIEW explode(value) AS explodedvalue + """.stripMargin) + checkAnswer(df2, Row(1) :: Nil) + } + + test("SPARK-31166: UNION map and other maps should not fail") { + checkAnswer( + sql("(SELECT map()) UNION ALL (SELECT map(1, 2))"), + Seq(Row(Map[Int, Int]()), Row(Map(1 -> 2)))) + } + + test("SPARK-31242: clone SparkSession should respect sessionInitWithConfigDefaults") { + // Note, only the conf explicitly set in SparkConf(e.g. 
in SharedSparkSessionBase) would cause + // problem before the fix. + withSQLConf(SQLConf.CODEGEN_FALLBACK.key -> "true") { + val cloned = spark.cloneSession() + SparkSession.setActiveSession(cloned) + assert(SQLConf.get.getConf(SQLConf.CODEGEN_FALLBACK) === true) + } + } + + test("SPARK-31761: test byte, short, integer overflow for (Divide) integral type") { + checkAnswer(sql("Select -2147483648 DIV -1"), Seq(Row(Integer.MIN_VALUE.toLong * -1))) + checkAnswer(sql("select CAST(-128 as Byte) DIV CAST (-1 as Byte)"), + Seq(Row(Byte.MinValue.toLong * -1))) + checkAnswer(sql("select CAST(-32768 as short) DIV CAST (-1 as short)"), + Seq(Row(Short.MinValue.toLong * -1))) + } + + test("normalize special floating numbers in subquery") { + withTempView("v1", "v2", "v3") { + Seq(-0.0).toDF("d").createTempView("v1") + Seq(0.0).toDF("d").createTempView("v2") + spark.range(2).createTempView("v3") + + // non-correlated subquery + checkAnswer(sql("SELECT (SELECT v1.d FROM v1 JOIN v2 ON v1.d = v2.d)"), Row(-0.0)) + // correlated subquery + checkAnswer( + sql( + """ + |SELECT id FROM v3 WHERE EXISTS + | (SELECT v1.d FROM v1 JOIN v2 ON v1.d = v2.d WHERE id > 0) + |""".stripMargin), Row(1)) + } + } + + test("SPARK-32237: Hint in CTE") { + withTable("t") { + sql("CREATE TABLE t USING PARQUET AS SELECT 1 AS id") + checkAnswer( + sql(""" + |WITH cte AS (SELECT /*+ REPARTITION(3) */ * FROM t) + |SELECT * FROM cte + """.stripMargin), + Row(1) :: Nil) + } + } + + test("SPARK-32372: ResolveReferences.dedupRight should only rewrite attributes for ancestor " + + "plans of the conflict plan") { + sql("SELECT name, avg(age) as avg_age FROM person GROUP BY name") + .createOrReplaceTempView("person_a") + sql("SELECT p1.name, p2.avg_age FROM person p1 JOIN person_a p2 ON p1.name = p2.name") + .createOrReplaceTempView("person_b") + sql("SELECT * FROM person_a UNION SELECT * FROM person_b") + .createOrReplaceTempView("person_c") + checkAnswer( + sql("SELECT p1.name, p2.avg_age FROM person_c p1 JOIN person_c p2 ON p1.name = p2.name"), + Row("jim", 20.0) :: Row("mike", 30.0) :: Nil) + } + + test("SPARK-32280: Avoid duplicate rewrite attributes when there're multiple JOINs") { + sql("SELECT 1 AS id").createOrReplaceTempView("A") + sql("SELECT id, 'foo' AS kind FROM A").createOrReplaceTempView("B") + sql("SELECT l.id as id FROM B AS l LEFT SEMI JOIN B AS r ON l.kind = r.kind") + .createOrReplaceTempView("C") + checkAnswer(sql("SELECT 0 FROM ( SELECT * FROM B JOIN C USING (id)) " + + "JOIN ( SELECT * FROM B JOIN C USING (id)) USING (id)"), Row(0)) + } + + test("SPARK-32788: non-partitioned table scan should not have partition filter") { + withTable("t") { + spark.range(1).write.saveAsTable("t") + checkAnswer(sql("SELECT id FROM t WHERE (SELECT true)"), Row(0L)) + } + } + + test("SPARK-33306: Timezone is needed when cast Date to String") { + withTempView("t1", "t2") { + spark.sql("select to_date(concat('2000-01-0', id)) as d from range(1, 2)") + .createOrReplaceTempView("t1") + spark.sql("select concat('2000-01-0', id) as d from range(1, 2)") + .createOrReplaceTempView("t2") + val result = Date.valueOf("2000-01-01") + + checkAnswer(sql("select t1.d from t1 join t2 on t1.d = t2.d"), Row(result)) + withSQLConf(SQLConf.LEGACY_CAST_DATETIME_TO_STRING.key -> "true") { + checkAnswer(sql("select t1.d from t1 join t2 on t1.d = t2.d"), Row(result)) + } + } + } + + test("SPARK-33338: GROUP BY using literal map should not fail") { + withTempDir { dir => + sql(s"CREATE TABLE t USING ORC LOCATION '${dir.toURI}' AS SELECT map('k1', 'v1') m, 
'k1' k") + Seq( + "SELECT map('k1', 'v1')[k] FROM t GROUP BY 1", + "SELECT map('k1', 'v1')[k] FROM t GROUP BY map('k1', 'v1')[k]", + "SELECT map('k1', 'v1')[k] a FROM t GROUP BY a").foreach { statement => + checkAnswer(sql(statement), Row("v1")) + } + } + } + + test("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar") { + withTempView("df") { + Seq("m@ca").toDF("s").createOrReplaceTempView("df") + + val e = intercept[AnalysisException] { + sql("SELECT s LIKE 'm%@ca' ESCAPE '%' FROM df").collect() + } + assert(e.message.contains("the pattern 'm%@ca' is invalid, " + + "the escape character is not allowed to precede '@'")) + + checkAnswer(sql("SELECT s LIKE 'm@@ca' ESCAPE '@' FROM df"), Row(true)) + } + } + + test("SPARK-33593: Vector reader got incorrect data with binary partition value") { + Seq("false", "true").foreach(value => { + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> value) { + withTable("t1") { + sql( + """CREATE TABLE t1(name STRING, id BINARY, part BINARY) + |USING PARQUET PARTITIONED BY (part)""".stripMargin) + sql("INSERT INTO t1 PARTITION(part = 'Spark SQL') VALUES('a', X'537061726B2053514C')") + checkAnswer(sql("SELECT name, cast(id as string), cast(part as string) FROM t1"), + Row("a", "Spark SQL", "Spark SQL")) + } + } + + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> value) { + withTable("t2") { + sql( + """CREATE TABLE t2(name STRING, id BINARY, part BINARY) + |USING ORC PARTITIONED BY (part)""".stripMargin) + sql("INSERT INTO t2 PARTITION(part = 'Spark SQL') VALUES('a', X'537061726B2053514C')") + checkAnswer(sql("SELECT name, cast(id as string), cast(part as string) FROM t2"), + Row("a", "Spark SQL", "Spark SQL")) + } + } + }) + } + + test("SPARK-33591: null as a partition value") { + val t = "part_table" + withTable(t) { + sql(s"CREATE TABLE $t (col1 INT, p1 STRING) USING PARQUET PARTITIONED BY (p1)") + sql(s"INSERT INTO TABLE $t PARTITION (p1 = null) SELECT 0") + checkAnswer(sql(s"SELECT * FROM $t"), Row(0, null)) + } + } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 2e5a9e0b4d45d..f43b838c79f24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql import java.io.File -import java.util.{Locale, TimeZone} +import java.util.Locale +import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile} @@ -62,7 +64,12 @@ import org.apache.spark.tags.ExtendedSQLTest * }}} * * The format for input files is simple: - * 1. A list of SQL queries separated by semicolon. + * 1. A list of SQL queries separated by semicolons by default. If the semicolon cannot effectively + * separate the SQL queries in the test file(e.g. bracketed comments), please use + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END represent the beginning and end of a query, + * respectively. 
Code that is not surrounded by lines that begin with --QUERY-DELIMITER-START + * and --QUERY-DELIMITER-END is still separated by semicolons. * 2. Lines starting with -- are treated as comments and ignored. * 3. Lines starting with --SET are used to specify the configs when running this testing file. You * can set multiple configs in one --SET, using comma to separate them. Or you can use multiple @@ -115,7 +122,7 @@ import org.apache.spark.tags.ExtendedSQLTest * different types of UDFs. See 'udf/udf-inner-join.sql' as an example. */ @ExtendedSQLTest -class SQLQueryTestSuite extends QueryTest with SharedSparkSession { +class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper { import IntegratedUDFTestUtils._ @@ -125,12 +132,6 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { // We use a path based on Spark home for 2 reasons: // 1. Maven can't get correct resource directory when resources in other jars. // 2. We test subclasses in the hive-thriftserver module. - val sparkHome = { - assert(sys.props.contains("spark.test.home") || - sys.env.contains("SPARK_HOME"), "spark.test.home or SPARK_HOME is not set.") - sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) - } - java.nio.file.Paths.get(sparkHome, "sql", "core", "src", "test", "resources", "sql-tests").toFile } @@ -246,9 +247,18 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { /** Run a test case. */ protected def runTest(testCase: TestCase): Unit = { + def splitWithSemicolon(seq: Seq[String]) = { + seq.mkString("\n").split("(?<=[^\\\\]);") + } + + def splitCommentsAndCodes(input: String) = input.split("\n").partition { line => + val newLine = line.trim + newLine.startsWith("--") && !newLine.startsWith("--QUERY-DELIMITER") + } + val input = fileToString(new File(testCase.inputFile)) - val (comments, code) = input.split("\n").partition(_.trim.startsWith("--")) + val (comments, code) = splitCommentsAndCodes(input) // If `--IMPORT` found, load code from another test case file, then insert them // into the head in this test. @@ -256,15 +266,43 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { val importedCode = importedTestCaseName.flatMap { testCaseName => listTestCases.find(_.name == testCaseName).map { testCase => val input = fileToString(new File(testCase.inputFile)) - val (_, code) = input.split("\n").partition(_.trim.startsWith("--")) + val (_, code) = splitCommentsAndCodes(input) code } }.flatten + val allCode = importedCode ++ code + val tempQueries = if (allCode.exists(_.trim.startsWith("--QUERY-DELIMITER"))) { + // Although the loop is heavy, only used for bracketed comments test. + val querys = new ArrayBuffer[String] + val otherCodes = new ArrayBuffer[String] + var tempStr = "" + var start = false + for (c <- allCode) { + if (c.trim.startsWith("--QUERY-DELIMITER-START")) { + start = true + querys ++= splitWithSemicolon(otherCodes.toSeq) + otherCodes.clear() + } else if (c.trim.startsWith("--QUERY-DELIMITER-END")) { + start = false + querys += s"\n${tempStr.stripSuffix(";")}" + tempStr = "" + } else if (start) { + tempStr += s"\n$c" + } else { + otherCodes += c + } + } + if (otherCodes.nonEmpty) { + querys ++= splitWithSemicolon(otherCodes.toSeq) + } + querys.toSeq + } else { + splitWithSemicolon(allCode).toSeq + } + // List of SQL queries to run - // note: this is not a robust way to split queries using semicolon, but works for now. 
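To make the two splitting paths above concrete, a small illustrative sketch (not part of the patch): unescaped semicolons split queries via the lookbehind regex used in splitWithSemicolon, while a block wrapped in --QUERY-DELIMITER-START/--QUERY-DELIMITER-END is kept as a single query.

  val plain = Seq("SELECT 1;", "SELECT 'a\\;b';")
  val queries = plain.mkString("\n").split("(?<=[^\\\\]);").map(_.trim)
  // queries == Array("SELECT 1", "SELECT 'a\;b'"): the escaped semicolon is not a separator.

  // A hypothetical input file that needs the explicit markers, e.g. for bracketed comments:
  //   --QUERY-DELIMITER-START
  //   /* a comment; with a semicolon */
  //   SELECT 1;
  //   --QUERY-DELIMITER-END
  //   SELECT 2;
  // Everything between the markers becomes one query; "SELECT 2" is still split on the semicolon.
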
- val queries = (importedCode ++ code).mkString("\n").split("(?<=[^\\\\]);") - .map(_.trim).filter(_ != "").toSeq + val queries = tempQueries.map(_.trim).filter(_ != "").toSeq // Fix misplacement when comment is at the end of the query. .map(_.split("\n").filterNot(_.startsWith("--")).mkString("\n")).map(_.trim).filter(_ != "") @@ -317,7 +355,6 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { // Create a local SparkSession to have stronger isolation between different test cases. // This does not isolate catalog changes. val localSparkSession = spark.newSession() - loadTestData(localSparkSession) testCase match { case udfTestCase: UDFTest => @@ -469,7 +506,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { val df = session.sql(sql) val schema = df.schema.catalogString // Get answer, but also get rid of the #1234 expression ids that show up in explain plans - val answer = SQLExecution.withNewExecutionId(session, df.queryExecution, Some(sql)) { + val answer = SQLExecution.withNewExecutionId(df.queryExecution, Some(sql)) { hiveResultString(df.queryExecution.executedPlan).map(replaceNotIncludedMsg) } @@ -526,14 +563,20 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { } /** Load built-in test tables into the SparkSession. */ - private def loadTestData(session: SparkSession): Unit = { + private def createTestTables(session: SparkSession): Unit = { import session.implicits._ - (1 to 100).map(i => (i, i.toString)).toDF("key", "value").createOrReplaceTempView("testdata") + (1 to 100).map(i => (i, i.toString)).toDF("key", "value") + .repartition(1) + .write + .format("parquet") + .saveAsTable("testdata") ((Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: (Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil) .toDF("arraycol", "nestedarraycol") - .createOrReplaceTempView("arraydata") + .write + .format("parquet") + .saveAsTable("arraydata") (Tuple1(Map(1 -> "a1", 2 -> "b1", 3 -> "c1", 4 -> "d1", 5 -> "e1")) :: Tuple1(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) :: @@ -541,7 +584,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { Tuple1(Map(1 -> "a4", 2 -> "b4")) :: Tuple1(Map(1 -> "a5")) :: Nil) .toDF("mapcol") - .createOrReplaceTempView("mapdata") + .write + .format("parquet") + .saveAsTable("mapdata") session .read @@ -549,7 +594,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { .options(Map("delimiter" -> "\t", "header" -> "false")) .schema("a int, b float") .load(testFile("test-data/postgresql/agg.data")) - .createOrReplaceTempView("aggtest") + .write + .format("parquet") + .saveAsTable("aggtest") session .read @@ -575,7 +622,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { |string4 string """.stripMargin) .load(testFile("test-data/postgresql/onek.data")) - .createOrReplaceTempView("onek") + .write + .format("parquet") + .saveAsTable("onek") session .read @@ -601,25 +650,29 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { |string4 string """.stripMargin) .load(testFile("test-data/postgresql/tenk.data")) - .createOrReplaceTempView("tenk1") + .write + .format("parquet") + .saveAsTable("tenk1") } - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault + private def removeTestTables(session: SparkSession): Unit = { + session.sql("DROP TABLE IF EXISTS testdata") + session.sql("DROP TABLE IF EXISTS arraydata") + session.sql("DROP TABLE IF EXISTS mapdata") + session.sql("DROP TABLE IF EXISTS aggtest") + session.sql("DROP TABLE 
IF EXISTS onek") + session.sql("DROP TABLE IF EXISTS tenk1") + } override def beforeAll(): Unit = { super.beforeAll() - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) + createTestTables(spark) RuleExecutor.resetMetrics() } override def afterAll(): Unit = { try { - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) + removeTestTables(spark) // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala index 7e305e0504729..66c6fbeabbf55 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala @@ -74,72 +74,103 @@ case class ComplexReflectData( mapFieldContainsNull: Map[Int, Option[Long]], dataField: Data) +case class InvalidInJava(`abstract`: Int) + class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession { import testImplicits._ + // To avoid syntax error thrown by genjavadoc, make this case class non-top level and private. + private case class InvalidInJava2(`0`: Int) + test("query case class RDD") { - val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, - new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3), - new java.math.BigInteger("1"), scala.math.BigInt(1)) - Seq(data).toDF().createOrReplaceTempView("reflectData") - - assert(sql("SELECT * FROM reflectData").collect().head === - Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, - new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), - new Timestamp(12345), Seq(1, 2, 3), new java.math.BigDecimal(1), - new java.math.BigDecimal(1))) + withTempView("reflectData") { + val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, + new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3), + new java.math.BigInteger("1"), scala.math.BigInt(1)) + Seq(data).toDF().createOrReplaceTempView("reflectData") + + assert(sql("SELECT * FROM reflectData").collect().head === + Row("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, + new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), + new Timestamp(12345), Seq(1, 2, 3), new java.math.BigDecimal(1), + new java.math.BigDecimal(1))) + } } test("query case class RDD with nulls") { - val data = NullReflectData(null, null, null, null, null, null, null) - Seq(data).toDF().createOrReplaceTempView("reflectNullData") + withTempView("reflectNullData") { + val data = NullReflectData(null, null, null, null, null, null, null) + Seq(data).toDF().createOrReplaceTempView("reflectNullData") - assert(sql("SELECT * FROM reflectNullData").collect().head === - Row.fromSeq(Seq.fill(7)(null))) + assert(sql("SELECT * FROM reflectNullData").collect().head === + Row.fromSeq(Seq.fill(7)(null))) + } } test("query case class RDD with Nones") { - val data = OptionalReflectData(None, None, None, None, None, None, None) - Seq(data).toDF().createOrReplaceTempView("reflectOptionalData") + withTempView("reflectOptionalData") { + val data = OptionalReflectData(None, None, None, None, None, None, None) + 
Seq(data).toDF().createOrReplaceTempView("reflectOptionalData") - assert(sql("SELECT * FROM reflectOptionalData").collect().head === - Row.fromSeq(Seq.fill(7)(null))) + assert(sql("SELECT * FROM reflectOptionalData").collect().head === + Row.fromSeq(Seq.fill(7)(null))) + } } // Equality is broken for Arrays, so we test that separately. test("query binary data") { - Seq(ReflectBinary(Array[Byte](1))).toDF().createOrReplaceTempView("reflectBinary") + withTempView("reflectBinary") { + Seq(ReflectBinary(Array[Byte](1))).toDF().createOrReplaceTempView("reflectBinary") - val result = sql("SELECT data FROM reflectBinary") - .collect().head(0).asInstanceOf[Array[Byte]] - assert(result.toSeq === Seq[Byte](1)) + val result = sql("SELECT data FROM reflectBinary") + .collect().head(0).asInstanceOf[Array[Byte]] + assert(result.toSeq === Seq[Byte](1)) + } } test("query complex data") { - val data = ComplexReflectData( - Seq(1, 2, 3), - Seq(Some(1), Some(2), None), - Map(1 -> 10L, 2 -> 20L), - Map(1 -> Some(10L), 2 -> Some(20L), 3 -> None), - Data( - Seq(10, 20, 30), - Seq(Some(10), Some(20), None), - Map(10 -> 100L, 20 -> 200L), - Map(10 -> Some(100L), 20 -> Some(200L), 30 -> None), - Nested(None, "abc"))) - - Seq(data).toDF().createOrReplaceTempView("reflectComplexData") - assert(sql("SELECT * FROM reflectComplexData").collect().head === - Row( + withTempView("reflectComplexData") { + val data = ComplexReflectData( Seq(1, 2, 3), - Seq(1, 2, null), + Seq(Some(1), Some(2), None), Map(1 -> 10L, 2 -> 20L), - Map(1 -> 10L, 2 -> 20L, 3 -> null), - Row( + Map(1 -> Some(10L), 2 -> Some(20L), 3 -> None), + Data( Seq(10, 20, 30), - Seq(10, 20, null), + Seq(Some(10), Some(20), None), Map(10 -> 100L, 20 -> 200L), - Map(10 -> 100L, 20 -> 200L, 30 -> null), - Row(null, "abc")))) + Map(10 -> Some(100L), 20 -> Some(200L), 30 -> None), + Nested(None, "abc"))) + + Seq(data).toDF().createOrReplaceTempView("reflectComplexData") + assert(sql("SELECT * FROM reflectComplexData").collect().head === + Row( + Seq(1, 2, 3), + Seq(1, 2, null), + Map(1 -> 10L, 2 -> 20L), + Map(1 -> 10L, 2 -> 20L, 3 -> null), + Row( + Seq(10, 20, 30), + Seq(10, 20, null), + Map(10 -> 100L, 20 -> 200L), + Map(10 -> 100L, 20 -> 200L, 30 -> null), + Row(null, "abc")))) + } + } + + test("better error message when use java reserved keyword as field name") { + val e = intercept[UnsupportedOperationException] { + Seq(InvalidInJava(1)).toDS() + } + assert(e.getMessage.contains( + "`abstract` is not a valid identifier of Java and cannot be used as field name")) + } + + test("better error message when use invalid java identifier as field name") { + val e1 = intercept[UnsupportedOperationException] { + Seq(InvalidInJava2(1)).toDS() + } + assert(e1.getMessage.contains( + "`0` is not a valid identifier of Java and cannot be used as field name")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala index 31957a99e15af..003f5bc835d5f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala @@ -135,7 +135,7 @@ class SessionStateSuite extends SparkFunSuite { test("fork new session and inherit listener manager") { class CommandCollector extends QueryExecutionListener { val commands: ArrayBuffer[String] = ArrayBuffer.empty[String] - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable) : Unit = {} + override def onFailure(funcName: String, 
qe: QueryExecution, ex: Exception) : Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { commands += funcName } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala index b3b94f8be0d17..6253626856bcd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala @@ -173,7 +173,7 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { val createTable = "CREATE TABLE `t1` (`a` STRUCT<`b`: STRING>)" sql(s"$createTable USING json") val shownDDL = getShowDDL("SHOW CREATE TABLE t1") - assert(shownDDL == createTable) + assert(shownDDL == "CREATE TABLE `default`.`t1` (`a` STRUCT<`b`: STRING>)") checkCreateTable("t1") } @@ -188,18 +188,26 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { if (result.length > 1) result(0) + result(1) else result.head } - protected def checkCreateTable(table: String): Unit = { - checkCreateTableOrView(TableIdentifier(table, Some("default")), "TABLE") + protected def checkCreateTable(table: String, serde: Boolean = false): Unit = { + checkCreateTableOrView(TableIdentifier(table, Some("default")), "TABLE", serde) } - protected def checkCreateView(table: String): Unit = { - checkCreateTableOrView(TableIdentifier(table, Some("default")), "VIEW") + protected def checkCreateView(table: String, serde: Boolean = false): Unit = { + checkCreateTableOrView(TableIdentifier(table, Some("default")), "VIEW", serde) } - private def checkCreateTableOrView(table: TableIdentifier, checkType: String): Unit = { + protected def checkCreateTableOrView( + table: TableIdentifier, + checkType: String, + serde: Boolean): Unit = { val db = table.database.getOrElse("default") val expected = spark.sharedState.externalCatalog.getTable(db, table.table) - val shownDDL = sql(s"SHOW CREATE TABLE ${table.quotedString}").head().getString(0) + val shownDDL = if (serde) { + sql(s"SHOW CREATE TABLE ${table.quotedString} AS SERDE").head().getString(0) + } else { + sql(s"SHOW CREATE TABLE ${table.quotedString}").head().getString(0) + } + sql(s"DROP $checkType ${table.quotedString}") try { @@ -212,29 +220,6 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { } protected def checkCatalogTables(expected: CatalogTable, actual: CatalogTable): Unit = { - def normalize(table: CatalogTable): CatalogTable = { - val nondeterministicProps = Set( - "CreateTime", - "transient_lastDdlTime", - "grantTime", - "lastUpdateTime", - "last_modified_by", - "last_modified_time", - "Owner:", - // The following are hive specific schema parameters which we do not need to match exactly. 
- "totalNumberFiles", - "maxFileSize", - "minFileSize" - ) - - table.copy( - createTime = 0L, - lastAccessTime = 0L, - properties = table.properties.filterKeys(!nondeterministicProps.contains(_)), - stats = None, - ignoredProperties = Map.empty - ) - } - assert(normalize(actual) == normalize(expected)) + assert(CatalogTable.normalize(actual) == CatalogTable.normalize(expected)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala index 10b17571d2aaa..159d2c02188be 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionBuilderSuite.scala @@ -19,9 +19,11 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterEach -import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite} +import org.apache.spark.internal.config.EXECUTOR_ALLOW_SPARK_CONTEXT import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.StaticSQLConf._ /** * Test cases for the builder pattern of [[SparkSession]]. @@ -152,4 +154,114 @@ class SparkSessionBuilderSuite extends SparkFunSuite with BeforeAndAfterEach { session.sparkContext.hadoopConfiguration.unset(mySpecialKey) } } + + test("SPARK-31234: RESET command will not change static sql configs and " + + "spark context conf values in SessionState") { + val session = SparkSession.builder() + .master("local") + .config(GLOBAL_TEMP_DATABASE.key, value = "globalTempDB-SPARK-31234") + .config("spark.app.name", "test-app-SPARK-31234") + .getOrCreate() + + assert(session.sessionState.conf.getConfString("spark.app.name") === "test-app-SPARK-31234") + assert(session.sessionState.conf.getConf(GLOBAL_TEMP_DATABASE) === "globaltempdb-spark-31234") + session.sql("RESET") + assert(session.sessionState.conf.getConfString("spark.app.name") === "test-app-SPARK-31234") + assert(session.sessionState.conf.getConf(GLOBAL_TEMP_DATABASE) === "globaltempdb-spark-31234") + } + + test("SPARK-31354: SparkContext only register one SparkSession ApplicationEnd listener") { + val conf = new SparkConf() + .setMaster("local") + .setAppName("test-app-SPARK-31354-1") + val context = new SparkContext(conf) + SparkSession + .builder() + .sparkContext(context) + .master("local") + .getOrCreate() + val postFirstCreation = context.listenerBus.listeners.size() + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + + SparkSession + .builder() + .sparkContext(context) + .master("local") + .getOrCreate() + val postSecondCreation = context.listenerBus.listeners.size() + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + assert(postFirstCreation == postSecondCreation) + } + + test("SPARK-31532: should not propagate static sql configs to the existing" + + " active/default SparkSession") { + val session = SparkSession.builder() + .master("local") + .config(GLOBAL_TEMP_DATABASE.key, value = "globalTempDB-SPARK-31532") + .config("spark.app.name", "test-app-SPARK-31532") + .getOrCreate() + // do not propagate static sql configs to the existing active session + val session1 = SparkSession + .builder() + .config(GLOBAL_TEMP_DATABASE.key, "globalTempDB-SPARK-31532-1") + .getOrCreate() + assert(session.conf.get(GLOBAL_TEMP_DATABASE) === "globaltempdb-spark-31532") + 
assert(session1.conf.get(GLOBAL_TEMP_DATABASE) === "globaltempdb-spark-31532") + + // do not propagate static sql configs to the existing default session + SparkSession.clearActiveSession() + val session2 = SparkSession + .builder() + .config(WAREHOUSE_PATH.key, "SPARK-31532-db") + .config(GLOBAL_TEMP_DATABASE.key, value = "globalTempDB-SPARK-31532-2") + .getOrCreate() + + assert(!session.conf.get(WAREHOUSE_PATH).contains("SPARK-31532-db")) + assert(session.conf.get(WAREHOUSE_PATH) === session2.conf.get(WAREHOUSE_PATH)) + assert(session2.conf.get(GLOBAL_TEMP_DATABASE) === "globaltempdb-spark-31532") + } + + test("SPARK-31532: propagate static sql configs if no existing SparkSession") { + val conf = new SparkConf() + .setMaster("local") + .setAppName("test-app-SPARK-31532-2") + .set(GLOBAL_TEMP_DATABASE.key, "globaltempdb-spark-31532") + .set(WAREHOUSE_PATH.key, "SPARK-31532-db") + SparkContext.getOrCreate(conf) + + // propagate static sql configs if no existing session + val session = SparkSession + .builder() + .config(GLOBAL_TEMP_DATABASE.key, "globalTempDB-SPARK-31532-2") + .config(WAREHOUSE_PATH.key, "SPARK-31532-db-2") + .getOrCreate() + assert(session.conf.get("spark.app.name") === "test-app-SPARK-31532-2") + assert(session.conf.get(GLOBAL_TEMP_DATABASE) === "globaltempdb-spark-31532-2") + assert(session.conf.get(WAREHOUSE_PATH) === "SPARK-31532-db-2") + } + + test("SPARK-32160: Disallow to create SparkSession in executors if the config is set") { + val session = SparkSession.builder().master("local-cluster[3, 1, 1024]").getOrCreate() + + val error = intercept[SparkException] { + session.range(1).foreach { v => + SparkSession.builder.master("local") + .config(EXECUTOR_ALLOW_SPARK_CONTEXT.key, false).getOrCreate() + () + } + }.getMessage() + + assert(error.contains("SparkSession should only be created and accessed on the driver.")) + } + + test("SPARK-32160: Allow to create SparkSession in executors") { + val session = SparkSession.builder().master("local-cluster[3, 1, 1024]").getOrCreate() + + session.range(1).foreach { v => + SparkSession.builder.master("local").getOrCreate().stop() + () + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index 99ea95089d71c..e5e8bc6917799 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -16,17 +16,24 @@ */ package org.apache.spark.sql -import java.util.Locale +import java.util.{Locale, UUID} -import org.apache.spark.{SparkFunSuite, TaskContext} +import scala.concurrent.Future + +import org.apache.spark.{MapOutputStatistics, SparkFunSuite, TaskContext} +import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, UnresolvedHint} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Statistics, UnresolvedHint} +import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule +import 
org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, QueryStageExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, BroadcastExchangeLike, ShuffleExchangeExec, ShuffleExchangeLike} import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.COLUMN_BATCH_SIZE @@ -145,33 +152,83 @@ class SparkSessionExtensionSuite extends SparkFunSuite { } } - test("inject columnar") { + test("inject adaptive query prep rule") { val extensions = create { extensions => + // inject rule that will run during AQE query stage preparation and will add custom tags + // to the plan + extensions.injectQueryStagePrepRule(session => MyQueryStagePrepRule()) + // inject rule that will run during AQE query stage optimization and will verify that the + // custom tags were written in the preparation phase extensions.injectColumnar(session => - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) + MyColumarRule(MyNewQueryStageRule(), MyNewQueryStageRule())) } withSession(extensions) { session => - // The ApplyColumnarRulesAndInsertTransitions rule is not applied when enable AQE - session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, true) + assert(session.sessionState.queryStagePrepRules.contains(MyQueryStagePrepRule())) assert(session.sessionState.columnarRules.contains( - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) + MyColumarRule(MyNewQueryStageRule(), MyNewQueryStageRule()))) import session.sqlContext.implicits._ - // repartitioning avoids having the add operation pushed up into the LocalTableScan val data = Seq((100L), (200L), (300L)).toDF("vals").repartition(1) val df = data.selectExpr("vals + 1") - // Verify that both pre and post processing of the plan worked. 
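For context, query-stage preparation rules such as MyQueryStagePrepRule used above are ordinary Rule[SparkPlan] instances registered through SparkSessionExtensions. A minimal sketch of the wiring, assuming the injectQueryStagePrepRule API added by this change; NoopPrepRule is a placeholder invented here for illustration:

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.catalyst.rules.Rule
  import org.apache.spark.sql.execution.SparkPlan

  // A do-nothing preparation rule; a real rule would tag or rewrite physical plan nodes.
  case class NoopPrepRule() extends Rule[SparkPlan] {
    override def apply(plan: SparkPlan): SparkPlan = plan
  }

  val session = SparkSession.builder()
    .master("local[2]")
    .withExtensions(_.injectQueryStagePrepRule(_ => NoopPrepRule()))
    .getOrCreate()

Query-stage preparation rules run on the physical plan before adaptive execution breaks it into query stages, which is why the test above enables ADAPTIVE_EXECUTION_ENABLED before exercising the injected rule.
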
- val found = df.queryExecution.executedPlan.collect { - case rep: ReplacedRowToColumnarExec => 1 - case proj: ColumnarProjectExec => 10 - case c2r: ColumnarToRowExec => 100 - }.sum - assert(found == 111) + df.collect() + } + } + + test("inject columnar AQE on") { + testInjectColumnar(true) + } + + test("inject columnar AQE off") { + testInjectColumnar(false) + } + + private def testInjectColumnar(enableAQE: Boolean): Unit = { + def collectPlanSteps(plan: SparkPlan): Seq[Int] = plan match { + case a: AdaptiveSparkPlanExec => + assert(a.toString.startsWith("AdaptiveSparkPlan isFinalPlan=true")) + collectPlanSteps(a.executedPlan) + case _ => plan.collect { + case _: ReplacedRowToColumnarExec => 1 + case _: ColumnarProjectExec => 10 + case _: ColumnarToRowExec => 100 + case s: QueryStageExec => collectPlanSteps(s.plan).sum + case _: MyShuffleExchangeExec => 1000 + case _: MyBroadcastExchangeExec => 10000 + } + } + + val extensions = create { extensions => + extensions.injectColumnar(session => + MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) + } + withSession(extensions) { session => + session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, enableAQE) + assert(session.sessionState.columnarRules.contains( + MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) + import session.sqlContext.implicits._ + // perform a join to inject a broadcast exchange + val left = Seq((1, 50L), (2, 100L), (3, 150L)).toDF("l1", "l2") + val right = Seq((1, 50L), (2, 100L), (3, 150L)).toDF("r1", "r2") + val data = left.join(right, $"l1" === $"r1") + // repartitioning avoids having the add operation pushed up into the LocalTableScan + .repartition(1) + val df = data.selectExpr("l2 + r2") + // execute the plan so that the final adaptive plan is available when AQE is on + df.collect() + val found = collectPlanSteps(df.queryExecution.executedPlan).sum + // 1 MyBroadcastExchangeExec + // 1 MyShuffleExchangeExec + // 1 ColumnarToRowExec + // 2 ColumnarProjectExec + // 1 ReplacedRowToColumnarExec + // so 11121 is expected. + assert(found == 11121) // Verify that we get back the expected, wrong, result val result = df.collect() - assert(result(0).getLong(0) == 102L) // Check that broken columnar Add was used. - assert(result(1).getLong(0) == 202L) - assert(result(2).getLong(0) == 302L) + assert(result(0).getLong(0) == 101L) // Check that broken columnar Add was used. + assert(result(1).getLong(0) == 201L) + assert(result(2).getLong(0) == 301L) } } @@ -327,6 +384,9 @@ case class MyParser(spark: SparkSession, delegate: ParserInterface) extends Pars override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText) + + override def parseRawDataType(sqlText: String): DataType = + delegate.parseRawDataType(sqlText) } object MyExtensions { @@ -342,6 +402,7 @@ object MyExtensions { """ note """, + "", "3.0.0", """ deprecated @@ -667,6 +728,16 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] { def replaceWithColumnarPlan(plan: SparkPlan): SparkPlan = try { plan match { + case e: ShuffleExchangeExec => + // note that this is not actually columnar but demonstrates that exchanges can + // be replaced. + val replaced = e.withNewChildren(e.children.map(replaceWithColumnarPlan)) + MyShuffleExchangeExec(replaced.asInstanceOf[ShuffleExchangeExec]) + case e: BroadcastExchangeExec => + // note that this is not actually columnar but demonstrates that exchanges can + // be replaced. 
+ val replaced = e.withNewChildren(e.children.map(replaceWithColumnarPlan)) + MyBroadcastExchangeExec(replaced.asInstanceOf[BroadcastExchangeExec]) case plan: ProjectExec => new ColumnarProjectExec(plan.projectList.map((exp) => replaceWithColumnarExpression(exp).asInstanceOf[NamedExpression]), @@ -685,6 +756,41 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] { override def apply(plan: SparkPlan): SparkPlan = replaceWithColumnarPlan(plan) } +/** + * Custom Exchange used in tests to demonstrate that shuffles can be replaced regardless of + * whether AQE is enabled. + */ +case class MyShuffleExchangeExec(delegate: ShuffleExchangeExec) extends ShuffleExchangeLike { + override def numMappers: Int = delegate.numMappers + override def numPartitions: Int = delegate.numPartitions + override def canChangeNumPartitions: Boolean = delegate.canChangeNumPartitions + override def mapOutputStatisticsFuture: Future[MapOutputStatistics] = + delegate.mapOutputStatisticsFuture + override def getShuffleRDD(partitionSpecs: Array[ShufflePartitionSpec]): RDD[_] = + delegate.getShuffleRDD(partitionSpecs) + override def runtimeStatistics: Statistics = delegate.runtimeStatistics + override def child: SparkPlan = delegate.child + override protected def doExecute(): RDD[InternalRow] = delegate.execute() + override def outputPartitioning: Partitioning = delegate.outputPartitioning +} + +/** + * Custom Exchange used in tests to demonstrate that broadcasts can be replaced regardless of + * whether AQE is enabled. + */ +case class MyBroadcastExchangeExec(delegate: BroadcastExchangeExec) extends BroadcastExchangeLike { + override def runId: UUID = delegate.runId + override def relationFuture: java.util.concurrent.Future[Broadcast[Any]] = + delegate.relationFuture + override def completionFuture: Future[Broadcast[Any]] = delegate.completionFuture + override def runtimeStatistics: Statistics = delegate.runtimeStatistics + override def child: SparkPlan = delegate.child + override protected def doPrepare(): Unit = delegate.prepare() + override protected def doExecute(): RDD[InternalRow] = delegate.execute() + override def doExecuteBroadcast[T](): Broadcast[T] = delegate.executeBroadcast() + override def outputPartitioning: Partitioning = delegate.outputPartitioning +} + class ReplacedRowToColumnarExec(override val child: SparkPlan) extends RowToColumnarExec(child) { @@ -727,6 +833,31 @@ class MyExtensions extends (SparkSessionExtensions => Unit) { } } +object QueryPrepRuleHelper { + val myPrepTag: TreeNodeTag[String] = TreeNodeTag[String]("myPrepTag") + val myPrepTagValue: String = "myPrepTagValue" +} + +// this rule will run during AQE query preparation and will write custom tags to each node +case class MyQueryStagePrepRule() extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = plan.transformDown { + case plan => + plan.setTagValue(QueryPrepRuleHelper.myPrepTag, QueryPrepRuleHelper.myPrepTagValue) + plan + } +} + +// this rule will run during AQE query stage optimization and will verify custom tags were +// already written during query preparation phase +case class MyNewQueryStageRule() extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = plan.transformDown { + case plan if !plan.isInstanceOf[AdaptiveSparkPlanExec] => + assert(plan.getTagValue(QueryPrepRuleHelper.myPrepTag).get == + QueryPrepRuleHelper.myPrepTagValue) + plan + } +} + case class MyRule2(spark: SparkSession) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan 
= plan } @@ -752,6 +883,7 @@ object MyExtensions2 { """ note """, + "", "3.0.0", """ deprecated @@ -784,6 +916,7 @@ object MyExtensions2Duplicate { """ note """, + "", "3.0.0", """ deprecated diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index e9ceab6724659..708b98e8fe15a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -481,7 +481,8 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } - DateTimeTestUtils.outstandingTimezones.foreach { timeZone => + DateTimeTestUtils.outstandingZoneIds.foreach { zid => + val timeZone = TimeZone.getTimeZone(zid) checkTimestampStats(DateType, DateTimeUtils.TimeZoneUTC, timeZone) { stats => assert(stats.min.get.asInstanceOf[Int] == TimeUnit.SECONDS.toDays(start)) assert(stats.max.get.asInstanceOf[Int] == TimeUnit.SECONDS.toDays(end - 1)) @@ -650,4 +651,21 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared } } } + + Seq(true, false).foreach { caseSensitive => + test(s"SPARK-30903: Fail fast on duplicate columns when analyze columns " + + s"- caseSensitive=$caseSensitive") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val table = "test_table" + withTable(table) { + sql(s"CREATE TABLE $table (value string, name string) USING PARQUET") + val dupCol = if (caseSensitive) "value" else "VaLuE" + val errorMsg = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS value, name, $dupCol") + }.getMessage + assert(errorMsg.contains("Found duplicate column(s)")) + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala index fde8ddf491bd1..b6ea26ab95549 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql import java.{lang => jl} import java.io.File import java.sql.{Date, Timestamp} -import java.util.concurrent.TimeUnit import scala.collection.mutable import scala.util.Random @@ -30,6 +29,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatisti import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, HistogramSerializer, LogicalPlan} import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.test.SQLTestUtils @@ -51,10 +51,10 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils private val d2 = Date.valueOf(d2Str) private val t1Str = "2016-05-08 00:00:01.000000" private val t1Internal = date(2016, 5, 8, 0, 0, 1) - private val t1 = new Timestamp(TimeUnit.MICROSECONDS.toMillis(t1Internal)) + private val t1 = new Timestamp(DateTimeUtils.toMillis(t1Internal)) private val t2Str = "2016-05-09 00:00:02.000000" private val t2Internal = date(2016, 5, 9, 0, 0, 2) - private val t2 = new 
Timestamp(TimeUnit.MICROSECONDS.toMillis(t2Internal)) + private val t2 = new Timestamp(DateTimeUtils.toMillis(t2Internal)) /** * Define a very simple 3 row table used for testing column serialization. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index ff8f94c68c5ee..141f1279a6ee5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, Sort} import org.apache.spark.sql.execution.{ColumnarToRowExec, ExecSubqueryExpression, FileSourceScanExec, InputAdapter, ReusedSubqueryExec, ScalarSubquery, SubqueryExec, WholeStageCodegenExec} -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecution} import org.apache.spark.sql.execution.datasources.FileScanRDD import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -153,29 +153,31 @@ class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } test("uncorrelated scalar subquery on a DataFrame generated query") { - val df = Seq((1, "one"), (2, "two"), (3, "three")).toDF("key", "value") - df.createOrReplaceTempView("subqueryData") + withTempView("subqueryData") { + val df = Seq((1, "one"), (2, "two"), (3, "three")).toDF("key", "value") + df.createOrReplaceTempView("subqueryData") - checkAnswer( - sql("select (select key from subqueryData where key > 2 order by key limit 1) + 1"), - Array(Row(4)) - ) + checkAnswer( + sql("select (select key from subqueryData where key > 2 order by key limit 1) + 1"), + Array(Row(4)) + ) - checkAnswer( - sql("select -(select max(key) from subqueryData)"), - Array(Row(-3)) - ) + checkAnswer( + sql("select -(select max(key) from subqueryData)"), + Array(Row(-3)) + ) - checkAnswer( - sql("select (select value from subqueryData limit 0)"), - Array(Row(null)) - ) + checkAnswer( + sql("select (select value from subqueryData limit 0)"), + Array(Row(null)) + ) - checkAnswer( - sql("select (select min(value) from subqueryData" + - " where key = (select max(key) from subqueryData) - 1)"), - Array(Row("two")) - ) + checkAnswer( + sql("select (select min(value) from subqueryData" + + " where key = (select max(key) from subqueryData) - 1)"), + Array(Row("two")) + ) + } } test("SPARK-15677: Queries against local relations with scalar subquery in Select list") { @@ -1357,11 +1359,9 @@ class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } } - test("SPARK-27279: Reuse Subquery") { + test("SPARK-27279: Reuse Subquery", DisableAdaptiveExecution("reuse is dynamic in AQE")) { Seq(true, false).foreach { reuse => - withSQLConf(SQLConf.SUBQUERY_REUSE_ENABLED.key -> reuse.toString, - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { - // when enable AQE, the reusedExchange is inserted when executed. 
+ withSQLConf(SQLConf.SUBQUERY_REUSE_ENABLED.key -> reuse.toString) { val df = sql( """ |SELECT (SELECT avg(key) FROM testData) + (SELECT avg(key) FROM testData) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index aacb625d7921f..f4c6fb9b75ff8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import org.apache.spark.SparkConf +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.util.resourceToString import org.apache.spark.sql.internal.SQLConf @@ -46,11 +48,13 @@ class TPCDSQuerySuite extends BenchmarkQueryTest with TPCDSSchema { "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") + val sqlConfigs: Seq[(String, String)] = Nil + tpcdsQueries.foreach { name => val queryString = resourceToString(s"tpcds/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) test(name) { - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + withSQLConf(sqlConfigs: _*) { // check the plans can be properly generated val plan = sql(queryString).queryExecution.executedPlan checkGeneratedCode(plan) @@ -69,7 +73,7 @@ class TPCDSQuerySuite extends BenchmarkQueryTest with TPCDSSchema { val queryString = resourceToString(s"tpcds-v2.7.0/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) test(s"$name-v2.7") { - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + withSQLConf(sqlConfigs: _*) { // check the plans can be properly generated val plan = sql(queryString).queryExecution.executedPlan checkGeneratedCode(plan) @@ -98,3 +102,28 @@ } } } + +class TPCDSQueryWithStatsSuite extends TPCDSQuerySuite { + + override def beforeAll(): Unit = { + super.beforeAll() + for (tableName <- tableNames) { + // To simulate plan generation on actual TPCDS data, inject data stats here + spark.sessionState.catalog.alterTableStats( + TableIdentifier(tableName), Some(TPCDSTableStats.sf100TableStats(tableName))) + } + } + + // Sets configurations for enabling the optimization rules that + // exploit data statistics. + override val sqlConfigs = Seq( + SQLConf.CBO_ENABLED.key -> "true", + SQLConf.PLAN_STATS_ENABLED.key -> "true", + SQLConf.JOIN_REORDER_ENABLED.key -> "true" + ) +} + +class TPCDSQueryANSISuite extends TPCDSQuerySuite { + override protected def sparkConf: SparkConf = + super.sparkConf.set(SQLConf.ANSI_ENABLED, true) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala new file mode 100644 index 0000000000000..f39b4b8b56c2e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala @@ -0,0 +1,503 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatistics} + +object TPCDSTableStats { + + // These data statistics are extracted from generated TPCDS data with SF=100 + + // scalastyle:off line.size.limit + val sf100TableStats = Map( + "customer" -> CatalogStatistics(500000000L, Some(2000000L), Map( + "c_birth_country" -> CatalogColumnStat(Some(196L), None, None, Some(69626), Some(9), Some(20), None, CatalogColumnStat.VERSION), + "c_current_cdemo_sk" -> CatalogColumnStat(Some(1184426L), Some("1"), Some("1920798"), Some(69943), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_customer_id" -> CatalogColumnStat(Some(2000000L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "c_birth_day" -> CatalogColumnStat(Some(32L), Some("1"), Some("31"), Some(70166), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_last_name" -> CatalogColumnStat(Some(5250L), None, None, Some(70098), Some(7), Some(13), None, CatalogColumnStat.VERSION), + "c_login" -> CatalogColumnStat(Some(0L), None, None, Some(2000000), Some(20), Some(20), None, CatalogColumnStat.VERSION), + "c_first_shipto_date_sk" -> CatalogColumnStat(Some(3755L), Some("2449028"), Some("2452678"), Some(70080), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_first_name" -> CatalogColumnStat(Some(5146L), None, None, Some(69769), Some(6), Some(11), None, CatalogColumnStat.VERSION), + "c_birth_month" -> CatalogColumnStat(Some(12L), Some("1"), Some("12"), Some(69896), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_email_address" -> CatalogColumnStat(Some(1929800L), None, None, Some(70200), Some(28), Some(47), None, CatalogColumnStat.VERSION), + "c_current_addr_sk" -> CatalogColumnStat(Some(824389L), Some("1"), Some("1000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_salutation" -> CatalogColumnStat(Some(6L), None, None, Some(69840), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_current_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(69657), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_birth_year" -> CatalogColumnStat(Some(67L), Some("1924"), Some("1992"), Some(69986), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_customer_sk" -> CatalogColumnStat(Some(1903054L), Some("1"), Some("2000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "c_last_review_date" -> CatalogColumnStat(Some(357L), None, None, Some(70102), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "c_preferred_cust_flag" -> CatalogColumnStat(Some(2L), None, None, Some(69778), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "c_first_sales_date_sk" -> CatalogColumnStat(Some(3758L), Some("2448998"), Some("2452648"), Some(69950), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "store_sales" -> CatalogStatistics(42623559552L, Some(287997024L), Map( + "ss_ext_sales_price" -> CatalogColumnStat(Some(712602L), Some("0.00"), Some("19878.00"), Some(12955462), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_item_sk" -> 
CatalogColumnStat(Some(206807L), Some("1"), Some("204000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_coupon_amt" -> CatalogColumnStat(Some(925847L), Some("0.00"), Some("19225.00"), Some(12958053), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_store_sk" -> CatalogColumnStat(Some(199L), Some("1"), Some("400"), Some(12950651), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_quantity" -> CatalogColumnStat(Some(99L), Some("1"), Some("100"), Some(12953654), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_ext_discount_amt" -> CatalogColumnStat(Some(925847L), Some("0.00"), Some("19225.00"), Some(12958053), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(12957139), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_ext_tax" -> CatalogColumnStat(Some(135620L), Some("0.00"), Some("1762.38"), Some(12957705), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_sold_time_sk" -> CatalogColumnStat(Some(47827L), Some("28800"), Some("75599"), Some(12953300), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_promo_sk" -> CatalogColumnStat(Some(1026L), Some("1"), Some("1000"), Some(12954088), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_net_paid_inc_tax" -> CatalogColumnStat(Some(1404501L), Some("0.00"), Some("21344.38"), Some(12958941), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_wholesale_cost" -> CatalogColumnStat(Some(9503L), Some("1.00"), Some("100.00"), Some(12958327), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_cdemo_sk" -> CatalogColumnStat(Some(1847065L), Some("1"), Some("1920800"), Some(12955252), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_sold_date_sk" -> CatalogColumnStat(Some(1781L), Some("2450816"), Some("2452642"), Some(12955025), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_list_price" -> CatalogColumnStat(Some(19079L), Some("1.00"), Some("200.00"), Some(12952108), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_ext_list_price" -> CatalogColumnStat(Some(752016L), Some("1.00"), Some("20000.00"), Some(12955040), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_net_profit" -> CatalogColumnStat(Some(1388760L), Some("-10000.00"), Some("9889.00"), Some(12955156), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_ticket_number" -> CatalogColumnStat(Some(24596280L), Some("1"), Some("24000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_net_paid" -> CatalogColumnStat(Some(1058635L), Some("0.00"), Some("19878.00"), Some(12954554), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_ext_wholesale_cost" -> CatalogColumnStat(Some(382964L), Some("1.00"), Some("10000.00"), Some(12960060), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ss_customer_sk" -> CatalogColumnStat(Some(1903054L), Some("1"), Some("2000000"), Some(12952082), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_addr_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(12956686), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ss_sales_price" -> CatalogColumnStat(Some(19261L), Some("0.00"), Some("200.00"), Some(12958587), Some(8), Some(8), None, CatalogColumnStat.VERSION) + )), + "web_sales" -> CatalogStatistics(14688252348L, Some(72001237L), Map( + "ws_sold_time_sk" -> CatalogColumnStat(Some(83505L), Some("0"), Some("86399"), Some(17931), Some(4), Some(4), None, CatalogColumnStat.VERSION), + 
"ws_ext_wholesale_cost" -> CatalogColumnStat(Some(382964L), Some("1.00"), Some("10000.00"), Some(17814), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_bill_cdemo_sk" -> CatalogColumnStat(Some(1707373L), Some("1"), Some("1920800"), Some(17833), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_net_paid_inc_ship" -> CatalogColumnStat(Some(1629659L), Some("0.00"), Some("43468.92"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_sales_price" -> CatalogColumnStat(Some(29143L), Some("0.00"), Some("300.00"), Some(18005), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_ship_mode_sk" -> CatalogColumnStat(Some(20L), Some("1"), Some("20"), Some(17823), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_ship_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(17833), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_order_number" -> CatalogColumnStat(Some(6073146L), Some("1"), Some("6000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_sold_date_sk" -> CatalogColumnStat(Some(1781L), Some("2450816"), Some("2452642"), Some(17922), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_web_site_sk" -> CatalogColumnStat(Some(25L), Some("1"), Some("24"), Some(18030), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_bill_customer_sk" -> CatalogColumnStat(Some(1800267L), Some("1"), Some("2000000"), Some(17882), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_coupon_amt" -> CatalogColumnStat(Some(863374L), Some("0.00"), Some("27591.16"), Some(18027), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_ship_addr_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(17931), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_net_paid_inc_tax" -> CatalogColumnStat(Some(1695042L), Some("0.00"), Some("32492.90"), Some(18102), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_quantity" -> CatalogColumnStat(Some(99L), Some("1"), Some("100"), Some(18014), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_bill_addr_sk" -> CatalogColumnStat(Some(940457L), Some("1"), Some("1000000"), Some(17801), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_net_paid" -> CatalogColumnStat(Some(1247989L), Some("0.00"), Some("29810.00"), Some(17968), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_ext_discount_amt" -> CatalogColumnStat(Some(934889L), Some("0.00"), Some("29982.00"), Some(17890), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_item_sk" -> CatalogColumnStat(Some(206807L), Some("1"), Some("204000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_ext_tax" -> CatalogColumnStat(Some(168876L), Some("0.00"), Some("2682.90"), Some(17800), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_wholesale_cost" -> CatalogColumnStat(Some(9503L), Some("1.00"), Some("100.00"), Some(17850), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_ship_customer_sk" -> CatalogColumnStat(Some(1811873L), Some("1"), Some("2000000"), Some(17886), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_net_paid_inc_ship_tax" -> CatalogColumnStat(Some(2186741L), Some("0.00"), Some("44479.52"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_web_page_sk" -> CatalogColumnStat(Some(2108L), Some("1"), Some("2040"), Some(17920), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_ext_sales_price" -> CatalogColumnStat(Some(908147L), Some("0.00"), Some("29810.00"), Some(17843), Some(8), Some(8), 
None, CatalogColumnStat.VERSION), + "ws_ext_ship_cost" -> CatalogColumnStat(Some(482711L), Some("0.00"), Some("14927.00"), Some(17923), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_bill_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(18011), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_ship_date_sk" -> CatalogColumnStat(Some(1898L), Some("2450817"), Some("2452762"), Some(17883), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_ext_list_price" -> CatalogColumnStat(Some(1071558L), Some("1.02"), Some("29997.00"), Some(18001), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_ship_cdemo_sk" -> CatalogColumnStat(Some(1755797L), Some("1"), Some("1920800"), Some(17903), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_warehouse_sk" -> CatalogColumnStat(Some(15L), Some("1"), Some("15"), Some(17812), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_list_price" -> CatalogColumnStat(Some(29447L), Some("1.00"), Some("300.00"), Some(17824), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "ws_promo_sk" -> CatalogColumnStat(Some(1026L), Some("1"), Some("1000"), Some(18116), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ws_net_profit" -> CatalogColumnStat(Some(1482074L), Some("-9997.00"), Some("19840.00"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION) + )), + "date_dim" -> CatalogStatistics(19284936L, Some(73049L), Map( + "d_dom" -> CatalogColumnStat(Some(32L), Some("1"), Some("31"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_current_quarter" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_fy_week_seq" -> CatalogColumnStat(Some(10010L), Some("1"), Some("10436"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_holiday" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_current_day" -> CatalogColumnStat(Some(1L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_week_seq" -> CatalogColumnStat(Some(10010L), Some("1"), Some("10436"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_current_month" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_moy" -> CatalogColumnStat(Some(12L), Some("1"), Some("12"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_date" -> CatalogColumnStat(Some(73049L), None, None, Some(0), Some(10), Some(10), None, CatalogColumnStat.VERSION), + "d_same_day_lq" -> CatalogColumnStat(Some(73049L), Some("2414930"), Some("2487978"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_weekend" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_dow" -> CatalogColumnStat(Some(7L), Some("0"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_year" -> CatalogColumnStat(Some(202L), Some("1900"), Some("2100"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_current_week" -> CatalogColumnStat(Some(1L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_quarter_name" -> CatalogColumnStat(Some(774L), None, None, Some(0), Some(6), Some(6), None, CatalogColumnStat.VERSION), + "d_month_seq" -> CatalogColumnStat(Some(2431L), Some("0"), Some("2400"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_fy_year" -> CatalogColumnStat(Some(202L), Some("1900"), 
Some("2100"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_following_holiday" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_same_day_ly" -> CatalogColumnStat(Some(73049L), Some("2414657"), Some("2487705"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_day_name" -> CatalogColumnStat(Some(7L), None, None, Some(0), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "d_qoy" -> CatalogColumnStat(Some(4L), Some("1"), Some("4"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_date_sk" -> CatalogColumnStat(Some(73049L), Some("2415022"), Some("2488070"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_fy_quarter_seq" -> CatalogColumnStat(Some(787L), Some("1"), Some("801"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_current_year" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "d_date_id" -> CatalogColumnStat(Some(73049L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "d_quarter_seq" -> CatalogColumnStat(Some(787L), Some("1"), Some("801"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_last_dom" -> CatalogColumnStat(Some(2386L), Some("2415020"), Some("2488372"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "d_first_dom" -> CatalogColumnStat(Some(2329L), Some("2415021"), Some("2488070"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "household_demographics" -> CatalogStatistics(316800L, Some(7200L), Map( + "hd_buy_potential" -> CatalogColumnStat(Some(6L), None, None, Some(0), Some(8), Some(10), None, CatalogColumnStat.VERSION), + "hd_income_band_sk" -> CatalogColumnStat(Some(20L), Some("1"), Some("20"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "hd_demo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "hd_vehicle_count" -> CatalogColumnStat(Some(6L), Some("-1"), Some("4"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "hd_dep_count" -> CatalogColumnStat(Some(10L), Some("0"), Some("9"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "call_center" -> CatalogStatistics(16020L, Some(30L), Map( + "cc_rec_start_date" -> CatalogColumnStat(Some(4L), Some("1998-01-01"), Some("2002-01-01"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_mkt_desc" -> CatalogColumnStat(Some(21L), None, None, Some(0), Some(59), Some(93), None, CatalogColumnStat.VERSION), + "cc_hours" -> CatalogColumnStat(Some(3L), None, None, Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cc_street_name" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(8), Some(16), None, CatalogColumnStat.VERSION), + "cc_manager" -> CatalogColumnStat(Some(22L), None, None, Some(0), Some(13), Some(17), None, CatalogColumnStat.VERSION), + "cc_city" -> CatalogColumnStat(Some(12L), None, None, Some(0), Some(10), Some(15), None, CatalogColumnStat.VERSION), + "cc_class" -> CatalogColumnStat(Some(3L), None, None, Some(0), Some(6), Some(6), None, CatalogColumnStat.VERSION), + "cc_country" -> CatalogColumnStat(Some(1L), None, None, Some(0), Some(13), Some(13), None, CatalogColumnStat.VERSION), + "cc_open_date_sk" -> CatalogColumnStat(Some(15L), Some("2450794"), Some("2451146"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_market_manager" -> 
CatalogColumnStat(Some(25L), None, None, Some(0), Some(13), Some(17), None, CatalogColumnStat.VERSION), + "cc_street_number" -> CatalogColumnStat(Some(15L), None, None, Some(0), Some(3), Some(3), None, CatalogColumnStat.VERSION), + "cc_call_center_sk" -> CatalogColumnStat(Some(30L), Some("1"), Some("30"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_name" -> CatalogColumnStat(Some(15L), None, None, Some(0), Some(14), Some(19), None, CatalogColumnStat.VERSION), + "cc_suite_number" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "cc_mkt_id" -> CatalogColumnStat(Some(6L), Some("1"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_gmt_offset" -> CatalogColumnStat(Some(2L), Some("-6.00"), Some("-5.00"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cc_company_name" -> CatalogColumnStat(Some(6L), None, None, Some(0), Some(4), Some(5), None, CatalogColumnStat.VERSION), + "cc_division" -> CatalogColumnStat(Some(6L), Some("1"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_closed_date_sk" -> CatalogColumnStat(Some(0L), None, None, Some(30), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_state" -> CatalogColumnStat(Some(8L), None, None, Some(0), Some(2), Some(2), None, CatalogColumnStat.VERSION), + "cc_street_type" -> CatalogColumnStat(Some(9L), None, None, Some(0), Some(5), Some(9), None, CatalogColumnStat.VERSION), + "cc_call_center_id" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "cc_sq_ft" -> CatalogColumnStat(Some(22L), Some("1670015"), Some("31896816"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_mkt_class" -> CatalogColumnStat(Some(26L), None, None, Some(0), Some(35), Some(50), None, CatalogColumnStat.VERSION), + "cc_tax_percentage" -> CatalogColumnStat(Some(10L), Some("0.00"), Some("0.12"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cc_division_name" -> CatalogColumnStat(Some(6L), None, None, Some(0), Some(5), Some(5), None, CatalogColumnStat.VERSION), + "cc_zip" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(5), Some(5), None, CatalogColumnStat.VERSION), + "cc_rec_end_date" -> CatalogColumnStat(Some(3L), Some("2000-01-01"), Some("2001-12-31"), Some(15), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_employees" -> CatalogColumnStat(Some(22L), Some("2935"), Some("69020"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cc_county" -> CatalogColumnStat(Some(8L), None, None, Some(0), Some(15), Some(17), None, CatalogColumnStat.VERSION), + "cc_company" -> CatalogColumnStat(Some(6L), Some("1"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "catalog_page" -> CatalogStatistics(3774000L, Some(20400L), Map( + "cp_catalog_page_id" -> CatalogColumnStat(Some(20400L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "cp_department" -> CatalogColumnStat(Some(1L), None, None, Some(205), Some(10), Some(10), None, CatalogColumnStat.VERSION), + "cp_description" -> CatalogColumnStat(Some(19304L), None, None, Some(190), Some(75), Some(99), None, CatalogColumnStat.VERSION), + "cp_start_date_sk" -> CatalogColumnStat(Some(88L), Some("2450815"), Some("2453005"), Some(196), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cp_end_date_sk" -> CatalogColumnStat(Some(101L), Some("2450844"), Some("2453186"), Some(206), Some(4), Some(4), None, 
CatalogColumnStat.VERSION), + "cp_catalog_page_number" -> CatalogColumnStat(Some(186L), Some("1"), Some("188"), Some(208), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cp_type" -> CatalogColumnStat(Some(3L), None, None, Some(197), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "cp_catalog_page_sk" -> CatalogColumnStat(Some(18915L), Some("1"), Some("20400"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cp_catalog_number" -> CatalogColumnStat(Some(110L), Some("1"), Some("109"), Some(197), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "catalog_returns" -> CatalogStatistics(2189464848L, Some(14404374L), Map( + "cr_return_amount" -> CatalogColumnStat(Some(563788L), Some("0.00"), Some("28778.31"), Some(288408), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_returning_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(288369), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_refunded_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(288053), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_catalog_page_sk" -> CatalogColumnStat(Some(11224L), Some("1"), Some("17108"), Some(288041), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_reason_sk" -> CatalogColumnStat(Some(55L), Some("1"), Some("55"), Some(287890), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_returning_cdemo_sk" -> CatalogColumnStat(Some(1839372L), Some("1"), Some("1920800"), Some(288128), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_fee" -> CatalogColumnStat(Some(9571L), Some("0.50"), Some("100.00"), Some(288038), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_refunded_cash" -> CatalogColumnStat(Some(592855L), Some("0.00"), Some("24544.84"), Some(287638), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_refunded_cdemo_sk" -> CatalogColumnStat(Some(1842661L), Some("1"), Some("1920800"), Some(287556), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_order_number" -> CatalogColumnStat(Some(8816832L), Some("2"), Some("16000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_ship_mode_sk" -> CatalogColumnStat(Some(20L), Some("1"), Some("20"), Some(287768), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_refunded_customer_sk" -> CatalogColumnStat(Some(1894309L), Some("1"), Some("2000000"), Some(287207), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_return_amt_inc_tax" -> CatalogColumnStat(Some(855338L), Some("0.00"), Some("29353.87"), Some(288246), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_returning_customer_sk" -> CatalogColumnStat(Some(1903054L), Some("1"), Some("2000000"), Some(287581), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_returned_date_sk" -> CatalogColumnStat(Some(2027L), Some("2450821"), Some("2452921"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_return_ship_cost" -> CatalogColumnStat(Some(347391L), Some("0.00"), Some("14130.96"), Some(287952), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_returning_addr_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(288203), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_return_quantity" -> CatalogColumnStat(Some(99L), Some("1"), Some("100"), Some(287844), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_store_credit" -> CatalogColumnStat(Some(415989L), Some("0.00"), Some("22167.49"), Some(288118), Some(8), Some(8), None, CatalogColumnStat.VERSION), + 
"cr_item_sk" -> CatalogColumnStat(Some(206807L), Some("1"), Some("204000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_return_tax" -> CatalogColumnStat(Some(101557L), Some("0.00"), Some("2390.75"), Some(288599), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_refunded_addr_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(287752), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_call_center_sk" -> CatalogColumnStat(Some(31L), Some("1"), Some("30"), Some(288179), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_returned_time_sk" -> CatalogColumnStat(Some(83505L), Some("0"), Some("86399"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cr_net_loss" -> CatalogColumnStat(Some(550908L), Some("0.50"), Some("15781.83"), Some(287954), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_reversed_charge" -> CatalogColumnStat(Some(410432L), Some("0.00"), Some("23801.24"), Some(288476), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cr_warehouse_sk" -> CatalogColumnStat(Some(15L), Some("1"), Some("15"), Some(288581), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "catalog_sales" -> CatalogStatistics(29375401260L, Some(143997065L), Map( + "cs_ship_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(720450), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_bill_addr_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(718886), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_net_paid_inc_ship_tax" -> CatalogColumnStat(Some(2570239L), Some("0.00"), Some("45460.80"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_ext_sales_price" -> CatalogColumnStat(Some(977146L), Some("0.00"), Some("29808.00"), Some(719228), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_sold_date_sk" -> CatalogColumnStat(Some(1772L), Some("2450815"), Some("2452654"), Some(719581), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_ext_tax" -> CatalogColumnStat(Some(189612L), Some("0.00"), Some("2619.36"), Some(719627), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_sold_time_sk" -> CatalogColumnStat(Some(83505L), Some("0"), Some("86399"), Some(720917), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_net_paid_inc_ship" -> CatalogColumnStat(Some(1870941L), Some("0.00"), Some("43725.00"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_warehouse_sk" -> CatalogColumnStat(Some(15L), Some("1"), Some("15"), Some(719624), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_ext_list_price" -> CatalogColumnStat(Some(1101138L), Some("1.00"), Some("29997.00"), Some(719642), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_quantity" -> CatalogColumnStat(Some(99L), Some("1"), Some("100"), Some(720147), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_ext_discount_amt" -> CatalogColumnStat(Some(965247L), Some("0.00"), Some("29765.00"), Some(719820), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_net_profit" -> CatalogColumnStat(Some(1640663L), Some("-10000.00"), Some("19840.00"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_bill_hdemo_sk" -> CatalogColumnStat(Some(7083L), Some("1"), Some("7200"), Some(719849), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_ext_ship_cost" -> CatalogColumnStat(Some(508914L), Some("0.00"), Some("14896.00"), Some(719848), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_item_sk" -> 
CatalogColumnStat(Some(206807L), Some("1"), Some("204000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_ship_cdemo_sk" -> CatalogColumnStat(Some(1847065L), Some("1"), Some("1920800"), Some(720292), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_wholesale_cost" -> CatalogColumnStat(Some(9503L), Some("1.00"), Some("100.00"), Some(721114), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_ship_date_sk" -> CatalogColumnStat(Some(1887L), Some("2450817"), Some("2452744"), Some(719625), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_ext_wholesale_cost" -> CatalogColumnStat(Some(382964L), Some("1.00"), Some("10000.00"), Some(719924), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_bill_cdemo_sk" -> CatalogColumnStat(Some(1847065L), Some("1"), Some("1920800"), Some(720208), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_sales_price" -> CatalogColumnStat(Some(29282L), Some("0.00"), Some("300.00"), Some(719781), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_net_paid" -> CatalogColumnStat(Some(1330351L), Some("0.00"), Some("29760.00"), Some(719706), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_promo_sk" -> CatalogColumnStat(Some(1026L), Some("1"), Some("1000"), Some(720194), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_call_center_sk" -> CatalogColumnStat(Some(31L), Some("1"), Some("30"), Some(719767), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_catalog_page_sk" -> CatalogColumnStat(Some(11224L), Some("1"), Some("17108"), Some(719180), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_bill_customer_sk" -> CatalogColumnStat(Some(1903054L), Some("1"), Some("2000000"), Some(719473), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_list_price" -> CatalogColumnStat(Some(29447L), Some("1.00"), Some("300.00"), Some(720328), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_ship_customer_sk" -> CatalogColumnStat(Some(1903054L), Some("1"), Some("2000000"), Some(720582), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_coupon_amt" -> CatalogColumnStat(Some(982009L), Some("0.00"), Some("28422.94"), Some(719631), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_ship_addr_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(718680), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_order_number" -> CatalogColumnStat(Some(15603123L), Some("1"), Some("16000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cs_net_paid_inc_tax" -> CatalogColumnStat(Some(1807594L), Some("0.00"), Some("31745.52"), Some(719354), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "cs_ship_mode_sk" -> CatalogColumnStat(Some(20L), Some("1"), Some("20"), Some(720146), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "customer_address" -> CatalogStatistics(245000000L, Some(1000000L), Map( + "ca_country" -> CatalogColumnStat(Some(1L), None, None, Some(30097), Some(13), Some(13), None, CatalogColumnStat.VERSION), + "ca_address_id" -> CatalogColumnStat(Some(1000000L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "ca_address_sk" -> CatalogColumnStat(Some(943039L), Some("1"), Some("1000000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ca_county" -> CatalogColumnStat(Some(1957L), None, None, Some(30028), Some(14), Some(28), None, CatalogColumnStat.VERSION), + "ca_gmt_offset" -> CatalogColumnStat(Some(6L), Some("-10.00"), Some("-5.00"), Some(30131), Some(8), Some(8), 
None, CatalogColumnStat.VERSION), + "ca_street_type" -> CatalogColumnStat(Some(20L), None, None, Some(30124), Some(5), Some(9), None, CatalogColumnStat.VERSION), + "ca_street_name" -> CatalogColumnStat(Some(8292L), None, None, Some(30178), Some(9), Some(21), None, CatalogColumnStat.VERSION), + "ca_city" -> CatalogColumnStat(Some(977L), None, None, Some(30183), Some(9), Some(20), None, CatalogColumnStat.VERSION), + "ca_location_type" -> CatalogColumnStat(Some(3L), None, None, Some(30172), Some(9), Some(13), None, CatalogColumnStat.VERSION), + "ca_suite_number" -> CatalogColumnStat(Some(76L), None, None, Some(30047), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "ca_zip" -> CatalogColumnStat(Some(8311L), None, None, Some(30370), Some(5), Some(5), None, CatalogColumnStat.VERSION), + "ca_state" -> CatalogColumnStat(Some(54L), None, None, Some(30124), Some(2), Some(2), None, CatalogColumnStat.VERSION), + "ca_street_number" -> CatalogColumnStat(Some(1034L), None, None, Some(30226), Some(3), Some(4), None, CatalogColumnStat.VERSION) + )), + "customer_demographics" -> CatalogStatistics(182476000L, Some(1920800L), Map( + "cd_dep_employed_count" -> CatalogColumnStat(Some(7L), Some("0"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cd_dep_count" -> CatalogColumnStat(Some(7L), Some("0"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cd_education_status" -> CatalogColumnStat(Some(7L), None, None, Some(0), Some(10), Some(15), None, CatalogColumnStat.VERSION), + "cd_marital_status" -> CatalogColumnStat(Some(5L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "cd_gender" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "cd_purchase_estimate" -> CatalogColumnStat(Some(20L), Some("500"), Some("10000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cd_demo_sk" -> CatalogColumnStat(Some(1847065L), Some("1"), Some("1920800"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "cd_credit_rating" -> CatalogColumnStat(Some(4L), None, None, Some(0), Some(7), Some(9), None, CatalogColumnStat.VERSION), + "cd_dep_college_count" -> CatalogColumnStat(Some(7L), Some("0"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "income_band" -> CatalogStatistics(400L, Some(20L), Map( + "ib_income_band_sk" -> CatalogColumnStat(Some(20L), Some("1"), Some("20"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ib_lower_bound" -> CatalogColumnStat(Some(19L), Some("0"), Some("190001"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "ib_upper_bound" -> CatalogColumnStat(Some(20L), Some("10000"), Some("200000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "inventory" -> CatalogStatistics(9583920000L, Some(399330000L), Map( + "inv_date_sk" -> CatalogColumnStat(Some(267L), Some("2450815"), Some("2452635"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "inv_item_sk" -> CatalogColumnStat(Some(206807L), Some("1"), Some("204000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "inv_warehouse_sk" -> CatalogColumnStat(Some(15L), Some("1"), Some("15"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "inv_quantity_on_hand" -> CatalogColumnStat(Some(1026L), Some("0"), Some("1000"), Some(19969395), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "item" -> CatalogStatistics(94248000L, Some(204000L), Map( + "i_container" -> 
CatalogColumnStat(Some(1L), None, None, Some(510), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "i_formulation" -> CatalogColumnStat(Some(158873L), None, None, Some(530), Some(20), Some(20), None, CatalogColumnStat.VERSION), + "i_brand_id" -> CatalogColumnStat(Some(862L), Some("1001001"), Some("10016017"), Some(516), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "i_manager_id" -> CatalogColumnStat(Some(99L), Some("1"), Some("100"), Some(506), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "i_rec_end_date" -> CatalogColumnStat(Some(3L), None, None, Some(102000), Some(10), Some(10), None, CatalogColumnStat.VERSION), + "i_class" -> CatalogColumnStat(Some(100L), None, None, Some(499), Some(8), Some(15), None, CatalogColumnStat.VERSION), + "i_wholesale_cost" -> CatalogColumnStat(Some(6297L), Some("0.02"), Some("88.91"), Some(489), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "i_item_id" -> CatalogColumnStat(Some(104042L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "i_manufact_id" -> CatalogColumnStat(Some(1026L), Some("1"), Some("1000"), Some(498), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "i_item_desc" -> CatalogColumnStat(Some(158754L), None, None, Some(508), Some(101), Some(200), None, CatalogColumnStat.VERSION), + "i_rec_start_date" -> CatalogColumnStat(Some(4L), None, None, Some(522), Some(10), Some(10), None, CatalogColumnStat.VERSION), + "i_color" -> CatalogColumnStat(Some(88L), None, None, Some(524), Some(6), Some(10), None, CatalogColumnStat.VERSION), + "i_product_name" -> CatalogColumnStat(Some(203486L), None, None, Some(514), Some(23), Some(30), None, CatalogColumnStat.VERSION), + "i_category" -> CatalogColumnStat(Some(10L), None, None, Some(482), Some(6), Some(11), None, CatalogColumnStat.VERSION), + "i_class_id" -> CatalogColumnStat(Some(16L), Some("1"), Some("16"), Some(491), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "i_item_sk" -> CatalogColumnStat(Some(204000L), Some("1"), Some("204000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "i_manufact" -> CatalogColumnStat(Some(997L), None, None, Some(544), Some(12), Some(15), None, CatalogColumnStat.VERSION), + "i_brand" -> CatalogColumnStat(Some(671L), None, None, Some(510), Some(17), Some(22), None, CatalogColumnStat.VERSION), + "i_current_price" -> CatalogColumnStat(Some(8736L), Some("0.09"), Some("99.99"), Some(518), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "i_category_id" -> CatalogColumnStat(Some(10L), Some("1"), Some("10"), Some(515), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "i_size" -> CatalogColumnStat(Some(7L), None, None, Some(515), Some(5), Some(11), None, CatalogColumnStat.VERSION), + "i_units" -> CatalogColumnStat(Some(21L), None, None, Some(503), Some(5), Some(7), None, CatalogColumnStat.VERSION) + )), + "promotion" -> CatalogStatistics(268000L, Some(1000L), Map( + "p_promo_name" -> CatalogColumnStat(Some(10L), None, None, Some(18), Some(4), Some(5), None, CatalogColumnStat.VERSION), + "p_end_date_sk" -> CatalogColumnStat(Some(564L), Some("2450116"), Some("2450967"), Some(12), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "p_channel_radio" -> CatalogColumnStat(Some(1L), None, None, Some(13), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_channel_demo" -> CatalogColumnStat(Some(1L), None, None, Some(16), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_item_sk" -> CatalogColumnStat(Some(986L), Some("280"), Some("203966"), Some(14), Some(4), Some(4), None, 
CatalogColumnStat.VERSION), + "p_channel_dmail" -> CatalogColumnStat(Some(2L), None, None, Some(13), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_discount_active" -> CatalogColumnStat(Some(1L), None, None, Some(19), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_promo_id" -> CatalogColumnStat(Some(1000L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "p_channel_tv" -> CatalogColumnStat(Some(1L), None, None, Some(14), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_cost" -> CatalogColumnStat(Some(1L), Some("1000.00"), Some("1000.00"), Some(14), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "p_channel_email" -> CatalogColumnStat(Some(1L), None, None, Some(13), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_channel_event" -> CatalogColumnStat(Some(1L), None, None, Some(14), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_channel_catalog" -> CatalogColumnStat(Some(1L), None, None, Some(14), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_promo_sk" -> CatalogColumnStat(Some(1000L), Some("1"), Some("1000"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "p_purpose" -> CatalogColumnStat(Some(1L), None, None, Some(13), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "p_start_date_sk" -> CatalogColumnStat(Some(577L), Some("2450100"), Some("2450915"), Some(17), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "p_channel_press" -> CatalogColumnStat(Some(1L), None, None, Some(15), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "p_channel_details" -> CatalogColumnStat(Some(990L), None, None, Some(10), Some(40), Some(60), None, CatalogColumnStat.VERSION), + "p_response_target" -> CatalogColumnStat(Some(1L), Some("1"), Some("1"), Some(19), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "reason" -> CatalogStatistics(3630L, Some(55L), Map( + "r_reason_sk" -> CatalogColumnStat(Some(55L), Some("1"), Some("55"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "r_reason_id" -> CatalogColumnStat(Some(55L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "r_reason_desc" -> CatalogColumnStat(Some(55L), None, None, Some(0), Some(14), Some(43), None, CatalogColumnStat.VERSION) + )), + "ship_mode" -> CatalogStatistics(2420L, Some(20L), Map( + "sm_carrier" -> CatalogColumnStat(Some(19L), None, None, Some(0), Some(7), Some(14), None, CatalogColumnStat.VERSION), + "sm_ship_mode_sk" -> CatalogColumnStat(Some(20L), Some("1"), Some("20"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "sm_code" -> CatalogColumnStat(Some(4L), None, None, Some(0), Some(5), Some(7), None, CatalogColumnStat.VERSION), + "sm_type" -> CatalogColumnStat(Some(5L), None, None, Some(0), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "sm_contract" -> CatalogColumnStat(Some(18L), None, None, Some(0), Some(13), Some(20), None, CatalogColumnStat.VERSION), + "sm_ship_mode_id" -> CatalogColumnStat(Some(20L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION) + )), + "store" -> CatalogStatistics(207432L, Some(402L), Map( + "s_rec_end_date" -> CatalogColumnStat(Some(3L), None, None, Some(201), Some(10), Some(10), None, CatalogColumnStat.VERSION), + "s_state" -> CatalogColumnStat(Some(9L), None, None, Some(2), Some(2), Some(2), None, CatalogColumnStat.VERSION), + "s_street_type" -> CatalogColumnStat(Some(20L), None, None, Some(6), Some(5), Some(9), None, CatalogColumnStat.VERSION), + "s_floor_space" -> 
CatalogColumnStat(Some(300L), Some("5004767"), Some("9997773"), Some(3), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_division_name" -> CatalogColumnStat(Some(1L), None, None, Some(5), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "s_market_desc" -> CatalogColumnStat(Some(318L), None, None, Some(5), Some(59), Some(100), None, CatalogColumnStat.VERSION), + "s_street_name" -> CatalogColumnStat(Some(255L), None, None, Some(6), Some(9), Some(16), None, CatalogColumnStat.VERSION), + "s_county" -> CatalogColumnStat(Some(9L), None, None, Some(4), Some(15), Some(17), None, CatalogColumnStat.VERSION), + "s_rec_start_date" -> CatalogColumnStat(Some(4L), None, None, Some(4), Some(10), Some(10), None, CatalogColumnStat.VERSION), + "s_company_id" -> CatalogColumnStat(Some(1L), Some("1"), Some("1"), Some(4), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_market_manager" -> CatalogColumnStat(Some(275L), None, None, Some(7), Some(13), Some(20), None, CatalogColumnStat.VERSION), + "s_number_employees" -> CatalogColumnStat(Some(94L), Some("200"), Some("300"), Some(5), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_manager" -> CatalogColumnStat(Some(300L), None, None, Some(5), Some(13), Some(20), None, CatalogColumnStat.VERSION), + "s_country" -> CatalogColumnStat(Some(1L), None, None, Some(4), Some(13), Some(13), None, CatalogColumnStat.VERSION), + "s_store_name" -> CatalogColumnStat(Some(10L), None, None, Some(2), Some(4), Some(5), None, CatalogColumnStat.VERSION), + "s_division_id" -> CatalogColumnStat(Some(1L), Some("1"), Some("1"), Some(2), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_street_number" -> CatalogColumnStat(Some(274L), None, None, Some(5), Some(3), Some(3), None, CatalogColumnStat.VERSION), + "s_company_name" -> CatalogColumnStat(Some(1L), None, None, Some(3), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "s_gmt_offset" -> CatalogColumnStat(Some(2L), Some("-6.00"), Some("-5.00"), Some(4), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "s_store_sk" -> CatalogColumnStat(Some(399L), Some("1"), Some("402"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_city" -> CatalogColumnStat(Some(18L), None, None, Some(5), Some(10), Some(15), None, CatalogColumnStat.VERSION), + "s_zip" -> CatalogColumnStat(Some(99L), None, None, Some(6), Some(5), Some(5), None, CatalogColumnStat.VERSION), + "s_market_id" -> CatalogColumnStat(Some(10L), Some("1"), Some("10"), Some(6), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_hours" -> CatalogColumnStat(Some(3L), None, None, Some(4), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "s_suite_number" -> CatalogColumnStat(Some(75L), None, None, Some(2), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "s_closed_date_sk" -> CatalogColumnStat(Some(70L), Some("2450823"), Some("2451313"), Some(296), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "s_store_id" -> CatalogColumnStat(Some(210L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "s_geography_class" -> CatalogColumnStat(Some(1L), None, None, Some(3), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "s_tax_precentage" -> CatalogColumnStat(Some(12L), Some("0.00"), Some("0.11"), Some(5), Some(8), Some(8), None, CatalogColumnStat.VERSION) + )), + "store_returns" -> CatalogStatistics(4837573440L, Some(28795080L), Map( + "sr_item_sk" -> CatalogColumnStat(Some(197284L), Some("1"), Some("204000"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_reversed_charge" 
-> CatalogColumnStat(Some(423824L), Some("0.00"), Some("16099.52"), Some(1009035), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_return_amt_inc_tax" -> CatalogColumnStat(Some(807709L), Some("0.00"), Some("20002.89"), Some(1006919), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_store_sk" -> CatalogColumnStat(Some(199L), Some("1"), Some("400"), Some(1007164), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_return_quantity" -> CatalogColumnStat(Some(103L), Some("1"), Some("100"), Some(1007948), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_fee" -> CatalogColumnStat(Some(9571L), Some("0.50"), Some("100.00"), Some(1008291), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_refunded_cash" -> CatalogColumnStat(Some(559685L), Some("0.00"), Some("17556.95"), Some(1008003), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_return_time_sk" -> CatalogColumnStat(Some(31932L), Some("28799"), Some("61199"), Some(1009330), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_addr_sk" -> CatalogColumnStat(Some(925738L), Some("1"), Some("1000000"), Some(1008253), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_return_amt" -> CatalogColumnStat(Some(492928L), Some("0.00"), Some("18973.20"), Some(1007419), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_returned_date_sk" -> CatalogColumnStat(Some(2010L), Some("2450820"), Some("2452822"), Some(1007464), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_return_tax" -> CatalogColumnStat(Some(86618L), Some("0.00"), Some("1611.71"), Some(1008618), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_return_ship_cost" -> CatalogColumnStat(Some(290195L), Some("0.00"), Some("9578.25"), Some(1007846), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_reason_sk" -> CatalogColumnStat(Some(57L), Some("1"), Some("55"), Some(1008299), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_net_loss" -> CatalogColumnStat(Some(487073L), Some("0.50"), Some("10447.72"), Some(1007153), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_cdemo_sk" -> CatalogColumnStat(Some(1807132L), Some("1"), Some("1920800"), Some(1006835), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_hdemo_sk" -> CatalogColumnStat(Some(6609L), Some("1"), Some("7200"), Some(1008547), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_store_credit" -> CatalogColumnStat(Some(410133L), Some("0.00"), Some("15642.11"), Some(1007102), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_customer_sk" -> CatalogColumnStat(Some(1860981L), Some("1"), Some("2000000"), Some(1008429), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "sr_ticket_number" -> CatalogColumnStat(Some(15853105L), Some("1"), Some("23999996"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION) + )), + "time_dim" -> CatalogStatistics(10886400L, Some(86400L), Map( + "t_sub_shift" -> CatalogColumnStat(Some(4L), None, None, Some(0), Some(7), Some(9), None, CatalogColumnStat.VERSION), + "t_time" -> CatalogColumnStat(Some(83505L), Some("0"), Some("86399"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "t_second" -> CatalogColumnStat(Some(60L), Some("0"), Some("59"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "t_hour" -> CatalogColumnStat(Some(25L), Some("0"), Some("23"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "t_am_pm" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(2), Some(2), None, CatalogColumnStat.VERSION), + "t_minute" -> 
CatalogColumnStat(Some(60L), Some("0"), Some("59"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "t_shift" -> CatalogColumnStat(Some(3L), None, None, Some(0), Some(6), Some(6), None, CatalogColumnStat.VERSION), + "t_time_id" -> CatalogColumnStat(Some(80197L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "t_time_sk" -> CatalogColumnStat(Some(83505L), Some("0"), Some("86399"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "t_meal_time" -> CatalogColumnStat(Some(3L), None, None, Some(50400), Some(7), Some(9), None, CatalogColumnStat.VERSION) + )), + "warehouse" -> CatalogStatistics(3855L, Some(15L), Map( + "w_state" -> CatalogColumnStat(Some(8L), None, None, Some(0), Some(2), Some(2), None, CatalogColumnStat.VERSION), + "w_street_name" -> CatalogColumnStat(Some(14L), None, None, Some(1), Some(10), Some(14), None, CatalogColumnStat.VERSION), + "w_warehouse_name" -> CatalogColumnStat(Some(14L), None, None, Some(1), Some(17), Some(20), None, CatalogColumnStat.VERSION), + "w_county" -> CatalogColumnStat(Some(8L), None, None, Some(0), Some(14), Some(16), None, CatalogColumnStat.VERSION), + "w_street_number" -> CatalogColumnStat(Some(14L), None, None, Some(1), Some(3), Some(3), None, CatalogColumnStat.VERSION), + "w_warehouse_sq_ft" -> CatalogColumnStat(Some(14L), Some("73065"), Some("977787"), Some(1), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "w_city" -> CatalogColumnStat(Some(11L), None, None, Some(0), Some(8), Some(13), None, CatalogColumnStat.VERSION), + "w_warehouse_sk" -> CatalogColumnStat(Some(15L), Some("1"), Some("15"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "w_suite_number" -> CatalogColumnStat(Some(13L), None, None, Some(1), Some(8), Some(9), None, CatalogColumnStat.VERSION), + "w_zip" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(5), Some(5), None, CatalogColumnStat.VERSION), + "w_gmt_offset" -> CatalogColumnStat(Some(2L), Some("-6.00"), Some("-5.00"), Some(1), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "w_street_type" -> CatalogColumnStat(Some(10L), None, None, Some(1), Some(5), Some(7), None, CatalogColumnStat.VERSION), + "w_warehouse_id" -> CatalogColumnStat(Some(15L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "w_country" -> CatalogColumnStat(Some(1L), None, None, Some(0), Some(13), Some(13), None, CatalogColumnStat.VERSION) + )), + "web_page" -> CatalogStatistics(281520L, Some(2040L), Map( + "wp_type" -> CatalogColumnStat(Some(7L), None, None, Some(19), Some(7), Some(9), None, CatalogColumnStat.VERSION), + "wp_autogen_flag" -> CatalogColumnStat(Some(2L), None, None, Some(25), Some(1), Some(1), None, CatalogColumnStat.VERSION), + "wp_url" -> CatalogColumnStat(Some(1L), None, None, Some(25), Some(18), Some(18), None, CatalogColumnStat.VERSION), + "wp_image_count" -> CatalogColumnStat(Some(7L), Some("1"), Some("7"), Some(20), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_max_ad_count" -> CatalogColumnStat(Some(5L), Some("0"), Some("4"), Some(21), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_customer_sk" -> CatalogColumnStat(Some(486L), Some("711"), Some("1996257"), Some(1471), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_rec_end_date" -> CatalogColumnStat(Some(3L), Some("1999-09-03"), Some("2001-09-02"), Some(1020), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_creation_date_sk" -> CatalogColumnStat(Some(121L), Some("2450672"), Some("2450815"), Some(20), Some(4), Some(4), None, 
CatalogColumnStat.VERSION), + "wp_link_count" -> CatalogColumnStat(Some(25L), Some("2"), Some("25"), Some(16), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_web_page_id" -> CatalogColumnStat(Some(1004L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "wp_web_page_sk" -> CatalogColumnStat(Some(2040L), Some("1"), Some("2040"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_rec_start_date" -> CatalogColumnStat(Some(4L), Some("1997-09-03"), Some("2001-09-03"), Some(21), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_access_date_sk" -> CatalogColumnStat(Some(100L), Some("2452548"), Some("2452648"), Some(19), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "wp_char_count" -> CatalogColumnStat(Some(1493L), Some("303"), Some("8523"), Some(25), Some(4), Some(4), None, CatalogColumnStat.VERSION) + )), + "web_returns" -> CatalogStatistics(1439534000L, Some(7197670L), Map( + "wr_returning_addr_sk" -> CatalogColumnStat(Some(925738L), Some("1"), Some("1000000"), Some(323850), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_reversed_charge" -> CatalogColumnStat(Some(346909L), Some("0.00"), Some("22972.36"), Some(323810), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_returned_date_sk" -> CatalogColumnStat(Some(2189L), Some("2450820"), Some("2453002"), Some(324185), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_order_number" -> CatalogColumnStat(Some(4098425L), Some("1"), Some("5999999"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_fee" -> CatalogColumnStat(Some(9571L), Some("0.50"), Some("100.00"), Some(324065), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_account_credit" -> CatalogColumnStat(Some(334119L), Some("0.00"), Some("23028.27"), Some(324422), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_refunded_customer_sk" -> CatalogColumnStat(Some(1808850L), Some("1"), Some("2000000"), Some(324191), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_returning_customer_sk" -> CatalogColumnStat(Some(1813094L), Some("1"), Some("2000000"), Some(324024), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_return_tax" -> CatalogColumnStat(Some(88469L), Some("0.00"), Some("2551.16"), Some(323621), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_web_page_sk" -> CatalogColumnStat(Some(1994L), Some("1"), Some("2040"), Some(324900), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_item_sk" -> CatalogColumnStat(Some(197284L), Some("1"), Some("204000"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_reason_sk" -> CatalogColumnStat(Some(57L), Some("1"), Some("55"), Some(323666), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_return_amt_inc_tax" -> CatalogColumnStat(Some(683544L), Some("0.00"), Some("29493.38"), Some(323171), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_net_loss" -> CatalogColumnStat(Some(494707L), Some("0.50"), Some("15068.96"), Some(324438), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_refunded_cdemo_sk" -> CatalogColumnStat(Some(1755065L), Some("1"), Some("1920800"), Some(323863), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_returned_time_sk" -> CatalogColumnStat(Some(84232L), Some("0"), Some("86399"), Some(323677), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_return_amt" -> CatalogColumnStat(Some(453015L), Some("0.00"), Some("28346.31"), Some(323473), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_returning_cdemo_sk" -> 
CatalogColumnStat(Some(1755065L), Some("1"), Some("1920800"), Some(323899), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_returning_hdemo_sk" -> CatalogColumnStat(Some(6609L), Some("1"), Some("7200"), Some(323999), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_refunded_cash" -> CatalogColumnStat(Some(484316L), Some("0.00"), Some("26466.56"), Some(324693), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_refunded_hdemo_sk" -> CatalogColumnStat(Some(6609L), Some("1"), Some("7200"), Some(324230), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_return_quantity" -> CatalogColumnStat(Some(103L), Some("1"), Some("100"), Some(323764), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_return_ship_cost" -> CatalogColumnStat(Some(302038L), Some("0.00"), Some("13602.60"), Some(323341), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "wr_refunded_addr_sk" -> CatalogColumnStat(Some(925738L), Some("1"), Some("1000000"), Some(324482), Some(8), Some(8), None, CatalogColumnStat.VERSION) + )), + "web_site" -> CatalogStatistics(11760L, Some(24L), Map( + "web_rec_end_date" -> CatalogColumnStat(Some(3L), Some("1999-08-16"), Some("2001-08-15"), Some(12), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_market_manager" -> CatalogColumnStat(Some(21L), None, None, Some(0), Some(13), Some(15), None, CatalogColumnStat.VERSION), + "web_country" -> CatalogColumnStat(Some(1L), None, None, Some(0), Some(13), Some(13), None, CatalogColumnStat.VERSION), + "web_street_name" -> CatalogColumnStat(Some(24L), None, None, Some(0), Some(10), Some(14), None, CatalogColumnStat.VERSION), + "web_city" -> CatalogColumnStat(Some(11L), None, None, Some(0), Some(10), Some(15), None, CatalogColumnStat.VERSION), + "web_mkt_id" -> CatalogColumnStat(Some(6L), Some("1"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_close_date_sk" -> CatalogColumnStat(Some(8L), Some("2443328"), Some("2447131"), Some(4), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_street_number" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(3), Some(3), None, CatalogColumnStat.VERSION), + "web_gmt_offset" -> CatalogColumnStat(Some(2L), None, None, Some(0), Some(2), Some(2), None, CatalogColumnStat.VERSION), + "web_rec_start_date" -> CatalogColumnStat(Some(4L), Some("1997-08-16"), Some("2001-08-16"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_mkt_class" -> CatalogColumnStat(Some(17L), None, None, Some(0), Some(32), Some(47), None, CatalogColumnStat.VERSION), + "web_county" -> CatalogColumnStat(Some(9L), None, None, Some(0), Some(14), Some(17), None, CatalogColumnStat.VERSION), + "web_class" -> CatalogColumnStat(Some(1L), None, None, Some(0), Some(7), Some(7), None, CatalogColumnStat.VERSION), + "web_site_sk" -> CatalogColumnStat(Some(24L), Some("1"), Some("24"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_manager" -> CatalogColumnStat(Some(19L), None, None, Some(0), Some(13), Some(16), None, CatalogColumnStat.VERSION), + "web_suite_number" -> CatalogColumnStat(Some(20L), None, None, Some(0), Some(9), Some(9), None, CatalogColumnStat.VERSION), + "web_site_id" -> CatalogColumnStat(Some(12L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), + "web_company_name" -> CatalogColumnStat(Some(6L), None, None, Some(0), Some(5), Some(5), None, CatalogColumnStat.VERSION), + "web_state" -> CatalogColumnStat(Some(9L), None, None, Some(0), Some(2), Some(2), None, CatalogColumnStat.VERSION), + 
"web_mkt_desc" -> CatalogColumnStat(Some(15L), None, None, Some(0), Some(66), Some(92), None, CatalogColumnStat.VERSION), + "web_name" -> CatalogColumnStat(Some(4L), None, None, Some(0), Some(6), Some(6), None, CatalogColumnStat.VERSION), + "web_company_id" -> CatalogColumnStat(Some(6L), Some("1"), Some("6"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_street_type" -> CatalogColumnStat(Some(15L), None, None, Some(0), Some(4), Some(9), None, CatalogColumnStat.VERSION), + "web_open_date_sk" -> CatalogColumnStat(Some(12L), Some("2450628"), Some("2450807"), Some(0), Some(4), Some(4), None, CatalogColumnStat.VERSION), + "web_tax_percentage" -> CatalogColumnStat(Some(8L), Some("0.00"), Some("0.12"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), + "web_zip" -> CatalogColumnStat(Some(14L), None, None, Some(0), Some(5), Some(5), None, CatalogColumnStat.VERSION) + )) + ) + // scalastyle:on line.size.limit +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala index fd6bc9662bfad..d2a6358ee822b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TestQueryExecutionListener.scala @@ -28,7 +28,7 @@ class TestQueryExecutionListener extends QueryExecutionListener { OnSuccessCall.isOnSuccessCalled.set(true) } - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = { } + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index cc3995516dcc2..91e9f1d9afb80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql import java.math.BigDecimal +import org.apache.spark.SparkException import org.apache.spark.sql.api.java._ -import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.encoders.OuterScopes import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.{QueryExecution, SimpleMode} import org.apache.spark.sql.execution.columnar.InMemoryRelation @@ -33,7 +34,6 @@ import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.QueryExecutionListener - private case class FunctionResult(f1: String, f2: String) class UDFSuite extends QueryTest with SharedSparkSession { @@ -134,10 +134,12 @@ class UDFSuite extends QueryTest with SharedSparkSession { assert(df1.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) assert(df1.head().getDouble(0) >= 0.0) - val bar = udf(() => Math.random(), DataTypes.DoubleType).asNondeterministic() - val df2 = testData.select(bar()) - assert(df2.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) - assert(df2.head().getDouble(0) >= 0.0) + withSQLConf(SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF.key -> "true") { + val bar = udf(() => Math.random(), DataTypes.DoubleType).asNondeterministic() + val df2 = testData.select(bar()) + assert(df2.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) + assert(df2.head().getDouble(0) >= 0.0) + } val javaUdf = udf(new UDF0[Double] { override def call(): Double = Math.random() @@ -339,7 
+341,7 @@ class UDFSuite extends QueryTest with SharedSparkSession { withTempPath { path => var numTotalCachedHit = 0 val listener = new QueryExecutionListener { - override def onFailure(f: String, qe: QueryExecution, e: Throwable): Unit = {} + override def onFailure(f: String, qe: QueryExecution, e: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { qe.withCachedData match { @@ -441,16 +443,23 @@ class UDFSuite extends QueryTest with SharedSparkSession { } test("SPARK-25044 Verify null input handling for primitive types - with udf(Any, DataType)") { - val f = udf((x: Int) => x, IntegerType) - checkAnswer( - Seq(Integer.valueOf(1), null).toDF("x").select(f($"x")), - Row(1) :: Row(0) :: Nil) + withSQLConf(SQLConf.LEGACY_ALLOW_UNTYPED_SCALA_UDF.key -> "true") { + val f = udf((x: Int) => x, IntegerType) + checkAnswer( + Seq(Integer.valueOf(1), null).toDF("x").select(f($"x")), + Row(1) :: Row(0) :: Nil) + + val f2 = udf((x: Double) => x, DoubleType) + checkAnswer( + Seq(java.lang.Double.valueOf(1.1), null).toDF("x").select(f2($"x")), + Row(1.1) :: Row(0.0) :: Nil) + } - val f2 = udf((x: Double) => x, DoubleType) - checkAnswer( - Seq(java.lang.Double.valueOf(1.1), null).toDF("x").select(f2($"x")), - Row(1.1) :: Row(0.0) :: Nil) + } + test("use untyped Scala UDF should fail by default") { + val e = intercept[AnalysisException](udf((x: Int) => x, IntegerType)) + assert(e.getMessage.contains("You're using untyped Scala UDF")) } test("SPARK-26308: udf with decimal") { @@ -525,21 +534,79 @@ class UDFSuite extends QueryTest with SharedSparkSession { assert(spark.range(2).select(nonDeterministicJavaUDF()).distinct().count() == 2) } - test("Replace _FUNC_ in UDF ExpressionInfo") { - val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("upper")) - assert(info.getName === "upper") - assert(info.getClassName === "org.apache.spark.sql.catalyst.expressions.Upper") - assert(info.getUsage === "upper(str) - Returns `str` with all characters changed to uppercase.") - assert(info.getExamples.contains("> SELECT upper('SparkSql');")) - assert(info.getSince === "1.0.1") - assert(info.getNote === "") - assert(info.getExtended.contains("> SELECT upper('SparkSql');")) - } - test("SPARK-28521 error message for CAST(parameter types contains DataType)") { val e = intercept[AnalysisException] { spark.sql("SELECT CAST(1)") } assert(e.getMessage.contains("Invalid arguments for function cast")) } + + test("only one case class parameter") { + val f = (d: TestData) => d.key * d.value.toInt + val myUdf = udf(f) + val df = Seq(("data", TestData(50, "2"))).toDF("col1", "col2") + checkAnswer(df.select(myUdf(Column("col2"))), Row(100) :: Nil) + } + + test("one case class with primitive parameter") { + val f = (i: Int, p: TestData) => p.key * i + val myUdf = udf(f) + val df = Seq((2, TestData(50, "data"))).toDF("col1", "col2") + checkAnswer(df.select(myUdf(Column("col1"), Column("col2"))), Row(100) :: Nil) + } + + test("multiple case class parameters") { + val f = (d1: TestData, d2: TestData) => d1.key * d2.key + val myUdf = udf(f) + val df = Seq((TestData(10, "d1"), TestData(50, "d2"))).toDF("col1", "col2") + checkAnswer(df.select(myUdf(Column("col1"), Column("col2"))), Row(500) :: Nil) + } + + test("input case class parameter and return case class") { + val f = (d: TestData) => TestData(d.key * 2, "copy") + val myUdf = udf(f) + val df = Seq(("data", TestData(50, "d2"))).toDF("col1", "col2") + checkAnswer(df.select(myUdf(Column("col2"))), Row(Row(100, 
"copy")) :: Nil) + } + + test("any and case class parameter") { + val f = (any: Any, d: TestData) => s"${any.toString}, ${d.value}" + val myUdf = udf(f) + val df = Seq(("Hello", TestData(50, "World"))).toDF("col1", "col2") + checkAnswer(df.select(myUdf(Column("col1"), Column("col2"))), Row("Hello, World") :: Nil) + } + + test("nested case class parameter") { + val f = (y: Int, training: TrainingSales) => training.sales.year + y + val myUdf = udf(f) + val df = Seq((20, TrainingSales("training", CourseSales("course", 2000, 3.14)))) + .toDF("col1", "col2") + checkAnswer(df.select(myUdf(Column("col1"), Column("col2"))), Row(2020) :: Nil) + } + + object MalformedClassObject extends Serializable { + class MalformedNonPrimitiveFunction extends (String => Int) with Serializable { + override def apply(v1: String): Int = v1.toInt / 0 + } + + class MalformedPrimitiveFunction extends (Int => Int) with Serializable { + override def apply(v1: Int): Int = v1 / 0 + } + } + + test("SPARK-32238: Use Utils.getSimpleName to avoid hitting Malformed class name") { + OuterScopes.addOuterScope(MalformedClassObject) + val f1 = new MalformedClassObject.MalformedNonPrimitiveFunction() + val f2 = new MalformedClassObject.MalformedPrimitiveFunction() + + val e1 = intercept[SparkException] { + Seq("20").toDF("col").select(udf(f1).apply(Column("col"))).collect() + } + assert(e1.getMessage.contains("UDFSuite$MalformedClassObject$MalformedNonPrimitiveFunction")) + + val e2 = intercept[SparkException] { + Seq(20).toDF("col").select(udf(f2).apply(Column("col"))).collect() + } + assert(e2.getMessage.contains("UDFSuite$MalformedClassObject$MalformedPrimitiveFunction")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index ffc2018d2132d..7c126f5e9a0b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import java.time.{LocalDateTime, ZoneOffset} import java.util.Arrays import org.apache.spark.rdd.RDD @@ -103,6 +104,24 @@ private[spark] class ExampleSubTypeUDT extends UserDefinedType[IExampleSubType] override def userClass: Class[IExampleSubType] = classOf[IExampleSubType] } +private[sql] case class FooWithDate(date: LocalDateTime, s: String, i: Int) + +private[sql] class LocalDateTimeUDT extends UserDefinedType[LocalDateTime] { + override def sqlType: DataType = LongType + + override def serialize(obj: LocalDateTime): Long = { + obj.toEpochSecond(ZoneOffset.UTC) + } + + def deserialize(datum: Any): LocalDateTime = datum match { + case value: Long => LocalDateTime.ofEpochSecond(value, 0, ZoneOffset.UTC) + } + + override def userClass: Class[LocalDateTime] = classOf[LocalDateTime] + + private[spark] override def asNullable: LocalDateTimeUDT = this +} + class UserDefinedTypeSuite extends QueryTest with SharedSparkSession with ParquetTest with ExpressionEvalHelper { import testImplicits._ @@ -115,6 +134,24 @@ class UserDefinedTypeSuite extends QueryTest with SharedSparkSession with Parque MyLabeledPoint(1.0, new TestUDT.MyDenseVector(Array(0.1, 1.0))), MyLabeledPoint(0.0, new TestUDT.MyDenseVector(Array(0.3, 3.0)))).toDF() + + test("SPARK-32090: equal") { + val udt1 = new ExampleBaseTypeUDT + val udt2 = new ExampleSubTypeUDT + val udt3 = new ExampleSubTypeUDT + assert(udt1 !== udt2) + assert(udt2 !== udt1) + assert(udt2 === udt3) + assert(udt3 === udt2) + } + 
+ test("SPARK-32090: acceptsType") { + val udt1 = new ExampleBaseTypeUDT + val udt2 = new ExampleSubTypeUDT + assert(udt1.acceptsType(udt2)) + assert(!udt2.acceptsType(udt1)) + } + test("register user type: MyDenseVector for MyLabeledPoint") { val labels: RDD[Double] = pointsRDD.select('label).rdd.map { case Row(v: Double) => v } val labelsArrays: Array[Double] = labels.collect() @@ -131,12 +168,14 @@ class UserDefinedTypeSuite extends QueryTest with SharedSparkSession with Parque } test("UDTs and UDFs") { - spark.udf.register("testType", - (d: TestUDT.MyDenseVector) => d.isInstanceOf[TestUDT.MyDenseVector]) - pointsRDD.createOrReplaceTempView("points") - checkAnswer( - sql("SELECT testType(features) from points"), - Seq(Row(true), Row(true))) + withTempView("points") { + spark.udf.register("testType", + (d: TestUDT.MyDenseVector) => d.isInstanceOf[TestUDT.MyDenseVector]) + pointsRDD.createOrReplaceTempView("points") + checkAnswer( + sql("SELECT testType(features) from points"), + Seq(Row(true), Row(true))) + } } testStandardAndLegacyModes("UDTs with Parquet") { @@ -287,4 +326,22 @@ class UserDefinedTypeSuite extends QueryTest with SharedSparkSession with Parque checkAnswer(spark.createDataFrame(data, schema).selectExpr("typeof(a)"), Seq(Row("array"))) } + + test("SPARK-30993: UserDefinedType matched to fixed length SQL type shouldn't be corrupted") { + def concatFoo(a: FooWithDate, b: FooWithDate): FooWithDate = { + FooWithDate(b.date, a.s + b.s, a.i) + } + + UDTRegistration.register(classOf[LocalDateTime].getName, classOf[LocalDateTimeUDT].getName) + + // remove sub-millisecond part as we only use millis based timestamp while serde + val date = LocalDateTime.ofEpochSecond(LocalDateTime.now().toEpochSecond(ZoneOffset.UTC), + 0, ZoneOffset.UTC) + val inputDS = List(FooWithDate(date, "Foo", 1), FooWithDate(date, "Foo", 3), + FooWithDate(date, "Foo", 3)).toDS() + val agg = inputDS.groupByKey(x => x.i).mapGroups((_, iter) => iter.reduce(concatFoo)) + val result = agg.collect() + + assert(result.toSet === Set(FooWithDate(date, "FooFoo", 3), FooWithDate(date, "Foo", 1))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala new file mode 100644 index 0000000000000..524fc8a9b0dea --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.api.python + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} + +class PythonSQLUtilsSuite extends SparkFunSuite { + + test("listing sql configurations contains runtime ones only") { + val configs = PythonSQLUtils.listRuntimeSQLConfigs() + + // static sql configurations + assert(!configs.exists(entry => entry._1 == StaticSQLConf.SPARK_SESSION_EXTENSIONS.key), + "listSQLConfigs should contain public static sql configuration") + assert(!configs.exists(entry => entry._1 == StaticSQLConf.DEBUG_MODE.key), + "listSQLConfigs should not contain internal static sql configuration") + + // dynamic sql configurations + assert(configs.exists(entry => entry._1 == SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key), + "listSQLConfigs should contain public dynamic sql configuration") + assert(!configs.exists(entry => entry._1 == SQLConf.ANALYZER_MAX_ITERATIONS.key), + "listSQLConfigs should not contain internal dynamic sql configuration") + + // spark core configurations + assert(!configs.exists(entry => entry._1 == "spark.master"), + "listSQLConfigs should not contain core configuration") + } + + test("listing static sql configurations contains public static ones only") { + val configs = PythonSQLUtils.listStaticSQLConfigs() + + // static sql configurations + assert(configs.exists(entry => entry._1 == StaticSQLConf.SPARK_SESSION_EXTENSIONS.key), + "listStaticSQLConfigs should contain public static sql configuration") + assert(!configs.exists(entry => entry._1 == StaticSQLConf.DEBUG_MODE.key), + "listStaticSQLConfigs should not contain internal static sql configuration") + + // dynamic sql configurations + assert(!configs.exists(entry => entry._1 == SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key), + "listStaticSQLConfigs should not contain dynamic sql configuration") + assert(!configs.exists(entry => entry._1 == SQLConf.ANALYZER_MAX_ITERATIONS.key), + "listStaticSQLConfigs should not contain internal dynamic sql configuration") + + // spark core configurations + assert(!configs.exists(entry => entry._1 == "spark.master"), + "listStaticSQLConfigs should not contain core configuration") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala index 3cdac59c20fc9..436862920f53f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala @@ -67,9 +67,10 @@ trait AlterTableTests extends SharedSparkSession { assert(exc.getMessage.contains("Unsupported table change")) assert(exc.getMessage.contains("Cannot drop all fields")) // from the implementation - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType().add("id", IntegerType)) } } @@ -80,9 +81,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN data string") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType().add("id", IntegerType).add("data", StringType)) } } @@ -93,9 +95,10 @@ trait AlterTableTests extends 
SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN data string NOT NULL") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === StructType(Seq( StructField("id", IntegerType), StructField("data", StringType, nullable = false)))) @@ -108,9 +111,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN data string COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === StructType(Seq( StructField("id", IntegerType), StructField("data", StringType).withComment("doc")))) @@ -136,12 +140,13 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (point struct) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN a string FIRST") - assert(getTableMetadata(t).schema == new StructType() + val tableName = fullTableName(t) + assert(getTableMetadata(tableName).schema == new StructType() .add("a", StringType) .add("point", new StructType().add("x", IntegerType))) sql(s"ALTER TABLE $t ADD COLUMN b string AFTER point") - assert(getTableMetadata(t).schema == new StructType() + assert(getTableMetadata(tableName).schema == new StructType() .add("a", StringType) .add("point", new StructType().add("x", IntegerType)) .add("b", StringType)) @@ -151,7 +156,7 @@ trait AlterTableTests extends SharedSparkSession { assert(e1.getMessage().contains("Couldn't find the reference column")) sql(s"ALTER TABLE $t ADD COLUMN point.y int FIRST") - assert(getTableMetadata(t).schema == new StructType() + assert(getTableMetadata(tableName).schema == new StructType() .add("a", StringType) .add("point", new StructType() .add("y", IntegerType) @@ -159,7 +164,7 @@ trait AlterTableTests extends SharedSparkSession { .add("b", StringType)) sql(s"ALTER TABLE $t ADD COLUMN point.z int AFTER x") - assert(getTableMetadata(t).schema == new StructType() + assert(getTableMetadata(tableName).schema == new StructType() .add("a", StringType) .add("point", new StructType() .add("y", IntegerType) @@ -173,15 +178,53 @@ trait AlterTableTests extends SharedSparkSession { } } + test("SPARK-30814: add column with position referencing new columns being added") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (a string, b int, point struct) USING $v2Format") + sql(s"ALTER TABLE $t ADD COLUMNS (x int AFTER a, y int AFTER x, z int AFTER y)") + + val tableName = fullTableName(t) + assert(getTableMetadata(tableName).schema === new StructType() + .add("a", StringType) + .add("x", IntegerType) + .add("y", IntegerType) + .add("z", IntegerType) + .add("b", IntegerType) + .add("point", new StructType() + .add("x", DoubleType) + .add("y", DoubleType))) + + sql(s"ALTER TABLE $t ADD COLUMNS (point.z double AFTER x, point.zz double AFTER z)") + assert(getTableMetadata(tableName).schema === new StructType() + .add("a", StringType) + .add("x", IntegerType) + .add("y", IntegerType) + .add("z", IntegerType) + .add("b", IntegerType) + .add("point", new StructType() + .add("x", DoubleType) + .add("z", DoubleType) + .add("zz", DoubleType) + .add("y", DoubleType))) + + // The new column being referenced should come before being referenced. 
+ val e = intercept[AnalysisException]( + sql(s"ALTER TABLE $t ADD COLUMNS (yy int AFTER xx, xx int)")) + assert(e.getMessage().contains("Couldn't find the reference column for AFTER xx at root")) + } + } + test("AlterTable: add multiple columns") { val t = s"${catalogAndNamespace}table_name" withTable(t) { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMNS data string COMMENT 'doc', ts timestamp") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === StructType(Seq( StructField("id", IntegerType), StructField("data", StringType).withComment("doc"), @@ -195,9 +238,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, point struct) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN point.z double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", StructType(Seq( @@ -214,9 +258,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN points.key.z double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StructType(Seq( @@ -233,9 +278,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN points.value.z double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StringType, StructType(Seq( @@ -251,9 +297,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array>) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN points.element.z double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(StructType(Seq( @@ -269,9 +316,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN points array>") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(StructType(Seq( @@ -286,9 +334,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array>) USING $v2Format") sql(s"ALTER TABLE $t ADD COLUMN points.element.z double COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(StructType(Seq( 
@@ -342,8 +391,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN id TYPE bigint") - val table = getTableMetadata(t) - assert(table.name === fullTableName(t)) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) + + assert(table.name === tableName) assert(table.schema === new StructType().add("id", LongType)) } } @@ -363,13 +414,14 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id bigint NOT NULL) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN id SET NOT NULL") - val table = getTableMetadata(t) - assert(table.name === fullTableName(t)) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) + assert(table.name === tableName) assert(table.schema === new StructType().add("id", LongType, nullable = false)) sql(s"ALTER TABLE $t ALTER COLUMN id DROP NOT NULL") - val table2 = getTableMetadata(t) - assert(table2.name === fullTableName(t)) + val table2 = getTableMetadata(tableName) + assert(table2.name === tableName) assert(table2.schema === new StructType().add("id", LongType)) val e = intercept[AnalysisException] { @@ -385,8 +437,9 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, point struct) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN point.x TYPE double") - val table = getTableMetadata(t) - assert(table.name === fullTableName(t)) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", StructType(Seq( @@ -407,9 +460,10 @@ trait AlterTableTests extends SharedSparkSession { assert(exc.getMessage.contains("point")) assert(exc.getMessage.contains("update a struct by updating its fields")) - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", StructType(Seq( @@ -429,9 +483,10 @@ trait AlterTableTests extends SharedSparkSession { assert(exc.getMessage.contains("update the element by updating points.element")) - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(IntegerType))) @@ -444,9 +499,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.element TYPE long") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(LongType))) @@ -464,9 +520,10 @@ trait AlterTableTests extends SharedSparkSession { assert(exc.getMessage.contains("update a map by updating m.key or m.value")) - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("m", MapType(StringType, IntegerType))) @@ -479,9 +536,10 @@ trait 
AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, m map) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN m.value TYPE long") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("m", MapType(StringType, LongType))) @@ -495,9 +553,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.key.x TYPE double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StructType(Seq( @@ -513,9 +572,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.value.x TYPE double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StringType, StructType(Seq( @@ -530,9 +590,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array>) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.element.x TYPE double") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(StructType(Seq( @@ -589,9 +650,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN id COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === StructType(Seq(StructField("id", IntegerType).withComment("doc")))) } } @@ -602,7 +664,8 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (a int, b int, point struct) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN b FIRST") - assert(getTableMetadata(t).schema == new StructType() + val tableName = fullTableName(t) + assert(getTableMetadata(tableName).schema == new StructType() .add("b", IntegerType) .add("a", IntegerType) .add("point", new StructType() @@ -611,7 +674,7 @@ trait AlterTableTests extends SharedSparkSession { .add("z", IntegerType))) sql(s"ALTER TABLE $t ALTER COLUMN b AFTER point") - assert(getTableMetadata(t).schema == new StructType() + assert(getTableMetadata(tableName).schema == new StructType() .add("a", IntegerType) .add("point", new StructType() .add("x", IntegerType) @@ -624,7 +687,7 @@ trait AlterTableTests extends SharedSparkSession { assert(e1.getMessage.contains("Couldn't resolve positional argument")) sql(s"ALTER TABLE $t ALTER COLUMN point.y FIRST") - assert(getTableMetadata(t).schema == new StructType() + assert(getTableMetadata(tableName).schema == new StructType() .add("a", IntegerType) .add("point", new StructType() .add("y", IntegerType) @@ -633,7 +696,7 @@ trait AlterTableTests extends SharedSparkSession { .add("b", IntegerType)) 
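// A hypothetical, stand-alone sketch of the positional ALTER COLUMN syntax exercised in this
// test: FIRST moves a column to the front of its table or struct, AFTER <col> places it right
// after the named sibling. Table, namespace and format names below are illustrative, and
// `spark` is assumed to be an active SparkSession whose `testcat` catalog accepts v2 ALTER TABLE.
spark.sql("CREATE TABLE testcat.ns.positions (a int, b int, point struct<x: int, y: int>) USING foo")
spark.sql("ALTER TABLE testcat.ns.positions ALTER COLUMN b FIRST")        // columns: b, a, point
spark.sql("ALTER TABLE testcat.ns.positions ALTER COLUMN point.y FIRST")  // point becomes struct<y: int, x: int>
spark.sql("ALTER TABLE testcat.ns.positions ALTER COLUMN b AFTER point")  // columns: a, point, b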
sql(s"ALTER TABLE $t ALTER COLUMN point.y AFTER z") - assert(getTableMetadata(t).schema == new StructType() + assert(getTableMetadata(tableName).schema == new StructType() .add("a", IntegerType) .add("point", new StructType() .add("x", IntegerType) @@ -651,28 +714,16 @@ trait AlterTableTests extends SharedSparkSession { } } - test("AlterTable: update column type and comment") { - val t = s"${catalogAndNamespace}table_name" - withTable(t) { - sql(s"CREATE TABLE $t (id int) USING $v2Format") - sql(s"ALTER TABLE $t ALTER COLUMN id TYPE bigint COMMENT 'doc'") - - val table = getTableMetadata(t) - - assert(table.name === fullTableName(t)) - assert(table.schema === StructType(Seq(StructField("id", LongType).withComment("doc")))) - } - } - test("AlterTable: update nested column comment") { val t = s"${catalogAndNamespace}table_name" withTable(t) { sql(s"CREATE TABLE $t (id int, point struct) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN point.y COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", StructType(Seq( @@ -688,9 +739,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.key.y COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StructType(Seq( @@ -706,9 +758,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.value.y COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StringType, StructType(Seq( @@ -723,9 +776,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array>) USING $v2Format") sql(s"ALTER TABLE $t ALTER COLUMN points.element.y COMMENT 'doc'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(StructType(Seq( @@ -768,9 +822,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t RENAME COLUMN id TO user_id") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType().add("user_id", IntegerType)) } } @@ -781,9 +836,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, point struct) USING $v2Format") sql(s"ALTER TABLE $t RENAME COLUMN point.y TO t") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", 
StructType(Seq( @@ -799,9 +855,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t RENAME COLUMN point.key.y TO t") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", MapType(StructType(Seq( @@ -817,9 +874,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t RENAME COLUMN points.value.y TO t") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StringType, StructType(Seq( @@ -834,9 +892,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array>) USING $v2Format") sql(s"ALTER TABLE $t RENAME COLUMN points.element.y TO t") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", ArrayType(StructType(Seq( @@ -910,9 +969,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, data string) USING $v2Format") sql(s"ALTER TABLE $t DROP COLUMN data") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType().add("id", IntegerType)) } } @@ -924,9 +984,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t DROP COLUMN point.t") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", StructType(Seq( @@ -942,9 +1003,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t DROP COLUMN point.key.y") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("point", MapType(StructType(Seq( @@ -959,9 +1021,10 @@ trait AlterTableTests extends SharedSparkSession { s"USING $v2Format") sql(s"ALTER TABLE $t DROP COLUMN points.value.y") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", IntegerType) .add("points", MapType(StringType, StructType(Seq( @@ -975,9 +1038,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int, points array>) USING $v2Format") sql(s"ALTER TABLE $t DROP COLUMN points.element.y") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.schema === new StructType() .add("id", 
IntegerType) .add("points", ArrayType(StructType(Seq( @@ -1019,9 +1083,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t SET LOCATION 's3://bucket/path'") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.properties === withDefaultOwnership(Map("provider" -> v2Format, "location" -> "s3://bucket/path")).asJava) } @@ -1046,9 +1111,10 @@ trait AlterTableTests extends SharedSparkSession { sql(s"CREATE TABLE $t (id int) USING $v2Format") sql(s"ALTER TABLE $t SET TBLPROPERTIES ('test'='34')") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.properties === withDefaultOwnership(Map("provider" -> v2Format, "test" -> "34")).asJava) } @@ -1059,17 +1125,18 @@ trait AlterTableTests extends SharedSparkSession { withTable(t) { sql(s"CREATE TABLE $t (id int) USING $v2Format TBLPROPERTIES('test' = '34')") - val table = getTableMetadata(t) + val tableName = fullTableName(t) + val table = getTableMetadata(tableName) - assert(table.name === fullTableName(t)) + assert(table.name === tableName) assert(table.properties === withDefaultOwnership(Map("provider" -> v2Format, "test" -> "34")).asJava) sql(s"ALTER TABLE $t UNSET TBLPROPERTIES ('test')") - val updated = getTableMetadata(t) + val updated = getTableMetadata(tableName) - assert(updated.name === fullTableName(t)) + assert(updated.name === tableName) assert(updated.properties === withDefaultOwnership(Map("provider" -> v2Format)).asJava) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala index 4c67888cbdc48..6b25d7c61663c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala @@ -85,7 +85,7 @@ class DataSourceV2DataFrameSessionCatalogSuite withTable(t1) { spark.range(20).write.format(v2Format).option("path", "abc").saveAsTable(t1) val cat = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog] - val tableInfo = cat.loadTable(Identifier.of(Array.empty, t1)) + val tableInfo = cat.loadTable(Identifier.of(Array("default"), t1)) assert(tableInfo.properties().get("location") === "abc") assert(tableInfo.properties().get("provider") === v2Format) } @@ -101,9 +101,15 @@ class InMemoryTableSessionCatalog extends TestV2SessionCatalogBase[InMemoryTable new InMemoryTable(name, schema, partitions, properties) } + override def loadTable(ident: Identifier): Table = { + val identToUse = Option(InMemoryTableSessionCatalog.customIdentifierResolution) + .map(_(ident)) + .getOrElse(ident) + super.loadTable(identToUse) + } + override def alterTable(ident: Identifier, changes: TableChange*): Table = { - val fullIdent = fullIdentifier(ident) - Option(tables.get(fullIdent)) match { + Option(tables.get(ident)) match { case Some(table) => val properties = CatalogV2Util.applyPropertiesChanges(table.properties, changes) val schema = CatalogV2Util.applySchemaChanges(table.schema, changes) @@ -116,7 +122,7 @@ class InMemoryTableSessionCatalog extends 
TestV2SessionCatalogBase[InMemoryTable val newTable = new InMemoryTable(table.name, schema, table.partitioning, properties) .withData(table.data) - tables.put(fullIdent, newTable) + tables.put(ident, newTable) newTable case _ => @@ -125,6 +131,21 @@ class InMemoryTableSessionCatalog extends TestV2SessionCatalogBase[InMemoryTable } } +object InMemoryTableSessionCatalog { + private var customIdentifierResolution: Identifier => Identifier = _ + + def withCustomIdentifierResolver( + resolver: Identifier => Identifier)( + f: => Unit): Unit = { + try { + customIdentifierResolution = resolver + f + } finally { + customIdentifierResolution = null + } + } +} + private [connector] trait SessionCatalogTest[T <: Table, Catalog <: TestV2SessionCatalogBase[T]] extends QueryTest with SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala index 0a6897b829994..7c7afa9cfbd41 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala @@ -141,7 +141,7 @@ class DataSourceV2DataFrameSuite override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { plan = qe.analyzed } - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} } try { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala index 27725bcadbcd5..cf00b3b5e4410 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.{DataFrame, SaveMode} +import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog} class DataSourceV2SQLSessionCatalogSuite @@ -47,6 +47,36 @@ class DataSourceV2SQLSessionCatalogSuite val v2Catalog = spark.sessionState.catalogManager.currentCatalog val nameParts = spark.sessionState.sqlParser.parseMultipartIdentifier(tableName) v2Catalog.asInstanceOf[TableCatalog] - .loadTable(Identifier.of(Array.empty, nameParts.last)) + .loadTable(Identifier.of(nameParts.init.toArray, nameParts.last)) + } + + test("SPARK-30697: catalog.isView doesn't throw an error for specialized identifiers") { + val t1 = "tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") + + def idResolver(id: Identifier): Identifier = Identifier.of(Array("default"), id.name()) + + InMemoryTableSessionCatalog.withCustomIdentifierResolver(idResolver) { + // The following should not throw AnalysisException. 
+ sql(s"DESCRIBE TABLE ignored.$t1") + } + } + } + + test("SPARK-31624: SHOW TBLPROPERTIES working with V2 tables and the session catalog") { + val t1 = "tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format TBLPROPERTIES " + + "(key='v', key2='v2')") + + checkAnswer(sql(s"SHOW TBLPROPERTIES $t1"), Seq(Row("key", "v"), Row("key2", "v2"))) + + checkAnswer(sql(s"SHOW TBLPROPERTIES $t1('key')"), Row("key", "v")) + + checkAnswer( + sql(s"SHOW TBLPROPERTIES $t1('keyX')"), + Row("keyX", s"Table default.$t1 does not have property: keyX")) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 2c8349a0e6a75..df71a859899b7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.connector +import java.sql.Timestamp +import java.time.LocalDate + import scala.collection.JavaConverters._ import org.apache.spark.SparkException @@ -27,7 +30,7 @@ import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} -import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION +import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode, V2_SESSION_CATALOG_IMPLEMENTATION} import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.SimpleScanSource import org.apache.spark.sql.types.{BooleanType, LongType, StringType, StructField, StructType} @@ -172,7 +175,7 @@ class DataSourceV2SQLSuite spark.sql(s"CREATE TABLE table_name (id bigint, data string) USING $v2Source") val testCatalog = catalog(SESSION_CATALOG_NAME).asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + val table = testCatalog.loadTable(Identifier.of(Array("default"), "table_name")) assert(table.name == "default.table_name") assert(table.partitioning.isEmpty) @@ -256,6 +259,24 @@ class DataSourceV2SQLSuite checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) } + // TODO: ignored by SPARK-31707, restore the test after create table syntax unification + ignore("CreateTable: without USING clause") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + val testCatalog = catalog("testcat").asTableCatalog + + sql("CREATE TABLE testcat.t1 (id int)") + val t1 = testCatalog.loadTable(Identifier.of(Array(), "t1")) + // Spark shouldn't set the default provider for catalog plugins. + assert(!t1.properties.containsKey(TableCatalog.PROP_PROVIDER)) + + sql("CREATE TABLE t2 (id int)") + val t2 = spark.sessionState.catalogManager.v2SessionCatalog.asTableCatalog + .loadTable(Identifier.of(Array("default"), "t2")).asInstanceOf[V1Table] + // Spark should set the default provider as DEFAULT_DATA_SOURCE_NAME for the session catalog. 
+ assert(t2.v1Table.provider == Some(conf.defaultDataSourceName)) + } + test("CreateTable/RepalceTable: invalid schema if has interval type") { Seq("CREATE", "REPLACE").foreach { action => val e1 = intercept[AnalysisException]( @@ -302,6 +323,37 @@ class DataSourceV2SQLSuite } } + test("CreateTableAsSelect: do not double execute on collect(), take() and other queries") { + val basicCatalog = catalog("testcat").asTableCatalog + val atomicCatalog = catalog("testcat_atomic").asTableCatalog + val basicIdentifier = "testcat.table_name" + val atomicIdentifier = "testcat_atomic.table_name" + + Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { + case (catalog, identifier) => + val df = spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT id, data FROM source") + + df.collect() + df.take(5) + df.tail(5) + df.where("true").collect() + df.where("true").take(5) + df.where("true").tail(5) + + val table = catalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == identifier) + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) + } + } + test("ReplaceTableAsSelect: basic v2 implementation.") { val basicCatalog = catalog("testcat").asTableCatalog val atomicCatalog = catalog("testcat_atomic").asTableCatalog @@ -329,6 +381,43 @@ class DataSourceV2SQLSuite } } + Seq("REPLACE", "CREATE OR REPLACE").foreach { cmd => + test(s"ReplaceTableAsSelect: do not double execute $cmd on collect()") { + val basicCatalog = catalog("testcat").asTableCatalog + val atomicCatalog = catalog("testcat_atomic").asTableCatalog + val basicIdentifier = "testcat.table_name" + val atomicIdentifier = "testcat_atomic.table_name" + + Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { + case (catalog, identifier) => + spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT id, data FROM source") + val originalTable = catalog.loadTable(Identifier.of(Array(), "table_name")) + + val df = spark.sql(s"$cmd TABLE $identifier USING foo AS SELECT id FROM source") + + df.collect() + df.take(5) + df.tail(5) + df.where("true").collect() + df.where("true").take(5) + df.where("true").tail(5) + + val replacedTable = catalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(replacedTable != originalTable, "Table should have been replaced.") + assert(replacedTable.name == identifier) + assert(replacedTable.partitioning.isEmpty) + assert(replacedTable.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(replacedTable.schema == new StructType().add("id", LongType)) + + val rdd = spark.sparkContext.parallelize(replacedTable.asInstanceOf[InMemoryTable].rows) + checkAnswer( + spark.internalCreateDataFrame(rdd, replacedTable.schema), + spark.table("source").select("id")) + } + } + } + test("ReplaceTableAsSelect: Non-atomic catalog drops the table if the write fails.") { spark.sql("CREATE TABLE testcat.table_name USING foo AS SELECT id, data FROM source") val testCatalog = catalog("testcat").asTableCatalog @@ -453,7 +542,7 @@ class DataSourceV2SQLSuite spark.sql(s"CREATE TABLE table_name USING $v2Source AS SELECT id, data FROM source") val testCatalog = catalog(SESSION_CATALOG_NAME).asTableCatalog - val table = 
testCatalog.loadTable(Identifier.of(Array(), "table_name")) + val table = testCatalog.loadTable(Identifier.of(Array("default"), "table_name")) assert(table.name == "default.table_name") assert(table.partitioning.isEmpty) @@ -565,7 +654,7 @@ class DataSourceV2SQLSuite // The fact that the following line doesn't throw an exception means, the session catalog // can load the table. val t = catalog(SESSION_CATALOG_NAME).asTableCatalog - .loadTable(Identifier.of(Array.empty, "table_name")) + .loadTable(Identifier.of(Array("default"), "table_name")) assert(t.isInstanceOf[V1Table], "V1 table wasn't returned as an unresolved table") } @@ -595,6 +684,24 @@ class DataSourceV2SQLSuite } } + // TODO: ignored by SPARK-31707, restore the test after create table syntax unification + ignore("CreateTableAsSelect: without USING clause") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + val testCatalog = catalog("testcat").asTableCatalog + + sql("CREATE TABLE testcat.t1 AS SELECT 1 i") + val t1 = testCatalog.loadTable(Identifier.of(Array(), "t1")) + // Spark shouldn't set the default provider for catalog plugins. + assert(!t1.properties.containsKey(TableCatalog.PROP_PROVIDER)) + + sql("CREATE TABLE t2 AS SELECT 1 i") + val t2 = spark.sessionState.catalogManager.v2SessionCatalog.asTableCatalog + .loadTable(Identifier.of(Array("default"), "t2")).asInstanceOf[V1Table] + // Spark should set the default provider as DEFAULT_DATA_SOURCE_NAME for the session catalog. + assert(t2.v1Table.provider == Some(conf.defaultDataSourceName)) + } + test("DropTable: basic") { val tableName = "testcat.ns1.ns2.tbl" val ident = Identifier.of(Array("ns1", "ns2"), "tbl") @@ -605,10 +712,10 @@ class DataSourceV2SQLSuite } test("DropTable: table qualified with the session catalog name") { - val ident = Identifier.of(Array(), "tbl") + val ident = Identifier.of(Array("default"), "tbl") sql("CREATE TABLE tbl USING json AS SELECT 1 AS i") assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === true) - sql("DROP TABLE spark_catalog.tbl") + sql("DROP TABLE spark_catalog.default.tbl") assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === false) } @@ -679,6 +786,59 @@ class DataSourceV2SQLSuite } } + test("qualified column names for v2 tables") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, point struct<x: bigint, y: bigint>) USING foo") + sql(s"INSERT INTO $t VALUES (1, (10, 20))") + + def check(tbl: String): Unit = { + checkAnswer( + sql(s"SELECT testcat.ns1.ns2.tbl.id, testcat.ns1.ns2.tbl.point.x FROM $tbl"), + Row(1, 10)) + checkAnswer(sql(s"SELECT ns1.ns2.tbl.id, ns1.ns2.tbl.point.x FROM $tbl"), Row(1, 10)) + checkAnswer(sql(s"SELECT ns2.tbl.id, ns2.tbl.point.x FROM $tbl"), Row(1, 10)) + checkAnswer(sql(s"SELECT tbl.id, tbl.point.x FROM $tbl"), Row(1, 10)) + } + + // Test with qualified table name "testcat.ns1.ns2.tbl". + check(t) + + // Test if current catalog and namespace is respected in column resolution. + sql("USE testcat.ns1.ns2") + check("tbl") + + val ex = intercept[AnalysisException] { + sql(s"SELECT ns1.ns2.ns3.tbl.id from $t") + } + assert(ex.getMessage.contains("cannot resolve '`ns1.ns2.ns3.tbl.id`")) + } + } + + test("qualified column names for v1 tables") { + Seq(true, false).foreach { useV1Table => + val format = if (useV1Table) "json" else v2Format + if (useV1Table) { + // unset this config to use the default v2 session catalog.
+ spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + } else { + spark.conf.set( + V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableSessionCatalog].getName) + } + + withTable("t") { + sql(s"CREATE TABLE t USING $format AS SELECT 1 AS i") + checkAnswer(sql("select i from t"), Row(1)) + checkAnswer(sql("select t.i from t"), Row(1)) + checkAnswer(sql("select default.t.i from t"), Row(1)) + checkAnswer(sql("select spark_catalog.default.t.i from t"), Row(1)) + checkAnswer(sql("select t.i from spark_catalog.default.t"), Row(1)) + checkAnswer(sql("select default.t.i from spark_catalog.default.t"), Row(1)) + checkAnswer(sql("select spark_catalog.default.t.i from spark_catalog.default.t"), Row(1)) + } + } + } + test("InsertInto: append - across catalog") { val t1 = "testcat.ns1.ns2.tbl" val t2 = "testcat2.db.tbl" @@ -752,6 +912,23 @@ class DataSourceV2SQLSuite assert(exception.getMessage.contains("The database name is not valid: a.b")) } + test("ShowViews: using v1 catalog, db name with multipartIdentifier ('a.b') is not allowed.") { + val exception = intercept[AnalysisException] { + sql("SHOW VIEWS FROM a.b") + } + + assert(exception.getMessage.contains("The database name is not valid: a.b")) + } + + test("ShowViews: using v2 catalog, command not supported.") { + val exception = intercept[AnalysisException] { + sql("SHOW VIEWS FROM testcat") + } + + assert(exception.getMessage.contains("Catalog testcat doesn't support SHOW VIEWS," + + " only SessionCatalog supports this command.")) + } + test("ShowTables: using v2 catalog with empty namespace") { spark.sql("CREATE TABLE testcat.table (id bigint, data string) USING foo") runShowTablesSql("SHOW TABLES FROM testcat", Seq(Row("", "table"))) @@ -1340,7 +1517,12 @@ class DataSourceV2SQLSuite val sessionCatalog = catalog(SESSION_CATALOG_NAME).asTableCatalog def checkPartitioning(cat: TableCatalog, partition: String): Unit = { - val table = cat.loadTable(Identifier.of(Array.empty, "tbl")) + val namespace = if (cat.name == SESSION_CATALOG_NAME) { + Array("default") + } else { + Array[String]() + } + val table = cat.loadTable(Identifier.of(namespace, "tbl")) val partitions = table.partitioning().map(_.references()) assert(partitions.length === 1) val fieldNames = partitions.flatMap(_.map(_.fieldNames())) @@ -1376,48 +1558,48 @@ class DataSourceV2SQLSuite } test("tableCreation: duplicate column names in the table definition") { - val errorMsg = "Found duplicate column(s) in the table definition of t" + val errorMsg = "Found duplicate column(s) in the table definition of" Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { assertAnalysisError( s"CREATE TABLE t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg + s"$errorMsg default.t" ) assertAnalysisError( s"CREATE TABLE testcat.t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg + s"$errorMsg t" ) assertAnalysisError( s"CREATE OR REPLACE TABLE t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg + s"$errorMsg default.t" ) assertAnalysisError( s"CREATE OR REPLACE TABLE testcat.t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg + s"$errorMsg t" ) } } } test("tableCreation: duplicate nested column names in the table definition") { - val errorMsg = "Found duplicate column(s) in the table definition of t" + val errorMsg = "Found duplicate column(s) in the table definition of" Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) =>
withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { assertAnalysisError( s"CREATE TABLE t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg + s"$errorMsg default.t" ) assertAnalysisError( s"CREATE TABLE testcat.t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg + s"$errorMsg t" ) assertAnalysisError( s"CREATE OR REPLACE TABLE t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg + s"$errorMsg default.t" ) assertAnalysisError( s"CREATE OR REPLACE TABLE testcat.t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg + s"$errorMsg t" ) } } @@ -1451,7 +1633,6 @@ class DataSourceV2SQLSuite """ |CREATE TABLE testcat.t (id int, `a.b` string) USING foo |CLUSTERED BY (`a.b`) INTO 4 BUCKETS - |OPTIONS ('allow-unsupported-transforms'=true) """.stripMargin) val testCatalog = catalog("testcat").asTableCatalog.asInstanceOf[InMemoryTableCatalog] @@ -1530,6 +1711,20 @@ class DataSourceV2SQLSuite } } + test("SPARK-33435: REFRESH TABLE should invalidate all caches referencing the table") { + val tblName = "testcat.ns.t" + withTable(tblName) { + withTempView("t") { + sql(s"CREATE TABLE $tblName (id bigint) USING foo") + sql(s"CACHE TABLE t AS SELECT id FROM $tblName") + + assert(spark.sharedState.cacheManager.lookupCachedData(spark.table("t")).isDefined) + sql(s"REFRESH TABLE $tblName") + assert(spark.sharedState.cacheManager.lookupCachedData(spark.table("t")).isEmpty) + } + } + } + test("REPLACE TABLE: v1 table") { val e = intercept[AnalysisException] { sql(s"CREATE OR REPLACE TABLE tbl (a int) USING ${classOf[SimpleScanSource].getName}") @@ -1748,7 +1943,7 @@ class DataSourceV2SQLSuite withTable(t) { spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") testV1Command("ANALYZE TABLE", s"$t COMPUTE STATISTICS") - testV1Command("ANALYZE TABLE", s"$t COMPUTE STATISTICS FOR ALL COLUMNS") + testV1CommandSupportingTempView("ANALYZE TABLE", s"$t COMPUTE STATISTICS FOR ALL COLUMNS") } } @@ -1812,7 +2007,7 @@ class DataSourceV2SQLSuite val t = "testcat.ns1.ns2.tbl" withTable(t) { spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - testV1Command("SHOW CREATE TABLE", t) + testV1CommandSupportingTempView("SHOW CREATE TABLE", t) } } @@ -1821,12 +2016,12 @@ class DataSourceV2SQLSuite withTable(t) { spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - testV1Command("CACHE TABLE", t) + testV1CommandSupportingTempView("CACHE TABLE", t) val e = intercept[AnalysisException] { sql(s"CACHE LAZY TABLE $t") } - assert(e.message.contains("CACHE TABLE is only supported with v1 tables")) + assert(e.message.contains("CACHE TABLE is only supported with temp views or v1 tables")) } } @@ -1835,8 +2030,8 @@ class DataSourceV2SQLSuite withTable(t) { sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - testV1Command("UNCACHE TABLE", t) - testV1Command("UNCACHE TABLE", s"IF EXISTS $t") + testV1CommandSupportingTempView("UNCACHE TABLE", t) + testV1CommandSupportingTempView("UNCACHE TABLE", s"IF EXISTS $t") } } @@ -1845,8 +2040,8 @@ class DataSourceV2SQLSuite withTable(t) { spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - testV1Command("SHOW COLUMNS", s"FROM $t") - testV1Command("SHOW COLUMNS", s"IN $t") + testV1CommandSupportingTempView("SHOW COLUMNS", s"FROM $t") + testV1CommandSupportingTempView("SHOW COLUMNS", s"IN $t") val e3 = intercept[AnalysisException] { sql(s"SHOW COLUMNS FROM tbl IN testcat.ns1.ns2") @@ -1916,7 +2111,7 @@ class DataSourceV2SQLSuite val e = intercept[AnalysisException] { sql(s"ALTER VIEW $v AS SELECT 
1") } - assert(e.message.contains("ALTER VIEW QUERY is only supported with v1 tables")) + assert(e.message.contains("ALTER VIEW QUERY is only supported with temp views or v1 tables")) } test("CREATE VIEW") { @@ -1943,8 +2138,6 @@ class DataSourceV2SQLSuite .add("value", StringType, nullable = false) val expected = Seq( - Row(TableCatalog.PROP_OWNER, defaultUser), - Row("provider", provider), Row("status", status), Row("user", user)) @@ -1994,7 +2187,8 @@ class DataSourceV2SQLSuite val e1 = intercept[AnalysisException] { sql("DESCRIBE FUNCTION default.ns1.ns2.fun") } - assert(e1.message.contains("Unsupported function name 'default.ns1.ns2.fun'")) + assert(e1.message.contains( + "The namespace in session catalog must have exactly one name part: default.ns1.ns2.fun")) } test("SHOW FUNCTIONS not valid v1 namespace") { @@ -2013,9 +2207,10 @@ class DataSourceV2SQLSuite assert(e.message.contains("DROP FUNCTION is only supported in v1 catalog")) val e1 = intercept[AnalysisException] { - sql("DESCRIBE FUNCTION default.ns1.ns2.fun") + sql("DROP FUNCTION default.ns1.ns2.fun") } - assert(e1.message.contains("Unsupported function name 'default.ns1.ns2.fun'")) + assert(e1.message.contains( + "The namespace in session catalog must have exactly one name part: default.ns1.ns2.fun")) } test("CREATE FUNCTION: only support session catalog") { @@ -2027,7 +2222,8 @@ class DataSourceV2SQLSuite val e1 = intercept[AnalysisException] { sql("CREATE FUNCTION default.ns1.ns2.fun as 'f'") } - assert(e1.message.contains("Unsupported function name 'default.ns1.ns2.fun'")) + assert(e1.message.contains( + "The namespace in session catalog must have exactly one name part: default.ns1.ns2.fun")) } test("global temp view should not be masked by v2 catalog") { @@ -2064,7 +2260,8 @@ class DataSourceV2SQLSuite // the session catalog, not the `gloabl_temp` v2 catalog. sql(s"CREATE TABLE $globalTempDB.ns1.ns2.tbl (id bigint, data string) USING json") } - assert(e.message.contains("global_temp.ns1.ns2.tbl is not a valid TableIdentifier")) + assert(e.message.contains( + "The namespace in session catalog must have exactly one name part: global_temp.ns1.ns2.tbl")) } test("table name same as catalog can be used") { @@ -2083,23 +2280,60 @@ class DataSourceV2SQLSuite withTable("t") { sql("CREATE TABLE t USING json AS SELECT 1 AS i") checkAnswer(sql("select * from t"), Row(1)) - checkAnswer(sql("select * from spark_catalog.t"), Row(1)) checkAnswer(sql("select * from spark_catalog.default.t"), Row(1)) } } + test("SPARK-30885: v1 table name should be fully qualified") { + def assertWrongTableIdent(): Unit = { + withTable("t") { + sql("CREATE TABLE t USING json AS SELECT 1 AS i") + + val t = "spark_catalog.t" + def verify(sql: String): Unit = { + val e = intercept[AnalysisException](spark.sql(sql)) + assert(e.message.contains( + s"The namespace in session catalog must have exactly one name part: $t")) + } + + verify(s"select * from $t") + // Verify V1 commands that bypass table lookups. 
+ verify(s"REFRESH TABLE $t") + verify(s"DESCRIBE $t i") + verify(s"DROP TABLE $t") + verify(s"DROP VIEW $t") + verify(s"ANALYZE TABLE $t COMPUTE STATISTICS") + verify(s"ANALYZE TABLE $t COMPUTE STATISTICS FOR ALL COLUMNS") + verify(s"MSCK REPAIR TABLE $t") + verify(s"LOAD DATA INPATH 'filepath' INTO TABLE $t") + verify(s"SHOW CREATE TABLE $t") + verify(s"SHOW CREATE TABLE $t AS SERDE") + verify(s"CACHE TABLE $t") + verify(s"UNCACHE TABLE $t") + verify(s"TRUNCATE TABLE $t") + verify(s"SHOW PARTITIONS $t") + verify(s"SHOW COLUMNS FROM $t") + } + } + + assertWrongTableIdent() + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + assertWrongTableIdent() + } + test("SPARK-30259: session catalog can be specified in CREATE TABLE AS SELECT command") { withTable("tbl") { - val ident = Identifier.of(Array(), "tbl") - sql("CREATE TABLE spark_catalog.tbl USING json AS SELECT 1 AS i") + val ident = Identifier.of(Array("default"), "tbl") + sql("CREATE TABLE spark_catalog.default.tbl USING json AS SELECT 1 AS i") assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === true) } } test("SPARK-30259: session catalog can be specified in CREATE TABLE command") { withTable("tbl") { - val ident = Identifier.of(Array(), "tbl") - sql("CREATE TABLE spark_catalog.tbl (col string) USING json") + val ident = Identifier.of(Array("default"), "tbl") + sql("CREATE TABLE spark_catalog.default.tbl (col string) USING json") assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === true) } } @@ -2108,7 +2342,7 @@ class DataSourceV2SQLSuite // unset this config to use the default v2 session catalog. spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) - withTable("spark_catalog.t", "testcat.ns.t") { + withTable("spark_catalog.default.t", "testcat.ns.t") { sql("CREATE TABLE t USING parquet AS SELECT 1") sql("CREATE TABLE testcat.ns.t USING parquet AS SELECT 2") @@ -2130,17 +2364,18 @@ class DataSourceV2SQLSuite withTempView("t") { spark.range(10).createTempView("t") - withView(s"$sessionCatalogName.v") { + withView(s"$sessionCatalogName.default.v") { val e = intercept[AnalysisException] { - sql(s"CREATE VIEW $sessionCatalogName.v AS SELECT * FROM t") + sql(s"CREATE VIEW $sessionCatalogName.default.v AS SELECT * FROM t") } assert(e.message.contains("referencing a temporary view")) } } withTempView("t") { - withView(s"$sessionCatalogName.v") { - sql(s"CREATE VIEW $sessionCatalogName.v AS SELECT t1.col FROM t t1 JOIN ns1.ns2.t t2") + withView(s"$sessionCatalogName.default.v") { + sql(s"CREATE VIEW $sessionCatalogName.default.v " + + "AS SELECT t1.col FROM t t1 JOIN ns1.ns2.t t2") sql(s"USE $sessionCatalogName") // The view should read data from table `testcat.ns1.ns2.t` not the temp view. 
spark.range(10).createTempView("t") @@ -2214,6 +2449,81 @@ class DataSourceV2SQLSuite .head().getString(1) === expectedComment) } + test("SPARK-30799: temp view name can't contain catalog name") { + val sessionCatalogName = CatalogManager.SESSION_CATALOG_NAME + withTempView("v") { + spark.range(10).createTempView("v") + val e1 = intercept[AnalysisException]( + sql(s"CACHE TABLE $sessionCatalogName.v") + ) + assert(e1.message.contains( + "The namespace in session catalog must have exactly one name part: spark_catalog.v")) + } + val e2 = intercept[AnalysisException] { + sql(s"CREATE TEMP VIEW $sessionCatalogName.v AS SELECT 1") + } + assert(e2.message.contains("It is not allowed to add database prefix")) + } + + test("SPARK-31015: star expression should work for qualified column names for v2 tables") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, name string) USING foo") + sql(s"INSERT INTO $t VALUES (1, 'hello')") + + def check(tbl: String): Unit = { + checkAnswer(sql(s"SELECT testcat.ns1.ns2.tbl.* FROM $tbl"), Row(1, "hello")) + checkAnswer(sql(s"SELECT ns1.ns2.tbl.* FROM $tbl"), Row(1, "hello")) + checkAnswer(sql(s"SELECT ns2.tbl.* FROM $tbl"), Row(1, "hello")) + checkAnswer(sql(s"SELECT tbl.* FROM $tbl"), Row(1, "hello")) + } + + // Test with qualified table name "testcat.ns1.ns2.tbl". + check(t) + + // Test if current catalog and namespace is respected in column resolution. + sql("USE testcat.ns1.ns2") + check("tbl") + + val ex = intercept[AnalysisException] { + sql(s"SELECT ns1.ns2.ns3.tbl.* from $t") + } + assert(ex.getMessage.contains("cannot resolve 'ns1.ns2.ns3.tbl.*")) + } + } + + test("SPARK-32168: INSERT OVERWRITE - hidden days partition - dynamic mode") { + def testTimestamp(daysOffset: Int): Timestamp = { + Timestamp.valueOf(LocalDate.of(2020, 1, 1 + daysOffset).atStartOfDay()) + } + + withSQLConf(PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + val t1 = s"${catalogAndNamespace}tbl" + withTable(t1) { + val df = spark.createDataFrame(Seq( + (testTimestamp(1), "a"), + (testTimestamp(2), "b"), + (testTimestamp(3), "c"))).toDF("ts", "data") + df.createOrReplaceTempView("source_view") + + sql(s"CREATE TABLE $t1 (ts timestamp, data string) " + + s"USING $v2Format PARTITIONED BY (days(ts))") + sql(s"INSERT INTO $t1 VALUES " + + s"(CAST(date_add('2020-01-01', 2) AS timestamp), 'dummy'), " + + s"(CAST(date_add('2020-01-01', 4) AS timestamp), 'keep')") + sql(s"INSERT OVERWRITE TABLE $t1 SELECT ts, data FROM source_view") + + val expected = spark.createDataFrame(Seq( + (testTimestamp(1), "a"), + (testTimestamp(2), "b"), + (testTimestamp(3), "c"), + (testTimestamp(4), "keep"))).toDF("ts", "data") + + verifyTable(t1, expected) + } + } + } + private def testV1Command(sqlCommand: String, sqlParams: String): Unit = { val e = intercept[AnalysisException] { sql(s"$sqlCommand $sqlParams") @@ -2221,6 +2531,13 @@ class DataSourceV2SQLSuite assert(e.message.contains(s"$sqlCommand is only supported with v1 tables")) } + private def testV1CommandSupportingTempView(sqlCommand: String, sqlParams: String): Unit = { + val e = intercept[AnalysisException] { + sql(s"$sqlCommand $sqlParams") + } + assert(e.message.contains(s"$sqlCommand is only supported with temp views or v1 tables")) + } + private def assertAnalysisError(sqlStatement: String, expectedError: String): Unit = { val errMsg = intercept[AnalysisException] { sql(sqlStatement) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala index 2d8761f872da7..c2edcce8ce2ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala @@ -394,6 +394,35 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession with AdaptiveS checkAnswer(df, (0 until 3).map(i => Row(i))) } } + + test("SPARK-32609: DataSourceV2 with different pushedfilters should be different") { + def getScanExec(query: DataFrame): BatchScanExec = { + query.queryExecution.executedPlan.collect { + case d: BatchScanExec => d + }.head + } + + Seq(classOf[AdvancedDataSourceV2], classOf[JavaAdvancedDataSourceV2]).foreach { cls => + withClue(cls.getName) { + val df = spark.read.format(cls.getName).load() + val q1 = df.select('i).filter('i > 6) + val q2 = df.select('i).filter('i > 5) + val scan1 = getScanExec(q1) + val scan2 = getScanExec(q2) + assert(!scan1.equals(scan2)) + } + } + } + + test("SPARK-33267: push down with condition 'in (..., null)' should not throw NPE") { + Seq(classOf[AdvancedDataSourceV2], classOf[JavaAdvancedDataSourceV2]).foreach { cls => + withClue(cls.getName) { + val df = spark.read.format(cls.getName).load() + // before SPARK-33267 below query just threw NPE + df.select('i).where("i in (1, null)").collect() + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala index 01fcced5b12a8..a58bab276a41b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala @@ -37,8 +37,8 @@ class DataSourceV2UtilsSuite extends SparkFunSuite { val source = new DataSourceV2WithSessionConfig val confs = DataSourceV2Utils.extractSessionConfigs(source, conf) assert(confs.size == 2) - assert(confs.keySet.filter(_.startsWith("spark.datasource")).size == 0) - assert(confs.keySet.filter(_.startsWith("not.exist.prefix")).size == 0) + assert(!confs.keySet.exists(_.startsWith("spark.datasource"))) + assert(!confs.keySet.exists(_.startsWith("not.exist.prefix"))) assert(confs.keySet.contains("foo.bar")) assert(confs.keySet.contains("whateverConfigName")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala index b0da2eb697f36..51d734279414a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala @@ -157,13 +157,13 @@ class FileDataSourceV2FallBackSuite extends QueryTest with SharedSparkSession { Seq("parquet", classOf[ParquetDataSourceV2].getCanonicalName).foreach { format => withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> format) { val commands = ArrayBuffer.empty[(String, LogicalPlan)] - val errors = ArrayBuffer.empty[(String, Throwable)] + val exceptions = ArrayBuffer.empty[(String, Exception)] val listener = new QueryExecutionListener { override def onFailure( funcName: String, qe: QueryExecution, - error: Throwable): Unit = { - errors += funcName -> error + exception: Exception): Unit = { + exceptions += funcName -> exception } override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { diff 
--git a/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala index 0fd6cf1b6746c..2cc7a1f994645 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala @@ -446,21 +446,35 @@ trait InsertIntoSQLOnlyTests } } - test("InsertInto: overwrite - multiple static partitions - dynamic mode") { - // Since all partitions are provided statically, this should be supported by everyone - withSQLConf(PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { - val t1 = s"${catalogAndNamespace}tbl" - withTableAndData(t1) { view => - sql(s"CREATE TABLE $t1 (id bigint, data string, p int) " + - s"USING $v2Format PARTITIONED BY (id, p)") - sql(s"INSERT INTO $t1 VALUES (2L, 'dummy', 2), (4L, 'keep', 2)") - sql(s"INSERT OVERWRITE TABLE $t1 PARTITION (id = 2, p = 2) SELECT data FROM $view") - verifyTable(t1, Seq( - (2, "a", 2), - (2, "b", 2), - (2, "c", 2), - (4, "keep", 2)).toDF("id", "data", "p")) - } + dynamicOverwriteTest("InsertInto: overwrite - multiple static partitions - dynamic mode") { + val t1 = s"${catalogAndNamespace}tbl" + withTableAndData(t1) { view => + sql(s"CREATE TABLE $t1 (id bigint, data string, p int) " + + s"USING $v2Format PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t1 VALUES (2L, 'dummy', 2), (4L, 'keep', 2)") + sql(s"INSERT OVERWRITE TABLE $t1 PARTITION (id = 2, p = 2) SELECT data FROM $view") + verifyTable(t1, Seq( + (2, "a", 2), + (2, "b", 2), + (2, "c", 2), + (4, "keep", 2)).toDF("id", "data", "p")) + } + } + + test("do not double insert on INSERT INTO collect()") { + val t1 = s"${catalogAndNamespace}tbl" + withTableAndData(t1) { view => + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") + val df = sql(s"INSERT INTO TABLE $t1 SELECT * FROM $view") + + df.collect() + df.take(5) + df.tail(5) + df.where("true").collect() + df.where("true").take(5) + df.where("true").tail(5) + + verifyTable(t1, spark.table(view)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala index 7bff955b18360..550bec7505422 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala @@ -75,7 +75,12 @@ class SupportsCatalogOptionsSuite extends QueryTest with SharedSparkSession with withCatalogOption.foreach(cName => dfw.option("catalog", cName)) dfw.partitionBy(partitionBy: _*).save() - val table = catalog(withCatalogOption.getOrElse(SESSION_CATALOG_NAME)).loadTable("t1") + val ident = if (withCatalogOption.isEmpty) { + Identifier.of(Array("default"), "t1") + } else { + Identifier.of(Array(), "t1") + } + val table = catalog(withCatalogOption.getOrElse(SESSION_CATALOG_NAME)).loadTable(ident) val namespace = withCatalogOption.getOrElse("default") assert(table.name() === s"$namespace.t1", "Table identifier was wrong") assert(table.partitioning().length === partitionBy.length, "Partitioning did not match") @@ -134,7 +139,7 @@ class SupportsCatalogOptionsSuite extends QueryTest with SharedSparkSession with val dfw = df.write.format(format).mode(SaveMode.Ignore).option("name", "t1") dfw.save() - val table = catalog(SESSION_CATALOG_NAME).loadTable("t1") + val table = 
catalog(SESSION_CATALOG_NAME).loadTable(Identifier.of(Array("default"), "t1")) assert(table.partitioning().isEmpty, "Partitioning should be empty") assert(table.schema() === new StructType().add("id", LongType), "Schema did not match") assert(load("t1", None).count() === 0) @@ -211,7 +216,7 @@ class SupportsCatalogOptionsSuite extends QueryTest with SharedSparkSession with override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { plan = qe.analyzed } - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} } spark.listenerManager.register(listener) @@ -279,7 +284,12 @@ class CatalogSupportingInMemoryTableProvider override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = { val name = options.get("name") assert(name != null, "The name should be provided for this table") - Identifier.of(Array.empty, name) + val namespace = if (options.containsKey("catalog")) { + Array[String]() + } else { + Array("default") + } + Identifier.of(namespace, name) } override def extractCatalog(options: CaseInsensitiveStringMap): String = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala index 3f6ac0b7f8d3c..637cf2fd16515 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala @@ -22,6 +22,7 @@ import java.util.concurrent.ConcurrentHashMap import scala.collection.JavaConverters._ +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.StructType @@ -41,23 +42,14 @@ private[connector] trait TestV2SessionCatalogBase[T <: Table] extends Delegating partitions: Array[Transform], properties: util.Map[String, String]): T - protected def fullIdentifier(ident: Identifier): Identifier = { - if (ident.namespace().isEmpty) { - Identifier.of(Array("default"), ident.name()) - } else { - ident - } - } - override def loadTable(ident: Identifier): Table = { - val fullIdent = fullIdentifier(ident) - if (tables.containsKey(fullIdent)) { - tables.get(fullIdent) + if (tables.containsKey(ident)) { + tables.get(ident) } else { // Table was created through the built-in catalog - val t = super.loadTable(fullIdent) + val t = super.loadTable(ident) val table = newTable(t.name(), t.schema(), t.partitioning(), t.properties()) - tables.put(fullIdent, table) + tables.put(ident, table) table } } @@ -69,13 +61,12 @@ private[connector] trait TestV2SessionCatalogBase[T <: Table] extends Delegating properties: util.Map[String, String]): Table = { val created = super.createTable(ident, schema, partitions, properties) val t = newTable(created.name(), schema, partitions, properties) - val fullIdent = fullIdentifier(ident) - tables.put(fullIdent, t) + tables.put(ident, t) t } override def dropTable(ident: Identifier): Boolean = { - tables.remove(fullIdentifier(ident)) + tables.remove(ident) super.dropTable(ident) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala index 
10ed2048dbf61..4b52a4cbf4116 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala @@ -25,10 +25,14 @@ import scala.collection.mutable import org.scalatest.BeforeAndAfter import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform} import org.apache.spark.sql.connector.write.{LogicalWriteInfo, LogicalWriteInfoImpl, SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder} import org.apache.spark.sql.execution.datasources.DataSourceUtils +import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources._ import org.apache.spark.sql.test.SharedSparkSession @@ -124,6 +128,23 @@ class V1WriteFallbackSuite extends QueryTest with SharedSparkSession with Before } assert(e3.getMessage.contains("schema")) } + + test("fallback writes should only analyze plan once") { + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + try { + val session = SparkSession.builder() + .master("local[1]") + .withExtensions(_.injectPostHocResolutionRule(_ => OnlyOnceRule)) + .config(V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[V1FallbackTableCatalog].getName) + .getOrCreate() + val df = session.createDataFrame(Seq((1, "x"), (2, "y"), (3, "z"))) + df.write.mode("append").option("name", "t1").format(v2Format).saveAsTable("test") + } finally { + SparkSession.setActiveSession(spark) + SparkSession.setDefaultSession(spark) + } + } } class V1WriteFallbackSessionCatalogSuite @@ -318,3 +339,24 @@ class InMemoryTableWithV1Fallback( } } } + +/** A rule that fails if a query plan is analyzed twice. */ +object OnlyOnceRule extends Rule[LogicalPlan] { + private val tag = TreeNodeTag[String]("test") + private val counts = new mutable.HashMap[LogicalPlan, Int]() + + override def apply(plan: LogicalPlan): LogicalPlan = { + if (plan.getTagValue(tag).isEmpty) { + plan.setTagValue(tag, "abc") + plan + } else { + val cnt = counts.getOrElseUpdate(plan, 0) + 1 + // This rule will be run as injectPostHocResolutionRule, and is supposed to be run only twice. 
+ // Once during planning and once during checkBatchIdempotence + assert(cnt <= 1, "This rule shouldn't have been called again") + counts.put(plan, cnt) + plan + } + + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala index 289f9dc427795..dd95ceb59bdc4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala @@ -151,6 +151,17 @@ class V2CommandsCaseSensitivitySuite extends SharedSparkSession with AnalysisTes } } + test("AlterTable: add column resolution - column position referencing new column") { + alterTableTest( + Seq( + TableChange.addColumn( + Array("x"), LongType, true, null, ColumnPosition.after("id")), + TableChange.addColumn( + Array("y"), LongType, true, null, ColumnPosition.after("X"))), + Seq("Couldn't find the reference column for AFTER X at root") + ) + } + test("AlterTable: add column resolution - nested positional") { Seq("X", "Y").foreach { ref => alterTableTest( @@ -161,6 +172,17 @@ class V2CommandsCaseSensitivitySuite extends SharedSparkSession with AnalysisTes } } + test("AlterTable: add column resolution - column position referencing new nested column") { + alterTableTest( + Seq( + TableChange.addColumn( + Array("point", "z"), LongType, true, null), + TableChange.addColumn( + Array("point", "zz"), LongType, true, null, ColumnPosition.after("Z"))), + Seq("Couldn't find the reference column for AFTER Z at point") + ) + } + test("AlterTable: drop column resolution") { Seq(Array("ID"), Array("point", "X"), Array("POINT", "X"), Array("POINT", "x")).foreach { ref => alterTableTest( @@ -207,13 +229,17 @@ class V2CommandsCaseSensitivitySuite extends SharedSparkSession with AnalysisTes } private def alterTableTest(change: TableChange, error: Seq[String]): Unit = { + alterTableTest(Seq(change), error) + } + + private def alterTableTest(changes: Seq[TableChange], error: Seq[String]): Unit = { Seq(true, false).foreach { caseSensitive => withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { val plan = AlterTable( catalog, Identifier.of(Array(), "table_name"), TestRelation2, - Seq(change) + changes ) if (caseSensitive) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala similarity index 60% rename from sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala index 04b4d4f29f850..9e77f618eded6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/CoalesceShufflePartitionsSuite.scala @@ -19,16 +19,17 @@ package org.apache.spark.sql.execution import org.scalatest.BeforeAndAfterAll -import org.apache.spark.{MapOutputStatistics, SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql._ import org.apache.spark.sql.execution.adaptive._ -import org.apache.spark.sql.execution.adaptive.{CoalescedShuffleReaderExec, ReduceNumShufflePartitions} +import 
org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.COALESCED_SHUFFLE_READER_DESCRIPTION +import org.apache.spark.sql.execution.adaptive.CustomShuffleReaderExec import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf -class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterAll { +class CoalesceShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterAll { private var originalActiveSparkSession: Option[SparkSession] = _ private var originalInstantiatedSparkSession: Option[SparkSession] = _ @@ -52,212 +53,6 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA } } - private def checkEstimation( - rule: ReduceNumShufflePartitions, - bytesByPartitionIdArray: Array[Array[Long]], - expectedPartitionStartIndices: Array[Int]): Unit = { - val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { - case (bytesByPartitionId, index) => - new MapOutputStatistics(index, bytesByPartitionId) - } - val estimatedPartitionStartIndices = - rule.estimatePartitionStartAndEndIndices(mapOutputStatistics).map(_._1) - assert(estimatedPartitionStartIndices === expectedPartitionStartIndices) - } - - private def createReduceNumShufflePartitionsRule( - advisoryTargetPostShuffleInputSize: Long, - minNumPostShufflePartitions: Int = 1): ReduceNumShufflePartitions = { - val conf = new SQLConf().copy( - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE -> advisoryTargetPostShuffleInputSize, - SQLConf.SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS -> minNumPostShufflePartitions) - ReduceNumShufflePartitions(conf) - } - - test("test estimatePartitionStartIndices - 1 Exchange") { - val rule = createReduceNumShufflePartitionsRule(100L) - - { - // All bytes per partition are 0. - val bytesByPartitionId = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // Some bytes per partition are 0 and total size is less than the target size. - // 1 post-shuffle partition is needed. - val bytesByPartitionId = Array[Long](10, 0, 20, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partitions are needed. - val bytesByPartitionId = Array[Long](10, 0, 90, 20, 0) - val expectedPartitionStartIndices = Array[Int](0, 3) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // There are a few large pre-shuffle partitions. - val bytesByPartitionId = Array[Long](110, 10, 100, 110, 0) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // All pre-shuffle partitions are larger than the targeted size. - val bytesByPartitionId = Array[Long](100, 110, 100, 110, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // The last pre-shuffle partition is in a single post-shuffle partition. 
- val bytesByPartitionId = Array[Long](30, 30, 0, 40, 110) - val expectedPartitionStartIndices = Array[Int](0, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - } - - test("test estimatePartitionStartIndices - 2 Exchanges") { - val rule = createReduceNumShufflePartitionsRule(100L) - - { - // If there are multiple values of the number of pre-shuffle partitions, - // we should see an assertion error. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) - val mapOutputStatistics = - Array( - new MapOutputStatistics(0, bytesByPartitionId1), - new MapOutputStatistics(1, bytesByPartitionId2)) - intercept[AssertionError](rule.estimatePartitionStartAndEndIndices( - mapOutputStatistics)) - } - - { - // All bytes per partition are 0. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // Some bytes per partition are 0. - // 1 post-shuffle partition is needed. - val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 20, 0, 20) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 4 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 99, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 100, 0, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // There are a few large pre-shuffle partitions. - val bytesByPartitionId1 = Array[Long](0, 100, 40, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 60, 0, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // All pairs of pre-shuffle partitions are larger than the targeted size. - val bytesByPartitionId1 = Array[Long](100, 100, 40, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 60, 70, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - } - - test("test estimatePartitionStartIndices and enforce minimal number of reducers") { - val rule = createReduceNumShufflePartitionsRule(100L, 2) - - { - // The minimal number of post-shuffle partitions is not enforced because - // the size of data is 0. 
- val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // The minimal number of post-shuffle partitions is enforced. - val bytesByPartitionId1 = Array[Long](10, 5, 5, 0, 20) - val bytesByPartitionId2 = Array[Long](5, 10, 0, 10, 5) - val expectedPartitionStartIndices = Array[Int](0, 3) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // The number of post-shuffle partitions is determined by the coordinator. - val bytesByPartitionId1 = Array[Long](10, 50, 20, 80, 20) - val bytesByPartitionId2 = Array[Long](40, 10, 0, 10, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - } - - /////////////////////////////////////////////////////////////////////////// - // Query tests - /////////////////////////////////////////////////////////////////////////// - val numInputPartitions: Int = 10 def withSparkSession( @@ -270,17 +65,17 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA .setAppName("test") .set(UI_ENABLED, false) .set(SQLConf.SHUFFLE_PARTITIONS.key, "5") - .set(SQLConf.SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS.key, "5") + .set(SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key, "5") .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "-1") .set( - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, targetPostShuffleInputSize.toString) minNumPostShufflePartitions match { case Some(numPartitions) => - sparkConf.set(SQLConf.SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS.key, numPartitions.toString) + sparkConf.set(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key, numPartitions.toString) case None => - sparkConf.set(SQLConf.SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS.key, "1") + sparkConf.set(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key, "1") } val spark = SparkSession.builder() @@ -313,7 +108,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val finalPlan = agg.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan val shuffleReaders = finalPlan.collect { - case reader: CoalescedShuffleReaderExec => reader + case r @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => r } assert(shuffleReaders.length === 1) minNumPostShufflePartitions match { @@ -360,7 +155,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val finalPlan = join.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan val shuffleReaders = finalPlan.collect { - case reader: CoalescedShuffleReaderExec => reader + case r @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => r } assert(shuffleReaders.length === 2) minNumPostShufflePartitions match { @@ -412,7 +207,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val finalPlan = join.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan val shuffleReaders = finalPlan.collect { - case reader: CoalescedShuffleReaderExec => reader + case r @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => r } 
assert(shuffleReaders.length === 2) minNumPostShufflePartitions match { @@ -464,7 +259,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val finalPlan = join.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan val shuffleReaders = finalPlan.collect { - case reader: CoalescedShuffleReaderExec => reader + case r @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => r } assert(shuffleReaders.length === 2) minNumPostShufflePartitions match { @@ -507,7 +302,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val finalPlan = join.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan val shuffleReaders = finalPlan.collect { - case reader: CoalescedShuffleReaderExec => reader + case r @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => r } assert(shuffleReaders.length === 0) } finally { @@ -535,7 +330,10 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA assert(finalPlan.collect { case ShuffleQueryStageExec(_, r: ReusedExchangeExec) => r }.length == 2) - assert(finalPlan.collect { case p: CoalescedShuffleReaderExec => p }.length == 3) + assert( + finalPlan.collect { + case p @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => p + }.length == 3) // test case 2: a query stage has 2 parent stages. @@ -583,7 +381,10 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA Seq(0, 1, 2).map(i => Row(i))) val finalPlan = resultDf.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan - assert(finalPlan.collect { case p: CoalescedShuffleReaderExec => p }.length == 0) + assert( + finalPlan.collect { + case p @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => p + }.isEmpty) } withSparkSession(test, 200, None) } @@ -601,7 +402,10 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA .asInstanceOf[AdaptiveSparkPlanExec].executedPlan // As the pre-shuffle partition number are different, we will skip reducing // the shuffle partition numbers. 
- assert(finalPlan.collect { case p: CoalescedShuffleReaderExec => p }.length == 0) + assert( + finalPlan.collect { + case p @ CustomShuffleReaderExec(_, _, COALESCED_SHUFFLE_READER_DESCRIPTION) => p + }.isEmpty) } withSparkSession(test, 100, None) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala index 1e90754ad7721..b27a940c364a4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala @@ -18,30 +18,27 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.expressions.scalalang.typed -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +// Disable AQE because the WholeStageCodegenExec is added when running QueryStageExec @deprecated("This test suite will be removed.", "3.0.0") class DeprecatedWholeStageCodegenSuite extends QueryTest with SharedSparkSession - with AdaptiveSparkPlanHelper { + with DisableAdaptiveExecutionSuite { test("simple typed UDAF should be included in WholeStageCodegen") { - withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { - // With enable AQE, the WholeStageCodegenExec rule is applied when running QueryStageExec. - import testImplicits._ + import testImplicits._ - val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() - .groupByKey(_._1).agg(typed.sum(_._2)) + val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() + .groupByKey(_._1).agg(typed.sum(_._2)) - val plan = ds.queryExecution.executedPlan - assert(find(plan)(p => - p.isInstanceOf[WholeStageCodegenExec] && - p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) - assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) - } + val plan = ds.queryExecution.executedPlan + assert(plan.find(p => + p.isInstanceOf[WholeStageCodegenExec] && + p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) + assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala index b29de9c4adbaa..98aba3ba25f17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArraySuite.scala @@ -27,32 +27,29 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow class ExternalAppendOnlyUnsafeRowArraySuite extends SparkFunSuite with LocalSparkContext { private val random = new java.util.Random() - private var taskContext: TaskContext = _ - - override def afterAll(): Unit = try { - TaskContext.unset() - } finally { - super.afterAll() - } private def withExternalArray(inMemoryThreshold: Int, spillThreshold: Int) (f: ExternalAppendOnlyUnsafeRowArray => Unit): Unit = { sc = new SparkContext("local", "test", new SparkConf(false)) - taskContext = 
MemoryTestingUtils.fakeTaskContext(SparkEnv.get) + val taskContext = MemoryTestingUtils.fakeTaskContext(SparkEnv.get) TaskContext.setTaskContext(taskContext) - val array = new ExternalAppendOnlyUnsafeRowArray( - taskContext.taskMemoryManager(), - SparkEnv.get.blockManager, - SparkEnv.get.serializerManager, - taskContext, - 1024, - SparkEnv.get.memoryManager.pageSizeBytes, - inMemoryThreshold, - spillThreshold) - try f(array) finally { - array.clear() + try { + val array = new ExternalAppendOnlyUnsafeRowArray( + taskContext.taskMemoryManager(), + SparkEnv.get.blockManager, + SparkEnv.get.serializerManager, + taskContext, + 1024, + SparkEnv.get.memoryManager.pageSizeBytes, + inMemoryThreshold, + spillThreshold) + try f(array) finally { + array.clear() + } + } finally { + TaskContext.unset() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala index 7fbfa73623c85..28e82aa14e0d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalog.Table import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, Join, JoinHint} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType @@ -170,4 +171,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSparkSession { isTemporary = true).toString) } } + + test("broadcast hint on global temp view") { + withGlobalTempView("v1") { + spark.range(10).createGlobalTempView("v1") + withTempView("v2") { + spark.range(10).createTempView("v2") + + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + Seq( + "SELECT /*+ MAPJOIN(v1) */ * FROM global_temp.v1, v2 WHERE v1.id = v2.id", + "SELECT /*+ MAPJOIN(global_temp.v1) */ * FROM global_temp.v1, v2 WHERE v1.id = v2.id" + ).foreach { statement => + sql(statement).queryExecution.optimizedPlan match { + case Join(_, _, _, _, JoinHint(Some(HintInfo(Some(BROADCAST))), None)) => + case _ => fail("broadcast hint not found in a left-side table") + } + } + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/GroupedIteratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/GroupedIteratorSuite.scala index 80340b5552c6d..4b2a2b439c89e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/GroupedIteratorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/GroupedIteratorSuite.scala @@ -28,14 +28,16 @@ class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() + val toRow = encoder.createSerializer() + val fromRow = encoder.createDeserializer() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) - val grouped = GroupedIterator(input.iterator.map(encoder.toRow), + val grouped = GroupedIterator(input.iterator.map(toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) - key.getInt(0) -> data.map(encoder.fromRow).toSeq + 
key.getInt(0) -> data.map(fromRow).toSeq }.toSeq assert(result == @@ -46,6 +48,8 @@ class GroupedIteratorSuite extends SparkFunSuite { test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() + val toRow = encoder.createSerializer() + val fromRow = encoder.createDeserializer() val input = Seq( Row(1, 2L, "a"), @@ -54,13 +58,13 @@ class GroupedIteratorSuite extends SparkFunSuite { Row(2, 1L, "d"), Row(3, 2L, "e")) - val grouped = GroupedIterator(input.iterator.map(encoder.toRow), + val grouped = GroupedIterator(input.iterator.map(toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) - (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) + (key.getInt(0), key.getLong(1), data.map(fromRow).toSeq) }.toSeq assert(result == @@ -73,8 +77,9 @@ class GroupedIteratorSuite extends SparkFunSuite { test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() + val toRow = encoder.createSerializer() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) - val grouped = GroupedIterator(input.iterator.map(encoder.toRow), + val grouped = GroupedIterator(input.iterator.map(toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala index bb59b12e6f350..a0b212d2cf6fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala @@ -17,27 +17,34 @@ package org.apache.spark.sql.execution +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils +import org.apache.spark.sql.connector.InMemoryTableCatalog +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSparkSession} class HiveResultSuite extends SharedSparkSession { import testImplicits._ test("date formatting in hive result") { - val dates = Seq("2018-12-28", "1582-10-13", "1582-10-14", "1582-10-15") - val df = dates.toDF("a").selectExpr("cast(a as date) as b") - val executedPlan1 = df.queryExecution.executedPlan - val result = HiveResult.hiveResultString(executedPlan1) - assert(result == dates) - val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan - val result2 = HiveResult.hiveResultString(executedPlan2) - assert(result2 == dates.map(x => s"[$x]")) + DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId => + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> zoneId) { + val dates = Seq("2018-12-28", "1582-10-03", "1582-10-04", "1582-10-15") + val df = dates.toDF("a").selectExpr("cast(a as date) as b") + val executedPlan1 = df.queryExecution.executedPlan + val result = HiveResult.hiveResultString(executedPlan1) + assert(result == dates) + val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan + val result2 = HiveResult.hiveResultString(executedPlan2) + assert(result2 == dates.map(x => s"[$x]")) + } + } } test("timestamp formatting in hive result") { val timestamps = Seq( "2018-12-28 01:02:03", - "1582-10-13 01:02:03", - "1582-10-14 01:02:03", + "1582-10-03 01:02:03", + "1582-10-04 01:02:03", "1582-10-15 01:02:03") val df = 
timestamps.toDF("a").selectExpr("cast(a as timestamp) as b") val executedPlan1 = df.queryExecution.executedPlan @@ -68,4 +75,35 @@ class HiveResultSuite extends SharedSparkSession { val result = HiveResult.hiveResultString(executedPlan) assert(result.head === "0.00000000") } + + test("SHOW TABLES in hive result") { + withSQLConf("spark.sql.catalog.testcat" -> classOf[InMemoryTableCatalog].getName) { + Seq(("testcat.ns", "tbl", "foo"), ("spark_catalog.default", "tbl", "csv")).foreach { + case (ns, tbl, source) => + withTable(s"$ns.$tbl") { + spark.sql(s"CREATE TABLE $ns.$tbl (id bigint) USING $source") + val df = spark.sql(s"SHOW TABLES FROM $ns") + val executedPlan = df.queryExecution.executedPlan + assert(HiveResult.hiveResultString(executedPlan).head == tbl) + } + } + } + } + + test("DESCRIBE TABLE in hive result") { + withSQLConf("spark.sql.catalog.testcat" -> classOf[InMemoryTableCatalog].getName) { + Seq(("testcat.ns", "tbl", "foo"), ("spark_catalog.default", "tbl", "csv")).foreach { + case (ns, tbl, source) => + withTable(s"$ns.$tbl") { + spark.sql(s"CREATE TABLE $ns.$tbl (id bigint COMMENT 'col1') USING $source") + val df = spark.sql(s"DESCRIBE $ns.$tbl") + val executedPlan = df.queryExecution.executedPlan + val expected = "id " + + "\tbigint " + + "\tcol1 " + assert(HiveResult.hiveResultString(executedPlan).head == expected) + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala index 311f84c07a955..5bcec9b1e517c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala @@ -22,6 +22,7 @@ import scala.reflect.ClassTag import org.apache.spark.sql.TPCDSQuerySuite import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Final} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Generate, Join, LocalRelation, LogicalPlan, Range, Sample, Union, Window} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.datasources.LogicalRelation @@ -29,22 +30,9 @@ import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.execution.window.WindowExec -import org.apache.spark.sql.internal.SQLConf -class LogicalPlanTagInSparkPlanSuite extends TPCDSQuerySuite { - - var originalValue: String = _ - // when enable AQE, the 'AdaptiveSparkPlanExec' node does not have a logical plan link - override def beforeAll(): Unit = { - super.beforeAll() - originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - } - - override def afterAll(): Unit = { - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) - super.afterAll() - } +// Disable AQE because AdaptiveSparkPlanExec does not have a logical plan link +class LogicalPlanTagInSparkPlanSuite extends TPCDSQuerySuite with DisableAdaptiveExecutionSuite { override 
protected def checkGeneratedCode( plan: SparkPlan, checkMethodCodeSize: Boolean = true): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala index afb438e0bbc72..68691e2f7fdac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala @@ -103,6 +103,20 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSparkSession { "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2", "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t") + testMetadataOnly( + "SPARK-31590 Metadata-only queries should not include subquery in partition filters", + """ + |SELECT partcol1, MAX(partcol2) AS partcol2 + |FROM srcpart + |WHERE partcol1 = ( + | SELECT MAX(partcol1) + | FROM srcpart + |) + |AND partcol2 = 'even' + |GROUP BY partcol1 + |""".stripMargin + ) + testNotMetadataOnly( "Don't optimize metadata only query for non-partition columns", "select col1 from srcpart group by col1", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 0c5e2e3c7d1d4..b7be0f1320e61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range, Repartition, Sort, Union} import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecution} import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ReuseExchange, ShuffleExchangeExec} @@ -234,19 +234,6 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } } - test("SPARK-23375: Cached sorted data doesn't need to be re-sorted") { - val query = testData.select('key, 'value).sort('key.desc).cache() - assert(query.queryExecution.optimizedPlan.isInstanceOf[InMemoryRelation]) - val resorted = query.sort('key.desc) - assert(resorted.queryExecution.optimizedPlan.collect { case s: Sort => s}.isEmpty) - assert(resorted.select('key).collect().map(_.getInt(0)).toSeq == - (1 to 100).reverse) - // with a different order, the sort is needed - val sortedAsc = query.sort('key) - assert(sortedAsc.queryExecution.optimizedPlan.collect { case s: Sort => s}.size == 1) - assert(sortedAsc.select('key).collect().map(_.getInt(0)).toSeq == (1 to 100)) - } - test("PartitioningCollection") { withTempView("normal", "small", "tiny") { testData.createOrReplaceTempView("normal") @@ -752,7 +739,8 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("SPARK-24556: always rewrite output partitioning in ReusedExchangeExec " + - "and InMemoryTableScanExec") { + "and InMemoryTableScanExec", + 
DisableAdaptiveExecution("Reuse is dynamic in AQE")) { def checkOutputPartitioningRewrite( plans: Seq[SparkPlan], expectedPartitioningClass: Class[_]): Unit = { @@ -782,8 +770,7 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { checkOutputPartitioningRewrite(inMemoryScan, expectedPartitioningClass) } // when enable AQE, the reusedExchange is inserted when executed. - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { // ReusedExchange is HashPartitioning val df1 = Seq(1 -> "a").toDF("i", "j").repartition($"i") val df2 = Seq(1 -> "a").toDF("i", "j").repartition($"i") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala index 987338cf6cbbf..5ff459513e848 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala @@ -58,4 +58,13 @@ class QueryPlanningTrackerEndToEndSuite extends StreamTest { StopStream) } + test("The start times should be in order: parsing <= analysis <= optimization <= planning") { + val df = spark.sql("select count(*) from range(1)") + df.queryExecution.executedPlan + val phases = df.queryExecution.tracker.phases + assert(phases("parsing").startTimeMs <= phases("analysis").startTimeMs) + assert(phases("analysis").startTimeMs <= phases("optimization").startTimeMs) + assert(phases("optimization").startTimeMs <= phases("planning").startTimeMs) + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala new file mode 100644 index 0000000000000..9cea81239f37d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.{DataFrame, QueryTest} +import org.apache.spark.sql.catalyst.plans.physical.{RangePartitioning, UnknownPartitioning} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite} +import org.apache.spark.sql.execution.joins.SortMergeJoinExec +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + + +abstract class RemoveRedundantSortsSuiteBase + extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { + import testImplicits._ + + private def checkNumSorts(df: DataFrame, count: Int): Unit = { + val plan = df.queryExecution.executedPlan + assert(collectWithSubqueries(plan) { case s: SortExec => s }.length == count) + } + + private def checkSorts(query: String, enabledCount: Int, disabledCount: Int): Unit = { + withSQLConf(SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED.key -> "true") { + val df = sql(query) + checkNumSorts(df, enabledCount) + val result = df.collect() + withSQLConf(SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED.key -> "false") { + val df = sql(query) + checkNumSorts(df, disabledCount) + checkAnswer(df, result) + } + } + } + + test("remove redundant sorts with limit") { + withTempView("t") { + spark.range(100).select('id as "key").createOrReplaceTempView("t") + val query = + """ + |SELECT key FROM + | (SELECT key FROM t WHERE key > 10 ORDER BY key DESC LIMIT 10) + |ORDER BY key DESC + |""".stripMargin + checkSorts(query, 0, 1) + } + } + + test("remove redundant sorts with sort merge join") { + withTempView("t1", "t2") { + spark.range(1000).select('id as "key").createOrReplaceTempView("t1") + spark.range(1000).select('id as "key").createOrReplaceTempView("t2") + val query = """ + |SELECT /*+ MERGE(t1) */ t1.key FROM + | (SELECT key FROM t1 WHERE key > 10 ORDER BY key DESC LIMIT 10) t1 + |JOIN + | (SELECT key FROM t2 WHERE key > 50 ORDER BY key DESC LIMIT 100) t2 + |ON t1.key = t2.key + |ORDER BY t1.key + """.stripMargin + + val queryAsc = query + " ASC" + checkSorts(queryAsc, 2, 3) + + // The top level sort should not be removed since the child output ordering is ASC and + // the required ordering is DESC. 
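For readers following the new suite, `checkSorts` above boils down to counting `SortExec` nodes with the rule switched on and off. A standalone check for a simplified variant of the limit query earlier in this file looks roughly like the sketch below; it assumes a SharedSparkSession test context (so `sql` and `withSQLConf` are in scope) and the same `t` temp view the test registers:

    import org.apache.spark.sql.execution.SortExec
    import org.apache.spark.sql.internal.SQLConf

    withSQLConf(SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED.key -> "true") {
      val df = sql(
        "SELECT key FROM (SELECT key FROM t ORDER BY key DESC LIMIT 10) ORDER BY key DESC")
      df.collect()
      // The suite above expects no SortExec at all for its version of this query (the sorted
      // LIMIT is planned without one and the outer sort is removed), versus one SortExec
      // when the flag is flipped to "false".
      val numSorts = df.queryExecution.executedPlan.collect { case s: SortExec => s }.length
    }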
+ val queryDesc = query + " DESC" + checkSorts(queryDesc, 3, 3) + } + } + + test("cached sorted data doesn't need to be re-sorted") { + withSQLConf(SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED.key -> "true") { + val df = spark.range(1000).select('id as "key").sort('key.desc).cache() + val resorted = df.sort('key.desc) + val sortedAsc = df.sort('key.asc) + checkNumSorts(df, 0) + checkNumSorts(resorted, 0) + checkNumSorts(sortedAsc, 1) + val result = resorted.collect() + withSQLConf(SQLConf.REMOVE_REDUNDANT_SORTS_ENABLED.key -> "false") { + val resorted = df.sort('key.desc) + checkNumSorts(resorted, 1) + checkAnswer(resorted, result) + } + } + } + + test("SPARK-33472: shuffled join with different left and right side partition numbers") { + withTempView("t1", "t2") { + spark.range(0, 100, 1, 2).select('id as "key").createOrReplaceTempView("t1") + (0 to 100).toDF("key").createOrReplaceTempView("t2") + + val query = """ + |SELECT /*+ MERGE(t1) */ t1.key + |FROM t1 JOIN t2 ON t1.key = t2.key + |WHERE t1.key > 10 AND t2.key < 50 + |ORDER BY t1.key ASC + """.stripMargin + + val df = sql(query) + val sparkPlan = df.queryExecution.sparkPlan + val join = sparkPlan.collect { case j: SortMergeJoinExec => j }.head + val leftPartitioning = join.left.outputPartitioning + assert(leftPartitioning.isInstanceOf[RangePartitioning]) + assert(leftPartitioning.numPartitions == 2) + assert(join.right.outputPartitioning == UnknownPartitioning(0)) + checkSorts(query, 3, 3) + } + } +} + +class RemoveRedundantSortsSuite extends RemoveRedundantSortsSuiteBase + with DisableAdaptiveExecutionSuite + +class RemoveRedundantSortsSuiteAE extends RemoveRedundantSortsSuiteBase + with EnableAdaptiveExecutionSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala index 8bf7fe62cd49b..81e692076b432 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala @@ -17,11 +17,17 @@ package org.apache.spark.sql.execution +import java.util.concurrent.Executors + import scala.collection.parallel.immutable.ParRange +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration._ import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.types._ +import org.apache.spark.util.ThreadUtils class SQLExecutionSuite extends SparkFunSuite { @@ -119,6 +125,38 @@ class SQLExecutionSuite extends SparkFunSuite { spark.stop() } + + test("SPARK-32813: Table scan should work in different thread") { + val executor1 = Executors.newSingleThreadExecutor() + val executor2 = Executors.newSingleThreadExecutor() + var session: SparkSession = null + SparkSession.cleanupAnyExistingSession() + + withTempDir { tempDir => + try { + val tablePath = tempDir.toString + "/table" + val df = ThreadUtils.awaitResult(Future { + session = SparkSession.builder().appName("test").master("local[*]").getOrCreate() + + session.createDataFrame( + session.sparkContext.parallelize(Row(Array(1, 2, 3)) :: Nil), + StructType(Seq( + StructField("a", ArrayType(IntegerType, containsNull = false), nullable = false)))) + .write.parquet(tablePath) + + session.read.parquet(tablePath) + }(ExecutionContext.fromExecutorService(executor1)), 1.minute) 
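(The essential point of the SPARK-32813 test above: the DataFrame is built under the single-thread context backed by `executor1`, while the scan below is triggered under `executor2`; the test asserts that this cross-thread handoff now works.)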
+ + ThreadUtils.awaitResult(Future { + assert(df.rdd.collect()(0) === Row(Seq(1, 2, 3))) + }(ExecutionContext.fromExecutorService(executor2)), 1.minute) + } finally { + executor1.shutdown() + executor2.shutdown() + session.stop() + } + } + } } object SQLExecutionSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 9a393f19ce9bb..575efec364812 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -79,7 +79,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { var e = intercept[AnalysisException] { sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") }.getMessage - assert(e.contains("Not allowed to create a permanent view `jtv1` by " + + assert(e.contains("Not allowed to create a permanent view `default`.`jtv1` by " + "referencing a temporary view temp_jtv1. " + "Please create a temp view instead by CREATE TEMP VIEW")) @@ -88,8 +88,8 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { e = intercept[AnalysisException] { sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") }.getMessage - assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + - s"a temporary view global_temp.global_temp_jtv1")) + assert(e.contains("Not allowed to create a permanent view `default`.`jtv1` by " + + "referencing a temporary view global_temp.global_temp_jtv1")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala index 7aabf4d039f08..67ec1028f1998 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala @@ -41,67 +41,70 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 10) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - checkAnswer( - sql( - """ - |select area, sum(product), sum(sum(product)) over (partition by area) - |from windowData group by month, area - """.stripMargin), - Seq( - ("a", 5, 11), - ("a", 6, 11), - ("b", 7, 15), - ("b", 8, 15), - ("c", 9, 19), - ("c", 10, 19) - ).map(i => Row(i._1, i._2, i._3))) - - checkAnswer( - sql( - """ - |select area, sum(product) - 1, sum(sum(product)) over (partition by area) - |from windowData group by month, area - """.stripMargin), - Seq( - ("a", 4, 11), - ("a", 5, 11), - ("b", 6, 15), - ("b", 7, 15), - ("c", 8, 19), - ("c", 9, 19) - ).map(i => Row(i._1, i._2, i._3))) - - checkAnswer( - sql( - """ - |select area, sum(product), sum(product) / sum(sum(product)) over (partition by area) - |from windowData group by month, area - """.stripMargin), - Seq( - ("a", 5, 5d/11), - ("a", 6, 6d/11), - ("b", 7, 7d/15), - ("b", 8, 8d/15), - ("c", 10, 10d/19), - ("c", 9, 9d/19) - ).map(i => Row(i._1, i._2, i._3))) - - checkAnswer( - sql( - """ - |select area, sum(product), sum(product) / sum(sum(product) - 1) over (partition by area) - |from windowData group by month, area - """.stripMargin), - Seq( - ("a", 5, 5d/9), - ("a", 6, 6d/9), - ("b", 7, 7d/13), - ("b", 8, 8d/13), - ("c", 10, 10d/17), - ("c", 9, 9d/17) - ).map(i => Row(i._1, i._2, i._3))) + withTempView("windowData") { + 
sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + checkAnswer( + sql( + """ + |select area, sum(product), sum(sum(product)) over (partition by area) + |from windowData group by month, area + """.stripMargin), + Seq( + ("a", 5, 11), + ("a", 6, 11), + ("b", 7, 15), + ("b", 8, 15), + ("c", 9, 19), + ("c", 10, 19) + ).map(i => Row(i._1, i._2, i._3))) + + checkAnswer( + sql( + """ + |select area, sum(product) - 1, sum(sum(product)) over (partition by area) + |from windowData group by month, area + """.stripMargin), + Seq( + ("a", 4, 11), + ("a", 5, 11), + ("b", 6, 15), + ("b", 7, 15), + ("c", 8, 19), + ("c", 9, 19) + ).map(i => Row(i._1, i._2, i._3))) + + checkAnswer( + sql( + """ + |select area, sum(product), sum(product) / sum(sum(product)) over (partition by area) + |from windowData group by month, area + """.stripMargin), + Seq( + ("a", 5, 5d/11), + ("a", 6, 6d/11), + ("b", 7, 7d/15), + ("b", 8, 8d/15), + ("c", 10, 10d/19), + ("c", 9, 9d/19) + ).map(i => Row(i._1, i._2, i._3))) + + checkAnswer( + sql( + """ + |select area, sum(product), sum(product) / sum(sum(product) - 1) over + |(partition by area) + |from windowData group by month, area + """.stripMargin), + Seq( + ("a", 5, 5d/9), + ("a", 6, 6d/9), + ("b", 7, 7d/13), + ("b", 8, 8d/13), + ("c", 10, 10d/17), + ("c", 9, 9d/17) + ).map(i => Row(i._1, i._2, i._3))) + } } test("window function: refer column in inner select block") { @@ -113,22 +116,24 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 10) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - checkAnswer( - sql( - """ - |select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1 - |from (select month, area, product, 1 as tmp1 from windowData) tmp - """.stripMargin), - Seq( - ("a", 2), - ("a", 3), - ("b", 2), - ("b", 3), - ("c", 2), - ("c", 3) - ).map(i => Row(i._1, i._2))) + withTempView("windowData") { + sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + checkAnswer( + sql( + """ + |select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1 + |from (select month, area, product, 1 as tmp1 from windowData) tmp + """.stripMargin), + Seq( + ("a", 2), + ("a", 3), + ("b", 2), + ("b", 3), + ("c", 2), + ("c", 3) + ).map(i => Row(i._1, i._2))) + } } test("window function: partition and order expressions") { @@ -140,38 +145,40 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 10) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - checkAnswer( - sql( - """ - |select month, area, product, sum(product + 1) over (partition by 1 order by 2) - |from windowData - """.stripMargin), - Seq( - (1, "a", 5, 51), - (2, "a", 6, 51), - (3, "b", 7, 51), - (4, "b", 8, 51), - (5, "c", 9, 51), - (6, "c", 10, 51) - ).map(i => Row(i._1, i._2, i._3, i._4))) - - checkAnswer( - sql( - """ - |select month, area, product, sum(product) - |over (partition by month % 2 order by 10 - product) - |from windowData - """.stripMargin), - Seq( - (1, "a", 5, 21), - (2, "a", 6, 24), - (3, "b", 7, 16), - (4, "b", 8, 18), - (5, "c", 9, 9), - (6, "c", 10, 10) - ).map(i => Row(i._1, i._2, i._3, i._4))) + withTempView("windowData") { + sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + checkAnswer( + sql( + """ + |select month, area, product, sum(product + 1) over (partition by 1 order by 2) + |from 
windowData + """.stripMargin), + Seq( + (1, "a", 5, 51), + (2, "a", 6, 51), + (3, "b", 7, 51), + (4, "b", 8, 51), + (5, "c", 9, 51), + (6, "c", 10, 51) + ).map(i => Row(i._1, i._2, i._3, i._4))) + + checkAnswer( + sql( + """ + |select month, area, product, sum(product) + |over (partition by month % 2 order by 10 - product) + |from windowData + """.stripMargin), + Seq( + (1, "a", 5, 21), + (2, "a", 6, 24), + (3, "b", 7, 16), + (4, "b", 8, 18), + (5, "c", 9, 9), + (6, "c", 10, 10) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } } test("window function: distinct should not be silently ignored") { @@ -183,16 +190,18 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 10) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - val e = intercept[AnalysisException] { - sql( - """ - |select month, area, product, sum(distinct product + 1) over (partition by 1 order by 2) - |from windowData - """.stripMargin) + withTempView("windowData") { + sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + val e = intercept[AnalysisException] { + sql( + """ + |select month, area, product, sum(distinct product + 1) over (partition by 1 order by 2) + |from windowData + """.stripMargin) + } + assert(e.getMessage.contains("Distinct window functions are not supported")) } - assert(e.getMessage.contains("Distinct window functions are not supported")) } test("window function: expressions in arguments of a window functions") { @@ -204,23 +213,25 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 10) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - checkAnswer( - sql( - """ - |select month, area, month % 2, - |lag(product, 1 + 1, product) over (partition by month % 2 order by area) - |from windowData - """.stripMargin), - Seq( - (1, "a", 1, 5), - (2, "a", 0, 6), - (3, "b", 1, 7), - (4, "b", 0, 8), - (5, "c", 1, 5), - (6, "c", 0, 6) - ).map(i => Row(i._1, i._2, i._3, i._4))) + withTempView("windowData") { + sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + checkAnswer( + sql( + """ + |select month, area, month % 2, + |lag(product, 1 + 1, product) over (partition by month % 2 order by area) + |from windowData + """.stripMargin), + Seq( + (1, "a", 1, 5), + (2, "a", 0, 6), + (3, "b", 1, 7), + (4, "b", 0, 8), + (5, "c", 1, 5), + (6, "c", 0, 6) + ).map(i => Row(i._1, i._2, i._3, i._4))) + } } @@ -233,63 +244,65 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 11) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - checkAnswer( - sql("select month, product, sum(product + 1) over() from windowData order by area"), - Seq( - (2, 6, 57), - (3, 7, 57), - (4, 8, 57), - (5, 9, 57), - (6, 11, 57), - (1, 10, 57) - ).map(i => Row(i._1, i._2, i._3))) - - checkAnswer( - sql( - """ - |select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1 - |from (select month, area, product as p, 1 as tmp1 from windowData) tmp order by p - """.stripMargin), - Seq( - ("a", 2), - ("b", 2), - ("b", 3), - ("c", 2), - ("d", 2), - ("c", 3) - ).map(i => Row(i._1, i._2))) - - checkAnswer( - sql( - """ - |select area, rank() over (partition by area order by month) as c1 - |from windowData group by product, area, month order by product, area - """.stripMargin), - Seq( - ("a", 1), - ("b", 1), - 
("b", 2), - ("c", 1), - ("d", 1), - ("c", 2) - ).map(i => Row(i._1, i._2))) - - checkAnswer( - sql( - """ - |select area, sum(product) / sum(sum(product)) over (partition by area) as c1 - |from windowData group by area, month order by month, c1 - """.stripMargin), - Seq( - ("d", 1.0), - ("a", 1.0), - ("b", 0.4666666666666667), - ("b", 0.5333333333333333), - ("c", 0.45), - ("c", 0.55) - ).map(i => Row(i._1, i._2))) + withTempView("windowData") { + sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + checkAnswer( + sql("select month, product, sum(product + 1) over() from windowData order by area"), + Seq( + (2, 6, 57), + (3, 7, 57), + (4, 8, 57), + (5, 9, 57), + (6, 11, 57), + (1, 10, 57) + ).map(i => Row(i._1, i._2, i._3))) + + checkAnswer( + sql( + """ + |select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1 + |from (select month, area, product as p, 1 as tmp1 from windowData) tmp order by p + """.stripMargin), + Seq( + ("a", 2), + ("b", 2), + ("b", 3), + ("c", 2), + ("d", 2), + ("c", 3) + ).map(i => Row(i._1, i._2))) + + checkAnswer( + sql( + """ + |select area, rank() over (partition by area order by month) as c1 + |from windowData group by product, area, month order by product, area + """.stripMargin), + Seq( + ("a", 1), + ("b", 1), + ("b", 2), + ("c", 1), + ("d", 1), + ("c", 2) + ).map(i => Row(i._1, i._2))) + + checkAnswer( + sql( + """ + |select area, sum(product) / sum(sum(product)) over (partition by area) as c1 + |from windowData group by area, month order by month, c1 + """.stripMargin), + Seq( + ("d", 1.0), + ("a", 1.0), + ("b", 0.4666666666666667), + ("b", 0.5333333333333333), + ("c", 0.45), + ("c", 0.55) + ).map(i => Row(i._1, i._2))) + } } // todo: fix this test case by reimplementing the function ResolveAggregateFunctions @@ -302,23 +315,25 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { WindowData(5, "c", 9), WindowData(6, "c", 11) ) - sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") - - checkAnswer( - sql( - """ - |select area, sum(product) over () as c from windowData - |where product > 3 group by area, product - |having avg(month) > 0 order by avg(month), product - """.stripMargin), - Seq( - ("a", 51), - ("b", 51), - ("b", 51), - ("c", 51), - ("c", 51), - ("d", 51) - ).map(i => Row(i._1, i._2))) + withTempView("windowData") { + sparkContext.parallelize(data).toDF().createOrReplaceTempView("windowData") + + checkAnswer( + sql( + """ + |select area, sum(product) over () as c from windowData + |where product > 3 group by area, product + |having avg(month) > 0 order by avg(month), product + """.stripMargin), + Seq( + ("a", 51), + ("b", 51), + ("b", 51), + ("c", 51), + ("c", 51), + ("d", 51) + ).map(i => Row(i._1, i._2))) + } } test("window function: multiple window expressions in a single expression") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsUtilSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsUtilSuite.scala new file mode 100644 index 0000000000000..7acc33c43b19d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsUtilSuite.scala @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.{MapOutputStatistics, SparkFunSuite} +import org.apache.spark.sql.execution.adaptive.ShufflePartitionsUtil + +class ShufflePartitionsUtilSuite extends SparkFunSuite { + + private def checkEstimation( + bytesByPartitionIdArray: Array[Array[Long]], + expectedPartitionStartIndices: Seq[CoalescedPartitionSpec], + targetSize: Long, + minNumPartitions: Int = 1): Unit = { + val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { + case (bytesByPartitionId, index) => + new MapOutputStatistics(index, bytesByPartitionId) + } + val estimatedPartitionStartIndices = ShufflePartitionsUtil.coalescePartitions( + mapOutputStatistics, + targetSize, + minNumPartitions) + assert(estimatedPartitionStartIndices === expectedPartitionStartIndices) + } + + test("1 shuffle") { + val targetSize = 100 + + { + // All bytes per partition are 0. + val bytesByPartitionId = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 5)) + checkEstimation(Array(bytesByPartitionId), expectedPartitionSpecs, targetSize) + } + + { + // Some bytes per partition are 0 and total size is less than the target size. + // 1 coalesced partition is expected. + val bytesByPartitionId = Array[Long](10, 0, 20, 0, 0) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 5)) + checkEstimation(Array(bytesByPartitionId), expectedPartitionSpecs, targetSize) + } + + { + // 2 coalesced partitions are expected. + val bytesByPartitionId = Array[Long](10, 0, 90, 20, 0) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 3), CoalescedPartitionSpec(3, 5)) + checkEstimation(Array(bytesByPartitionId), expectedPartitionSpecs, targetSize) + } + + { + // There are a few large shuffle partitions. + val bytesByPartitionId = Array[Long](110, 10, 100, 110, 0) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 2), + CoalescedPartitionSpec(2, 3), + CoalescedPartitionSpec(3, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation(Array(bytesByPartitionId), expectedPartitionSpecs, targetSize) + } + + { + // All shuffle partitions are larger than the targeted size. + val bytesByPartitionId = Array[Long](100, 110, 100, 110, 110) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 2), + CoalescedPartitionSpec(2, 3), + CoalescedPartitionSpec(3, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation(Array(bytesByPartitionId), expectedPartitionSpecs, targetSize) + } + + { + // The last shuffle partition is in a single coalesced partition. 
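(Worked through with the data below: at targetSize = 100 the first four partitions pack to 30 + 30 + 0 + 40 = 100, giving the coalesced range [0, 4), while the trailing 110-byte partition cannot be merged without exceeding the target and therefore stays alone as [4, 5).)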
+ val bytesByPartitionId = Array[Long](30, 30, 0, 40, 110) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 4), CoalescedPartitionSpec(4, 5)) + checkEstimation(Array(bytesByPartitionId), expectedPartitionSpecs, targetSize) + } + } + + test("2 shuffles") { + val targetSize = 100 + + { + // If there are multiple values of the number of shuffle partitions, + // we should see an assertion error. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) + intercept[AssertionError] { + checkEstimation(Array(bytesByPartitionId1, bytesByPartitionId2), Seq.empty, targetSize) + } + } + + { + // All bytes per partition are 0. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + + { + // Some bytes per partition are 0. + // 1 coalesced partition is expected. + val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 20, 0, 20) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + + { + // 2 coalesced partition are expected. + val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 2), + CoalescedPartitionSpec(2, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + + { + // 4 coalesced partition are expected. + val bytesByPartitionId1 = Array[Long](0, 99, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 2), + CoalescedPartitionSpec(2, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + + { + // 2 coalesced partition are needed. + val bytesByPartitionId1 = Array[Long](0, 100, 0, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 2), + CoalescedPartitionSpec(2, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + + { + // There are a few large shuffle partitions. + val bytesByPartitionId1 = Array[Long](0, 100, 40, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 60, 0, 110) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 2), + CoalescedPartitionSpec(2, 3), + CoalescedPartitionSpec(3, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + + { + // All pairs of shuffle partitions are larger than the targeted size. 
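(With two shuffles the sizes are combined per partition index before coalescing; for the data below the combined sizes are 130, 100, 100, 100 and 110 bytes, each already at or above the 100-byte target, so no two neighbours can be coalesced without blowing past it and every partition ends up in its own spec.)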
+ val bytesByPartitionId1 = Array[Long](100, 100, 40, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 60, 70, 110) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 2), + CoalescedPartitionSpec(2, 3), + CoalescedPartitionSpec(3, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize) + } + } + + test("enforce minimal number of coalesced partitions") { + val targetSize = 100 + val minNumPartitions = 2 + + { + // The minimal number of coalesced partitions is not enforced because + // the size of data is 0. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize, minNumPartitions) + } + + { + // The minimal number of coalesced partitions is enforced. + val bytesByPartitionId1 = Array[Long](10, 5, 5, 0, 20) + val bytesByPartitionId2 = Array[Long](5, 10, 0, 10, 5) + val expectedPartitionSpecs = Seq(CoalescedPartitionSpec(0, 3), CoalescedPartitionSpec(3, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize, minNumPartitions) + } + + { + // The number of coalesced partitions is determined by the algorithm. + val bytesByPartitionId1 = Array[Long](10, 50, 20, 80, 20) + val bytesByPartitionId2 = Array[Long](40, 10, 0, 10, 30) + val expectedPartitionSpecs = Seq( + CoalescedPartitionSpec(0, 1), + CoalescedPartitionSpec(1, 3), + CoalescedPartitionSpec(3, 4), + CoalescedPartitionSpec(4, 5)) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionSpecs, + targetSize, minNumPartitions) + } + } + + test("splitSizeListByTargetSize") { + val targetSize = 100 + + // merge the small partitions at the beginning/end + val sizeList1 = Seq[Long](15, 90, 15, 15, 15, 90, 15) + assert(ShufflePartitionsUtil.splitSizeListByTargetSize(sizeList1, targetSize).toSeq == + Seq(0, 2, 5)) + + // merge the small partitions in the middle + val sizeList2 = Seq[Long](30, 15, 90, 10, 90, 15, 30) + assert(ShufflePartitionsUtil.splitSizeListByTargetSize(sizeList2, targetSize).toSeq == + Seq(0, 2, 4, 5)) + + // merge small partitions if the partition itself is smaller than + // targetSize * SMALL_PARTITION_FACTOR + val sizeList3 = Seq[Long](15, 1000, 15, 1000) + assert(ShufflePartitionsUtil.splitSizeListByTargetSize(sizeList3, targetSize).toSeq == + Seq(0, 3)) + + // merge small partitions if the combined size is smaller than + // targetSize * MERGED_PARTITION_FACTOR + val sizeList4 = Seq[Long](35, 75, 90, 20, 35, 25, 35) + assert(ShufflePartitionsUtil.splitSizeListByTargetSize(sizeList4, targetSize).toSeq == + Seq(0, 2, 3)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala index 7654a9d982059..6a4f3f62641f8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SortSuite.scala @@ -97,6 +97,19 @@ class SortSuite extends SparkPlanTest with SharedSparkSession { } } + test("SPARK-33260: sort order is a Stream") { + val input = Seq( + ("Hello", 4, 2.0), + ("Hello", 1, 1.0), + ("World", 8, 3.0) + ) + checkAnswer( + input.toDF("a", "b", "c"), + (child: SparkPlan) => 
SortExec(Stream('a.asc, 'b.asc, 'c.asc), global = true, child = child), + input.sortBy(t => (t._1, t._2, t._3)).map(Row.fromTuple), + sortAnswers = false) + } + // Test sorting on different data types for ( dataType <- DataTypeTestUtils.atomicTypes ++ Set(NullType); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala index e3bc414516c04..56fff1107ae39 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala @@ -84,4 +84,8 @@ class SparkPlanSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-30780 empty LocalTableScan should use RDD without partitions") { + assert(LocalTableScanExec(Nil, Nil).execute().getNumPartitions == 0) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index b29e822add8bc..7ddf9d87a6aca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -23,6 +23,7 @@ import scala.util.control.NonFatal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{DataFrame, Row, SparkSession, SQLContext} import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.test.SQLTestUtils /** @@ -237,7 +238,7 @@ object SparkPlanTest { * @param spark SqlContext used for execution of the plan */ def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = { - val execution = new QueryExecution(spark.sparkSession, null) { + val execution = new QueryExecution(spark.sparkSession, LocalRelation(Nil)) { override lazy val sparkPlan: SparkPlan = outputPlan transform { case plan: SparkPlan => val inputMap = plan.children.flatMap(_.output).map(a => (a.name, a)).toMap diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 06574a9f8fd2c..343d3c1c13469 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -21,11 +21,12 @@ import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAlias, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.expressions.{Ascending, Concat, SortOrder} -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, RepartitionByExpression, Sort} +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, Concat, SortOrder} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.datasources.{CreateTable, RefreshResource} -import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} +import org.apache.spark.sql.execution.datasources.{CreateTable, CreateTempViewUsing, RefreshResource} +import 
org.apache.spark.sql.internal.{HiveSerDe, SQLConf, StaticSQLConf} import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} /** @@ -80,6 +81,15 @@ class SparkSqlParserSuite extends AnalysisTest { intercept("REFRESH", "Resource paths cannot be empty in REFRESH statements") } + test("SPARK-33118 CREATE TMEPORARY TABLE with LOCATION") { + assertEqual("CREATE TEMPORARY TABLE t USING parquet OPTIONS (path '/data/tmp/testspark1')", + CreateTempViewUsing(TableIdentifier("t", None), None, false, false, "parquet", + Map("path" -> "/data/tmp/testspark1"))) + assertEqual("CREATE TEMPORARY TABLE t USING parquet LOCATION '/data/tmp/testspark1'", + CreateTempViewUsing(TableIdentifier("t", None), None, false, false, "parquet", + Map("path" -> "/data/tmp/testspark1"))) + } + private def createTableUsing( table: String, database: Option[String] = None, @@ -251,4 +261,44 @@ class SparkSqlParserSuite extends AnalysisTest { assertEqual("ADD FILE /path with space/abc.txt", AddFileCommand("/path with space/abc.txt")) assertEqual("ADD JAR /path with space/abc.jar", AddJarCommand("/path with space/abc.jar")) } + + test("SPARK-32608: script transform with row format delimit") { + assertEqual( + """ + |SELECT TRANSFORM(a, b, c) + | ROW FORMAT DELIMITED + | FIELDS TERMINATED BY ',' + | COLLECTION ITEMS TERMINATED BY '#' + | MAP KEYS TERMINATED BY '@' + | LINES TERMINATED BY '\n' + | NULL DEFINED AS 'null' + | USING 'cat' AS (a, b, c) + | ROW FORMAT DELIMITED + | FIELDS TERMINATED BY ',' + | COLLECTION ITEMS TERMINATED BY '#' + | MAP KEYS TERMINATED BY '@' + | LINES TERMINATED BY '\n' + | NULL DEFINED AS 'NULL' + |FROM testData + """.stripMargin, + ScriptTransformation( + Seq('a, 'b, 'c), + "cat", + Seq(AttributeReference("a", StringType)(), + AttributeReference("b", StringType)(), + AttributeReference("c", StringType)()), + UnresolvedRelation(TableIdentifier("testData")), + ScriptInputOutputSchema( + Seq(("TOK_TABLEROWFORMATFIELD", ","), + ("TOK_TABLEROWFORMATCOLLITEMS", "#"), + ("TOK_TABLEROWFORMATMAPKEYS", "@"), + ("TOK_TABLEROWFORMATLINES", "\n"), + ("TOK_TABLEROWFORMATNULL", "null")), + Seq(("TOK_TABLEROWFORMATFIELD", ","), + ("TOK_TABLEROWFORMATCOLLITEMS", "#"), + ("TOK_TABLEROWFORMATMAPKEYS", "@"), + ("TOK_TABLEROWFORMATLINES", "\n"), + ("TOK_TABLEROWFORMATNULL", "NULL")), None, None, + List.empty, List.empty, None, None, false))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala index 8aa003a3dfeb0..f630cd8322c61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala @@ -210,23 +210,8 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSparkSession test("SPARK-23376: Create UnsafeKVExternalSorter with BytesToByteMap having duplicated keys") { val memoryManager = new TestMemoryManager(new SparkConf()) val taskMemoryManager = new TaskMemoryManager(memoryManager, 0) - val map = new BytesToBytesMap(taskMemoryManager, 64, taskMemoryManager.pageSizeBytes()) - - // Key/value are a unsafe rows with a single int column + val map = createBytesToBytesMapWithDuplicateKeys(taskMemoryManager) val schema = new StructType().add("i", IntegerType) - val key = new UnsafeRow(1) - key.pointTo(new Array[Byte](32), 32) - key.setInt(0, 1) - val value = new UnsafeRow(1) - value.pointTo(new Array[Byte](32), 
32) - value.setInt(0, 2) - - for (_ <- 1 to 65) { - val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes) - loc.append( - key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, - value.getBaseObject, value.getBaseOffset, value.getSizeInBytes) - } // Make sure we can successfully create a UnsafeKVExternalSorter with a `BytesToBytesMap` // which has duplicated keys and the number of entries exceeds its capacity. @@ -245,4 +230,82 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSparkSession TaskContext.unset() } } + + test("SPARK-31952: create UnsafeKVExternalSorter with existing map should count spilled memory " + + "size correctly") { + val memoryManager = new TestMemoryManager(new SparkConf()) + val taskMemoryManager = new TaskMemoryManager(memoryManager, 0) + val map = createBytesToBytesMapWithDuplicateKeys(taskMemoryManager) + val schema = new StructType().add("i", IntegerType) + + try { + val context = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties(), null) + TaskContext.setTaskContext(context) + val expectedSpillSize = map.getTotalMemoryConsumption + val sorter = new UnsafeKVExternalSorter( + schema, + schema, + sparkContext.env.blockManager, + sparkContext.env.serializerManager, + taskMemoryManager.pageSizeBytes(), + Int.MaxValue, + map) + assert(sorter.getSpillSize === expectedSpillSize) + } finally { + TaskContext.unset() + } + } + + test("SPARK-31952: UnsafeKVExternalSorter.merge should accumulate totalSpillBytes") { + val memoryManager = new TestMemoryManager(new SparkConf()) + val taskMemoryManager = new TaskMemoryManager(memoryManager, 0) + val map1 = createBytesToBytesMapWithDuplicateKeys(taskMemoryManager) + val map2 = createBytesToBytesMapWithDuplicateKeys(taskMemoryManager) + val schema = new StructType().add("i", IntegerType) + + try { + val context = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties(), null) + TaskContext.setTaskContext(context) + val expectedSpillSize = map1.getTotalMemoryConsumption + map2.getTotalMemoryConsumption + val sorter1 = new UnsafeKVExternalSorter( + schema, + schema, + sparkContext.env.blockManager, + sparkContext.env.serializerManager, + taskMemoryManager.pageSizeBytes(), + Int.MaxValue, + map1) + val sorter2 = new UnsafeKVExternalSorter( + schema, + schema, + sparkContext.env.blockManager, + sparkContext.env.serializerManager, + taskMemoryManager.pageSizeBytes(), + Int.MaxValue, + map2) + sorter1.merge(sorter2) + assert(sorter1.getSpillSize === expectedSpillSize) + } finally { + TaskContext.unset() + } + } + + private def createBytesToBytesMapWithDuplicateKeys(taskMemoryManager: TaskMemoryManager) + : BytesToBytesMap = { + val map = new BytesToBytesMap(taskMemoryManager, 64, taskMemoryManager.pageSizeBytes()) + // Key/value are a unsafe rows with a single int column + val key = new UnsafeRow(1) + key.pointTo(new Array[Byte](32), 32) + key.setInt(0, 1) + val value = new UnsafeRow(1) + value.pointTo(new Array[Byte](32), 32) + value.setInt(0, 2) + for (_ <- 1 to 65) { + val loc = map.lookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes) + loc.append( + key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, + value.getBaseObject, value.getBaseOffset, value.getSizeInBytes) + } + map + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala index f6814d8ff8a3d..c5a01de911962 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.TimeLimits +import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkFunSuite, TestUtils} import org.apache.spark.deploy.SparkSubmitSuite @@ -50,7 +51,7 @@ class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", "--conf", "spark.sql.adaptive.enabled=false", unusedJar.toString) - SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") + SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..", 3.minutes) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 06a016fac5300..f7396ee2a89c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{Dataset, QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.expressions.codegen.{ByteCodeStats, CodeAndComment, CodeGenerator} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec @@ -28,23 +29,12 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StringType, StructType} -class WholeStageCodegenSuite extends QueryTest with SharedSparkSession { +// Disable AQE because the WholeStageCodegenExec is added when running QueryStageExec +class WholeStageCodegenSuite extends QueryTest with SharedSparkSession + with DisableAdaptiveExecutionSuite { import testImplicits._ - var originalValue: String = _ - // With on AQE, the WholeStageCodegenExec is added when running QueryStageExec. 
- override def beforeAll(): Unit = { - super.beforeAll() - originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - } - - override def afterAll(): Unit = { - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) - super.afterAll() - } - test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 78a1183664749..6d97a6bb47d0f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -20,14 +20,21 @@ package org.apache.spark.sql.execution.adaptive import java.io.File import java.net.URI +import org.apache.log4j.Level + import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart} -import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.execution.{ReusedSubqueryExec, SparkPlan} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec} +import org.apache.spark.sql.{QueryTest, Row, SparkSession, Strategy} +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan} +import org.apache.spark.sql.execution.{PartialReducerPartitionSpec, ReusedSubqueryExec, ShuffledRowRDD, SparkPlan} +import org.apache.spark.sql.execution.adaptive.OptimizeLocalShuffleReader.LOCAL_SHUFFLE_READER_DESCRIPTION +import org.apache.spark.sql.execution.command.DataWritingCommandExec +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildRight, SortMergeJoinExec} import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate +import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.util.Utils class AdaptiveQueryExecSuite @@ -46,7 +53,7 @@ class AdaptiveQueryExecSuite event match { case SparkListenerSQLAdaptiveExecutionUpdate(_, _, sparkPlanInfo) => if (sparkPlanInfo.simpleString.startsWith( - "AdaptiveSparkPlan(isFinalPlan=true)")) { + "AdaptiveSparkPlan isFinalPlan=true")) { finalPlanCnt += 1 } case _ => // ignore other events @@ -57,20 +64,23 @@ class AdaptiveQueryExecSuite val dfAdaptive = sql(query) val planBefore = dfAdaptive.queryExecution.executedPlan - assert(planBefore.toString.startsWith("AdaptiveSparkPlan(isFinalPlan=false)")) + assert(planBefore.toString.startsWith("AdaptiveSparkPlan isFinalPlan=false")) val result = dfAdaptive.collect() withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { val df = sql(query) - QueryTest.sameRows(result.toSeq, df.collect().toSeq) + checkAnswer(df, result) } val planAfter = dfAdaptive.queryExecution.executedPlan - assert(planAfter.toString.startsWith("AdaptiveSparkPlan(isFinalPlan=true)")) + assert(planAfter.toString.startsWith("AdaptiveSparkPlan isFinalPlan=true")) + val adaptivePlan = planAfter.asInstanceOf[AdaptiveSparkPlanExec].executedPlan spark.sparkContext.listenerBus.waitUntilEmpty() - assert(finalPlanCnt == 1) + 
// AQE will post `SparkListenerSQLAdaptiveExecutionUpdate` twice in case of subqueries that + // exist out of query stages. + val expectedFinalPlanCnt = adaptivePlan.find(_.subqueries.nonEmpty).map(_ => 2).getOrElse(1) + assert(finalPlanCnt == expectedFinalPlanCnt) spark.sparkContext.removeSparkListener(listener) - val adaptivePlan = planAfter.asInstanceOf[AdaptiveSparkPlanExec].executedPlan val exchanges = adaptivePlan.collect { case e: Exchange => e } @@ -91,14 +101,14 @@ class AdaptiveQueryExecSuite } private def findReusedExchange(plan: SparkPlan): Seq[ReusedExchangeExec] = { - collectInPlanAndSubqueries(plan) { + collectWithSubqueries(plan) { case ShuffleQueryStageExec(_, e: ReusedExchangeExec) => e case BroadcastQueryStageExec(_, e: ReusedExchangeExec) => e } } private def findReusedSubquery(plan: SparkPlan): Seq[ReusedSubqueryExec] = { - collectInPlanAndSubqueries(plan) { + collectWithSubqueries(plan) { case e: ReusedSubqueryExec => e } } @@ -110,10 +120,14 @@ class AdaptiveQueryExecSuite }.length val numLocalReaders = collect(plan) { - case reader: LocalShuffleReaderExec => reader - }.length - - assert(numShuffles === (numLocalReaders + numShufflesWithoutLocalReader)) + case reader @ CustomShuffleReaderExec(_, _, LOCAL_SHUFFLE_READER_DESCRIPTION) => reader + } + numLocalReaders.foreach { r => + val rdd = r.execute() + val parts = rdd.partitions + assert(parts.forall(rdd.preferredLocations(_).nonEmpty)) + } + assert(numShuffles === (numLocalReaders.length + numShufflesWithoutLocalReader)) } test("Change merge join to broadcast join") { @@ -134,7 +148,7 @@ class AdaptiveQueryExecSuite withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80", - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "10") { + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "10") { val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( "SELECT * FROM testData join testData2 ON key = a where value = '1'") val smj = findTopLevelSortMergeJoin(plan) @@ -142,11 +156,11 @@ class AdaptiveQueryExecSuite val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) val localReaders = collect(adaptivePlan) { - case reader: LocalShuffleReaderExec => reader + case reader @ CustomShuffleReaderExec(_, _, LOCAL_SHUFFLE_READER_DESCRIPTION) => reader } assert(localReaders.length == 2) - val localShuffleRDD0 = localReaders(0).execute().asInstanceOf[LocalShuffledRowRDD] - val localShuffleRDD1 = localReaders(1).execute().asInstanceOf[LocalShuffledRowRDD] + val localShuffleRDD0 = localReaders(0).execute().asInstanceOf[ShuffledRowRDD] + val localShuffleRDD1 = localReaders(1).execute().asInstanceOf[ShuffledRowRDD] // The pre-shuffle partition size is [0, 0, 0, 72, 0] // And the partitionStartIndices is [0, 3, 4], so advisoryParallelism = 3. 
// the final parallelism is @@ -166,7 +180,7 @@ class AdaptiveQueryExecSuite withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80", - SQLConf.REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED.key -> "false") { + SQLConf.COALESCE_PARTITIONS_ENABLED.key -> "false") { val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( "SELECT * FROM testData join testData2 ON key = a where value = '1'") val smj = findTopLevelSortMergeJoin(plan) @@ -174,11 +188,11 @@ class AdaptiveQueryExecSuite val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) val localReaders = collect(adaptivePlan) { - case reader: LocalShuffleReaderExec => reader + case reader @ CustomShuffleReaderExec(_, _, LOCAL_SHUFFLE_READER_DESCRIPTION) => reader } assert(localReaders.length == 2) - val localShuffleRDD0 = localReaders(0).execute().asInstanceOf[LocalShuffledRowRDD] - val localShuffleRDD1 = localReaders(1).execute().asInstanceOf[LocalShuffledRowRDD] + val localShuffleRDD0 = localReaders(0).execute().asInstanceOf[ShuffledRowRDD] + val localShuffleRDD1 = localReaders(1).execute().asInstanceOf[ShuffledRowRDD] // the final parallelism is math.max(1, numReduces / numMappers): math.max(1, 5/2) = 2 // and the partitions length is 2 * numMappers = 4 assert(localShuffleRDD0.getPartitions.length == 4) @@ -583,171 +597,92 @@ class AdaptiveQueryExecSuite withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD.key -> "100", - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "700") { + SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100", + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100") { withTempView("skewData1", "skewData2") { spark .range(0, 1000, 1, 10) - .selectExpr("id % 2 as key1", "id as value1") + .selectExpr("id % 3 as key1", "id as value1") .createOrReplaceTempView("skewData1") spark .range(0, 1000, 1, 10) .selectExpr("id % 1 as key2", "id as value2") .createOrReplaceTempView("skewData2") - val (innerPlan, innerAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT key1 FROM skewData1 join skewData2 ON key1 = key2 group by key1") - val innerSmj = findTopLevelSortMergeJoin(innerPlan) - assert(innerSmj.size == 1) + + def checkSkewJoin(query: String, optimizeSkewJoin: Boolean): Unit = { + val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult(query) + val innerSmj = findTopLevelSortMergeJoin(innerAdaptivePlan) + assert(innerSmj.size == 1 && innerSmj.head.isSkewJoin == optimizeSkewJoin) + } + + checkSkewJoin( + "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2", true) // Additional shuffle introduced, so disable the "OptimizeSkewedJoin" optimization - val innerSmjAfter = findTopLevelSortMergeJoin(innerAdaptivePlan) - assert(innerSmjAfter.size == 1) + checkSkewJoin( + "SELECT key1 FROM skewData1 JOIN skewData2 ON key1 = key2 GROUP BY key1", false) } } } test("SPARK-29544: adaptive skew join with different join types") { - Seq("false", "true").foreach { reducePostShufflePartitionsEnabled => - withSQLConf( - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD.key -> "100", - SQLConf.REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED.key -> reducePostShufflePartitionsEnabled, - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "700") { - withTempView("skewData1", "skewData2") { - spark - .range(0, 1000, 1, 10) - .selectExpr("id % 2 
as key1", "id as value1") - .createOrReplaceTempView("skewData1") - spark - .range(0, 1000, 1, 10) - .selectExpr("id % 1 as key2", "id as value2") - .createOrReplaceTempView("skewData2") - // skewed inner join optimization - val (innerPlan, innerAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT * FROM skewData1 join skewData2 ON key1 = key2") - val innerSmj = findTopLevelSortMergeJoin(innerPlan) - assert(innerSmj.size == 1) - // left stats: [3496, 0, 0, 0, 4014] - // right stats:[6292, 0, 0, 0, 0] - // the partition 0 in both left and right side are all skewed. - // And divide into 5 splits both in left and right (the max splits number). - // So there are 5 x 5 smjs for partition 0. - // Partition 4 in left side is skewed and is divided into 5 splits. - // The right side of partition 4 is not skewed. - // So there are 5 smjs for partition 4. - // So total (25 + 5 + 1) smjs. - // Union - // +- SortMergeJoin - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // . - // . - // . - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - - val innerSmjAfter = findTopLevelSortMergeJoin(innerAdaptivePlan) - assert(innerSmjAfter.size == 31) - - // skewed left outer join optimization - val (leftPlan, leftAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT * FROM skewData1 left outer join skewData2 ON key1 = key2") - val leftSmj = findTopLevelSortMergeJoin(leftPlan) - assert(leftSmj.size == 1) - // left stats: [3496, 0, 0, 0, 4014] - // right stats:[6292, 0, 0, 0, 0] - // The partition 0 in both left and right are all skewed. - // The partition 4 in left side is skewed. - // But for left outer join, we don't split the right partition even skewed. - // So the partition 0 in left side is divided into 5 splits(the max split number). - // the partition 4 in left side is divided into 5 splits(the max split number). - // So total (5 + 5 + 1) smjs. - // Union - // +- SortMergeJoin - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // . - // . - // . - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - - val leftSmjAfter = findTopLevelSortMergeJoin(leftAdaptivePlan) - assert(leftSmjAfter.size == 11) - - // skewed right outer join optimization - val (rightPlan, rightAdaptivePlan) = runAdaptiveAndVerifyResult( - "SELECT * FROM skewData1 right outer join skewData2 ON key1 = key2") - val rightSmj = findTopLevelSortMergeJoin(rightPlan) - assert(rightSmj.size == 1) - // left stats: [3496, 0, 0, 0, 4014] - // right stats:[6292, 0, 0, 0, 0] - // The partition 0 in both left and right side are all skewed. - // And the partition 4 in left side is skewed. - // But for right outer join, we don't split the left partition even skewed. - // And divide right side into 5 splits(the max split number) - // So total 6 smjs. 
- // Union - // +- SortMergeJoin - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- CoalescedShuffleReader - // +- ShuffleQueryStage - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // . - // . - // . - // +- SortMergeJoin - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - // +- Sort - // +- SkewedShuffleReader - // +- ShuffleQueryStage - - val rightSmjAfter = findTopLevelSortMergeJoin(rightAdaptivePlan) - assert(rightSmjAfter.size == 6) + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1", + SQLConf.SHUFFLE_PARTITIONS.key -> "100", + SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "800", + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "800") { + withTempView("skewData1", "skewData2") { + spark + .range(0, 1000, 1, 10) + .select( + when('id < 250, 249) + .when('id >= 750, 1000) + .otherwise('id).as("key1"), + 'id as "value1") + .createOrReplaceTempView("skewData1") + spark + .range(0, 1000, 1, 10) + .select( + when('id < 250, 249) + .otherwise('id).as("key2"), + 'id as "value2") + .createOrReplaceTempView("skewData2") + + def checkSkewJoin( + joins: Seq[SortMergeJoinExec], + leftSkewNum: Int, + rightSkewNum: Int): Unit = { + assert(joins.size == 1 && joins.head.isSkewJoin) + assert(joins.head.left.collect { + case r: CustomShuffleReaderExec => r + }.head.partitionSpecs.collect { + case p: PartialReducerPartitionSpec => p.reducerIndex + }.distinct.length == leftSkewNum) + assert(joins.head.right.collect { + case r: CustomShuffleReaderExec => r + }.head.partitionSpecs.collect { + case p: PartialReducerPartitionSpec => p.reducerIndex + }.distinct.length == rightSkewNum) } + + // skewed inner join optimization + val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 join skewData2 ON key1 = key2") + val innerSmj = findTopLevelSortMergeJoin(innerAdaptivePlan) + checkSkewJoin(innerSmj, 2, 1) + + // skewed left outer join optimization + val (_, leftAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 left outer join skewData2 ON key1 = key2") + val leftSmj = findTopLevelSortMergeJoin(leftAdaptivePlan) + checkSkewJoin(leftSmj, 2, 0) + + // skewed right outer join optimization + val (_, rightAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 right outer join skewData2 ON key1 = key2") + val rightSmj = findTopLevelSortMergeJoin(rightAdaptivePlan) + checkSkewJoin(rightSmj, 0, 1) } } } @@ -768,7 +703,8 @@ class AdaptiveQueryExecSuite val error = intercept[Exception] { agged.count() } - assert(error.getCause().toString contains "Failed to materialize query stage") + assert(error.getCause().toString contains "Invalid bucket file") + assert(error.getSuppressed.size === 0) } } } @@ -780,4 +716,191 @@ class AdaptiveQueryExecSuite ) } } + + test("force apply AQE") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") { + val plan = sql("SELECT * FROM testData").queryExecution.executedPlan + assert(plan.isInstanceOf[AdaptiveSparkPlanExec]) + } + } + + test("SPARK-30719: do not log warning if intentionally skip AQE") { + val testAppender = new LogAppender("aqe logging warning test when skip") + withLogAppender(testAppender) { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key 
-> "true") { + val plan = sql("SELECT * FROM testData").queryExecution.executedPlan + assert(!plan.isInstanceOf[AdaptiveSparkPlanExec]) + } + } + assert(!testAppender.loggingEvents + .exists(msg => msg.getRenderedMessage.contains( + s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is" + + s" enabled but is not supported for"))) + } + + test("test log level") { + def verifyLog(expectedLevel: Level): Unit = { + val logAppender = new LogAppender("adaptive execution") + withLogAppender( + logAppender, + loggerName = Some(AdaptiveSparkPlanExec.getClass.getName.dropRight(1)), + level = Some(Level.TRACE)) { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") { + sql("SELECT * FROM testData join testData2 ON key = a where value = '1'").collect() + } + } + Seq("Plan changed", "Final plan").foreach { msg => + assert( + logAppender.loggingEvents.exists { event => + event.getRenderedMessage.contains(msg) && event.getLevel == expectedLevel + }) + } + } + + // Verify default log level + verifyLog(Level.DEBUG) + + // Verify custom log level + val levels = Seq( + "TRACE" -> Level.TRACE, + "trace" -> Level.TRACE, + "DEBUG" -> Level.DEBUG, + "debug" -> Level.DEBUG, + "INFO" -> Level.INFO, + "info" -> Level.INFO, + "WARN" -> Level.WARN, + "warn" -> Level.WARN, + "ERROR" -> Level.ERROR, + "error" -> Level.ERROR, + "deBUG" -> Level.DEBUG) + + levels.foreach { level => + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_LOG_LEVEL.key -> level._1) { + verifyLog(level._2) + } + } + } + + test("SPARK-31384: avoid NPE in OptimizeSkewedJoin when there's 0 partition plan") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t2") { + // create DataFrame with 0 partition + spark.createDataFrame(sparkContext.emptyRDD[Row], new StructType().add("b", IntegerType)) + .createOrReplaceTempView("t2") + // should run successfully without NPE + runAdaptiveAndVerifyResult("SELECT * FROM testData2 t1 left semi join t2 ON t1.a=t2.b") + } + } + } + + test("SPARK-30953: InsertAdaptiveSparkPlan should apply AQE on child plan of write commands") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") { + withTable("t1") { + val plan = sql("CREATE TABLE t1 USING parquet AS SELECT 1 col").queryExecution.executedPlan + assert(plan.isInstanceOf[DataWritingCommandExec]) + assert(plan.asInstanceOf[DataWritingCommandExec].child.isInstanceOf[AdaptiveSparkPlanExec]) + } + } + } + + test("AQE should set active session during execution") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + val df = spark.range(10).select(sum('id)) + assert(df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]) + SparkSession.setActiveSession(null) + checkAnswer(df, Seq(Row(45))) + SparkSession.setActiveSession(spark) // recover the active session. 
+ } + } + + test("No deadlock in UI update") { + object TestStrategy extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case _: Aggregate => + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") { + spark.range(5).rdd + } + Nil + case _ => Nil + } + } + + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") { + try { + spark.experimental.extraStrategies = TestStrategy :: Nil + val df = spark.range(10).groupBy('id).count() + df.collect() + } finally { + spark.experimental.extraStrategies = Nil + } + } + } + + test("SPARK-31658: SQL UI should show write commands") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") { + withTable("t1") { + var checkDone = false + val listener = new SparkListener { + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case SparkListenerSQLAdaptiveExecutionUpdate(_, _, planInfo) => + assert(planInfo.nodeName == "Execute CreateDataSourceTableAsSelectCommand") + checkDone = true + case _ => // ignore other events + } + } + } + spark.sparkContext.addSparkListener(listener) + try { + sql("CREATE TABLE t1 USING parquet AS SELECT 1 col").collect() + spark.sparkContext.listenerBus.waitUntilEmpty() + assert(checkDone) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + } + } + + test("SPARK-31220 repartition obeys initialPartitionNum when adaptiveExecutionEnabled") { + Seq(true, false).foreach { enableAQE => + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> enableAQE.toString, + SQLConf.SHUFFLE_PARTITIONS.key -> "6", + SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key -> "7") { + val partitionsNum = spark.range(10).repartition($"id").rdd.collectPartitions().length + if (enableAQE) { + assert(partitionsNum === 7) + } else { + assert(partitionsNum === 6) + } + } + } + } + + test("SPARK-32753: Only copy tags to node with no tags") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + withTempView("v1") { + spark.range(10).union(spark.range(10)).createOrReplaceTempView("v1") + + val (_, adaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT id FROM v1 GROUP BY id DISTRIBUTE BY id") + assert(collect(adaptivePlan) { + case s: ShuffleExchangeExec => s + }.length == 1) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala new file mode 100644 index 0000000000000..48f85ae76cd8c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import java.io.{PrintWriter, StringWriter} + +import org.scalactic.source.Position +import org.scalatest.Tag + +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils + +/** + * Test with this tag will be ignored if the test suite extends `EnableAdaptiveExecutionSuite`. + * Otherwise, it will be executed with adaptive execution disabled. + */ +case class DisableAdaptiveExecution(reason: String) extends Tag("DisableAdaptiveExecution") + +/** + * Helper trait that enables AQE for all tests regardless of default config values, except that + * tests tagged with [[DisableAdaptiveExecution]] will be skipped. + */ +trait EnableAdaptiveExecutionSuite extends SQLTestUtils { + protected val forceApply = true + + override protected def test(testName: String, testTags: Tag*)(testFun: => Any) + (implicit pos: Position): Unit = { + if (testTags.exists(_.isInstanceOf[DisableAdaptiveExecution])) { + // we ignore the test here but assume that another test suite which extends + // `DisableAdaptiveExecutionSuite` will test it anyway to ensure test coverage + ignore(testName + " (disabled when AQE is on)", testTags: _*)(testFun) + } else { + super.test(testName, testTags: _*) { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> forceApply.toString) { + testFun + } + } + } + } +} + +/** + * Helper trait that disables AQE for all tests regardless of default config values. + */ +trait DisableAdaptiveExecutionSuite extends SQLTestUtils { + override protected def test(testName: String, testTags: Tag*)(testFun: => Any) + (implicit pos: Position): Unit = { + super.test(testName, testTags: _*) { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + testFun + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala index fdb23d5be78a1..1e6e59456c887 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -1208,15 +1208,13 @@ class ArrowConvertersSuite extends SharedSparkSession { spark.conf.unset(SQLConf.ARROW_EXECUTION_MAX_RECORDS_PER_BATCH.key) } - testQuietly("unsupported types") { - def runUnsupported(block: => Unit): Unit = { - val msg = intercept[UnsupportedOperationException] { - block - } - assert(msg.getMessage.contains("is not supported")) + testQuietly("interval is unsupported for arrow") { + val e = intercept[SparkException] { + calenderIntervalData.toDF().toArrowBatchRdd.collect() } - runUnsupported { calenderIntervalData.toDF().toArrowBatchRdd.collect() } + assert(e.getCause.isInstanceOf[UnsupportedOperationException]) + assert(e.getCause.getMessage.contains("Unsupported data type: interval")) } test("test Arrow Validator") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index a084bec985510..0fc43c7052d06 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -23,7 +23,6 @@ import scala.util.Random import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.internal.config.UI._ import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.parquet.{SpecificParquetRecordReaderBase, VectorizedParquetRecordReader} @@ -52,7 +51,6 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { .set("spark.master", "local[1]") .setIfMissing("spark.driver.memory", "3g") .setIfMissing("spark.executor.memory", "3g") - .setIfMissing(UI_ENABLED, false) val sparkSession = SparkSession.builder.config(conf).getOrCreate() @@ -169,7 +167,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { files.map(_.asInstanceOf[String]).foreach { p => val reader = new VectorizedParquetRecordReader( - null, enableOffHeapColumnVector, vectorizedReaderBatchSize) + enableOffHeapColumnVector, vectorizedReaderBatchSize) try { reader.initialize(p, ("id" :: Nil).asJava) val batch = reader.resultBatch() @@ -203,7 +201,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { files.map(_.asInstanceOf[String]).foreach { p => val reader = new VectorizedParquetRecordReader( - null, enableOffHeapColumnVector, vectorizedReaderBatchSize) + enableOffHeapColumnVector, vectorizedReaderBatchSize) try { reader.initialize(p, ("id" :: Nil).asJava) val batch = reader.resultBatch() @@ -458,7 +456,7 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { var sum = 0 files.map(_.asInstanceOf[String]).foreach { p => val reader = new VectorizedParquetRecordReader( - null, enableOffHeapColumnVector, vectorizedReaderBatchSize) + enableOffHeapColumnVector, vectorizedReaderBatchSize) try { reader.initialize(p, ("c1" :: "c2" :: Nil).asJava) val batch = reader.resultBatch() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala index 086583fdafe6d..c7b8737b7a753 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala @@ -17,9 +17,15 @@ package org.apache.spark.sql.execution.benchmark -import java.sql.Timestamp +import java.sql.{Date, Timestamp} +import java.time.{Instant, LocalDate} import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA} +import org.apache.spark.sql.execution.HiveResult +import org.apache.spark.sql.internal.SQLConf /** * Synthetic benchmark for date and timestamp functions. 
@@ -53,96 +59,192 @@ object DateTimeBenchmark extends SqlBasedBenchmark { } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { - val N = 10000000 - runBenchmark("Extract components") { - run(N, "cast to timestamp", "cast(id as timestamp)") - run(N, "year") - run(N, "quarter") - run(N, "month") - run(N, "weekofyear") - run(N, "day") - run(N, "dayofyear") - run(N, "dayofmonth") - run(N, "dayofweek") - run(N, "weekday") - run(N, "hour") - run(N, "minute") - run(N, "second") - } - runBenchmark("Current date and time") { - run(N, "current_date", "current_date") - run(N, "current_timestamp", "current_timestamp") - } - runBenchmark("Date arithmetic") { - val dateExpr = "cast(cast(id as timestamp) as date)" - run(N, "cast to date", dateExpr) - run(N, "last_day", s"last_day($dateExpr)") - run(N, "next_day", s"next_day($dateExpr, 'TU')") - run(N, "date_add", s"date_add($dateExpr, 10)") - run(N, "date_sub", s"date_sub($dateExpr, 10)") - run(N, "add_months", s"add_months($dateExpr, 10)") - } - runBenchmark("Formatting dates") { - val dateExpr = "cast(cast(id as timestamp) as date)" - run(N, "format date", s"date_format($dateExpr, 'MMM yyyy')") - } - runBenchmark("Formatting timestamps") { - run(N, "from_unixtime", "from_unixtime(id, 'yyyy-MM-dd HH:mm:ss.SSSSSS')") - } - runBenchmark("Convert timestamps") { - val timestampExpr = "cast(id as timestamp)" - run(N, "from_utc_timestamp", s"from_utc_timestamp($timestampExpr, 'CET')") - run(N, "to_utc_timestamp", s"to_utc_timestamp($timestampExpr, 'CET')") - } - runBenchmark("Intervals") { - val (start, end) = ("cast(id as timestamp)", "cast((id+8640000) as timestamp)") - run(N, "cast interval", start, end) - run(N, "datediff", s"datediff($start, $end)") - run(N, "months_between", s"months_between($start, $end)") - run(1000000, "window", s"window($start, 100, 10, 1)") - } - runBenchmark("Truncation") { - val timestampExpr = "cast(id as timestamp)" - Seq("YEAR", "YYYY", "YY", "MON", "MONTH", "MM", "DAY", "DD", "HOUR", "MINUTE", - "SECOND", "WEEK", "QUARTER").foreach { level => - run(N, s"date_trunc $level", s"date_trunc('$level', $timestampExpr)") - } - val dateExpr = "cast(cast(id as timestamp) as date)" - Seq("year", "yyyy", "yy", "mon", "month", "mm").foreach { level => - run(N, s"trunc $level", s"trunc('$level', $dateExpr)") - } - } - runBenchmark("Parsing") { - val n = 1000000 - val timestampStrExpr = "concat('2019-01-27 11:02:01.', cast(mod(id, 1000) as string))" - val pattern = "'yyyy-MM-dd HH:mm:ss.SSS'" - run(n, "to timestamp str", timestampStrExpr) - run(n, "to_timestamp", s"to_timestamp($timestampStrExpr, $pattern)") - run(n, "to_unix_timestamp", s"to_unix_timestamp($timestampStrExpr, $pattern)") - val dateStrExpr = "concat('2019-01-', cast(mod(id, 25) as string))" - run(n, "to date str", dateStrExpr) - run(n, "to_date", s"to_date($dateStrExpr, 'yyyy-MM-dd')") - } - runBenchmark("Conversion from/to external types") { - import spark.implicits._ - val rowsNum = 5000000 - val numIters = 3 - val benchmark = new Benchmark("To/from java.sql.Timestamp", rowsNum, output = output) - benchmark.addCase("From java.sql.Timestamp", numIters) { _ => - spark.range(rowsNum) - .map(millis => new Timestamp(millis)) - .noop() - } - benchmark.addCase("Collect longs", numIters) { _ => - spark.range(0, rowsNum, 1, 1) - .collect() - } - benchmark.addCase("Collect timestamps", numIters) { _ => - spark.range(0, rowsNum, 1, 1) - .map(millis => new Timestamp(millis)) - .collect() + withDefaultTimeZone(LA) { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> 
LA.getId) { + val N = 10000000 + runBenchmark("datetime +/- interval") { + val benchmark = new Benchmark("datetime +/- interval", N, output = output) + val ts = "cast(id as timestamp)" + val dt = s"cast($ts as date)" + benchmark.addCase("date + interval(m)") { _ => + doBenchmark(N, s"$dt + interval 1 month") + } + benchmark.addCase("date + interval(m, d)") { _ => + doBenchmark(N, s"$dt + interval 1 month 2 day") + } + benchmark.addCase("date + interval(m, d, ms)") { _ => + doBenchmark(N, s"$dt + interval 1 month 2 day 5 hour") + } + benchmark.addCase("date - interval(m)") { _ => + doBenchmark(N, s"$dt - interval 1 month") + } + benchmark.addCase("date - interval(m, d)") { _ => + doBenchmark(N, s"$dt - interval 1 month 2 day") + } + benchmark.addCase("date - interval(m, d, ms)") { _ => + doBenchmark(N, s"$dt - interval 1 month 2 day 5 hour") + } + benchmark.addCase("timestamp + interval(m)") { _ => + doBenchmark(N, s"$ts + interval 1 month") + } + benchmark.addCase("timestamp + interval(m, d)") { _ => + doBenchmark(N, s"$ts + interval 1 month 2 day") + } + benchmark.addCase("timestamp + interval(m, d, ms)") { _ => + doBenchmark(N, s"$ts + interval 1 month 2 day 5 hour") + } + benchmark.addCase("timestamp - interval(m)") { _ => + doBenchmark(N, s"$ts - interval 1 month") + } + benchmark.addCase("timestamp - interval(m, d)") { _ => + doBenchmark(N, s"$ts - interval 1 month 2 day") + } + benchmark.addCase("timestamp - interval(m, d, ms)") { _ => + doBenchmark(N, s"$ts - interval 1 month 2 day 5 hour") + } + benchmark.run() + } + runBenchmark("Extract components") { + run(N, "cast to timestamp", "cast(id as timestamp)") + run(N, "year") + run(N, "quarter") + run(N, "month") + run(N, "weekofyear") + run(N, "day") + run(N, "dayofyear") + run(N, "dayofmonth") + run(N, "dayofweek") + run(N, "weekday") + run(N, "hour") + run(N, "minute") + run(N, "second") + } + runBenchmark("Current date and time") { + run(N, "current_date", "current_date") + run(N, "current_timestamp", "current_timestamp") + } + runBenchmark("Date arithmetic") { + val dateExpr = "cast(cast(id as timestamp) as date)" + run(N, "cast to date", dateExpr) + run(N, "last_day", s"last_day($dateExpr)") + run(N, "next_day", s"next_day($dateExpr, 'TU')") + run(N, "date_add", s"date_add($dateExpr, 10)") + run(N, "date_sub", s"date_sub($dateExpr, 10)") + run(N, "add_months", s"add_months($dateExpr, 10)") + } + runBenchmark("Formatting dates") { + val dateExpr = "cast(cast(id as timestamp) as date)" + run(N, "format date", s"date_format($dateExpr, 'MMM yyyy')") + } + runBenchmark("Formatting timestamps") { + run(N, "from_unixtime", "from_unixtime(id, 'yyyy-MM-dd HH:mm:ss.SSSSSS')") + } + runBenchmark("Convert timestamps") { + val timestampExpr = "cast(id as timestamp)" + run(N, "from_utc_timestamp", s"from_utc_timestamp($timestampExpr, 'CET')") + run(N, "to_utc_timestamp", s"to_utc_timestamp($timestampExpr, 'CET')") + } + runBenchmark("Intervals") { + val (start, end) = ("cast(id as timestamp)", "cast((id+8640000) as timestamp)") + run(N, "cast interval", start, end) + run(N, "datediff", s"datediff($start, $end)") + run(N, "months_between", s"months_between($start, $end)") + run(1000000, "window", s"window($start, 100, 10, 1)") + } + runBenchmark("Truncation") { + val timestampExpr = "cast(id as timestamp)" + Seq("YEAR", "YYYY", "YY", "MON", "MONTH", "MM", "DAY", "DD", "HOUR", "MINUTE", + "SECOND", "WEEK", "QUARTER").foreach { level => + run(N, s"date_trunc $level", s"date_trunc('$level', $timestampExpr)") + } + val dateExpr = "cast(cast(id as 
timestamp) as date)" + Seq("year", "yyyy", "yy", "mon", "month", "mm").foreach { level => + run(N, s"trunc $level", s"trunc('$level', $dateExpr)") + } + } + runBenchmark("Parsing") { + val n = 1000000 + val timestampStrExpr = "concat('2019-01-27 11:02:01.', cast(mod(id, 1000) as string))" + val pattern = "'yyyy-MM-dd HH:mm:ss.SSS'" + run(n, "to timestamp str", timestampStrExpr) + run(n, "to_timestamp", s"to_timestamp($timestampStrExpr, $pattern)") + run(n, "to_unix_timestamp", s"to_unix_timestamp($timestampStrExpr, $pattern)") + val dateStrExpr = "concat('2019-01-', lpad(mod(id, 25), 2, '0'))" + run(n, "to date str", dateStrExpr) + run(n, "to_date", s"to_date($dateStrExpr, 'yyyy-MM-dd')") + } + runBenchmark("Conversion from/to external types") { + import spark.implicits._ + val rowsNum = 5000000 + val numIters = 3 + val benchmark = new Benchmark("To/from Java's date-time", rowsNum, output = output) + benchmark.addCase("From java.sql.Date", numIters) { _ => + spark.range(rowsNum).map(millis => new Date(millis)).noop() + } + benchmark.addCase("From java.time.LocalDate", numIters) { _ => + spark.range(rowsNum).map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)).noop() + } + def dates = { + spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis)) + } + benchmark.addCase("Collect java.sql.Date", numIters) { _ => + dates.collect() + } + def localDates = { + spark.range(0, rowsNum, 1, 1) + .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)) + } + benchmark.addCase("Collect java.time.LocalDate", numIters) { _ => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + localDates.collect() + } + } + benchmark.addCase("From java.sql.Timestamp", numIters) { _ => + spark.range(rowsNum).map(millis => new Timestamp(millis)).noop() + } + benchmark.addCase("From java.time.Instant", numIters) { _ => + spark.range(rowsNum).map(millis => Instant.ofEpochMilli(millis)).noop() + } + benchmark.addCase("Collect longs", numIters) { _ => + spark.range(0, rowsNum, 1, 1) + .collect() + } + def timestamps = { + spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis)) + } + benchmark.addCase("Collect java.sql.Timestamp", numIters) { _ => + timestamps.collect() + } + def instants = { + spark.range(0, rowsNum, 1, 1).map(millis => Instant.ofEpochMilli(millis)) + } + benchmark.addCase("Collect java.time.Instant", numIters) { _ => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + instants.collect() + } + } + def toHiveString(df: Dataset[_]): Unit = { + HiveResult.hiveResultString(df.queryExecution.executedPlan) + } + benchmark.addCase("java.sql.Date to Hive string", numIters) { _ => + toHiveString(dates) + } + benchmark.addCase("java.time.LocalDate to Hive string", numIters) { _ => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + toHiveString(localDates) + } + } + benchmark.addCase("java.sql.Timestamp to Hive string", numIters) { _ => + toHiveString(timestamps) + } + benchmark.addCase("java.time.Instant to Hive string", numIters) { _ => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + toHiveString(instants) + } + } + benchmark.run() + } } - benchmark.run() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala new file mode 100644 index 0000000000000..7caaa5376db7f --- /dev/null +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import java.io.File +import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneOffset} + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.catalyst.util.DateTimeConstants.SECONDS_PER_DAY +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.{LegacyBehaviorPolicy, ParquetOutputTimestampType} + +object DateTime extends Enumeration { + type DateTime = Value + val DATE, TIMESTAMP, TIMESTAMP_INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS = Value +} + +/** + * Synthetic benchmark for rebasing of date and timestamp in read/write. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "sql/test:runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/DateTimeRebaseBenchmark-results.txt". 
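 + * For example, an illustrative invocation (a sketch only, assuming the generic runner templates above with this benchmark's fully-qualified class name substituted for the elided class placeholder): + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.DateTimeRebaseBenchmark"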
+ * }}} + */ +object DateTimeRebaseBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + import DateTime._ + + private def genTs(cardinality: Int, start: LocalDateTime, end: LocalDateTime): DataFrame = { + val startSec = start.toEpochSecond(ZoneOffset.UTC) + val endSec = end.toEpochSecond(ZoneOffset.UTC) + spark.range(0, cardinality, 1, 1) + .select((($"id" % (endSec - startSec)) + startSec).as("seconds")) + .select($"seconds".cast("timestamp").as("ts")) + } + + private def genTsAfter1900(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(1900, 1, 31, 0, 0, 0) + val end = LocalDateTime.of(3000, 1, 1, 0, 0, 0) + genTs(cardinality, start, end) + } + + private def genTsBefore1900(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(10, 1, 1, 0, 0, 0) + val end = LocalDateTime.of(1900, 1, 1, 0, 0, 0) + genTs(cardinality, start, end) + } + + private def genDate(cardinality: Int, start: LocalDate, end: LocalDate): DataFrame = { + val startSec = LocalDateTime.of(start, LocalTime.MIDNIGHT).toEpochSecond(ZoneOffset.UTC) + val endSec = LocalDateTime.of(end, LocalTime.MIDNIGHT).toEpochSecond(ZoneOffset.UTC) + spark.range(0, cardinality * SECONDS_PER_DAY, SECONDS_PER_DAY, 1) + .select((($"id" % (endSec - startSec)) + startSec).as("seconds")) + .select($"seconds".cast("timestamp").as("ts")) + .select($"ts".cast("date").as("date")) + } + + private def genDateAfter1582(cardinality: Int): DataFrame = { + val start = LocalDate.of(1582, 10, 31) + val end = LocalDate.of(3000, 1, 1) + genDate(cardinality, start, end) + } + + private def genDateBefore1582(cardinality: Int): DataFrame = { + val start = LocalDate.of(10, 1, 1) + val end = LocalDate.of(1580, 10, 1) + genDate(cardinality, start, end) + } + + private def genDF(cardinality: Int, dateTime: DateTime, modernDates: Boolean): DataFrame = { + dateTime match { + case DATE => + if (modernDates) genDateAfter1582(cardinality) else genDateBefore1582(cardinality) + case TIMESTAMP | TIMESTAMP_INT96 | TIMESTAMP_MICROS | TIMESTAMP_MILLIS => + if (modernDates) genTsAfter1900(cardinality) else genTsBefore1900(cardinality) + case _ => throw new IllegalArgumentException( + s"cardinality = $cardinality dateTime = $dateTime modernDates = $modernDates") + } + } + + private def benchmarkInputs(benchmark: Benchmark, rowsNum: Int, dateTime: DateTime): Unit = { + val year = if (dateTime == DATE) 1582 else 1900 + benchmark.addCase(s"after $year, noop", 1) { _ => + genDF(rowsNum, dateTime, modernDates = true).noop() + } + benchmark.addCase(s"before $year, noop", 1) { _ => + genDF(rowsNum, dateTime, modernDates = false).noop() + } + } + + private def flagToStr(flag: Boolean): String = { + if (flag) "on" else "off" + } + + private def caseName( + modernDates: Boolean, + dateTime: DateTime, + mode: Option[LegacyBehaviorPolicy.Value] = None, + vec: Option[Boolean] = None): String = { + val period = if (modernDates) "after" else "before" + val year = if (dateTime == DATE) 1582 else 1900 + val vecFlag = vec.map(flagToStr).map(flag => s", vec $flag").getOrElse("") + val rebaseFlag = mode.map(_.toString).map(m => s", rebase $m").getOrElse("") + s"$period $year$vecFlag$rebaseFlag" + } + + private def getPath( + basePath: File, + dateTime: DateTime, + modernDates: Boolean, + mode: Option[LegacyBehaviorPolicy.Value] = None): String = { + val period = if (modernDates) "after" else "before" + val year = if (dateTime == DATE) 1582 else 1900 + val rebaseFlag = mode.map(_.toString).map(m => s"_$m").getOrElse("") + basePath.getAbsolutePath + 
s"/${dateTime}_${period}_$year$rebaseFlag" + } + + private def getOutputType(dateTime: DateTime): String = dateTime match { + case TIMESTAMP_INT96 => ParquetOutputTimestampType.INT96.toString + case TIMESTAMP_MILLIS => ParquetOutputTimestampType.TIMESTAMP_MILLIS.toString + case _ => ParquetOutputTimestampType.TIMESTAMP_MICROS.toString + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + val rowsNum = 100000000 + + withDefaultTimeZone(LA) { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> LA.getId) { + withTempPath { path => + runBenchmark("Rebasing dates/timestamps in Parquet datasource") { + Seq( + DATE, TIMESTAMP_INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS + ).foreach { dateTime => + val benchmark = new Benchmark( + s"Save $dateTime to parquet", + rowsNum, + output = output) + benchmarkInputs(benchmark, rowsNum, dateTime) + Seq(true, false).foreach { modernDates => + LegacyBehaviorPolicy.values + .filterNot(v => !modernDates && v == LegacyBehaviorPolicy.EXCEPTION) + .foreach { mode => + benchmark.addCase(caseName(modernDates, dateTime, Some(mode)), 1) { _ => + withSQLConf( + SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> getOutputType(dateTime), + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> mode.toString) { + genDF(rowsNum, dateTime, modernDates) + .write + .mode("overwrite") + .format("parquet") + .save(getPath(path, dateTime, modernDates, Some(mode))) + } + } + } + } + benchmark.run() + + val benchmark2 = new Benchmark( + s"Load $dateTime from parquet", rowsNum, output = output) + Seq(true, false).foreach { modernDates => + Seq(false, true).foreach { vec => + LegacyBehaviorPolicy.values + .filterNot(v => !modernDates && v == LegacyBehaviorPolicy.EXCEPTION) + .foreach { mode => + val name = caseName(modernDates, dateTime, Some(mode), Some(vec)) + benchmark2.addCase(name, 3) { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString) { + spark.read + .format("parquet") + .load(getPath(path, dateTime, modernDates, Some(mode))) + .noop() + } + } + } + } + } + benchmark2.run() + } + } + } + + withTempPath { path => + runBenchmark("Rebasing dates/timestamps in ORC datasource") { + Seq(DATE, TIMESTAMP).foreach { dateTime => + val benchmark = new Benchmark(s"Save $dateTime to ORC", rowsNum, output = output) + benchmarkInputs(benchmark, rowsNum, dateTime) + Seq(true, false).foreach { modernDates => + benchmark.addCase(caseName(modernDates, dateTime), 1) { _ => + genDF(rowsNum, dateTime, modernDates) + .write + .mode("overwrite") + .format("orc") + .save(getPath(path, dateTime, modernDates)) + } + } + benchmark.run() + + val benchmark2 = new Benchmark( + s"Load $dateTime from ORC", + rowsNum, + output = output) + Seq(true, false).foreach { modernDates => + Seq(false, true).foreach { vec => + benchmark2.addCase(caseName(modernDates, dateTime, vec = Some(vec)), 3) { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vec.toString) { + spark + .read + .format("orc") + .load(getPath(path, dateTime, modernDates)) + .noop() + } + } + } + } + benchmark2.run() + } + } + } + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala index de23132284dc8..287854dc3646c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala @@ -42,7 +42,9 @@ object 
ExtractBenchmark extends SqlBasedBenchmark { spark .range(sinceSecond, sinceSecond + cardinality, 1, 1) .selectExpr(exprs: _*) - .noop() + .queryExecution + .toRdd + .foreach(_ => ()) } } @@ -82,34 +84,22 @@ object ExtractBenchmark extends SqlBasedBenchmark { } } - private case class Settings(fields: Seq[String], func: Seq[String], iterNum: Long) - override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { val N = 10000000L - val datetimeFields = Seq( - "MILLENNIUM", "CENTURY", "DECADE", "YEAR", - "ISOYEAR", "QUARTER", "MONTH", "WEEK", - "DAY", "DAYOFWEEK", "DOW", "ISODOW", - "DOY", "HOUR", "MINUTE", "SECOND", - "MILLISECONDS", "MICROSECONDS", "EPOCH") - val intervalFields = Seq( - "MILLENNIUM", "CENTURY", "DECADE", "YEAR", - "QUARTER", "MONTH", "DAY", - "HOUR", "MINUTE", "SECOND", - "MILLISECONDS", "MICROSECONDS", "EPOCH") + val datetimeFields = Seq("YEAR", "YEAROFWEEK", "QUARTER", "MONTH", "WEEK", "DAY", "DAYOFWEEK", + "DOW", "DOW_ISO", "DAYOFWEEK_ISO", "DOY", "HOUR", "MINUTE", "SECOND") + val intervalFields = Seq("YEAR", "MONTH", "DAY", "HOUR", "MINUTE", "SECOND") val settings = Map( - "timestamp" -> Settings(datetimeFields, Seq("extract", "date_part"), N), - "date" -> Settings(datetimeFields, Seq("extract", "date_part"), N), - "interval" -> Settings(intervalFields, Seq("date_part"), N)) + "timestamp" -> datetimeFields, + "date" -> datetimeFields, + "interval" -> intervalFields) - for { - (dataType, Settings(fields, funcs, iterNum)) <- settings - func <- funcs} { + for {(dataType, fields) <- settings; func <- Seq("extract", "date_part")} { val benchmark = new Benchmark(s"Invoke $func for $dataType", N, output = output) - run(benchmark, iterNum, s"cast to $dataType", castExpr(dataType)) - fields.foreach(run(benchmark, func, iterNum, _, dataType)) + run(benchmark, N, s"cast to $dataType", castExpr(dataType)) + fields.foreach(run(benchmark, func, N, _, dataType)) benchmark.run() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala index 444ffa4f99697..b3f65d40ad95b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala @@ -23,7 +23,6 @@ import scala.util.Random import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.internal.config.UI._ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.functions.monotonically_increasing_id import org.apache.spark.sql.internal.SQLConf @@ -49,7 +48,6 @@ object FilterPushdownBenchmark extends SqlBasedBenchmark { .set("spark.master", "local[1]") .setIfMissing("spark.driver.memory", "3g") .setIfMissing("spark.executor.memory", "3g") - .setIfMissing(UI_ENABLED, false) .setIfMissing("orc.compression", "snappy") .setIfMissing("spark.sql.parquet.compression.codec", "snappy") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala index 94e763459a111..907e3f40c1911 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala @@ -44,7 +44,9 @@ object IntervalBenchmark extends SqlBasedBenchmark { spark .range(0, 
cardinality, 1, 1) .select(exprs: _*) - .noop() + .queryExecution + .toRdd + .foreach(_ => ()) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala new file mode 100644 index 0000000000000..d2bd962b50654 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.SparkConf +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for nested fields predicate push down performance for Parquet datasource. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "sql/test:runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/ParquetNestedPredicatePushDownBenchmark-results.txt". 
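 + * As an illustrative sketch (assuming the generic runner templates above and this benchmark's fully-qualified class name; the class and jar placeholders remain elided): + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.ParquetNestedPredicatePushDownBenchmark"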
+ * }}} + */ +object ParquetNestedPredicatePushDownBenchmark extends SqlBasedBenchmark { + + private val N = 100 * 1024 * 1024 + private val NUMBER_OF_ITER = 10 + + private val df: DataFrame = spark + .range(1, N, 1, 4) + .toDF("id") + .selectExpr("id", "STRUCT(id x, STRUCT(CAST(id AS STRING) z) y) nested") + .sort("id") + + private def addCase( + benchmark: Benchmark, + inputPath: String, + enableNestedPD: String, + name: String, + withFilter: DataFrame => DataFrame): Unit = { + val loadDF = spark.read.parquet(inputPath) + benchmark.addCase(name) { _ => + withSQLConf((SQLConf.NESTED_PREDICATE_PUSHDOWN_FILE_SOURCE_LIST.key, enableNestedPD)) { + withFilter(loadDF).noop() + } + } + } + + private def createAndRunBenchmark(name: String, withFilter: DataFrame => DataFrame): Unit = { + withTempPath { tempDir => + val outputPath = tempDir.getCanonicalPath + df.write.mode(SaveMode.Overwrite).parquet(outputPath) + val benchmark = new Benchmark(name, N, NUMBER_OF_ITER, output = output) + addCase( + benchmark, + outputPath, + enableNestedPD = "", + "Without nested predicate Pushdown", + withFilter) + addCase( + benchmark, + outputPath, + enableNestedPD = "parquet", + "With nested predicate Pushdown", + withFilter) + benchmark.run() + } + } + + /** + * Benchmark for sorted data with a filter that allows filtering out all the row groups + * when nested fields predicate push down is enabled + */ + def runLoadNoRowGroupWhenPredicatePushedDown(): Unit = { + createAndRunBenchmark("Can skip all row groups", _.filter("nested.x < 0")) + } + + /** + * Benchmark with a filter that allows loading only some row groups + * when nested fields predicate push down is enabled + */ + def runLoadSomeRowGroupWhenPredicatePushedDown(): Unit = { + createAndRunBenchmark("Can skip some row groups", _.filter("nested.x = 100")) + } + + /** + * Benchmark with a filter that still requires + * loading all the row groups on sorted data, to see whether enabling + * nested predicate push down introduces too much overhead.
+ */ + def runLoadAllRowGroupsWhenPredicatePushedDown(): Unit = { + createAndRunBenchmark("Can skip no row groups", _.filter(s"nested.x >= 0 and nested.x <= $N")) + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runLoadNoRowGroupWhenPredicatePushedDown() + runLoadSomeRowGroupWhenPredicatePushedDown() + runLoadAllRowGroupsWhenPredicatePushedDown() + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala index ee7a03e5e0542..28387dcef125b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.SaveMode.Overwrite import org.apache.spark.sql.catalyst.plans.SQLHelper @@ -37,6 +38,7 @@ trait SqlBasedBenchmark extends BenchmarkBase with SQLHelper { .appName(this.getClass.getCanonicalName) .config(SQLConf.SHUFFLE_PARTITIONS.key, 1) .config(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, 1) + .config(UI_ENABLED.key, false) .getOrCreate() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index c93d27f02c686..43bc7c12937ec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -31,7 +31,8 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation * To run this: * {{{ * 1. without sbt: - * bin/spark-submit --class --data-location + * bin/spark-submit --jars , + * --class --data-location * 2. build/sbt "sql/test:runMain --data-location " * 3. 
generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt * "sql/test:runMain --data-location " @@ -81,7 +82,7 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark { val queryRelations = scala.collection.mutable.HashSet[String]() spark.sql(queryString).queryExecution.analyzed.foreach { case SubqueryAlias(alias, _: LogicalRelation) => - queryRelations.add(alias.identifier) + queryRelations.add(alias.name) case LogicalRelation(_, _, Some(catalogTable), _) => queryRelations.add(catalogTable.identifier.table) case HiveTableRelation(tableMeta, _, _, _, _) => @@ -97,11 +98,16 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark { } } - def filterQueries( + private def filterQueries( origQueries: Seq[String], - args: TPCDSQueryBenchmarkArguments): Seq[String] = { - if (args.queryFilter.nonEmpty) { - origQueries.filter(args.queryFilter.contains) + queryFilter: Set[String], + nameSuffix: String = ""): Seq[String] = { + if (queryFilter.nonEmpty) { + if (nameSuffix.nonEmpty) { + origQueries.filter { name => queryFilter.contains(s"$name$nameSuffix") } + } else { + origQueries.filter(queryFilter.contains) + } } else { origQueries } @@ -124,6 +130,7 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark { "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") // This list only includes TPC-DS v2.7 queries that are different from v1.4 ones + val nameSuffixForQueriesV2_7 = "-v2.7" val tpcdsQueriesV2_7 = Seq( "q5a", "q6", "q10a", "q11", "q12", "q14", "q14a", "q18a", "q20", "q22", "q22a", "q24", "q27a", "q34", "q35", "q35a", "q36a", "q47", "q49", @@ -131,8 +138,9 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark { "q80a", "q86a", "q98") // If `--query-filter` defined, filters the queries that this option selects - val queriesV1_4ToRun = filterQueries(tpcdsQueries, benchmarkArgs) - val queriesV2_7ToRun = filterQueries(tpcdsQueriesV2_7, benchmarkArgs) + val queriesV1_4ToRun = filterQueries(tpcdsQueries, benchmarkArgs.queryFilter) + val queriesV2_7ToRun = filterQueries(tpcdsQueriesV2_7, benchmarkArgs.queryFilter, + nameSuffix = nameSuffixForQueriesV2_7) if ((queriesV1_4ToRun ++ queriesV2_7ToRun).isEmpty) { throw new RuntimeException( @@ -142,6 +150,6 @@ object TPCDSQueryBenchmark extends SqlBasedBenchmark { val tableSizes = setupTables(benchmarkArgs.dataLocation) runTpcdsQueries(queryLocation = "tpcds", queries = queriesV1_4ToRun, tableSizes) runTpcdsQueries(queryLocation = "tpcds-v2.7.0", queries = queriesV2_7ToRun, tableSizes, - nameSuffix = "-v2.7") + nameSuffix = nameSuffixForQueriesV2_7) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala index f582d844cdc47..9b0389c6d1ea4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UnsafeArrayDataBenchmark.scala @@ -40,13 +40,16 @@ object UnsafeArrayDataBenchmark extends BenchmarkBase { UnsafeArrayData.calculateHeaderPortionInBytes(count) } + private lazy val intEncoder = ExpressionEncoder[Array[Int]]().resolveAndBind() + + private lazy val doubleEncoder = ExpressionEncoder[Array[Double]]().resolveAndBind() + def readUnsafeArray(iters: Int): Unit = { val count = 1024 * 1024 * 16 val rand = new Random(42) - + val intArrayToRow = intEncoder.createSerializer() val intPrimitiveArray = Array.fill[Int](count) { rand.nextInt } - val intEncoder = 
ExpressionEncoder[Array[Int]].resolveAndBind() - val intUnsafeArray = intEncoder.toRow(intPrimitiveArray).getArray(0) + val intUnsafeArray = intArrayToRow(intPrimitiveArray).getArray(0) val readIntArray = { i: Int => var n = 0 while (n < iters) { @@ -62,8 +65,8 @@ object UnsafeArrayDataBenchmark extends BenchmarkBase { } val doublePrimitiveArray = Array.fill[Double](count) { rand.nextDouble } - val doubleEncoder = ExpressionEncoder[Array[Double]].resolveAndBind() - val doubleUnsafeArray = doubleEncoder.toRow(doublePrimitiveArray).getArray(0) + val doubleArrayToRow = doubleEncoder.createSerializer() + val doubleUnsafeArray = doubleArrayToRow(doublePrimitiveArray).getArray(0) val readDoubleArray = { i: Int => var n = 0 while (n < iters) { @@ -90,12 +93,12 @@ object UnsafeArrayDataBenchmark extends BenchmarkBase { var intTotalLength: Int = 0 val intPrimitiveArray = Array.fill[Int](count) { rand.nextInt } - val intEncoder = ExpressionEncoder[Array[Int]].resolveAndBind() + val intArrayToRow = intEncoder.createSerializer() val writeIntArray = { i: Int => var len = 0 var n = 0 while (n < iters) { - len += intEncoder.toRow(intPrimitiveArray).getArray(0).numElements() + len += intArrayToRow(intPrimitiveArray).getArray(0).numElements() n += 1 } intTotalLength = len @@ -103,12 +106,12 @@ object UnsafeArrayDataBenchmark extends BenchmarkBase { var doubleTotalLength: Int = 0 val doublePrimitiveArray = Array.fill[Double](count) { rand.nextDouble } - val doubleEncoder = ExpressionEncoder[Array[Double]].resolveAndBind() + val doubleArrayToRow = doubleEncoder.createSerializer() val writeDoubleArray = { i: Int => var len = 0 var n = 0 while (n < iters) { - len += doubleEncoder.toRow(doublePrimitiveArray).getArray(0).numElements() + len += doubleArrayToRow(doublePrimitiveArray).getArray(0).numElements() n += 1 } doubleTotalLength = len @@ -126,8 +129,8 @@ object UnsafeArrayDataBenchmark extends BenchmarkBase { var intTotalLength: Int = 0 val intPrimitiveArray = Array.fill[Int](count) { rand.nextInt } - val intEncoder = ExpressionEncoder[Array[Int]].resolveAndBind() - val intUnsafeArray = intEncoder.toRow(intPrimitiveArray).getArray(0) + val intArrayToRow = intEncoder.createSerializer() + val intUnsafeArray = intArrayToRow(intPrimitiveArray).getArray(0) val readIntArray = { i: Int => var len = 0 var n = 0 @@ -140,8 +143,8 @@ object UnsafeArrayDataBenchmark extends BenchmarkBase { var doubleTotalLength: Int = 0 val doublePrimitiveArray = Array.fill[Double](count) { rand.nextDouble } - val doubleEncoder = ExpressionEncoder[Array[Double]].resolveAndBind() - val doubleUnsafeArray = doubleEncoder.toRow(doublePrimitiveArray).getArray(0) + val doubleArrayToRow = doubleEncoder.createSerializer() + val doubleUnsafeArray = doubleArrayToRow(doublePrimitiveArray).getArray(0) val readDoubleArray = { i: Int => var len = 0 var n = 0 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 77047f329e105..18f29f7b90ad5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -126,13 +126,15 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSparkSession { } test("default size avoids broadcast") { - // TODO: Improve this test when we have better statistics - sparkContext.parallelize(1 to 10).map(i => 
TestData(i, i.toString)) - .toDF().createOrReplaceTempView("sizeTst") - spark.catalog.cacheTable("sizeTst") - assert( - spark.table("sizeTst").queryExecution.analyzed.stats.sizeInBytes > - spark.conf.get(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) + withTempView("sizeTst") { + // TODO: Improve this test when we have better statistics + sparkContext.parallelize(1 to 10).map(i => TestData(i, i.toString)) + .toDF().createOrReplaceTempView("sizeTst") + spark.catalog.cacheTable("sizeTst") + assert( + spark.table("sizeTst").queryExecution.analyzed.stats.sizeInBytes > + spark.conf.get(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) + } } test("projection") { @@ -187,18 +189,20 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSparkSession { } test("SPARK-2729 regression: timestamp data type") { - val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time") - timestamps.createOrReplaceTempView("timestamps") + withTempView("timestamps") { + val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time") + timestamps.createOrReplaceTempView("timestamps") - checkAnswer( - sql("SELECT time FROM timestamps"), - timestamps.collect().toSeq) + checkAnswer( + sql("SELECT time FROM timestamps"), + timestamps.collect().toSeq) - spark.catalog.cacheTable("timestamps") + spark.catalog.cacheTable("timestamps") - checkAnswer( - sql("SELECT time FROM timestamps"), - timestamps.collect().toSeq) + checkAnswer( + sql("SELECT time FROM timestamps"), + timestamps.collect().toSeq) + } } test("SPARK-3320 regression: batched column buffer building should work with empty partitions") { @@ -229,10 +233,12 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSparkSession { assert(df.schema.head.dataType === DecimalType(15, 10)) - df.cache().createOrReplaceTempView("test_fixed_decimal") - checkAnswer( - sql("SELECT * FROM test_fixed_decimal"), - (1 to 10).map(i => Row(Decimal(i, 15, 10).toJavaBigDecimal))) + withTempView("test_fixed_decimal") { + df.cache().createOrReplaceTempView("test_fixed_decimal") + checkAnswer( + sql("SELECT * FROM test_fixed_decimal"), + (1 to 10).map(i => Row(Decimal(i, 15, 10).toJavaBigDecimal))) + } } test("test different data types") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala index 192db0e910d03..111a620df8c24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala @@ -156,4 +156,30 @@ class BooleanBitSetSuite extends SparkFunSuite { test(s"$BooleanBitSet: multiple words and 1 more bit for decompression()") { skeletonForDecompress(BITS_PER_LONG * 2 + 1) } + + test(s"$BooleanBitSet: Only nulls for decompression()") { + val builder = TestCompressibleColumnBuilder(new NoopColumnStats, BOOLEAN, BooleanBitSet) + val numRows = 10 + + val rows = Seq.fill[InternalRow](numRows)({ + val row = new GenericInternalRow(1) + row.setNullAt(0) + row + }) + rows.foreach(builder.appendFrom(_, 0)) + val buffer = builder.build() + + // Rewinds, skips column header and 4 more bytes for compression scheme ID + val headerSize = CompressionScheme.columnHeaderSize(buffer) + buffer.position(headerSize) + assertResult(BooleanBitSet.typeId, "Wrong compression scheme ID")(buffer.getInt()) + + val decoder = BooleanBitSet.decoder(buffer, BOOLEAN) + val 
columnVector = new OnHeapColumnVector(numRows, BooleanType) + decoder.decompress(columnVector, numRows) + + (0 until numRows).foreach { rowNum => + assert(columnVector.isNullAt(rowNum)) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala index f946a6779ec95..c6fe64d1058ab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala @@ -46,7 +46,7 @@ class PassThroughSuite extends SparkFunSuite { val builder = TestCompressibleColumnBuilder(columnStats, columnType, PassThrough) - input.map { value => + input.foreach { value => val row = new GenericInternalRow(1) columnType.setField(row, 0, value) builder.appendFrom(row, 0) @@ -98,7 +98,7 @@ class PassThroughSuite extends SparkFunSuite { val row = new GenericInternalRow(1) val nullRow = new GenericInternalRow(1) nullRow.setNullAt(0) - input.map { value => + input.foreach { value => if (value == nullValue) { builder.appendFrom(nullRow, 0) } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index 81965e4c6c353..c6a533dfae4d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -75,12 +75,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { }.head } - private def withCreateTableStatement(sql: String)(prediction: CreateTableStatement => Unit) - : Unit = { - val statement = parser.parsePlan(sql).asInstanceOf[CreateTableStatement] - prediction(statement) - } - test("alter database - property values must be set") { assertUnsupported( sql = "ALTER DATABASE my_db SET DBPROPERTIES('key_without_value', 'key_with_value'='x')", @@ -486,17 +480,21 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("Test CTAS #3") { val s3 = """CREATE TABLE page_view AS SELECT * FROM src""" - val statement = parser.parsePlan(s3).asInstanceOf[CreateTableAsSelectStatement] - assert(statement.tableName(0) == "page_view") - assert(statement.asSelect == parser.parsePlan("SELECT * FROM src")) - assert(statement.partitioning.isEmpty) - assert(statement.bucketSpec.isEmpty) - assert(statement.properties.isEmpty) - assert(statement.provider == conf.defaultDataSourceName) - assert(statement.options.isEmpty) - assert(statement.location.isEmpty) - assert(statement.comment.isEmpty) - assert(!statement.ifNotExists) + val (desc, exists) = extractTableDesc(s3) + assert(exists == false) + assert(desc.identifier.database == None) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.MANAGED) + assert(desc.storage.locationUri == None) + assert(desc.schema.isEmpty) + assert(desc.viewText == None) // TODO will be SQLText + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.storage.properties == Map()) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.TextInputFormat")) + assert(desc.storage.outputFormat == + Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + 
assert(desc.properties == Map()) } test("Test CTAS #4") { @@ -656,60 +654,67 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("create table - basic") { val query = "CREATE TABLE my_table (id int, name string)" - withCreateTableStatement(query) { state => - assert(state.tableName(0) == "my_table") - assert(state.tableSchema == new StructType().add("id", "int").add("name", "string")) - assert(state.partitioning.isEmpty) - assert(state.bucketSpec.isEmpty) - assert(state.properties.isEmpty) - assert(state.provider == conf.defaultDataSourceName) - assert(state.options.isEmpty) - assert(state.location.isEmpty) - assert(state.comment.isEmpty) - assert(!state.ifNotExists) - } + val (desc, allowExisting) = extractTableDesc(query) + assert(!allowExisting) + assert(desc.identifier.database.isEmpty) + assert(desc.identifier.table == "my_table") + assert(desc.tableType == CatalogTableType.MANAGED) + assert(desc.schema == new StructType().add("id", "int").add("name", "string")) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.bucketSpec.isEmpty) + assert(desc.viewText.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.storage.locationUri.isEmpty) + assert(desc.storage.inputFormat == + Some("org.apache.hadoop.mapred.TextInputFormat")) + assert(desc.storage.outputFormat == + Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(desc.storage.properties.isEmpty) + assert(desc.properties.isEmpty) + assert(desc.comment.isEmpty) } test("create table - with database name") { val query = "CREATE TABLE dbx.my_table (id int, name string)" - withCreateTableStatement(query) { state => - assert(state.tableName(0) == "dbx") - assert(state.tableName(1) == "my_table") - } + val (desc, _) = extractTableDesc(query) + assert(desc.identifier.database == Some("dbx")) + assert(desc.identifier.table == "my_table") } test("create table - temporary") { val query = "CREATE TEMPORARY TABLE tab1 (id int, name string)" val e = intercept[ParseException] { parser.parsePlan(query) } - assert(e.message.contains("CREATE TEMPORARY TABLE without a provider is not allowed.")) + assert(e.message.contains("CREATE TEMPORARY TABLE is not supported yet")) } test("create table - external") { val query = "CREATE EXTERNAL TABLE tab1 (id int, name string) LOCATION '/path/to/nowhere'" - val e = intercept[ParseException] { parser.parsePlan(query) } - assert(e.message.contains("Operation not allowed: CREATE EXTERNAL TABLE ...")) + val (desc, _) = extractTableDesc(query) + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/path/to/nowhere"))) } test("create table - if not exists") { val query = "CREATE TABLE IF NOT EXISTS tab1 (id int, name string)" - withCreateTableStatement(query) { state => - assert(state.ifNotExists) - } + val (_, allowExisting) = extractTableDesc(query) + assert(allowExisting) } test("create table - comment") { val query = "CREATE TABLE my_table (id int, name string) COMMENT 'its hot as hell below'" - withCreateTableStatement(query) { state => - assert(state.comment == Some("its hot as hell below")) - } + val (desc, _) = extractTableDesc(query) + assert(desc.comment == Some("its hot as hell below")) } test("create table - partitioned columns") { - val query = "CREATE TABLE my_table (id int, name string) PARTITIONED BY (id)" - withCreateTableStatement(query) { state => - val transform = 
IdentityTransform(FieldReference(Seq("id"))) - assert(state.partitioning == Seq(transform)) - } + val query = "CREATE TABLE my_table (id int, name string) PARTITIONED BY (month int)" + val (desc, _) = extractTableDesc(query) + assert(desc.schema == new StructType() + .add("id", "int") + .add("name", "string") + .add("month", "int")) + assert(desc.partitionColumnNames == Seq("month")) } test("create table - clustered by") { @@ -725,22 +730,20 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { """ val query1 = s"$baseQuery INTO $numBuckets BUCKETS" - withCreateTableStatement(query1) { state => - assert(state.bucketSpec.isDefined) - val bucketSpec = state.bucketSpec.get - assert(bucketSpec.numBuckets == numBuckets) - assert(bucketSpec.bucketColumnNames.head.equals(bucketedColumn)) - assert(bucketSpec.sortColumnNames.isEmpty) - } + val (desc1, _) = extractTableDesc(query1) + assert(desc1.bucketSpec.isDefined) + val bucketSpec1 = desc1.bucketSpec.get + assert(bucketSpec1.numBuckets == numBuckets) + assert(bucketSpec1.bucketColumnNames.head.equals(bucketedColumn)) + assert(bucketSpec1.sortColumnNames.isEmpty) val query2 = s"$baseQuery SORTED BY($sortColumn) INTO $numBuckets BUCKETS" - withCreateTableStatement(query2) { state => - assert(state.bucketSpec.isDefined) - val bucketSpec = state.bucketSpec.get - assert(bucketSpec.numBuckets == numBuckets) - assert(bucketSpec.bucketColumnNames.head.equals(bucketedColumn)) - assert(bucketSpec.sortColumnNames.head.equals(sortColumn)) - } + val (desc2, _) = extractTableDesc(query2) + assert(desc2.bucketSpec.isDefined) + val bucketSpec2 = desc2.bucketSpec.get + assert(bucketSpec2.numBuckets == numBuckets) + assert(bucketSpec2.bucketColumnNames.head.equals(bucketedColumn)) + assert(bucketSpec2.sortColumnNames.head.equals(sortColumn)) } test("create table(hive) - skewed by") { @@ -810,9 +813,8 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("create table - properties") { val query = "CREATE TABLE my_table (id int, name string) TBLPROPERTIES ('k1'='v1', 'k2'='v2')" - withCreateTableStatement(query) { state => - assert(state.properties == Map("k1" -> "v1", "k2" -> "v2")) - } + val (desc, _) = extractTableDesc(query) + assert(desc.properties == Map("k1" -> "v1", "k2" -> "v2")) } test("create table(hive) - everything!") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 913cd80a24c6e..5986cdc78d6b4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, NoSuchDatabaseException, NoSuchPartitionException, NoSuchTableException, TempTableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION @@ -188,7 +189,7 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSparkSession { withTable("t") { sql("CREATE TABLE t(i INT) USING parquet") val e = intercept[AnalysisException] 
{ - sql("ALTER TABLE t ALTER COLUMN i TYPE INT FIRST") + sql("ALTER TABLE t ALTER COLUMN i FIRST") } assert(e.message.contains("ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables")) } @@ -473,17 +474,12 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { withEmptyDirInTablePath("tab1") { tableLoc => val hiddenGarbageFile = new File(tableLoc.getCanonicalPath, ".garbage") hiddenGarbageFile.createNewFile() - val exMsg = "Can not create the managed table('`tab1`'). The associated location" val exMsgWithDefaultDB = "Can not create the managed table('`default`.`tab1`'). The associated location" var ex = intercept[AnalysisException] { sql(s"CREATE TABLE tab1 USING ${dataSource} AS SELECT 1, 'a'") }.getMessage - if (isUsingHiveMetastore) { - assert(ex.contains(exMsgWithDefaultDB)) - } else { - assert(ex.contains(exMsg)) - } + assert(ex.contains(exMsgWithDefaultDB)) ex = intercept[AnalysisException] { sql(s"CREATE TABLE tab1 (col1 int, col2 string) USING ${dataSource}") @@ -509,8 +505,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val ex = intercept[AnalysisException] { sql("ALTER TABLE tab1 RENAME TO tab2") }.getMessage - val expectedMsg = "Can not rename the managed table('`tab1`'). The associated location" - assert(ex.contains(expectedMsg)) + assert(ex.contains( + "Can not rename the managed table('`default`.`tab1`'). The associated location")) } } } @@ -640,7 +636,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val errMsg = intercept[AnalysisException] { sql(s"CREATE TABLE t($c0 INT, $c1 INT) USING parquet") }.getMessage - assert(errMsg.contains("Found duplicate column(s) in the table definition of `t`")) + assert(errMsg.contains( + "Found duplicate column(s) in the table definition of `default`.`t`")) } } } @@ -649,7 +646,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val e = intercept[AnalysisException] { sql("CREATE TABLE tbl(a int, b string) USING json PARTITIONED BY (c)") } - assert(e.message == "partition column c is not defined in table tbl, " + + assert(e.message == "partition column c is not defined in table default.tbl, " + "defined table columns are: a, b") } @@ -657,7 +654,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val e = intercept[AnalysisException] { sql("CREATE TABLE tbl(a int, b string) USING json CLUSTERED BY (c) INTO 4 BUCKETS") } - assert(e.message == "bucket column c is not defined in table tbl, " + + assert(e.message == "bucket column c is not defined in table default.tbl, " + "defined table columns are: a, b") } @@ -1037,7 +1034,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { df.write.insertInto("students") spark.catalog.cacheTable("students") checkAnswer(spark.table("students"), df) - assume(spark.catalog.isCached("students"), "bad test: table was not cached in the first place") + assert(spark.catalog.isCached("students"), "bad test: table was not cached in the first place") sql("ALTER TABLE students RENAME TO teachers") sql("CREATE TABLE students (age INT, name STRING) USING parquet") // Now we have both students and teachers. 
@@ -1207,14 +1204,24 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("alter table: recover partitions (sequential)") { - withSQLConf(RDD_PARALLEL_LISTING_THRESHOLD.key -> "10") { + val oldRddParallelListingThreshold = spark.sparkContext.conf.get( + RDD_PARALLEL_LISTING_THRESHOLD) + try { + spark.sparkContext.conf.set(RDD_PARALLEL_LISTING_THRESHOLD.key, "10") testRecoverPartitions() + } finally { + spark.sparkContext.conf.set(RDD_PARALLEL_LISTING_THRESHOLD, oldRddParallelListingThreshold) } } test("alter table: recover partition (parallel)") { - withSQLConf(RDD_PARALLEL_LISTING_THRESHOLD.key -> "0") { + val oldRddParallelListingThreshold = spark.sparkContext.conf.get( + RDD_PARALLEL_LISTING_THRESHOLD) + try { + spark.sparkContext.conf.set(RDD_PARALLEL_LISTING_THRESHOLD.key, "0") testRecoverPartitions() + } finally { + spark.sparkContext.conf.set(RDD_PARALLEL_LISTING_THRESHOLD, oldRddParallelListingThreshold) } } @@ -1724,6 +1731,14 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { // use int literal as partition value for int type partition column sql("ALTER TABLE tab1 DROP PARTITION (a=9, b=9)") assert(catalog.listPartitions(tableIdent).isEmpty) + + // null partition values + createTablePartition(catalog, Map("a" -> null, "b" -> null), tableIdent) + val nullPartValue = if (isUsingHiveMetastore) "__HIVE_DEFAULT_PARTITION__" else null + assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == + Set(Map("a" -> nullPartValue, "b" -> nullPartValue))) + sql("ALTER TABLE tab1 DROP PARTITION (a = null, b = null)") + assert(catalog.listPartitions(tableIdent).isEmpty) } protected def testRenamePartitions(isDatasourceTable: Boolean): Unit = { @@ -1786,7 +1801,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { column.map(_.metadata).getOrElse(Metadata.empty) } // Ensure that change column will preserve other metadata fields. - sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 TYPE INT COMMENT 'this is col1'") + sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 TYPE INT") + sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 COMMENT 'this is col1'") assert(getMetadata("col1").getString("key") == "value") assert(getMetadata("col1").getString("comment") == "this is col1") } @@ -1962,7 +1978,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { Seq("json", "parquet").foreach { format => withTable("rectangles") { data.write.format(format).saveAsTable("rectangles") - assume(spark.table("rectangles").collect().nonEmpty, + assert(spark.table("rectangles").collect().nonEmpty, "bad test; table was empty to begin with") sql("TRUNCATE TABLE rectangles") @@ -2041,6 +2057,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { // Set ACL to table path. val customAcl = new java.util.ArrayList[AclEntry]() customAcl.add(new AclEntry.Builder() + .setName("test") .setType(AclEntryType.USER) .setScope(AclEntryScope.ACCESS) .setPermission(FsAction.READ).build()) @@ -2060,14 +2077,53 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { if (ignore) { assert(aclEntries.size() == 0) } else { - assert(aclEntries.size() == 1) + assert(aclEntries.size() == 4) assert(aclEntries.get(0) == customAcl.get(0)) + + // Setting ACLs will also set user/group/other permissions + // as ACL entries. 
+ val user = new AclEntry.Builder() + .setType(AclEntryType.USER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + val group = new AclEntry.Builder() + .setType(AclEntryType.GROUP) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + val other = new AclEntry.Builder() + .setType(AclEntryType.OTHER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + assert(aclEntries.get(1) == user) + assert(aclEntries.get(2) == group) + assert(aclEntries.get(3) == other) } } } } } + test("SPARK-31163: acl/permission should handle non-existed path when truncating table") { + withSQLConf(SQLConf.TRUNCATE_TABLE_IGNORE_PERMISSION_ACL.key -> "false") { + withTable("tab1") { + sql("CREATE TABLE tab1 (col1 STRING, col2 INT) USING parquet PARTITIONED BY (col2)") + sql("INSERT INTO tab1 SELECT 'one', 1") + checkAnswer(spark.table("tab1"), Row("one", 1)) + val part = spark.sessionState.catalog.listPartitions(TableIdentifier("tab1")).head + val path = new File(part.location.getPath) + sql("TRUNCATE TABLE tab1") + // simulate incomplete/unsuccessful truncate + assert(path.exists()) + path.delete() + assert(!path.exists()) + // execute without java.io.FileNotFoundException + sql("TRUNCATE TABLE tab1") + // partition path should be re-created + assert(path.exists()) + } + } + } + test("create temporary view with mismatched schema") { withTable("tab1") { spark.range(10).write.saveAsTable("tab1") @@ -2966,16 +3022,16 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } - test("Add a directory when spark.sql.legacy.addDirectory.recursive.enabled set to true") { + test(s"Add a directory when ${SQLConf.LEGACY_ADD_SINGLE_FILE_IN_ADD_FILE.key} set to false") { val directoryToAdd = Utils.createTempDir("/tmp/spark/addDirectory/") val testFile = File.createTempFile("testFile", "1", directoryToAdd) spark.sql(s"ADD FILE $directoryToAdd") assert(new File(SparkFiles.get(s"${directoryToAdd.getName}/${testFile.getName}")).exists()) } - test("Add a directory when spark.sql.legacy.addDirectory.recursive.enabled not set to true") { + test(s"Add a directory when ${SQLConf.LEGACY_ADD_SINGLE_FILE_IN_ADD_FILE.key} set to true") { withTempDir { testDir => - withSQLConf(SQLConf.LEGACY_ADD_DIRECTORY_USING_RECURSIVE.key -> "false") { + withSQLConf(SQLConf.LEGACY_ADD_SINGLE_FILE_IN_ADD_FILE.key -> "true") { val msg = intercept[SparkException] { spark.sql(s"ADD FILE $testDir") }.getMessage @@ -2983,6 +3039,58 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } } + + test("SPARK-33588: case sensitivity of partition spec in SHOW TABLE") { + val t = "part_table" + withTable(t) { + sql(s""" + |CREATE TABLE $t (price int, qty int, year int, month int) + |USING $dataSource + |PARTITIONED BY (year, month)""".stripMargin) + sql(s"INSERT INTO $t PARTITION(year = 2015, month = 1) SELECT 1, 1") + Seq( + true -> "PARTITION(year = 2015, month = 1)", + false -> "PARTITION(YEAR = 2015, Month = 1)" + ).foreach { case (caseSensitive, partitionSpec) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val df = sql(s"SHOW TABLE EXTENDED LIKE '$t' $partitionSpec") + val information = df.select("information").first().getString(0) + assert(information.contains("Partition Values: [year=2015, month=1]")) + } + } + } + } + + test("SPARK-33667: case sensitivity of partition spec in SHOW PARTITIONS") { + val t = "part_table" + withTable(t) { + sql(s""" + |CREATE TABLE $t (price int, qty int, year int, month int) + |USING $dataSource + |PARTITIONED BY 
(year, month)""".stripMargin) + sql(s"INSERT INTO $t PARTITION(year = 2015, month = 1) SELECT 1, 1") + Seq( + true -> "PARTITION(year = 2015, month = 1)", + false -> "PARTITION(YEAR = 2015, Month = 1)" + ).foreach { case (caseSensitive, partitionSpec) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + checkAnswer( + sql(s"SHOW PARTITIONS $t $partitionSpec"), + Row("year=2015/month=1")) + } + } + } + } + + test("SPARK-33670: show partitions from a datasource table") { + import testImplicits._ + val t = "part_datasrc" + withTable(t) { + val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c") + df.write.partitionBy("a").format("parquet").mode(SaveMode.Overwrite).saveAsTable(t) + assert(sql(s"SHOW TABLE EXTENDED LIKE '$t' PARTITION(a = 1)").count() === 1) + } + } } object FakeLocalFsFileSystem { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index c0c3cd70fcc9e..8a3e1bb968efc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -37,11 +37,13 @@ import org.apache.spark.sql.connector.catalog.TableChange.{UpdateColumnComment, import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.SimpleScanSource import org.apache.spark.sql.types.{CharType, DoubleType, HIVE_TYPE_STRING, IntegerType, LongType, MetadataBuilder, StringType, StructField, StructType} class PlanResolutionSuite extends AnalysisTest { import CatalystSqlParser._ + private val v1Format = classOf[SimpleScanSource].getName private val v2Format = classOf[FakeV2Provider].getName private val table: Table = { @@ -61,6 +63,15 @@ class PlanResolutionSuite extends AnalysisTest { val t = mock(classOf[CatalogTable]) when(t.schema).thenReturn(new StructType().add("i", "int").add("s", "string")) when(t.tableType).thenReturn(CatalogTableType.MANAGED) + when(t.provider).thenReturn(Some(v1Format)) + V1Table(t) + } + + private val v1HiveTable: V1Table = { + val t = mock(classOf[CatalogTable]) + when(t.schema).thenReturn(new StructType().add("i", "int").add("s", "string")) + when(t.tableType).thenReturn(CatalogTableType.MANAGED) + when(t.provider).thenReturn(Some("hive")) V1Table(t) } @@ -83,6 +94,7 @@ class PlanResolutionSuite extends AnalysisTest { invocation.getArgument[Identifier](0).name match { case "v1Table" => v1Table case "v1Table1" => v1Table + case "v1HiveTable" => v1HiveTable case "v2Table" => table case "v2Table1" => table case "v2TableWithAcceptAnySchemaCapability" => tableWithAcceptAnySchemaCapability @@ -128,6 +140,7 @@ class PlanResolutionSuite extends AnalysisTest { } }) when(manager.currentCatalog).thenReturn(v2SessionCatalog) + when(manager.currentNamespace).thenReturn(Array("default")) when(manager.v1SessionCatalog).thenReturn(v1SessionCatalog) manager } @@ -145,7 +158,7 @@ class PlanResolutionSuite extends AnalysisTest { ResolveInlineTables(conf), analyzer.ResolveRelations, new ResolveCatalogs(catalogManager), - new ResolveSessionCatalog(catalogManager, conf, _ == Seq("v")), + new ResolveSessionCatalog(catalogManager, conf, _ == Seq("v"), _ => false), analyzer.ResolveTables, analyzer.ResolveReferences, analyzer.ResolveSubqueryColumnAliases, @@ -170,7 
+183,7 @@ class PlanResolutionSuite extends AnalysisTest { "USING parquet PARTITIONED BY (a)" val expectedTableDesc = CatalogTable( - identifier = TableIdentifier("my_tab"), + identifier = TableIdentifier("my_tab", Some("default")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType() @@ -214,7 +227,7 @@ class PlanResolutionSuite extends AnalysisTest { "CLUSTERED BY (a) SORTED BY (b) INTO 5 BUCKETS" val expectedTableDesc = CatalogTable( - identifier = TableIdentifier("my_tab"), + identifier = TableIdentifier("my_tab", Some("default")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType().add("a", IntegerType).add("b", StringType), @@ -235,7 +248,7 @@ class PlanResolutionSuite extends AnalysisTest { val sql = "CREATE TABLE my_tab(a INT, b STRING) USING parquet COMMENT 'abc'" val expectedTableDesc = CatalogTable( - identifier = TableIdentifier("my_tab"), + identifier = TableIdentifier("my_tab", Some("default")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType().add("a", IntegerType).add("b", StringType), @@ -255,7 +268,7 @@ class PlanResolutionSuite extends AnalysisTest { val sql = "CREATE TABLE my_tab(a INT, b STRING) USING parquet TBLPROPERTIES('test' = 'test')" val expectedTableDesc = CatalogTable( - identifier = TableIdentifier("my_tab"), + identifier = TableIdentifier("my_tab", Some("default")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty, schema = new StructType().add("a", IntegerType).add("b", StringType), @@ -275,7 +288,7 @@ class PlanResolutionSuite extends AnalysisTest { val v1 = "CREATE TABLE my_tab(a INT, b STRING) USING parquet LOCATION '/tmp/file'" val expectedTableDesc = CatalogTable( - identifier = TableIdentifier("my_tab"), + identifier = TableIdentifier("my_tab", Some("default")), tableType = CatalogTableType.EXTERNAL, storage = CatalogStorageFormat.empty.copy(locationUri = Some(new URI("/tmp/file"))), schema = new StructType().add("a", IntegerType).add("b", StringType), @@ -329,7 +342,7 @@ class PlanResolutionSuite extends AnalysisTest { """.stripMargin val expectedTableDesc = CatalogTable( - identifier = TableIdentifier("table_name"), + identifier = TableIdentifier("table_name", Some("default")), tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat.empty.copy( properties = Map("a" -> "1", "b" -> "0.1", "c" -> "true") @@ -539,7 +552,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(ctas.catalog.name == "testcat") assert(ctas.tableName == Identifier.of(Array("mydb"), "table_name")) assert(ctas.properties == expectedProperties) - assert(ctas.writeOptions == Map("other" -> "20")) + assert(ctas.writeOptions.isEmpty) assert(ctas.partitioning.isEmpty) assert(ctas.ignoreIfExists) @@ -573,7 +586,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(ctas.catalog.name == "testcat") assert(ctas.tableName == Identifier.of(Array("mydb"), "table_name")) assert(ctas.properties == expectedProperties) - assert(ctas.writeOptions == Map("other" -> "20")) + assert(ctas.writeOptions.isEmpty) assert(ctas.partitioning.isEmpty) assert(ctas.ignoreIfExists) @@ -620,7 +633,7 @@ class PlanResolutionSuite extends AnalysisTest { val tableName1 = "db.tab" val tableIdent1 = TableIdentifier("tab", Option("db")) val tableName2 = "tab" - val tableIdent2 = TableIdentifier("tab", None) + val tableIdent2 = TableIdentifier("tab", Some("default")) parseResolveCompare(s"DROP TABLE $tableName1", 
DropTableCommand(tableIdent1, ifExists = false, isView = false, purge = false)) @@ -656,7 +669,7 @@ class PlanResolutionSuite extends AnalysisTest { val viewName1 = "db.view" val viewIdent1 = TableIdentifier("view", Option("db")) val viewName2 = "view" - val viewIdent2 = TableIdentifier("view") + val viewIdent2 = TableIdentifier("view", Option("default")) parseResolveCompare(s"DROP VIEW $viewName1", DropTableCommand(viewIdent1, ifExists = false, isView = true, purge = false)) @@ -687,7 +700,7 @@ class PlanResolutionSuite extends AnalysisTest { val parsed2_view = parseAndResolve(sql2_view) val parsed3_view = parseAndResolve(sql3_view) - val tableIdent = TableIdentifier("table_name", None) + val tableIdent = TableIdentifier("table_name", Some("default")) val expected1_view = AlterTableSetPropertiesCommand( tableIdent, Map("test" -> "test", "comment" -> "new_comment"), isView = true) val expected2_view = AlterTableUnsetPropertiesCommand( @@ -714,8 +727,8 @@ class PlanResolutionSuite extends AnalysisTest { val parsed2 = parseAndResolve(sql2) val parsed3 = parseAndResolve(sql3) - val tableIdent = TableIdentifier(tblName, None) if (useV1Command) { + val tableIdent = TableIdentifier(tblName, Some("default")) val expected1 = AlterTableSetPropertiesCommand( tableIdent, Map("test" -> "test", "comment" -> "new_comment"), isView = false) val expected2 = AlterTableUnsetPropertiesCommand( @@ -780,7 +793,7 @@ class PlanResolutionSuite extends AnalysisTest { val parsed = parseAndResolve(sql) if (useV1Command) { val expected = AlterTableSetPropertiesCommand( - TableIdentifier(tblName), + TableIdentifier(tblName, Some("default")), Map("a" -> "1", "b" -> "0.1", "c" -> "true"), isView = false) @@ -805,7 +818,7 @@ class PlanResolutionSuite extends AnalysisTest { val parsed = parseAndResolve(sql) if (useV1Command) { val expected = AlterTableSetLocationCommand( - TableIdentifier(tblName, None), + TableIdentifier(tblName, Some("default")), None, "new location") comparePlans(parsed, expected) @@ -827,8 +840,10 @@ class PlanResolutionSuite extends AnalysisTest { val parsed1 = parseAndResolve(sql1) val parsed2 = parseAndResolve(sql2) if (useV1Command) { - val expected1 = DescribeTableCommand(TableIdentifier(tblName, None), Map.empty, false) - val expected2 = DescribeTableCommand(TableIdentifier(tblName, None), Map.empty, true) + val expected1 = DescribeTableCommand( + TableIdentifier(tblName, Some("default")), Map.empty, false) + val expected2 = DescribeTableCommand( + TableIdentifier(tblName, Some("default")), Map.empty, true) comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) @@ -850,7 +865,7 @@ class PlanResolutionSuite extends AnalysisTest { val parsed3 = parseAndResolve(sql3) if (useV1Command) { val expected3 = DescribeTableCommand( - TableIdentifier(tblName, None), Map("a" -> "1"), false) + TableIdentifier(tblName, Some("default")), Map("a" -> "1"), false) comparePlans(parsed3, expected3) } else { parsed3 match { @@ -885,33 +900,34 @@ class PlanResolutionSuite extends AnalysisTest { val parsed4 = parseAndResolve(sql4) parsed1 match { - case DeleteFromTable(_: DataSourceV2Relation, None) => - case _ => fail("Expect DeleteFromTable, bug got:\n" + parsed1.treeString) + case DeleteFromTable(AsDataSourceV2Relation(_), None) => + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed1.treeString) } parsed2 match { case DeleteFromTable( - _: DataSourceV2Relation, + AsDataSourceV2Relation(_), Some(EqualTo(name: UnresolvedAttribute, StringLiteral("Robert")))) => assert(name.name == "name") - case _ 
=> fail("Expect DeleteFromTable, bug got:\n" + parsed2.treeString) + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed2.treeString) } parsed3 match { case DeleteFromTable( - SubqueryAlias(AliasIdentifier("t", None), _: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), Some(EqualTo(name: UnresolvedAttribute, StringLiteral("Robert")))) => assert(name.name == "t.name") - case _ => fail("Expect DeleteFromTable, bug got:\n" + parsed3.treeString) + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed3.treeString) } parsed4 match { - case DeleteFromTable(SubqueryAlias(AliasIdentifier("t", None), _: DataSourceV2Relation), + case DeleteFromTable( + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), Some(InSubquery(values, query))) => assert(values.size == 1 && values.head.isInstanceOf[UnresolvedAttribute]) assert(values.head.asInstanceOf[UnresolvedAttribute].name == "t.name") query match { - case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", None), + case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", Seq()), UnresolvedSubqueryColumnAliases(outputColumnNames, Project(_, _: OneRowRelation)))), _, _, _) => assert(projects.size == 1 && projects.head.name == "s.name") @@ -944,7 +960,7 @@ class PlanResolutionSuite extends AnalysisTest { parsed1 match { case UpdateTable( - _: DataSourceV2Relation, + AsDataSourceV2Relation(_), Seq(Assignment(name: UnresolvedAttribute, StringLiteral("Robert")), Assignment(age: UnresolvedAttribute, IntegerLiteral(32))), None) => @@ -956,7 +972,7 @@ class PlanResolutionSuite extends AnalysisTest { parsed2 match { case UpdateTable( - SubqueryAlias(AliasIdentifier("t", None), _: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), Seq(Assignment(name: UnresolvedAttribute, StringLiteral("Robert")), Assignment(age: UnresolvedAttribute, IntegerLiteral(32))), None) => @@ -968,7 +984,7 @@ class PlanResolutionSuite extends AnalysisTest { parsed3 match { case UpdateTable( - SubqueryAlias(AliasIdentifier("t", None), _: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), Seq(Assignment(name: UnresolvedAttribute, StringLiteral("Robert")), Assignment(age: UnresolvedAttribute, IntegerLiteral(32))), Some(EqualTo(p: UnresolvedAttribute, IntegerLiteral(1)))) => @@ -980,14 +996,14 @@ class PlanResolutionSuite extends AnalysisTest { } parsed4 match { - case UpdateTable(SubqueryAlias(AliasIdentifier("t", None), _: DataSourceV2Relation), + case UpdateTable(SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), Seq(Assignment(key: UnresolvedAttribute, IntegerLiteral(32))), Some(InSubquery(values, query))) => assert(key.name == "t.age") assert(values.size == 1 && values.head.isInstanceOf[UnresolvedAttribute]) assert(values.head.asInstanceOf[UnresolvedAttribute].name == "t.name") query match { - case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", None), + case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", Seq()), UnresolvedSubqueryColumnAliases(outputColumnNames, Project(_, _: OneRowRelation)))), _, _, _) => assert(projects.size == 1 && projects.head.name == "s.name") @@ -1012,27 +1028,29 @@ class PlanResolutionSuite extends AnalysisTest { Seq("v1Table" -> true, "v2Table" -> false, "testcat.tab" -> false).foreach { case (tblName, useV1Command) => val sql1 = s"ALTER TABLE $tblName ALTER COLUMN i TYPE bigint" - val sql2 = s"ALTER TABLE $tblName ALTER 
COLUMN i TYPE bigint COMMENT 'new comment'" - val sql3 = s"ALTER TABLE $tblName ALTER COLUMN i COMMENT 'new comment'" + val sql2 = s"ALTER TABLE $tblName ALTER COLUMN i COMMENT 'new comment'" val parsed1 = parseAndResolve(sql1) val parsed2 = parseAndResolve(sql2) - val tableIdent = TableIdentifier(tblName, None) if (useV1Command) { + val tableIdent = TableIdentifier(tblName, Some("default")) + val oldColumn = StructField("i", IntegerType) val newColumn = StructField("i", LongType) val expected1 = AlterTableChangeColumnCommand( tableIdent, "i", newColumn) val expected2 = AlterTableChangeColumnCommand( - tableIdent, "i", newColumn.withComment("new comment")) + tableIdent, "i", oldColumn.withComment("new comment")) comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) + val sql3 = s"ALTER TABLE $tblName ALTER COLUMN j COMMENT 'new comment'" val e1 = intercept[AnalysisException] { parseAndResolve(sql3) } - assert(e1.getMessage.contains("ALTER COLUMN with v1 tables must specify new data type")) + assert(e1.getMessage.contains( + "ALTER COLUMN cannot find column j in v1 table. Available: i, s")) val sql4 = s"ALTER TABLE $tblName ALTER COLUMN a.b.c TYPE bigint" val e2 = intercept[AnalysisException] { @@ -1040,18 +1058,7 @@ class PlanResolutionSuite extends AnalysisTest { } assert(e2.getMessage.contains( "ALTER COLUMN with qualified column is only supported with v2 tables")) - - val sql5 = s"ALTER TABLE $tblName ALTER COLUMN i TYPE char(1)" - val builder = new MetadataBuilder - builder.putString(HIVE_TYPE_STRING, CharType(1).catalogString) - val newColumnWithCleanedType = StructField("i", StringType, true, builder.build()) - val expected5 = AlterTableChangeColumnCommand( - tableIdent, "i", newColumnWithCleanedType) - val parsed5 = parseAndResolve(sql5) - comparePlans(parsed5, expected5) } else { - val parsed3 = parseAndResolve(sql3) - parsed1 match { case AlterTable(_, _, _: DataSourceV2Relation, changes) => assert(changes == Seq( @@ -1062,18 +1069,50 @@ class PlanResolutionSuite extends AnalysisTest { parsed2 match { case AlterTable(_, _, _: DataSourceV2Relation, changes) => assert(changes == Seq( - TableChange.updateColumnType(Array("i"), LongType), TableChange.updateColumnComment(Array("i"), "new comment"))) case _ => fail("expect AlterTable") } + } + } - parsed3 match { - case AlterTable(_, _, _: DataSourceV2Relation, changes) => - assert(changes == Seq( - TableChange.updateColumnComment(Array("i"), "new comment"))) - case _ => fail("expect AlterTable") + val sql = s"ALTER TABLE v1HiveTable ALTER COLUMN i TYPE char(1)" + val builder = new MetadataBuilder + builder.putString(HIVE_TYPE_STRING, CharType(1).catalogString) + val newColumnWithCleanedType = StructField("i", StringType, true, builder.build()) + val expected = AlterTableChangeColumnCommand( + TableIdentifier("v1HiveTable", Some("default")), "i", newColumnWithCleanedType) + val parsed = parseAndResolve(sql) + comparePlans(parsed, expected) + } + + test("alter table: alter column action is not specified") { + val e = intercept[AnalysisException] { + parseAndResolve("ALTER TABLE v1Table ALTER COLUMN i") + } + assert(e.getMessage.contains( + "ALTER TABLE table ALTER COLUMN requires a TYPE, a SET/DROP, a COMMENT, or a FIRST/AFTER")) + } + + test("alter table: alter column case sensitivity for v1 table") { + val tblName = "v1Table" + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val sql = s"ALTER TABLE $tblName ALTER COLUMN I COMMENT 'new comment'" + if 
(caseSensitive) { + val e = intercept[AnalysisException] { + parseAndResolve(sql) } + assert(e.getMessage.contains( + "ALTER COLUMN cannot find column I in v1 table. Available: i, s")) + } else { + val actual = parseAndResolve(sql) + val expected = AlterTableChangeColumnCommand( + TableIdentifier(tblName, Some("default")), + "I", + StructField("I", IntegerType).withComment("new comment")) + comparePlans(actual, expected) } + } } } @@ -1100,7 +1139,7 @@ class PlanResolutionSuite extends AnalysisTest { } val DSV2ResolutionTests = { - val v2SessionCatalogTable = s"${CatalogManager.SESSION_CATALOG_NAME}.v2Table" + val v2SessionCatalogTable = s"${CatalogManager.SESSION_CATALOG_NAME}.default.v2Table" Seq( ("ALTER TABLE testcat.tab ALTER COLUMN i TYPE bigint", false), ("ALTER TABLE tab ALTER COLUMN i TYPE bigint", false), @@ -1116,7 +1155,7 @@ class PlanResolutionSuite extends AnalysisTest { (s"SHOW TBLPROPERTIES $v2SessionCatalogTable", true), ("SELECT * from tab", false), ("SELECT * from testcat.tab", false), - (s"SELECT * from ${CatalogManager.SESSION_CATALOG_NAME}.v2Table", true) + (s"SELECT * from $v2SessionCatalogTable", true) ) } @@ -1129,7 +1168,7 @@ class PlanResolutionSuite extends AnalysisTest { case AlterTable(_, _, r: DataSourceV2Relation, _) => assert(r.catalog.exists(_ == catlogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) - case Project(_, r: DataSourceV2Relation) => + case Project(_, AsDataSourceV2Relation(r)) => assert(r.catalog.exists(_ == catlogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) case InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _) => @@ -1206,8 +1245,8 @@ class PlanResolutionSuite extends AnalysisTest { """.stripMargin parseAndResolve(sql1) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), source: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(source)), mergeCondition, Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), @@ -1232,8 +1271,8 @@ class PlanResolutionSuite extends AnalysisTest { """.stripMargin parseAndResolve(sql2) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), source: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(source)), mergeCondition, Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(ul: AttributeReference, @@ -1252,16 +1291,16 @@ class PlanResolutionSuite extends AnalysisTest { |MERGE INTO $target AS target |USING $source AS source |ON target.i = source.i - |WHEN MATCHED THEN DELETE + |WHEN MATCHED AND (target.s='delete') THEN DELETE |WHEN MATCHED THEN UPDATE SET target.s = source.s |WHEN NOT MATCHED THEN INSERT (target.i, target.s) values (source.i, source.s) """.stripMargin parseAndResolve(sql3) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), source: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + 
SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(source)), mergeCondition, - Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), + Seq(DeleteAction(Some(_)), UpdateAction(None, updateAssigns)), Seq(InsertAction(None, insertAssigns))) => checkResolution(target, source, mergeCondition, None, None, None, updateAssigns, insertAssigns) @@ -1282,8 +1321,8 @@ class PlanResolutionSuite extends AnalysisTest { """.stripMargin parseAndResolve(sql4) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), source: Project), + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), source: Project), mergeCondition, Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), @@ -1311,8 +1350,8 @@ class PlanResolutionSuite extends AnalysisTest { """.stripMargin parseAndResolve(sql5) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), target: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), source: Project), + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), source: Project), mergeCondition, Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), @@ -1339,17 +1378,17 @@ class PlanResolutionSuite extends AnalysisTest { |MERGE INTO $target |USING $source |ON 1 = 1 - |WHEN MATCHED THEN DELETE + |WHEN MATCHED AND (${target}.s='delete') THEN DELETE |WHEN MATCHED THEN UPDATE SET s = 1 |WHEN NOT MATCHED AND (s = 'a') THEN INSERT (i) values (i) """.stripMargin parseAndResolve(sql1) match { case MergeIntoTable( - target: DataSourceV2Relation, - source: DataSourceV2Relation, + AsDataSourceV2Relation(target), + AsDataSourceV2Relation(source), _, - Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), + Seq(DeleteAction(Some(_)), UpdateAction(None, updateAssigns)), Seq(InsertAction( Some(EqualTo(il: AttributeReference, StringLiteral("a"))), insertAssigns))) => @@ -1426,7 +1465,7 @@ class PlanResolutionSuite extends AnalysisTest { |MERGE INTO non_exist_target |USING non_exist_source |ON target.i = source.i - |WHEN MATCHED THEN DELETE + |WHEN MATCHED AND (non_exist_target.s='delete') THEN DELETE |WHEN MATCHED THEN UPDATE SET * |WHEN NOT MATCHED THEN INSERT * """.stripMargin @@ -1453,8 +1492,8 @@ class PlanResolutionSuite extends AnalysisTest { parseAndResolve(sql) match { case MergeIntoTable( - SubqueryAlias(AliasIdentifier("target", None), _: DataSourceV2Relation), - SubqueryAlias(AliasIdentifier("source", None), _: DataSourceV2Relation), + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(_)), + SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(_)), EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), Seq( DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), @@ -1479,5 +1518,48 @@ class PlanResolutionSuite extends AnalysisTest { case l => fail("Expected unresolved MergeIntoTable, but got:\n" + l.treeString) } } + + test("SPARK-31147: forbid CHAR type in non-Hive tables") { + def checkFailure(t: String, provider: String): Unit = { + val types = Seq( + "CHAR(2)", + "ARRAY", + "MAP", + "MAP", + "STRUCT") + 
types.foreach { tpe => + intercept[AnalysisException] { + parseAndResolve(s"CREATE TABLE $t(col $tpe) USING $provider") + } + intercept[AnalysisException] { + parseAndResolve(s"REPLACE TABLE $t(col $tpe) USING $provider") + } + intercept[AnalysisException] { + parseAndResolve(s"CREATE OR REPLACE TABLE $t(col $tpe) USING $provider") + } + intercept[AnalysisException] { + parseAndResolve(s"ALTER TABLE $t ADD COLUMN col $tpe") + } + intercept[AnalysisException] { + parseAndResolve(s"ALTER TABLE $t ADD COLUMN col $tpe") + } + intercept[AnalysisException] { + parseAndResolve(s"ALTER TABLE $t ALTER COLUMN col TYPE $tpe") + } + } + } + + checkFailure("v1Table", v1Format) + checkFailure("v2Table", v2Format) + checkFailure("testcat.tab", "foo") + } + // TODO: add tests for more commands. } + +object AsDataSourceV2Relation { + def unapply(plan: LogicalPlan): Option[DataSourceV2Relation] = plan match { + case SubqueryAlias(_, r: DataSourceV2Relation) => Some(r) + case _ => None + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala index b76db70494cf8..b94918eccd46e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala @@ -22,68 +22,128 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.sources import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} class DataSourceStrategySuite extends PlanTest with SharedSparkSession { - - test("translate simple expression") { - val attrInt = 'cint.int - val attrStr = 'cstr.string - - testTranslateFilter(EqualTo(attrInt, 1), Some(sources.EqualTo("cint", 1))) - testTranslateFilter(EqualTo(1, attrInt), Some(sources.EqualTo("cint", 1))) + val attrInts = Seq( + 'cint.int, + Symbol("c.int").int, + GetStructField('a.struct(StructType( + StructField("cstr", StringType, nullable = true) :: + StructField("cint", IntegerType, nullable = true) :: Nil)), 1, None), + GetStructField('a.struct(StructType( + StructField("c.int", IntegerType, nullable = true) :: + StructField("cstr", StringType, nullable = true) :: Nil)), 0, None), + GetStructField(Symbol("a.b").struct(StructType( + StructField("cstr1", StringType, nullable = true) :: + StructField("cstr2", StringType, nullable = true) :: + StructField("cint", IntegerType, nullable = true) :: Nil)), 2, None), + GetStructField(Symbol("a.b").struct(StructType( + StructField("c.int", IntegerType, nullable = true) :: Nil)), 0, None), + GetStructField(GetStructField('a.struct(StructType( + StructField("cstr1", StringType, nullable = true) :: + StructField("b", StructType(StructField("cint", IntegerType, nullable = true) :: + StructField("cstr2", StringType, nullable = true) :: Nil)) :: Nil)), 1, None), 0, None) + ).zip(Seq( + "cint", + "`c.int`", // single level field that contains `dot` in name + "a.cint", // two level nested field + "a.`c.int`", // two level nested field, and nested level contains `dot` + "`a.b`.cint", // two level nested field, and top level contains `dot` + "`a.b`.`c.int`", // two level nested field, and both levels contain `dot` + "a.b.cint" // three level nested field + )) + + val attrStrs = Seq( + 'cstr.string, + Symbol("c.str").string, 
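// Illustrative sketch (not from the patch) of the extractor pattern that the new
// AsDataSourceV2Relation object above implements: an `unapply` that strips an alias so
// pattern matches stay short. The toy types below are assumptions used only to keep the
// example self-contained.
//
//   sealed trait Plan
//   case class Relation(name: String) extends Plan
//   case class Alias(name: String, child: Plan) extends Plan
//
//   object AsRelation {
//     // Matches a relation sitting directly under an alias, mirroring the real extractor.
//     def unapply(plan: Plan): Option[Relation] = plan match {
//       case Alias(_, r: Relation) => Some(r)
//       case _ => None
//     }
//   }
//
//   Alias("target", Relation("v2Table")) match {
//     case AsRelation(r) => assert(r.name == "v2Table")
//     case _ => sys.error("expected an aliased relation")
//   }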
+ GetStructField('a.struct(StructType( + StructField("cint", IntegerType, nullable = true) :: + StructField("cstr", StringType, nullable = true) :: Nil)), 1, None), + GetStructField('a.struct(StructType( + StructField("c.str", StringType, nullable = true) :: + StructField("cint", IntegerType, nullable = true) :: Nil)), 0, None), + GetStructField(Symbol("a.b").struct(StructType( + StructField("cint1", IntegerType, nullable = true) :: + StructField("cint2", IntegerType, nullable = true) :: + StructField("cstr", StringType, nullable = true) :: Nil)), 2, None), + GetStructField(Symbol("a.b").struct(StructType( + StructField("c.str", StringType, nullable = true) :: Nil)), 0, None), + GetStructField(GetStructField('a.struct(StructType( + StructField("cint1", IntegerType, nullable = true) :: + StructField("b", StructType(StructField("cstr", StringType, nullable = true) :: + StructField("cint2", IntegerType, nullable = true) :: Nil)) :: Nil)), 1, None), 0, None) + ).zip(Seq( + "cstr", + "`c.str`", // single level field that contains `dot` in name + "a.cstr", // two level nested field + "a.`c.str`", // two level nested field, and nested level contains `dot` + "`a.b`.cstr", // two level nested field, and top level contains `dot` + "`a.b`.`c.str`", // two level nested field, and both levels contain `dot` + "a.b.cstr" // three level nested field + )) + + test("translate simple expression") { attrInts.zip(attrStrs) + .foreach { case ((attrInt, intColName), (attrStr, strColName)) => + + testTranslateFilter(EqualTo(attrInt, 1), Some(sources.EqualTo(intColName, 1))) + testTranslateFilter(EqualTo(1, attrInt), Some(sources.EqualTo(intColName, 1))) testTranslateFilter(EqualNullSafe(attrStr, Literal(null)), - Some(sources.EqualNullSafe("cstr", null))) + Some(sources.EqualNullSafe(strColName, null))) testTranslateFilter(EqualNullSafe(Literal(null), attrStr), - Some(sources.EqualNullSafe("cstr", null))) + Some(sources.EqualNullSafe(strColName, null))) - testTranslateFilter(GreaterThan(attrInt, 1), Some(sources.GreaterThan("cint", 1))) - testTranslateFilter(GreaterThan(1, attrInt), Some(sources.LessThan("cint", 1))) + testTranslateFilter(GreaterThan(attrInt, 1), Some(sources.GreaterThan(intColName, 1))) + testTranslateFilter(GreaterThan(1, attrInt), Some(sources.LessThan(intColName, 1))) - testTranslateFilter(LessThan(attrInt, 1), Some(sources.LessThan("cint", 1))) - testTranslateFilter(LessThan(1, attrInt), Some(sources.GreaterThan("cint", 1))) + testTranslateFilter(LessThan(attrInt, 1), Some(sources.LessThan(intColName, 1))) + testTranslateFilter(LessThan(1, attrInt), Some(sources.GreaterThan(intColName, 1))) - testTranslateFilter(GreaterThanOrEqual(attrInt, 1), Some(sources.GreaterThanOrEqual("cint", 1))) - testTranslateFilter(GreaterThanOrEqual(1, attrInt), Some(sources.LessThanOrEqual("cint", 1))) + testTranslateFilter(GreaterThanOrEqual(attrInt, 1), + Some(sources.GreaterThanOrEqual(intColName, 1))) + testTranslateFilter(GreaterThanOrEqual(1, attrInt), + Some(sources.LessThanOrEqual(intColName, 1))) - testTranslateFilter(LessThanOrEqual(attrInt, 1), Some(sources.LessThanOrEqual("cint", 1))) - testTranslateFilter(LessThanOrEqual(1, attrInt), Some(sources.GreaterThanOrEqual("cint", 1))) + testTranslateFilter(LessThanOrEqual(attrInt, 1), + Some(sources.LessThanOrEqual(intColName, 1))) + testTranslateFilter(LessThanOrEqual(1, attrInt), + Some(sources.GreaterThanOrEqual(intColName, 1))) - testTranslateFilter(InSet(attrInt, Set(1, 2, 3)), Some(sources.In("cint", Array(1, 2, 3)))) + 
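// Illustrative sketch (not from the patch): the expected strings zipped with the
// expressions above follow a simple quoting rule -- a name part containing a dot is
// wrapped in backticks, and nested parts are joined with '.'. The helper names here are
// assumptions; Spark has its own quoting utilities.
//
//   def quoteIfNeeded(part: String): String =
//     if (part.contains(".")) s"`$part`" else part
//
//   def expectedPushedName(parts: Seq[String]): String =
//     parts.map(quoteIfNeeded).mkString(".")
//
//   assert(expectedPushedName(Seq("a", "c.int")) == "a.`c.int`")
//   assert(expectedPushedName(Seq("a.b", "cint")) == "`a.b`.cint")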
testTranslateFilter(InSet(attrInt, Set(1, 2, 3)), Some(sources.In(intColName, Array(1, 2, 3)))) - testTranslateFilter(In(attrInt, Seq(1, 2, 3)), Some(sources.In("cint", Array(1, 2, 3)))) + testTranslateFilter(In(attrInt, Seq(1, 2, 3)), Some(sources.In(intColName, Array(1, 2, 3)))) - testTranslateFilter(IsNull(attrInt), Some(sources.IsNull("cint"))) - testTranslateFilter(IsNotNull(attrInt), Some(sources.IsNotNull("cint"))) + testTranslateFilter(IsNull(attrInt), Some(sources.IsNull(intColName))) + testTranslateFilter(IsNotNull(attrInt), Some(sources.IsNotNull(intColName))) // cint > 1 AND cint < 10 testTranslateFilter(And( GreaterThan(attrInt, 1), LessThan(attrInt, 10)), Some(sources.And( - sources.GreaterThan("cint", 1), - sources.LessThan("cint", 10)))) + sources.GreaterThan(intColName, 1), + sources.LessThan(intColName, 10)))) // cint >= 8 OR cint <= 2 testTranslateFilter(Or( GreaterThanOrEqual(attrInt, 8), LessThanOrEqual(attrInt, 2)), Some(sources.Or( - sources.GreaterThanOrEqual("cint", 8), - sources.LessThanOrEqual("cint", 2)))) + sources.GreaterThanOrEqual(intColName, 8), + sources.LessThanOrEqual(intColName, 2)))) testTranslateFilter(Not(GreaterThanOrEqual(attrInt, 8)), - Some(sources.Not(sources.GreaterThanOrEqual("cint", 8)))) + Some(sources.Not(sources.GreaterThanOrEqual(intColName, 8)))) - testTranslateFilter(StartsWith(attrStr, "a"), Some(sources.StringStartsWith("cstr", "a"))) + testTranslateFilter(StartsWith(attrStr, "a"), Some(sources.StringStartsWith(strColName, "a"))) - testTranslateFilter(EndsWith(attrStr, "a"), Some(sources.StringEndsWith("cstr", "a"))) + testTranslateFilter(EndsWith(attrStr, "a"), Some(sources.StringEndsWith(strColName, "a"))) - testTranslateFilter(Contains(attrStr, "a"), Some(sources.StringContains("cstr", "a"))) - } + testTranslateFilter(Contains(attrStr, "a"), Some(sources.StringContains(strColName, "a"))) + }} - test("translate complex expression") { - val attrInt = 'cint.int + test("translate complex expression") { attrInts.foreach { case (attrInt, intColName) => // ABS(cint) - 2 <= 1 testTranslateFilter(LessThanOrEqual( @@ -102,11 +162,11 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { LessThan(attrInt, 100))), Some(sources.Or( sources.And( - sources.GreaterThan("cint", 1), - sources.LessThan("cint", 10)), + sources.GreaterThan(intColName, 1), + sources.LessThan(intColName, 10)), sources.And( - sources.GreaterThan("cint", 50), - sources.LessThan("cint", 100))))) + sources.GreaterThan(intColName, 50), + sources.LessThan(intColName, 100))))) // SPARK-22548 Incorrect nested AND expression pushed down to JDBC data source // (cint > 1 AND ABS(cint) < 10) OR (cint < 50 AND cint > 100) @@ -142,11 +202,11 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { LessThan(attrInt, -10))), Some(sources.Or( sources.Or( - sources.EqualTo("cint", 1), - sources.EqualTo("cint", 10)), + sources.EqualTo(intColName, 1), + sources.EqualTo(intColName, 10)), sources.Or( - sources.GreaterThan("cint", 0), - sources.LessThan("cint", -10))))) + sources.GreaterThan(intColName, 0), + sources.LessThan(intColName, -10))))) // (cint = 1 OR ABS(cint) = 10) OR (cint > 0 OR cint < -10) testTranslateFilter(Or( @@ -173,11 +233,11 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { IsNotNull(attrInt))), Some(sources.And( sources.And( - sources.GreaterThan("cint", 1), - sources.LessThan("cint", 10)), + sources.GreaterThan(intColName, 1), + sources.LessThan(intColName, 10)), sources.And( - sources.EqualTo("cint", 
6), - sources.IsNotNull("cint"))))) + sources.EqualTo(intColName, 6), + sources.IsNotNull(intColName))))) // (cint > 1 AND cint < 10) AND (ABS(cint) = 6 AND cint IS NOT NULL) testTranslateFilter(And( @@ -201,11 +261,11 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { IsNotNull(attrInt))), Some(sources.And( sources.Or( - sources.GreaterThan("cint", 1), - sources.LessThan("cint", 10)), + sources.GreaterThan(intColName, 1), + sources.LessThan(intColName, 10)), sources.Or( - sources.EqualTo("cint", 6), - sources.IsNotNull("cint"))))) + sources.EqualTo(intColName, 6), + sources.IsNotNull(intColName))))) // (cint > 1 OR cint < 10) AND (cint = 6 OR cint IS NOT NULL) testTranslateFilter(And( @@ -217,7 +277,7 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { // Functions such as 'Abs' are not supported EqualTo(Abs(attrInt), 6), IsNotNull(attrInt))), None) - } + }} test("SPARK-26865 DataSourceV2Strategy should push normalized filters") { val attrInt = 'cint.int @@ -226,13 +286,38 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { } } + test("SPARK-31027 test `PushableColumn.unapply` that finds the column name of " + + "an expression that can be pushed down") { + attrInts.foreach { case (attrInt, colName) => + assert(PushableColumnAndNestedColumn.unapply(attrInt) === Some(colName)) + + if (colName.contains(".")) { + assert(PushableColumnWithoutNestedColumn.unapply(attrInt) === None) + } else { + assert(PushableColumnWithoutNestedColumn.unapply(attrInt) === Some(colName)) + } + } + attrStrs.foreach { case (attrStr, colName) => + assert(PushableColumnAndNestedColumn.unapply(attrStr) === Some(colName)) + + if (colName.contains(".")) { + assert(PushableColumnWithoutNestedColumn.unapply(attrStr) === None) + } else { + assert(PushableColumnWithoutNestedColumn.unapply(attrStr) === Some(colName)) + } + } + + // `Abs(col)` can not be pushed down, so it returns `None` + assert(PushableColumnAndNestedColumn.unapply(Abs('col.int)) === None) + } + /** * Translate the given Catalyst [[Expression]] into data source [[sources.Filter]] * then verify against the given [[sources.Filter]]. 
*/ def testTranslateFilter(catalystFilter: Expression, result: Option[sources.Filter]): Unit = { assertResult(result) { - DataSourceStrategy.translateFilter(catalystFilter) + DataSourceStrategy.translateFilter(catalystFilter, true) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala index 553773e2555cf..c412d953d7037 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala @@ -362,13 +362,15 @@ class FileIndexSuite extends SharedSparkSession { val wrongBasePath = new File(dir, "unknown") // basePath must be a directory wrongBasePath.mkdir() - val parameters = Map("basePath" -> wrongBasePath.getCanonicalPath) - val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None) - val msg = intercept[IllegalArgumentException] { - // trigger inferPartitioning() - fileIndex.partitionSpec() - }.getMessage - assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path") + withClue("SPARK-32368: 'basePath' can be case insensitive") { + val parameters = Map("bAsepAtH" -> wrongBasePath.getCanonicalPath) + val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None) + val msg = intercept[IllegalArgumentException] { + // trigger inferPartitioning() + fileIndex.partitionSpec() + }.getMessage + assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path") + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala index 5977e867f788a..37ec6e29213e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala @@ -301,35 +301,15 @@ abstract class SchemaPruningSuite checkAnswer(query, Row("Y.", 1) :: Row("X.", 1) :: Row(null, 2) :: Row(null, 2) :: Nil) } - testSchemaPruning("select explode of nested field of array of struct") { - // Config combinations - val configs = Seq((true, true), (true, false), (false, true), (false, false)) - - configs.foreach { case (nestedPruning, nestedPruningOnExpr) => - withSQLConf( - SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> nestedPruning.toString, - SQLConf.NESTED_PRUNING_ON_EXPRESSIONS.key -> nestedPruningOnExpr.toString) { - val query1 = spark.table("contacts") - .select(explode(col("friends.first"))) - if (nestedPruning) { - // If `NESTED_SCHEMA_PRUNING_ENABLED` is enabled, - // even disabling `NESTED_PRUNING_ON_EXPRESSIONS`, - // nested schema is still pruned at scan node. 
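// Illustrative sketch (not from the patch): the SPARK-32368 clue in FileIndexSuite above
// relies on data source options being looked up case-insensitively, so "bAsepAtH" is
// treated like "basePath". Spark handles this with CaseInsensitiveMap internally; the
// stand-alone helper below is an assumption made only to show the behaviour.
//
//   def getOptionIgnoreCase(options: Map[String, String], key: String): Option[String] =
//     options.collectFirst { case (k, v) if k.equalsIgnoreCase(key) => v }
//
//   assert(getOptionIgnoreCase(Map("bAsepAtH" -> "/tmp/base"), "basePath")
//     .contains("/tmp/base"))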
- checkScan(query1, "struct>>") - } else { - checkScan(query1, "struct>>") - } - checkAnswer(query1, Row("Susan") :: Nil) - - val query2 = spark.table("contacts") - .select(explode(col("friends.first")), col("friends.middle")) - if (nestedPruning) { - checkScan(query2, "struct>>") - } else { - checkScan(query2, "struct>>") - } - checkAnswer(query2, Row("Susan", Array("Z.")) :: Nil) - } + testSchemaPruning("SPARK-32163: nested pruning should work even with cosmetic variations") { + withTempView("contact_alias") { + sql("select * from contacts") + .select(explode(col("friends.first")), col("friends")) + .createOrReplaceTempView("contact_alias") + + val query = sql("select friends.middle, col from contact_alias") + checkScan(query, "struct>>") + checkAnswer(query, Row(Array("Z."), "Susan") :: Nil) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala index 2cd142f913072..8462916daaab8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala @@ -304,7 +304,7 @@ class BinaryFileFormatSuite extends QueryTest with SharedSparkSession { val partitionedFile = mock(classOf[PartitionedFile]) when(partitionedFile.filePath).thenReturn(file.getPath) val encoder = RowEncoder(requiredSchema).resolveAndBind() - encoder.fromRow(reader(partitionedFile).next()) + encoder.createDeserializer().apply(reader(partitionedFile).next()) } test("column pruning") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala index e2abb39c986a7..53d287b32f8db 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala @@ -238,7 +238,9 @@ object CSVBenchmark extends SqlBasedBenchmark { def timestampStr: Dataset[String] = { spark.range(0, rowsNum, 1, 1).mapPartitions { iter => - iter.map(i => s"1970-01-01T01:02:03.${100 + i % 100}Z") + iter.map { + i => s"1970-01-01T01:02:03.${i % 200}Z".stripSuffix(".0Z") + } }.select($"value".as("timestamp")).as[String] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 97dfbbdb7fd2f..4e93ea3e8e42f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1080,7 +1080,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .format("csv") .option("header", "true") .option("timestampFormat", "yyyy/MM/dd HH:mm") - .option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + .option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .save(timestampsWithFormatPath) // This will load back the timestamps as string. 
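// Illustrative sketch (not from the patch): the option switched from "GMT" to "UTC" in the
// CSVSuite hunks below is the reader/writer time-zone option. Assuming a SparkSession
// `spark` and a DataFrame `df` with a timestamp column (both assumptions), a round trip
// with an explicit pattern and zone could look like this:
//
//   df.write
//     .format("csv")
//     .option("header", "true")
//     .option("timestampFormat", "yyyy/MM/dd HH:mm")
//     .option("timeZone", "UTC")
//     .save("/tmp/timestamps_csv")
//
//   val readBack = spark.read
//     .option("header", "true")
//     .option("inferSchema", "true")
//     .option("timestampFormat", "yyyy/MM/dd HH:mm")
//     .option("timeZone", "UTC")
//     .csv("/tmp/timestamps_csv")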
@@ -1102,7 +1102,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .option("header", "true") .option("inferSchema", "true") .option("timestampFormat", "yyyy/MM/dd HH:mm") - .option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + .option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .load(timestampsWithFormatPath) checkAnswer(readBack, timestampsWithFormat) @@ -1182,7 +1182,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .schema(schemaWithCorrField1) .csv(testFile(valueMalformedFile)) checkAnswer(df2, - Row(0, null, "0,2013-111-11 12:13:14") :: + Row(0, null, "0,2013-111_11 12:13:14") :: Row(1, java.sql.Date.valueOf("1983-08-04"), null) :: Nil) @@ -1199,7 +1199,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .schema(schemaWithCorrField2) .csv(testFile(valueMalformedFile)) checkAnswer(df3, - Row(0, "0,2013-111-11 12:13:14", null) :: + Row(0, "0,2013-111_11 12:13:14", null) :: Row(1, null, java.sql.Date.valueOf("1983-08-04")) :: Nil) @@ -1435,7 +1435,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa assert(df.filter($"_corrupt_record".isNull).count() == 1) checkAnswer( df.select(columnNameOfCorruptRecord), - Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil + Row("0,2013-111_11 12:13:14") :: Row(null) :: Nil ) } @@ -1450,10 +1450,22 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa val ds = sampledTestData.coalesce(1) ds.write.text(path.getAbsolutePath) - val readback = spark.read + val readback1 = spark.read .option("inferSchema", true).option("samplingRatio", 0.1) .csv(path.getCanonicalPath) - assert(readback.schema == new StructType().add("_c0", IntegerType)) + assert(readback1.schema == new StructType().add("_c0", IntegerType)) + + withClue("SPARK-32621: 'path' option can cause issues while inferring schema") { + // During infer, "path" option gets added again to the paths that have already been listed. + // This results in reading more data than necessary and causes different schema to be + // inferred when sampling ratio is involved. 
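// Illustrative sketch (not from the patch): the SPARK-32621 comparison being set up here
// checks that schema inference with a sampling ratio gives the same result whether the
// location is passed to load() or through the "path" option. Assuming a SparkSession
// `spark` and a directory of CSV lines at `path` (both assumptions):
//
//   val viaLoadArg = spark.read
//     .option("inferSchema", true).option("samplingRatio", 0.1)
//     .csv(path)
//   val viaPathOption = spark.read
//     .option("inferSchema", true).option("samplingRatio", 0.1)
//     .option("path", path)
//     .format("csv")
//     .load()
//   assert(viaLoadArg.schema == viaPathOption.schema)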
+ val readback2 = spark.read + .option("inferSchema", true).option("samplingRatio", 0.1) + .option("path", path.getCanonicalPath) + .format("csv") + .load + assert(readback2.schema == new StructType().add("_c0", IntegerType)) + } }) } @@ -1890,11 +1902,26 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa test("SPARK-25387: bad input should not cause NPE") { val schema = StructType(StructField("a", IntegerType) :: Nil) - val input = spark.createDataset(Seq("\u0000\u0000\u0001234")) + val input = spark.createDataset(Seq("\u0001\u0000\u0001234")) checkAnswer(spark.read.schema(schema).csv(input), Row(null)) checkAnswer(spark.read.option("multiLine", true).schema(schema).csv(input), Row(null)) - assert(spark.read.csv(input).collect().toSet == Set(Row())) + assert(spark.read.schema(schema).csv(input).collect().toSet == Set(Row(null))) + } + + test("SPARK-31261: bad csv input with `columnNameCorruptRecord` should not cause NPE") { + val schema = StructType( + StructField("a", IntegerType) :: StructField("_corrupt_record", StringType) :: Nil) + val input = spark.createDataset(Seq("\u0001\u0000\u0001234")) + + checkAnswer( + spark.read + .option("columnNameOfCorruptRecord", "_corrupt_record") + .schema(schema) + .csv(input), + Row(null, "\u0001\u0000\u0001234")) + assert(spark.read.schema(schema).csv(input).collect().toSet == + Set(Row(null, "\u0001\u0000\u0001234"))) } test("field names of inferred schema shouldn't compare to the first row") { @@ -2093,7 +2120,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa Seq("csv", "").foreach { reader => withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> reader) { withTempPath { path => - val df = Seq(("0", "2013-111-11")).toDF("a", "b") + val df = Seq(("0", "2013-111_11")).toDF("a", "b") df.write .option("header", "true") .csv(path.getAbsolutePath) @@ -2109,7 +2136,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .option("columnNameOfCorruptRecord", columnNameOfCorruptRecord) .schema(schemaWithCorrField) .csv(path.getAbsoluteFile.toString) - checkAnswer(readDF, Row(0, null, "0,2013-111-11") :: Nil) + checkAnswer(readDF, Row(0, null, "0,2013-111_11") :: Nil) } } } @@ -2216,7 +2243,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa val readback = spark.read .option("mode", mode) .option("header", true) - .option("timestampFormat", "uuuu-MM-dd HH:mm:ss") + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .option("multiLine", multiLine) .schema("c0 string, c1 integer, c2 timestamp") .csv(path.getAbsolutePath) @@ -2235,7 +2262,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa } test("filters push down - malformed input in PERMISSIVE mode") { - val invalidTs = "2019-123-14 20:35:30" + val invalidTs = "2019-123_14 20:35:30" val invalidRow = s"0,$invalidTs,999" val validTs = "2019-12-14 20:35:30" Seq(true, false).foreach { filterPushdown => @@ -2252,7 +2279,7 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa .option("mode", "PERMISSIVE") .option("columnNameOfCorruptRecord", "c3") .option("header", true) - .option("timestampFormat", "uuuu-MM-dd HH:mm:ss") + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .schema("c0 integer, c1 timestamp, c2 integer, c3 string") .csv(path.getAbsolutePath) .where(condition) @@ -2294,6 +2321,63 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa } } } + + test("SPARK-30810: parses and convert a 
CSV Dataset having different column from 'value'") { + val ds = spark.range(2).selectExpr("concat('a,b,', id) AS `a.text`").as[String] + val csv = spark.read.option("header", true).option("inferSchema", true).csv(ds) + assert(csv.schema.fieldNames === Seq("a", "b", "0")) + checkAnswer(csv, Row("a", "b", 1)) + } + + test("SPARK-30960: parse date/timestamp string with legacy format") { + val ds = Seq("2020-1-12 3:23:34.12, 2020-1-12 T").toDS() + val csv = spark.read.option("header", false).schema("t timestamp, d date").csv(ds) + checkAnswer(csv, Row(Timestamp.valueOf("2020-1-12 3:23:34.12"), Date.valueOf("2020-1-12"))) + } + + test("exception mode for parsing date/timestamp string") { + val ds = Seq("2020-01-27T20:06:11.847-0800").toDS() + val csv = spark.read + .option("header", false) + .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSz") + .schema("t timestamp").csv(ds) + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "exception") { + val msg = intercept[SparkException] { + csv.collect() + }.getCause.getMessage + assert(msg.contains("Fail to parse")) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "legacy") { + checkAnswer(csv, Row(Timestamp.valueOf("2020-01-27 20:06:11.847"))) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "corrected") { + checkAnswer(csv, Row(null)) + } + } + + test("SPARK-32614: don't treat rows starting with null char as comment") { + withTempPath { path => + Seq("\u0000foo", "bar", "baz").toDS.write.text(path.getCanonicalPath) + val df = spark.read.format("csv") + .option("header", "false") + .option("inferSchema", "true") + .load(path.getCanonicalPath) + assert(df.count() == 3) + } + } + + test("SPARK-32810: CSV data source should be able to read files with " + + "escaped glob metacharacter in the paths") { + withTempDir { dir => + val basePath = dir.getCanonicalPath + // test CSV writer / reader without specifying schema + val csvTableName = "[abc]" + spark.range(3).coalesce(1).write.csv(s"$basePath/$csvTableName") + val readback = spark.read + .csv(s"$basePath/${"""(\[|\]|\{|\})""".r.replaceAllIn(csvTableName, """\\$1""")}") + assert(readback.collect sameElements Array(Row("0"), Row("1"), Row("2"))) + } + } } class CSVv1Suite extends CSVSuite { @@ -2309,3 +2393,10 @@ class CSVv2Suite extends CSVSuite { .sparkConf .set(SQLConf.USE_V1_SOURCE_LIST, "") } + +class CSVLegacyTimeParserSuite extends CSVSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.LEGACY_TIME_PARSER_POLICY, "legacy") +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala index bcecaccc8cc89..0dbd6b5754afb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala @@ -430,7 +430,7 @@ object JsonBenchmark extends SqlBasedBenchmark { } readBench.addCase("infer timestamps from files", numIters) { _ => - spark.read.json(timestampDir).noop() + spark.read.option("inferTimestamp", true).json(timestampDir).noop() } val dateSchema = new StructType().add("date", DateType) @@ -445,7 +445,9 @@ object JsonBenchmark extends SqlBasedBenchmark { def timestampStr: Dataset[String] = { spark.range(0, rowsNum, 1, 1).mapPartitions { iter => - iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${100 + i % 100}Z"}""") + iter.map { i => + 
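// Illustrative sketch (not from the patch): the SPARK-32810 test above escapes glob
// metacharacters in the directory name before handing it to the reader. The helper name
// below is an assumption; the regex is the one the test uses.
//
//   def escapeGlob(name: String): String =
//     """(\[|\]|\{|\})""".r.replaceAllIn(name, """\\$1""")
//
//   assert(escapeGlob("[abc]") == """\[abc\]""")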
s"""{"timestamp":"1970-01-01T01:02:03.${i % 200}Z"}""".stripSuffix(".0Z") + } }.select($"value".as("timestamp")).as[String] } @@ -458,7 +460,7 @@ object JsonBenchmark extends SqlBasedBenchmark { } readBench.addCase("infer timestamps from Dataset[String]", numIters) { _ => - spark.read.json(timestampStr).noop() + spark.read.option("inferTimestamp", true).json(timestampStr).noop() } def dateStr: Dataset[String] = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index d0e2e001034fb..8eb5432f8b61a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -21,6 +21,7 @@ import java.io._ import java.nio.charset.{Charset, StandardCharsets, UnsupportedCharsetException} import java.nio.file.Files import java.sql.{Date, Timestamp} +import java.time.LocalDate import java.util.Locale import com.fasterxml.jackson.core.JsonFactory @@ -39,6 +40,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.sql.types.StructType.fromDDL +import org.apache.spark.sql.types.TestUDT.{MyDenseVector, MyDenseVectorUDT} import org.apache.spark.util.Utils class TestFileFilter extends PathFilter { @@ -246,548 +248,523 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } test("Complex field and type inferring with null in sampling") { - val jsonDF = spark.read.json(jsonNullStruct) - val expectedSchema = StructType( - StructField("headers", StructType( - StructField("Charset", StringType, true) :: - StructField("Host", StringType, true) :: Nil) - , true) :: - StructField("ip", StringType, true) :: - StructField("nullstr", StringType, true):: Nil) - - assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = spark.read.json(jsonNullStruct) + val expectedSchema = StructType( + StructField("headers", StructType( + StructField("Charset", StringType, true) :: + StructField("Host", StringType, true) :: Nil) + , true) :: + StructField("ip", StringType, true) :: + StructField("nullstr", StringType, true):: Nil) + + assert(expectedSchema === jsonDF.schema) + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select nullstr, headers.Host from jsonTable"), - Seq(Row("", "1.abc.com"), Row("", null), Row("", null), Row(null, null)) - ) + checkAnswer( + sql("select nullstr, headers.Host from jsonTable"), + Seq(Row("", "1.abc.com"), Row("", null), Row("", null), Row(null, null)) + ) + } } test("Primitive field and type inferring") { - val jsonDF = spark.read.json(primitiveFieldAndType) + withTempView("jsonTable") { + val jsonDF = spark.read.json(primitiveFieldAndType) - val expectedSchema = StructType( - StructField("bigInteger", DecimalType(20, 0), true) :: - StructField("boolean", BooleanType, true) :: - StructField("double", DoubleType, true) :: - StructField("integer", LongType, true) :: - StructField("long", LongType, true) :: - StructField("null", StringType, true) :: - StructField("string", StringType, true) :: Nil) + val expectedSchema = StructType( + StructField("bigInteger", DecimalType(20, 0), true) :: + StructField("boolean", BooleanType, true) :: + StructField("double", DoubleType, true) :: + StructField("integer", LongType, 
true) :: + StructField("long", LongType, true) :: + StructField("null", StringType, true) :: + StructField("string", StringType, true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select * from jsonTable"), - Row(new java.math.BigDecimal("92233720368547758070"), - true, - 1.7976931348623157, - 10, - 21474836470L, - null, - "this is a simple string.") - ) + checkAnswer( + sql("select * from jsonTable"), + Row(new java.math.BigDecimal("92233720368547758070"), + true, + 1.7976931348623157, + 10, + 21474836470L, + null, + "this is a simple string.") + ) + } } test("Complex field and type inferring") { - val jsonDF = spark.read.json(complexFieldAndType1) - - val expectedSchema = StructType( - StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) :: - StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, true), true), true) :: - StructField("arrayOfBigInteger", ArrayType(DecimalType(21, 0), true), true) :: - StructField("arrayOfBoolean", ArrayType(BooleanType, true), true) :: - StructField("arrayOfDouble", ArrayType(DoubleType, true), true) :: - StructField("arrayOfInteger", ArrayType(LongType, true), true) :: - StructField("arrayOfLong", ArrayType(LongType, true), true) :: - StructField("arrayOfNull", ArrayType(StringType, true), true) :: - StructField("arrayOfString", ArrayType(StringType, true), true) :: - StructField("arrayOfStruct", ArrayType( - StructType( + withTempView("jsonTable") { + val jsonDF = spark.read.json(complexFieldAndType1) + + val expectedSchema = StructType( + StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) :: + StructField("arrayOfArray2", ArrayType(ArrayType(DoubleType, true), true), true) :: + StructField("arrayOfBigInteger", ArrayType(DecimalType(21, 0), true), true) :: + StructField("arrayOfBoolean", ArrayType(BooleanType, true), true) :: + StructField("arrayOfDouble", ArrayType(DoubleType, true), true) :: + StructField("arrayOfInteger", ArrayType(LongType, true), true) :: + StructField("arrayOfLong", ArrayType(LongType, true), true) :: + StructField("arrayOfNull", ArrayType(StringType, true), true) :: + StructField("arrayOfString", ArrayType(StringType, true), true) :: + StructField("arrayOfStruct", ArrayType( + StructType( + StructField("field1", BooleanType, true) :: + StructField("field2", StringType, true) :: + StructField("field3", StringType, true) :: Nil), true), true) :: + StructField("struct", StructType( StructField("field1", BooleanType, true) :: - StructField("field2", StringType, true) :: - StructField("field3", StringType, true) :: Nil), true), true) :: - StructField("struct", StructType( - StructField("field1", BooleanType, true) :: - StructField("field2", DecimalType(20, 0), true) :: Nil), true) :: - StructField("structWithArrayFields", StructType( - StructField("field1", ArrayType(LongType, true), true) :: - StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil) + StructField("field2", DecimalType(20, 0), true) :: Nil), true) :: + StructField("structWithArrayFields", StructType( + StructField("field1", ArrayType(LongType, true), true) :: + StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - // 
Access elements of a primitive array. - checkAnswer( - sql("select arrayOfString[0], arrayOfString[1], arrayOfString[2] from jsonTable"), - Row("str1", "str2", null) - ) + // Access elements of a primitive array. + checkAnswer( + sql("select arrayOfString[0], arrayOfString[1], arrayOfString[2] from jsonTable"), + Row("str1", "str2", null) + ) - // Access an array of null values. - checkAnswer( - sql("select arrayOfNull from jsonTable"), - Row(Seq(null, null, null, null)) - ) + // Access an array of null values. + checkAnswer( + sql("select arrayOfNull from jsonTable"), + Row(Seq(null, null, null, null)) + ) - // Access elements of a BigInteger array (we use DecimalType internally). - checkAnswer( - sql("select arrayOfBigInteger[0], arrayOfBigInteger[1], arrayOfBigInteger[2] from jsonTable"), - Row(new java.math.BigDecimal("922337203685477580700"), - new java.math.BigDecimal("-922337203685477580800"), null) - ) + // Access elements of a BigInteger array (we use DecimalType internally). + checkAnswer( + sql("select arrayOfBigInteger[0], arrayOfBigInteger[1], arrayOfBigInteger[2] from " + + "jsonTable"), + Row(new java.math.BigDecimal("922337203685477580700"), + new java.math.BigDecimal("-922337203685477580800"), null) + ) - // Access elements of an array of arrays. - checkAnswer( - sql("select arrayOfArray1[0], arrayOfArray1[1] from jsonTable"), - Row(Seq("1", "2", "3"), Seq("str1", "str2")) - ) + // Access elements of an array of arrays. + checkAnswer( + sql("select arrayOfArray1[0], arrayOfArray1[1] from jsonTable"), + Row(Seq("1", "2", "3"), Seq("str1", "str2")) + ) - // Access elements of an array of arrays. - checkAnswer( - sql("select arrayOfArray2[0], arrayOfArray2[1] from jsonTable"), - Row(Seq(1.0, 2.0, 3.0), Seq(1.1, 2.1, 3.1)) - ) + // Access elements of an array of arrays. + checkAnswer( + sql("select arrayOfArray2[0], arrayOfArray2[1] from jsonTable"), + Row(Seq(1.0, 2.0, 3.0), Seq(1.1, 2.1, 3.1)) + ) - // Access elements of an array inside a filed with the type of ArrayType(ArrayType). - checkAnswer( - sql("select arrayOfArray1[1][1], arrayOfArray2[1][1] from jsonTable"), - Row("str2", 2.1) - ) + // Access elements of an array inside a filed with the type of ArrayType(ArrayType). + checkAnswer( + sql("select arrayOfArray1[1][1], arrayOfArray2[1][1] from jsonTable"), + Row("str2", 2.1) + ) - // Access elements of an array of structs. - checkAnswer( - sql("select arrayOfStruct[0], arrayOfStruct[1], arrayOfStruct[2], arrayOfStruct[3] " + - "from jsonTable"), - Row( - Row(true, "str1", null), - Row(false, null, null), - Row(null, null, null), - null) - ) + // Access elements of an array of structs. + checkAnswer( + sql("select arrayOfStruct[0], arrayOfStruct[1], arrayOfStruct[2], arrayOfStruct[3] " + + "from jsonTable"), + Row( + Row(true, "str1", null), + Row(false, null, null), + Row(null, null, null), + null) + ) - // Access a struct and fields inside of it. - checkAnswer( - sql("select struct, struct.field1, struct.field2 from jsonTable"), - Row( - Row(true, new java.math.BigDecimal("92233720368547758070")), - true, - new java.math.BigDecimal("92233720368547758070")) :: Nil - ) + // Access a struct and fields inside of it. + checkAnswer( + sql("select struct, struct.field1, struct.field2 from jsonTable"), + Row( + Row(true, new java.math.BigDecimal("92233720368547758070")), + true, + new java.math.BigDecimal("92233720368547758070")) :: Nil + ) - // Access an array field of a struct. 
- checkAnswer( - sql("select structWithArrayFields.field1, structWithArrayFields.field2 from jsonTable"), - Row(Seq(4, 5, 6), Seq("str1", "str2")) - ) + // Access an array field of a struct. + checkAnswer( + sql("select structWithArrayFields.field1, structWithArrayFields.field2 from jsonTable"), + Row(Seq(4, 5, 6), Seq("str1", "str2")) + ) - // Access elements of an array field of a struct. - checkAnswer( - sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] from jsonTable"), - Row(5, null) - ) + // Access elements of an array field of a struct. + checkAnswer( + sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] from " + + "jsonTable"), + Row(5, null) + ) + } } test("GetField operation on complex data type") { - val jsonDF = spark.read.json(complexFieldAndType1) - jsonDF.createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = spark.read.json(complexFieldAndType1) + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select arrayOfStruct[0].field1, arrayOfStruct[0].field2 from jsonTable"), - Row(true, "str1") - ) + checkAnswer( + sql("select arrayOfStruct[0].field1, arrayOfStruct[0].field2 from jsonTable"), + Row(true, "str1") + ) - // Getting all values of a specific field from an array of structs. - checkAnswer( - sql("select arrayOfStruct.field1, arrayOfStruct.field2 from jsonTable"), - Row(Seq(true, false, null), Seq("str1", null, null)) - ) + // Getting all values of a specific field from an array of structs. + checkAnswer( + sql("select arrayOfStruct.field1, arrayOfStruct.field2 from jsonTable"), + Row(Seq(true, false, null), Seq("str1", null, null)) + ) + } } test("Type conflict in primitive field values") { - val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) - - val expectedSchema = StructType( - StructField("num_bool", StringType, true) :: - StructField("num_num_1", LongType, true) :: - StructField("num_num_2", DoubleType, true) :: - StructField("num_num_3", DoubleType, true) :: - StructField("num_str", StringType, true) :: - StructField("str_bool", StringType, true) :: Nil) - - assert(expectedSchema === jsonDF.schema) - - jsonDF.createOrReplaceTempView("jsonTable") - - checkAnswer( - sql("select * from jsonTable"), - Row("true", 11L, null, 1.1, "13.1", "str1") :: - Row("12", null, 21474836470.9, null, null, "true") :: - Row("false", 21474836470L, 92233720368547758070d, 100, "str1", "false") :: - Row(null, 21474836570L, 1.1, 21474836470L, "92233720368547758070", null) :: Nil - ) - - // Number and Boolean conflict: resolve the type as number in this query. 
- checkAnswer( - sql("select num_bool - 10 from jsonTable where num_bool > 11"), - Row(2) - ) - - // Widening to LongType - checkAnswer( - sql("select num_num_1 - 100 from jsonTable where num_num_1 > 11"), - Row(21474836370L) :: Row(21474836470L) :: Nil - ) + withTempView("jsonTable") { + val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) - checkAnswer( - sql("select num_num_1 - 100 from jsonTable where num_num_1 > 10"), - Row(-89) :: Row(21474836370L) :: Row(21474836470L) :: Nil - ) + val expectedSchema = StructType( + StructField("num_bool", StringType, true) :: + StructField("num_num_1", LongType, true) :: + StructField("num_num_2", DoubleType, true) :: + StructField("num_num_3", DoubleType, true) :: + StructField("num_str", StringType, true) :: + StructField("str_bool", StringType, true) :: Nil) - // Widening to DecimalType - checkAnswer( - sql("select num_num_2 + 1.3 from jsonTable where num_num_2 > 1.1"), - Row(21474836472.2) :: - Row(92233720368547758071.3) :: Nil - ) + assert(expectedSchema === jsonDF.schema) - // Widening to Double - checkAnswer( - sql("select num_num_3 + 1.2 from jsonTable where num_num_3 > 1.1"), - Row(101.2) :: Row(21474836471.2) :: Nil - ) - - // Number and String conflict: resolve the type as number in this query. - checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 14d"), - Row(92233720368547758071.2) - ) + jsonDF.createOrReplaceTempView("jsonTable") - // Number and String conflict: resolve the type as number in this query. - checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str >= 92233720368547758060"), - Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue) - ) + checkAnswer( + sql("select * from jsonTable"), + Row("true", 11L, null, 1.1, "13.1", "str1") :: + Row("12", null, 21474836470.9, null, null, "true") :: + Row("false", 21474836470L, 92233720368547758070d, 100, "str1", "false") :: + Row(null, 21474836570L, 1.1, 21474836470L, "92233720368547758070", null) :: Nil + ) - // String and Boolean conflict: resolve the type as string. - checkAnswer( - sql("select * from jsonTable where str_bool = 'str1'"), - Row("true", 11L, null, 1.1, "13.1", "str1") - ) - } + // Number and Boolean conflict: resolve the type as number in this query. + checkAnswer( + sql("select num_bool - 10 from jsonTable where num_bool > 11"), + Row(2) + ) - ignore("Type conflict in primitive field values (Ignored)") { - val jsonDF = spark.read.json(primitiveFieldValueTypeConflict) - jsonDF.createOrReplaceTempView("jsonTable") + // Widening to LongType + checkAnswer( + sql("select num_num_1 - 100 from jsonTable where num_num_1 > 11"), + Row(21474836370L) :: Row(21474836470L) :: Nil + ) - // Right now, the analyzer does not promote strings in a boolean expression. - // Number and Boolean conflict: resolve the type as boolean in this query. - checkAnswer( - sql("select num_bool from jsonTable where NOT num_bool"), - Row(false) - ) + checkAnswer( + sql("select num_num_1 - 100 from jsonTable where num_num_1 > 10"), + Row(-89) :: Row(21474836370L) :: Row(21474836470L) :: Nil + ) - checkAnswer( - sql("select str_bool from jsonTable where NOT str_bool"), - Row(false) - ) + // Widening to DecimalType + checkAnswer( + sql("select num_num_2 + 1.3 from jsonTable where num_num_2 > 1.1"), + Row(21474836472.2) :: + Row(92233720368547758071.3) :: Nil + ) - // Right now, the analyzer does not know that num_bool should be treated as a boolean. - // Number and Boolean conflict: resolve the type as boolean in this query. 
- checkAnswer( - sql("select num_bool from jsonTable where num_bool"), - Row(true) - ) + // Widening to Double + checkAnswer( + sql("select num_num_3 + 1.2 from jsonTable where num_num_3 > 1.1"), + Row(101.2) :: Row(21474836471.2) :: Nil + ) - checkAnswer( - sql("select str_bool from jsonTable where str_bool"), - Row(false) - ) + // Number and String conflict: resolve the type as number in this query. + checkAnswer( + sql("select num_str + 1.2 from jsonTable where num_str > 14d"), + Row(92233720368547758071.2) + ) - // The plan of the following DSL is - // Project [(CAST(num_str#65:4, DoubleType) + 1.2) AS num#78] - // Filter (CAST(CAST(num_str#65:4, DoubleType), DecimalType) > 92233720368547758060) - // ExistingRdd [num_bool#61,num_num_1#62L,num_num_2#63,num_num_3#64,num_str#65,str_bool#66] - // We should directly cast num_str to DecimalType and also need to do the right type promotion - // in the Project. - checkAnswer( - jsonDF. - where('num_str >= BigDecimal("92233720368547758060")). - select(('num_str + 1.2).as("num")), - Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue()) - ) + // Number and String conflict: resolve the type as number in this query. + checkAnswer( + sql("select num_str + 1.2 from jsonTable where num_str >= 92233720368547758060"), + Row(new java.math.BigDecimal("92233720368547758071.2").doubleValue) + ) - // The following test will fail. The type of num_str is StringType. - // So, to evaluate num_str + 1.2, we first need to use Cast to convert the type. - // In our test data, one value of num_str is 13.1. - // The result of (CAST(num_str#65:4, DoubleType) + 1.2) for this value is 14.299999999999999, - // which is not 14.3. - // Number and String conflict: resolve the type as number in this query. - checkAnswer( - sql("select num_str + 1.2 from jsonTable where num_str > 13"), - Row(BigDecimal("14.3")) :: Row(BigDecimal("92233720368547758071.2")) :: Nil - ) + // String and Boolean conflict: resolve the type as string. 
+ checkAnswer( + sql("select * from jsonTable where str_bool = 'str1'"), + Row("true", 11L, null, 1.1, "13.1", "str1") + ) + } } test("Type conflict in complex field values") { - val jsonDF = spark.read.json(complexFieldValueTypeConflict) + withTempView("jsonTable") { + val jsonDF = spark.read.json(complexFieldValueTypeConflict) - val expectedSchema = StructType( - StructField("array", ArrayType(LongType, true), true) :: - StructField("num_struct", StringType, true) :: - StructField("str_array", StringType, true) :: - StructField("struct", StructType( - StructField("field", StringType, true) :: Nil), true) :: - StructField("struct_array", StringType, true) :: Nil) + val expectedSchema = StructType( + StructField("array", ArrayType(LongType, true), true) :: + StructField("num_struct", StringType, true) :: + StructField("str_array", StringType, true) :: + StructField("struct", StructType( + StructField("field", StringType, true) :: Nil), true) :: + StructField("struct_array", StringType, true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select * from jsonTable"), - Row(Seq(), "11", "[1,2,3]", Row(null), "[]") :: - Row(null, """{"field":false}""", null, null, "{}") :: - Row(Seq(4, 5, 6), null, "str", Row(null), "[7,8,9]") :: - Row(Seq(7), "{}", """["str1","str2",33]""", Row("str"), """{"field":true}""") :: Nil - ) + checkAnswer( + sql("select * from jsonTable"), + Row(Seq(), "11", "[1,2,3]", Row(null), "[]") :: + Row(null, """{"field":false}""", null, null, "{}") :: + Row(Seq(4, 5, 6), null, "str", Row(null), "[7,8,9]") :: + Row(Seq(7), "{}", """["str1","str2",33]""", Row("str"), """{"field":true}""") :: Nil + ) + } } test("Type conflict in array elements") { - val jsonDF = spark.read.json(arrayElementTypeConflict) + withTempView("jsonTable") { + val jsonDF = spark.read.json(arrayElementTypeConflict) - val expectedSchema = StructType( - StructField("array1", ArrayType(StringType, true), true) :: - StructField("array2", ArrayType(StructType( - StructField("field", LongType, true) :: Nil), true), true) :: - StructField("array3", ArrayType(StringType, true), true) :: Nil) + val expectedSchema = StructType( + StructField("array1", ArrayType(StringType, true), true) :: + StructField("array2", ArrayType(StructType( + StructField("field", LongType, true) :: Nil), true), true) :: + StructField("array3", ArrayType(StringType, true), true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select * from jsonTable"), - Row(Seq("1", "1.1", "true", null, "[]", "{}", "[2,3,4]", - """{"field":"str"}"""), Seq(Row(214748364700L), Row(1)), null) :: - Row(null, null, Seq("""{"field":"str"}""", """{"field":1}""")) :: - Row(null, null, Seq("1", "2", "3")) :: Nil - ) + checkAnswer( + sql("select * from jsonTable"), + Row(Seq("1", "1.1", "true", null, "[]", "{}", "[2,3,4]", + """{"field":"str"}"""), Seq(Row(214748364700L), Row(1)), null) :: + Row(null, null, Seq("""{"field":"str"}""", """{"field":1}""")) :: + Row(null, null, Seq("1", "2", "3")) :: Nil + ) - // Treat an element as a number. - checkAnswer( - sql("select array1[0] + 1 from jsonTable where array1 is not null"), - Row(2) - ) + // Treat an element as a number. 
+ checkAnswer( + sql("select array1[0] + 1 from jsonTable where array1 is not null"), + Row(2) + ) + } } test("Handling missing fields") { - val jsonDF = spark.read.json(missingFields) + withTempView("jsonTable") { + val jsonDF = spark.read.json(missingFields) - val expectedSchema = StructType( - StructField("a", BooleanType, true) :: - StructField("b", LongType, true) :: - StructField("c", ArrayType(LongType, true), true) :: - StructField("d", StructType( - StructField("field", BooleanType, true) :: Nil), true) :: - StructField("e", StringType, true) :: Nil) + val expectedSchema = StructType( + StructField("a", BooleanType, true) :: + StructField("b", LongType, true) :: + StructField("c", ArrayType(LongType, true), true) :: + StructField("d", StructType( + StructField("field", BooleanType, true) :: Nil), true) :: + StructField("e", StringType, true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") + } } test("Loading a JSON dataset from a text file") { - val dir = Utils.createTempDir() - dir.delete() - val path = dir.getCanonicalPath - primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).write.text(path) - val jsonDF = spark.read.json(path) + withTempView("jsonTable") { + val dir = Utils.createTempDir() + dir.delete() + val path = dir.getCanonicalPath + primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).write.text(path) + val jsonDF = spark.read.json(path) - val expectedSchema = StructType( - StructField("bigInteger", DecimalType(20, 0), true) :: - StructField("boolean", BooleanType, true) :: - StructField("double", DoubleType, true) :: - StructField("integer", LongType, true) :: - StructField("long", LongType, true) :: - StructField("null", StringType, true) :: - StructField("string", StringType, true) :: Nil) + val expectedSchema = StructType( + StructField("bigInteger", DecimalType(20, 0), true) :: + StructField("boolean", BooleanType, true) :: + StructField("double", DoubleType, true) :: + StructField("integer", LongType, true) :: + StructField("long", LongType, true) :: + StructField("null", StringType, true) :: + StructField("string", StringType, true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select * from jsonTable"), - Row(new java.math.BigDecimal("92233720368547758070"), - true, - 1.7976931348623157, - 10, - 21474836470L, - null, - "this is a simple string.") - ) + checkAnswer( + sql("select * from jsonTable"), + Row(new java.math.BigDecimal("92233720368547758070"), + true, + 1.7976931348623157, + 10, + 21474836470L, + null, + "this is a simple string.") + ) + } } test("Loading a JSON dataset primitivesAsString returns schema with primitive types as strings") { - val dir = Utils.createTempDir() - dir.delete() - val path = dir.getCanonicalPath - primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).write.text(path) - val jsonDF = spark.read.option("primitivesAsString", "true").json(path) - - val expectedSchema = StructType( - StructField("bigInteger", StringType, true) :: - StructField("boolean", StringType, true) :: - StructField("double", StringType, true) :: - StructField("integer", StringType, true) :: - StructField("long", StringType, true) :: - StructField("null", StringType, true) :: - StructField("string", StringType, true) :: Nil) + 
withTempView("jsonTable") { + val dir = Utils.createTempDir() + dir.delete() + val path = dir.getCanonicalPath + primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).write.text(path) + val jsonDF = spark.read.option("primitivesAsString", "true").json(path) + + val expectedSchema = StructType( + StructField("bigInteger", StringType, true) :: + StructField("boolean", StringType, true) :: + StructField("double", StringType, true) :: + StructField("integer", StringType, true) :: + StructField("long", StringType, true) :: + StructField("null", StringType, true) :: + StructField("string", StringType, true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select * from jsonTable"), - Row("92233720368547758070", - "true", - "1.7976931348623157", - "10", - "21474836470", - null, - "this is a simple string.") - ) + checkAnswer( + sql("select * from jsonTable"), + Row("92233720368547758070", + "true", + "1.7976931348623157", + "10", + "21474836470", + null, + "this is a simple string.") + ) + } } test("Loading a JSON dataset primitivesAsString returns complex fields as strings") { - val jsonDF = spark.read.option("primitivesAsString", "true").json(complexFieldAndType1) - - val expectedSchema = StructType( - StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) :: - StructField("arrayOfArray2", ArrayType(ArrayType(StringType, true), true), true) :: - StructField("arrayOfBigInteger", ArrayType(StringType, true), true) :: - StructField("arrayOfBoolean", ArrayType(StringType, true), true) :: - StructField("arrayOfDouble", ArrayType(StringType, true), true) :: - StructField("arrayOfInteger", ArrayType(StringType, true), true) :: - StructField("arrayOfLong", ArrayType(StringType, true), true) :: - StructField("arrayOfNull", ArrayType(StringType, true), true) :: - StructField("arrayOfString", ArrayType(StringType, true), true) :: - StructField("arrayOfStruct", ArrayType( - StructType( + withTempView("jsonTable") { + val jsonDF = spark.read.option("primitivesAsString", "true").json(complexFieldAndType1) + + val expectedSchema = StructType( + StructField("arrayOfArray1", ArrayType(ArrayType(StringType, true), true), true) :: + StructField("arrayOfArray2", ArrayType(ArrayType(StringType, true), true), true) :: + StructField("arrayOfBigInteger", ArrayType(StringType, true), true) :: + StructField("arrayOfBoolean", ArrayType(StringType, true), true) :: + StructField("arrayOfDouble", ArrayType(StringType, true), true) :: + StructField("arrayOfInteger", ArrayType(StringType, true), true) :: + StructField("arrayOfLong", ArrayType(StringType, true), true) :: + StructField("arrayOfNull", ArrayType(StringType, true), true) :: + StructField("arrayOfString", ArrayType(StringType, true), true) :: + StructField("arrayOfStruct", ArrayType( + StructType( + StructField("field1", StringType, true) :: + StructField("field2", StringType, true) :: + StructField("field3", StringType, true) :: Nil), true), true) :: + StructField("struct", StructType( StructField("field1", StringType, true) :: - StructField("field2", StringType, true) :: - StructField("field3", StringType, true) :: Nil), true), true) :: - StructField("struct", StructType( - StructField("field1", StringType, true) :: - StructField("field2", StringType, true) :: Nil), true) :: - StructField("structWithArrayFields", StructType( - StructField("field1", ArrayType(StringType, 
true), true) :: - StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil) + StructField("field2", StringType, true) :: Nil), true) :: + StructField("structWithArrayFields", StructType( + StructField("field1", ArrayType(StringType, true), true) :: + StructField("field2", ArrayType(StringType, true), true) :: Nil), true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - // Access elements of a primitive array. - checkAnswer( - sql("select arrayOfString[0], arrayOfString[1], arrayOfString[2] from jsonTable"), - Row("str1", "str2", null) - ) + // Access elements of a primitive array. + checkAnswer( + sql("select arrayOfString[0], arrayOfString[1], arrayOfString[2] from jsonTable"), + Row("str1", "str2", null) + ) - // Access an array of null values. - checkAnswer( - sql("select arrayOfNull from jsonTable"), - Row(Seq(null, null, null, null)) - ) + // Access an array of null values. + checkAnswer( + sql("select arrayOfNull from jsonTable"), + Row(Seq(null, null, null, null)) + ) - // Access elements of a BigInteger array (we use DecimalType internally). - checkAnswer( - sql("select arrayOfBigInteger[0], arrayOfBigInteger[1], arrayOfBigInteger[2] from jsonTable"), - Row("922337203685477580700", "-922337203685477580800", null) - ) + // Access elements of a BigInteger array (we use DecimalType internally). + checkAnswer( + sql("select arrayOfBigInteger[0], arrayOfBigInteger[1], arrayOfBigInteger[2] from " + + "jsonTable"), + Row("922337203685477580700", "-922337203685477580800", null) + ) - // Access elements of an array of arrays. - checkAnswer( - sql("select arrayOfArray1[0], arrayOfArray1[1] from jsonTable"), - Row(Seq("1", "2", "3"), Seq("str1", "str2")) - ) + // Access elements of an array of arrays. + checkAnswer( + sql("select arrayOfArray1[0], arrayOfArray1[1] from jsonTable"), + Row(Seq("1", "2", "3"), Seq("str1", "str2")) + ) - // Access elements of an array of arrays. - checkAnswer( - sql("select arrayOfArray2[0], arrayOfArray2[1] from jsonTable"), - Row(Seq("1", "2", "3"), Seq("1.1", "2.1", "3.1")) - ) + // Access elements of an array of arrays. + checkAnswer( + sql("select arrayOfArray2[0], arrayOfArray2[1] from jsonTable"), + Row(Seq("1", "2", "3"), Seq("1.1", "2.1", "3.1")) + ) - // Access elements of an array inside a filed with the type of ArrayType(ArrayType). - checkAnswer( - sql("select arrayOfArray1[1][1], arrayOfArray2[1][1] from jsonTable"), - Row("str2", "2.1") - ) + // Access elements of an array inside a filed with the type of ArrayType(ArrayType). + checkAnswer( + sql("select arrayOfArray1[1][1], arrayOfArray2[1][1] from jsonTable"), + Row("str2", "2.1") + ) - // Access elements of an array of structs. - checkAnswer( - sql("select arrayOfStruct[0], arrayOfStruct[1], arrayOfStruct[2], arrayOfStruct[3] " + - "from jsonTable"), - Row( - Row("true", "str1", null), - Row("false", null, null), - Row(null, null, null), - null) - ) + // Access elements of an array of structs. + checkAnswer( + sql("select arrayOfStruct[0], arrayOfStruct[1], arrayOfStruct[2], arrayOfStruct[3] " + + "from jsonTable"), + Row( + Row("true", "str1", null), + Row("false", null, null), + Row(null, null, null), + null) + ) - // Access a struct and fields inside of it. 
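The hunks above exercise the JSON reader's primitivesAsString option, which forces every inferred primitive leaf (including primitives nested inside arrays and structs) to StringType. A minimal, self-contained sketch of that behavior, assuming a local SparkSession; the object name, master setting, and sample JSON are illustrative and not taken from the suite:

    import org.apache.spark.sql.SparkSession

    object PrimitivesAsStringSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[1]").appName("primitivesAsString").getOrCreate()
        import spark.implicits._

        val json = Seq("""{"big": 92233720368547758070, "flag": true, "ratio": 1.5, "arr": [1, 2]}""").toDS()

        // Default inference keeps native types: decimal(20,0), boolean, double, array<bigint>.
        spark.read.json(json).printSchema()

        // With primitivesAsString every primitive leaf is read as a string,
        // so the array above becomes array<string> as well.
        spark.read.option("primitivesAsString", "true").json(json).printSchema()

        spark.stop()
      }
    }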
- checkAnswer( - sql("select struct, struct.field1, struct.field2 from jsonTable"), - Row( - Row("true", "92233720368547758070"), - "true", - "92233720368547758070") :: Nil - ) + // Access a struct and fields inside of it. + checkAnswer( + sql("select struct, struct.field1, struct.field2 from jsonTable"), + Row( + Row("true", "92233720368547758070"), + "true", + "92233720368547758070") :: Nil + ) - // Access an array field of a struct. - checkAnswer( - sql("select structWithArrayFields.field1, structWithArrayFields.field2 from jsonTable"), - Row(Seq("4", "5", "6"), Seq("str1", "str2")) - ) + // Access an array field of a struct. + checkAnswer( + sql("select structWithArrayFields.field1, structWithArrayFields.field2 from jsonTable"), + Row(Seq("4", "5", "6"), Seq("str1", "str2")) + ) - // Access elements of an array field of a struct. - checkAnswer( - sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] from jsonTable"), - Row("5", null) - ) + // Access elements of an array field of a struct. + checkAnswer( + sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] from " + + "jsonTable"), + Row("5", null) + ) + } } test("Loading a JSON dataset prefersDecimal returns schema with float types as BigDecimal") { - val jsonDF = spark.read.option("prefersDecimal", "true").json(primitiveFieldAndType) + withTempView("jsonTable") { + val jsonDF = spark.read.option("prefersDecimal", "true").json(primitiveFieldAndType) - val expectedSchema = StructType( - StructField("bigInteger", DecimalType(20, 0), true) :: - StructField("boolean", BooleanType, true) :: - StructField("double", DecimalType(17, 16), true) :: - StructField("integer", LongType, true) :: - StructField("long", LongType, true) :: - StructField("null", StringType, true) :: - StructField("string", StringType, true) :: Nil) + val expectedSchema = StructType( + StructField("bigInteger", DecimalType(20, 0), true) :: + StructField("boolean", BooleanType, true) :: + StructField("double", DecimalType(17, 16), true) :: + StructField("integer", LongType, true) :: + StructField("long", LongType, true) :: + StructField("null", StringType, true) :: + StructField("string", StringType, true) :: Nil) - assert(expectedSchema === jsonDF.schema) + assert(expectedSchema === jsonDF.schema) - jsonDF.createOrReplaceTempView("jsonTable") + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select * from jsonTable"), - Row(BigDecimal("92233720368547758070"), - true, - BigDecimal("1.7976931348623157"), - 10, - 21474836470L, - null, - "this is a simple string.") - ) + checkAnswer( + sql("select * from jsonTable"), + Row(BigDecimal("92233720368547758070"), + true, + BigDecimal("1.7976931348623157"), + 10, + 21474836470L, + null, + "this is a simple string.") + ) + } } test("Find compatible types even if inferred DecimalType is not capable of other IntegralType") { @@ -884,171 +861,182 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } test("Applying schemas") { - val dir = Utils.createTempDir() - dir.delete() - val path = dir.getCanonicalPath - primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).write.text(path) + withTempView("jsonTable1", "jsonTable2") { + val dir = Utils.createTempDir() + dir.delete() + val path = dir.getCanonicalPath + primitiveFieldAndType.map(record => record.replaceAll("\n", " ")).write.text(path) - val schema = StructType( - StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) :: - StructField("boolean", BooleanType, true) :: - 
StructField("double", DoubleType, true) :: - StructField("integer", IntegerType, true) :: - StructField("long", LongType, true) :: - StructField("null", StringType, true) :: - StructField("string", StringType, true) :: Nil) + val schema = StructType( + StructField("bigInteger", DecimalType.SYSTEM_DEFAULT, true) :: + StructField("boolean", BooleanType, true) :: + StructField("double", DoubleType, true) :: + StructField("integer", IntegerType, true) :: + StructField("long", LongType, true) :: + StructField("null", StringType, true) :: + StructField("string", StringType, true) :: Nil) - val jsonDF1 = spark.read.schema(schema).json(path) + val jsonDF1 = spark.read.schema(schema).json(path) - assert(schema === jsonDF1.schema) + assert(schema === jsonDF1.schema) - jsonDF1.createOrReplaceTempView("jsonTable1") + jsonDF1.createOrReplaceTempView("jsonTable1") - checkAnswer( - sql("select * from jsonTable1"), - Row(new java.math.BigDecimal("92233720368547758070"), - true, - 1.7976931348623157, - 10, - 21474836470L, - null, - "this is a simple string.") - ) + checkAnswer( + sql("select * from jsonTable1"), + Row(new java.math.BigDecimal("92233720368547758070"), + true, + 1.7976931348623157, + 10, + 21474836470L, + null, + "this is a simple string.") + ) - val jsonDF2 = spark.read.schema(schema).json(primitiveFieldAndType) + val jsonDF2 = spark.read.schema(schema).json(primitiveFieldAndType) - assert(schema === jsonDF2.schema) + assert(schema === jsonDF2.schema) - jsonDF2.createOrReplaceTempView("jsonTable2") + jsonDF2.createOrReplaceTempView("jsonTable2") - checkAnswer( - sql("select * from jsonTable2"), - Row(new java.math.BigDecimal("92233720368547758070"), - true, - 1.7976931348623157, - 10, - 21474836470L, - null, - "this is a simple string.") - ) + checkAnswer( + sql("select * from jsonTable2"), + Row(new java.math.BigDecimal("92233720368547758070"), + true, + 1.7976931348623157, + 10, + 21474836470L, + null, + "this is a simple string.") + ) + } } test("Applying schemas with MapType") { - val schemaWithSimpleMap = StructType( - StructField("map", MapType(StringType, IntegerType, true), false) :: Nil) - val jsonWithSimpleMap = spark.read.schema(schemaWithSimpleMap).json(mapType1) - - jsonWithSimpleMap.createOrReplaceTempView("jsonWithSimpleMap") + withTempView("jsonWithSimpleMap", "jsonWithComplexMap") { + val schemaWithSimpleMap = StructType( + StructField("map", MapType(StringType, IntegerType, true), false) :: Nil) + val jsonWithSimpleMap = spark.read.schema(schemaWithSimpleMap).json(mapType1) - checkAnswer( - sql("select `map` from jsonWithSimpleMap"), - Row(Map("a" -> 1)) :: - Row(Map("b" -> 2)) :: - Row(Map("c" -> 3)) :: - Row(Map("c" -> 1, "d" -> 4)) :: - Row(Map("e" -> null)) :: Nil - ) + jsonWithSimpleMap.createOrReplaceTempView("jsonWithSimpleMap") - withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { checkAnswer( - sql("select `map`['c'] from jsonWithSimpleMap"), - Row(null) :: - Row(null) :: - Row(3) :: - Row(1) :: - Row(null) :: Nil + sql("select `map` from jsonWithSimpleMap"), + Row(Map("a" -> 1)) :: + Row(Map("b" -> 2)) :: + Row(Map("c" -> 3)) :: + Row(Map("c" -> 1, "d" -> 4)) :: + Row(Map("e" -> null)) :: Nil ) - } - val innerStruct = StructType( - StructField("field1", ArrayType(IntegerType, true), true) :: - StructField("field2", IntegerType, true) :: Nil) - val schemaWithComplexMap = StructType( - StructField("map", MapType(StringType, innerStruct, true), false) :: Nil) + withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { + checkAnswer( + 
sql("select `map`['c'] from jsonWithSimpleMap"), + Row(null) :: + Row(null) :: + Row(3) :: + Row(1) :: + Row(null) :: Nil + ) + } - val jsonWithComplexMap = spark.read.schema(schemaWithComplexMap).json(mapType2) + val innerStruct = StructType( + StructField("field1", ArrayType(IntegerType, true), true) :: + StructField("field2", IntegerType, true) :: Nil) + val schemaWithComplexMap = StructType( + StructField("map", MapType(StringType, innerStruct, true), false) :: Nil) - jsonWithComplexMap.createOrReplaceTempView("jsonWithComplexMap") + val jsonWithComplexMap = spark.read.schema(schemaWithComplexMap).json(mapType2) - checkAnswer( - sql("select `map` from jsonWithComplexMap"), - Row(Map("a" -> Row(Seq(1, 2, 3, null), null))) :: - Row(Map("b" -> Row(null, 2))) :: - Row(Map("c" -> Row(Seq(), 4))) :: - Row(Map("c" -> Row(null, 3), "d" -> Row(Seq(null), null))) :: - Row(Map("e" -> null)) :: - Row(Map("f" -> Row(null, null))) :: Nil - ) + jsonWithComplexMap.createOrReplaceTempView("jsonWithComplexMap") - withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { checkAnswer( - sql("select `map`['a'].field1, `map`['c'].field2 from jsonWithComplexMap"), - Row(Seq(1, 2, 3, null), null) :: - Row(null, null) :: - Row(null, 4) :: - Row(null, 3) :: - Row(null, null) :: - Row(null, null) :: Nil + sql("select `map` from jsonWithComplexMap"), + Row(Map("a" -> Row(Seq(1, 2, 3, null), null))) :: + Row(Map("b" -> Row(null, 2))) :: + Row(Map("c" -> Row(Seq(), 4))) :: + Row(Map("c" -> Row(null, 3), "d" -> Row(Seq(null), null))) :: + Row(Map("e" -> null)) :: + Row(Map("f" -> Row(null, null))) :: Nil ) + + withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { + checkAnswer( + sql("select `map`['a'].field1, `map`['c'].field2 from jsonWithComplexMap"), + Row(Seq(1, 2, 3, null), null) :: + Row(null, null) :: + Row(null, 4) :: + Row(null, 3) :: + Row(null, null) :: + Row(null, null) :: Nil + ) + } } } test("SPARK-2096 Correctly parse dot notations") { - val jsonDF = spark.read.json(complexFieldAndType2) - jsonDF.createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = spark.read.json(complexFieldAndType2) + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql("select arrayOfStruct[0].field1, arrayOfStruct[0].field2 from jsonTable"), - Row(true, "str1") - ) - checkAnswer( - sql( - """ - |select complexArrayOfStruct[0].field1[1].inner2[0], complexArrayOfStruct[1].field2[0][1] - |from jsonTable - """.stripMargin), - Row("str2", 6) - ) + checkAnswer( + sql("select arrayOfStruct[0].field1, arrayOfStruct[0].field2 from jsonTable"), + Row(true, "str1") + ) + checkAnswer( + sql( + """ + |select complexArrayOfStruct[0].field1[1].inner2[0], + |complexArrayOfStruct[1].field2[0][1] + |from jsonTable + """.stripMargin), + Row("str2", 6) + ) + } } test("SPARK-3390 Complex arrays") { - val jsonDF = spark.read.json(complexFieldAndType2) - jsonDF.createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = spark.read.json(complexFieldAndType2) + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql( - """ - |select arrayOfArray1[0][0][0], arrayOfArray1[1][0][1], arrayOfArray1[1][1][0] - |from jsonTable - """.stripMargin), - Row(5, 7, 8) - ) - checkAnswer( - sql( - """ - |select arrayOfArray2[0][0][0].inner1, arrayOfArray2[1][0], - |arrayOfArray2[1][1][1].inner2[0], arrayOfArray2[2][0][0].inner3[0][0].inner4 - |from jsonTable - """.stripMargin), - Row("str1", Nil, "str4", 2) - ) + checkAnswer( + sql( + """ + |select 
arrayOfArray1[0][0][0], arrayOfArray1[1][0][1], arrayOfArray1[1][1][0] + |from jsonTable + """.stripMargin), + Row(5, 7, 8) + ) + checkAnswer( + sql( + """ + |select arrayOfArray2[0][0][0].inner1, arrayOfArray2[1][0], + |arrayOfArray2[1][1][1].inner2[0], arrayOfArray2[2][0][0].inner3[0][0].inner4 + |from jsonTable + """.stripMargin), + Row("str1", Nil, "str4", 2) + ) + } } test("SPARK-3308 Read top level JSON arrays") { - val jsonDF = spark.read.json(jsonArray) - jsonDF.createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = spark.read.json(jsonArray) + jsonDF.createOrReplaceTempView("jsonTable") - checkAnswer( - sql( - """ - |select a, b, c - |from jsonTable - """.stripMargin), - Row("str_a_1", null, null) :: - Row("str_a_2", null, null) :: - Row(null, "str_b_3", null) :: - Row("str_a_4", "str_b_4", "str_c_4") :: Nil - ) + checkAnswer( + sql( + """ + |select a, b, c + |from jsonTable + """.stripMargin), + Row("str_a_1", null, null) :: + Row("str_a_2", null, null) :: + Row(null, "str_b_3", null) :: + Row("str_a_4", "str_b_4", "str_c_4") :: Nil + ) + } } test("Corrupt records: FAILFAST mode") { @@ -1190,158 +1178,162 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } test("SPARK-4068: nulls in arrays") { - val jsonDF = spark.read.json(nullsInArrays) - jsonDF.createOrReplaceTempView("jsonTable") + withTempView("jsonTable") { + val jsonDF = spark.read.json(nullsInArrays) + jsonDF.createOrReplaceTempView("jsonTable") - val schema = StructType( - StructField("field1", - ArrayType(ArrayType(ArrayType(ArrayType(StringType, true), true), true), true), true) :: - StructField("field2", - ArrayType(ArrayType( - StructType(StructField("Test", LongType, true) :: Nil), true), true), true) :: - StructField("field3", - ArrayType(ArrayType( - StructType(StructField("Test", StringType, true) :: Nil), true), true), true) :: - StructField("field4", - ArrayType(ArrayType(ArrayType(LongType, true), true), true), true) :: Nil) + val schema = StructType( + StructField("field1", + ArrayType(ArrayType(ArrayType(ArrayType(StringType, true), true), true), true), true) :: + StructField("field2", + ArrayType(ArrayType( + StructType(StructField("Test", LongType, true) :: Nil), true), true), true) :: + StructField("field3", + ArrayType(ArrayType( + StructType(StructField("Test", StringType, true) :: Nil), true), true), true) :: + StructField("field4", + ArrayType(ArrayType(ArrayType(LongType, true), true), true), true) :: Nil) - assert(schema === jsonDF.schema) + assert(schema === jsonDF.schema) - checkAnswer( - sql( - """ - |SELECT field1, field2, field3, field4 - |FROM jsonTable - """.stripMargin), - Row(Seq(Seq(null), Seq(Seq(Seq("Test")))), null, null, null) :: - Row(null, Seq(null, Seq(Row(1))), null, null) :: - Row(null, null, Seq(Seq(null), Seq(Row("2"))), null) :: - Row(null, null, null, Seq(Seq(null, Seq(1, 2, 3)))) :: Nil - ) + checkAnswer( + sql( + """ + |SELECT field1, field2, field3, field4 + |FROM jsonTable + """.stripMargin), + Row(Seq(Seq(null), Seq(Seq(Seq("Test")))), null, null, null) :: + Row(null, Seq(null, Seq(Row(1))), null, null) :: + Row(null, null, Seq(Seq(null), Seq(Row("2"))), null) :: + Row(null, null, null, Seq(Seq(null, Seq(1, 2, 3)))) :: Nil + ) + } } test("SPARK-4228 DataFrame to JSON") { - val schema1 = StructType( - StructField("f1", IntegerType, false) :: - StructField("f2", StringType, false) :: - StructField("f3", BooleanType, false) :: - StructField("f4", ArrayType(StringType), nullable = true) :: - StructField("f5", 
IntegerType, true) :: Nil) - - val rowRDD1 = unparsedStrings.map { r => - val values = r.split(",").map(_.trim) - val v5 = try values(3).toInt catch { - case _: NumberFormatException => null + withTempView("applySchema1", "applySchema2", "primitiveTable", "complexTable") { + val schema1 = StructType( + StructField("f1", IntegerType, false) :: + StructField("f2", StringType, false) :: + StructField("f3", BooleanType, false) :: + StructField("f4", ArrayType(StringType), nullable = true) :: + StructField("f5", IntegerType, true) :: Nil) + + val rowRDD1 = unparsedStrings.map { r => + val values = r.split(",").map(_.trim) + val v5 = try values(3).toInt catch { + case _: NumberFormatException => null + } + Row(values(0).toInt, values(1), values(2).toBoolean, r.split(",").toList, v5) } - Row(values(0).toInt, values(1), values(2).toBoolean, r.split(",").toList, v5) - } - - val df1 = spark.createDataFrame(rowRDD1, schema1) - df1.createOrReplaceTempView("applySchema1") - val df2 = df1.toDF - val result = df2.toJSON.collect() - // scalastyle:off - assert(result(0) === "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") - assert(result(3) === "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") - // scalastyle:on - - val schema2 = StructType( - StructField("f1", StructType( - StructField("f11", IntegerType, false) :: - StructField("f12", BooleanType, false) :: Nil), false) :: - StructField("f2", MapType(StringType, IntegerType, true), false) :: Nil) - val rowRDD2 = unparsedStrings.map { r => - val values = r.split(",").map(_.trim) - val v4 = try values(3).toInt catch { - case _: NumberFormatException => null + val df1 = spark.createDataFrame(rowRDD1, schema1) + df1.createOrReplaceTempView("applySchema1") + val df2 = df1.toDF + val result = df2.toJSON.collect() + // scalastyle:off + assert(result(0) === "{\"f1\":1,\"f2\":\"A1\",\"f3\":true,\"f4\":[\"1\",\" A1\",\" true\",\" null\"]}") + assert(result(3) === "{\"f1\":4,\"f2\":\"D4\",\"f3\":true,\"f4\":[\"4\",\" D4\",\" true\",\" 2147483644\"],\"f5\":2147483644}") + // scalastyle:on + + val schema2 = StructType( + StructField("f1", StructType( + StructField("f11", IntegerType, false) :: + StructField("f12", BooleanType, false) :: Nil), false) :: + StructField("f2", MapType(StringType, IntegerType, true), false) :: Nil) + + val rowRDD2 = unparsedStrings.map { r => + val values = r.split(",").map(_.trim) + val v4 = try values(3).toInt catch { + case _: NumberFormatException => null + } + Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) } - Row(Row(values(0).toInt, values(2).toBoolean), Map(values(1) -> v4)) - } - val df3 = spark.createDataFrame(rowRDD2, schema2) - df3.createOrReplaceTempView("applySchema2") - val df4 = df3.toDF - val result2 = df4.toJSON.collect() + val df3 = spark.createDataFrame(rowRDD2, schema2) + df3.createOrReplaceTempView("applySchema2") + val df4 = df3.toDF + val result2 = df4.toJSON.collect() - assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") - assert(result2(3) === "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}") + assert(result2(1) === "{\"f1\":{\"f11\":2,\"f12\":false},\"f2\":{\"B2\":null}}") + assert(result2(3) === "{\"f1\":{\"f11\":4,\"f12\":true},\"f2\":{\"D4\":2147483644}}") - val jsonDF = spark.read.json(primitiveFieldAndType) - val primTable = spark.read.json(jsonDF.toJSON) - primTable.createOrReplaceTempView("primitiveTable") - checkAnswer( - sql("select * from 
primitiveTable"), - Row(new java.math.BigDecimal("92233720368547758070"), - true, - 1.7976931348623157, - 10, - 21474836470L, - "this is a simple string.") + val jsonDF = spark.read.json(primitiveFieldAndType) + val primTable = spark.read.json(jsonDF.toJSON) + primTable.createOrReplaceTempView("primitiveTable") + checkAnswer( + sql("select * from primitiveTable"), + Row(new java.math.BigDecimal("92233720368547758070"), + true, + 1.7976931348623157, + 10, + 21474836470L, + "this is a simple string.") ) - val complexJsonDF = spark.read.json(complexFieldAndType1) - val compTable = spark.read.json(complexJsonDF.toJSON) - compTable.createOrReplaceTempView("complexTable") - // Access elements of a primitive array. - checkAnswer( - sql("select arrayOfString[0], arrayOfString[1], arrayOfString[2] from complexTable"), - Row("str1", "str2", null) - ) + val complexJsonDF = spark.read.json(complexFieldAndType1) + val compTable = spark.read.json(complexJsonDF.toJSON) + compTable.createOrReplaceTempView("complexTable") + // Access elements of a primitive array. + checkAnswer( + sql("select arrayOfString[0], arrayOfString[1], arrayOfString[2] from complexTable"), + Row("str1", "str2", null) + ) - // Access an array of null values. - checkAnswer( - sql("select arrayOfNull from complexTable"), - Row(Seq(null, null, null, null)) - ) + // Access an array of null values. + checkAnswer( + sql("select arrayOfNull from complexTable"), + Row(Seq(null, null, null, null)) + ) - // Access elements of a BigInteger array (we use DecimalType internally). - checkAnswer( - sql("select arrayOfBigInteger[0], arrayOfBigInteger[1], arrayOfBigInteger[2] " + - " from complexTable"), - Row(new java.math.BigDecimal("922337203685477580700"), - new java.math.BigDecimal("-922337203685477580800"), null) - ) + // Access elements of a BigInteger array (we use DecimalType internally). + checkAnswer( + sql("select arrayOfBigInteger[0], arrayOfBigInteger[1], arrayOfBigInteger[2] " + + " from complexTable"), + Row(new java.math.BigDecimal("922337203685477580700"), + new java.math.BigDecimal("-922337203685477580800"), null) + ) - // Access elements of an array of arrays. - checkAnswer( - sql("select arrayOfArray1[0], arrayOfArray1[1] from complexTable"), - Row(Seq("1", "2", "3"), Seq("str1", "str2")) - ) + // Access elements of an array of arrays. + checkAnswer( + sql("select arrayOfArray1[0], arrayOfArray1[1] from complexTable"), + Row(Seq("1", "2", "3"), Seq("str1", "str2")) + ) - // Access elements of an array of arrays. - checkAnswer( - sql("select arrayOfArray2[0], arrayOfArray2[1] from complexTable"), - Row(Seq(1.0, 2.0, 3.0), Seq(1.1, 2.1, 3.1)) - ) + // Access elements of an array of arrays. + checkAnswer( + sql("select arrayOfArray2[0], arrayOfArray2[1] from complexTable"), + Row(Seq(1.0, 2.0, 3.0), Seq(1.1, 2.1, 3.1)) + ) - // Access elements of an array inside a filed with the type of ArrayType(ArrayType). - checkAnswer( - sql("select arrayOfArray1[1][1], arrayOfArray2[1][1] from complexTable"), - Row("str2", 2.1) - ) + // Access elements of an array inside a filed with the type of ArrayType(ArrayType). + checkAnswer( + sql("select arrayOfArray1[1][1], arrayOfArray2[1][1] from complexTable"), + Row("str2", 2.1) + ) - // Access a struct and fields inside of it. - checkAnswer( - sql("select struct, struct.field1, struct.field2 from complexTable"), - Row( - Row(true, new java.math.BigDecimal("92233720368547758070")), - true, - new java.math.BigDecimal("92233720368547758070")) :: Nil - ) + // Access a struct and fields inside of it. 
+ checkAnswer( + sql("select struct, struct.field1, struct.field2 from complexTable"), + Row( + Row(true, new java.math.BigDecimal("92233720368547758070")), + true, + new java.math.BigDecimal("92233720368547758070")) :: Nil + ) - // Access an array field of a struct. - checkAnswer( - sql("select structWithArrayFields.field1, structWithArrayFields.field2 from complexTable"), - Row(Seq(4, 5, 6), Seq("str1", "str2")) - ) + // Access an array field of a struct. + checkAnswer( + sql("select structWithArrayFields.field1, structWithArrayFields.field2 from complexTable"), + Row(Seq(4, 5, 6), Seq("str1", "str2")) + ) - // Access elements of an array field of a struct. - checkAnswer( - sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] " + - "from complexTable"), - Row(5, null) - ) + // Access elements of an array field of a struct. + checkAnswer( + sql("select structWithArrayFields.field1[1], structWithArrayFields.field2[3] " + + "from complexTable"), + Row(5, null) + ) + } } test("Dataset toJSON doesn't construct rdd") { @@ -1380,7 +1372,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson test("SPARK-6245 JsonInferSchema.infer on empty RDD") { // This is really a test that it doesn't throw an exception - val options = new JSONOptions(Map.empty[String, String], "GMT") + val options = new JSONOptions(Map.empty[String, String], "UTC") val emptySchema = new JsonInferSchema(options).infer( empty.rdd, CreateJacksonParser.string) @@ -1407,7 +1399,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } test("SPARK-8093 Erase empty structs") { - val options = new JSONOptions(Map.empty[String, String], "GMT") + val options = new JSONOptions(Map.empty[String, String], "UTC") val emptySchema = new JsonInferSchema(options).infer( emptyRecords.rdd, CreateJacksonParser.string) @@ -1422,20 +1414,21 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } withTempPath(root => { - val d1 = new File(root, "d1=1") - // root/dt=1/col1=abc - val p1_col1 = makePartition( - sparkContext.parallelize(2 to 5).map(i => s"""{"a": 1, "b": "str$i"}"""), - d1, - "col1", - "abc") - - // root/dt=1/col1=abd - val p2 = makePartition( - sparkContext.parallelize(6 to 10).map(i => s"""{"a": 1, "b": "str$i"}"""), - d1, - "col1", - "abd") + withTempView("test_myjson_with_part") { + val d1 = new File(root, "d1=1") + // root/dt=1/col1=abc + val p1_col1 = makePartition( + sparkContext.parallelize(2 to 5).map(i => s"""{"a": 1, "b": "str$i"}"""), + d1, + "col1", + "abc") + + // root/dt=1/col1=abd + val p2 = makePartition( + sparkContext.parallelize(6 to 10).map(i => s"""{"a": 1, "b": "str$i"}"""), + d1, + "col1", + "abd") spark.read.json(root.getAbsolutePath).createOrReplaceTempView("test_myjson_with_part") checkAnswer(sql( @@ -1444,9 +1437,111 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson "SELECT count(a) FROM test_myjson_with_part where d1 = 1 and col1='abd'"), Row(5)) checkAnswer(sql( "SELECT count(a) FROM test_myjson_with_part where d1 = 1"), Row(9)) + } }) } + test("backward compatibility") { + // This test we make sure our JSON support can read JSON data generated by previous version + // of Spark generated through toJSON method and JSON data source. + // The data is generated by the following program. + // Here are a few notes: + // - Spark 1.5.0 cannot save timestamp data. So, we manually added timestamp field (col13) + // in the JSON object. 
+ // - For Spark before 1.5.1, we do not generate UDTs. So, we manually added the UDT value to + // JSON objects generated by those Spark versions (col17). + // - If the type is NullType, we do not write data out. + + // Create the schema. + val struct = + StructType( + StructField("f1", FloatType, true) :: + StructField("f2", ArrayType(BooleanType), true) :: Nil) + + val dataTypes = + Seq( + StringType, BinaryType, NullType, BooleanType, + ByteType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DecimalType(6, 5), + DateType, TimestampType, + ArrayType(IntegerType), MapType(StringType, LongType), struct, + new MyDenseVectorUDT()) + val fields = dataTypes.zipWithIndex.map { case (dataType, index) => + StructField(s"col$index", dataType, nullable = true) + } + val schema = StructType(fields) + + val constantValues = + Seq( + "a string in binary".getBytes(StandardCharsets.UTF_8), + null, + true, + 1.toByte, + 2.toShort, + 3, + Long.MaxValue, + 0.25.toFloat, + 0.75, + new java.math.BigDecimal(s"1234.23456"), + new java.math.BigDecimal(s"1.23456"), + java.sql.Date.valueOf("2015-01-01"), + java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123"), + Seq(2, 3, 4), + Map("a string" -> 2000L), + Row(4.75.toFloat, Seq(false, true)), + new MyDenseVector(Array(0.25, 2.25, 4.25))) + val data = + Row.fromSeq(Seq("Spark " + spark.sparkContext.version) ++ constantValues) :: Nil + + // Data generated by previous versions. + // scalastyle:off + val existingJSONData = + """{"col0":"Spark 1.2.2","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.3.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.3.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.4.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.4.1","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.5.0","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"2015-01-01","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a 
string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: + """{"col0":"Spark 1.5.0","col1":"YSBzdHJpbmcgaW4gYmluYXJ5","col3":true,"col4":1,"col5":2,"col6":3,"col7":9223372036854775807,"col8":0.25,"col9":0.75,"col10":1234.23456,"col11":1.23456,"col12":"16436","col13":"2015-01-01 23:50:59.123","col14":[2,3,4],"col15":{"a string":2000},"col16":{"f1":4.75,"f2":[false,true]},"col17":[0.25,2.25,4.25]}""" :: Nil + // scalastyle:on + + // Generate data for the current version. + val df = spark.createDataFrame(spark.sparkContext.parallelize(data, 1), schema) + withTempPath { path => + df.write.format("json").mode("overwrite").save(path.getCanonicalPath) + + // df.toJSON will convert internal rows to external rows first and then generate + // JSON objects. While, df.write.format("json") will write internal rows directly. + val allJSON = + existingJSONData ++ + df.toJSON.collect() ++ + sparkContext.textFile(path.getCanonicalPath).collect() + + Utils.deleteRecursively(path) + sparkContext.parallelize(allJSON, 1).saveAsTextFile(path.getCanonicalPath) + + // Read data back with the schema specified. + val col0Values = + Seq( + "Spark 1.2.2", + "Spark 1.3.1", + "Spark 1.3.1", + "Spark 1.4.1", + "Spark 1.4.1", + "Spark 1.5.0", + "Spark 1.5.0", + "Spark " + spark.sparkContext.version, + "Spark " + spark.sparkContext.version) + val expectedResult = col0Values.map { v => + Row.fromSeq(Seq(v) ++ constantValues) + } + checkAnswer( + spark.read.format("json").schema(schema).load(path.getCanonicalPath), + expectedResult + ) + } + } + test("SPARK-11544 test pathfilter") { withTempPath { dir => val path = dir.getCanonicalPath @@ -1461,6 +1556,15 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson "mapreduce.input.pathFilter.class" -> classOf[TestFileFilter].getName ) assert(spark.read.options(extraOptions).json(path).count() === 2) + + withClue("SPARK-32621: 'path' option can cause issues while inferring schema") { + // During infer, "path" option is used again on top of the paths that have already been + // listed. When a partition is removed by TestFileFilter, this will cause a conflict while + // inferring partitions because the original path in the "path" option will list the + // partition directory that has been removed. + assert( + spark.read.options(extraOptions).format("json").option("path", path).load.count() === 2) + } } } @@ -1680,7 +1784,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson timestampsWithFormat.write .format("json") .option("timestampFormat", "yyyy/MM/dd HH:mm") - .option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + .option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .save(timestampsWithFormatPath) // This will load back the timestamps as string. 
@@ -1698,7 +1802,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson val readBack = spark.read .schema(customSchema) .option("timestampFormat", "yyyy/MM/dd HH:mm") - .option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + .option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .json(timestampsWithFormatPath) checkAnswer(readBack, timestampsWithFormat) @@ -2046,9 +2150,18 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson )(withTempPath { path => val ds = sampledTestData.coalesce(1) ds.write.text(path.getAbsolutePath) - val readback = spark.read.option("samplingRatio", 0.1).json(path.getCanonicalPath) - - assert(readback.schema == new StructType().add("f1", LongType)) + val readback1 = spark.read.option("samplingRatio", 0.1).json(path.getCanonicalPath) + assert(readback1.schema == new StructType().add("f1", LongType)) + + withClue("SPARK-32621: 'path' option can cause issues while inferring schema") { + // During infer, "path" option gets added again to the paths that have already been listed. + // This results in reading more data than necessary and causes different schema to be + // inferred when sampling ratio is involved. + val readback2 = spark.read + .option("samplingRatio", 0.1).option("path", path.getCanonicalPath) + .format("json").load + assert(readback2.schema == new StructType().add("f1", LongType)) + } }) } @@ -2142,9 +2255,8 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson .json(testFile(fileName)) .count() } - val errMsg = exception.getMessage - assert(errMsg.contains("Malformed records are detected in record parsing")) + assert(exception.getMessage.contains("Malformed records are detected in record parsing")) } def checkEncoding(expectedEncoding: String, pathToJsonFiles: String, @@ -2436,23 +2548,24 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } } - test("SPARK-25040: empty strings should be disallowed") { - def failedOnEmptyString(dataType: DataType): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - val errMessage = intercept[SparkException] { - df.collect() - }.getMessage - assert(errMessage.contains( - s"Failed to parse an empty string for data type ${dataType.catalogString}")) - } - def emptyString(dataType: DataType, expected: Any): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - checkAnswer(df, Row(expected) :: Nil) - } + private def failedOnEmptyString(dataType: DataType): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + val errMessage = intercept[SparkException] { + df.collect() + }.getMessage + assert(errMessage.contains( + s"Failed to parse an empty string for data type ${dataType.catalogString}")) + } + + private def emptyString(dataType: DataType, expected: Any): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + checkAnswer(df, Row(expected) :: Nil) + } + test("SPARK-25040: empty strings should be disallowed") { failedOnEmptyString(BooleanType) failedOnEmptyString(ByteType) failedOnEmptyString(ShortType) @@ -2471,6 +2584,36 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8)) } + test("SPARK-25040: allowing empty strings 
when legacy config is enabled") { + def emptyStringAsNull(dataType: DataType): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + checkAnswer(df, Row(null) :: Nil) + } + + // Legacy mode prior to Spark 3.0.0 + withSQLConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON.key -> "true") { + emptyStringAsNull(BooleanType) + emptyStringAsNull(ByteType) + emptyStringAsNull(ShortType) + emptyStringAsNull(IntegerType) + emptyStringAsNull(LongType) + + failedOnEmptyString(FloatType) + failedOnEmptyString(DoubleType) + failedOnEmptyString(TimestampType) + failedOnEmptyString(DateType) + + emptyStringAsNull(DecimalType.SYSTEM_DEFAULT) + emptyStringAsNull(ArrayType(IntegerType)) + emptyStringAsNull(MapType(StringType, IntegerType, true)) + emptyStringAsNull(StructType(StructField("f1", IntegerType, true) :: Nil)) + + emptyString(StringType, "") + emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8)) + } + } + test("return partial result for bad records") { val schema = "a double, b array, c string, _corrupt_record string" val badRecords = Seq( @@ -2485,7 +2628,9 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson } test("inferring timestamp type") { - def schemaOf(jsons: String*): StructType = spark.read.json(jsons.toDS).schema + def schemaOf(jsons: String*): StructType = { + spark.read.option("inferTimestamp", true).json(jsons.toDS).schema + } assert(schemaOf( """{"a":"2018-12-17T10:11:12.123-01:00"}""", @@ -2508,6 +2653,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson val timestampsWithFormatPath = s"${dir.getCanonicalPath}/timestampsWithFormat.json" val timestampsWithFormat = spark.read .option("timestampFormat", "dd/MM/yyyy HH:mm") + .option("inferTimestamp", true) .json(datesRecords) assert(timestampsWithFormat.schema === customSchema) @@ -2520,12 +2666,59 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson val readBack = spark.read .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .option(DateTimeUtils.TIMEZONE_OPTION, "UTC") + .option("inferTimestamp", true) .json(timestampsWithFormatPath) assert(readBack.schema === customSchema) checkAnswer(readBack, timestampsWithFormat) } } + + test("SPARK-30960, SPARK-31641: parse date/timestamp string with legacy format") { + val julianDay = -141704 // 1582-01-01 in Julian calendar + val ds = Seq( + s"{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': '12345', 'd3': '$julianDay'}" + ).toDS() + val json = spark.read.schema("t timestamp, d date, d2 date, d3 date").json(ds) + checkAnswer(json, Row( + Timestamp.valueOf("2020-1-12 3:23:34.12"), + Date.valueOf("2020-1-12"), + Date.valueOf(LocalDate.ofEpochDay(12345)), + Date.valueOf("1582-01-01"))) + } + + test("exception mode for parsing date/timestamp string") { + val ds = Seq("{'t': '2020-01-27T20:06:11.847-0800'}").toDS() + val json = spark.read + .schema("t timestamp") + .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSz") + .json(ds) + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "exception") { + val msg = intercept[SparkException] { + json.collect() + }.getCause.getMessage + assert(msg.contains("Fail to parse")) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "legacy") { + checkAnswer(json, Row(Timestamp.valueOf("2020-01-27 20:06:11.847"))) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "corrected") { + checkAnswer(json, Row(null)) + } + } + + test("SPARK-32810: JSON data source should 
be able to read files with " + + "escaped glob metacharacter in the paths") { + withTempDir { dir => + val basePath = dir.getCanonicalPath + // test JSON writer / reader without specifying schema + val jsonTableName = "{def}" + spark.range(3).coalesce(1).write.json(s"$basePath/$jsonTableName") + val readback = spark.read + .json(s"$basePath/${"""(\[|\]|\{|\})""".r.replaceAllIn(jsonTableName, """\\$1""")}") + assert(readback.collect sameElements Array(Row(0), Row(1), Row(2))) + } + } } class JsonV1Suite extends JsonSuite { @@ -2541,3 +2734,10 @@ class JsonV2Suite extends JsonSuite { .sparkConf .set(SQLConf.USE_V1_SOURCE_LIST, "") } + +class JsonLegacyTimeParserSuite extends JsonSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.LEGACY_TIME_PARSER_POLICY, "legacy") +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala index 719bf91e1786b..bfcef46339908 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReaderSuite.scala @@ -17,16 +17,29 @@ package org.apache.spark.sql.execution.datasources.orc +import java.io.File + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} +import org.apache.hadoop.mapreduce.lib.input.FileSplit +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.orc.TypeDescription import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector} import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.unsafe.types.UTF8String.fromString class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { + + import testImplicits._ + private val dataSchema = StructType.fromDDL("col1 int, col2 int") private val partitionSchema = StructType.fromDDL("p1 string, p2 string") private val partitionValues = InternalRow(fromString("partValue1"), fromString("partValue2")) @@ -77,4 +90,66 @@ class OrcColumnarBatchReaderSuite extends QueryTest with SharedSparkSession { assert(p1.getUTF8String(0) === partitionValues.getUTF8String(0)) } } + + test("SPARK-33593: partition column types") { + withTempPath { dir => + Seq(1).toDF().repartition(1).write.orc(dir.getCanonicalPath) + + val dataTypes = + Seq(StringType, BooleanType, ByteType, BinaryType, ShortType, IntegerType, LongType, + FloatType, DoubleType, DecimalType(25, 5), DateType, TimestampType) + + val constantValues = + Seq( + UTF8String.fromString("a string"), + true, + 1.toByte, + "Spark SQL".getBytes, + 2.toShort, + 3, + Long.MaxValue, + 0.25.toFloat, + 0.75D, + Decimal("1234.23456"), + DateTimeUtils.fromJavaDate(java.sql.Date.valueOf("2015-01-01")), + DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123"))) + + dataTypes.zip(constantValues).foreach { case 
(dt, v) => + val schema = StructType(StructField("col1", IntegerType) :: StructField("pcol", dt) :: Nil) + val partitionValues = new GenericInternalRow(Array(v)) + val file = new File(SpecificParquetRecordReaderBase.listDirectory(dir).get(0)) + val fileSplit = new FileSplit(new Path(file.getCanonicalPath), 0L, file.length, Array.empty) + val taskConf = sqlContext.sessionState.newHadoopConf() + val orcFileSchema = TypeDescription.fromString(schema.simpleString) + val vectorizedReader = new OrcColumnarBatchReader(4096) + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + val taskAttemptContext = new TaskAttemptContextImpl(taskConf, attemptId) + + try { + vectorizedReader.initialize(fileSplit, taskAttemptContext) + vectorizedReader.initBatch( + orcFileSchema, + schema.toArray, + Array(0, -1), + Array(-1, 0), + partitionValues) + vectorizedReader.nextKeyValue() + val row = vectorizedReader.getCurrentValue.getRow(0) + + // Use `GenericMutableRow` by explicitly copying rather than `ColumnarBatch` + // in order to use get(...) method which is not implemented in `ColumnarBatch`. + val actual = row.copy().get(1, dt) + val expected = v + if (dt.isInstanceOf[BinaryType]) { + assert(actual.asInstanceOf[Array[Byte]] + sameElements expected.asInstanceOf[Array[Byte]]) + } else { + assert(actual == expected) + } + } finally { + vectorizedReader.close() + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala index b8bf4b16fe53c..9caf0c836f711 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala @@ -595,22 +595,22 @@ abstract class OrcQueryTest extends OrcTest { } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { - val m1 = intercept[SparkException] { + val e1 = intercept[SparkException] { testIgnoreCorruptFiles() - }.getMessage - assert(m1.contains("Malformed ORC file")) - val m2 = intercept[SparkException] { + } + assert(e1.getMessage.contains("Malformed ORC file")) + val e2 = intercept[SparkException] { testIgnoreCorruptFilesWithoutSchemaInfer() - }.getMessage - assert(m2.contains("Malformed ORC file")) - val m3 = intercept[SparkException] { + } + assert(e2.getMessage.contains("Malformed ORC file")) + val e3 = intercept[SparkException] { testAllCorruptFiles() - }.getMessage - assert(m3.contains("Could not read footer for file")) - val m4 = intercept[SparkException] { + } + assert(e3.getMessage.contains("Could not read footer for file")) + val e4 = intercept[SparkException] { testAllCorruptFilesWithoutSchemaInfer() - }.getMessage - assert(m4.contains("Malformed ORC file")) + } + assert(e4.getMessage.contains("Malformed ORC file")) } } @@ -631,7 +631,7 @@ abstract class OrcQueryTest extends OrcTest { } } -class OrcQuerySuite extends OrcQueryTest with SharedSparkSession { +abstract class OrcQuerySuite extends OrcQueryTest with SharedSparkSession { import testImplicits._ test("LZO compression options for writing to an ORC file") { @@ -722,3 +722,10 @@ class OrcV1QuerySuite extends OrcQuerySuite { .sparkConf .set(SQLConf.USE_V1_SOURCE_LIST, "orc") } + +class OrcV2QuerySuite extends OrcQuerySuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") +} diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 1e27593584786..5fa23a40e677d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.orc import java.io.File import java.nio.charset.StandardCharsets.UTF_8 -import java.sql.Timestamp +import java.sql.{Date, Timestamp} import java.util.Locale import org.apache.hadoop.conf.Configuration @@ -32,7 +32,7 @@ import org.apache.orc.impl.RecordReaderImpl import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SPARK_VERSION_SHORT, SparkException} -import org.apache.spark.sql.{Row, SPARK_VERSION_METADATA_KEY} +import org.apache.spark.sql.{FakeFileSystemRequiringDSOption, Row, SPARK_VERSION_METADATA_KEY} import org.apache.spark.sql.execution.datasources.SchemaMergeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -213,9 +213,7 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val schema = SchemaMergeUtils.mergeSchemasInParallel( - spark, - fileStatuses, - schemaReader) + spark, Map.empty, fileStatuses, schemaReader) assert(schema.isDefined) assert(schema.get == StructType(Seq( @@ -482,6 +480,79 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } } + + test("SPARK-31238: compatibility with Spark 2.4 in reading dates") { + Seq(false, true).foreach { vectorized => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + checkAnswer( + readResourceOrcFile("test-data/before_1582_date_v2_4.snappy.orc"), + Row(java.sql.Date.valueOf("1200-01-01"))) + } + } + } + + test("SPARK-31238, SPARK-31423: rebasing dates in write") { + withTempPath { dir => + val path = dir.getAbsolutePath + Seq("1001-01-01", "1582-10-10").toDF("dateS") + .select($"dateS".cast("date").as("date")) + .write + .orc(path) + + Seq(false, true).foreach { vectorized => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + checkAnswer( + spark.read.orc(path), + Seq(Row(Date.valueOf("1001-01-01")), Row(Date.valueOf("1582-10-15")))) + } + } + } + } + + test("SPARK-31284: compatibility with Spark 2.4 in reading timestamps") { + Seq(false, true).foreach { vectorized => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + checkAnswer( + readResourceOrcFile("test-data/before_1582_ts_v2_4.snappy.orc"), + Row(java.sql.Timestamp.valueOf("1001-01-01 01:02:03.123456"))) + } + } + } + + test("SPARK-31284, SPARK-31423: rebasing timestamps in write") { + withTempPath { dir => + val path = dir.getAbsolutePath + Seq("1001-01-01 01:02:03.123456", "1582-10-10 11:12:13.654321").toDF("tsS") + .select($"tsS".cast("timestamp").as("ts")) + .write + .orc(path) + + Seq(false, true).foreach { vectorized => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + checkAnswer( + spark.read.orc(path), + Seq( + Row(java.sql.Timestamp.valueOf("1001-01-01 01:02:03.123456")), + Row(java.sql.Timestamp.valueOf("1582-10-15 11:12:13.654321")))) + } + } + } + } + + test("SPARK-33094: should propagate Hadoop config from DS options to underlying file system") { + withSQLConf( + "fs.file.impl" 
-> classOf[FakeFileSystemRequiringDSOption].getName, + "fs.file.impl.disable.cache" -> "true") { + Seq(false, true).foreach { mergeSchema => + withTempPath { dir => + val path = dir.getAbsolutePath + val conf = Map("ds_option" -> "value", "mergeSchema" -> mergeSchema.toString) + spark.range(1).write.options(conf).orc(path) + checkAnswer(spark.read.options(conf).orc(path), Row(0)) + } + } + } + } } class OrcSourceSuite extends OrcSuite with SharedSparkSession { @@ -531,4 +602,10 @@ class OrcSourceSuite extends OrcSuite with SharedSparkSession { test("SPARK-11412 read and merge orc schemas in parallel") { testMergeSchemasInParallel(OrcUtils.readOrcSchemasInParallel) } + + test("SPARK-31580: Read a file written before ORC-569") { + // Test ORC file came from ORC-621 + val df = readResourceOrcFile("test-data/TestStringDictionary.testRowIndex.orc") + assert(df.where("str < 'row 001000'").count() === 1000) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala index 388744bd0fd6e..e929f904c798d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala @@ -22,6 +22,7 @@ import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ @@ -133,4 +134,13 @@ abstract class OrcTest extends QueryTest with FileBasedDataSourceTest with Befor throw new AnalysisException("Can not match OrcTable in the query.") } } + + protected def readResourceOrcFile(name: String): DataFrame = { + val url = Thread.currentThread().getContextClassLoader.getResource(name) + // Copy to avoid URISyntaxException when `sql/hive` accesses the resources in `sql/core` + val file = File.createTempFile("orc-test", ".orc") + file.deleteOnExit(); + FileUtils.copyURLToFile(url, file) + spark.read.orc(file.getAbsolutePath) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala index 6d681afd23b18..fbfedf02dc87e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala @@ -42,7 +42,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) reader.initialize(file.asInstanceOf[String], null) val batch = reader.resultBatch() assert(reader.nextBatch()) @@ -69,7 +69,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSparkSess val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) reader.initialize(file.asInstanceOf[String], null) val batch = reader.resultBatch() assert(reader.nextBatch()) @@ -100,7 +100,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest 
with SharedSparkSess val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) reader.initialize(file, null /* set columns to null to project all columns */) val column = reader.resultBatch().column(0) assert(reader.nextBatch()) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 286bb1e920266..307234dcb86e9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -20,6 +20,10 @@ package org.apache.spark.sql.execution.datasources.parquet import java.math.{BigDecimal => JBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} +import java.time.{LocalDate, LocalDateTime, ZoneId} + +import scala.reflect.ClassTag +import scala.reflect.runtime.universe.TypeTag import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operators} import org.apache.parquet.filter2.predicate.FilterApi._ @@ -32,7 +36,8 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.parseColumnPath +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation, PushableColumnAndNestedColumn} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan import org.apache.spark.sql.functions._ @@ -40,6 +45,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ +import org.apache.spark.tags.ExtendedSQLTest import org.apache.spark.util.{AccumulatorContext, AccumulatorV2} /** @@ -103,61 +109,109 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared checkFilterPredicate(predicate, filterClass, Seq(Row(expected)))(df) } - private def checkBinaryFilterPredicate - (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) - (implicit df: DataFrame): Unit = { - def checkBinaryAnswer(df: DataFrame, expected: Seq[Row]) = { - assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).sorted) { - df.rdd.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted + /** + * Takes a sequence of products `data` to generate multi-level nested + * dataframes as new test data. It tests both non-nested and nested dataframes + * which are written and read back with Parquet datasource. + * + * This is different from [[ParquetTest.withParquetDataFrame]] which does not + * test nested cases. 
+ */ + private def withNestedParquetDataFrame[T <: Product: ClassTag: TypeTag](data: Seq[T]) + (runTest: (DataFrame, String, Any => Any) => Unit): Unit = + withNestedParquetDataFrame(spark.createDataFrame(data))(runTest) + + private def withNestedParquetDataFrame(inputDF: DataFrame) + (runTest: (DataFrame, String, Any => Any) => Unit): Unit = { + assert(inputDF.schema.fields.length == 1) + assert(!inputDF.schema.fields.head.dataType.isInstanceOf[StructType]) + val df = inputDF.toDF("temp") + Seq( + ( + df.withColumnRenamed("temp", "a"), + "a", // zero nesting + (x: Any) => x), + ( + df.withColumn("a", struct(df("temp") as "b")).drop("temp"), + "a.b", // one level nesting + (x: Any) => Row(x)), + ( + df.withColumn("a", struct(struct(df("temp") as "c") as "b")).drop("temp"), + "a.b.c", // two level nesting + (x: Any) => Row(Row(x)) + ), + ( + df.withColumnRenamed("temp", "a.b"), + "`a.b`", // zero nesting with column name containing `dots` + (x: Any) => x + ), + ( + df.withColumn("a.b", struct(df("temp") as "c.d") ).drop("temp"), + "`a.b`.`c.d`", // one level nesting with column names containing `dots` + (x: Any) => Row(x) + ) + ).foreach { case (newDF, colName, resultFun) => + withTempPath { file => + newDF.write.format(dataSourceName).save(file.getCanonicalPath) + readParquetFile(file.getCanonicalPath) { df => runTest(df, colName, resultFun) } } } - - checkFilterPredicate(df, predicate, filterClass, checkBinaryAnswer _, expected) - } - - private def checkBinaryFilterPredicate - (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Array[Byte]) - (implicit df: DataFrame): Unit = { - checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(df) } - private def testTimestampPushdown(data: Seq[Timestamp]): Unit = { + private def testTimestampPushdown(data: Seq[String], java8Api: Boolean): Unit = { + implicit class StringToTs(s: String) { + def ts: Timestamp = Timestamp.valueOf(s) + } assert(data.size === 4) val ts1 = data.head val ts2 = data(1) val ts3 = data(2) val ts4 = data(3) - withParquetDataFrame(data.map(i => Tuple1(i))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], data.map(i => Row.apply(i))) + import testImplicits._ + val df = data.map(i => Tuple1(Timestamp.valueOf(i))).toDF() + withNestedParquetDataFrame(df) { case (parquetDF, colName, fun) => + implicit val df: DataFrame = parquetDF + + def resultFun(tsStr: String): Any = { + val parsed = if (java8Api) { + LocalDateTime.parse(tsStr.replace(" ", "T")) + .atZone(ZoneId.systemDefault()) + .toInstant + } else { + Timestamp.valueOf(tsStr) + } + fun(parsed) + } - checkFilterPredicate('_1 === ts1, classOf[Eq[_]], ts1) - checkFilterPredicate('_1 <=> ts1, classOf[Eq[_]], ts1) - checkFilterPredicate('_1 =!= ts1, classOf[NotEq[_]], - Seq(ts2, ts3, ts4).map(i => Row.apply(i))) + val tsAttr = df(colName).expr + assert(df(colName).expr.dataType === TimestampType) - checkFilterPredicate('_1 < ts2, classOf[Lt[_]], ts1) - checkFilterPredicate('_1 > ts1, classOf[Gt[_]], Seq(ts2, ts3, ts4).map(i => Row.apply(i))) - checkFilterPredicate('_1 <= ts1, classOf[LtEq[_]], ts1) - checkFilterPredicate('_1 >= ts4, classOf[GtEq[_]], ts4) + checkFilterPredicate(tsAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(tsAttr.isNotNull, classOf[NotEq[_]], + data.map(i => Row.apply(resultFun(i)))) - checkFilterPredicate(Literal(ts1) === '_1, classOf[Eq[_]], ts1) - checkFilterPredicate(Literal(ts1) <=> '_1, classOf[Eq[_]], 
ts1) - checkFilterPredicate(Literal(ts2) > '_1, classOf[Lt[_]], ts1) - checkFilterPredicate(Literal(ts3) < '_1, classOf[Gt[_]], ts4) - checkFilterPredicate(Literal(ts1) >= '_1, classOf[LtEq[_]], ts1) - checkFilterPredicate(Literal(ts4) <= '_1, classOf[GtEq[_]], ts4) + checkFilterPredicate(tsAttr === ts1.ts, classOf[Eq[_]], resultFun(ts1)) + checkFilterPredicate(tsAttr <=> ts1.ts, classOf[Eq[_]], resultFun(ts1)) + checkFilterPredicate(tsAttr =!= ts1.ts, classOf[NotEq[_]], + Seq(ts2, ts3, ts4).map(i => Row.apply(resultFun(i)))) - checkFilterPredicate(!('_1 < ts4), classOf[GtEq[_]], ts4) - checkFilterPredicate('_1 < ts2 || '_1 > ts3, classOf[Operators.Or], Seq(Row(ts1), Row(ts4))) - } - } + checkFilterPredicate(tsAttr < ts2.ts, classOf[Lt[_]], resultFun(ts1)) + checkFilterPredicate(tsAttr > ts1.ts, classOf[Gt[_]], + Seq(ts2, ts3, ts4).map(i => Row.apply(resultFun(i)))) + checkFilterPredicate(tsAttr <= ts1.ts, classOf[LtEq[_]], resultFun(ts1)) + checkFilterPredicate(tsAttr >= ts4.ts, classOf[GtEq[_]], resultFun(ts4)) - private def testDecimalPushDown(data: DataFrame)(f: DataFrame => Unit): Unit = { - withTempPath { file => - data.write.parquet(file.getCanonicalPath) - readParquetFile(file.toString)(f) + checkFilterPredicate(Literal(ts1.ts) === tsAttr, classOf[Eq[_]], resultFun(ts1)) + checkFilterPredicate(Literal(ts1.ts) <=> tsAttr, classOf[Eq[_]], resultFun(ts1)) + checkFilterPredicate(Literal(ts2.ts) > tsAttr, classOf[Lt[_]], resultFun(ts1)) + checkFilterPredicate(Literal(ts3.ts) < tsAttr, classOf[Gt[_]], resultFun(ts4)) + checkFilterPredicate(Literal(ts1.ts) >= tsAttr, classOf[LtEq[_]], resultFun(ts1)) + checkFilterPredicate(Literal(ts4.ts) <= tsAttr, classOf[GtEq[_]], resultFun(ts4)) + + checkFilterPredicate(!(tsAttr < ts4.ts), classOf[GtEq[_]], resultFun(ts4)) + checkFilterPredicate(tsAttr < ts2.ts || tsAttr > ts3.ts, classOf[Operators.Or], + Seq(Row(resultFun(ts1)), Row(resultFun(ts4)))) } } @@ -187,201 +241,265 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } test("filter pushdown - boolean") { - withParquetDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) + val data = (true :: false :: Nil).map(b => Tuple1.apply(Option(b))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val booleanAttr = df(colName).expr + assert(df(colName).expr.dataType === BooleanType) - checkFilterPredicate('_1 === true, classOf[Eq[_]], true) - checkFilterPredicate('_1 <=> true, classOf[Eq[_]], true) - checkFilterPredicate('_1 =!= true, classOf[NotEq[_]], false) + checkFilterPredicate(booleanAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(booleanAttr.isNotNull, classOf[NotEq[_]], + Seq(Row(resultFun(true)), Row(resultFun(false)))) + + checkFilterPredicate(booleanAttr === true, classOf[Eq[_]], resultFun(true)) + checkFilterPredicate(booleanAttr <=> true, classOf[Eq[_]], resultFun(true)) + checkFilterPredicate(booleanAttr =!= true, classOf[NotEq[_]], resultFun(false)) } } test("filter pushdown - tinyint") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toByte)))) { implicit df => - assert(df.schema.head.dataType === ByteType) - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - 
checkFilterPredicate('_1 === 1.toByte, classOf[Eq[_]], 1) - checkFilterPredicate('_1 <=> 1.toByte, classOf[Eq[_]], 1) - checkFilterPredicate('_1 =!= 1.toByte, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 < 2.toByte, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3.toByte, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1.toByte, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4.toByte, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1.toByte) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1.toByte) <=> '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2.toByte) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3.toByte) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1.toByte) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4.toByte) <= '_1, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('_1 < 4.toByte), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 < 2.toByte || '_1 > 3.toByte, - classOf[Operators.Or], Seq(Row(1), Row(4))) + val data = (1 to 4).map(i => Tuple1(Option(i.toByte))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val tinyIntAttr = df(colName).expr + assert(df(colName).expr.dataType === ByteType) + + checkFilterPredicate(tinyIntAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(tinyIntAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(tinyIntAttr === 1.toByte, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(tinyIntAttr <=> 1.toByte, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(tinyIntAttr =!= 1.toByte, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(tinyIntAttr < 2.toByte, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(tinyIntAttr > 3.toByte, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(tinyIntAttr <= 1.toByte, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(tinyIntAttr >= 4.toByte, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1.toByte) === tinyIntAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1.toByte) <=> tinyIntAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2.toByte) > tinyIntAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3.toByte) < tinyIntAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1.toByte) >= tinyIntAttr, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(Literal(4.toByte) <= tinyIntAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(tinyIntAttr < 4.toByte), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(tinyIntAttr < 2.toByte || tinyIntAttr > 3.toByte, + classOf[Operators.Or], Seq(Row(resultFun(1)), Row(resultFun(4)))) } } test("filter pushdown - smallint") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit df => - assert(df.schema.head.dataType === ShortType) - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 === 1.toShort, classOf[Eq[_]], 1) - checkFilterPredicate('_1 <=> 1.toShort, classOf[Eq[_]], 1) - checkFilterPredicate('_1 =!= 1.toShort, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 < 2.toShort, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3.toShort, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1.toShort, classOf[LtEq[_]], 1) - 
checkFilterPredicate('_1 >= 4.toShort, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1.toShort) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1.toShort) <=> '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2.toShort) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3.toShort) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1.toShort) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4.toShort) <= '_1, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('_1 < 4.toShort), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 < 2.toShort || '_1 > 3.toShort, - classOf[Operators.Or], Seq(Row(1), Row(4))) + val data = (1 to 4).map(i => Tuple1(Option(i.toShort))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val smallIntAttr = df(colName).expr + assert(df(colName).expr.dataType === ShortType) + + checkFilterPredicate(smallIntAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(smallIntAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(smallIntAttr === 1.toShort, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(smallIntAttr <=> 1.toShort, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(smallIntAttr =!= 1.toShort, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(smallIntAttr < 2.toShort, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(smallIntAttr > 3.toShort, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(smallIntAttr <= 1.toShort, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(smallIntAttr >= 4.toShort, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1.toShort) === smallIntAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1.toShort) <=> smallIntAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2.toShort) > smallIntAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3.toShort) < smallIntAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1.toShort) >= smallIntAttr, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(Literal(4.toShort) <= smallIntAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(smallIntAttr < 4.toShort), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(smallIntAttr < 2.toShort || smallIntAttr > 3.toShort, + classOf[Operators.Or], Seq(Row(resultFun(1)), Row(resultFun(4)))) } } test("filter pushdown - integer") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 =!= 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('_1 < 
4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + val data = (1 to 4).map(i => Tuple1(Option(i))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val intAttr = df(colName).expr + assert(df(colName).expr.dataType === IntegerType) + + checkFilterPredicate(intAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(intAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(intAttr === 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(intAttr <=> 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(intAttr =!= 1, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(intAttr < 2, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(intAttr > 3, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(intAttr <= 1, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(intAttr >= 4, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1) === intAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1) <=> intAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2) > intAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3) < intAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1) >= intAttr, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(Literal(4) <= intAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(intAttr < 4), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(intAttr < 2 || intAttr > 3, classOf[Operators.Or], + Seq(Row(resultFun(1)), Row(resultFun(4)))) } } test("filter pushdown - long") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 =!= 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + val data = (1 to 4).map(i => Tuple1(Option(i.toLong))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val longAttr = df(colName).expr + assert(df(colName).expr.dataType === LongType) + + checkFilterPredicate(longAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(longAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(longAttr === 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(longAttr <=> 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(longAttr =!= 1, classOf[NotEq[_]], + (2 to 4).map(i 
=> Row.apply(resultFun(i)))) + + checkFilterPredicate(longAttr < 2, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(longAttr > 3, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(longAttr <= 1, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(longAttr >= 4, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1) === longAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1) <=> longAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2) > longAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3) < longAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1) >= longAttr, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(Literal(4) <= longAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(longAttr < 4), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(longAttr < 2 || longAttr > 3, classOf[Operators.Or], + Seq(Row(resultFun(1)), Row(resultFun(4)))) } } test("filter pushdown - float") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 =!= 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + val data = (1 to 4).map(i => Tuple1(Option(i.toFloat))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val floatAttr = df(colName).expr + assert(df(colName).expr.dataType === FloatType) + + checkFilterPredicate(floatAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(floatAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(floatAttr === 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(floatAttr <=> 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(floatAttr =!= 1, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(floatAttr < 2, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(floatAttr > 3, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(floatAttr <= 1, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(floatAttr >= 4, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1) === floatAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1) <=> floatAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2) > floatAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3) < floatAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1) >= floatAttr, classOf[LtEq[_]], resultFun(1)) + 
checkFilterPredicate(Literal(4) <= floatAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(floatAttr < 4), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(floatAttr < 2 || floatAttr > 3, classOf[Operators.Or], + Seq(Row(resultFun(1)), Row(resultFun(4)))) } } test("filter pushdown - double") { - withParquetDataFrame((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 <=> 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 =!= 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1) <=> '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + val data = (1 to 4).map(i => Tuple1(Option(i.toDouble))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val doubleAttr = df(colName).expr + assert(df(colName).expr.dataType === DoubleType) + + checkFilterPredicate(doubleAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(doubleAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(doubleAttr === 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(doubleAttr <=> 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(doubleAttr =!= 1, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(doubleAttr < 2, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(doubleAttr > 3, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(doubleAttr <= 1, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(doubleAttr >= 4, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1) === doubleAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1) <=> doubleAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2) > doubleAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3) < doubleAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1) >= doubleAttr, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(Literal(4) <= doubleAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(doubleAttr < 4), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(doubleAttr < 2 || doubleAttr > 3, classOf[Operators.Or], + Seq(Row(resultFun(1)), Row(resultFun(4)))) } } test("filter pushdown - string") { - withParquetDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate( - '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) - - checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") - checkFilterPredicate('_1 <=> "1", classOf[Eq[_]], "1") - 
checkFilterPredicate( - '_1 =!= "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) - - checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") - checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") - checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") - checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") - - checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") - checkFilterPredicate(Literal("1") <=> '_1, classOf[Eq[_]], "1") - checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") - checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") - checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") - checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") - - checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") - checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) + val data = (1 to 4).map(i => Tuple1(Option(i.toString))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val stringAttr = df(colName).expr + assert(df(colName).expr.dataType === StringType) + + checkFilterPredicate(stringAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(stringAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i.toString)))) + + checkFilterPredicate(stringAttr === "1", classOf[Eq[_]], resultFun("1")) + checkFilterPredicate(stringAttr <=> "1", classOf[Eq[_]], resultFun("1")) + checkFilterPredicate(stringAttr =!= "1", classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i.toString)))) + + checkFilterPredicate(stringAttr < "2", classOf[Lt[_]], resultFun("1")) + checkFilterPredicate(stringAttr > "3", classOf[Gt[_]], resultFun("4")) + checkFilterPredicate(stringAttr <= "1", classOf[LtEq[_]], resultFun("1")) + checkFilterPredicate(stringAttr >= "4", classOf[GtEq[_]], resultFun("4")) + + checkFilterPredicate(Literal("1") === stringAttr, classOf[Eq[_]], resultFun("1")) + checkFilterPredicate(Literal("1") <=> stringAttr, classOf[Eq[_]], resultFun("1")) + checkFilterPredicate(Literal("2") > stringAttr, classOf[Lt[_]], resultFun("1")) + checkFilterPredicate(Literal("3") < stringAttr, classOf[Gt[_]], resultFun("4")) + checkFilterPredicate(Literal("1") >= stringAttr, classOf[LtEq[_]], resultFun("1")) + checkFilterPredicate(Literal("4") <= stringAttr, classOf[GtEq[_]], resultFun("4")) + + checkFilterPredicate(!(stringAttr < "4"), classOf[GtEq[_]], resultFun("4")) + checkFilterPredicate(stringAttr < "2" || stringAttr > "3", classOf[Operators.Or], + Seq(Row(resultFun("1")), Row(resultFun("4")))) } } @@ -390,32 +508,38 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared def b: Array[Byte] = int.toString.getBytes(StandardCharsets.UTF_8) } - withParquetDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => - checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) - checkBinaryFilterPredicate('_1 <=> 1.b, classOf[Eq[_]], 1.b) + val data = (1 to 4).map(i => Tuple1(Option(i.b))) + withNestedParquetDataFrame(data) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF - checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkBinaryFilterPredicate( - '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.b)).toSeq) + val binaryAttr: Expression = df(colName).expr + assert(df(colName).expr.dataType === BinaryType) - checkBinaryFilterPredicate( - '_1 =!= 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) + 
checkFilterPredicate(binaryAttr === 1.b, classOf[Eq[_]], resultFun(1.b)) + checkFilterPredicate(binaryAttr <=> 1.b, classOf[Eq[_]], resultFun(1.b)) - checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt[_]], 1.b) - checkBinaryFilterPredicate('_1 > 3.b, classOf[Gt[_]], 4.b) - checkBinaryFilterPredicate('_1 <= 1.b, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) + checkFilterPredicate(binaryAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(binaryAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i.b)))) - checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) - checkBinaryFilterPredicate(Literal(1.b) <=> '_1, classOf[Eq[_]], 1.b) - checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) - checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) - checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) + checkFilterPredicate(binaryAttr =!= 1.b, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i.b)))) - checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate( - '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) + checkFilterPredicate(binaryAttr < 2.b, classOf[Lt[_]], resultFun(1.b)) + checkFilterPredicate(binaryAttr > 3.b, classOf[Gt[_]], resultFun(4.b)) + checkFilterPredicate(binaryAttr <= 1.b, classOf[LtEq[_]], resultFun(1.b)) + checkFilterPredicate(binaryAttr >= 4.b, classOf[GtEq[_]], resultFun(4.b)) + + checkFilterPredicate(Literal(1.b) === binaryAttr, classOf[Eq[_]], resultFun(1.b)) + checkFilterPredicate(Literal(1.b) <=> binaryAttr, classOf[Eq[_]], resultFun(1.b)) + checkFilterPredicate(Literal(2.b) > binaryAttr, classOf[Lt[_]], resultFun(1.b)) + checkFilterPredicate(Literal(3.b) < binaryAttr, classOf[Gt[_]], resultFun(4.b)) + checkFilterPredicate(Literal(1.b) >= binaryAttr, classOf[LtEq[_]], resultFun(1.b)) + checkFilterPredicate(Literal(4.b) <= binaryAttr, classOf[GtEq[_]], resultFun(4.b)) + + checkFilterPredicate(!(binaryAttr < 4.b), classOf[GtEq[_]], resultFun(4.b)) + checkFilterPredicate(binaryAttr < 2.b || binaryAttr > 3.b, classOf[Operators.Or], + Seq(Row(resultFun(1.b)), Row(resultFun(4.b)))) } } @@ -425,70 +549,107 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } val data = Seq("2018-03-18", "2018-03-19", "2018-03-20", "2018-03-21") + import testImplicits._ - withParquetDataFrame(data.map(i => Tuple1(i.date))) { implicit df => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], data.map(i => Row.apply(i.date))) - - checkFilterPredicate('_1 === "2018-03-18".date, classOf[Eq[_]], "2018-03-18".date) - checkFilterPredicate('_1 <=> "2018-03-18".date, classOf[Eq[_]], "2018-03-18".date) - checkFilterPredicate('_1 =!= "2018-03-18".date, classOf[NotEq[_]], - Seq("2018-03-19", "2018-03-20", "2018-03-21").map(i => Row.apply(i.date))) - - checkFilterPredicate('_1 < "2018-03-19".date, classOf[Lt[_]], "2018-03-18".date) - checkFilterPredicate('_1 > "2018-03-20".date, classOf[Gt[_]], "2018-03-21".date) - checkFilterPredicate('_1 <= "2018-03-18".date, classOf[LtEq[_]], "2018-03-18".date) - checkFilterPredicate('_1 >= "2018-03-21".date, classOf[GtEq[_]], "2018-03-21".date) + Seq(false, true).foreach { java8Api => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString) { + val dates = data.map(i => 
Tuple1(Date.valueOf(i))).toDF() + withNestedParquetDataFrame(dates) { case (inputDF, colName, fun) => + implicit val df: DataFrame = inputDF - checkFilterPredicate( - Literal("2018-03-18".date) === '_1, classOf[Eq[_]], "2018-03-18".date) - checkFilterPredicate( - Literal("2018-03-18".date) <=> '_1, classOf[Eq[_]], "2018-03-18".date) - checkFilterPredicate( - Literal("2018-03-19".date) > '_1, classOf[Lt[_]], "2018-03-18".date) - checkFilterPredicate( - Literal("2018-03-20".date) < '_1, classOf[Gt[_]], "2018-03-21".date) - checkFilterPredicate( - Literal("2018-03-18".date) >= '_1, classOf[LtEq[_]], "2018-03-18".date) - checkFilterPredicate( - Literal("2018-03-21".date) <= '_1, classOf[GtEq[_]], "2018-03-21".date) + def resultFun(dateStr: String): Any = { + val parsed = if (java8Api) LocalDate.parse(dateStr) else Date.valueOf(dateStr) + fun(parsed) + } - checkFilterPredicate(!('_1 < "2018-03-21".date), classOf[GtEq[_]], "2018-03-21".date) - checkFilterPredicate( - '_1 < "2018-03-19".date || '_1 > "2018-03-20".date, - classOf[Operators.Or], - Seq(Row("2018-03-18".date), Row("2018-03-21".date))) + val dateAttr: Expression = df(colName).expr + assert(df(colName).expr.dataType === DateType) + + checkFilterPredicate(dateAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(dateAttr.isNotNull, classOf[NotEq[_]], + data.map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(dateAttr === "2018-03-18".date, classOf[Eq[_]], + resultFun("2018-03-18")) + checkFilterPredicate(dateAttr <=> "2018-03-18".date, classOf[Eq[_]], + resultFun("2018-03-18")) + checkFilterPredicate(dateAttr =!= "2018-03-18".date, classOf[NotEq[_]], + Seq("2018-03-19", "2018-03-20", "2018-03-21").map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(dateAttr < "2018-03-19".date, classOf[Lt[_]], + resultFun("2018-03-18")) + checkFilterPredicate(dateAttr > "2018-03-20".date, classOf[Gt[_]], + resultFun("2018-03-21")) + checkFilterPredicate(dateAttr <= "2018-03-18".date, classOf[LtEq[_]], + resultFun("2018-03-18")) + checkFilterPredicate(dateAttr >= "2018-03-21".date, classOf[GtEq[_]], + resultFun("2018-03-21")) + + checkFilterPredicate(Literal("2018-03-18".date) === dateAttr, classOf[Eq[_]], + resultFun("2018-03-18")) + checkFilterPredicate(Literal("2018-03-18".date) <=> dateAttr, classOf[Eq[_]], + resultFun("2018-03-18")) + checkFilterPredicate(Literal("2018-03-19".date) > dateAttr, classOf[Lt[_]], + resultFun("2018-03-18")) + checkFilterPredicate(Literal("2018-03-20".date) < dateAttr, classOf[Gt[_]], + resultFun("2018-03-21")) + checkFilterPredicate(Literal("2018-03-18".date) >= dateAttr, classOf[LtEq[_]], + resultFun("2018-03-18")) + checkFilterPredicate(Literal("2018-03-21".date) <= dateAttr, classOf[GtEq[_]], + resultFun("2018-03-21")) + + checkFilterPredicate(!(dateAttr < "2018-03-21".date), classOf[GtEq[_]], + resultFun("2018-03-21")) + checkFilterPredicate( + dateAttr < "2018-03-19".date || dateAttr > "2018-03-20".date, + classOf[Operators.Or], + Seq(Row(resultFun("2018-03-18")), Row(resultFun("2018-03-21")))) + } + } } } test("filter pushdown - timestamp") { - // spark.sql.parquet.outputTimestampType = TIMESTAMP_MILLIS - val millisData = Seq(Timestamp.valueOf("2018-06-14 08:28:53.123"), - Timestamp.valueOf("2018-06-15 08:28:53.123"), - Timestamp.valueOf("2018-06-16 08:28:53.123"), - Timestamp.valueOf("2018-06-17 08:28:53.123")) - withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> - ParquetOutputTimestampType.TIMESTAMP_MILLIS.toString) { - testTimestampPushdown(millisData) - } - - // 
spark.sql.parquet.outputTimestampType = TIMESTAMP_MICROS - val microsData = Seq(Timestamp.valueOf("2018-06-14 08:28:53.123456"), - Timestamp.valueOf("2018-06-15 08:28:53.123456"), - Timestamp.valueOf("2018-06-16 08:28:53.123456"), - Timestamp.valueOf("2018-06-17 08:28:53.123456")) - withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> - ParquetOutputTimestampType.TIMESTAMP_MICROS.toString) { - testTimestampPushdown(microsData) - } - - // spark.sql.parquet.outputTimestampType = INT96 doesn't support pushdown - withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> - ParquetOutputTimestampType.INT96.toString) { - withParquetDataFrame(millisData.map(i => Tuple1(i))) { implicit df => - val schema = new SparkToParquetSchemaConverter(conf).convert(df.schema) - assertResult(None) { - createParquetFilters(schema).createFilter(sources.IsNull("_1")) + Seq(true, false).foreach { java8Api => + withSQLConf( + SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString, + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED") { + // spark.sql.parquet.outputTimestampType = TIMESTAMP_MILLIS + val millisData = Seq( + "1000-06-14 08:28:53.123", + "1582-06-15 08:28:53.001", + "1900-06-16 08:28:53.0", + "2018-06-17 08:28:53.999") + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> + ParquetOutputTimestampType.TIMESTAMP_MILLIS.toString) { + testTimestampPushdown(millisData, java8Api) + } + + // spark.sql.parquet.outputTimestampType = TIMESTAMP_MICROS + val microsData = Seq( + "1000-06-14 08:28:53.123456", + "1582-06-15 08:28:53.123456", + "1900-06-16 08:28:53.123456", + "2018-06-17 08:28:53.123456") + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> + ParquetOutputTimestampType.TIMESTAMP_MICROS.toString) { + testTimestampPushdown(microsData, java8Api) + } + + // spark.sql.parquet.outputTimestampType = INT96 doesn't support pushdown + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> + ParquetOutputTimestampType.INT96.toString) { + import testImplicits._ + withTempPath { file => + millisData.map(i => Tuple1(Timestamp.valueOf(i))).toDF + .write.format(dataSourceName).save(file.getCanonicalPath) + readParquetFile(file.getCanonicalPath) { df => + val schema = new SparkToParquetSchemaConverter(conf).convert(df.schema) + assertResult(None) { + createParquetFilters(schema).createFilter(sources.IsNull("_1")) + } + } + } } } } @@ -502,33 +663,39 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared (false, DecimalType.MAX_PRECISION) // binaryWriterUsingUnscaledBytes ).foreach { case (legacyFormat, precision) => withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> legacyFormat.toString) { - val schema = StructType.fromDDL(s"a decimal($precision, 2)") val rdd = spark.sparkContext.parallelize((1 to 4).map(i => Row(new java.math.BigDecimal(i)))) - val dataFrame = spark.createDataFrame(rdd, schema) - testDecimalPushDown(dataFrame) { implicit df => - assert(df.schema === schema) - checkFilterPredicate('a.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('a.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('a === 1, classOf[Eq[_]], 1) - checkFilterPredicate('a <=> 1, classOf[Eq[_]], 1) - checkFilterPredicate('a =!= 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate('a < 2, classOf[Lt[_]], 1) - checkFilterPredicate('a > 3, classOf[Gt[_]], 4) - checkFilterPredicate('a <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('a >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === 'a, 
classOf[Eq[_]], 1) - checkFilterPredicate(Literal(1) <=> 'a, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > 'a, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < 'a, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= 'a, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= 'a, classOf[GtEq[_]], 4) - - checkFilterPredicate(!('a < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('a < 2 || 'a > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + val dataFrame = spark.createDataFrame(rdd, StructType.fromDDL(s"a decimal($precision, 2)")) + withNestedParquetDataFrame(dataFrame) { case (inputDF, colName, resultFun) => + implicit val df: DataFrame = inputDF + + val decimalAttr: Expression = df(colName).expr + assert(df(colName).expr.dataType === DecimalType(precision, 2)) + + checkFilterPredicate(decimalAttr.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate(decimalAttr.isNotNull, classOf[NotEq[_]], + (1 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(decimalAttr === 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(decimalAttr <=> 1, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(decimalAttr =!= 1, classOf[NotEq[_]], + (2 to 4).map(i => Row.apply(resultFun(i)))) + + checkFilterPredicate(decimalAttr < 2, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(decimalAttr > 3, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(decimalAttr <= 1, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(decimalAttr >= 4, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(Literal(1) === decimalAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(1) <=> decimalAttr, classOf[Eq[_]], resultFun(1)) + checkFilterPredicate(Literal(2) > decimalAttr, classOf[Lt[_]], resultFun(1)) + checkFilterPredicate(Literal(3) < decimalAttr, classOf[Gt[_]], resultFun(4)) + checkFilterPredicate(Literal(1) >= decimalAttr, classOf[LtEq[_]], resultFun(1)) + checkFilterPredicate(Literal(4) <= decimalAttr, classOf[GtEq[_]], resultFun(4)) + + checkFilterPredicate(!(decimalAttr < 4), classOf[GtEq[_]], resultFun(4)) + checkFilterPredicate(decimalAttr < 2 || decimalAttr > 3, classOf[Operators.Or], + Seq(Row(resultFun(1)), Row(resultFun(4)))) } } } @@ -648,10 +815,9 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared test("Filter applied on merged Parquet schema with new column should work") { import testImplicits._ - Seq("true", "false").foreach { vectorized => + withAllParquetReaders { withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true", - SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true", - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { + SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true") { withTempPath { dir => val path1 = s"${dir.getCanonicalPath}/table1" (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path1) @@ -1085,33 +1251,31 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } test("SPARK-17213: Broken Parquet filter push-down for string columns") { - Seq(true, false).foreach { vectorizedEnabled => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorizedEnabled.toString) { - withTempPath { dir => - import testImplicits._ + withAllParquetReaders { + withTempPath { dir => + import testImplicits._ - val path = dir.getCanonicalPath - // scalastyle:off nonascii - Seq("a", "é").toDF("name").write.parquet(path) - // scalastyle:on nonascii + val path = dir.getCanonicalPath + // scalastyle:off nonascii + Seq("a", 
"é").toDF("name").write.parquet(path) + // scalastyle:on nonascii - assert(spark.read.parquet(path).where("name > 'a'").count() == 1) - assert(spark.read.parquet(path).where("name >= 'a'").count() == 2) + assert(spark.read.parquet(path).where("name > 'a'").count() == 1) + assert(spark.read.parquet(path).where("name >= 'a'").count() == 2) - // scalastyle:off nonascii - assert(spark.read.parquet(path).where("name < 'é'").count() == 1) - assert(spark.read.parquet(path).where("name <= 'é'").count() == 2) - // scalastyle:on nonascii - } + // scalastyle:off nonascii + assert(spark.read.parquet(path).where("name < 'é'").count() == 1) + assert(spark.read.parquet(path).where("name <= 'é'").count() == 2) + // scalastyle:on nonascii } } } - test("SPARK-20364: Disable Parquet predicate pushdown for fields having dots in the names") { + test("SPARK-31026: Parquet predicate pushdown for fields having dots in the names") { import testImplicits._ - Seq(true, false).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString, + withAllParquetReaders { + withSQLConf( SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> true.toString, SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { withTempPath { path => @@ -1120,6 +1284,28 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared assert(readBack.count() == 1) } } + + withSQLConf( + // Makes sure disabling 'spark.sql.parquet.recordFilter' still enables + // row group level filtering. + SQLConf.PARQUET_RECORD_FILTER_ENABLED.key -> "false", + SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") { + + withTempPath { path => + val data = (1 to 1024) + data.toDF("col.dots").coalesce(1) + .write.option("parquet.block.size", 512) + .parquet(path.getAbsolutePath) + val df = spark.read.parquet(path.getAbsolutePath).filter("`col.dots` == 500") + // Here, we strip the Spark side filter and check the actual results from Parquet. + val actual = stripSparkFilter(df).collect().length + // Since those are filtered at row group level, the result count should be less + // than the total length but should not be a single record. + // Note that, if record level filtering is enabled, it should be a single record. + // If no filter is pushed down to Parquet, it should be the total length of data. 
+ assert(actual > 1 && actual < data.length) + } + } } } @@ -1390,8 +1576,30 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } } } + + test("SPARK-30826: case insensitivity of StringStartsWith attribute") { + import testImplicits._ + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + withTable("t1") { + withTempPath { dir => + val path = dir.toURI.toString + Seq("42").toDF("COL").write.parquet(path) + spark.sql( + s""" + |CREATE TABLE t1 (col STRING) + |USING parquet + |OPTIONS (path '$path') + """.stripMargin) + checkAnswer( + spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'"), + Row("42")) + } + } + } + } } +@ExtendedSQLTest class ParquetV1FilterSuite extends ParquetFilterSuite { override protected def sparkConf: SparkConf = super @@ -1406,51 +1614,72 @@ class ParquetV1FilterSuite extends ParquetFilterSuite { expected: Seq[Row]): Unit = { val output = predicate.collect { case a: Attribute => a }.distinct - withSQLConf( - SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true", - SQLConf.PARQUET_FILTER_PUSHDOWN_DATE_ENABLED.key -> "true", - SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED.key -> "true", - SQLConf.PARQUET_FILTER_PUSHDOWN_DECIMAL_ENABLED.key -> "true", - SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED.key -> "true", - // Disable adding filters from constraints because it adds, for instance, - // is-not-null to pushed filters, which makes it hard to test if the pushed - // filter is expected or not (this had to be fixed with SPARK-13495). - SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> InferFiltersFromConstraints.ruleName, - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - val query = df - .select(output.map(e => Column(e)): _*) - .where(Column(predicate)) + Seq(("parquet", true), ("", false)).foreach { case (pushdownDsList, nestedPredicatePushdown) => + withSQLConf( + SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true", + SQLConf.PARQUET_FILTER_PUSHDOWN_DATE_ENABLED.key -> "true", + SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED.key -> "true", + SQLConf.PARQUET_FILTER_PUSHDOWN_DECIMAL_ENABLED.key -> "true", + SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED.key -> "true", + // Disable adding filters from constraints because it adds, for instance, + // is-not-null to pushed filters, which makes it hard to test if the pushed + // filter is expected or not (this had to be fixed with SPARK-13495). 
+ SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> InferFiltersFromConstraints.ruleName, + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.NESTED_PREDICATE_PUSHDOWN_FILE_SOURCE_LIST.key -> pushdownDsList) { + val query = df + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) + + val nestedOrAttributes = predicate.collectFirst { + case g: GetStructField => g + case a: Attribute => a + } + assert(nestedOrAttributes.isDefined, "No GetStructField nor Attribute is detected.") + + val parsed = parseColumnPath( + PushableColumnAndNestedColumn.unapply(nestedOrAttributes.get).get) + + val containsNestedColumnOrDot = parsed.length > 1 || parsed(0).contains(".") + + var maybeRelation: Option[HadoopFsRelation] = None + val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { + case PhysicalOperation(_, filters, + LogicalRelation(relation: HadoopFsRelation, _, _, _)) => + maybeRelation = Some(relation) + filters + }.flatten.reduceLeftOption(_ && _) + assert(maybeAnalyzedPredicate.isDefined, "No filter is analyzed from the given query") + + val (_, selectedFilters, _) = + DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq) + // If predicates contains nested column or dot, we push down the predicates only if + // "parquet" is in `NESTED_PREDICATE_PUSHDOWN_V1_SOURCE_LIST`. + if (nestedPredicatePushdown || !containsNestedColumnOrDot) { + assert(selectedFilters.nonEmpty, "No filter is pushed down") + val schema = new SparkToParquetSchemaConverter(conf).convert(df.schema) + val parquetFilters = createParquetFilters(schema) + // In this test suite, all the simple predicates are convertible here. + assert(parquetFilters.convertibleFilters(selectedFilters) === selectedFilters) + val pushedParquetFilters = selectedFilters.map { pred => + val maybeFilter = parquetFilters.createFilter(pred) + assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pred") + maybeFilter.get + } + // Doesn't bother checking type parameters here (e.g. `Eq[Integer]`) + assert(pushedParquetFilters.exists(_.getClass === filterClass), + s"${pushedParquetFilters.map(_.getClass).toList} did not contain ${filterClass}.") - var maybeRelation: Option[HadoopFsRelation] = None - val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { - case PhysicalOperation(_, filters, - LogicalRelation(relation: HadoopFsRelation, _, _, _)) => - maybeRelation = Some(relation) - filters - }.flatten.reduceLeftOption(_ && _) - assert(maybeAnalyzedPredicate.isDefined, "No filter is analyzed from the given query") - - val (_, selectedFilters, _) = - DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq) - assert(selectedFilters.nonEmpty, "No filter is pushed down") - val schema = new SparkToParquetSchemaConverter(conf).convert(df.schema) - val parquetFilters = createParquetFilters(schema) - // In this test suite, all the simple predicates are convertible here. - assert(parquetFilters.convertibleFilters(selectedFilters) === selectedFilters) - val pushedParquetFilters = selectedFilters.map { pred => - val maybeFilter = parquetFilters.createFilter(pred) - assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pred") - maybeFilter.get + checker(stripSparkFilter(query), expected) + } else { + assert(selectedFilters.isEmpty, "There is filter pushed down") + } } - // Doesn't bother checking type parameters here (e.g. 
`Eq[Integer]`) - assert(pushedParquetFilters.exists(_.getClass === filterClass), - s"${pushedParquetFilters.map(_.getClass).toList} did not contain ${filterClass}.") - - checker(stripSparkFilter(query), expected) } } } +@ExtendedSQLTest class ParquetV2FilterSuite extends ParquetFilterSuite { // TODO: enable Parquet V2 write path after file source V2 writers are workable. override protected def sparkConf: SparkConf = @@ -1485,7 +1714,7 @@ class ParquetV2FilterSuite extends ParquetFilterSuite { case PhysicalOperation(_, filters, DataSourceV2ScanRelation(_, scan: ParquetScan, _)) => assert(filters.nonEmpty, "No filter is analyzed from the given query") - val sourceFilters = filters.flatMap(DataSourceStrategy.translateFilter).toArray + val sourceFilters = filters.flatMap(DataSourceStrategy.translateFilter(_, true)).toArray val pushedFilters = scan.pushedFilters assert(pushedFilters.nonEmpty, "No filter is pushed down") val schema = new SparkToParquetSchemaConverter(conf).convert(df.schema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 1550b3bbb6242..0984711dfcdfd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.execution.datasources.parquet +import java.nio.file.{Files, Paths, StandardCopyOption} +import java.sql.{Date, Timestamp} +import java.time._ import java.util.Locale import scala.collection.JavaConverters._ @@ -39,14 +42,15 @@ import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} -import org.apache.spark.{SPARK_VERSION_SHORT, SparkException} +import org.apache.spark.{SPARK_VERSION_SHORT, SparkException, SparkUpgradeException} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.{InternalRow, ScalaReflection} import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeRow} -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils} import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -644,47 +648,39 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession } test("read dictionary encoded decimals written as INT32") { - ("true" :: "false" :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - checkAnswer( - // Decimal column in this file is encoded using plain dictionary - readResourceParquetFile("test-data/dec-in-i32.parquet"), - spark.range(1 << 4).select('id % 10 cast DecimalType(5, 2) as 'i32_dec)) - } + withAllParquetReaders { + checkAnswer( + // Decimal column in this file is encoded using plain dictionary + readResourceParquetFile("test-data/dec-in-i32.parquet"), + spark.range(1 << 4).select('id % 10 cast DecimalType(5, 2) as 'i32_dec)) } } test("read dictionary encoded decimals written as INT64") { - 
("true" :: "false" :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - checkAnswer( - // Decimal column in this file is encoded using plain dictionary - readResourceParquetFile("test-data/dec-in-i64.parquet"), - spark.range(1 << 4).select('id % 10 cast DecimalType(10, 2) as 'i64_dec)) - } + withAllParquetReaders { + checkAnswer( + // Decimal column in this file is encoded using plain dictionary + readResourceParquetFile("test-data/dec-in-i64.parquet"), + spark.range(1 << 4).select('id % 10 cast DecimalType(10, 2) as 'i64_dec)) } } test("read dictionary encoded decimals written as FIXED_LEN_BYTE_ARRAY") { - ("true" :: "false" :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - checkAnswer( - // Decimal column in this file is encoded using plain dictionary - readResourceParquetFile("test-data/dec-in-fixed-len.parquet"), - spark.range(1 << 4).select('id % 10 cast DecimalType(10, 2) as 'fixed_len_dec)) - } + withAllParquetReaders { + checkAnswer( + // Decimal column in this file is encoded using plain dictionary + readResourceParquetFile("test-data/dec-in-fixed-len.parquet"), + spark.range(1 << 4).select('id % 10 cast DecimalType(10, 2) as 'fixed_len_dec)) } } test("read dictionary and plain encoded timestamp_millis written as INT64") { - ("true" :: "false" :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - checkAnswer( - // timestamp column in this file is encoded using combination of plain - // and dictionary encodings. - readResourceParquetFile("test-data/timemillis-in-i64.parquet"), - (1 to 3).map(i => Row(new java.sql.Timestamp(10)))) - } + withAllParquetReaders { + checkAnswer( + // timestamp column in this file is encoded using combination of plain + // and dictionary encodings. 
+ readResourceParquetFile("test-data/timemillis-in-i64.parquet"), + (1 to 3).map(i => Row(new java.sql.Timestamp(10)))) } } @@ -719,7 +715,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession { val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) try { reader.initialize(file, null) val result = mutable.ArrayBuffer.empty[(Int, String)] @@ -738,7 +734,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession { val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) try { reader.initialize(file, ("_2" :: Nil).asJava) val result = mutable.ArrayBuffer.empty[(String)] @@ -756,7 +752,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession { val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) try { reader.initialize(file, ("_2" :: "_1" :: Nil).asJava) val result = mutable.ArrayBuffer.empty[(String, Int)] @@ -775,7 +771,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession { val conf = sqlContext.conf val reader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) try { reader.initialize(file, List[String]().asJava) var result = 0 @@ -795,7 +791,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession Seq(1).toDF().repartition(1).write.parquet(dir.getCanonicalPath) val dataTypes = - Seq(StringType, BooleanType, ByteType, ShortType, IntegerType, LongType, + Seq(StringType, BooleanType, ByteType, BinaryType, ShortType, IntegerType, LongType, FloatType, DoubleType, DecimalType(25, 5), DateType, TimestampType) val constantValues = @@ -803,6 +799,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession UTF8String.fromString("a string"), true, 1.toByte, + "Spark SQL".getBytes, 2.toShort, 3, Long.MaxValue, @@ -816,7 +813,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession val schema = StructType(StructField("pcol", dt) :: Nil) val conf = sqlContext.conf val vectorizedReader = new VectorizedParquetRecordReader( - null, conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) + conf.offHeapColumnVectorEnabled, conf.parquetVectorizedReaderBatchSize) val partitionValues = new GenericInternalRow(Array(v)) val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) @@ -830,7 +827,11 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession // in order to use get(...) method which is not implemented in `ColumnarBatch`. 
val actual = row.copy().get(1, dt) val expected = v - assert(actual == expected) + if (dt.isInstanceOf[BinaryType]) { + assert(actual.asInstanceOf[Array[Byte]] sameElements expected.asInstanceOf[Array[Byte]]) + } else { + assert(actual == expected) + } } finally { vectorizedReader.close() } @@ -863,20 +864,281 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession } } + private def getMetaData(dir: java.io.File): Map[String, String] = { + val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) + val conf = new Configuration() + val hadoopInputFile = HadoopInputFile.fromPath(new Path(file), conf) + val parquetReadOptions = HadoopReadOptions.builder(conf).build() + val m = ParquetFileReader.open(hadoopInputFile, parquetReadOptions) + val metadata = try { + m.getFileMetaData.getKeyValueMetaData + } finally { + m.close() + } + metadata.asScala.toMap + } + test("Write Spark version into Parquet metadata") { withTempPath { dir => - val path = dir.getAbsolutePath - spark.range(1).repartition(1).write.parquet(path) - val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0) - - val conf = new Configuration() - val hadoopInputFile = HadoopInputFile.fromPath(new Path(file), conf) - val parquetReadOptions = HadoopReadOptions.builder(conf).build() - val m = ParquetFileReader.open(hadoopInputFile, parquetReadOptions) - val metaData = m.getFileMetaData.getKeyValueMetaData - m.close() + spark.range(1).repartition(1).write.parquet(dir.getAbsolutePath) + assert(getMetaData(dir)(SPARK_VERSION_METADATA_KEY) === SPARK_VERSION_SHORT) + } + } + + // It generates input files for the test below: + // "SPARK-31159: compatibility with Spark 2.4 in reading dates/timestamps" + ignore("SPARK-31806: generate test files for checking compatibility with Spark 2.4") { + val resourceDir = "sql/core/src/test/resources/test-data" + val version = "2_4_5" + val N = 8 + def save( + in: Seq[(String, String)], + t: String, + dstFile: String, + options: Map[String, String] = Map.empty): Unit = { + withTempDir { dir => + in.toDF("dict", "plain") + .select($"dict".cast(t), $"plain".cast(t)) + .repartition(1) + .write + .mode("overwrite") + .options(options) + .parquet(dir.getCanonicalPath) + Files.copy( + dir.listFiles().filter(_.getName.endsWith(".snappy.parquet")).head.toPath, + Paths.get(resourceDir, dstFile), + StandardCopyOption.REPLACE_EXISTING) + } + } + DateTimeTestUtils.withDefaultTimeZone(DateTimeTestUtils.LA) { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> DateTimeTestUtils.LA.getId) { + save( + (1 to N).map(i => ("1001-01-01", s"1001-01-0$i")), + "date", + s"before_1582_date_v$version.snappy.parquet") + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> "TIMESTAMP_MILLIS") { + save( + (1 to N).map(i => ("1001-01-01 01:02:03.123", s"1001-01-0$i 01:02:03.123")), + "timestamp", + s"before_1582_timestamp_millis_v$version.snappy.parquet") + } + val usTs = (1 to N).map(i => ("1001-01-01 01:02:03.123456", s"1001-01-0$i 01:02:03.123456")) + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> "TIMESTAMP_MICROS") { + save(usTs, "timestamp", s"before_1582_timestamp_micros_v$version.snappy.parquet") + } + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> "INT96") { + // Compared to other logical types, Parquet-MR chooses dictionary encoding for the + // INT96 logical type because it consumes less memory for small column cardinality. + // Huge parquet files don't make sense to place in the resource folder.
That's why + // we explicitly set `parquet.enable.dictionary` and generate two files w/ and w/o + // dictionary encoding. + save( + usTs, + "timestamp", + s"before_1582_timestamp_int96_plain_v$version.snappy.parquet", + Map("parquet.enable.dictionary" -> "false")) + save( + usTs, + "timestamp", + s"before_1582_timestamp_int96_dict_v$version.snappy.parquet", + Map("parquet.enable.dictionary" -> "true")) + } + } + } + } + + test("SPARK-31159: compatibility with Spark 2.4 in reading dates/timestamps") { + val N = 8 + // test reading the existing 2.4 files and new 3.0 files (with rebase on/off) together. + def checkReadMixedFiles[T]( + fileName: String, + catalystType: String, + rowFunc: Int => (String, String), + toJavaType: String => T, + checkDefaultLegacyRead: String => Unit, + tsOutputType: String = "TIMESTAMP_MICROS"): Unit = { + withTempPaths(2) { paths => + paths.foreach(_.delete()) + val path2_4 = getResourceParquetFilePath("test-data/" + fileName) + val path3_0 = paths(0).getCanonicalPath + val path3_0_rebase = paths(1).getCanonicalPath + val df = Seq.tabulate(N)(rowFunc).toDF("dict", "plain") + .select($"dict".cast(catalystType), $"plain".cast(catalystType)) + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> tsOutputType) { + checkDefaultLegacyRead(path2_4) + // By default we should fail to write ancient datetime values. + val e = intercept[SparkException](df.write.parquet(path3_0)) + assert(e.getCause.getCause.getCause.isInstanceOf[SparkUpgradeException]) + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString) { + df.write.mode("overwrite").parquet(path3_0) + } + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> LEGACY.toString) { + df.write.parquet(path3_0_rebase) + } + } + // For Parquet files written by Spark 3.0, we know the writer info and don't need the + // config to guide the rebase behavior. + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ.key -> LEGACY.toString) { + checkAnswer( + spark.read.format("parquet").load(path2_4, path3_0, path3_0_rebase), + (0 until N).flatMap { i => + val (dictS, plainS) = rowFunc(i) + Seq.tabulate(3) { _ => + Row(toJavaType(dictS), toJavaType(plainS)) + } + }) + } + } + } + def failInRead(path: String): Unit = { + val e = intercept[SparkException](spark.read.parquet(path).collect()) + assert(e.getCause.isInstanceOf[SparkUpgradeException]) + } + def successInRead(path: String): Unit = spark.read.parquet(path).collect() + Seq( + // By default we should fail to read ancient datetime values when parquet files don't + // contain Spark version. + "2_4_5" -> failInRead _, + "2_4_6" -> successInRead _).foreach { case (version, checkDefaultRead) => + withAllParquetReaders { + checkReadMixedFiles( + s"before_1582_date_v$version.snappy.parquet", + "date", + (i: Int) => ("1001-01-01", s"1001-01-0${i + 1}"), + java.sql.Date.valueOf, + checkDefaultRead) + checkReadMixedFiles( + s"before_1582_timestamp_micros_v$version.snappy.parquet", + "timestamp", + (i: Int) => ("1001-01-01 01:02:03.123456", s"1001-01-0${i + 1} 01:02:03.123456"), + java.sql.Timestamp.valueOf, + checkDefaultRead) + checkReadMixedFiles( + s"before_1582_timestamp_millis_v$version.snappy.parquet", + "timestamp", + (i: Int) => ("1001-01-01 01:02:03.123", s"1001-01-0${i + 1} 01:02:03.123"), + java.sql.Timestamp.valueOf, + checkDefaultRead, + tsOutputType = "TIMESTAMP_MILLIS") + // INT96 is a legacy timestamp format and we always rebase the seconds for it. 
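+ // Rebasing here means converting values written with Spark 2.4's hybrid Julian/Gregorian calendar to the proleptic Gregorian calendar used by Spark 3.0, so both the plain and dictionary encoded INT96 files are expected to read back as the original timestamp strings.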
+ Seq("plain", "dict").foreach { enc => + checkAnswer(readResourceParquetFile( + s"test-data/before_1582_timestamp_int96_${enc}_v$version.snappy.parquet"), + Seq.tabulate(N) { i => + Row( + java.sql.Timestamp.valueOf("1001-01-01 01:02:03.123456"), + java.sql.Timestamp.valueOf(s"1001-01-0${i + 1} 01:02:03.123456")) + }) + } + } + } + } - assert(metaData.get(SPARK_VERSION_METADATA_KEY) === SPARK_VERSION_SHORT) + test("SPARK-31159: rebasing timestamps in write") { + val N = 8 + Seq(false, true).foreach { dictionaryEncoding => + Seq( + ("TIMESTAMP_MILLIS", "1001-01-01 01:02:03.123", "1001-01-07 01:09:05.123"), + ("TIMESTAMP_MICROS", "1001-01-01 01:02:03.123456", "1001-01-07 01:09:05.123456"), + ("INT96", "1001-01-01 01:02:03.123456", "1001-01-01 01:02:03.123456") + ).foreach { case (outType, tsStr, nonRebased) => + withClue(s"output type $outType") { + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> outType) { + withTempPath { dir => + val path = dir.getAbsolutePath + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> LEGACY.toString) { + Seq.tabulate(N)(_ => tsStr).toDF("tsS") + .select($"tsS".cast("timestamp").as("ts")) + .repartition(1) + .write + .option("parquet.enable.dictionary", dictionaryEncoding) + .parquet(path) + } + + withAllParquetReaders { + // The file metadata indicates if it needs rebase or not, so we can always get the + // correct result regardless of the "rebase mode" config. + Seq(LEGACY, CORRECTED, EXCEPTION).foreach { mode => + withSQLConf( + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_READ.key -> mode.toString) { + checkAnswer( + spark.read.parquet(path), + Seq.tabulate(N)(_ => Row(Timestamp.valueOf(tsStr)))) + } + } + + // Force to not rebase to prove the written datetime values are rebased + // and we will get wrong result if we don't rebase while reading. + withSQLConf("spark.test.forceNoRebase" -> "true") { + checkAnswer( + spark.read.parquet(path), + Seq.tabulate(N)(_ => Row(Timestamp.valueOf(nonRebased)))) + } + } + } + } + } + } + } + } + + test("SPARK-31159: rebasing dates in write") { + val N = 8 + Seq(false, true).foreach { dictionaryEncoding => + withTempPath { dir => + val path = dir.getAbsolutePath + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> LEGACY.toString) { + Seq.tabulate(N)(_ => "1001-01-01").toDF("dateS") + .select($"dateS".cast("date").as("date")) + .repartition(1) + .write + .option("parquet.enable.dictionary", dictionaryEncoding) + .parquet(path) + } + + withAllParquetReaders { + // The file metadata indicates if it needs rebase or not, so we can always get the + // correct result regardless of the "rebase mode" config. + Seq(LEGACY, CORRECTED, EXCEPTION).foreach { mode => + withSQLConf(SQLConf.LEGACY_AVRO_REBASE_MODE_IN_READ.key -> mode.toString) { + checkAnswer( + spark.read.parquet(path), + Seq.tabulate(N)(_ => Row(Date.valueOf("1001-01-01")))) + } + } + + // Force to not rebase to prove the written datetime values are rebased and we will get + // wrong result if we don't rebase while reading. 
+ withSQLConf("spark.test.forceNoRebase" -> "true") { + checkAnswer( + spark.read.parquet(path), + Seq.tabulate(N)(_ => Row(Date.valueOf("1001-01-07")))) + } + } + } + } + } + + test("SPARK-33163: write the metadata key 'org.apache.spark.legacyDateTime'") { + def saveTs(dir: java.io.File): Unit = { + Seq(Timestamp.valueOf("2020-10-15 01:02:03")).toDF() + .repartition(1) + .write + .parquet(dir.getAbsolutePath) + } + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> LEGACY.toString) { + withTempPath { dir => + saveTs(dir) + assert(getMetaData(dir)(SPARK_LEGACY_DATETIME) === "") + } + } + Seq(CORRECTED, EXCEPTION).foreach { mode => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> mode.toString) { + withTempPath { dir => + saveTs(dir) + assert(getMetaData(dir).get(SPARK_LEGACY_DATETIME).isEmpty) + } + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 1ded34f24e436..a14f6416199a1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.File +import java.time.ZoneOffset import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} @@ -118,21 +119,16 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS ).map { s => java.sql.Timestamp.valueOf(s) } import testImplicits._ // match the column names of the file from impala - withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> - SQLConf.ParquetOutputTimestampType.INT96.toString) { - val df = spark.createDataset(ts).toDF().repartition(1) - .withColumnRenamed("value", "ts") - df.write.parquet(tableDir.getAbsolutePath) - } + val df = spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts") + df.write.parquet(tableDir.getAbsolutePath) FileUtils.copyFile(new File(impalaPath), new File(tableDir, "part-00001.parq")) Seq(false, true).foreach { int96TimestampConversion => - Seq(false, true).foreach { vectorized => + withAllParquetReaders { withSQLConf( (SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key, SQLConf.ParquetOutputTimestampType.INT96.toString), - (SQLConf.PARQUET_INT96_TIMESTAMP_CONVERSION.key, int96TimestampConversion.toString()), - (SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized.toString()) + (SQLConf.PARQUET_INT96_TIMESTAMP_CONVERSION.key, int96TimestampConversion.toString()) ) { val readBack = spark.read.parquet(tableDir.getAbsolutePath).collect() assert(readBack.size === 6) @@ -145,14 +141,15 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS impalaFileData.map { ts => DateTimeUtils.toJavaTimestamp(DateTimeUtils.convertTz( DateTimeUtils.fromJavaTimestamp(ts), - DateTimeUtils.TimeZoneUTC, - DateTimeUtils.getTimeZone(conf.sessionLocalTimeZone))) + ZoneOffset.UTC, + DateTimeUtils.getZoneId(conf.sessionLocalTimeZone))) } } val fullExpectations = (ts ++ impalaExpectations).map(_.toString).sorted.toArray val actual = readBack.map(_.getTimestamp(0).toString).sorted withClue( - s"int96TimestampConversion = $int96TimestampConversion; vectorized = $vectorized") { + s"int96TimestampConversion = $int96TimestampConversion; " + + 
s"vectorized = ${SQLConf.get.parquetVectorizedReaderEnabled}") { assert(fullExpectations === actual) // Now test that the behavior is still correct even with a filter which could get diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index e63929470ce5f..32a9558e91f10 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -59,7 +59,8 @@ abstract class ParquetPartitionDiscoverySuite val timeZoneId = ZoneId.systemDefault() val df = DateFormatter(timeZoneId) - val tf = TimestampFormatter(timestampPartitionPattern, timeZoneId) + val tf = TimestampFormatter( + timestampPartitionPattern, timeZoneId, isParsing = true) protected override def beforeAll(): Unit = { super.beforeAll() @@ -87,7 +88,7 @@ abstract class ParquetPartitionDiscoverySuite check("1990-02-24 12:00:30", Literal.create(Timestamp.valueOf("1990-02-24 12:00:30"), TimestampType)) - val c = Calendar.getInstance(TimeZone.getTimeZone("GMT")) + val c = Calendar.getInstance(TimeZone.getTimeZone("UTC")) c.set(1990, 1, 24, 12, 0, 30) c.set(Calendar.MILLISECOND, 0) check("1990-02-24 12:00:30", @@ -691,10 +692,10 @@ abstract class ParquetPartitionDiscoverySuite } withTempPath { dir => - df.write.option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + df.write.option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .format("parquet").partitionBy(partitionColumns.map(_.name): _*).save(dir.toString) val fields = schema.map(f => Column(f.name).cast(f.dataType)) - checkAnswer(spark.read.option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + checkAnswer(spark.read.option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .load(dir.toString).select(fields: _*), row) } } @@ -733,10 +734,10 @@ abstract class ParquetPartitionDiscoverySuite } withTempPath { dir => - df.write.option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + df.write.option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .format("parquet").partitionBy(partitionColumns.map(_.name): _*).save(dir.toString) val fields = schema.map(f => Column(f.name)) - checkAnswer(spark.read.option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + checkAnswer(spark.read.option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .load(dir.toString).select(fields: _*), row) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 917aaba2669ce..05d305a9b52ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -168,11 +168,9 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS withTempPath { file => val df = spark.createDataFrame(sparkContext.parallelize(data), schema) df.write.parquet(file.getCanonicalPath) - ("true" :: "false" :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - val df2 = spark.read.parquet(file.getCanonicalPath) - checkAnswer(df2, df.collect().toSeq) - } + withAllParquetReaders { + val df2 = spark.read.parquet(file.getCanonicalPath) + checkAnswer(df2, df.collect().toSeq) } } } @@ 
-791,15 +789,13 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS } test("SPARK-26677: negated null-safe equality comparison should not filter matched row groups") { - (true :: false :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { - withTempPath { path => - // Repeated values for dictionary encoding. - Seq(Some("A"), Some("A"), None).toDF.repartition(1) - .write.parquet(path.getAbsolutePath) - val df = spark.read.parquet(path.getAbsolutePath) - checkAnswer(stripSparkFilter(df.where("NOT (value <=> 'A')")), df) - } + withAllParquetReaders { + withTempPath { path => + // Repeated values for dictionary encoding. + Seq(Some("A"), Some("A"), None).toDF.repartition(1) + .write.parquet(path.getAbsolutePath) + val df = spark.read.parquet(path.getAbsolutePath) + checkAnswer(stripSparkFilter(df.where("NOT (value <=> 'A')")), df) } } } @@ -821,10 +817,8 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> toTsType) { write(df2.write.mode(SaveMode.Append)) } - Seq("true", "false").foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - checkAnswer(readback, df1.unionAll(df2)) - } + withAllParquetReaders { + checkAnswer(readback, df1.unionAll(df2)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala index c64e95078e916..cab93bd96fff4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.execution.datasources.SchemaPruningSuite import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.tags.ExtendedSQLTest abstract class ParquetSchemaPruningSuite extends SchemaPruningSuite with AdaptiveSparkPlanHelper { override protected val dataSourceName: String = "parquet" @@ -33,6 +34,7 @@ abstract class ParquetSchemaPruningSuite extends SchemaPruningSuite with Adaptiv } +@ExtendedSQLTest class ParquetV1SchemaPruningSuite extends ParquetSchemaPruningSuite { override protected def sparkConf: SparkConf = super @@ -40,6 +42,7 @@ class ParquetV1SchemaPruningSuite extends ParquetSchemaPruningSuite { .set(SQLConf.USE_V1_SOURCE_LIST, "parquet") } +@ExtendedSQLTest class ParquetV2SchemaPruningSuite extends ParquetSchemaPruningSuite { // TODO: enable Parquet V2 write path after file source V2 writers are workable. 
override protected def sparkConf: SparkConf = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala index 828ba6aee026b..db8ee724c01c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala @@ -152,7 +152,17 @@ private[sql] trait ParquetTest extends FileBasedDataSourceTest { } protected def readResourceParquetFile(name: String): DataFrame = { - val url = Thread.currentThread().getContextClassLoader.getResource(name) - spark.read.parquet(url.toString) + spark.read.parquet(getResourceParquetFilePath(name)) + } + + protected def getResourceParquetFilePath(name: String): String = { + Thread.currentThread().getContextClassLoader.getResource(name).toString + } + + def withAllParquetReaders(code: => Unit): Unit = { + // test the row-based reader + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false")(code) + // test the vectorized reader + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true")(code) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala index 539ff0d0e905c..7e97994476694 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala @@ -24,14 +24,14 @@ import java.nio.file.Files import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec -import org.apache.spark.TestUtils +import org.apache.spark.{SparkConf, TestUtils} import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, SaveMode} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.util.Utils -class TextSuite extends QueryTest with SharedSparkSession { +abstract class TextSuite extends QueryTest with SharedSparkSession { import testImplicits._ test("reading text file") { @@ -234,3 +234,17 @@ class TextSuite extends QueryTest with SharedSparkSession { assert(data.length == 4) } } + +class TextV1Suite extends TextSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "text") +} + +class TextV2Suite extends TextSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala index 5e3b3441aa74f..f4812844cbae3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala @@ -19,12 +19,13 @@ package org.apache.spark.sql.execution.datasources.text import java.io.File +import org.apache.spark.SparkConf import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{StringType, 
StructType} -class WholeTextFileSuite extends QueryTest with SharedSparkSession { +abstract class WholeTextFileSuite extends QueryTest with SharedSparkSession { // Hadoop's FileSystem caching does not use the Configuration as part of its cache key, which // can cause Filesystem.get(Configuration) to return a cached instance created with a different @@ -103,3 +104,17 @@ class WholeTextFileSuite extends QueryTest with SharedSparkSession { } } } + +class WholeTextFileV1Suite extends WholeTextFileSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "text") +} + +class WholeTextFileV2Suite extends WholeTextFileSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 4cb845b2487d6..e9ef7c1a0c540 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -24,27 +24,14 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.execution.{CodegenSupport, LeafExecNode, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.test.SQLTestData.TestData import org.apache.spark.sql.types.StructType -class DebuggingSuite extends SharedSparkSession { - - - var originalValue: String = _ - // With on AQE, the WholeStageCodegenExec is added when running QueryStageExec. 
- override def beforeAll(): Unit = { - super.beforeAll() - originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - } - - override def afterAll(): Unit = { - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) - super.afterAll() - } +// Disable AQE because the WholeStageCodegenExec is added when running QueryStageExec +class DebuggingSuite extends SharedSparkSession with DisableAdaptiveExecutionSuite { test("DataFrame.debug()") { testData.debug() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 5ce758e1e4eb8..ef0a596f21104 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.{Dataset, QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{BitwiseAnd, BitwiseOr, Cast, Literal, ShiftLeft} import org.apache.spark.sql.catalyst.plans.logical.BROADCAST import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite} import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.exchange.EnsureRequirements import org.apache.spark.sql.functions._ @@ -39,7 +39,8 @@ import org.apache.spark.sql.types.{LongType, ShortType} * unsafe map in [[org.apache.spark.sql.execution.joins.UnsafeHashedRelation]] is not triggered * without serializing the hashed relation, which does not happen in local mode. 
*/ -class BroadcastJoinSuite extends QueryTest with SQLTestUtils with AdaptiveSparkPlanHelper { +abstract class BroadcastJoinSuiteBase extends QueryTest with SQLTestUtils + with AdaptiveSparkPlanHelper { import testImplicits._ protected var spark: SparkSession = null @@ -206,24 +207,25 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils with AdaptiveSparkP test("broadcast hint in SQL") { import org.apache.spark.sql.catalyst.plans.logical.Join - - spark.range(10).createOrReplaceTempView("t") - spark.range(10).createOrReplaceTempView("u") - - for (name <- Seq("BROADCAST", "BROADCASTJOIN", "MAPJOIN")) { - val plan1 = sql(s"SELECT /*+ $name(t) */ * FROM t JOIN u ON t.id = u.id").queryExecution - .optimizedPlan - val plan2 = sql(s"SELECT /*+ $name(u) */ * FROM t JOIN u ON t.id = u.id").queryExecution - .optimizedPlan - val plan3 = sql(s"SELECT /*+ $name(v) */ * FROM t JOIN u ON t.id = u.id").queryExecution - .optimizedPlan - - assert(plan1.asInstanceOf[Join].hint.leftHint.get.strategy.contains(BROADCAST)) - assert(plan1.asInstanceOf[Join].hint.rightHint.isEmpty) - assert(plan2.asInstanceOf[Join].hint.leftHint.isEmpty) - assert(plan2.asInstanceOf[Join].hint.rightHint.get.strategy.contains(BROADCAST)) - assert(plan3.asInstanceOf[Join].hint.leftHint.isEmpty) - assert(plan3.asInstanceOf[Join].hint.rightHint.isEmpty) + withTempView("t", "u") { + spark.range(10).createOrReplaceTempView("t") + spark.range(10).createOrReplaceTempView("u") + + for (name <- Seq("BROADCAST", "BROADCASTJOIN", "MAPJOIN")) { + val plan1 = sql(s"SELECT /*+ $name(t) */ * FROM t JOIN u ON t.id = u.id").queryExecution + .optimizedPlan + val plan2 = sql(s"SELECT /*+ $name(u) */ * FROM t JOIN u ON t.id = u.id").queryExecution + .optimizedPlan + val plan3 = sql(s"SELECT /*+ $name(v) */ * FROM t JOIN u ON t.id = u.id").queryExecution + .optimizedPlan + + assert(plan1.asInstanceOf[Join].hint.leftHint.get.strategy.contains(BROADCAST)) + assert(plan1.asInstanceOf[Join].hint.rightHint.isEmpty) + assert(plan2.asInstanceOf[Join].hint.leftHint.isEmpty) + assert(plan2.asInstanceOf[Join].hint.rightHint.get.strategy.contains(BROADCAST)) + assert(plan3.asInstanceOf[Join].hint.leftHint.isEmpty) + assert(plan3.asInstanceOf[Join].hint.rightHint.isEmpty) + } } } @@ -237,33 +239,40 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils with AdaptiveSparkP assert(HashJoin.rewriteKeyExpr(l :: l :: Nil) === l :: l :: Nil) assert(HashJoin.rewriteKeyExpr(l :: i :: Nil) === l :: i :: Nil) - assert(HashJoin.rewriteKeyExpr(i :: Nil) === Cast(i, LongType) :: Nil) + assert(HashJoin.rewriteKeyExpr(i :: Nil) === + Cast(i, LongType, Some(conf.sessionLocalTimeZone)) :: Nil) assert(HashJoin.rewriteKeyExpr(i :: l :: Nil) === i :: l :: Nil) assert(HashJoin.rewriteKeyExpr(i :: i :: Nil) === - BitwiseOr(ShiftLeft(Cast(i, LongType), Literal(32)), - BitwiseAnd(Cast(i, LongType), Literal((1L << 32) - 1))) :: Nil) + BitwiseOr(ShiftLeft(Cast(i, LongType, Some(conf.sessionLocalTimeZone)), Literal(32)), + BitwiseAnd(Cast(i, LongType, Some(conf.sessionLocalTimeZone)), Literal((1L << 32) - 1))) :: + Nil) assert(HashJoin.rewriteKeyExpr(i :: i :: i :: Nil) === i :: i :: i :: Nil) - assert(HashJoin.rewriteKeyExpr(s :: Nil) === Cast(s, LongType) :: Nil) + assert(HashJoin.rewriteKeyExpr(s :: Nil) === + Cast(s, LongType, Some(conf.sessionLocalTimeZone)) :: Nil) assert(HashJoin.rewriteKeyExpr(s :: l :: Nil) === s :: l :: Nil) assert(HashJoin.rewriteKeyExpr(s :: s :: Nil) === - BitwiseOr(ShiftLeft(Cast(s, LongType), Literal(16)), - BitwiseAnd(Cast(s, LongType), 
Literal((1L << 16) - 1))) :: Nil) + BitwiseOr(ShiftLeft(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal(16)), + BitwiseAnd(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal((1L << 16) - 1))) :: + Nil) assert(HashJoin.rewriteKeyExpr(s :: s :: s :: Nil) === BitwiseOr(ShiftLeft( - BitwiseOr(ShiftLeft(Cast(s, LongType), Literal(16)), - BitwiseAnd(Cast(s, LongType), Literal((1L << 16) - 1))), + BitwiseOr(ShiftLeft(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal(16)), + BitwiseAnd(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal((1L << 16) - 1))), Literal(16)), - BitwiseAnd(Cast(s, LongType), Literal((1L << 16) - 1))) :: Nil) + BitwiseAnd(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal((1L << 16) - 1))) :: + Nil) assert(HashJoin.rewriteKeyExpr(s :: s :: s :: s :: Nil) === BitwiseOr(ShiftLeft( BitwiseOr(ShiftLeft( - BitwiseOr(ShiftLeft(Cast(s, LongType), Literal(16)), - BitwiseAnd(Cast(s, LongType), Literal((1L << 16) - 1))), + BitwiseOr(ShiftLeft(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal(16)), + BitwiseAnd(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), + Literal((1L << 16) - 1))), Literal(16)), - BitwiseAnd(Cast(s, LongType), Literal((1L << 16) - 1))), + BitwiseAnd(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal((1L << 16) - 1))), Literal(16)), - BitwiseAnd(Cast(s, LongType), Literal((1L << 16) - 1))) :: Nil) + BitwiseAnd(Cast(s, LongType, Some(conf.sessionLocalTimeZone)), Literal((1L << 16) - 1))) :: + Nil) assert(HashJoin.rewriteKeyExpr(s :: s :: s :: s :: s :: Nil) === s :: s :: s :: s :: s :: Nil) @@ -398,4 +407,22 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils with AdaptiveSparkP } } } + + test("Broadcast timeout") { + val timeout = 5 + val slowUDF = udf({ x: Int => Thread.sleep(timeout * 10 * 1000); x }) + val df1 = spark.range(10).select($"id" as 'a) + val df2 = spark.range(5).select(slowUDF($"id") as 'a) + val testDf = df1.join(broadcast(df2), "a") + withSQLConf(SQLConf.BROADCAST_TIMEOUT.key -> timeout.toString) { + val e = intercept[Exception] { + testDf.collect() + } + assert(e.getMessage.contains(s"Could not execute broadcast in $timeout secs.")) + } + } } + +class BroadcastJoinSuite extends BroadcastJoinSuiteBase with DisableAdaptiveExecutionSuite + +class BroadcastJoinSuiteAE extends BroadcastJoinSuiteBase with EnableAdaptiveExecutionSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala index 3526aa254c280..5e7985e28a8e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/HashedRelationSuite.scala @@ -358,6 +358,45 @@ class HashedRelationSuite extends SharedSparkSession { assert(java.util.Arrays.equals(os.toByteArray, os2.toByteArray)) } + test("SPARK-31511: Make BytesToBytesMap iterators thread-safe") { + val ser = sparkContext.env.serializer.newInstance() + val key = Seq(BoundReference(0, LongType, false)) + + val unsafeProj = UnsafeProjection.create( + Seq(BoundReference(0, LongType, false), BoundReference(1, IntegerType, true))) + val rows = (0 until 10000).map(i => unsafeProj(InternalRow(Int.int2long(i), i + 1)).copy()) + val unsafeHashed = UnsafeHashedRelation(rows.iterator, key, 1, mm) + + val os = new ByteArrayOutputStream() + val thread1 = new Thread { + override def run(): Unit = { + val out = new 
ObjectOutputStream(os) + unsafeHashed.asInstanceOf[UnsafeHashedRelation].writeExternal(out) + out.flush() + } + } + + val thread2 = new Thread { + override def run(): Unit = { + val threadOut = new ObjectOutputStream(new ByteArrayOutputStream()) + unsafeHashed.asInstanceOf[UnsafeHashedRelation].writeExternal(threadOut) + threadOut.flush() + } + } + + thread1.start() + thread2.start() + thread1.join() + thread2.join() + + val unsafeHashed2 = ser.deserialize[UnsafeHashedRelation](ser.serialize(unsafeHashed)) + val os2 = new ByteArrayOutputStream() + val out2 = new ObjectOutputStream(os2) + unsafeHashed2.writeExternal(out2) + out2.flush() + assert(java.util.Arrays.equals(os.toByteArray, os2.toByteArray)) + } + // This test require 4G heap to run, should run it manually ignore("build HashedRelation that is larger than 1G") { val unsafeProj = UnsafeProjection.create( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 7d09577075d5d..00c0315ae3779 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.execution.{FilterExec, RangeExec, SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.functions._ @@ -33,22 +34,11 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.{AccumulatorContext, JsonProtocol} -class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { +// Disable AQE because metric info is different with AQE on/off +class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils + with DisableAdaptiveExecutionSuite { import testImplicits._ - var originalValue: String = _ - // With AQE on/off, the metric info is different. - override def beforeAll(): Unit = { - super.beforeAll() - originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - } - - override def afterAll(): Unit = { - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) - super.afterAll() - } - /** * Generates a `DataFrame` by filling randomly generated bytes for hash collision. 
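* The colliding bytes are what push the "avg hash probe bucket list iters" metric above 1.0 in the aggregate metrics tests below.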
*/ @@ -98,7 +88,7 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val ds = spark.range(10).filter('id < 5) testSparkPlanMetricsWithPredicates(ds.toDF(), 1, Map( 0L -> (("WholeStageCodegen (1)", Map( - "duration total (min, med, max (stageId (attemptId): taskId))" -> { + "duration" -> { _.toString.matches(timingMetricPattern) })))), true) } @@ -110,10 +100,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val df = testData2.groupBy().count() // 2 partitions val expected1 = Seq( Map("number of output rows" -> 2L, - "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + "avg hash probe bucket list iters" -> aggregateMetricsPattern), Map("number of output rows" -> 1L, - "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + "avg hash probe bucket list iters" -> aggregateMetricsPattern)) val shuffleExpected1 = Map( "records read" -> 2L, @@ -130,10 +120,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val df2 = testData2.groupBy('a).count() val expected2 = Seq( Map("number of output rows" -> 4L, - "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + "avg hash probe bucket list iters" -> aggregateMetricsPattern), Map("number of output rows" -> 3L, - "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + "avg hash probe bucket list iters" -> aggregateMetricsPattern)) val shuffleExpected2 = Map( @@ -181,12 +171,17 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { } val metrics = getSparkPlanMetrics(df, 1, nodeIds, enableWholeStage).get nodeIds.foreach { nodeId => - val probes = metrics(nodeId)._2("avg hash probe bucket list iters (min, med, max (stageId" + - " (attemptId): taskId))") - // Extract min, med, max from the string and strip off everthing else. - val index = probes.toString.stripPrefix("\n(").stripSuffix(")").indexOf(" (", 0) - probes.toString.stripPrefix("\n(").stripSuffix(")").slice(0, index).split(", ").foreach { - probe => assert(probe.toDouble > 1.0) + val probes = metrics(nodeId)._2("avg hash probe bucket list iters").toString + if (!probes.contains("\n")) { + // It's a single metrics value + assert(probes.toDouble > 1.0) + } else { + val mainValue = probes.split("\n").apply(1).stripPrefix("(").stripSuffix(")") + // Extract min, med, max from the string and strip off everthing else. + val index = mainValue.indexOf(" (", 0) + mainValue.slice(0, index).split(", ").foreach { + probe => assert(probe.toDouble > 1.0) + } } } } @@ -231,13 +226,13 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val df = Seq(1, 3, 2).toDF("id").sort('id) testSparkPlanMetricsWithPredicates(df, 2, Map( 0L -> (("Sort", Map( - "sort time total (min, med, max (stageId (attemptId): taskId))" -> { + "sort time" -> { _.toString.matches(timingMetricPattern) }, - "peak memory total (min, med, max (stageId (attemptId): taskId))" -> { + "peak memory" -> { _.toString.matches(sizeMetricPattern) }, - "spill size total (min, med, max (stageId (attemptId): taskId))" -> { + "spill size" -> { _.toString.matches(sizeMetricPattern) }))) )) @@ -468,27 +463,30 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { // TODO: test file source V2 as well when its statistics is correctly computed. 
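// Until then, the scan is pinned to the V1 parquet source so that the per-stage metrics asserted below hold.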
withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "parquet") { withTempDir { tempDir => - val dir = new File(tempDir, "pqS").getCanonicalPath - - spark.range(10).write.parquet(dir) - spark.read.parquet(dir).createOrReplaceTempView("pqS") - - // The executed plan looks like: - // Exchange RoundRobinPartitioning(2) - // +- BroadcastNestedLoopJoin BuildLeft, Cross - // :- BroadcastExchange IdentityBroadcastMode - // : +- Exchange RoundRobinPartitioning(3) - // : +- *Range (0, 30, step=1, splits=2) - // +- *FileScan parquet [id#465L] Batched: true, Format: Parquet, Location: ...(ignored) - val res3 = InputOutputMetricsHelper.run( - spark.range(30).repartition(3).crossJoin(sql("select * from pqS")).repartition(2).toDF() - ) - // The query above is executed in the following stages: - // 1. range(30) => (30, 0, 30) - // 2. sql("select * from pqS") => (0, 30, 0) - // 3. crossJoin(...) of 1. and 2. => (10, 0, 300) - // 4. shuffle & return results => (0, 300, 0) - assert(res3 === (30L, 0L, 30L) :: (0L, 30L, 0L) :: (10L, 0L, 300L) :: (0L, 300L, 0L) :: Nil) + withTempView("pqS") { + val dir = new File(tempDir, "pqS").getCanonicalPath + + spark.range(10).write.parquet(dir) + spark.read.parquet(dir).createOrReplaceTempView("pqS") + + // The executed plan looks like: + // Exchange RoundRobinPartitioning(2) + // +- BroadcastNestedLoopJoin BuildLeft, Cross + // :- BroadcastExchange IdentityBroadcastMode + // : +- Exchange RoundRobinPartitioning(3) + // : +- *Range (0, 30, step=1, splits=2) + // +- *FileScan parquet [id#465L] Batched: true, Format: Parquet, Location: ...(ignored) + val res3 = InputOutputMetricsHelper.run( + spark.range(30).repartition(3).crossJoin(sql("select * from pqS")).repartition(2).toDF() + ) + // The query above is executed in the following stages: + // 1. range(30) => (30, 0, 30) + // 2. sql("select * from pqS") => (0, 30, 0) + // 3. crossJoin(...) of 1. and 2. => (10, 0, 300) + // 4. shuffle & return results => (0, 300, 0) + assert(res3 === (30L, 0L, 30L) :: (0L, 30L, 0L) :: (10L, 0L, 300L) :: (0L, 300L, 0L) :: + Nil) + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala index 0c1148f7b82e4..2977b5339ab51 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala @@ -41,28 +41,28 @@ trait SQLMetricsTestUtils extends SQLTestUtils { protected def statusStore: SQLAppStatusStore = spark.sharedState.statusStore - // Pattern of size SQLMetric value, e.g. "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB (stage 0 - // (attempt 0): task 4))" OR "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)" + // Pattern of size SQLMetric value, e.g. "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB (stage 0.0: + // task 4))" OR "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)" protected val sizeMetricPattern = { val bytes = "([0-9]+(\\.[0-9]+)?) (EiB|PiB|TiB|GiB|MiB|KiB|B)" - val maxMetrics = "\\(stage ([0-9])+ \\(attempt ([0-9])+\\)\\: task ([0-9])+\\)" - s"\\n$bytes \\($bytes, $bytes, $bytes( $maxMetrics)?\\)" + val maxMetrics = "\\(stage ([0-9])+\\.([0-9])+\\: task ([0-9])+\\)" + s"(.*\\n$bytes \\($bytes, $bytes, $bytes( $maxMetrics)?\\))|($bytes)" } - // Pattern of timing SQLMetric value, e.g. "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms (stage 3 (attempt - // 0): task 217))" OR "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)" + // Pattern of timing SQLMetric value, e.g. 
"\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms (stage 3.0): + // task 217))" OR "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)" OR "1.0 ms" protected val timingMetricPattern = { val duration = "([0-9]+(\\.[0-9]+)?) (ms|s|m|h)" - val maxMetrics = "\\(stage ([0-9])+ \\(attempt ([0-9])+\\)\\: task ([0-9])+\\)" - s"\\n$duration \\($duration, $duration, $duration( $maxMetrics)?\\)" + val maxMetrics = "\\(stage ([0-9])+\\.([0-9])+\\: task ([0-9])+\\)" + s"(.*\\n$duration \\($duration, $duration, $duration( $maxMetrics)?\\))|($duration)" } // Pattern of size SQLMetric value for Aggregate tests. - // e.g "\n(1, 1, 0.9 (stage 1 (attempt 0): task 8)) OR "\n(1, 1, 0.9 )" + // e.g "\n(1, 1, 0.9 (stage 1.0: task 8)) OR "\n(1, 1, 0.9 )" OR "1" protected val aggregateMetricsPattern = { val iters = "([0-9]+(\\.[0-9]+)?)" - val maxMetrics = "\\(stage ([0-9])+ \\(attempt ([0-9])+\\)\\: task ([0-9])+\\)" - s"\\n\\($iters, $iters, $iters( $maxMetrics)?\\)" + val maxMetrics = "\\(stage ([0-9])+\\.([0-9])+\\: task ([0-9])+\\)" + s"(.*\\n\\($iters, $iters, $iters( $maxMetrics)?\\))|($iters)" } /** @@ -98,7 +98,7 @@ trait SQLMetricsTestUtils extends SQLTestUtils { } val totalNumBytesMetric = executedNode.metrics.find( - _.name == "written output total (min, med, max (stageId (attemptId): taskId))").get + _.name == "written output").get val totalNumBytes = metrics(totalNumBytesMetric.accumulatorId).replaceAll(",", "") .split(" ").head.trim.toDouble assert(totalNumBytes > 0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecutionSuite.scala index c228740df07c8..60b8c9627844b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecutionSuite.scala @@ -19,8 +19,13 @@ package org.apache.spark.sql.execution.streaming import org.scalatest.BeforeAndAfter +import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.catalyst.plans.logical.Range +import org.apache.spark.sql.connector.read.streaming +import org.apache.spark.sql.connector.read.streaming.SparkDataStream import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.sql.types.{LongType, StructType} class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter { @@ -68,4 +73,122 @@ class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter { CheckNewAnswer((25, 1), (30, 1)) // This should not throw the error reported in SPARK-24156 ) } + + test("no-data-batch re-executed after restart should call V1 source.getBatch()") { + val testSource = ReExecutedBatchTestSource(spark) + val df = testSource.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start").cast("long").as[Long]) + + /** Reset this test source so that it appears to be a new source requiring initialization */ + def resetSource(): StreamAction = Execute("reset source") { _ => + testSource.reset() // Make it look like a new source that needs to be re-initialized + require(testSource.currentOffset === 0) + require(testSource.getBatchCallCount === 0) + } + + /** Add data to this test source by incrementing its available offset */ + def addData(numNewRows: Int): StreamAction = 
new AddData { + override def addData(query: Option[StreamExecution]): (SparkDataStream, streaming.Offset) = { + testSource.incrementAvailableOffset(numNewRows) + (testSource, testSource.getOffset.get) + } + } + + testStream(df)( + addData(numNewRows = 10), // generate values 1...10, sets watermark to 0 + CheckAnswer(), + addData(numNewRows = 10), // generate values 11...20, sets watermark to 10 + ProcessAllAvailable(), // let no-data-batch be executed + CheckAnswer(0, 5), // start time of windows closed and outputted + Execute("verify source internal state before stop") { q => + // Last batch should be a no-data batch + require(q.lastProgress.numInputRows === 0) + // Source should have expected internal state + require(testSource.currentOffset === 20) + // getBatch should be called only for 2 batches with data, not for no-data-batches + assert(testSource.getBatchCallCount === 2) + }, + StopStream, + + /* Verify that if the last no-data-batch was incomplete, getBatch() is called only once */ + Execute("mark last batch as incomplete") { q => + // Delete the last committed batch from the commit log to signify that the last batch + // (a no-data batch) did not complete and has to be re-executed on restart. + val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L) + q.commitLog.purgeAfter(commit - 1) + }, + resetSource(), + StartStream(), + ProcessAllAvailable(), // allow initialization and re-execution + Execute("verify source.getBatch() called after re-executed no-data-batch") { q => + // After restart, getBatch() should be called once even for no-data batch + assert(testSource.getBatchCallCount === 1) + assert(testSource.currentOffset === 20) + }, + addData(numNewRows = 10), // generate values 21...30, sets watermark to 20 + ProcessAllAvailable(), // let no-data-batch be executed + CheckAnswer(0, 5, 10, 15), + StopStream, + + /* Verify that if the last no-data-batch was complete, getBatch() is still called only once */ + Execute("verify last batch was complete") { q => + // Verify that the commit log records the last batch as completed + require(q.commitLog.getLatest().map(_._1).get === q.offsetLog.getLatest().map(_._1).get) + }, + resetSource(), + StartStream(), + ProcessAllAvailable(), // allow initialization to complete + Execute("verify source.getBatch() called even if no-data-batch was not re-executed") { q => + // After restart, getBatch() should be called even for no-data batch, but only once + assert(testSource.getBatchCallCount === 1) + assert(testSource.currentOffset === 30) + }, + addData(numNewRows = 10), // generate values 31...40, sets watermark to 30 + ProcessAllAvailable(), // let no-data-batch be executed + CheckAnswer(0, 5, 10, 15, 20, 25) + ) + } + + + case class ReExecutedBatchTestSource(spark: SparkSession) extends Source { + @volatile var currentOffset = 0L + @volatile var getBatchCallCount = 0 + + override def getOffset: Option[Offset] = { + if (currentOffset <= 0) None else Some(LongOffset(currentOffset)) + } + + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { + getBatchCallCount = getBatchCallCount + 1 + if (currentOffset == 0) currentOffset = getOffsetValue(end) + val plan = Range( + start.map(getOffsetValue).getOrElse(0L) + 1L, getOffsetValue(end) + 1L, 1, None, + isStreaming = true) + Dataset.ofRows(spark, plan) + } + + def incrementAvailableOffset(numNewRows: Int): Unit = { + currentOffset = currentOffset + numNewRows + } + + def reset(): Unit = { + currentOffset = 0L + getBatchCallCount = 0 + } + def toDF(): DataFrame =
Dataset.ofRows(spark, StreamingExecutionRelation(this, spark)) + override def schema: StructType = new StructType().add("value", LongType) + override def stop(): Unit = {} + private def getOffsetValue(offset: Offset): Long = { + offset match { + case s: SerializedOffset => LongOffset(s).offset + case l: LongOffset => l.offset + case _ => throw new IllegalArgumentException("incorrect offset type: " + offset) + } + } + } } + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala index 298afa880c930..24b8a973ade38 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala @@ -57,7 +57,7 @@ class AllExecutionsPageSuite extends SharedSparkSession with BeforeAndAfter { val html = renderSQLPage(request, tab, statusStore).toString().toLowerCase(Locale.ROOT) assert(html.contains("failed queries")) - assert(!html.contains("1970")) + assert(!html.contains("1970/01/01")) } test("sorting should be successful") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 55b551d0af078..0746059365004 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlanInfo, SQLExecution} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.functions.count import org.apache.spark.sql.internal.SQLConf @@ -152,11 +153,14 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils expected.foreach { case (id, value) => // The values in actual can be SQL metrics meaning that they contain additional formatting // when converted to string. Verify that they start with the expected value. - // TODO: this is brittle. There is no requirement that the actual string needs to start - // with the accumulator value. assert(actual.contains(id)) val v = actual(id).trim - assert(v.startsWith(value.toString), s"Wrong value for accumulator $id") + if (v.contains("\n")) { + // The actual value can be "total (max, ...)\n6 ms (5 ms, ...)". 
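+ // The first line is just the summary header, so the accumulator value is compared against the second line only.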
+ assert(v.split("\n")(1).startsWith(value.toString), s"Wrong value for accumulator $id") + } else { + assert(v.startsWith(value.toString), s"Wrong value for accumulator $id") + } } } @@ -506,7 +510,7 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils override lazy val executedPlan = physicalPlan } - SQLExecution.withNewExecutionId(spark, dummyQueryExecution) { + SQLExecution.withNewExecutionId(dummyQueryExecution) { physicalPlan.execute().collect() } @@ -620,13 +624,12 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils assert(statusStore.execution(2) === None) } - test("SPARK-29894 test Codegen Stage Id in SparkPlanInfo") { - withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { - // with AQE on, the WholeStageCodegen rule is applied when running QueryStageExec. - val df = createTestDataFrame.select(count("*")) - val sparkPlanInfo = SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan) - assert(sparkPlanInfo.nodeName === "WholeStageCodegen (2)") - } + test("SPARK-29894 test Codegen Stage Id in SparkPlanInfo", + DisableAdaptiveExecution("WSCG rule is applied later in AQE")) { + // with AQE on, the WholeStageCodegen rule is applied when running QueryStageExec. + val df = createTestDataFrame.select(count("*")) + val sparkPlanInfo = SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan) + assert(sparkPlanInfo.nodeName === "WholeStageCodegen (2)") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 37d028d6a713f..a369b2d6900f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -447,13 +447,6 @@ class ColumnarBatchSuite extends SparkFunSuite { Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET, 2.234f) Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET + 4, 1.123f) - if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { - // Ensure array contains Little Endian floats - val bb = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN) - Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET, bb.getFloat(0)) - Platform.putFloat(buffer, Platform.BYTE_ARRAY_OFFSET + 4, bb.getFloat(4)) - } - column.putFloats(idx, 1, buffer, 4) column.putFloats(idx + 1, 1, buffer, 0) reference += 1.123f @@ -491,6 +484,57 @@ class ColumnarBatchSuite extends SparkFunSuite { } } + testVector("[SPARK-31703] Float API - Little Endian", 1024, FloatType) { + column => + val seed = System.currentTimeMillis() + val random = new Random(seed) + val reference = mutable.ArrayBuffer.empty[Float] + + var idx = 0 + + val littleEndian = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN) + littleEndian.putFloat(0, 1.357f) + littleEndian.putFloat(4, 2.468f) + val arr = new Array[Byte](littleEndian.remaining) + littleEndian.get(arr) + + column.putFloatsLittleEndian(idx, 1, arr, 4) + column.putFloatsLittleEndian(idx + 1, 1, arr, 0) + reference += 2.468f + reference += 1.357f + idx += 2 + + column.putFloatsLittleEndian(idx, 2, arr, 0) + reference += 1.357f + reference += 2.468f + idx += 2 + + while (idx < column.capacity) { + val single = random.nextBoolean() + if (single) { + val v = random.nextFloat() + column.putFloat(idx, v) + reference += v + idx += 1 + } else { + val n = math.min(random.nextInt(column.capacity / 20), column.capacity - idx) + 
val v = random.nextFloat() + column.putFloats(idx, n, v) + var i = 0 + while (i < n) { + reference += v + i += 1 + } + idx += n + } + } + + reference.zipWithIndex.foreach { v => + assert(v._1 == column.getFloat(v._2), + "Seed = " + seed + " VectorType=" + column.getClass.getSimpleName) + } + } + testVector("Double APIs", 1024, DoubleType) { column => val seed = System.currentTimeMillis() @@ -531,13 +575,6 @@ class ColumnarBatchSuite extends SparkFunSuite { Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET, 2.234) Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET + 8, 1.123) - if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { - // Ensure array contains Little Endian doubles - val bb = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN) - Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET, bb.getDouble(0)) - Platform.putDouble(buffer, Platform.BYTE_ARRAY_OFFSET + 8, bb.getDouble(8)) - } - column.putDoubles(idx, 1, buffer, 8) column.putDoubles(idx + 1, 1, buffer, 0) reference += 1.123 @@ -575,6 +612,57 @@ class ColumnarBatchSuite extends SparkFunSuite { } } + testVector("[SPARK-31703] Double API - Little Endian", 1024, DoubleType) { + column => + val seed = System.currentTimeMillis() + val random = new Random(seed) + val reference = mutable.ArrayBuffer.empty[Double] + + var idx = 0 + + val littleEndian = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN) + littleEndian.putDouble(0, 1.357) + littleEndian.putDouble(8, 2.468) + val arr = new Array[Byte](littleEndian.remaining) + littleEndian.get(arr) + + column.putDoublesLittleEndian(idx, 1, arr, 8) + column.putDoublesLittleEndian(idx + 1, 1, arr, 0) + reference += 2.468 + reference += 1.357 + idx += 2 + + column.putDoublesLittleEndian(idx, 2, arr, 0) + reference += 1.357 + reference += 2.468 + idx += 2 + + while (idx < column.capacity) { + val single = random.nextBoolean() + if (single) { + val v = random.nextDouble() + column.putDouble(idx, v) + reference += v + idx += 1 + } else { + val n = math.min(random.nextInt(column.capacity / 20), column.capacity - idx) + val v = random.nextDouble() + column.putDoubles(idx, n, v) + var i = 0 + while (i < n) { + reference += v + i += 1 + } + idx += n + } + } + + reference.zipWithIndex.foreach { v => + assert(v._1 == column.getDouble(v._2), + "Seed = " + seed + " VectorType=" + column.getClass.getSimpleName) + } + } + testVector("String APIs", 7, StringType) { column => val reference = mutable.ArrayBuffer.empty[String] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala new file mode 100644 index 0000000000000..e18514c6f93f9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/expressions/ExpressionInfoSuite.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.expressions + +import scala.collection.parallel.immutable.ParVector + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.expressions.ExpressionInfo +import org.apache.spark.sql.execution.HiveResult.hiveResultString +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +class ExpressionInfoSuite extends SparkFunSuite with SharedSparkSession { + + test("Replace _FUNC_ in ExpressionInfo") { + val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("upper")) + assert(info.getName === "upper") + assert(info.getClassName === "org.apache.spark.sql.catalyst.expressions.Upper") + assert(info.getUsage === "upper(str) - Returns `str` with all characters changed to uppercase.") + assert(info.getExamples.contains("> SELECT upper('SparkSql');")) + assert(info.getSince === "1.0.1") + assert(info.getNote === "") + assert(info.getExtended.contains("> SELECT upper('SparkSql');")) + } + + test("group info in ExpressionInfo") { + val info = spark.sessionState.catalog.lookupFunctionInfo(FunctionIdentifier("sum")) + assert(info.getGroup === "agg_funcs") + + Seq("agg_funcs", "array_funcs", "datetime_funcs", "json_funcs", "map_funcs", "window_funcs") + .foreach { groupName => + val info = new ExpressionInfo( + "testClass", null, "testName", null, "", "", "", groupName, "", "") + assert(info.getGroup === groupName) + } + + val errMsg = intercept[IllegalArgumentException] { + val invalidGroupName = "invalid_group_funcs" + new ExpressionInfo("testClass", null, "testName", null, "", "", "", invalidGroupName, "", "") + }.getMessage + assert(errMsg.contains("'group' is malformed in the expression [testName].")) + } + + test("error handling in ExpressionInfo") { + val errMsg1 = intercept[IllegalArgumentException] { + val invalidNote = " invalid note" + new ExpressionInfo("testClass", null, "testName", null, "", "", invalidNote, "", "", "") + }.getMessage + assert(errMsg1.contains("'note' is malformed in the expression [testName].")) + + val errMsg2 = intercept[IllegalArgumentException] { + val invalidSince = "-3.0.0" + new ExpressionInfo("testClass", null, "testName", null, "", "", "", "", invalidSince, "") + }.getMessage + assert(errMsg2.contains("'since' is malformed in the expression [testName].")) + + val errMsg3 = intercept[IllegalArgumentException] { + val invalidDeprecated = " invalid deprecated" + new ExpressionInfo("testClass", null, "testName", null, "", "", "", "", "", invalidDeprecated) + }.getMessage + assert(errMsg3.contains("'deprecated' is malformed in the expression [testName].")) + } + + test("using _FUNC_ instead of function names in examples") { + val exampleRe = "(>.*;)".r + val setStmtRe = "(?i)^(>\\s+set\\s+).+".r + val ignoreSet = Set( + // Examples for CaseWhen show simpler syntax: + // `CASE WHEN ... THEN ... WHEN ... THEN ... END` + "org.apache.spark.sql.catalyst.expressions.CaseWhen", + // _FUNC_ is replaced by `locate` but `locate(... 
IN ...)` is not supported + "org.apache.spark.sql.catalyst.expressions.StringLocate", + // _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)` + "org.apache.spark.sql.catalyst.expressions.Remainder", + // Examples demonstrate alternative names, see SPARK-20749 + "org.apache.spark.sql.catalyst.expressions.Length") + spark.sessionState.functionRegistry.listFunction().foreach { funcId => + val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) + val className = info.getClassName + withClue(s"Expression class '$className'") { + val exprExamples = info.getOriginalExamples + if (!exprExamples.isEmpty && !ignoreSet.contains(className)) { + assert(exampleRe.findAllIn(exprExamples).toIterable + .filter(setStmtRe.findFirstIn(_).isEmpty) // Ignore SET commands + .forall(_.contains("_FUNC_"))) + } + } + } + } + + test("check outputs of expression examples") { + def unindentAndTrim(s: String): String = { + s.replaceAll("\n\\s+", "\n").trim + } + val beginSqlStmtRe = " > ".r + val endSqlStmtRe = ";\n".r + def checkExampleSyntax(example: String): Unit = { + val beginStmtNum = beginSqlStmtRe.findAllIn(example).length + val endStmtNum = endSqlStmtRe.findAllIn(example).length + assert(beginStmtNum === endStmtNum, + "The number of ` > ` does not match to the number of `;`") + } + val exampleRe = """^(.+);\n(?s)(.+)$""".r + val ignoreSet = Set( + // One of examples shows getting the current timestamp + "org.apache.spark.sql.catalyst.expressions.UnixTimestamp", + "org.apache.spark.sql.catalyst.expressions.CurrentDate", + "org.apache.spark.sql.catalyst.expressions.CurrentTimestamp", + "org.apache.spark.sql.catalyst.expressions.Now", + // Random output without a seed + "org.apache.spark.sql.catalyst.expressions.Rand", + "org.apache.spark.sql.catalyst.expressions.Randn", + "org.apache.spark.sql.catalyst.expressions.Shuffle", + "org.apache.spark.sql.catalyst.expressions.Uuid", + // The example calls methods that return unstable results. + "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection") + + val parFuncs = new ParVector(spark.sessionState.functionRegistry.listFunction().toVector) + parFuncs.foreach { funcId => + // Examples can change settings. We clone the session to prevent tests clashing. + val clonedSpark = spark.cloneSession() + // Coalescing partitions can change result order, so disable it. 
+ clonedSpark.sessionState.conf.setConf(SQLConf.COALESCE_PARTITIONS_ENABLED, false) + val info = clonedSpark.sessionState.catalog.lookupFunctionInfo(funcId) + val className = info.getClassName + if (!ignoreSet.contains(className)) { + withClue(s"Function '${info.getName}', Expression class '$className'") { + val example = info.getExamples + checkExampleSyntax(example) + example.split(" > ").toList.foreach { + case exampleRe(sql, output) => + val df = clonedSpark.sql(sql) + val actual = unindentAndTrim( + hiveResultString(df.queryExecution.executedPlan).mkString("\n")) + val expected = unindentAndTrim(output) + assert(actual === expected) + case _ => + } + } + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index d6a1fde2147b1..4630a429830fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -89,9 +89,9 @@ class CatalogSuite extends SharedSparkSession { val columns = dbName .map { db => spark.catalog.listColumns(db, tableName) } .getOrElse { spark.catalog.listColumns(tableName) } - assume(tableMetadata.schema.nonEmpty, "bad test") - assume(tableMetadata.partitionColumnNames.nonEmpty, "bad test") - assume(tableMetadata.bucketSpec.isDefined, "bad test") + assert(tableMetadata.schema.nonEmpty, "bad test") + assert(tableMetadata.partitionColumnNames.nonEmpty, "bad test") + assert(tableMetadata.bucketSpec.isDefined, "bad test") assert(columns.collect().map(_.name).toSet == tableMetadata.schema.map(_.name).toSet) val bucketColumnNames = tableMetadata.bucketSpec.map(_.bucketColumnNames).getOrElse(Nil).toSet columns.collect().foreach { col => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/DeprecatedCreateExternalTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/DeprecatedCreateExternalTableSuite.scala new file mode 100644 index 0000000000000..0b5cd3dae4761 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/DeprecatedCreateExternalTableSuite.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.internal + +import java.io.File + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.CatalogTableType +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StructType + +class DeprecatedCreateExternalTableSuite extends SharedSparkSession { + test("createExternalTable with explicit path") { + withTable("t") { + withTempDir { dir => + val path = new File(dir, "test") + spark.range(100).write.parquet(path.getAbsolutePath) + spark.catalog.createExternalTable( + tableName = "t", + path = path.getAbsolutePath + ) + assert(spark.sessionState.catalog.tableExists(TableIdentifier("t"))) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) + assert(table.tableType === CatalogTableType.EXTERNAL) + assert(table.provider === Some("parquet")) + assert(table.schema === new StructType().add("id", "long")) + assert(table.storage.locationUri.get == makeQualifiedPath(path.getAbsolutePath)) + } + } + } + + test("createExternalTable with 'path' options") { + withTable("t") { + withTempDir { dir => + val path = new File(dir, "test") + spark.range(100).write.parquet(path.getAbsolutePath) + spark.catalog.createExternalTable( + tableName = "t", + source = "parquet", + options = Map("path" -> path.getAbsolutePath)) + assert(spark.sessionState.catalog.tableExists(TableIdentifier("t"))) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) + assert(table.tableType === CatalogTableType.EXTERNAL) + assert(table.provider === Some("parquet")) + assert(table.schema === new StructType().add("id", "long")) + assert(table.storage.locationUri.get == makeQualifiedPath(path.getAbsolutePath)) + } + } + } + + test("createExternalTable with explicit schema") { + withTable("t") { + withTempDir { dir => + val path = new File(dir, "test") + spark.range(100).write.parquet(path.getAbsolutePath) + spark.catalog.createExternalTable( + tableName = "t", + source = "parquet", + schema = new StructType().add("i", "int"), + options = Map("path" -> path.getAbsolutePath)) + assert(spark.sessionState.catalog.tableExists(TableIdentifier("t"))) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) + assert(table.tableType === CatalogTableType.EXTERNAL) + assert(table.provider === Some("parquet")) + assert(table.schema === new StructType().add("i", "int")) + assert(table.storage.locationUri.get == makeQualifiedPath(path.getAbsolutePath)) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 0cc658c499615..567524ac75c2e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.internal +import java.util.UUID + import org.scalatest.Assertions._ import org.apache.spark.{SparkException, SparkFunSuite, TaskContext} @@ -26,6 +28,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlan} +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution import org.apache.spark.sql.execution.debug.codegenStringSeq import 
org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SQLTestUtils @@ -96,10 +99,10 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { } } - test("SPARK-22219: refactor to control to generate comment") { + test("SPARK-22219: refactor to control to generate comment", + DisableAdaptiveExecution("WSCG rule is applied later in AQE")) { Seq(true, false).foreach { flag => - withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString, - SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) { // with AQE on, the WholeStageCodegen rule is applied when running QueryStageExec. val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() .queryExecution.executedPlan) @@ -144,17 +147,45 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { } // set local configuration and assert - val confValue1 = "e" + val confValue1 = UUID.randomUUID().toString() createDataframe(confKey, confValue1).createOrReplaceTempView("m") spark.sparkContext.setLocalProperty(confKey, confValue1) - assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM m)").collect.size == 1) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM m)").collect().length == 1) // change the conf value and assert again - val confValue2 = "f" + val confValue2 = UUID.randomUUID().toString() createDataframe(confKey, confValue2).createOrReplaceTempView("n") spark.sparkContext.setLocalProperty(confKey, confValue2) - assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM n)").collect().size == 1) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM n)").collect().length == 1) + } + } + } + + test("SPARK-22590 propagate local properties to broadcast execution thread") { + withSQLConf(StaticSQLConf.BROADCAST_EXCHANGE_MAX_THREAD_THRESHOLD.key -> "1") { + val df1 = Seq(true).toDF() + val confKey = "spark.sql.y" + val confValue1 = UUID.randomUUID().toString() + val confValue2 = UUID.randomUUID().toString() + + def generateBroadcastDataFrame(confKey: String, confValue: String): Dataset[Boolean] = { + val df = spark.range(1).mapPartitions { _ => + Iterator(TaskContext.get.getLocalProperty(confKey) == confValue) + } + df.hint("broadcast") } + + // set local property and assert + val df2 = generateBroadcastDataFrame(confKey, confValue1) + spark.sparkContext.setLocalProperty(confKey, confValue1) + val checks = df1.join(df2).collect() + assert(checks.forall(_.toSeq == Seq(true, true))) + + // change local property and re-assert + val df3 = generateBroadcastDataFrame(confKey, confValue2) + spark.sparkContext.setLocalProperty(confKey, confValue2) + val checks2 = df1.join(df3).collect() + assert(checks2.forall(_.toSeq == Seq(true, true))) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 61be3672f3ebe..f1cd37f91d78a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.fs.Path import org.apache.log4j.Level import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.MIT import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.test.{SharedSparkSession, TestSQLContext} import org.apache.spark.util.Utils @@ -115,6 +116,21 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { } } +
test("SPARK-31234: reset will not change static sql configs and spark core configs") { + val conf = spark.sparkContext.getConf.getAll.toMap + val appName = conf.get("spark.app.name") + val driverHost = conf.get("spark.driver.host") + val master = conf.get("spark.master") + val warehouseDir = conf.get("spark.sql.warehouse.dir") + // ensure the conf here is not default value, and will not be reset to default value later + assert(warehouseDir.get.contains(this.getClass.getCanonicalName)) + sql("RESET") + assert(conf.get("spark.app.name") === appName) + assert(conf.get("spark.driver.host") === driverHost) + assert(conf.get("spark.master") === master) + assert(conf.get("spark.sql.warehouse.dir") === warehouseDir) + } + test("reset - public conf") { spark.sessionState.conf.clear() val original = spark.conf.get(SQLConf.GROUP_BY_ORDINAL) @@ -170,33 +186,33 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { assert(e.getMessage === s"${SQLConf.CASE_SENSITIVE.key} should be boolean, but was 10") } - test("Test SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE's method") { + test("Test ADVISORY_PARTITION_SIZE_IN_BYTES's method") { spark.sessionState.conf.clear() - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "100") - assert(spark.conf.get(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE) === 100) + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "100") + assert(spark.conf.get(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) === 100) - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "1k") - assert(spark.conf.get(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE) === 1024) + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "1k") + assert(spark.conf.get(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) === 1024) - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "1M") - assert(spark.conf.get(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE) === 1048576) + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "1M") + assert(spark.conf.get(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) === 1048576) - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "1g") - assert(spark.conf.get(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE) === 1073741824) + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "1g") + assert(spark.conf.get(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) === 1073741824) - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "-1") - assert(spark.conf.get(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE) === -1) + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "-1") + assert(spark.conf.get(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) === -1) // Test overflow exception intercept[IllegalArgumentException] { // This value exceeds Long.MaxValue - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "90000000000g") + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "90000000000g") } intercept[IllegalArgumentException] { // This value less than Long.MinValue - spark.conf.set(SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key, "-90000000000g") + spark.conf.set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "-90000000000g") } spark.sessionState.conf.clear() @@ -259,7 +275,7 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { // check default value assert(spark.sessionState.conf.parquetOutputTimestampType == - SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS) + SQLConf.ParquetOutputTimestampType.INT96) 
spark.sessionState.conf.setConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE, "timestamp_micros") assert(spark.sessionState.conf.parquetOutputTimestampType == @@ -285,8 +301,8 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { assert(spark.sessionState.conf.getConfString(fallback.key, "lzo") === "lzo") val displayValue = spark.sessionState.conf.getAllDefinedConfs - .find { case (key, _, _) => key == fallback.key } - .map { case (_, v, _) => v } + .find { case (key, _, _, _) => key == fallback.key } + .map { case (_, v, _, _) => v } .get assert(displayValue === fallback.defaultValueString) @@ -297,8 +313,8 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { assert(spark.sessionState.conf.getConfString(fallback.key) === "lzo") val newDisplayValue = spark.sessionState.conf.getAllDefinedConfs - .find { case (key, _, _) => key == fallback.key } - .map { case (_, v, _) => v } + .find { case (key, _, _, _) => key == fallback.key } + .map { case (_, v, _, _) => v } .get assert(newDisplayValue === "lzo") @@ -348,4 +364,23 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { } check(config2) } + + test("spark.sql.session.timeZone should only accept valid zone id") { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, MIT.getId) + assert(sql(s"set ${SQLConf.SESSION_LOCAL_TIMEZONE.key}").head().getString(1) === MIT.getId) + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "America/Chicago") + assert(sql(s"set ${SQLConf.SESSION_LOCAL_TIMEZONE.key}").head().getString(1) === + "America/Chicago") + + intercept[IllegalArgumentException] { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "pst") + } + intercept[IllegalArgumentException] { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "GMT+8:00") + } + val e = intercept[IllegalArgumentException] { + spark.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, "Asia/shanghai") + } + assert(e.getMessage === "Cannot resolve the given timezone with ZoneId.of(_, ZoneId.SHORT_IDS)") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SharedStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SharedStateSuite.scala new file mode 100644 index 0000000000000..4d33fc1855ab2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SharedStateSuite.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.internal + +import java.net.URL + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.FsUrlStreamHandlerFactory + +import org.apache.spark.SparkConf +import org.apache.spark.sql.test.SharedSparkSession + + +/** + * Tests for [[org.apache.spark.sql.internal.SharedState]]. 
+ */ +class SharedStateSuite extends SharedSparkSession { + + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.hadoop.fs.defaultFS", "file:///") + } + + test("SPARK-31692: Url handler factory should have the hadoop configs from Spark conf") { + // Accessing shared state to init the object since it is `lazy val` + spark.sharedState + val field = classOf[URL].getDeclaredField("factory") + field.setAccessible(true) + val value = field.get(null) + assert(value.isInstanceOf[FsUrlStreamHandlerFactory]) + val streamFactory = value.asInstanceOf[FsUrlStreamHandlerFactory] + + val confField = classOf[FsUrlStreamHandlerFactory].getDeclaredField("conf") + confField.setAccessible(true) + val conf = confField.get(streamFactory) + + assert(conf.isInstanceOf[Configuration]) + assert(conf.asInstanceOf[Configuration].get("fs.defaultFS") == "file:///") + } + + test("SPARK-33740: hadoop configs in hive-site.xml can override pre-existing hadoop ones") { + val conf = new SparkConf() + val hadoopConf = new Configuration() + SharedState.loadHiveConfFile(conf, hadoopConf) + assert(hadoopConf.get("hadoop.tmp.dir") === "/tmp/hive_one") + hadoopConf.clear() + SharedState.loadHiveConfFile( + conf.set("spark.hadoop.hadoop.tmp.dir", "noop"), hadoopConf) + assert(hadoopConf.get("hadoop.tmp.dir") === null) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 9cba95f7d7df2..1f584a569c3f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -21,6 +21,8 @@ import java.math.BigDecimal import java.sql.{Date, DriverManager, SQLException, Timestamp} import java.util.{Calendar, GregorianCalendar, Properties} +import scala.collection.JavaConverters._ + import org.h2.jdbc.JdbcSQLException import org.scalatest.{BeforeAndAfter, PrivateMethodTester} @@ -635,12 +637,14 @@ class JDBCSuite extends QueryTest } test("test DATE types in cache") { - val rows = spark.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties()).collect() - spark.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties()) - .cache().createOrReplaceTempView("mycached_date") - val cachedRows = sql("select * from mycached_date").collect() - assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01")) - assert(cachedRows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01")) + withTempView("mycached_date") { + val rows = spark.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties()).collect() + spark.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties()) + .cache().createOrReplaceTempView("mycached_date") + val cachedRows = sql("select * from mycached_date").collect() + assert(rows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01")) + assert(cachedRows(0).getAs[java.sql.Date](1) === java.sql.Date.valueOf("1996-01-01")) + } } test("test types for null value") { @@ -692,7 +696,7 @@ class JDBCSuite extends QueryTest test("Remap types via JdbcDialects") { JdbcDialects.registerDialect(testH2Dialect) val df = spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new Properties()) - assert(df.schema.filter(_.dataType != org.apache.spark.sql.types.StringType).isEmpty) + assert(!df.schema.exists(_.dataType != org.apache.spark.sql.types.StringType)) val rows = df.collect() assert(rows(0).get(0).isInstanceOf[String]) assert(rows(0).get(1).isInstanceOf[String]) @@
-847,6 +851,8 @@ class JDBCSuite extends QueryTest assert(Postgres.getCatalystType(java.sql.Types.OTHER, "jsonb", 1, null) === Some(StringType)) assert(Postgres.getCatalystType(java.sql.Types.ARRAY, "_numeric", 0, md) == Some(ArrayType(DecimalType.SYSTEM_DEFAULT))) + assert(Postgres.getCatalystType(java.sql.Types.ARRAY, "_bpchar", 64, md) == + Some(ArrayType(StringType))) assert(Postgres.getJDBCType(FloatType).map(_.databaseTypeDefinition).get == "FLOAT4") assert(Postgres.getJDBCType(DoubleType).map(_.databaseTypeDefinition).get == "FLOAT8") assert(Postgres.getJDBCType(ByteType).map(_.databaseTypeDefinition).get == "SMALLINT") @@ -1300,7 +1306,8 @@ class JDBCSuite extends QueryTest testJdbcOptions(new JDBCOptions(parameters)) testJdbcOptions(new JDBCOptions(CaseInsensitiveMap(parameters))) // test add/remove key-value from the case-insensitive map - var modifiedParameters = CaseInsensitiveMap(Map.empty) ++ parameters + var modifiedParameters = + (CaseInsensitiveMap(Map.empty) ++ parameters).asInstanceOf[Map[String, String]] testJdbcOptions(new JDBCOptions(modifiedParameters)) modifiedParameters -= "dbtable" assert(modifiedParameters.get("dbTAblE").isEmpty) @@ -1699,4 +1706,37 @@ class JDBCSuite extends QueryTest assert(JdbcDialects.get("jdbc:teradata://localhost/db") === TeradataDialect) assert(JdbcDialects.get("jdbc:Teradata://localhost/db") === TeradataDialect) } + + test("SQLContext.jdbc (deprecated)") { + val sqlContext = spark.sqlContext + var jdbcDF = sqlContext.jdbc(urlWithUserAndPass, "TEST.PEOPLE") + checkAnswer(jdbcDF, Row("fred", 1) :: Row("mary", 2) :: Row ("joe 'foo' \"bar\"", 3) :: Nil) + + jdbcDF = sqlContext.jdbc(urlWithUserAndPass, "TEST.PEOPLE", "THEID", 0, 4, 3) + checkNumPartitions(jdbcDF, 3) + checkAnswer(jdbcDF, Row("fred", 1) :: Row("mary", 2) :: Row ("joe 'foo' \"bar\"", 3) :: Nil) + + val parts = Array[String]("THEID = 2") + jdbcDF = sqlContext.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts) + checkAnswer(jdbcDF, Row("mary", 2) :: Nil) + } + + test("SPARK-32364: JDBCOption constructor") { + val extraOptions = CaseInsensitiveMap[String](Map("UrL" -> "url1", "dBTable" -> "table1")) + val connectionProperties = new Properties() + connectionProperties.put("url", "url2") + connectionProperties.put("dbtable", "table2") + + // connection property should override the options in extraOptions + val params = extraOptions ++ connectionProperties.asScala + assert(params.size == 2) + assert(params.get("uRl").contains("url2")) + assert(params.get("DbtaBle").contains("table2")) + + // JDBCOptions constructor parameter should overwrite the existing conf + val options = new JDBCOptions(url, "table3", params) + assert(options.asProperties.size == 2) + assert(options.asProperties.get("url") == url) + assert(options.asProperties.get("dbtable") == "table3") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index c7266c886128c..df8ca33b893cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.execution.{DataSourceScanExec, SortExec} +import 
org.apache.spark.sql.execution.{DataSourceScanExec, FileSourceScanExec, SortExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import org.apache.spark.sql.execution.datasources.BucketingUtils import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec @@ -42,7 +42,7 @@ import org.apache.spark.util.collection.BitSet class BucketedReadWithoutHiveSupportSuite extends BucketedReadSuite with SharedSparkSession { protected override def beforeAll(): Unit = { super.beforeAll() - assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") + assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") } } @@ -100,6 +100,12 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { } } + private def getFileScan(plan: SparkPlan): FileSourceScanExec = { + val fileScan = plan.collect { case f: FileSourceScanExec => f } + assert(fileScan.nonEmpty, plan) + fileScan.head + } + // To verify if the bucket pruning works, this function checks two conditions: // 1) Check if the pruned buckets (before filtering) are empty. // 2) Verify the final result is the same as the expected one @@ -119,8 +125,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { // Filter could hide the bug in bucket pruning. Thus, skipping all the filters val plan = bucketedDataFrame.filter(filterCondition).queryExecution.executedPlan - val rdd = plan.find(_.isInstanceOf[DataSourceScanExec]) - assert(rdd.isDefined, plan) + val fileScan = getFileScan(plan) // if nothing should be pruned, skip the pruning test if (bucketValues.nonEmpty) { @@ -128,7 +133,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { bucketValues.foreach { value => matchedBuckets.set(BucketingUtils.getBucketIdFromValue(bucketColumn, numBuckets, value)) } - val invalidBuckets = rdd.get.execute().mapPartitionsWithIndex { case (index, iter) => + val invalidBuckets = fileScan.execute().mapPartitionsWithIndex { case (index, iter) => // return indexes of partitions that should have been pruned and are not empty if (!matchedBuckets.get(index % numBuckets) && iter.nonEmpty) { Iterator(index) @@ -183,7 +188,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { // Case 4: InSet val inSetExpr = expressions.InSet($"j".expr, - Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3).map(lit(_).expr)) + Set(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3)) checkPrunedAnswers( bucketSpec, bucketValues = Seq(bucketValue, bucketValue + 1, bucketValue + 2, bucketValue + 3), @@ -297,10 +302,9 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { val bucketedDataFrame = spark.table("bucketed_table").select("i", "j", "k") val plan = bucketedDataFrame.queryExecution.executedPlan - val rdd = plan.find(_.isInstanceOf[DataSourceScanExec]) - assert(rdd.isDefined, plan) + val fileScan = getFileScan(plan) - val emptyBuckets = rdd.get.execute().mapPartitionsWithIndex { case (index, iter) => + val emptyBuckets = fileScan.execute().mapPartitionsWithIndex { case (index, iter) => // return indexes of empty partitions if (iter.isEmpty) { Iterator(index) @@ -762,10 +766,13 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") - checkAnswer(spark.table("bucketed_table").select("j"), df1.select("j")) + val scanDF = spark.table("bucketed_table").select("j") + 
assert(!getFileScan(scanDF.queryExecution.executedPlan).bucketedScan) + checkAnswer(scanDF, df1.select("j")) - checkAnswer(spark.table("bucketed_table").groupBy("j").agg(max("k")), - df1.groupBy("j").agg(max("k"))) + val aggDF = spark.table("bucketed_table").groupBy("j").agg(max("k")) + assert(!getFileScan(aggDF.queryExecution.executedPlan).bucketedScan) + checkAnswer(aggDF, df1.groupBy("j").agg(max("k"))) } } @@ -821,7 +828,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { test("SPARK-29655 Read bucketed tables obeys spark.sql.shuffle.partitions") { withSQLConf( SQLConf.SHUFFLE_PARTITIONS.key -> "5", - SQLConf.SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS.key -> "7") { + SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key -> "7") { val bucketSpec = Some(BucketSpec(6, Seq("i", "j"), Nil)) Seq(false, true).foreach { enableAdaptive => withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> s"$enableAdaptive") { @@ -836,4 +843,32 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { } } } + + test("SPARK-32767 Bucket join should work if SHUFFLE_PARTITIONS larger than bucket number") { + withSQLConf( + SQLConf.SHUFFLE_PARTITIONS.key -> "9", + SQLConf.COALESCE_PARTITIONS_INITIAL_PARTITION_NUM.key -> "10") { + + val testSpec1 = BucketedTableTestSpec( + Some(BucketSpec(8, Seq("i", "j"), Seq("i", "j"))), + numPartitions = 1, + expectedShuffle = false, + expectedSort = false) + val testSpec2 = BucketedTableTestSpec( + Some(BucketSpec(6, Seq("i", "j"), Seq("i", "j"))), + numPartitions = 1, + expectedShuffle = true, + expectedSort = true) + Seq(false, true).foreach { enableAdaptive => + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> s"$enableAdaptive") { + Seq((testSpec1, testSpec2), (testSpec2, testSpec1)).foreach { specs => + testBucketing( + bucketedTableTestSpecLeft = specs._1, + bucketedTableTestSpecRight = specs._2, + joinCondition = joinCondition(Seq("i", "j"))) + } + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala index 9713de988e379..a410f32d4af7e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedWriteSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} class BucketedWriteWithoutHiveSupportSuite extends BucketedWriteSuite with SharedSparkSession { protected override def beforeAll(): Unit = { super.beforeAll() - assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") + assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") } override protected def fileFormatsToTest: Seq[String] = Seq("parquet", "json") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceAnalysisSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceAnalysisSuite.scala index e1022e377132c..a6c50904d395b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceAnalysisSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceAnalysisSuite.scala @@ -22,9 +22,10 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast, Expression, Literal} +import 
org.apache.spark.sql.catalyst.expressions.{Alias, AnsiCast, Attribute, Cast, Expression, Literal} import org.apache.spark.sql.execution.datasources.DataSourceAnalysis import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.types.{DataType, IntegerType, StructType} class DataSourceAnalysisSuite extends SparkFunSuite with BeforeAndAfterAll { @@ -52,7 +53,12 @@ class DataSourceAnalysisSuite extends SparkFunSuite with BeforeAndAfterAll { Seq(true, false).foreach { caseSensitive => val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive) def cast(e: Expression, dt: DataType): Expression = { - Cast(e, dt, Option(conf.sessionLocalTimeZone)) + conf.storeAssignmentPolicy match { + case StoreAssignmentPolicy.ANSI | StoreAssignmentPolicy.STRICT => + AnsiCast(e, dt, Option(conf.sessionLocalTimeZone)) + case _ => + Cast(e, dt, Option(conf.sessionLocalTimeZone)) + } } val rule = DataSourceAnalysis(conf) test( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/FiltersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/FiltersSuite.scala index 1cb7a2156c3d3..33b2db57d9f0f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/FiltersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/FiltersSuite.scala @@ -24,66 +24,143 @@ import org.apache.spark.SparkFunSuite */ class FiltersSuite extends SparkFunSuite { - test("EqualTo references") { - assert(EqualTo("a", "1").references.toSeq == Seq("a")) - assert(EqualTo("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) + private def withFieldNames(f: (String, Array[String]) => Unit): Unit = { + Seq(("a", Array("a")), + ("a.b", Array("a", "b")), + ("`a.b`.c", Array("a.b", "c")), + ("`a.b`.`c.d`.`e.f`", Array("a.b", "c.d", "e.f")) + ).foreach { case (name, fieldNames) => + f(name, fieldNames) + } } - test("EqualNullSafe references") { - assert(EqualNullSafe("a", "1").references.toSeq == Seq("a")) - assert(EqualNullSafe("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) - } + test("EqualTo references") { withFieldNames { (name, fieldNames) => + assert(EqualTo(name, "1").references.toSeq == Seq(name)) + assert(EqualTo(name, "1").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) - test("GreaterThan references") { - assert(GreaterThan("a", "1").references.toSeq == Seq("a")) - assert(GreaterThan("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) - } + assert(EqualTo(name, EqualTo("b", "2")).references.toSeq == Seq(name, "b")) + assert(EqualTo("b", EqualTo(name, "2")).references.toSeq == Seq("b", name)) - test("GreaterThanOrEqual references") { - assert(GreaterThanOrEqual("a", "1").references.toSeq == Seq("a")) - assert(GreaterThanOrEqual("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) - } + assert(EqualTo(name, EqualTo("b", "2")).v2references.toSeq.map(_.toSeq) + == Seq(fieldNames.toSeq, Seq("b"))) + assert(EqualTo("b", EqualTo(name, "2")).v2references.toSeq.map(_.toSeq) + == Seq(Seq("b"), fieldNames.toSeq)) + }} - test("LessThan references") { - assert(LessThan("a", "1").references.toSeq == Seq("a")) - assert(LessThan("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) - } + test("EqualNullSafe references") { withFieldNames { (name, fieldNames) => + assert(EqualNullSafe(name, "1").references.toSeq == Seq(name)) + assert(EqualNullSafe(name, "1").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) - test("LessThanOrEqual references") { - 
assert(LessThanOrEqual("a", "1").references.toSeq == Seq("a")) - assert(LessThanOrEqual("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) - } + assert(EqualNullSafe(name, EqualTo("b", "2")).references.toSeq == Seq(name, "b")) + assert(EqualNullSafe("b", EqualTo(name, "2")).references.toSeq == Seq("b", name)) - test("In references") { - assert(In("a", Array("1")).references.toSeq == Seq("a")) - assert(In("a", Array("1", EqualTo("b", "2"))).references.toSeq == Seq("a", "b")) - } + assert(EqualNullSafe(name, EqualTo("b", "2")).v2references.toSeq.map(_.toSeq) + == Seq(fieldNames.toSeq, Seq("b"))) + assert(EqualNullSafe("b", EqualTo(name, "2")).v2references.toSeq.map(_.toSeq) + == Seq(Seq("b"), fieldNames.toSeq)) + }} - test("IsNull references") { - assert(IsNull("a").references.toSeq == Seq("a")) - } + test("GreaterThan references") { withFieldNames { (name, fieldNames) => + assert(GreaterThan(name, "1").references.toSeq == Seq(name)) + assert(GreaterThan(name, "1").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) - test("IsNotNull references") { - assert(IsNotNull("a").references.toSeq == Seq("a")) - } + assert(GreaterThan(name, EqualTo("b", "2")).references.toSeq == Seq(name, "b")) + assert(GreaterThan("b", EqualTo(name, "2")).references.toSeq == Seq("b", name)) - test("And references") { - assert(And(EqualTo("a", "1"), EqualTo("b", "1")).references.toSeq == Seq("a", "b")) - } + assert(GreaterThan(name, EqualTo("b", "2")).v2references.toSeq.map(_.toSeq) + == Seq(fieldNames.toSeq, Seq("b"))) + assert(GreaterThan("b", EqualTo(name, "2")).v2references.toSeq.map(_.toSeq) + == Seq(Seq("b"), fieldNames.toSeq)) + }} - test("Or references") { - assert(Or(EqualTo("a", "1"), EqualTo("b", "1")).references.toSeq == Seq("a", "b")) - } + test("GreaterThanOrEqual references") { withFieldNames { (name, fieldNames) => + assert(GreaterThanOrEqual(name, "1").references.toSeq == Seq(name)) + assert(GreaterThanOrEqual(name, "1").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) - test("StringStartsWith references") { - assert(StringStartsWith("a", "str").references.toSeq == Seq("a")) - } + assert(GreaterThanOrEqual(name, EqualTo("b", "2")).references.toSeq == Seq(name, "b")) + assert(GreaterThanOrEqual("b", EqualTo(name, "2")).references.toSeq == Seq("b", name)) - test("StringEndsWith references") { - assert(StringEndsWith("a", "str").references.toSeq == Seq("a")) - } + assert(GreaterThanOrEqual(name, EqualTo("b", "2")).v2references.toSeq.map(_.toSeq) + == Seq(fieldNames.toSeq, Seq("b"))) + assert(GreaterThanOrEqual("b", EqualTo(name, "2")).v2references.toSeq.map(_.toSeq) + == Seq(Seq("b"), fieldNames.toSeq)) + }} - test("StringContains references") { - assert(StringContains("a", "str").references.toSeq == Seq("a")) - } + test("LessThan references") { withFieldNames { (name, fieldNames) => + assert(LessThan(name, "1").references.toSeq == Seq(name)) + assert(LessThan(name, "1").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + + assert(LessThan("a", EqualTo("b", "2")).references.toSeq == Seq("a", "b")) + }} + + test("LessThanOrEqual references") { withFieldNames { (name, fieldNames) => + assert(LessThanOrEqual(name, "1").references.toSeq == Seq(name)) + assert(LessThanOrEqual(name, "1").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + + assert(LessThanOrEqual(name, EqualTo("b", "2")).references.toSeq == Seq(name, "b")) + assert(LessThanOrEqual("b", EqualTo(name, "2")).references.toSeq == Seq("b", name)) + + assert(LessThanOrEqual(name, EqualTo("b", 
"2")).v2references.toSeq.map(_.toSeq) + == Seq(fieldNames.toSeq, Seq("b"))) + assert(LessThanOrEqual("b", EqualTo(name, "2")).v2references.toSeq.map(_.toSeq) + == Seq(Seq("b"), fieldNames.toSeq)) + }} + + test("In references") { withFieldNames { (name, fieldNames) => + assert(In(name, Array("1")).references.toSeq == Seq(name)) + assert(In(name, Array("1")).v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + + assert(In(name, Array("1", EqualTo("b", "2"))).references.toSeq == Seq(name, "b")) + assert(In("b", Array("1", EqualTo(name, "2"))).references.toSeq == Seq("b", name)) + + assert(In(name, Array("1", EqualTo("b", "2"))).v2references.toSeq.map(_.toSeq) + == Seq(fieldNames.toSeq, Seq("b"))) + assert(In("b", Array("1", EqualTo(name, "2"))).v2references.toSeq.map(_.toSeq) + == Seq(Seq("b"), fieldNames.toSeq)) + }} + + test("IsNull references") { withFieldNames { (name, fieldNames) => + assert(IsNull(name).references.toSeq == Seq(name)) + assert(IsNull(name).v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + }} + + test("IsNotNull references") { withFieldNames { (name, fieldNames) => + assert(IsNotNull(name).references.toSeq == Seq(name)) + assert(IsNull(name).v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + }} + + test("And references") { withFieldNames { (name, fieldNames) => + assert(And(EqualTo(name, "1"), EqualTo("b", "1")).references.toSeq == Seq(name, "b")) + assert(And(EqualTo("b", "1"), EqualTo(name, "1")).references.toSeq == Seq("b", name)) + + assert(And(EqualTo(name, "1"), EqualTo("b", "1")).v2references.toSeq.map(_.toSeq) == + Seq(fieldNames.toSeq, Seq("b"))) + assert(And(EqualTo("b", "1"), EqualTo(name, "1")).v2references.toSeq.map(_.toSeq) == + Seq(Seq("b"), fieldNames.toSeq)) + }} + + test("Or references") { withFieldNames { (name, fieldNames) => + assert(Or(EqualTo(name, "1"), EqualTo("b", "1")).references.toSeq == Seq(name, "b")) + assert(Or(EqualTo("b", "1"), EqualTo(name, "1")).references.toSeq == Seq("b", name)) + + assert(Or(EqualTo(name, "1"), EqualTo("b", "1")).v2references.toSeq.map(_.toSeq) == + Seq(fieldNames.toSeq, Seq("b"))) + assert(Or(EqualTo("b", "1"), EqualTo(name, "1")).v2references.toSeq.map(_.toSeq) == + Seq(Seq("b"), fieldNames.toSeq)) + }} + + test("StringStartsWith references") { withFieldNames { (name, fieldNames) => + assert(StringStartsWith(name, "str").references.toSeq == Seq(name)) + assert(StringStartsWith(name, "str").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + }} + + test("StringEndsWith references") { withFieldNames { (name, fieldNames) => + assert(StringEndsWith(name, "str").references.toSeq == Seq(name)) + assert(StringEndsWith(name, "str").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + }} + + test("StringContains references") { withFieldNames { (name, fieldNames) => + assert(StringContains(name, "str").references.toSeq == Seq(name)) + assert(StringContains(name, "str").v2references.toSeq.map(_.toSeq) == Seq(fieldNames.toSeq)) + }} } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index bcff30a51c3f5..995d4b0e7230e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -523,15 +523,17 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { test("new partitions should be added to catalog after writing to catalog table") { val table = 
"partitioned_catalog_table" + val tempTable = "partitioned_catalog_temp_table" val numParts = 210 withTable(table) { - val df = (1 to numParts).map(i => (i, i)).toDF("part", "col1") - val tempTable = "partitioned_catalog_temp_table" - df.createOrReplaceTempView(tempTable) - sql(s"CREATE TABLE $table (part Int, col1 Int) USING parquet PARTITIONED BY (part)") - sql(s"INSERT INTO TABLE $table SELECT * from $tempTable") - val partitions = spark.sessionState.catalog.listPartitionNames(TableIdentifier(table)) - assert(partitions.size == numParts) + withTempView(tempTable) { + val df = (1 to numParts).map(i => (i, i)).toDF("part", "col1") + df.createOrReplaceTempView(tempTable) + sql(s"CREATE TABLE $table (part Int, col1 Int) USING parquet PARTITIONED BY (part)") + sql(s"INSERT INTO TABLE $table SELECT * from $tempTable") + val partitions = spark.sessionState.catalog.listPartitionNames(TableIdentifier(table)) + assert(partitions.size == numParts) + } } } @@ -620,12 +622,12 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { var msg = intercept[AnalysisException] { sql("insert into t select 1L, 2") }.getMessage - assert(msg.contains("Cannot safely cast 'i': LongType to IntegerType")) + assert(msg.contains("Cannot safely cast 'i': bigint to int")) msg = intercept[AnalysisException] { sql("insert into t select 1, 2.0") }.getMessage - assert(msg.contains("Cannot safely cast 'd': DecimalType(2,1) to DoubleType")) + assert(msg.contains("Cannot safely cast 'd': decimal(2,1) to double")) msg = intercept[AnalysisException] { sql("insert into t select 1, 2.0D, 3") @@ -657,18 +659,18 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { var msg = intercept[AnalysisException] { sql("insert into t values('a', 'b')") }.getMessage - assert(msg.contains("Cannot safely cast 'i': StringType to IntegerType") && - msg.contains("Cannot safely cast 'd': StringType to DoubleType")) + assert(msg.contains("Cannot safely cast 'i': string to int") && + msg.contains("Cannot safely cast 'd': string to double")) msg = intercept[AnalysisException] { sql("insert into t values(now(), now())") }.getMessage - assert(msg.contains("Cannot safely cast 'i': TimestampType to IntegerType") && - msg.contains("Cannot safely cast 'd': TimestampType to DoubleType")) + assert(msg.contains("Cannot safely cast 'i': timestamp to int") && + msg.contains("Cannot safely cast 'd': timestamp to double")) msg = intercept[AnalysisException] { sql("insert into t values(true, false)") }.getMessage - assert(msg.contains("Cannot safely cast 'i': BooleanType to IntegerType") && - msg.contains("Cannot safely cast 'd': BooleanType to DoubleType")) + assert(msg.contains("Cannot safely cast 'i': boolean to int") && + msg.contains("Cannot safely cast 'd': boolean to double")) } } } @@ -753,6 +755,27 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } } + test("SPARK-30844: static partition should also follow StoreAssignmentPolicy") { + SQLConf.StoreAssignmentPolicy.values.foreach { policy => + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> policy.toString) { + withTable("t") { + sql("create table t(a int, b string) using parquet partitioned by (a)") + policy match { + case SQLConf.StoreAssignmentPolicy.ANSI | SQLConf.StoreAssignmentPolicy.STRICT => + val errorMsg = intercept[NumberFormatException] { + sql("insert into t partition(a='ansi') values('ansi')") + }.getMessage + assert(errorMsg.contains("invalid input syntax for type numeric: ansi")) + case SQLConf.StoreAssignmentPolicy.LEGACY => + 
sql("insert into t partition(a='ansi') values('ansi')") + checkAnswer(sql("select * from t"), Row("ansi", null) :: Nil) + } + } + } + } + } + test("SPARK-24860: dynamic partition overwrite specified per source without catalog table") { withTempPath { path => Seq((1, 1), (2, 2)).toDF("i", "part") @@ -802,21 +825,29 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } test("Stop task set if FileAlreadyExistsException was thrown") { - withSQLConf("fs.file.impl" -> classOf[FileExistingTestFileSystem].getName, - "fs.file.impl.disable.cache" -> "true") { - withTable("t") { - sql( - """ - |CREATE TABLE t(i INT, part1 INT) USING PARQUET - |PARTITIONED BY (part1) + Seq(true, false).foreach { fastFail => + withSQLConf("fs.file.impl" -> classOf[FileExistingTestFileSystem].getName, + "fs.file.impl.disable.cache" -> "true", + SQLConf.FASTFAIL_ON_FILEFORMAT_OUTPUT.key -> fastFail.toString) { + withTable("t") { + sql( + """ + |CREATE TABLE t(i INT, part1 INT) USING PARQUET + |PARTITIONED BY (part1) """.stripMargin) - val df = Seq((1, 1)).toDF("i", "part1") - val err = intercept[SparkException] { - df.write.mode("overwrite").format("parquet").insertInto("t") + val df = Seq((1, 1)).toDF("i", "part1") + val err = intercept[SparkException] { + df.write.mode("overwrite").format("parquet").insertInto("t") + } + + if (fastFail) { + assert(err.getCause.getMessage.contains("can not write to output file: " + + "org.apache.hadoop.fs.FileAlreadyExistsException")) + } else { + assert(err.getCause.getMessage.contains("Task failed while writing rows")) + } } - assert(err.getCause.getMessage.contains("can not write to output file: " + - "org.apache.hadoop.fs.FileAlreadyExistsException")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index ab1d1f80e7397..6df1c5db14c26 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -24,7 +24,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.TestUtils import org.apache.spark.internal.Logging -import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol @@ -139,15 +139,15 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { checkPartitionValues(files.head, "2016-12-01 00:00:00") } withTempPath { f => - df.write.option(DateTimeUtils.TIMEZONE_OPTION, "GMT") + df.write.option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .partitionBy("ts").parquet(f.getAbsolutePath) val files = TestUtils.recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) assert(files.length == 1) - // use timeZone option "GMT" to format partition value. + // use timeZone option utcTz.getId to format partition value. 
checkPartitionValues(files.head, "2016-12-01 08:00:00") } withTempPath { f => - withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "GMT") { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { df.write.partitionBy("ts").parquet(f.getAbsolutePath) val files = TestUtils.recursiveList(f).filter(_.getAbsolutePath.endsWith("parquet")) assert(files.length == 1) @@ -156,4 +156,12 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-31968: duplicate partition columns check") { + withTempPath { f => + val e = intercept[AnalysisException]( + Seq((3, 2)).toDF("a", "b").write.partitionBy("b", "b").csv(f.getAbsolutePath)) + assert(e.getMessage.contains("Found duplicate column(s) b, b: `b`;")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala index ce3ec7f97a537..1e0d9bd9990e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala @@ -103,30 +103,32 @@ class SaveLoadSuite extends DataSourceTest with SharedSparkSession with BeforeAn } test("save and save again") { - df.write.json(path.toString) - - val message = intercept[AnalysisException] { + withTempView("jsonTable2") { df.write.json(path.toString) - }.getMessage - assert( - message.contains("already exists"), - "We should complain that the path already exists.") + val message = intercept[AnalysisException] { + df.write.json(path.toString) + }.getMessage - if (path.exists()) Utils.deleteRecursively(path) + assert( + message.contains("already exists"), + "We should complain that the path already exists.") - df.write.json(path.toString) - checkLoad() + if (path.exists()) Utils.deleteRecursively(path) - df.write.mode(SaveMode.Overwrite).json(path.toString) - checkLoad() + df.write.json(path.toString) + checkLoad() - // verify the append mode - df.write.mode(SaveMode.Append).json(path.toString) - val df2 = df.union(df) - df2.createOrReplaceTempView("jsonTable2") + df.write.mode(SaveMode.Overwrite).json(path.toString) + checkLoad() - checkLoad(df2, "jsonTable2") + // verify the append mode + df.write.mode(SaveMode.Append).json(path.toString) + val df2 = df.union(df) + df2.createOrReplaceTempView("jsonTable2") + + checkLoad(df2, "jsonTable2") + } } test("SPARK-23459: Improve error message when specified unknown column in partition columns") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala index 92ec2a0c172ef..6486e1aee8649 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala @@ -29,6 +29,7 @@ import org.scalatest.{BeforeAndAfter, Matchers} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, Dataset} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.UTC import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.MemorySink import org.apache.spark.sql.functions.{count, window} @@ -592,6 +593,33 @@ class EventTimeWatermarkSuite extends StreamTest with BeforeAndAfter with Matche } } + test("SPARK-27340 Alias on TimeWindow expression cause watermark 
metadata lost") { + val inputData = MemoryStream[Int] + val aliasWindow = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .select(window($"eventTime", "5 seconds") as 'aliasWindow) + // Check the eventTime metadata is kept in the top level alias. + assert(aliasWindow.logicalPlan.output.exists( + _.metadata.contains(EventTimeWatermark.delayKey))) + + val windowedAggregation = aliasWindow + .groupBy('aliasWindow) + .agg(count("*") as 'count) + .select($"aliasWindow".getField("start").cast("long").as[Long], $"count".as[Long]) + + testStream(windowedAggregation)( + AddData(inputData, 10, 11, 12, 13, 14, 15), + CheckNewAnswer(), + AddData(inputData, 25), // Advance watermark to 15 seconds + CheckNewAnswer((10, 5)), + assertNumStateRows(2), + AddData(inputData, 10), // Should not emit anything as data less than watermark + CheckNewAnswer(), + assertNumStateRows(2) + ) + } + test("test no-data flag") { val flagKey = SQLConf.STREAMING_NO_DATA_MICRO_BATCHES_ENABLED.key @@ -773,7 +801,7 @@ class EventTimeWatermarkSuite extends StreamTest with BeforeAndAfter with Matche } private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 - timestampFormat.setTimeZone(ju.TimeZone.getTimeZone("UTC")) + timestampFormat.setTimeZone(ju.TimeZone.getTimeZone(UTC)) private def formatTimestamp(sec: Long): String = { timestampFormat.format(new ju.Date(sec * 1000)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 877965100f018..aa2664c1e5393 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -555,10 +555,12 @@ abstract class FileStreamSinkSuite extends StreamTest { } } - val fs = new Path(outputDir.getCanonicalPath).getFileSystem( - spark.sessionState.newHadoopConf()) - val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, - outputDir.getCanonicalPath) + val outputDirPath = new Path(outputDir.getCanonicalPath) + val hadoopConf = spark.sessionState.newHadoopConf() + val fs = outputDirPath.getFileSystem(hadoopConf) + val logPath = FileStreamSink.getMetadataLogPath(fs, outputDirPath, conf) + + val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, logPath.toString) val allFiles = sinkLog.allFiles() // only files from non-empty partition should be logged diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index fa320333143ec..7b16aebc531fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -532,6 +532,18 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("SPARK-31935: Hadoop file system config should be effective in data source options") { + withTempDir { dir => + val path = dir.getCanonicalPath + val defaultFs = "nonexistFS://nonexistFS" + val expectMessage = "No FileSystem for scheme nonexistFS" + val message = intercept[java.io.IOException] { + spark.readStream.option("fs.defaultFS", defaultFs).text(path) + }.getMessage + assert(message.filterNot(Set(':', '"').contains) == expectMessage) + } + } + test("read from textfile") { withTempDirs { 
case (src, tmp) => val textStream = spark.readStream.textFile(src.getCanonicalPath) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index d36c64f61a726..b04f8b0d4d174 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -798,7 +798,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest { } }, CheckNewAnswer(("c", "-1")), - assertNumStateRows(total = 0, updated = 0) + assertNumStateRows(total = 0, updated = 1) ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index b6618826487c6..d8d997e1768bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -193,13 +193,15 @@ class StreamSuite extends StreamTest { } test("sql queries") { - val inputData = MemoryStream[Int] - inputData.toDF().createOrReplaceTempView("stream") - val evens = sql("SELECT * FROM stream WHERE value % 2 = 0") - - testStream(evens)( - AddData(inputData, 1, 2, 3, 4), - CheckAnswer(2, 4)) + withTempView("stream") { + val inputData = MemoryStream[Int] + inputData.toDF().createOrReplaceTempView("stream") + val evens = sql("SELECT * FROM stream WHERE value % 2 = 0") + + testStream(evens)( + AddData(inputData, 1, 2, 3, 4), + CheckAnswer(2, 4)) + } } test("DataFrame reuse") { @@ -1244,9 +1246,10 @@ class StreamSuite extends StreamTest { failAfter(60.seconds) { val startTime = System.nanoTime() withSQLConf(SQLConf.STREAMING_STOP_TIMEOUT.key -> "2000") { - intercept[TimeoutException] { + val ex = intercept[TimeoutException] { sq.stop() } + assert(ex.getMessage.contains(sq.id.toString)) } val duration = (System.nanoTime() - startTime) / 1e6 assert(duration >= 2000, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 6d5ad873eedea..93bfd64455408 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -144,16 +144,22 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with } } + private def createToExternalRowConverter[A : Encoder](): A => Row = { + val encoder = encoderFor[A] + val toInternalRow = encoder.createSerializer() + val toExternalRow = RowEncoder(encoder.schema).resolveAndBind().createDeserializer() + toExternalRow.compose(toInternalRow) + } + /** * Checks to make sure that the current data stored in the sink matches the `expectedAnswer`. * This operation automatically blocks until all added data has been processed. 
*/ object CheckAnswer { def apply[A : Encoder](data: A*): CheckAnswerRows = { - val encoder = encoderFor[A] - val toExternalRow = RowEncoder(encoder.schema).resolveAndBind() + val toExternalRow = createToExternalRowConverter[A]() CheckAnswerRows( - data.map(d => toExternalRow.fromRow(encoder.toRow(d))), + data.map(toExternalRow), lastOnly = false, isSorted = false) } @@ -174,10 +180,9 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with } def apply[A: Encoder](isSorted: Boolean, data: A*): CheckAnswerRows = { - val encoder = encoderFor[A] - val toExternalRow = RowEncoder(encoder.schema).resolveAndBind() + val toExternalRow = createToExternalRowConverter[A]() CheckAnswerRows( - data.map(d => toExternalRow.fromRow(encoder.toRow(d))), + data.map(toExternalRow), lastOnly = true, isSorted = isSorted) } @@ -215,9 +220,8 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with def apply(): CheckNewAnswerRows = CheckNewAnswerRows(Seq.empty) def apply[A: Encoder](data: A, moreData: A*): CheckNewAnswerRows = { - val encoder = encoderFor[A] - val toExternalRow = RowEncoder(encoder.schema).resolveAndBind() - CheckNewAnswerRows((data +: moreData).map(d => toExternalRow.fromRow(encoder.toRow(d)))) + val toExternalRow = createToExternalRowConverter[A]() + CheckNewAnswerRows((data +: moreData).map(toExternalRow)) } def apply(rows: Row*): CheckNewAnswerRows = CheckNewAnswerRows(rows) @@ -301,6 +305,14 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with def apply(func: StreamExecution => Any): AssertOnQuery = apply("Execute")(func) } + /** Call `StreamingQuery.processAllAvailable()` to wait. */ + object ProcessAllAvailable { + def apply(): AssertOnQuery = AssertOnQuery { query => + query.processAllAvailable() + true + } + } + object AwaitEpoch { def apply(epoch: Long): AssertOnQuery = Execute { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 741355381222d..85e1b85b84d26 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -202,47 +202,68 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { } } - def stateOperatorProgresses: Seq[StateOperatorProgress] = { - val operatorProgress = mutable.ArrayBuffer[StateOperatorProgress]() - var progress = query.recentProgress.last - - operatorProgress ++= progress.stateOperators.map { op => op.copy(op.numRowsUpdated) } - if (progress.numInputRows == 0) { - // empty batch, merge metrics from previous batch as well - progress = query.recentProgress.takeRight(2).head - operatorProgress.zipWithIndex.foreach { case (sop, index) => - // "numRowsUpdated" should be merged, as it could be updated in both batches. - // (for now it is only updated from previous batch, but things can be changed.) - // other metrics represent current status of state so picking up the latest values. 
- val newOperatorProgress = sop.copy( - sop.numRowsUpdated + progress.stateOperators(index).numRowsUpdated) - operatorProgress(index) = newOperatorProgress - } - } + // Pick the latest progress that actually ran a batch + def lastExecutedBatch: StreamingQueryProgress = { + query.recentProgress.filter(_.durationMs.containsKey("addBatch")).last + } - operatorProgress + def stateOperatorProgresses: Seq[StateOperatorProgress] = { + lastExecutedBatch.stateOperators } } + val clock = new StreamManualClock() + testStream(aggWithWatermark)( + // batchId 0 AddData(inputData, 15), - CheckAnswer(), // watermark = 5 + StartStream(Trigger.ProcessingTime("interval 1 second"), clock), + CheckAnswer(), // watermark = 0 AssertOnQuery { _.stateNodes.size === 1 }, AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 }, AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 }, AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 1 }, + AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 }, + + // batchId 1 without data + AdvanceManualClock(1000L), // watermark = 5 + Execute { q => // wait for the no data batch to complete + eventually(timeout(streamingTimeout)) { assert(q.lastProgress.batchId === 1) } + }, + CheckAnswer(), + AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 0 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 1 }, + AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 }, + + // batchId 2 with data AddData(inputData, 10, 12, 14), - CheckAnswer(), // watermark = 5 - AssertOnQuery { _.stateNodes.size === 1 }, + AdvanceManualClock(1000L), // watermark = 5 + CheckAnswer(), AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 }, AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 }, AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 2 }, + AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 }, + + // batchId 3 with data AddData(inputData, 25), - CheckAnswer((10, 3)), // watermark = 15 - AssertOnQuery { _.stateNodes.size === 1 }, - AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 1 }, + AdvanceManualClock(1000L), // watermark = 5 + CheckAnswer(), + AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 }, AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 }, - AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 2 } + AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 3 }, + AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 }, + + // batchId 4 without data + AdvanceManualClock(1000L), // watermark = 15 + Execute { q => // wait for the no data batch to complete + eventually(timeout(streamingTimeout)) { assert(q.lastProgress.batchId === 4) } + }, + CheckAnswer((10, 3)), + AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 1 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 0 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 2 }, + AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 1 } ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index cfd7204ea2931..51ddc7b49fcda 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -280,6 +280,12 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { { // State should have been cleaned if flag is set, otherwise should not have been cleaned if (flag) assertNumStateRows(total = 1, updated = 1) else assertNumStateRows(total = 7, updated = 1) + }, + AssertOnQuery { q => + eventually(timeout(streamingTimeout)) { + q.lastProgress.sink.numOutputRows == 0L + true + } } ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index 3f218c9cb7fd9..7837b20cd6830 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.streaming import java.io.File +import java.sql.Timestamp import java.util.{Locale, UUID} import scala.util.Random @@ -991,4 +992,47 @@ class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with ) } } + + test("SPARK-32148 stream-stream join regression on Spark 3.0.0") { + val input1 = MemoryStream[(Timestamp, String, String)] + val df1 = input1.toDF + .selectExpr("_1 as eventTime", "_2 as id", "_3 as comment") + .withWatermark(s"eventTime", "2 minutes") + + val input2 = MemoryStream[(Timestamp, String, String)] + val df2 = input2.toDF + .selectExpr("_1 as eventTime", "_2 as id", "_3 as name") + .withWatermark(s"eventTime", "4 minutes") + + val joined = df1.as("left") + .join(df2.as("right"), + expr(""" + |left.id = right.id AND left.eventTime BETWEEN + | right.eventTime - INTERVAL 30 seconds AND + | right.eventTime + INTERVAL 30 seconds + """.stripMargin), + joinType = "leftOuter") + + val inputDataForInput1 = Seq( + (Timestamp.valueOf("2020-01-01 00:00:00"), "abc", "has no join partner"), + (Timestamp.valueOf("2020-01-02 00:00:00"), "abc", "joined with A"), + (Timestamp.valueOf("2020-01-02 01:00:00"), "abc", "joined with B")) + + val inputDataForInput2 = Seq( + (Timestamp.valueOf("2020-01-02 00:00:10"), "abc", "A"), + (Timestamp.valueOf("2020-01-02 00:59:59"), "abc", "B"), + (Timestamp.valueOf("2020-01-02 02:00:00"), "abc", "C")) + + val expectedOutput = Seq( + (Timestamp.valueOf("2020-01-01 00:00:00"), "abc", "has no join partner", null, null, null), + (Timestamp.valueOf("2020-01-02 00:00:00"), "abc", "joined with A", + Timestamp.valueOf("2020-01-02 00:00:10"), "abc", "A"), + (Timestamp.valueOf("2020-01-02 01:00:00"), "abc", "joined with B", + Timestamp.valueOf("2020-01-02 00:59:59"), "abc", "B")) + + testStream(joined)( + MultiAddData((input1, inputDataForInput1), (input2, inputDataForInput2)), + CheckNewAnswer(expectedOutput.head, expectedOutput.tail: _*) + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 9d0f829ac9684..a9aec6dbbdcd4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -49,7 +49,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(spark.streams.active.isEmpty) // Skip check default `StreamingQueryStatusListener` which is for streaming UI. 
assert(spark.streams.listListeners() - .filterNot(_.isInstanceOf[StreamingQueryStatusListener]).isEmpty) + .forall(_.isInstanceOf[StreamingQueryStatusListener])) // Make sure we don't leak any events to the next test spark.sparkContext.listenerBus.waitUntilEmpty() } @@ -254,8 +254,10 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(newEvent.name === event.name) } - testSerialization(new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, "name", 1L)) - testSerialization(new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, null, 1L)) + testSerialization( + new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, "name", "2016-12-05T20:54:20.827Z")) + testSerialization( + new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, null, "2016-12-05T20:54:20.827Z")) } test("QueryProgressEvent serialization") { @@ -382,28 +384,27 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } } - testQuietly("ReplayListenerBus should ignore broken event jsons generated in 2.0.0") { + testQuietly("ReplayListenerBus should ignore broken event jsons generated in 2_0_0") { // query-event-logs-version-2.0.0.txt has all types of events generated by - // Structured Streaming in Spark 2.0.0. + // Structured Streaming in Spark 2.0.0. Because we renamed the classes, // SparkListenerApplicationEnd is the only valid event and it's the last event. We use it // to verify that we can skip broken jsons generated by Structured Streaming. - testReplayListenerBusWithBorkenEventJsons("query-event-logs-version-2.0.0.txt") + testReplayListenerBusWithBrokenEventJsons("query-event-logs-version-2.0.0.txt", 1) } - testQuietly("ReplayListenerBus should ignore broken event jsons generated in 2.0.1") { + testQuietly("ReplayListenerBus should ignore broken event jsons generated in 2_0_1") { // query-event-logs-version-2.0.1.txt has all types of events generated by - // Structured Streaming in Spark 2.0.1. + // Structured Streaming in Spark 2.0.1. Because we renamed the classes, // SparkListenerApplicationEnd is the only valid event and it's the last event. We use it // to verify that we can skip broken jsons generated by Structured Streaming. - testReplayListenerBusWithBorkenEventJsons("query-event-logs-version-2.0.1.txt") + testReplayListenerBusWithBrokenEventJsons("query-event-logs-version-2.0.1.txt", 1) } - testQuietly("ReplayListenerBus should ignore broken event jsons generated in 2.0.2") { + testQuietly("ReplayListenerBus should ignore broken event jsons generated in 2_0_2") { // query-event-logs-version-2.0.2.txt has all types of events generated by - // Structured Streaming in Spark 2.0.2. - // SparkListenerApplicationEnd is the only valid event and it's the last event. We use it - // to verify that we can skip broken jsons generated by Structured Streaming. - testReplayListenerBusWithBorkenEventJsons("query-event-logs-version-2.0.2.txt") + // Structured Streaming in Spark 2.0.2. SPARK-18516 refactored Structured Streaming query events + // in 2.1.0. This test is to verify we are able to load events generated by Spark 2.0.2. 
+ testReplayListenerBusWithBrokenEventJsons("query-event-logs-version-2.0.2.txt", 5) } test("listener propagates observable metrics") { @@ -432,9 +433,13 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } try { + val noDataProgressIntervalKey = SQLConf.STREAMING_NO_DATA_PROGRESS_EVENT_INTERVAL.key spark.streams.addListener(listener) testStream(df, OutputMode.Append)( - StartStream(Trigger.ProcessingTime(100), triggerClock = clock), + StartStream( + Trigger.ProcessingTime(100), + triggerClock = clock, + Map(noDataProgressIntervalKey -> "100")), // Batch 1 AddData(inputData, 1, 2), AdvanceManualClock(100), @@ -463,7 +468,51 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } } - private def testReplayListenerBusWithBorkenEventJsons(fileName: String): Unit = { + test("SPARK-31593: remove unnecessary streaming query progress update") { + withSQLConf(SQLConf.STREAMING_NO_DATA_PROGRESS_EVENT_INTERVAL.key -> "100") { + @volatile var numProgressEvent = 0 + val listener = new StreamingQueryListener { + override def onQueryStarted(event: QueryStartedEvent): Unit = {} + override def onQueryProgress(event: QueryProgressEvent): Unit = { + numProgressEvent += 1 + } + override def onQueryTerminated(event: QueryTerminatedEvent): Unit = {} + } + spark.streams.addListener(listener) + + def checkProgressEvent(count: Int): StreamAction = { + AssertOnQuery { _ => + eventually(Timeout(streamingTimeout)) { + assert(numProgressEvent == count) + } + true + } + } + + try { + val input = new MemoryStream[Int](0, sqlContext) + val clock = new StreamManualClock() + val result = input.toDF().select("value") + testStream(result)( + StartStream(trigger = Trigger.ProcessingTime(10), triggerClock = clock), + AddData(input, 10), + checkProgressEvent(1), + AdvanceManualClock(10), + checkProgressEvent(2), + AdvanceManualClock(90), + checkProgressEvent(2), + AdvanceManualClock(10), + checkProgressEvent(3) + ) + } finally { + spark.streams.removeListener(listener) + } + } + } + + private def testReplayListenerBusWithBrokenEventJsons( + fileName: String, + expectedEventSize: Int): Unit = { val input = getClass.getResourceAsStream(s"/structured-streaming/$fileName") val events = mutable.ArrayBuffer[SparkListenerEvent]() try { @@ -479,8 +528,8 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { replayer.addListener(new SparkListener {}) replayer.replay(input, fileName) // SparkListenerApplicationEnd is the only valid event - assert(events.size === 1) - assert(events(0).isInstanceOf[SparkListenerApplicationEnd]) + assert(events.size === expectedEventSize) + assert(events.last.isInstanceOf[SparkListenerApplicationEnd]) } finally { input.close() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala index 6f00b528cb8bd..08b3644745f9a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala @@ -241,6 +241,7 @@ class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { assert(nextProgress.numInputRows === 0) assert(nextProgress.stateOperators.head.numRowsTotal === 2) assert(nextProgress.stateOperators.head.numRowsUpdated === 0) + assert(nextProgress.sink.numOutputRows === 0) } } finally { query.stop() diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 77f5c856ff0f4..1f408d55fd811 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -33,8 +33,11 @@ import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkException, TestUtils} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Dataset, Row} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Literal, Rand, Randn, Shuffle, Uuid} +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.Complete import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2} import org.apache.spark.sql.execution.exchange.ReusedExchangeExec @@ -1106,6 +1109,90 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi } } + test("SPARK-32456: SQL union in streaming query of append mode without watermark") { + val inputData1 = MemoryStream[Int] + val inputData2 = MemoryStream[Int] + withTempView("s1", "s2") { + inputData1.toDF().createOrReplaceTempView("s1") + inputData2.toDF().createOrReplaceTempView("s2") + val unioned = spark.sql( + "select s1.value from s1 union select s2.value from s2") + checkExceptionMessage(unioned) + } + } + + test("SPARK-32456: distinct in streaming query of append mode without watermark") { + val inputData = MemoryStream[Int] + withTempView("deduptest") { + inputData.toDF().toDF("value").createOrReplaceTempView("deduptest") + val distinct = spark.sql("select distinct value from deduptest") + checkExceptionMessage(distinct) + } + } + + test("SPARK-32456: distinct in streaming query of complete mode") { + val inputData = MemoryStream[Int] + withTempView("deduptest") { + inputData.toDF().toDF("value").createOrReplaceTempView("deduptest") + val distinct = spark.sql("select distinct value from deduptest") + + testStream(distinct, Complete)( + AddData(inputData, 1, 2, 3, 3, 4), + CheckAnswer(Row(1), Row(2), Row(3), Row(4)) + ) + } + } + + testQuietly("limit on empty batch should not cause state store error") { + // The source only produces two batches, the first batch is empty and the second batch has data. 
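+ // The streaming limit operator keeps its running row count in a state store; the empty first batch exercises that state path.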
+ val source = new Source { + var batchId = 0 + override def stop(): Unit = {} + override def getOffset: Option[Offset] = { + Some(LongOffset(batchId + 1)) + } + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { + if (batchId == 0) { + batchId += 1 + Dataset.ofRows(spark, LocalRelation(schema.toAttributes, Nil, isStreaming = true)) + } else { + Dataset.ofRows(spark, + LocalRelation(schema.toAttributes, InternalRow(10) :: Nil, isStreaming = true)) + } + } + override def schema: StructType = MockSourceProvider.fakeSchema + } + + MockSourceProvider.withMockSources(source) { + val df = spark.readStream + .format("org.apache.spark.sql.streaming.util.MockSourceProvider") + .load() + .limit(1) + + testStream(df)( + StartStream(), + AssertOnQuery { q => + q.processAllAvailable() + true + }, + CheckAnswer(10)) + } + } + + private def checkExceptionMessage(df: DataFrame): Unit = { + withTempDir { outputDir => + withTempDir { checkpointDir => + val exception = intercept[AnalysisException]( + df.writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .start(outputDir.getCanonicalPath)) + assert(exception.getMessage.contains( + "Append output mode not supported when there are streaming aggregations on streaming " + + "DataFrames/DataSets without watermark")) + } + } + } + /** Create a streaming DF that only execute one batch in which it returns the given static DF */ private def createSingleTriggerStreamingDF(triggerDF: DataFrame): DataFrame = { require(!triggerDF.isStreaming) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index 8599ceb833ca4..0d17f2e0bc7fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -181,17 +181,19 @@ class ContinuousSuite extends ContinuousSuiteBase { } test("subquery alias") { - val input = ContinuousMemoryStream[Int] - input.toDF().createOrReplaceTempView("memory") - val test = spark.sql("select value from memory where value > 2") + withTempView("memory") { + val input = ContinuousMemoryStream[Int] + input.toDF().createOrReplaceTempView("memory") + val test = spark.sql("select value from memory where value > 2") - testStream(test)( - AddData(input, 0, 1), - CheckAnswer(), - StopStream, - AddData(input, 2, 3, 4), - StartStream(), - CheckAnswer(3, 4)) + testStream(test)( + AddData(input, 0, 1), + CheckAnswer(), + StopStream, + AddData(input, 2, 3, 4), + StartStream(), + CheckAnswer(3, 4)) + } } test("repeatedly restart") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index f9fc540c2ab80..8bf7e276e12c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -43,11 +43,13 @@ object LastOptions { var mockStreamSourceProvider = mock(classOf[StreamSourceProvider]) var mockStreamSinkProvider = mock(classOf[StreamSinkProvider]) var parameters: Map[String, String] = null + var sinkParameters: Map[String, String] = null var schema: Option[StructType] = null var partitionColumns: Seq[String] = Nil def clear(): Unit = { parameters = null + 
sinkParameters = null schema = null partitionColumns = null reset(mockStreamSourceProvider) @@ -101,7 +103,7 @@ class DefaultSource extends StreamSourceProvider with StreamSinkProvider { parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { - LastOptions.parameters = parameters + LastOptions.sinkParameters = parameters LastOptions.partitionColumns = partitionColumns LastOptions.mockStreamSinkProvider.createSink(spark, parameters, partitionColumns, outputMode) (_: Long, _: DataFrame) => {} @@ -171,16 +173,48 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { df.writeStream .format("org.apache.spark.sql.streaming.test") - .option("opt1", "1") - .options(Map("opt2" -> "2")) + .option("opt1", "5") + .options(Map("opt2" -> "4")) .options(map) .option("checkpointLocation", newMetadataDir) .start() .stop() - assert(LastOptions.parameters("opt1") == "1") - assert(LastOptions.parameters("opt2") == "2") - assert(LastOptions.parameters("opt3") == "3") + assert(LastOptions.sinkParameters("opt1") == "5") + assert(LastOptions.sinkParameters("opt2") == "4") + assert(LastOptions.sinkParameters("opt3") == "3") + assert(LastOptions.sinkParameters.contains("checkpointLocation")) + } + + test("SPARK-32832: later option should override earlier options for load()") { + spark.readStream + .format("org.apache.spark.sql.streaming.test") + .option("paTh", "1") + .option("PATH", "2") + .option("Path", "3") + .option("patH", "4") + .option("path", "5") + .load() + assert(LastOptions.parameters("path") == "5") + } + + test("SPARK-32832: later option should override earlier options for start()") { + val ds = spark.readStream + .format("org.apache.spark.sql.streaming.test") + .load() + assert(LastOptions.parameters.isEmpty) + + ds.writeStream + .format("org.apache.spark.sql.streaming.test") + .option("checkpointLocation", newMetadataDir) + .option("paTh", "1") + .option("PATH", "2") + .option("Path", "3") + .option("patH", "4") + .option("path", "5") + .start() + .stop() + assert(LastOptions.sinkParameters("path") == "5") } test("partitioning") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala index de43e470e8e13..2a1e18ab66bb7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala @@ -97,7 +97,7 @@ class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { when(streamQuery.name).thenReturn("query") when(streamQuery.id).thenReturn(id) when(streamQuery.runId).thenReturn(id) - when(streamQuery.submissionTime).thenReturn(1L) + when(streamQuery.startTimestamp).thenReturn(1L) when(streamQuery.lastProgress).thenReturn(progress) when(streamQuery.recentProgress).thenReturn(Array(progress)) when(streamQuery.exception).thenReturn(None) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala index bd74ed340b408..6aa440e5609c5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala @@ -27,12 +27,13 @@ import org.apache.spark.sql.streaming class 
StreamingQueryStatusListenerSuite extends StreamTest { test("onQueryStarted, onQueryProgress, onQueryTerminated") { - val listener = new StreamingQueryStatusListener(spark.sqlContext.conf) + val listener = new StreamingQueryStatusListener(spark.sparkContext.conf) // hanlde query started event val id = UUID.randomUUID() val runId = UUID.randomUUID() - val startEvent = new StreamingQueryListener.QueryStartedEvent(id, runId, "test", 1L) + val startEvent = new StreamingQueryListener.QueryStartedEvent( + id, runId, "test", "2016-12-05T20:54:20.827Z") listener.onQueryStarted(startEvent) // result checking @@ -73,12 +74,13 @@ class StreamingQueryStatusListenerSuite extends StreamTest { } test("same query start multiple times") { - val listener = new StreamingQueryStatusListener(spark.sqlContext.conf) + val listener = new StreamingQueryStatusListener(spark.sparkContext.conf) // handle first time start val id = UUID.randomUUID() val runId0 = UUID.randomUUID() - val startEvent0 = new StreamingQueryListener.QueryStartedEvent(id, runId0, "test", 1L) + val startEvent0 = new StreamingQueryListener.QueryStartedEvent( + id, runId0, "test", "2016-12-05T20:54:20.827Z") listener.onQueryStarted(startEvent0) // handle terminate event @@ -87,7 +89,8 @@ class StreamingQueryStatusListenerSuite extends StreamTest { // handle second time start val runId1 = UUID.randomUUID() - val startEvent1 = new StreamingQueryListener.QueryStartedEvent(id, runId1, "test", 1L) + val startEvent1 = new StreamingQueryListener.QueryStartedEvent( + id, runId1, "test", "2016-12-05T20:54:20.827Z") listener.onQueryStarted(startEvent1) // result checking diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala new file mode 100644 index 0000000000000..fdf4c6634d79f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming.ui + +import org.openqa.selenium.WebDriver +import org.openqa.selenium.htmlunit.HtmlUnitDriver +import org.scalatest._ +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ +import org.scalatestplus.selenium.WebBrowser + +import org.apache.spark._ +import org.apache.spark.internal.config.UI.{UI_ENABLED, UI_PORT} +import org.apache.spark.sql.LocalSparkSession.withSparkSession +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.streaming.StreamingQueryException +import org.apache.spark.ui.SparkUICssErrorHandler + +class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with BeforeAndAfterAll { + + implicit var webDriver: WebDriver = _ + + override def beforeAll(): Unit = { + super.beforeAll() + webDriver = new HtmlUnitDriver { + getWebClient.setCssErrorHandler(new SparkUICssErrorHandler) + } + } + + private def newSparkSession( + master: String = "local", + additionalConfs: Map[String, String] = Map.empty): SparkSession = { + val conf = new SparkConf() + .setMaster(master) + .setAppName("ui-test") + .set(UI_ENABLED, true) + .set(UI_PORT, 0) + additionalConfs.foreach { case (k, v) => conf.set(k, v) } + val spark = SparkSession.builder().master(master).config(conf).getOrCreate() + assert(spark.sparkContext.ui.isDefined) + spark + } + + def goToUi(spark: SparkSession, path: String): Unit = { + go to (spark.sparkContext.ui.get.webUrl.stripSuffix("/") + path) + } + + test("SPARK-30984: Structured Streaming UI should be activated when running a streaming query") { + quietly { + withSparkSession(newSparkSession()) { spark => + import spark.implicits._ + try { + spark.range(1, 10).count() + + goToUi(spark, "/StreamingQuery") + + val h3Text = findAll(cssSelector("h3")).map(_.text).toSeq + h3Text should not contain ("Streaming Query") + + val activeQuery = + spark.readStream.format("rate").load().writeStream.format("noop").start() + val completedQuery = + spark.readStream.format("rate").load().writeStream.format("noop").start() + completedQuery.stop() + val failedQuery = spark.readStream.format("rate").load().select("value").as[Long] + .map(_ / 0).writeStream.format("noop").start() + try { + failedQuery.awaitTermination() + } catch { + case _: StreamingQueryException => + } + + eventually(timeout(30.seconds), interval(100.milliseconds)) { + // Check the query list page + goToUi(spark, "/StreamingQuery") + + findAll(cssSelector("h3")).map(_.text).toSeq should contain("Streaming Query") + findAll(cssSelector("""#activeQueries-table th""")).map(_.text).toSeq should be { + List("Name", "Status", "Id", "Run ID", "Start Time", "Duration", "Avg Input /sec", + "Avg Process /sec", "Lastest Batch") + } + val activeQueries = + findAll(cssSelector("""#activeQueries-table td""")).map(_.text).toSeq + activeQueries should contain(activeQuery.id.toString) + activeQueries should contain(activeQuery.runId.toString) + findAll(cssSelector("""#completedQueries-table th""")) + .map(_.text).toSeq should be { + List("Name", "Status", "Id", "Run ID", "Start Time", "Duration", "Avg Input /sec", + "Avg Process /sec", "Lastest Batch", "Error") + } + val completedQueries = + findAll(cssSelector("""#completedQueries-table td""")).map(_.text).toSeq + completedQueries should contain(completedQuery.id.toString) + completedQueries should contain(completedQuery.runId.toString) + completedQueries should contain(failedQuery.id.toString) + completedQueries should 
contain(failedQuery.runId.toString) + + // Check the query statistics page + val activeQueryLink = + findAll(cssSelector("""#activeQueries-table a""")).flatMap(_.attribute("href")).next + go to activeQueryLink + + findAll(cssSelector("h3")) + .map(_.text).toSeq should contain("Streaming Query Statistics") + val summaryText = findAll(cssSelector("div strong")).map(_.text).toSeq + summaryText should contain ("Name:") + summaryText should contain ("Id:") + summaryText should contain ("RunId:") + findAll(cssSelector("""#stat-table th""")).map(_.text).toSeq should be { + List("", "Timelines", "Histograms") + } + } + } finally { + spark.streams.active.foreach(_.stop()) + } + } + } + } + + override def afterAll(): Unit = { + try { + if (webDriver != null) { + webDriver.quit() + } + } finally { + super.afterAll() + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index fb939007697c2..c7ca0125c8a0f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -224,6 +224,28 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with assert(LastOptions.parameters("opt3") == "3") } + test("SPARK-32364: path argument of load function should override all existing options") { + spark.read + .format("org.apache.spark.sql.test") + .option("paTh", "1") + .option("PATH", "2") + .option("Path", "3") + .option("patH", "4") + .load("5") + assert(LastOptions.parameters("path") == "5") + } + + test("SPARK-32364: path argument of save function should override all existing options") { + Seq(1).toDF.write + .format("org.apache.spark.sql.test") + .option("paTh", "1") + .option("PATH", "2") + .option("Path", "3") + .option("patH", "4") + .save("5") + assert(LastOptions.parameters("path") == "5") + } + test("pass partitionBy as options") { Seq(1).toDF.write .format("org.apache.spark.sql.test") @@ -282,7 +304,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with plan = qe.analyzed } - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} } spark.listenerManager.register(listener) @@ -333,7 +355,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with var msg = intercept[AnalysisException] { Seq((1L, 2.0)).toDF("i", "d").write.mode("append").saveAsTable("t") }.getMessage - assert(msg.contains("Cannot safely cast 'i': LongType to IntegerType")) + assert(msg.contains("Cannot safely cast 'i': bigint to int")) // Insert into table successfully. 
Seq((1, 2.0)).toDF("i", "d").write.mode("append").saveAsTable("t") @@ -354,14 +376,14 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with var msg = intercept[AnalysisException] { Seq(("a", "b")).toDF("i", "d").write.mode("append").saveAsTable("t") }.getMessage - assert(msg.contains("Cannot safely cast 'i': StringType to IntegerType") && - msg.contains("Cannot safely cast 'd': StringType to DoubleType")) + assert(msg.contains("Cannot safely cast 'i': string to int") && + msg.contains("Cannot safely cast 'd': string to double")) msg = intercept[AnalysisException] { Seq((true, false)).toDF("i", "d").write.mode("append").saveAsTable("t") }.getMessage - assert(msg.contains("Cannot safely cast 'i': BooleanType to IntegerType") && - msg.contains("Cannot safely cast 'd': BooleanType to DoubleType")) + assert(msg.contains("Cannot safely cast 'i': boolean to int") && + msg.contains("Cannot safely cast 'd': boolean to double")) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 38893f846e5a4..7be15e9d87004 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -27,7 +27,8 @@ import scala.language.implicitConversions import scala.util.control.NonFatal import org.apache.hadoop.fs.Path -import org.scalatest.{BeforeAndAfterAll, Suite} +import org.scalactic.source.Position +import org.scalatest.{BeforeAndAfterAll, Suite, Tag} import org.scalatest.concurrent.Eventually import org.apache.spark.SparkFunSuite @@ -40,6 +41,7 @@ import org.apache.spark.sql.catalyst.plans.PlanTestBase import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.FilterExec +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution import org.apache.spark.sql.execution.datasources.DataSourceUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.UninterruptibleThread @@ -114,6 +116,19 @@ private[sql] trait SQLTestUtils extends SparkFunSuite with SQLTestUtilsBase with } } + override protected def test(testName: String, testTags: Tag*)(testFun: => Any) + (implicit pos: Position): Unit = { + if (testTags.exists(_.isInstanceOf[DisableAdaptiveExecution])) { + super.test(testName, testTags: _*) { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + testFun + } + } + } else { + super.test(testName, testTags: _*)(testFun) + } + } + /** * Run a test on a separate `UninterruptibleThread`. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala index 6881812286b24..b17c93503804c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala @@ -20,15 +20,17 @@ package org.apache.spark.sql.util import scala.collection.mutable.ArrayBuffer import org.apache.spark._ -import org.apache.spark.sql.{functions, AnalysisException, QueryTest, Row} +import org.apache.spark.sql.{functions, AnalysisException, Dataset, QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, InsertIntoStatement, LogicalPlan, Project} -import org.apache.spark.sql.execution.{QueryExecution, WholeStageCodegenExec} +import org.apache.spark.sql.execution.{QueryExecution, QueryExecutionException, WholeStageCodegenExec} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.{CreateTable, InsertIntoHadoopFsRelationCommand} import org.apache.spark.sql.execution.datasources.json.JsonFileFormat -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StringType class DataFrameCallbackSuite extends QueryTest with SharedSparkSession @@ -40,7 +42,7 @@ class DataFrameCallbackSuite extends QueryTest val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] val listener = new QueryExecutionListener { // Only test successful case here, so no need to implement `onFailure` - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { metrics += ((funcName, qe, duration)) @@ -67,10 +69,10 @@ class DataFrameCallbackSuite extends QueryTest } testQuietly("execute callback functions when a DataFrame action failed") { - val metrics = ArrayBuffer.empty[(String, QueryExecution, Throwable)] + val metrics = ArrayBuffer.empty[(String, QueryExecution, Exception)] val listener = new QueryExecutionListener { - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = { - metrics += ((funcName, qe, error)) + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { + metrics += ((funcName, qe, exception)) } // Only test failed case here, so no need to implement `onSuccess` @@ -96,7 +98,7 @@ class DataFrameCallbackSuite extends QueryTest val metrics = ArrayBuffer.empty[Long] val listener = new QueryExecutionListener { // Only test successful case here, so no need to implement `onFailure` - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { val metric = stripAQEPlan(qe.executedPlan) match { @@ -136,7 +138,7 @@ class DataFrameCallbackSuite extends QueryTest val metrics = ArrayBuffer.empty[Long] val listener = new QueryExecutionListener { 
// Only test successful case here, so no need to implement `onFailure` - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { metrics += qe.executedPlan.longMetric("dataSize").value @@ -176,10 +178,10 @@ class DataFrameCallbackSuite extends QueryTest test("execute callback functions for DataFrameWriter") { val commands = ArrayBuffer.empty[(String, LogicalPlan)] - val errors = ArrayBuffer.empty[(String, Throwable)] + val exceptions = ArrayBuffer.empty[(String, Exception)] val listener = new QueryExecutionListener { - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = { - errors += funcName -> error + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { + exceptions += funcName -> exception } override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { @@ -225,9 +227,9 @@ class DataFrameCallbackSuite extends QueryTest spark.range(10).select($"id", $"id").write.insertInto("tab") } sparkContext.listenerBus.waitUntilEmpty() - assert(errors.length == 1) - assert(errors.head._1 == "insertInto") - assert(errors.head._2 == e) + assert(exceptions.length == 1) + assert(exceptions.head._1 == "insertInto") + assert(exceptions.head._2 == e) } } @@ -238,7 +240,7 @@ class DataFrameCallbackSuite extends QueryTest metricMaps += qe.observedMetrics } - override def onFailure(funcName: String, qe: QueryExecution, exception: Throwable): Unit = { + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { // No-op } } @@ -278,4 +280,32 @@ class DataFrameCallbackSuite extends QueryTest spark.listenerManager.unregister(listener) } } + + testQuietly("SPARK-31144: QueryExecutionListener should receive `java.lang.Error`") { + var e: Exception = null + val listener = new QueryExecutionListener { + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { + e = exception + } + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {} + } + spark.listenerManager.register(listener) + + intercept[Error] { + Dataset.ofRows(spark, ErrorTestCommand("foo")).collect() + } + sparkContext.listenerBus.waitUntilEmpty() + assert(e != null && e.isInstanceOf[QueryExecutionException] + && e.getCause.isInstanceOf[Error] && e.getCause.getMessage == "foo") + spark.listenerManager.unregister(listener) + } +} + +/** A test command that throws `java.lang.Error` during execution. 
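+ * It is used by the SPARK-31144 test above to check that a fatal `Error` is delivered to `QueryExecutionListener.onFailure` wrapped in a `QueryExecutionException`.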
*/ +case class ErrorTestCommand(foo: String) extends RunnableCommand { + + override val output: Seq[Attribute] = Seq(AttributeReference("foo", StringType)()) + + override def run(sparkSession: SparkSession): Seq[Row] = + throw new java.lang.Error(foo) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala index 2fd6cb220ea3f..ab854a0281c6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala @@ -57,7 +57,7 @@ private class CountingQueryExecutionListener extends QueryExecutionListener { CALLBACK_COUNT.incrementAndGet() } - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = { + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { CALLBACK_COUNT.incrementAndGet() } diff --git a/sql/core/v1.2/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/v1.2/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index 9bfad1e83ee7b..6601bcb9018f4 100644 --- a/sql/core/v1.2/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/v1.2/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -21,7 +21,10 @@ import org.apache.orc.storage.ql.exec.vector.*; +import org.apache.spark.sql.catalyst.util.DateTimeUtils; +import org.apache.spark.sql.catalyst.util.RebaseDateTime; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.TimestampType; import org.apache.spark.sql.vectorized.ColumnarArray; @@ -41,6 +44,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto private DecimalColumnVector decimalData; private TimestampColumnVector timestampData; private final boolean isTimestamp; + private final boolean isDate; private int batchSize; @@ -53,6 +57,12 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto isTimestamp = false; } + if (type instanceof DateType) { + isDate = true; + } else { + isDate = false; + } + baseData = vector; if (vector instanceof LongColumnVector) { longData = (LongColumnVector) vector; @@ -129,14 +139,19 @@ public short getShort(int rowId) { @Override public int getInt(int rowId) { - return (int) longData.vector[getRowIndex(rowId)]; + int value = (int) longData.vector[getRowIndex(rowId)]; + if (isDate) { + return RebaseDateTime.rebaseJulianToGregorianDays(value); + } else { + return value; + } } @Override public long getLong(int rowId) { int index = getRowIndex(rowId); if (isTimestamp) { - return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000 % 1000; + return DateTimeUtils.fromJavaTimestamp(timestampData.asScratchTimestamp(index)); } else { return longData.vector[index]; } diff --git a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/DaysWritable.scala b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/DaysWritable.scala new file mode 100644 index 0000000000000..1dccf0ca1faef --- /dev/null +++ b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/DaysWritable.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc + +import java.io.{DataInput, DataOutput, IOException} +import java.sql.Date + +import org.apache.hadoop.io.WritableUtils +import org.apache.orc.storage.serde2.io.DateWritable + +import org.apache.spark.sql.catalyst.util.RebaseDateTime.{rebaseGregorianToJulianDays, rebaseJulianToGregorianDays} + +/** + * The class accepts/returns days in Gregorian calendar and rebase them + * via conversion to local date in Julian calendar for dates before 1582-10-15 + * in read/write for backward compatibility with Spark 2.4 and earlier versions. + * + * This is a clone of `org.apache.spark.sql.execution.datasources.DaysWritable`. + * The class is cloned because Hive ORC v1.2 uses different `DateWritable`: + * - v1.2: `org.apache.orc.storage.serde2.io.DateWritable` + * - v2.3 and `HiveInspectors`: `org.apache.hadoop.hive.serde2.io.DateWritable` + * + * @param gregorianDays The number of days since the epoch 1970-01-01 in + * Gregorian calendar. + * @param julianDays The number of days since the epoch 1970-01-01 in + * Julian calendar. 
+ */ +class DaysWritable( + var gregorianDays: Int, + var julianDays: Int) + extends DateWritable { + + def this() = this(0, 0) + def this(gregorianDays: Int) = + this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays)) + def this(dateWritable: DateWritable) = { + this( + gregorianDays = dateWritable match { + case daysWritable: DaysWritable => daysWritable.gregorianDays + case dateWritable: DateWritable => + rebaseJulianToGregorianDays(dateWritable.getDays) + }, + julianDays = dateWritable.getDays) + } + + override def getDays: Int = julianDays + override def get(): Date = new Date(DateWritable.daysToMillis(julianDays)) + + override def set(d: Int): Unit = { + gregorianDays = d + julianDays = rebaseGregorianToJulianDays(d) + } + + @throws[IOException] + override def write(out: DataOutput): Unit = { + WritableUtils.writeVInt(out, julianDays) + } + + @throws[IOException] + override def readFields(in: DataInput): Unit = { + julianDays = WritableUtils.readVInt(in) + gregorianDays = rebaseJulianToGregorianDays(julianDays) + } +} diff --git a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala index 995c5ed317de1..a068347634cda 100644 --- a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala +++ b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.orc +import java.time.{Instant, LocalDate} + import org.apache.orc.storage.common.`type`.HiveDecimal import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument} import org.apache.orc.storage.ql.io.sarg.SearchArgument.Builder @@ -24,6 +26,8 @@ import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory.newBuilder import org.apache.orc.storage.serde2.io.HiveDecimalWritable import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types._ @@ -64,9 +68,11 @@ private[sql] object OrcFilters extends OrcFiltersBase { * Create ORC filter as a SearchArgument instance. */ def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = { - val dataTypeMap = schema.map(f => f.name -> f.dataType).toMap + val dataTypeMap = getDataTypeMap(schema, SQLConf.get.caseSensitiveAnalysis) // Combines all convertible filters using `And` to produce a single conjunction - val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, filters)) + // TODO (SPARK-25557): ORC doesn't support nested predicate pushdown, so they are removed. + val newFilters = filters.filter(!_.containsNestedColumn) + val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, newFilters)) conjunctionOptional.map { conjunction => // Then tries to build a single ORC `SearchArgument` for the conjunction predicate. // The input predicate is fully convertible. 
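The `DaysWritable` clone above and the `OrcColumnVector` changes both lean on `RebaseDateTime` to move day counts between Spark 3.x's proleptic Gregorian calendar and the hybrid Julian calendar that Hive's `DateWritable` assumes on disk. A minimal sketch of that round trip, assuming only the Spark 3.x catalyst jar on the classpath (the object name and the sample date are illustrative only):

```scala
import java.time.LocalDate
import java.time.temporal.ChronoUnit

import org.apache.spark.sql.catalyst.util.RebaseDateTime

object RebaseSketch {
  def main(args: Array[String]): Unit = {
    // Days since 1970-01-01 for a date well before the 1582-10-15 cutover.
    val gregorianDays = ChronoUnit.DAYS.between(
      LocalDate.of(1970, 1, 1), LocalDate.of(1000, 1, 1)).toInt

    // Write path (DaysWritable.set/write): store the Julian-based day count.
    val julianDays = RebaseDateTime.rebaseGregorianToJulianDays(gregorianDays)

    // Read path (DaysWritable.readFields, OrcColumnVector.getInt): rebase back.
    val roundTripped = RebaseDateTime.rebaseJulianToGregorianDays(julianDays)

    // The two counts differ by several days for such ancient dates, but the
    // round trip restores the original Gregorian value.
    println(s"gregorian=$gregorianDays, julian=$julianDays")
    assert(roundTripped == gregorianDays)
  }
}
```

For dates on or after 1582-10-15 the two day counts coincide, so modern data is unaffected by the rebase.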
There should not be any empty result in the @@ -77,7 +83,7 @@ private[sql] object OrcFilters extends OrcFiltersBase { def convertibleFilters( schema: StructType, - dataTypeMap: Map[String, DataType], + dataTypeMap: Map[String, OrcPrimitiveField], filters: Seq[Filter]): Seq[Filter] = { import org.apache.spark.sql.sources._ @@ -135,7 +141,7 @@ private[sql] object OrcFilters extends OrcFiltersBase { /** * Get PredicateLeafType which is corresponding to the given DataType. */ - private def getPredicateLeafType(dataType: DataType) = dataType match { + private[sql] def getPredicateLeafType(dataType: DataType) = dataType match { case BooleanType => PredicateLeaf.Type.BOOLEAN case ByteType | ShortType | IntegerType | LongType => PredicateLeaf.Type.LONG case FloatType | DoubleType => PredicateLeaf.Type.FLOAT @@ -159,6 +165,10 @@ private[sql] object OrcFilters extends OrcFiltersBase { value.asInstanceOf[Number].doubleValue() case _: DecimalType => new HiveDecimalWritable(HiveDecimal.create(value.asInstanceOf[java.math.BigDecimal])) + case _: DateType if value.isInstanceOf[LocalDate] => + toJavaDate(localDateToDays(value.asInstanceOf[LocalDate])) + case _: TimestampType if value.isInstanceOf[Instant] => + toJavaTimestamp(instantToMicros(value.asInstanceOf[Instant])) case _ => value } @@ -171,7 +181,7 @@ private[sql] object OrcFilters extends OrcFiltersBase { * @return the builder so far. */ private def buildSearchArgument( - dataTypeMap: Map[String, DataType], + dataTypeMap: Map[String, OrcPrimitiveField], expression: Filter, builder: Builder): Builder = { import org.apache.spark.sql.sources._ @@ -207,60 +217,61 @@ private[sql] object OrcFilters extends OrcFiltersBase { * @return the builder so far. */ private def buildLeafSearchArgument( - dataTypeMap: Map[String, DataType], + dataTypeMap: Map[String, OrcPrimitiveField], expression: Filter, builder: Builder): Option[Builder] = { def getType(attribute: String): PredicateLeaf.Type = - getPredicateLeafType(dataTypeMap(attribute)) + getPredicateLeafType(dataTypeMap(attribute).fieldType) import org.apache.spark.sql.sources._ // NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()` // call is mandatory. ORC `SearchArgument` builder requires that all leaf predicates must be // wrapped by a "parent" predicate (`And`, `Or`, or `Not`). + // Since ORC 1.5.0 (ORC-323), we need to quote for column names with `.` characters + // in order to distinguish predicate pushdown for nested columns. 
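As the note just above says, the ORC `SearchArgument` builder only accepts leaf predicates wrapped in a parent `And`/`Or`/`Not`. A small sketch, using the same builder API the filter suites in this patch import, of the argument the `GreaterThan` branch below emits (the column name `a` and the literal are placeholders; run it with the ORC "nohive" jars on the classpath):

```scala
import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument}
import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory.newBuilder

// GreaterThan(a, 1) has no direct leaf operator, so it is expressed as
// NOT(lessThanEquals(a, 1)); startNot()/`end`() provide the required parent.
val sarg: SearchArgument = newBuilder()
  .startNot()
  .lessThanEquals("a", PredicateLeaf.Type.LONG, java.lang.Long.valueOf(1L))
  .`end`()
  .build()

println(sarg) // something like: leaf-0 = (LESS_THAN_EQUALS a 1), expr = (not leaf-0)
```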
expression match { - case EqualTo(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().equals(quotedName, getType(attribute), castedValue).end()) + case EqualTo(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .equals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case EqualNullSafe(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().nullSafeEquals(quotedName, getType(attribute), castedValue).end()) + case EqualNullSafe(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .nullSafeEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case LessThan(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().lessThan(quotedName, getType(attribute), castedValue).end()) + case LessThan(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case LessThanOrEqual(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().lessThanEquals(quotedName, getType(attribute), castedValue).end()) + case LessThanOrEqual(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case GreaterThan(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startNot().lessThanEquals(quotedName, getType(attribute), castedValue).end()) + case GreaterThan(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startNot() + .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case GreaterThanOrEqual(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startNot().lessThan(quotedName, getType(attribute), castedValue).end()) + case GreaterThanOrEqual(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startNot() + .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case IsNull(attribute) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - Some(builder.startAnd().isNull(quotedName, getType(attribute)).end()) + case IsNull(name) if dataTypeMap.contains(name) => + 
Some(builder.startAnd() + .isNull(dataTypeMap(name).fieldName, getType(name)).end()) - case IsNotNull(attribute) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - Some(builder.startNot().isNull(quotedName, getType(attribute)).end()) + case IsNotNull(name) if dataTypeMap.contains(name) => + Some(builder.startNot() + .isNull(dataTypeMap(name).fieldName, getType(name)).end()) - case In(attribute, values) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(attribute))) - Some(builder.startAnd().in(quotedName, getType(attribute), + case In(name, values) if dataTypeMap.contains(name) => + val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(name).fieldType)) + Some(builder.startAnd().in(dataTypeMap(name).fieldName, getType(name), castedValues.map(_.asInstanceOf[AnyRef]): _*).end()) case _ => None diff --git a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala index 68503aba22b40..7fbc1cd205b13 100644 --- a/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala +++ b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.datasources.orc -import java.sql.Date - import org.apache.orc.storage.common.`type`.HiveDecimal import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch import org.apache.orc.storage.ql.io.sarg.{SearchArgument => OrcSearchArgument} @@ -38,7 +36,9 @@ private[sql] object OrcShimUtils { private[sql] type Operator = OrcOperator private[sql] type SearchArgument = OrcSearchArgument - def getSqlDate(value: Any): Date = value.asInstanceOf[DateWritable].get + def getGregorianDays(value: Any): Int = { + new DaysWritable(value.asInstanceOf[DateWritable]).gregorianDays + } def getDecimal(value: Any): Decimal = { val decimal = value.asInstanceOf[HiveDecimalWritable].getHiveDecimal() @@ -47,13 +47,13 @@ private[sql] object OrcShimUtils { def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = { if (reuseObj) { - val result = new DateWritable() + val result = new DaysWritable() (getter, ordinal) => result.set(getter.getInt(ordinal)) result } else { (getter: SpecializedGetters, ordinal: Int) => - new DateWritable(getter.getInt(ordinal)) + new DaysWritable(getter.getInt(ordinal)) } } diff --git a/sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala b/sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index ee5162bced8ac..a3c2343a73b97 100644 --- a/sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala +++ b/sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -24,9 +24,10 @@ import java.sql.{Date, Timestamp} import scala.collection.JavaConverters._ import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument} +import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory.newBuilder -import org.apache.spark.SparkConf -import org.apache.spark.sql.{AnalysisException, Column, DataFrame} +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Row} import 
org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation @@ -245,29 +246,41 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { } test("filter pushdown - timestamp") { - val timeString = "2015-08-20 14:57:00" - val timestamps = (1 to 4).map { i => - val milliseconds = Timestamp.valueOf(timeString).getTime + i * 3600 - new Timestamp(milliseconds) - } - withOrcDataFrame(timestamps.map(Tuple1(_))) { implicit df => - checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(timestamps(0)) <=> $"_1", - PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(timestamps(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + val input = Seq( + "1000-01-01 01:02:03", + "1582-10-01 00:11:22", + "1900-01-01 23:59:59", + "2020-05-25 10:11:12").map(Timestamp.valueOf) + + withOrcFile(input.map(Tuple1(_))) { path => + Seq(false, true).foreach { java8Api => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString) { + readFile(path) { implicit df => + val timestamps = input.map(Literal(_)) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate( + Literal(timestamps(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate( + Literal(timestamps(2)) < $"_1", + PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate( + Literal(timestamps(0)) >= $"_1", + PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + } + } + } } } @@ -299,26 +312,33 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { } test("filter pushdown - date") { - val dates = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day => + val input = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day => Date.valueOf(day) } - 
withOrcDataFrame(dates.map(Tuple1(_))) { implicit df => - checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(dates(0)) === $"_1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(dates(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(dates(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(dates(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + withOrcFile(input.map(Tuple1(_))) { path => + Seq(false, true).foreach { java8Api => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString) { + readFile(path) { implicit df => + val dates = input.map(Literal(_)) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(dates(0) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(dates(0) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(dates(1) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(dates(2) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(dates(0) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(dates(3) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + } + } + } } } @@ -450,5 +470,142 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { ).get.toString } } + + test("SPARK-32622: case sensitivity in predicate pushdown") { + withTempPath { dir => + val count = 10 + val tableName = "spark_32622" + val tableDir1 = dir.getAbsoluteFile + "/table1" + + // Physical ORC files have both `A` and `a` fields. + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + spark.range(count).repartition(count).selectExpr("id - 1 as A", "id as a") + .write.mode("overwrite").orc(tableDir1) + } + + // Metastore table has both `A` and `a` fields too. + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + sql( + s""" + |CREATE TABLE $tableName (A LONG, a LONG) USING ORC LOCATION '$tableDir1' + """.stripMargin) + + checkAnswer(sql(s"select a, A from $tableName"), (0 until count).map(c => Row(c, c - 1))) + + val actual1 = stripSparkFilter(sql(s"select A from $tableName where A < 0")) + assert(actual1.count() == 1) + + val actual2 = stripSparkFilter(sql(s"select A from $tableName where a < 0")) + assert(actual2.count() == 0) + } + + // Exception thrown for ambiguous case. 
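The rewritten date and timestamp pushdown tests above run every assertion under both settings of `spark.sql.datetime.java8API.enabled`, because that flag changes the external types Spark hands to data source filters, which is exactly what the new `LocalDate`/`Instant` branches in `castLiteralValue` cover. A rough sketch of the user-visible difference, assuming a local Spark 3.x session (the queried literals are arbitrary):

```scala
import org.apache.spark.sql.SparkSession

object Java8ApiSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("java8-api").getOrCreate()

    Seq("false", "true").foreach { enabled =>
      spark.conf.set("spark.sql.datetime.java8API.enabled", enabled)
      val row = spark.sql(
        "SELECT DATE'2017-08-18' AS d, TIMESTAMP'2020-05-25 10:11:12' AS ts").head()
      // false -> java.sql.Date / java.sql.Timestamp
      // true  -> java.time.LocalDate / java.time.Instant
      println(s"java8API=$enabled: ${row.get(0).getClass.getName}, ${row.get(1).getClass.getName}")
    }

    spark.stop()
  }
}
```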
+ withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + val e = intercept[AnalysisException] { + sql(s"select a from $tableName where a < 0").collect() + } + assert(e.getMessage.contains( + "Reference 'a' is ambiguous")) + } + } + + // Metastore table has only `A` field. + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql( + s""" + |CREATE TABLE $tableName (A LONG) USING ORC LOCATION '$tableDir1' + """.stripMargin) + + val e = intercept[SparkException] { + sql(s"select A from $tableName where A < 0").collect() + } + assert(e.getCause.isInstanceOf[RuntimeException] && e.getCause.getMessage.contains( + """Found duplicate field(s) "A": [A, a] in case-insensitive mode""")) + } + } + + // Physical ORC files have only `A` field. + val tableDir2 = dir.getAbsoluteFile + "/table2" + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + spark.range(count).repartition(count).selectExpr("id - 1 as A") + .write.mode("overwrite").orc(tableDir2) + } + + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql( + s""" + |CREATE TABLE $tableName (a LONG) USING ORC LOCATION '$tableDir2' + """.stripMargin) + + checkAnswer(sql(s"select a from $tableName"), (0 until count).map(c => Row(c - 1))) + + val actual = stripSparkFilter(sql(s"select a from $tableName where a < 0")) + assert(actual.count() == 1) + } + } + + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + sql( + s""" + |CREATE TABLE $tableName (A LONG) USING ORC LOCATION '$tableDir2' + """.stripMargin) + + checkAnswer(sql(s"select A from $tableName"), (0 until count).map(c => Row(c - 1))) + + val actual = stripSparkFilter(sql(s"select A from $tableName where A < 0")) + assert(actual.count() == 1) + } + } + } + } + + test("SPARK-32646: Case-insensitive field resolution for pushdown when reading ORC") { + import org.apache.spark.sql.sources._ + + def getOrcFilter( + schema: StructType, + filters: Seq[Filter], + caseSensitive: String): Option[SearchArgument] = { + var orcFilter: Option[SearchArgument] = None + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive) { + orcFilter = + OrcFilters.createFilter(schema, filters) + } + orcFilter + } + + def testFilter( + schema: StructType, + filters: Seq[Filter], + expected: SearchArgument): Unit = { + val caseSensitiveFilters = getOrcFilter(schema, filters, "true") + val caseInsensitiveFilters = getOrcFilter(schema, filters, "false") + + assert(caseSensitiveFilters.isEmpty) + assert(caseInsensitiveFilters.isDefined) + + assert(caseInsensitiveFilters.get.getLeaves().size() > 0) + assert(caseInsensitiveFilters.get.getLeaves().size() == expected.getLeaves().size()) + (0 until expected.getLeaves().size()).foreach { index => + assert(caseInsensitiveFilters.get.getLeaves().get(index) == expected.getLeaves().get(index)) + } + } + + val schema = StructType(Seq(StructField("cint", IntegerType))) + testFilter(schema, Seq(GreaterThan("CINT", 1)), + newBuilder.startNot() + .lessThanEquals("cint", OrcFilters.getPredicateLeafType(IntegerType), 1L).`end`().build()) + testFilter(schema, Seq( + And(GreaterThan("CINT", 1), EqualTo("Cint", 2))), + newBuilder.startAnd() + .startNot() + .lessThanEquals("cint", OrcFilters.getPredicateLeafType(IntegerType), 1L).`end`() + .equals("cint", OrcFilters.getPredicateLeafType(IntegerType), 2L) + .`end`().build()) + } } diff --git a/sql/core/v2.3/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java 
b/sql/core/v2.3/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index 2f1925e69a337..6e55fedfc4deb 100644 --- a/sql/core/v2.3/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/v2.3/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -21,7 +21,10 @@ import org.apache.hadoop.hive.ql.exec.vector.*; +import org.apache.spark.sql.catalyst.util.DateTimeUtils; +import org.apache.spark.sql.catalyst.util.RebaseDateTime; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.TimestampType; import org.apache.spark.sql.vectorized.ColumnarArray; @@ -41,6 +44,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto private DecimalColumnVector decimalData; private TimestampColumnVector timestampData; private final boolean isTimestamp; + private final boolean isDate; private int batchSize; @@ -53,6 +57,12 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto isTimestamp = false; } + if (type instanceof DateType) { + isDate = true; + } else { + isDate = false; + } + baseData = vector; if (vector instanceof LongColumnVector) { longData = (LongColumnVector) vector; @@ -129,14 +139,19 @@ public short getShort(int rowId) { @Override public int getInt(int rowId) { - return (int) longData.vector[getRowIndex(rowId)]; + int value = (int) longData.vector[getRowIndex(rowId)]; + if (isDate) { + return RebaseDateTime.rebaseJulianToGregorianDays(value); + } else { + return value; + } } @Override public long getLong(int rowId) { int index = getRowIndex(rowId); if (isTimestamp) { - return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000 % 1000; + return DateTimeUtils.fromJavaTimestamp(timestampData.asScratchTimestamp(index)); } else { return longData.vector[index]; } diff --git a/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala index 948ab44a8c19c..9f1927eff3993 100644 --- a/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala +++ b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.orc +import java.time.{Instant, LocalDate} + import org.apache.hadoop.hive.common.`type`.HiveDecimal import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument} import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder @@ -24,6 +26,8 @@ import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types._ @@ -64,9 +68,11 @@ private[sql] object OrcFilters extends OrcFiltersBase { * Create ORC filter as a SearchArgument instance. 
*/ def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = { - val dataTypeMap = schema.map(f => f.name -> f.dataType).toMap + val dataTypeMap = getDataTypeMap(schema, SQLConf.get.caseSensitiveAnalysis) // Combines all convertible filters using `And` to produce a single conjunction - val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, filters)) + // TODO (SPARK-25557): ORC doesn't support nested predicate pushdown, so they are removed. + val newFilters = filters.filter(!_.containsNestedColumn) + val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, newFilters)) conjunctionOptional.map { conjunction => // Then tries to build a single ORC `SearchArgument` for the conjunction predicate. // The input predicate is fully convertible. There should not be any empty result in the @@ -77,7 +83,7 @@ private[sql] object OrcFilters extends OrcFiltersBase { def convertibleFilters( schema: StructType, - dataTypeMap: Map[String, DataType], + dataTypeMap: Map[String, OrcPrimitiveField], filters: Seq[Filter]): Seq[Filter] = { import org.apache.spark.sql.sources._ @@ -135,7 +141,7 @@ private[sql] object OrcFilters extends OrcFiltersBase { /** * Get PredicateLeafType which is corresponding to the given DataType. */ - private def getPredicateLeafType(dataType: DataType) = dataType match { + private[sql] def getPredicateLeafType(dataType: DataType) = dataType match { case BooleanType => PredicateLeaf.Type.BOOLEAN case ByteType | ShortType | IntegerType | LongType => PredicateLeaf.Type.LONG case FloatType | DoubleType => PredicateLeaf.Type.FLOAT @@ -159,6 +165,10 @@ private[sql] object OrcFilters extends OrcFiltersBase { value.asInstanceOf[Number].doubleValue() case _: DecimalType => new HiveDecimalWritable(HiveDecimal.create(value.asInstanceOf[java.math.BigDecimal])) + case _: DateType if value.isInstanceOf[LocalDate] => + toJavaDate(localDateToDays(value.asInstanceOf[LocalDate])) + case _: TimestampType if value.isInstanceOf[Instant] => + toJavaTimestamp(instantToMicros(value.asInstanceOf[Instant])) case _ => value } @@ -171,7 +181,7 @@ private[sql] object OrcFilters extends OrcFiltersBase { * @return the builder so far. */ private def buildSearchArgument( - dataTypeMap: Map[String, DataType], + dataTypeMap: Map[String, OrcPrimitiveField], expression: Filter, builder: Builder): Builder = { import org.apache.spark.sql.sources._ @@ -207,60 +217,59 @@ private[sql] object OrcFilters extends OrcFiltersBase { * @return the builder so far. */ private def buildLeafSearchArgument( - dataTypeMap: Map[String, DataType], + dataTypeMap: Map[String, OrcPrimitiveField], expression: Filter, builder: Builder): Option[Builder] = { def getType(attribute: String): PredicateLeaf.Type = - getPredicateLeafType(dataTypeMap(attribute)) + getPredicateLeafType(dataTypeMap(attribute).fieldType) import org.apache.spark.sql.sources._ // NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()` // call is mandatory. ORC `SearchArgument` builder requires that all leaf predicates must be // wrapped by a "parent" predicate (`And`, `Or`, or `Not`). + // Since ORC 1.5.0 (ORC-323), we need to quote for column names with `.` characters + // in order to distinguish predicate pushdown for nested columns. 
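The case-insensitive resolution enabled by the new `dataTypeMap` of `OrcPrimitiveField` is what the SPARK-32646 tests in both `OrcFilterSuite`s assert. A sketch of the observable contract, assuming it runs inside Spark's own `org.apache.spark.sql` sources (`OrcFilters` is `private[sql]`, so this does not compile from user code):

```scala
import org.apache.spark.sql.execution.datasources.orc.OrcFilters
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.GreaterThan
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

val schema = StructType(Seq(StructField("cint", IntegerType)))

// With spark.sql.caseSensitive=false (the default), a filter on "CINT"
// resolves to the physical column "cint" and a SearchArgument is built with
// the correctly-cased name; with it set to true, createFilter returns None
// and nothing is pushed down.
SQLConf.get.setConfString(SQLConf.CASE_SENSITIVE.key, "false")
println(OrcFilters.createFilter(schema, Seq(GreaterThan("CINT", 1)))) // Some(...)

SQLConf.get.setConfString(SQLConf.CASE_SENSITIVE.key, "true")
println(OrcFilters.createFilter(schema, Seq(GreaterThan("CINT", 1)))) // None
```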
expression match { - case EqualTo(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().equals(quotedName, getType(attribute), castedValue).end()) + case EqualTo(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .equals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case EqualNullSafe(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().nullSafeEquals(quotedName, getType(attribute), castedValue).end()) + case EqualNullSafe(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .nullSafeEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case LessThan(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().lessThan(quotedName, getType(attribute), castedValue).end()) + case LessThan(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case LessThanOrEqual(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startAnd().lessThanEquals(quotedName, getType(attribute), castedValue).end()) + case LessThanOrEqual(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startAnd() + .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case GreaterThan(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startNot().lessThanEquals(quotedName, getType(attribute), castedValue).end()) + case GreaterThan(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startNot() + .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case GreaterThanOrEqual(attribute, value) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValue = castLiteralValue(value, dataTypeMap(attribute)) - Some(builder.startNot().lessThan(quotedName, getType(attribute), castedValue).end()) + case GreaterThanOrEqual(name, value) if dataTypeMap.contains(name) => + val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType) + Some(builder.startNot() + .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end()) - case IsNull(attribute) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - Some(builder.startAnd().isNull(quotedName, getType(attribute)).end()) + case IsNull(name) if dataTypeMap.contains(name) => + 
Some(builder.startAnd().isNull(dataTypeMap(name).fieldName, getType(name)).end()) - case IsNotNull(attribute) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - Some(builder.startNot().isNull(quotedName, getType(attribute)).end()) + case IsNotNull(name) if dataTypeMap.contains(name) => + Some(builder.startNot().isNull(dataTypeMap(name).fieldName, getType(name)).end()) - case In(attribute, values) if isSearchableType(dataTypeMap(attribute)) => - val quotedName = quoteAttributeNameIfNeeded(attribute) - val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(attribute))) - Some(builder.startAnd().in(quotedName, getType(attribute), + case In(name, values) if dataTypeMap.contains(name) => + val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(name).fieldType)) + Some(builder.startAnd().in(dataTypeMap(name).fieldName, getType(name), castedValues.map(_.asInstanceOf[AnyRef]): _*).end()) case _ => None diff --git a/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala index c32f024476e69..60c5b7a266c51 100644 --- a/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala +++ b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.datasources.orc -import java.sql.Date - import org.apache.hadoop.hive.common.`type`.HiveDecimal import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch import org.apache.hadoop.hive.ql.io.sarg.{SearchArgument => OrcSearchArgument} @@ -26,6 +24,7 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator} import org.apache.hadoop.hive.serde2.io.{DateWritable, HiveDecimalWritable} import org.apache.spark.sql.catalyst.expressions.SpecializedGetters +import org.apache.spark.sql.execution.datasources.DaysWritable import org.apache.spark.sql.types.Decimal /** @@ -38,7 +37,9 @@ private[sql] object OrcShimUtils { private[sql] type Operator = OrcOperator private[sql] type SearchArgument = OrcSearchArgument - def getSqlDate(value: Any): Date = value.asInstanceOf[DateWritable].get + def getGregorianDays(value: Any): Int = { + new DaysWritable(value.asInstanceOf[DateWritable]).gregorianDays + } def getDecimal(value: Any): Decimal = { val decimal = value.asInstanceOf[HiveDecimalWritable].getHiveDecimal() @@ -47,13 +48,13 @@ private[sql] object OrcShimUtils { def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = { if (reuseObj) { - val result = new DateWritable() + val result = new DaysWritable() (getter, ordinal) => result.set(getter.getInt(ordinal)) result } else { (getter: SpecializedGetters, ordinal: Int) => - new DateWritable(getter.getInt(ordinal)) + new DaysWritable(getter.getInt(ordinal)) } } diff --git a/sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala b/sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index 1baa69e82bb18..cb69413277f63 100644 --- a/sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala +++ b/sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -24,9 +24,10 @@ import java.sql.{Date, Timestamp} import scala.collection.JavaConverters._ import 
org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument} +import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder -import org.apache.spark.SparkConf -import org.apache.spark.sql.{AnalysisException, Column, DataFrame} +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Row} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation @@ -246,29 +247,41 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { } test("filter pushdown - timestamp") { - val timeString = "2015-08-20 14:57:00" - val timestamps = (1 to 4).map { i => - val milliseconds = Timestamp.valueOf(timeString).getTime + i * 3600 - new Timestamp(milliseconds) - } - withOrcDataFrame(timestamps.map(Tuple1(_))) { implicit df => - checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate( - Literal(timestamps(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(timestamps(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + val input = Seq( + "1000-01-01 01:02:03", + "1582-10-01 00:11:22", + "1900-01-01 23:59:59", + "2020-05-25 10:11:12").map(Timestamp.valueOf) + + withOrcFile(input.map(Tuple1(_))) { path => + Seq(false, true).foreach { java8Api => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString) { + readFile(path) { implicit df => + val timestamps = input.map(Literal(_)) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate( + Literal(timestamps(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate( + Literal(timestamps(2)) < $"_1", + PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate( + Literal(timestamps(0)) >= $"_1", + PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(3)) <= $"_1", 
PredicateLeaf.Operator.LESS_THAN) + } + } + } } } @@ -300,26 +313,33 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { } test("filter pushdown - date") { - val dates = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day => + val input = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day => Date.valueOf(day) } - withOrcDataFrame(dates.map(Tuple1(_))) { implicit df => - checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(dates(0)) === $"_1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(dates(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(dates(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(dates(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + withOrcFile(input.map(Tuple1(_))) { path => + Seq(false, true).foreach { java8Api => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString) { + readFile(path) { implicit df => + val dates = input.map(Literal(_)) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(dates(0) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(dates(0) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(dates(1) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(dates(2) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(dates(0) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(dates(3) <= $"_1", PredicateLeaf.Operator.LESS_THAN) + } + } + } } } @@ -451,5 +471,142 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { ).get.toString } } + + test("SPARK-32622: case sensitivity in predicate pushdown") { + withTempPath { dir => + val count = 10 + val tableName = "spark_32622" + val tableDir1 = dir.getAbsoluteFile + "/table1" + + // Physical ORC files have both `A` and `a` fields. + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + spark.range(count).repartition(count).selectExpr("id - 1 as A", "id as a") + .write.mode("overwrite").orc(tableDir1) + } + + // Metastore table has both `A` and `a` fields too. 
+ withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + sql( + s""" + |CREATE TABLE $tableName (A LONG, a LONG) USING ORC LOCATION '$tableDir1' + """.stripMargin) + + checkAnswer(sql(s"select a, A from $tableName"), (0 until count).map(c => Row(c, c - 1))) + + val actual1 = stripSparkFilter(sql(s"select A from $tableName where A < 0")) + assert(actual1.count() == 1) + + val actual2 = stripSparkFilter(sql(s"select A from $tableName where a < 0")) + assert(actual2.count() == 0) + } + + // Exception thrown for ambiguous case. + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + val e = intercept[AnalysisException] { + sql(s"select a from $tableName where a < 0").collect() + } + assert(e.getMessage.contains( + "Reference 'a' is ambiguous")) + } + } + + // Metastore table has only `A` field. + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql( + s""" + |CREATE TABLE $tableName (A LONG) USING ORC LOCATION '$tableDir1' + """.stripMargin) + + val e = intercept[SparkException] { + sql(s"select A from $tableName where A < 0").collect() + } + assert(e.getCause.isInstanceOf[RuntimeException] && e.getCause.getMessage.contains( + """Found duplicate field(s) "A": [A, a] in case-insensitive mode""")) + } + } + + // Physical ORC files have only `A` field. + val tableDir2 = dir.getAbsoluteFile + "/table2" + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + spark.range(count).repartition(count).selectExpr("id - 1 as A") + .write.mode("overwrite").orc(tableDir2) + } + + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql( + s""" + |CREATE TABLE $tableName (a LONG) USING ORC LOCATION '$tableDir2' + """.stripMargin) + + checkAnswer(sql(s"select a from $tableName"), (0 until count).map(c => Row(c - 1))) + + val actual = stripSparkFilter(sql(s"select a from $tableName where a < 0")) + assert(actual.count() == 1) + } + } + + withTable(tableName) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + sql( + s""" + |CREATE TABLE $tableName (A LONG) USING ORC LOCATION '$tableDir2' + """.stripMargin) + + checkAnswer(sql(s"select A from $tableName"), (0 until count).map(c => Row(c - 1))) + + val actual = stripSparkFilter(sql(s"select A from $tableName where A < 0")) + assert(actual.count() == 1) + } + } + } + } + + test("SPARK-32646: Case-insensitive field resolution for pushdown when reading ORC") { + import org.apache.spark.sql.sources._ + + def getOrcFilter( + schema: StructType, + filters: Seq[Filter], + caseSensitive: String): Option[SearchArgument] = { + var orcFilter: Option[SearchArgument] = None + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive) { + orcFilter = + OrcFilters.createFilter(schema, filters) + } + orcFilter + } + + def testFilter( + schema: StructType, + filters: Seq[Filter], + expected: SearchArgument): Unit = { + val caseSensitiveFilters = getOrcFilter(schema, filters, "true") + val caseInsensitiveFilters = getOrcFilter(schema, filters, "false") + + assert(caseSensitiveFilters.isEmpty) + assert(caseInsensitiveFilters.isDefined) + + assert(caseInsensitiveFilters.get.getLeaves().size() > 0) + assert(caseInsensitiveFilters.get.getLeaves().size() == expected.getLeaves().size()) + (0 until expected.getLeaves().size()).foreach { index => + assert(caseInsensitiveFilters.get.getLeaves().get(index) == expected.getLeaves().get(index)) + } + } + + val schema = StructType(Seq(StructField("cint", IntegerType))) + testFilter(schema, Seq(GreaterThan("CINT", 1)), + newBuilder.startNot() + 
.lessThanEquals("cint", OrcFilters.getPredicateLeafType(IntegerType), 1L).`end`().build()) + testFilter(schema, Seq( + And(GreaterThan("CINT", 1), EqualTo("Cint", 2))), + newBuilder.startAnd() + .startNot() + .lessThanEquals("cint", OrcFilters.getPredicateLeafType(IntegerType), 1L).`end`() + .equals("cint", OrcFilters.getPredicateLeafType(IntegerType), 2L) + .`end`().build()) + } } diff --git a/sql/create-docs.sh b/sql/create-docs.sh index 4353708d22f7b..8721df874ee73 100755 --- a/sql/create-docs.sh +++ b/sql/create-docs.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# Script to create SQL API docs. This requires `mkdocs` and to build +# Script to create SQL API and config docs. This requires `mkdocs` and to build # Spark first. After running this script the html docs can be found in # $SPARK_HOME/sql/site @@ -27,26 +27,31 @@ set -e FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)" -if ! hash python 2>/dev/null; then - echo "Missing python in your path, skipping SQL documentation generation." +if ! hash python3 2>/dev/null; then + echo "Missing python3 in your path, skipping SQL documentation generation." exit 0 fi if ! hash mkdocs 2>/dev/null; then echo "Missing mkdocs in your path, trying to install mkdocs for SQL documentation generation." - pip install mkdocs + pip3 install mkdocs fi pushd "$FWDIR" > /dev/null -# Now create the markdown file rm -fr docs mkdir docs -echo "Generating markdown files for SQL documentation." -"$SPARK_HOME/bin/spark-submit" gen-sql-markdown.py -# Now create the HTML files -echo "Generating HTML files for SQL documentation." +echo "Generating SQL API Markdown files." +"$SPARK_HOME/bin/spark-submit" gen-sql-api-docs.py + +echo "Generating SQL configuration table HTML file." +"$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py + +echo "Generating HTML files for SQL function table and examples." +"$SPARK_HOME/bin/spark-submit" gen-sql-functions-docs.py + +echo "Generating HTML files for SQL API documentation." mkdocs build --clean rm -fr docs diff --git a/sql/gen-sql-markdown.py b/sql/gen-sql-api-docs.py similarity index 94% rename from sql/gen-sql-markdown.py rename to sql/gen-sql-api-docs.py index e0529f8310613..61328997c1c58 100644 --- a/sql/gen-sql-markdown.py +++ b/sql/gen-sql-api-docs.py @@ -15,10 +15,12 @@ # limitations under the License. # -import sys import os from collections import namedtuple +from pyspark.java_gateway import launch_gateway + + ExpressionInfo = namedtuple( "ExpressionInfo", "className name usage arguments examples note since deprecated") @@ -41,7 +43,7 @@ def _list_function_infos(jvm): usage=usage, arguments=jinfo.getArguments().replace("_FUNC_", name), examples=jinfo.getExamples().replace("_FUNC_", name), - note=jinfo.getNote(), + note=jinfo.getNote().replace("_FUNC_", name), since=jinfo.getSince(), deprecated=jinfo.getDeprecated())) return sorted(infos, key=lambda i: i.name) @@ -158,7 +160,7 @@ def _make_pretty_deprecated(deprecated): return "**Deprecated:**\n%s\n" % deprecated -def generate_sql_markdown(jvm, path): +def generate_sql_api_markdown(jvm, path): """ Generates a markdown file after listing the function information. The output file is created in `path`. 
@@ -219,8 +221,7 @@ def generate_sql_markdown(jvm, path): if __name__ == "__main__": - from pyspark.java_gateway import launch_gateway - jvm = launch_gateway().jvm - markdown_file_path = "%s/docs/index.md" % os.path.dirname(sys.argv[0]) - generate_sql_markdown(jvm, markdown_file_path) + spark_root_dir = os.path.dirname(os.path.dirname(__file__)) + markdown_file_path = os.path.join(spark_root_dir, "sql/docs/index.md") + generate_sql_api_markdown(jvm, markdown_file_path) diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py new file mode 100644 index 0000000000000..f1980bcc0d80d --- /dev/null +++ b/sql/gen-sql-config-docs.py @@ -0,0 +1,135 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re + +from collections import namedtuple +from textwrap import dedent + +# To avoid adding a new direct dependency, we import markdown from within mkdocs. +from mkdocs.structure.pages import markdown + +from pyspark.java_gateway import launch_gateway + + +SQLConfEntry = namedtuple( + "SQLConfEntry", ["name", "default", "description", "version"]) + + +def get_sql_configs(jvm, group): + if group == "static": + config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listStaticSQLConfigs() + else: + config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listRuntimeSQLConfigs() + sql_configs = [ + SQLConfEntry( + name=_sql_config._1(), + default=_sql_config._2(), + description=_sql_config._3(), + version=_sql_config._4() + ) + for _sql_config in config_set + ] + return sql_configs + + +def generate_sql_configs_table_html(sql_configs, path): + """ + Generates an HTML table at `path` that lists all public SQL + configuration options. + + The table will look something like this: + + ```html + + + + + + + + + + + ... + +
      <table class="table">
      <tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr>

      <tr>
        <td><code>spark.sql.adaptive.enabled</code></td>
        <td>false</td>
        <td><p>When true, enable adaptive query execution.</p></td>
        <td>2.1.0</td>
      </tr>

      </table>
      + ``` + """ + value_reference_pattern = re.compile(r"^$") + + with open(path, 'w') as f: + f.write(dedent( + """ + + + """ + )) + for config in sorted(sql_configs, key=lambda x: x.name): + if config.name == "spark.sql.session.timeZone": + default = "(value of local timezone)" + elif config.name == "spark.sql.warehouse.dir": + default = "(value of $PWD/spark-warehouse)" + elif config.default == "": + default = "(none)" + elif config.default.startswith(" + + + + + + """ + .format( + name=config.name, + default=default, + description=markdown.markdown(config.description), + version=config.version + ) + )) + f.write("
      Property NameDefaultMeaningSince Version
      {name}{default}{description}{version}
      \n") + + +if __name__ == "__main__": + jvm = launch_gateway().jvm + docs_root_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "docs") + + sql_configs = get_sql_configs(jvm, "runtime") + sql_configs_table_path = os.path.join(docs_root_dir, "generated-runtime-sql-config-table.html") + generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path) + + sql_configs = get_sql_configs(jvm, "static") + sql_configs_table_path = os.path.join(docs_root_dir, "generated-static-sql-config-table.html") + generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path) diff --git a/sql/gen-sql-functions-docs.py b/sql/gen-sql-functions-docs.py new file mode 100644 index 0000000000000..c07734e273051 --- /dev/null +++ b/sql/gen-sql-functions-docs.py @@ -0,0 +1,230 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import itertools +import os +import re +from collections import namedtuple + +# To avoid adding a new direct dependency, we import markdown from within mkdocs. +from mkdocs.structure.pages import markdown + +from pyspark.java_gateway import launch_gateway + + +ExpressionInfo = namedtuple("ExpressionInfo", "name usage examples group") + +groups = { + "agg_funcs", "array_funcs", "datetime_funcs", + "json_funcs", "map_funcs", "window_funcs", +} + + +def _list_grouped_function_infos(jvm): + """ + Returns a list of function information grouped by each group value via JVM. + Sorts wrapped expression infos in each group by name and returns them. + """ + + jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos() + infos = [] + + for jinfo in filter(lambda x: x.getGroup() in groups, jinfos): + name = jinfo.getName() + usage = jinfo.getUsage() + usage = usage.replace("_FUNC_", name) if usage is not None else usage + infos.append(ExpressionInfo( + name=name, + usage=usage, + examples=jinfo.getExamples().replace("_FUNC_", name), + group=jinfo.getGroup())) + + # Groups expression info by each group value + grouped_infos = itertools.groupby(sorted(infos, key=lambda x: x.group), key=lambda x: x.group) + # Then, sort expression infos in each group by name + return [(k, sorted(g, key=lambda x: x.name)) for k, g in grouped_infos] + + +# TODO(SPARK-31499): Needs to add a column to describe arguments and their types +def _make_pretty_usage(infos): + """ + Makes the usage description pretty and returns a formatted string. + + Expected input: + + func(*) - ... + + func(expr[, expr...]) - ... + + Expected output: + + + + + + + + + + + + + + + + + + ... +
      FunctionDescription
      func(*)...
      func(expr[, expr...])...
      + + """ + + result = [] + result.append("") + result.append(" ") + result.append(" ") + result.append(" ") + result.append(" ") + result.append(" ") + result.append(" ") + result.append(" ") + + for info in infos: + # Extracts (signature, description) pairs from `info.usage`. + # Expected formats are as follows; + # - `_FUNC_(...) - description`, or + # - `_FUNC_ - description` + usages = iter(re.split(r"(%s.*) - " % info.name, info.usage.strip())[1:]) + for (sig, description) in zip(usages, usages): + result.append(" ") + result.append(" " % sig) + result.append(" " % description.strip()) + result.append(" ") + + result.append(" ") + result.append("
      FunctionDescription
      %s%s
      \n") + return "\n".join(result) + + +def _make_pretty_examples(jspark, infos): + """ + Makes the examples description pretty and returns a formatted string if `infos` + has any `examples` starting with the example prefix. Otherwise, returns None. + + Expected input: + + Examples: + > SELECT func(col)...; + ... + > SELECT func(col)...; + ... + + Expected output: +
      
      +      -- func
      +      SELECT
      +      ...
      +    
      + ``` + + """ + + pretty_output = "" + for info in infos: + if info.examples.startswith("\n Examples:"): + output = [] + output.append("-- %s" % info.name) + query_examples = filter(lambda x: x.startswith(" > "), info.examples.split("\n")) + for query_example in query_examples: + query = query_example.lstrip(" > ") + print(" %s" % query) + query_output = jspark.sql(query).showString(20, 20, False) + output.append(query) + output.append(query_output) + pretty_output += "\n" + "\n".join(output) + if pretty_output != "": + return markdown.markdown( + "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code']) + + +def generate_functions_table_html(jvm, html_output_dir): + """ + Generates a HTML file after listing the function information. The output file + is created under `html_output_dir`. + + Expected output: + + + + + + + + + + + + + + + + + + + ... +
      FunctionDescription
      func(*)...
      func(expr[, expr...])...
      + + """ + for key, infos in _list_grouped_function_infos(jvm): + function_table = _make_pretty_usage(infos) + key = key.replace("_", "-") + with open("%s/generated-%s-table.html" % (html_output_dir, key), 'w') as table_html: + table_html.write(function_table) + + +def generate_functions_examples_html(jvm, jspark, html_output_dir): + """ + Generates a HTML file after listing and executing the function information. + The output file is created under `html_output_dir`. + + Expected output: + +
      
      +      -- func
      +      SELECT
      +      ...
      +    
      + + """ + print("Running SQL examples to generate formatted output.") + for key, infos in _list_grouped_function_infos(jvm): + examples = _make_pretty_examples(jspark, infos) + key = key.replace("_", "-") + if examples is not None: + with open("%s/generated-%s-examples.html" % ( + html_output_dir, key), 'w') as examples_html: + examples_html.write(examples) + + +if __name__ == "__main__": + jvm = launch_gateway().jvm + jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate() + jspark.sparkContext().setLogLevel("ERROR") # Make it less noisy. + spark_root_dir = os.path.dirname(os.path.dirname(__file__)) + html_output_dir = os.path.join(spark_root_dir, "docs") + generate_functions_table_html(jvm, html_output_dir) + generate_functions_examples_html(jvm, jspark, html_output_dir) diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 75c7f77942396..f376afbd19599 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index f15193b0dc3cc..f9f2ceeed8a75 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.thriftserver import java.util.Locale import java.util.concurrent.atomic.AtomicBoolean +import org.apache.hadoop.hive.common.ServerUtils import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} @@ -101,6 +102,8 @@ object HiveThriftServer2 extends Logging { SparkSQLEnv.sqlContext.sessionState.newHadoopConf()) try { + // Cleanup the scratch dir before starting + ServerUtils.cleanUpScratchDir(executionHive.conf) val server = new HiveThriftServer2(SparkSQLEnv.sqlContext) server.init(executionHive.conf) server.start() diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 76d07848f79a9..33f7f21276611 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -44,12 +44,13 @@ import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.{Utils => SparkUtils} private[hive] class SparkExecuteStatementOperation( + val sqlContext: SQLContext, parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], runInBackground: Boolean = true) - (sqlContext: SQLContext, sessionToActivePool: JMap[SessionHandle, String]) extends ExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground) + with SparkOperation with Logging { private var result: DataFrame = _ @@ -62,7 +63,6 @@ private[hive] class SparkExecuteStatementOperation( private var previousFetchStartOffset: Long = 0 private var iter: Iterator[SparkRow] = _ private var dataTypes: Array[DataType] = _ - private 
var statementId: String = _ private lazy val resultSchema: TableSchema = { if (result == null || result.schema.isEmpty) { @@ -73,13 +73,6 @@ private[hive] class SparkExecuteStatementOperation( } } - override def close(): Unit = { - // RDDs will be cleaned automatically upon garbage collection. - logInfo(s"Close statement with $statementId") - cleanup(OperationState.CLOSED) - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } - def addNonNullColumnValue(from: SparkRow, to: ArrayBuffer[Any], ordinal: Int): Unit = { dataTypes(ordinal) match { case StringType => @@ -100,12 +93,15 @@ private[hive] class SparkExecuteStatementOperation( to += from.getByte(ordinal) case ShortType => to += from.getShort(ordinal) - case DateType => - to += from.getAs[Date](ordinal) - case TimestampType => - to += from.getAs[Timestamp](ordinal) case BinaryType => to += from.getAs[Array[Byte]](ordinal) + // SPARK-31859, SPARK-31861: Date and Timestamp need to be turned to String here to: + // - respect spark.sql.session.timeZone + // - work with spark.sql.datetime.java8API.enabled + // These types have always been sent over the wire as string, converted later. + case _: DateType | _: TimestampType => + val hiveString = HiveResult.toHiveString((from.get(ordinal), dataTypes(ordinal))) + to += hiveString case CalendarIntervalType => to += HiveResult.toHiveString((from.getAs[CalendarInterval](ordinal), CalendarIntervalType)) case _: ArrayType | _: StructType | _: MapType | _: UserDefinedType[_] => @@ -114,7 +110,7 @@ private[hive] class SparkExecuteStatementOperation( } } - def getNextRowSet(order: FetchOrientation, maxRowsL: Long): RowSet = withSchedulerPool { + def getNextRowSet(order: FetchOrientation, maxRowsL: Long): RowSet = withLocalProperties { log.info(s"Received getNextRowSet request order=${order} and maxRowsL=${maxRowsL} " + s"with ${statementId}") validateDefaultFetchOrientation(order) @@ -193,7 +189,6 @@ private[hive] class SparkExecuteStatementOperation( override def runInternal(): Unit = { setState(OperationState.PENDING) - statementId = UUID.randomUUID().toString logInfo(s"Submitting query '$statement' with $statementId") HiveThriftServer2.eventManager.onStatementStart( statementId, @@ -217,7 +212,9 @@ private[hive] class SparkExecuteStatementOperation( override def run(): Unit = { registerCurrentOperationLog() try { - execute() + withLocalProperties { + execute() + } } catch { case e: HiveSQLException => setOperationException(e) @@ -259,7 +256,7 @@ private[hive] class SparkExecuteStatementOperation( } } - private def execute(): Unit = withSchedulerPool { + private def execute(): Unit = { try { synchronized { if (getStatus.getState.isTerminal) { @@ -274,16 +271,14 @@ private[hive] class SparkExecuteStatementOperation( val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader Thread.currentThread().setContextClassLoader(executionHiveClassLoader) + // Always set the session state classloader to `executionHiveClassLoader` even for sync mode + if (!runInBackground) { + parentSession.getSessionState.getConf.setClassLoader(executionHiveClassLoader) + } + sqlContext.sparkContext.setJobGroup(statementId, statement) result = sqlContext.sql(statement) logDebug(result.queryExecution.toString()) - result.queryExecution.logical match { - case SetCommand(Some((SQLConf.THRIFTSERVER_POOL.key, Some(value)))) => - sessionToActivePool.put(parentSession.getSessionHandle, value) - logInfo(s"Setting ${SparkContext.SPARK_SCHEDULER_POOL}=$value for future statements " + - "in this session.") - case _ => 
- } HiveThriftServer2.eventManager.onStatementParsed(statementId, result.queryExecution.toString()) iter = { @@ -295,7 +290,7 @@ private[hive] class SparkExecuteStatementOperation( resultList.get.iterator } } - dataTypes = result.queryExecution.analyzed.output.map(_.dataType).toArray + dataTypes = result.schema.fields.map(_.dataType) } catch { // Actually do need to catch Throwable as some failures don't inherit from Exception and // HiveServer will silently swallow them. @@ -341,38 +336,25 @@ private[hive] class SparkExecuteStatementOperation( synchronized { if (!getStatus.getState.isTerminal) { logInfo(s"Cancel query with $statementId") - cleanup(OperationState.CANCELED) + setState(OperationState.CANCELED) + cleanup() HiveThriftServer2.eventManager.onStatementCanceled(statementId) } } } - private def cleanup(state: OperationState): Unit = { - setState(state) + override protected def cleanup(): Unit = { if (runInBackground) { val backgroundHandle = getBackgroundHandle() if (backgroundHandle != null) { backgroundHandle.cancel(true) } } + // RDDs will be cleaned automatically upon garbage collection. if (statementId != null) { sqlContext.sparkContext.cancelJobGroup(statementId) } } - - private def withSchedulerPool[T](body: => T): T = { - val pool = sessionToActivePool.get(parentSession.getSessionHandle) - if (pool != null) { - sqlContext.sparkContext.setLocalProperty(SparkContext.SPARK_SCHEDULER_POOL, pool) - } - try { - body - } finally { - if (pool != null) { - sqlContext.sparkContext.setLocalProperty(SparkContext.SPARK_SCHEDULER_POOL, null) - } - } - } } object SparkExecuteStatementOperation { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala index 2945cfd200e46..55070e035b944 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala @@ -36,19 +36,13 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param parentSession a HiveSession from SessionManager */ private[hive] class SparkGetCatalogsOperation( - sqlContext: SQLContext, + val sqlContext: SQLContext, parentSession: HiveSession) - extends GetCatalogsOperation(parentSession) with Logging { - - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } + extends GetCatalogsOperation(parentSession) + with SparkOperation + with Logging { override def runInternal(): Unit = { - statementId = UUID.randomUUID().toString val logMsg = "Listing catalogs" logInfo(s"$logMsg with $statementId") setState(OperationState.RUNNING) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala index ff7cbfeae13be..ca8ad5e6ad134 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala @@ -48,26 +48,19 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param columnName column name */ private[hive] class SparkGetColumnsOperation( - sqlContext: SQLContext, + 
val sqlContext: SQLContext, parentSession: HiveSession, catalogName: String, schemaName: String, tableName: String, columnName: String) extends GetColumnsOperation(parentSession, catalogName, schemaName, tableName, columnName) - with Logging { + with SparkOperation + with Logging { val catalog: SessionCatalog = sqlContext.sessionState.catalog - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } - override def runInternal(): Unit = { - statementId = UUID.randomUUID().toString // Do not change cmdStr. It's used for Hive auditing and authorization. val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName, tablePattern : $tableName" val logMsg = s"Listing columns '$cmdStr, columnName : $columnName'" diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala index d9c12b6ca9e64..f5e647bfd4f38 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala @@ -43,22 +43,16 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param functionName function name pattern */ private[hive] class SparkGetFunctionsOperation( - sqlContext: SQLContext, + val sqlContext: SQLContext, parentSession: HiveSession, catalogName: String, schemaName: String, functionName: String) - extends GetFunctionsOperation(parentSession, catalogName, schemaName, functionName) with Logging { - - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } + extends GetFunctionsOperation(parentSession, catalogName, schemaName, functionName) + with SparkOperation + with Logging { override def runInternal(): Unit = { - statementId = UUID.randomUUID().toString // Do not change cmdStr. It's used for Hive auditing and authorization. 
val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName" val logMsg = s"Listing functions '$cmdStr, functionName : $functionName'" diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala index db19880d1b99f..74220986fcd34 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala @@ -40,21 +40,15 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param schemaName database name, null or a concrete database name */ private[hive] class SparkGetSchemasOperation( - sqlContext: SQLContext, + val sqlContext: SQLContext, parentSession: HiveSession, catalogName: String, schemaName: String) - extends GetSchemasOperation(parentSession, catalogName, schemaName) with Logging { - - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } + extends GetSchemasOperation(parentSession, catalogName, schemaName) + with SparkOperation + with Logging { override def runInternal(): Unit = { - statementId = UUID.randomUUID().toString // Do not change cmdStr. It's used for Hive auditing and authorization. val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName" val logMsg = s"Listing databases '$cmdStr'" diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala index b4093e58d3c07..1cf9c3a731af5 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala @@ -37,16 +37,11 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param parentSession a HiveSession from SessionManager */ private[hive] class SparkGetTableTypesOperation( - sqlContext: SQLContext, + val sqlContext: SQLContext, parentSession: HiveSession) - extends GetTableTypesOperation(parentSession) with SparkMetadataOperationUtils with Logging { - - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } + extends GetTableTypesOperation(parentSession) + with SparkOperation + with Logging { override def runInternal(): Unit = { statementId = UUID.randomUUID().toString diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala index 45c6d980aac47..a1d21e2d60c63 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala @@ -46,24 +46,17 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param tableTypes list of allowed table types, e.g. 
"TABLE", "VIEW" */ private[hive] class SparkGetTablesOperation( - sqlContext: SQLContext, + val sqlContext: SQLContext, parentSession: HiveSession, catalogName: String, schemaName: String, tableName: String, tableTypes: JList[String]) extends GetTablesOperation(parentSession, catalogName, schemaName, tableName, tableTypes) - with SparkMetadataOperationUtils with Logging { - - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } + with SparkOperation + with Logging { override def runInternal(): Unit = { - statementId = UUID.randomUUID().toString // Do not change cmdStr. It's used for Hive auditing and authorization. val cmdStr = s"catalog : $catalogName, schemaPattern : $schemaName" val tableTypesStr = if (tableTypes == null) "null" else tableTypes.asScala.mkString(",") diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala index dd5668a93f82d..e38139d60df60 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala @@ -36,16 +36,11 @@ import org.apache.spark.util.{Utils => SparkUtils} * @param parentSession a HiveSession from SessionManager */ private[hive] class SparkGetTypeInfoOperation( - sqlContext: SQLContext, + val sqlContext: SQLContext, parentSession: HiveSession) - extends GetTypeInfoOperation(parentSession) with Logging { - - private var statementId: String = _ - - override def close(): Unit = { - super.close() - HiveThriftServer2.eventManager.onOperationClosed(statementId) - } + extends GetTypeInfoOperation(parentSession) + with SparkOperation + with Logging { override def runInternal(): Unit = { statementId = UUID.randomUUID().toString diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala new file mode 100644 index 0000000000000..59516dc26bae6 --- /dev/null +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkOperation.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.thriftserver + +import org.apache.hive.service.cli.OperationState +import org.apache.hive.service.cli.operation.Operation + +import org.apache.spark.SparkContext +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{SparkSession, SQLContext} +import org.apache.spark.sql.catalyst.catalog.CatalogTableType +import org.apache.spark.sql.catalyst.catalog.CatalogTableType.{EXTERNAL, MANAGED, VIEW} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.Utils + +/** + * Utils for Spark operations. + */ +private[hive] trait SparkOperation extends Operation with Logging { + + protected def sqlContext: SQLContext + + protected var statementId = getHandle().getHandleIdentifier().getPublicId().toString() + + protected def cleanup(): Unit = Unit // noop by default + + abstract override def run(): Unit = { + withLocalProperties { + super.run() + } + } + + abstract override def close(): Unit = { + super.close() + cleanup() + logInfo(s"Close statement with $statementId") + HiveThriftServer2.eventManager.onOperationClosed(statementId) + } + + // Set thread local properties for the execution of the operation. + // This method should be applied during the execution of the operation, by all the child threads. + // The original spark context local properties will be restored after the operation. + // + // It is used to: + // - set appropriate SparkSession + // - set scheduler pool for the operation + def withLocalProperties[T](f: => T): T = { + val originalProps = Utils.cloneProperties(sqlContext.sparkContext.getLocalProperties) + val originalSession = SparkSession.getActiveSession + + try { + // Set active SparkSession + SparkSession.setActiveSession(sqlContext.sparkSession) + + // Set scheduler pool + sqlContext.sparkSession.conf.getOption(SQLConf.THRIFTSERVER_POOL.key) match { + case Some(pool) => + sqlContext.sparkContext.setLocalProperty(SparkContext.SPARK_SCHEDULER_POOL, pool) + case None => + } + + // run the body + f + } finally { + // reset local properties, will also reset SPARK_SCHEDULER_POOL + sqlContext.sparkContext.setLocalProperties(originalProps) + + originalSession match { + case Some(session) => SparkSession.setActiveSession(session) + case None => SparkSession.clearActiveSession() + } + } + } + + def tableTypeString(tableType: CatalogTableType): String = tableType match { + case EXTERNAL | MANAGED => "TABLE" + case VIEW => "VIEW" + case t => + throw new IllegalArgumentException(s"Unknown table type is found: $t") + } +} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index b665d4a31b9b1..581aa68b6ba3f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -44,7 +44,9 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveUtils +import org.apache.spark.sql.hive.client.HiveClientImpl import org.apache.spark.sql.hive.security.HiveDelegationTokenProvider +import org.apache.spark.sql.internal.SharedState import org.apache.spark.util.ShutdownHookManager /** @@ -88,12 +90,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { val hadoopConf = 
SparkHadoopUtil.get.newConfiguration(sparkConf) val extraConfigs = HiveUtils.formatTimeVarsForHiveClient(hadoopConf) - val cliConf = new HiveConf(classOf[SessionState]) - (hadoopConf.iterator().asScala.map(kv => kv.getKey -> kv.getValue) - ++ sparkConf.getAll.toMap ++ extraConfigs).foreach { - case (k, v) => - cliConf.set(k, v) - } + val cliConf = HiveClientImpl.newHiveConf(sparkConf, hadoopConf, extraConfigs) val sessionState = new CliSessionState(cliConf) @@ -134,6 +131,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { UserGroupInformation.getCurrentUser.addCredentials(credentials) } + SharedState.loadHiveConfFile(sparkConf, conf) SessionState.start(sessionState) // Clean up after we exit @@ -192,8 +190,11 @@ private[hive] object SparkSQLCLIDriver extends Logging { // Execute -i init files (always in silent mode) cli.processInitFiles(sessionState) - newHiveConf.foreach { kv => - SparkSQLEnv.sqlContext.setConf(kv._1, kv._2) + // We don't propagate hive.metastore.warehouse.dir, because it might have been adjusted in + // [[SharedState.loadHiveConfFile]] based on the user-specified or default values of + // spark.sql.warehouse.dir and hive.metastore.warehouse.dir. + for ((k, v) <- newHiveConf if k != "hive.metastore.warehouse.dir") { + SparkSQLEnv.sqlContext.setConf(k, v) } if (sessionState.execString != null) { @@ -379,10 +380,18 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { ret = rc.getResponseCode if (ret != 0) { - // For analysis exception, only the error is printed out to the console. - rc.getException() match { - case e : AnalysisException => - err.println(s"""Error in query: ${e.getMessage}""") + rc.getException match { + case e: AnalysisException => e.cause match { + case Some(_) if !sessionState.getIsSilent => + err.println( + s"""Error in query: ${e.getMessage} + |${org.apache.hadoop.util.StringUtils.stringifyException(e)} + """.stripMargin) + // For analysis exceptions in silent mode, or simple ones that are only related to the + // query itself, such as `NoSuchDatabaseException`, only the error is printed out + // to the console. + case _ => err.println(s"""Error in query: ${e.getMessage}""") + } case _ => err.println(rc.getErrorMessage()) } driver.close() @@ -490,7 +499,7 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { val ignoreErrors = HiveConf.getBoolVar(conf, HiveConf.ConfVars.CLIIGNOREERRORS) if (ret != 0 && !ignoreErrors) { CommandProcessorFactory.clean(conf.asInstanceOf[HiveConf]) - ret + return ret } } } @@ -506,35 +515,86 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { } // Adapted splitSemiColon from Hive 2.3's CliDriver.splitSemiColon. + // Note: [SPARK-31595] if there is a `'` in a double quoted string, or a `"` in a single quoted + // string, the original implementation from Hive will not drop the trailing semicolon as expected, + // hence we refined this function a little bit. + // Note: [SPARK-33100] Ignore a semicolon inside a bracketed comment in spark-sql.
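Editor's aside, not part of the patch: the notes above boil down to a small set of corner cases. The sketch below pairs illustrative inputs with the number of statements the refined splitter is expected to produce; it is a standalone expectation table with invented literals, not a call into the private splitSemiColon method shown next.

```scala
// Expected statement counts for the corner cases described in the notes above.
object SplitExpectationsSketch extends App {
  val expectedSplits: Seq[(String, Int)] = Seq(
    ("""SELECT "it's quoted";""", 1),             // SPARK-31595: ' inside a double-quoted string stays literal
    ("SELECT 'she said \"hi\"';", 1),             // SPARK-31595: " inside a single-quoted string stays literal
    ("/* note; still a comment */ SELECT 1;", 1), // SPARK-33100: ';' inside a bracketed comment is ignored
    ("SELECT 1; -- trailing; comment", 1),        // ';' inside a simple comment is ignored
    ("SELECT 1; SELECT 2;", 2)                    // unquoted, uncommented ';' still splits statements
  )
  expectedSplits.foreach { case (line, n) => println(s"$n statement(s): $line") }
}
```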
private def splitSemiColon(line: String): JList[String] = { var insideSingleQuote = false var insideDoubleQuote = false + var insideSimpleComment = false + var bracketedCommentLevel = 0 var escape = false var beginIndex = 0 + var leavingBracketedComment = false + var isStatement = false val ret = new JArrayList[String] + + def insideBracketedComment: Boolean = bracketedCommentLevel > 0 + def insideComment: Boolean = insideSimpleComment || insideBracketedComment + def statementInProgress(index: Int): Boolean = isStatement || (!insideComment && + index > beginIndex && !s"${line.charAt(index)}".trim.isEmpty) + for (index <- 0 until line.length) { - if (line.charAt(index) == '\'') { + // Checks if we need to decrement a bracketed comment level; the last character '/' of + // bracketed comments is still inside the comment, so `insideBracketedComment` must keep true + // in the previous loop and we decrement the level here if needed. + if (leavingBracketedComment) { + bracketedCommentLevel -= 1 + leavingBracketedComment = false + } + + if (line.charAt(index) == '\'' && !insideComment) { // take a look to see if it is escaped - if (!escape) { + // See the comment above about SPARK-31595 + if (!escape && !insideDoubleQuote) { // flip the boolean variable insideSingleQuote = !insideSingleQuote } - } else if (line.charAt(index) == '\"') { + } else if (line.charAt(index) == '\"' && !insideComment) { // take a look to see if it is escaped - if (!escape) { + // See the comment above about SPARK-31595 + if (!escape && !insideSingleQuote) { // flip the boolean variable insideDoubleQuote = !insideDoubleQuote } + } else if (line.charAt(index) == '-') { + val hasNext = index + 1 < line.length + if (insideDoubleQuote || insideSingleQuote || insideComment) { + // Ignores '-' in any case of quotes or comment. + // Avoids to start a comment(--) within a quoted segment or already in a comment. + // Sample query: select "quoted value --" + // ^^ avoids starting a comment if it's inside quotes. + } else if (hasNext && line.charAt(index + 1) == '-') { + // ignore quotes and ; in simple comment + insideSimpleComment = true + } } else if (line.charAt(index) == ';') { - if (insideSingleQuote || insideDoubleQuote) { + if (insideSingleQuote || insideDoubleQuote || insideComment) { // do not split } else { - // split, do not include ; itself - ret.add(line.substring(beginIndex, index)) + if (isStatement) { + // split, do not include ; itself + ret.add(line.substring(beginIndex, index)) + } beginIndex = index + 1 + isStatement = false + } + } else if (line.charAt(index) == '\n') { + // with a new line the inline simple comment should end. 
+ if (!escape) { + insideSimpleComment = false + } + } else if (line.charAt(index) == '/' && !insideSimpleComment) { + val hasNext = index + 1 < line.length + if (insideSingleQuote || insideDoubleQuote) { + // Ignores '/' in any case of quotes + } else if (insideBracketedComment && line.charAt(index - 1) == '*' ) { + // Decrements `bracketedCommentLevel` at the beginning of the next loop + leavingBracketedComment = true + } else if (hasNext && !insideBracketedComment && line.charAt(index + 1) == '*') { + bracketedCommentLevel += 1 } - } else { - // nothing to do } // set the escape if (escape) { @@ -542,8 +602,12 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { } else if (line.charAt(index) == '\\') { escape = true } + + isStatement = statementInProgress(index) + } + if (isStatement) { + ret.add(line.substring(beginIndex)) } - ret.add(line.substring(beginIndex)) ret } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala index 362ac362e9718..12fba0eae6dce 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala @@ -61,7 +61,7 @@ private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlCont try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) - hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { + hiveResponse = SQLExecution.withNewExecutionId(execution) { hiveResultString(execution.executedPlan) } tableSchema = getResultSetSchema(execution) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index b3171897141c2..e10e7ed1a2769 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -78,7 +78,6 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: val ctx = sparkSqlOperationManager.sessionToContexts.getOrDefault(sessionHandle, sqlContext) ctx.sparkSession.sessionState.catalog.getTempViewNames().foreach(ctx.uncacheTable) super.closeSession(sessionHandle) - sparkSqlOperationManager.sessionToActivePool.remove(sessionHandle) sparkSqlOperationManager.sessionToContexts.remove(sessionHandle) } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala index 3396560f43502..bc9c13eb0d4f8 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala @@ -38,7 +38,6 @@ private[thriftserver] class SparkSQLOperationManager() val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") - val sessionToActivePool = new ConcurrentHashMap[SessionHandle, 
String]() val sessionToContexts = new ConcurrentHashMap[SessionHandle, SQLContext]() override def newExecuteStatementOperation( @@ -51,8 +50,8 @@ private[thriftserver] class SparkSQLOperationManager() s" initialized or had already closed.") val conf = sqlContext.sessionState.conf val runInBackground = async && conf.getConf(HiveUtils.HIVE_THRIFT_SERVER_ASYNC) - val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, - runInBackground)(sqlContext, sessionToActivePool) + val operation = new SparkExecuteStatementOperation( + sqlContext, parentSession, statement, confOverlay, runInBackground) handleToOperation.put(operation.getHandle, operation) logDebug(s"Created Operation for $statement with session=$parentSession, " + s"runInBackground=$runInBackground") diff --git a/sql/hive-thriftserver/src/test/noclasspath/hive-site.xml b/sql/hive-thriftserver/src/test/noclasspath/hive-site.xml new file mode 100644 index 0000000000000..d0bf04d36b00d --- /dev/null +++ b/sql/hive-thriftserver/src/test/noclasspath/hive-site.xml @@ -0,0 +1,30 @@ + + + + + + + hive.in.test + true + Internal marker for test. + + + hive.metastore.warehouse.dir + /tmp/hive_one + + diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index 6609701be0ede..4a075d301b601 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -27,22 +27,23 @@ import scala.concurrent.Promise import scala.concurrent.duration._ import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.scalatest.BeforeAndAfterAll +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.test.HiveTestJars +import org.apache.spark.sql.internal.StaticSQLConf import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.spark.util.{ThreadUtils, Utils} /** - * A test suite for the `spark-sql` CLI tool. Note that all test cases share the same temporary - * Hive metastore and warehouse. + * A test suite for the `spark-sql` CLI tool. */ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { val warehousePath = Utils.createTempDir() val metastorePath = Utils.createTempDir() val scratchDirPath = Utils.createTempDir() + val sparkWareHouseDir = Utils.createTempDir() override def beforeAll(): Unit = { super.beforeAll() @@ -53,9 +54,9 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { override def afterAll(): Unit = { try { - warehousePath.delete() - metastorePath.delete() - scratchDirPath.delete() + Utils.deleteRecursively(warehousePath) + Utils.deleteRecursively(metastorePath) + Utils.deleteRecursively(scratchDirPath) } finally { super.afterAll() } @@ -70,30 +71,63 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { * is taken as an immediate error condition. That is: if a line containing * with one of these strings is found, fail the test immediately. * The default value is `Seq("Error:")` + * @param maybeWarehouse an option for warehouse path, which will be set via + * `hive.metastore.warehouse.dir`. 
+ * @param useExternalHiveFile whether to load the hive-site.xml from `src/test/noclasspath` or + * not, disabled by default + * @param metastore which path the embedded derby database for metastore locates. Use the the + * global `metastorePath` by default * @param queriesAndExpectedAnswers one or more tuples of query + answer */ def runCliWithin( timeout: FiniteDuration, extraArgs: Seq[String] = Seq.empty, - errorResponses: Seq[String] = Seq("Error:"))( + errorResponses: Seq[String] = Seq("Error:"), + maybeWarehouse: Option[File] = Some(warehousePath), + useExternalHiveFile: Boolean = false, + metastore: File = metastorePath)( queriesAndExpectedAnswers: (String, String)*): Unit = { - val (queries, expectedAnswers) = queriesAndExpectedAnswers.unzip // Explicitly adds ENTER for each statement to make sure they are actually entered into the CLI. - val queriesString = queries.map(_ + "\n").mkString + val queriesString = queriesAndExpectedAnswers.map(_._1 + "\n").mkString + // spark-sql echoes the queries on STDOUT, expect first an echo of the query, then the answer. + val expectedAnswers = queriesAndExpectedAnswers.flatMap { + case (query, answer) => + if (query == "") { + // empty query means a command launched with -e + Seq(answer) + } else { + // spark-sql echoes the submitted queries + val queryEcho = query.split("\n").toList match { + case firstLine :: tail => + s"spark-sql> $firstLine" :: tail.map(l => s" > $l") + } + // longer lines sometimes get split in the output, + // match the first 60 characters of each query line + queryEcho.map(_.take(60)) :+ answer + } + } + val extraHive = if (useExternalHiveFile) { + s"--driver-class-path ${System.getProperty("user.dir")}/src/test/noclasspath" + } else { + "" + } + val warehouseConf = + maybeWarehouse.map(dir => s"--hiveconf ${ConfVars.METASTOREWAREHOUSE}=$dir").getOrElse("") val command = { val cliScript = "../../bin/spark-sql".split("/").mkString(File.separator) - val jdbcUrl = s"jdbc:derby:;databaseName=$metastorePath;create=true" + val jdbcUrl = s"jdbc:derby:;databaseName=$metastore;create=true" s"""$cliScript | --master local | --driver-java-options -Dderby.system.durability=test + | $extraHive | --conf spark.ui.enabled=false | --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$jdbcUrl - | --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath | --hiveconf ${ConfVars.SCRATCHDIR}=$scratchDirPath | --hiveconf conf1=conftest | --hiveconf conf2=1 + | $warehouseConf """.stripMargin.split("\\s+").toSeq ++ extraArgs } @@ -105,10 +139,13 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { def captureOutput(source: String)(line: String): Unit = lock.synchronized { // This test suite sometimes gets extremely slow out of unknown reason on Jenkins. Here we // add a timestamp to provide more diagnosis information. - buffer += s"${new Timestamp(new Date().getTime)} - $source> $line" + val newLine = s"${new Timestamp(new Date().getTime)} - $source> $line" + log.info(newLine) + buffer += newLine // If we haven't found all expected answers and another expected answer comes up... if (next < expectedAnswers.size && line.contains(expectedAnswers(next))) { + log.info(s"$source> found expected output line $next: '${expectedAnswers(next)}'") next += 1 // If all expected answers have been found... 
if (next == expectedAnswers.size) { @@ -136,6 +173,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { try { ThreadUtils.awaitResult(foundAllExpectedAnswers.future, timeout) + log.info("Found all expected output.") } catch { case cause: Throwable => val message = s""" @@ -144,8 +182,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { |======================= |Spark SQL CLI command line: ${command.mkString(" ")} |Exception: $cause - |Executed query $next "${queries(next)}", - |But failed to capture expected output "${expectedAnswers(next)}" within $timeout. + |Failed to capture next expected output "${expectedAnswers(next)}" within $timeout. | |${buffer.mkString("\n")} |=========================== @@ -155,7 +192,82 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { logError(message, cause) fail(message, cause) } finally { - process.destroy() + if (!process.waitFor(1, MINUTES)) { + try { + fail("spark-sql did not exit gracefully.") + } finally { + process.destroy() + } + } + } + } + + test("load warehouse dir from hive-site.xml") { + val metastore = Utils.createTempDir() + metastore.delete() + try { + runCliWithin(1.minute, + maybeWarehouse = None, + useExternalHiveFile = true, + metastore = metastore)( + "desc database default;" -> "hive_one", + "set spark.sql.warehouse.dir;" -> "hive_one") + } finally { + Utils.deleteRecursively(metastore) + } + } + + test("load warehouse dir from --hiveconf") { + // --hiveconf will overrides hive-site.xml + runCliWithin(2.minute, useExternalHiveFile = true)( + "desc database default;" -> warehousePath.getAbsolutePath, + "create database cliTestDb;" -> "", + "desc database cliTestDb;" -> warehousePath.getAbsolutePath, + "set spark.sql.warehouse.dir;" -> warehousePath.getAbsolutePath) + } + + test("load warehouse dir from --conf spark(.hadoop).hive.*") { + // override conf from hive-site.xml + val metastore = Utils.createTempDir() + metastore.delete() + try { + runCliWithin(2.minute, + extraArgs = + Seq("--conf", s"spark.hadoop.${ConfVars.METASTOREWAREHOUSE}=$sparkWareHouseDir"), + maybeWarehouse = None, + useExternalHiveFile = true, + metastore = metastore)( + "desc database default;" -> sparkWareHouseDir.getAbsolutePath, + "create database cliTestDb;" -> "", + "desc database cliTestDb;" -> sparkWareHouseDir.getAbsolutePath, + "set spark.sql.warehouse.dir;" -> sparkWareHouseDir.getAbsolutePath) + + // override conf from --hiveconf too + runCliWithin(2.minute, + extraArgs = Seq("--conf", s"spark.${ConfVars.METASTOREWAREHOUSE}=$sparkWareHouseDir"), + metastore = metastore)( + "desc database default;" -> sparkWareHouseDir.getAbsolutePath, + "create database cliTestDb;" -> "", + "desc database cliTestDb;" -> sparkWareHouseDir.getAbsolutePath, + "set spark.sql.warehouse.dir;" -> sparkWareHouseDir.getAbsolutePath) + } finally { + Utils.deleteRecursively(metastore) + } + } + + test("load warehouse dir from spark.sql.warehouse.dir") { + // spark.sql.warehouse.dir overrides all hive ones + val metastore = Utils.createTempDir() + metastore.delete() + try { + runCliWithin(2.minute, + extraArgs = Seq( + "--conf", s"${StaticSQLConf.WAREHOUSE_PATH.key}=${sparkWareHouseDir}1", + "--conf", s"spark.hadoop.${ConfVars.METASTOREWAREHOUSE}=${sparkWareHouseDir}2"), + metastore = metastore)( + "desc database default;" -> sparkWareHouseDir.getAbsolutePath.concat("1")) + } finally { + Utils.deleteRecursively(metastore) } } @@ -168,7 +280,8 @@ class CliSuite extends SparkFunSuite with 
BeforeAndAfterAll with Logging { -> "", "SHOW TABLES;" -> "hive_test", - s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE hive_test;" + s"""LOAD DATA LOCAL INPATH '$dataFilePath' + |OVERWRITE INTO TABLE hive_test;""".stripMargin -> "", "CACHE TABLE hive_test;" -> "", @@ -185,18 +298,18 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { test("Single command with --database") { runCliWithin(2.minute)( - "CREATE DATABASE hive_test_db;" + "CREATE DATABASE hive_db_test;" -> "", - "USE hive_test_db;" + "USE hive_db_test;" -> "", - "CREATE TABLE hive_test(key INT, val STRING);" + "CREATE TABLE hive_table_test(key INT, val STRING);" -> "", "SHOW TABLES;" - -> "hive_test" + -> "hive_table_test" ) - runCliWithin(2.minute, Seq("--database", "hive_test_db", "-e", "SHOW TABLES;"))( - "" -> "hive_test" + runCliWithin(2.minute, Seq("--database", "hive_db_test", "-e", "SHOW TABLES;"))( + "" -> "hive_table_test" ) } @@ -208,12 +321,12 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { runCliWithin(3.minute, Seq("--jars", s"$jarFile"))( """CREATE TABLE t1(key string, val string) - |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; - """.stripMargin + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';""".stripMargin -> "", "CREATE TABLE sourceTable (key INT, val STRING) USING hive;" -> "", - s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTable;" + s"""LOAD DATA LOCAL INPATH '$dataFilePath' + |OVERWRITE INTO TABLE sourceTable;""".stripMargin -> "", "INSERT INTO TABLE t1 SELECT key, val FROM sourceTable;" -> "", @@ -234,12 +347,12 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { 3.minute, Seq("--conf", s"spark.hadoop.${ConfVars.HIVEAUXJARS}=$hiveContribJar"))( """CREATE TABLE addJarWithHiveAux(key string, val string) - |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; - """.stripMargin + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';""".stripMargin -> "", "CREATE TABLE sourceTableForWithHiveAux (key INT, val STRING) USING hive;" -> "", - s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTableForWithHiveAux;" + s"""LOAD DATA LOCAL INPATH '$dataFilePath' + |OVERWRITE INTO TABLE sourceTableForWithHiveAux;""".stripMargin -> "", "INSERT INTO TABLE addJarWithHiveAux SELECT key, val FROM sourceTableForWithHiveAux;" -> "", @@ -308,19 +421,6 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { ) } - test("SPARK-21451: spark.sql.warehouse.dir should respect options in --hiveconf") { - runCliWithin(1.minute)("set spark.sql.warehouse.dir;" -> warehousePath.getAbsolutePath) - } - - test("SPARK-21451: Apply spark.hadoop.* configurations") { - val tmpDir = Utils.createTempDir(namePrefix = "SPARK-21451") - runCliWithin( - 1.minute, - Seq("--conf", s"spark.hadoop.${ConfVars.METASTOREWAREHOUSE}=$tmpDir"))( - "set spark.sql.warehouse.dir;" -> tmpDir.getAbsolutePath) - tmpDir.delete() - } - test("Support hive.aux.jars.path") { val hiveContribJar = HiveTestJars.getHiveContribJar().getCanonicalPath runCliWithin( @@ -367,12 +467,12 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { 3.minute)( s"ADD JAR ${hiveContribJar};" -> "", """CREATE TABLE addJarWithSQL(key string, val string) - |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; - """.stripMargin + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe';""".stripMargin -> "", "CREATE TABLE sourceTableForWithSQL(key INT, val STRING) 
USING hive;" -> "", - s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTableForWithSQL;" + s"""LOAD DATA LOCAL INPATH '$dataFilePath' + |OVERWRITE INTO TABLE sourceTableForWithSQL;""".stripMargin -> "", "INSERT INTO TABLE addJarWithSQL SELECT key, val FROM sourceTableForWithSQL;" -> "", @@ -400,4 +500,78 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { -> "1.000000000000000000" ) } + + test("SPARK-30049 Should not complain for quotes in commented lines") { + runCliWithin(1.minute)( + """SELECT concat('test', 'comment') -- someone's comment here + |;""".stripMargin -> "testcomment" + ) + } + + test("SPARK-31102 spark-sql fails to parse when contains comment") { + runCliWithin(1.minute)( + """SELECT concat('test', 'comment'), + | -- someone's comment here + | 2;""".stripMargin -> "testcomment" + ) + } + + test("SPARK-30049 Should not complain for quotes in commented with multi-lines") { + runCliWithin(1.minute)( + """SELECT concat('test', 'comment') -- someone's comment here \ + | comment continues here with single ' quote \ + | extra ' \ + |;""".stripMargin -> "testcomment" + ) + } + + test("SPARK-31595 Should allow unescaped quote mark in quoted string") { + runCliWithin(1.minute)( + "SELECT '\"legal string a';select 1 + 234;".stripMargin -> "235" + ) + runCliWithin(1.minute)( + "SELECT \"legal 'string b\";select 22222 + 1;".stripMargin -> "22223" + ) + } + + test("AnalysisException with root cause will be printStacktrace") { + // If it is not in silent mode, will print the stacktrace + runCliWithin( + 1.minute, + extraArgs = Seq("--hiveconf", "hive.session.silent=false", + "-e", "select date_sub(date'2011-11-11', '1.2');"), + errorResponses = Seq("NumberFormatException"))( + ("", "Error in query: The second argument of 'date_sub' function needs to be an integer."), + ("", "NumberFormatException: invalid input syntax for type numeric: 1.2")) + // If it is in silent mode, will print the error message only + runCliWithin( + 1.minute, + extraArgs = Seq("--conf", "spark.hive.session.silent=true", + "-e", "select date_sub(date'2011-11-11', '1.2');"), + errorResponses = Seq("AnalysisException"))( + ("", "Error in query: The second argument of 'date_sub' function needs to be an integer.")) + } + + test("SPARK-33100: Ignore a semicolon inside a bracketed comment in spark-sql") { + runCliWithin(4.minute)( + "/* SELECT 'test';*/ SELECT 'test';" -> "test", + ";;/* SELECT 'test';*/ SELECT 'test';" -> "test", + "/* SELECT 'test';*/;; SELECT 'test';" -> "test", + "SELECT 'test'; -- SELECT 'test';" -> "test", + "SELECT 'test'; /* SELECT 'test';*/;" -> "test", + "/*$meta chars{^\\;}*/ SELECT 'test';" -> "test", + "/*\nmulti-line\n*/ SELECT 'test';" -> "test", + "/*/* multi-level bracketed*/ SELECT 'test';" -> "test" + ) + } + + test("SPARK-33100: test sql statements with hint in bracketed comment") { + runCliWithin(2.minute)( + "CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES(1, 2) AS t1(k, v);" -> "", + "CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES(2, 1) AS t2(k, v);" -> "", + "EXPLAIN SELECT /*+ MERGEJOIN(t1) */ t1.* FROM t1 JOIN t2 ON t1.k = t2.v;" -> "SortMergeJoin", + "EXPLAIN SELECT /* + MERGEJOIN(t1) */ t1.* FROM t1 JOIN t2 ON t1.k = t2.v;" + -> "BroadcastHashJoin" + ) + } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala index d056b3b2153cf..4564c2209a931 100644 --- 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.util.QueryExecutionListener class DummyQueryExecutionListener extends QueryExecutionListener { override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {} - override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} } class DummyStreamingQueryListener extends StreamingQueryListener { diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 84eed7b2eda22..396e7f9d397db 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -545,7 +545,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { } if (HiveUtils.isHive23) { - assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.6")) + assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.7")) } else { assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("1.2.1")) } @@ -562,7 +562,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { } if (HiveUtils.isHive23) { - assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.6")) + assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("2.3.7")) } else { assert(conf.get(HiveUtils.FAKE_HIVE_VERSION.key) === Some("1.2.1")) } @@ -771,6 +771,97 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { client.closeSession(sessionHandle) } } + + test("SPARK-29492: use add jar in sync mode") { + withCLIServiceClient { client => + val user = System.getProperty("user.name") + val sessionHandle = client.openSession(user, "") + withJdbcStatement("smallKV", "addJar") { statement => + val confOverlay = new java.util.HashMap[java.lang.String, java.lang.String] + val jarFile = HiveTestJars.getHiveHcatalogCoreJar().getCanonicalPath + + Seq(s"ADD JAR $jarFile", + "CREATE TABLE smallKV(key INT, val STRING) USING hive", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE smallKV") + .foreach(query => client.executeStatement(sessionHandle, query, confOverlay)) + + client.executeStatement(sessionHandle, + """CREATE TABLE addJar(key string) + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' + """.stripMargin, confOverlay) + + client.executeStatement(sessionHandle, + "INSERT INTO TABLE addJar SELECT 'k1' as key FROM smallKV limit 1", confOverlay) + + val operationHandle = client.executeStatement( + sessionHandle, + "SELECT key FROM addJar", + confOverlay) + + // Fetch result first time + assertResult(1, "Fetching result first time from next row") { + + val rows_next = client.fetchResults( + operationHandle, + FetchOrientation.FETCH_NEXT, + 1000, + FetchType.QUERY_OUTPUT) + rows_next.numRows() + } + } + } + } + + test("SPARK-31859 Thriftserver works with spark.sql.datetime.java8API.enabled=true") { + withJdbcStatement() { st => + st.execute("set spark.sql.datetime.java8API.enabled=true") + val rs = st.executeQuery("select date '2020-05-28', timestamp '2020-05-28 00:00:00'") + rs.next() + 
assert(rs.getDate(1).toString() == "2020-05-28") + assert(rs.getTimestamp(2).toString() == "2020-05-28 00:00:00.0") + } + } + + test("SPARK-31861 Thriftserver respects spark.sql.session.timeZone") { + withJdbcStatement() { st => + st.execute("set spark.sql.session.timeZone=+03:15") // different than Thriftserver's JVM tz + val rs = st.executeQuery("select timestamp '2020-05-28 10:00:00'") + rs.next() + // The timestamp as string is the same as the literal + assert(rs.getString(1) == "2020-05-28 10:00:00.0") + // Parsing it to java.sql.Timestamp in the client will always result in a timestamp + // in client default JVM timezone. The string value of the Timestamp will match the literal, + // but if the JDBC application cares about the internal timezone and UTC offset of the + // Timestamp object, it should set spark.sql.session.timeZone to match its client JVM tz. + assert(rs.getTimestamp(1).toString() == "2020-05-28 10:00:00.0") + } + } + + test("SPARK-31863 Session conf should persist between Thriftserver worker threads") { + val iter = 20 + withJdbcStatement() { statement => + // date 'now' is resolved during parsing, and relies on SQLConf.get to + // obtain the current set timezone. We exploit this to run this test. + // If the timezones are set correctly to 25 hours apart across threads, + // the dates should reflect this. + + // iterate a few times for the odd chance the same thread is selected + for (_ <- 0 until iter) { + statement.execute("SET spark.sql.session.timeZone=GMT-12") + val firstResult = statement.executeQuery("SELECT date 'now'") + firstResult.next() + val beyondDateLineWest = firstResult.getDate(1) + + statement.execute("SET spark.sql.session.timeZone=GMT+13") + val secondResult = statement.executeQuery("SELECT date 'now'") + secondResult.next() + val dateLineEast = secondResult.getDate(1) + assert( + dateLineEast after beyondDateLineWest, + "SQLConf changes should persist across execution threads") + } + } + } } class SingleSessionSuite extends HiveThriftJdbcTest { @@ -780,7 +871,7 @@ class SingleSessionSuite extends HiveThriftJdbcTest { s"--conf ${HIVE_THRIFT_SERVER_SINGLESESSION.key}=true" :: Nil test("share the temporary functions across JDBC connections") { - withMultipleConnectionJdbcStatement()( + withMultipleConnectionJdbcStatement("test_udtf")( { statement => val jarPath = "../hive/src/test/resources/TestUDTF.jar" val jarURL = s"file://${System.getProperty("user.dir")}/$jarPath" @@ -884,6 +975,39 @@ class SingleSessionSuite extends HiveThriftJdbcTest { } } +class HiveThriftCleanUpScratchDirSuite extends HiveThriftJdbcTest{ + var tempScratchDir: File = _ + + override protected def beforeAll(): Unit = { + tempScratchDir = Utils.createTempDir() + tempScratchDir.setWritable(true, false) + assert(tempScratchDir.list().isEmpty) + new File(tempScratchDir.getAbsolutePath + File.separator + "SPARK-31626").createNewFile() + assert(tempScratchDir.list().nonEmpty) + super.beforeAll() + } + + override def mode: ServerMode.Value = ServerMode.binary + + override protected def extraConf: Seq[String] = + s" --hiveconf ${ConfVars.HIVE_START_CLEANUP_SCRATCHDIR}=true " :: + s"--hiveconf ${ConfVars.SCRATCHDIR}=${tempScratchDir.getAbsolutePath}" :: Nil + + test("Cleanup the Hive scratchdir when starting the Hive Server") { + assert(!tempScratchDir.exists()) + withJdbcStatement() { statement => + val rs = statement.executeQuery("SELECT id FROM range(1)") + assert(rs.next()) + assert(rs.getLong(1) === 0L) + } + } + + override protected def afterAll(): Unit = { + 
Utils.deleteRecursively(tempScratchDir) + super.afterAll() + } +} + class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { override def mode: ServerMode.Value = ServerMode.http diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala index ce610098156f3..e002bc0117c8b 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala @@ -19,29 +19,25 @@ package org.apache.spark.sql.hive.thriftserver import java.sql.{DriverManager, Statement} +import scala.collection.JavaConverters._ import scala.concurrent.duration._ -import scala.util.{Random, Try} +import scala.util.Try import org.apache.hadoop.hive.conf.HiveConf.ConfVars +import org.apache.hive.service.cli.thrift.ThriftCLIService -import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSparkSession trait SharedThriftServer extends SharedSparkSession { private var hiveServer2: HiveThriftServer2 = _ + private var serverPort: Int = 0 override def beforeAll(): Unit = { super.beforeAll() - // Chooses a random port between 10000 and 19999 - var listeningPort = 10000 + Random.nextInt(10000) - // Retries up to 3 times with different port numbers if the server fails to start - (1 to 3).foldLeft(Try(startThriftServer(listeningPort, 0))) { case (started, attempt) => - started.orElse { - listeningPort += 1 - Try(startThriftServer(listeningPort, attempt)) - } + (1 to 3).foldLeft(Try(startThriftServer(0))) { case (started, attempt) => + started.orElse(Try(startThriftServer(attempt))) }.recover { case cause: Throwable => throw cause @@ -59,8 +55,7 @@ trait SharedThriftServer extends SharedSparkSession { protected def withJdbcStatement(fs: (Statement => Unit)*): Unit = { val user = System.getProperty("user.name") - - val serverPort = hiveServer2.getHiveConf.get(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname) + require(serverPort != 0, "Failed to bind an actual port for HiveThriftServer2") val connections = fs.map { _ => DriverManager.getConnection(s"jdbc:hive2://localhost:$serverPort", user, "") } val statements = connections.map(_.createStatement()) @@ -73,11 +68,19 @@ trait SharedThriftServer extends SharedSparkSession { } } - private def startThriftServer(port: Int, attempt: Int): Unit = { - logInfo(s"Trying to start HiveThriftServer2: port=$port, attempt=$attempt") + private def startThriftServer(attempt: Int): Unit = { + logInfo(s"Trying to start HiveThriftServer2: attempt=$attempt") val sqlContext = spark.newSession().sqlContext - sqlContext.setConf(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, port.toString) + // Set the HIVE_SERVER2_THRIFT_PORT to 0, so it could randomly pick any free port to use. + // It's much more robust than setting a random port generated by ourselves ahead of time. + sqlContext.setConf(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, "0") hiveServer2 = HiveThriftServer2.startWithContext(sqlContext) + hiveServer2.getServices.asScala.foreach { + case t: ThriftCLIService if t.getPortNumber != 0 => + serverPort = t.getPortNumber + logInfo(s"Started HiveThriftServer2: port=$serverPort, attempt=$attempt") + case _ => + } // Wait for thrift server to be ready to serve the query, via executing simple query // till the query succeeds. See SPARK-30345 for more details.
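A minimal, self-contained sketch of the ephemeral-port idiom the test harness above now relies on: bind to port 0 so the OS picks a free port, then read back the port that was actually bound instead of guessing one up front. This is illustrative only and uses plain java.net.ServerSocket rather than HiveServer2; EphemeralPortSketch is a hypothetical name, not part of the patch.

import java.net.ServerSocket

object EphemeralPortSketch {
  def main(args: Array[String]): Unit = {
    val socket = new ServerSocket(0) // 0 asks the OS for any free ephemeral port
    try {
      val boundPort = socket.getLocalPort // the concrete port the OS actually bound
      require(boundPort != 0, "Failed to bind an actual port")
      println(s"Bound to port $boundPort")
    } finally {
      socket.close()
    }
  }
}

The same idea appears in the ThriftBinaryCLIService and ThriftHttpCLIService hunks later in this patch, which overwrite portNum with the port reported by the bound socket or connector.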
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperationSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperationSuite.scala index 13df3fabc4919..4c2f29e0bf394 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperationSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperationSuite.scala @@ -17,10 +17,25 @@ package org.apache.spark.sql.hive.thriftserver +import java.util +import java.util.concurrent.Semaphore + +import scala.concurrent.duration._ + +import org.apache.hadoop.hive.conf.HiveConf +import org.apache.hive.service.cli.OperationState +import org.apache.hive.service.cli.session.{HiveSession, HiveSessionImpl} +import org.mockito.Mockito.{doReturn, mock, spy, when, RETURNS_DEEP_STUBS} +import org.mockito.invocation.InvocationOnMock + import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.hive.thriftserver.ui.HiveThriftServer2EventManager +import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, NullType, StringType, StructField, StructType} -class SparkExecuteStatementOperationSuite extends SparkFunSuite { +class SparkExecuteStatementOperationSuite extends SparkFunSuite with SharedSparkSession { + test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") { val field1 = StructField("NULL", NullType) val field2 = StructField("(IF(true, NULL, NULL))", NullType) @@ -42,4 +57,68 @@ class SparkExecuteStatementOperationSuite extends SparkFunSuite { assert(columns.get(1).getType().getName == "INT") assert(columns.get(1).getComment() == "") } + + Seq( + (OperationState.CANCELED, (_: SparkExecuteStatementOperation).cancel()), + (OperationState.CLOSED, (_: SparkExecuteStatementOperation).close()) + ).foreach { case (finalState, transition) => + test("SPARK-32057 SparkExecuteStatementOperation should not transiently become ERROR " + + s"before being set to $finalState") { + val hiveSession = new HiveSessionImpl(ThriftserverShimUtils.testedProtocolVersions.head, + "username", "password", new HiveConf, "ip address") + hiveSession.open(new util.HashMap) + + HiveThriftServer2.eventManager = mock(classOf[HiveThriftServer2EventManager]) + + val spySqlContext = spy(sqlContext) + + // When cancel() is called on the operation, cleanup causes an exception to be thrown inside + // of execute(). This should not cause the state to become ERROR. The exception here will be + // triggered in our custom cleanup(). 
+ val signal = new Semaphore(0) + val dataFrame = mock(classOf[DataFrame], RETURNS_DEEP_STUBS) + when(dataFrame.collect()).thenAnswer((_: InvocationOnMock) => { + signal.acquire() + throw new RuntimeException("Operation was cancelled by test cleanup.") + }) + val statement = "stmt" + doReturn(dataFrame, Nil: _*).when(spySqlContext).sql(statement) + + val executeStatementOperation = new MySparkExecuteStatementOperation(spySqlContext, + hiveSession, statement, signal, finalState) + + val run = new Thread() { + override def run(): Unit = executeStatementOperation.runInternal() + } + assert(executeStatementOperation.getStatus.getState === OperationState.INITIALIZED) + run.start() + eventually(timeout(5.seconds)) { + assert(executeStatementOperation.getStatus.getState === OperationState.RUNNING) + } + transition(executeStatementOperation) + run.join() + assert(executeStatementOperation.getStatus.getState === finalState) + } + } + + private class MySparkExecuteStatementOperation( + sqlContext: SQLContext, + hiveSession: HiveSession, + statement: String, + signal: Semaphore, + finalState: OperationState) + extends SparkExecuteStatementOperation(sqlContext, hiveSession, statement, + new util.HashMap, false) { + + override def cleanup(): Unit = { + super.cleanup() + signal.release() + // At this point, operation should already be in finalState (set by either close() or + // cancel()). We want to check if it stays in finalState after the exception thrown by + // releasing the semaphore propagates. We hence need to sleep for a short while. + Thread.sleep(1000) + // State should not be ERROR + assert(getStatus.getState === finalState) + } + } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala index ffd1fc48f19fe..f28faea2be868 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive.thriftserver +import org.apache.commons.io.FileUtils import test.custom.listener.{DummyQueryExecutionListener, DummyStreamingQueryListener} import org.apache.spark.SparkFunSuite @@ -25,10 +26,19 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hive.HiveUtils.{HIVE_METASTORE_JARS, HIVE_METASTORE_VERSION} import org.apache.spark.sql.hive.test.TestHiveContext import org.apache.spark.sql.internal.StaticSQLConf.{QUERY_EXECUTION_LISTENERS, STREAMING_QUERY_LISTENERS, WAREHOUSE_PATH} +import org.apache.spark.util.Utils class SparkSQLEnvSuite extends SparkFunSuite { test("SPARK-29604 external listeners should be initialized with Spark classloader") { + val metastorePath = Utils.createTempDir("spark_derby") + FileUtils.forceDelete(metastorePath) + + val jdbcUrl = s"jdbc:derby:;databaseName=$metastorePath;create=true" + withSystemProperties( + "javax.jdo.option.ConnectionURL" -> jdbcUrl, + "derby.system.durability" -> "test", + "spark.ui.enabled" -> "false", QUERY_EXECUTION_LISTENERS.key -> classOf[DummyQueryExecutionListener].getCanonicalName, STREAMING_QUERY_LISTENERS.key -> classOf[DummyStreamingQueryListener].getCanonicalName, WAREHOUSE_PATH.key -> TestHiveContext.makeWarehouseDir().toURI.getPath, diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala index a63b5dac0aac3..fd45e7a48c0eb 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala @@ -192,6 +192,11 @@ class SparkThriftServerProtocolVersionsSuite extends HiveThriftJdbcTest { assert(rs.next()) assert(rs.getBigDecimal(1) === new java.math.BigDecimal("1.00")) } + testExecuteStatementWithProtocolVersion(version, + "SELECT cast(null as decimal) ") { rs => + assert(rs.next()) + assert(rs.getBigDecimal(1) === null) + } } test(s"$version get string type") { diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index d9ac9ab441f0c..8b16674f05a84 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -23,6 +23,7 @@ import java.util.{Locale, MissingFormatArgumentException} import scala.util.control.NonFatal +import org.apache.commons.io.FileUtils import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.spark.SparkException @@ -32,6 +33,7 @@ import org.apache.spark.sql.catalyst.util.fileToString import org.apache.spark.sql.execution.HiveResult import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.spark.tags.GitHubActionsUnstableTest /** * Re-run all the tests in SQLQueryTestSuite via Thrift Server. @@ -51,8 +53,18 @@ import org.apache.spark.sql.types._ * 2. Support DESC command. * 3. Support SHOW command. */ +@GitHubActionsUnstableTest class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServer { + override protected def testFile(fileName: String): String = { + val url = Thread.currentThread().getContextClassLoader.getResource(fileName) + // Copy to avoid URISyntaxException during accessing the resources in `sql/core` + val file = File.createTempFile("thriftserver-test", ".data") + file.deleteOnExit() + FileUtils.copyURLToFile(url, file) + file.getAbsolutePath + } + /** List of test cases to ignore, in lower cases. */ override def blackList: Set[String] = super.blackList ++ Set( // Missing UDF @@ -80,8 +92,6 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ // We do not test with configSet. withJdbcStatement { statement => - loadTestData(statement) - configSet.foreach { case (k, v) => statement.execute(s"SET $k = $v") } @@ -262,61 +272,6 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ } } - /** Load built-in test tables. 
*/ - private def loadTestData(statement: Statement): Unit = { - // Prepare the data - statement.execute( - """ - |CREATE OR REPLACE TEMPORARY VIEW testdata as - |SELECT id AS key, CAST(id AS string) AS value FROM range(1, 101) - """.stripMargin) - statement.execute( - """ - |CREATE OR REPLACE TEMPORARY VIEW arraydata as - |SELECT * FROM VALUES - |(ARRAY(1, 2, 3), ARRAY(ARRAY(1, 2, 3))), - |(ARRAY(2, 3, 4), ARRAY(ARRAY(2, 3, 4))) AS v(arraycol, nestedarraycol) - """.stripMargin) - statement.execute( - """ - |CREATE OR REPLACE TEMPORARY VIEW mapdata as - |SELECT * FROM VALUES - |MAP(1, 'a1', 2, 'b1', 3, 'c1', 4, 'd1', 5, 'e1'), - |MAP(1, 'a2', 2, 'b2', 3, 'c2', 4, 'd2'), - |MAP(1, 'a3', 2, 'b3', 3, 'c3'), - |MAP(1, 'a4', 2, 'b4'), - |MAP(1, 'a5') AS v(mapcol) - """.stripMargin) - statement.execute( - s""" - |CREATE TEMPORARY VIEW aggtest - | (a int, b float) - |USING csv - |OPTIONS (path '${baseResourcePath.getParent}/test-data/postgresql/agg.data', - | header 'false', delimiter '\t') - """.stripMargin) - statement.execute( - s""" - |CREATE OR REPLACE TEMPORARY VIEW onek - | (unique1 int, unique2 int, two int, four int, ten int, twenty int, hundred int, - | thousand int, twothousand int, fivethous int, tenthous int, odd int, even int, - | stringu1 string, stringu2 string, string4 string) - |USING csv - |OPTIONS (path '${baseResourcePath.getParent}/test-data/postgresql/onek.data', - | header 'false', delimiter '\t') - """.stripMargin) - statement.execute( - s""" - |CREATE OR REPLACE TEMPORARY VIEW tenk1 - | (unique1 int, unique2 int, two int, four int, ten int, twenty int, hundred int, - | thousand int, twothousand int, fivethous int, tenthous int, odd int, even int, - | stringu1 string, stringu2 string, string4 string) - |USING csv - | OPTIONS (path '${baseResourcePath.getParent}/test-data/postgresql/tenk.data', - | header 'false', delimiter '\t') - """.stripMargin) - } - // Returns true if sql is retrieving data. 
private def isNeedSort(sql: String): Boolean = { val upperCase = sql.toUpperCase(Locale.ROOT) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index 3e1fce78ae71c..73547d752024b 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.hive.thriftserver +import org.apache.spark.tags.GitHubActionsUnstableTest + +@GitHubActionsUnstableTest class ThriftServerWithSparkContextSuite extends SharedThriftServer { test("SPARK-29911: Uncache cached tables when session closed") { diff --git a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnValue.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnValue.java index 547c6056b4fe8..462b93a0f09fe 100644 --- a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnValue.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnValue.java @@ -123,22 +123,6 @@ private static TColumnValue stringValue(HiveVarchar value) { return TColumnValue.stringVal(tStringValue); } - private static TColumnValue dateValue(Date value) { - TStringValue tStringValue = new TStringValue(); - if (value != null) { - tStringValue.setValue(value.toString()); - } - return new TColumnValue(TColumnValue.stringVal(tStringValue)); - } - - private static TColumnValue timestampValue(Timestamp value) { - TStringValue tStringValue = new TStringValue(); - if (value != null) { - tStringValue.setValue(value.toString()); - } - return TColumnValue.stringVal(tStringValue); - } - private static TColumnValue stringValue(HiveIntervalYearMonth value) { TStringValue tStrValue = new TStringValue(); if (value != null) { @@ -178,15 +162,16 @@ public static TColumnValue toTColumnValue(Type type, Object value) { case VARCHAR_TYPE: return stringValue((HiveVarchar)value); case DATE_TYPE: - return dateValue((Date)value); case TIMESTAMP_TYPE: - return timestampValue((Timestamp)value); + // SPARK-31859, SPARK-31861: converted to string already in SparkExecuteStatementOperation + return stringValue((String)value); case INTERVAL_YEAR_MONTH_TYPE: return stringValue((HiveIntervalYearMonth) value); case INTERVAL_DAY_TIME_TYPE: return stringValue((HiveIntervalDayTime) value); case DECIMAL_TYPE: - return stringValue(((BigDecimal)value).toPlainString()); + String plainStr = value == null ? null : ((BigDecimal)value).toPlainString(); + return stringValue(plainStr); case BINARY_TYPE: String strVal = value == null ? 
null : UTF8String.fromBytes((byte[])value).toString(); return stringValue(strVal); diff --git a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java index 51bb28748d9e2..4b331423948fa 100644 --- a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java @@ -280,7 +280,10 @@ public void cancel() throws HiveSQLException { throw new UnsupportedOperationException("SQLOperation.cancel()"); } - public abstract void close() throws HiveSQLException; + public void close() throws HiveSQLException { + setState(OperationState.CLOSED); + cleanupOperationLog(); + } public abstract TableSchema getResultSetSchema() throws HiveSQLException; diff --git a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/SessionManager.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/SessionManager.java index 859f9c8b449e5..ad6fb3ba37a0e 100644 --- a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/SessionManager.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/SessionManager.java @@ -148,14 +148,20 @@ public synchronized void start() { } } + private final Object timeoutCheckerLock = new Object(); + private void startTimeoutChecker() { final long interval = Math.max(checkInterval, 3000L); // minimum 3 seconds - Runnable timeoutChecker = new Runnable() { + final Runnable timeoutChecker = new Runnable() { @Override public void run() { - for (sleepInterval(interval); !shutdown; sleepInterval(interval)) { + sleepFor(interval); + while (!shutdown) { long current = System.currentTimeMillis(); for (HiveSession session : new ArrayList(handleToSession.values())) { + if (shutdown) { + break; + } if (sessionTimeout > 0 && session.getLastAccessTime() + sessionTimeout <= current && (!checkOperation || session.getNoOperationTime() > sessionTimeout)) { SessionHandle handle = session.getSessionHandle(); @@ -170,24 +176,34 @@ public void run() { session.closeExpiredOperations(); } } + sleepFor(interval); } } - private void sleepInterval(long interval) { - try { - Thread.sleep(interval); - } catch (InterruptedException e) { - // ignore + private void sleepFor(long interval) { + synchronized (timeoutCheckerLock) { + try { + timeoutCheckerLock.wait(interval); + } catch (InterruptedException e) { + // Ignore, and break. 
+ } } } }; backgroundOperationPool.execute(timeoutChecker); } + private void shutdownTimeoutChecker() { + shutdown = true; + synchronized (timeoutCheckerLock) { + timeoutCheckerLock.notify(); + } + } + @Override public synchronized void stop() { super.stop(); - shutdown = true; + shutdownTimeoutChecker(); if (backgroundOperationPool != null) { backgroundOperationPool.shutdown(); long timeout = hiveConf.getTimeVar( diff --git a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java index 21b8bf7de75ce..e1ee503b81209 100644 --- a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java @@ -76,6 +76,10 @@ public void run() { keyStorePassword, sslVersionBlacklist); } + // In case HIVE_SERVER2_THRIFT_PORT or hive.server2.thrift.port is configured with 0 which + // represents any free port, we should set it to the actual one + portNum = serverSocket.getServerSocket().getLocalPort(); + // Server args int maxMessageSize = hiveConf.getIntVar(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_MAX_MESSAGE_SIZE); int requestTimeout = (int) hiveConf.getTimeVar( diff --git a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java index 504e63dbc5e5e..1099a00b67eb7 100644 --- a/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java @@ -143,6 +143,9 @@ public void run() { // TODO: check defaults: maxTimeout, keepalive, maxBodySize, bodyRecieveDuration, etc. // Finally, start the server httpServer.start(); + // In case HIVE_SERVER2_THRIFT_HTTP_PORT or hive.server2.thrift.http.port is configured with + // 0 which represents any free port, we should set it to the actual one + portNum = connector.getLocalPort(); String msg = "Started " + ThriftHttpCLIService.class.getSimpleName() + " in " + schemeName + " mode on port " + connector.getLocalPort()+ " path=" + httpPath + " with " + minWorkerThreads + "..." 
+ maxWorkerThreads + " worker threads"; diff --git a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnValue.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnValue.java index f881ab159ea67..85adf55df15e0 100644 --- a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnValue.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnValue.java @@ -124,22 +124,6 @@ private static TColumnValue stringValue(HiveVarchar value) { return TColumnValue.stringVal(tStringValue); } - private static TColumnValue dateValue(Date value) { - TStringValue tStringValue = new TStringValue(); - if (value != null) { - tStringValue.setValue(value.toString()); - } - return new TColumnValue(TColumnValue.stringVal(tStringValue)); - } - - private static TColumnValue timestampValue(Timestamp value) { - TStringValue tStringValue = new TStringValue(); - if (value != null) { - tStringValue.setValue(value.toString()); - } - return TColumnValue.stringVal(tStringValue); - } - private static TColumnValue stringValue(HiveIntervalYearMonth value) { TStringValue tStrValue = new TStringValue(); if (value != null) { @@ -181,15 +165,16 @@ public static TColumnValue toTColumnValue(TypeDescriptor typeDescriptor, Object case VARCHAR_TYPE: return stringValue((HiveVarchar)value); case DATE_TYPE: - return dateValue((Date)value); case TIMESTAMP_TYPE: - return timestampValue((Timestamp)value); + // SPARK-31859, SPARK-31861: converted to string already in SparkExecuteStatementOperation + return stringValue((String)value); case INTERVAL_YEAR_MONTH_TYPE: return stringValue((HiveIntervalYearMonth) value); case INTERVAL_DAY_TIME_TYPE: return stringValue((HiveIntervalDayTime) value); case DECIMAL_TYPE: - return stringValue(((BigDecimal)value).toPlainString()); + String plainStr = value == null ? null : ((BigDecimal)value).toPlainString(); + return stringValue(plainStr); case BINARY_TYPE: String strVal = value == null ? 
null : UTF8String.fromBytes((byte[])value).toString(); return stringValue(strVal); diff --git a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java index f26c715add987..558c68f85c16b 100644 --- a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java @@ -298,7 +298,10 @@ public void cancel() throws HiveSQLException { throw new UnsupportedOperationException("SQLOperation.cancel()"); } - public abstract void close() throws HiveSQLException; + public void close() throws HiveSQLException { + setState(OperationState.CLOSED); + cleanupOperationLog(); + } public abstract TableSchema getResultSetSchema() throws HiveSQLException; diff --git a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/SessionManager.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/SessionManager.java index 49221b13bb892..5a381d170b4f9 100644 --- a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/SessionManager.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/SessionManager.java @@ -148,14 +148,20 @@ public synchronized void start() { } } + private final Object timeoutCheckerLock = new Object(); + private void startTimeoutChecker() { final long interval = Math.max(checkInterval, 3000L); // minimum 3 seconds - Runnable timeoutChecker = new Runnable() { + final Runnable timeoutChecker = new Runnable() { @Override public void run() { - for (sleepInterval(interval); !shutdown; sleepInterval(interval)) { + sleepFor(interval); + while (!shutdown) { long current = System.currentTimeMillis(); for (HiveSession session : new ArrayList(handleToSession.values())) { + if (shutdown) { + break; + } if (sessionTimeout > 0 && session.getLastAccessTime() + sessionTimeout <= current && (!checkOperation || session.getNoOperationTime() > sessionTimeout)) { SessionHandle handle = session.getSessionHandle(); @@ -170,24 +176,34 @@ public void run() { session.closeExpiredOperations(); } } + sleepFor(interval); } } - private void sleepInterval(long interval) { - try { - Thread.sleep(interval); - } catch (InterruptedException e) { - // ignore + private void sleepFor(long interval) { + synchronized (timeoutCheckerLock) { + try { + timeoutCheckerLock.wait(interval); + } catch (InterruptedException e) { + // Ignore, and break. 
+ } } } }; backgroundOperationPool.execute(timeoutChecker); } + private void shutdownTimeoutChecker() { + shutdown = true; + synchronized (timeoutCheckerLock) { + timeoutCheckerLock.notify(); + } + } + @Override public synchronized void stop() { super.stop(); - shutdown = true; + shutdownTimeoutChecker(); if (backgroundOperationPool != null) { backgroundOperationPool.shutdown(); long timeout = hiveConf.getTimeVar( diff --git a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java index fc19c65daaf54..a7de9c0f3d0d2 100644 --- a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java @@ -77,6 +77,10 @@ public void run() { keyStorePassword, sslVersionBlacklist); } + // In case HIVE_SERVER2_THRIFT_PORT or hive.server2.thrift.port is configured with 0 which + // represents any free port, we should set it to the actual one + portNum = serverSocket.getServerSocket().getLocalPort(); + // Server args int maxMessageSize = hiveConf.getIntVar(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_MAX_MESSAGE_SIZE); int requestTimeout = (int) hiveConf.getTimeVar( diff --git a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java index 08626e7eb146d..73d5f84476af0 100644 --- a/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java @@ -144,6 +144,9 @@ public void run() { // TODO: check defaults: maxTimeout, keepalive, maxBodySize, bodyRecieveDuration, etc. // Finally, start the server httpServer.start(); + // In case HIVE_SERVER2_THRIFT_HTTP_PORT or hive.server2.thrift.http.port is configured with + // 0 which represents any free port, we should set it to the actual one + portNum = connector.getLocalPort(); String msg = "Started " + ThriftHttpCLIService.class.getSimpleName() + " in " + schemeName + " mode on port " + portNum + " path=" + httpPath + " with " + minWorkerThreads + "..." + maxWorkerThreads + " worker threads"; diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 29825e5116ef9..4112512e56fee 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.hive.execution import java.io.File -import java.util.{Locale, TimeZone} import org.scalatest.BeforeAndAfter @@ -27,22 +26,21 @@ import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy +import org.apache.spark.tags.SlowHiveTest /** * Runs the test cases that are included in the hive distribution. */ +@SlowHiveTest class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // TODO: bundle in jar files... 
get from classpath private lazy val hiveQueryDir = TestHive.getHiveFile( "ql/src/test/queries/clientpositive".split("/").mkString(File.separator)) - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault private val originalColumnBatchSize = TestHive.conf.columnBatchSize private val originalInMemoryPartitionPruning = TestHive.conf.inMemoryPartitionPruning private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled private val originalSessionLocalTimeZone = TestHive.conf.sessionLocalTimeZone - private val originalCreateHiveTable = TestHive.conf.createHiveTableByDefaultEnabled def testCases: Seq[(String, File)] = { hiveQueryDir.listFiles.map(f => f.getName.stripSuffix(".q") -> f) @@ -51,10 +49,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) // Set a relatively small column batch size for testing purposes TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5) // Enable in-memory partition pruning for testing purposes @@ -66,21 +60,16 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles") - TestHive.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, true) RuleExecutor.resetMetrics() } override def afterAll(): Unit = { try { TestHive.setCacheTables(false) - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize) TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) - TestHive.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, - originalCreateHiveTable) // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala index ed23f65815917..2c0970c85449f 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.hive.execution import java.io.File -import java.util.{Locale, TimeZone} import org.scalatest.BeforeAndAfter @@ -33,17 +32,11 @@ import org.apache.spark.util.Utils * files, every `createQueryTest` calls should explicitly set `reset` to `false`. 
*/ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfter { - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault private val testTempDir = Utils.createTempDir() override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) // Create the table used in windowing.q sql("DROP TABLE IF EXISTS part") @@ -103,8 +96,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte override def afterAll(): Unit = { try { TestHive.setCacheTables(false) - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) TestHive.reset() } finally { super.afterAll() @@ -747,17 +738,11 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte class HiveWindowFunctionQueryFileSuite extends HiveCompatibilitySuite with BeforeAndAfter { - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault private val testTempDir = Utils.createTempDir() override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) // The following settings are used for generating golden files with Hive. // We have to use kryo to correctly let Hive serialize plans with window functions. @@ -772,8 +757,6 @@ class HiveWindowFunctionQueryFileSuite override def afterAll(): Unit = { try { TestHive.setCacheTables(false) - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) TestHive.reset() } finally { super.afterAll() diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index c37582386347b..1b4c6dc110e97 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala new file mode 100644 index 0000000000000..02a5117f005e8 --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import org.apache.spark.SparkContext +import org.apache.spark.api.java.JavaSparkContext +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{SparkSession, SQLContext} + + +/** + * An instance of the Spark SQL execution engine that integrates with data stored in Hive. + * Configuration for Hive is read from hive-site.xml on the classpath. + */ +@deprecated("Use SparkSession.builder.enableHiveSupport instead", "2.0.0") +class HiveContext private[hive](_sparkSession: SparkSession) + extends SQLContext(_sparkSession) with Logging { + + self => + + def this(sc: SparkContext) = { + this(SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc)).getOrCreate()) + } + + def this(sc: JavaSparkContext) = this(sc.sc) + + /** + * Returns a new HiveContext as a new session, which will have separate SQLConf, UDF/UDAF, + * temporary tables and SessionState, but share the same CacheManager, IsolatedClientLoader + * and Hive client (both execution and metadata) with the existing HiveContext. + */ + override def newSession(): HiveContext = { + new HiveContext(sparkSession.newSession()) + } + + /** + * Invalidate and refresh all the cached metadata of the given table. For performance reasons, + * Spark SQL or the external data source library it uses might cache certain metadata about a + * table, such as the location of blocks. When those change outside of Spark SQL, users should + * call this function to invalidate the cache. + * + * @since 1.3.0 + */ + def refreshTable(tableName: String): Unit = { + sparkSession.catalog.refreshTable(tableName) + } + +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index ca292f65efeee..2d4e45aecdbb1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -61,6 +61,10 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat import HiveExternalCatalog._ import CatalogTableType._ + // SPARK-32256: Make sure `VersionInfo` is initialized before touching the isolated classloader. + // This is to ensure Hive can get the Hadoop version when using the isolated classloader. + org.apache.hadoop.util.VersionInfo.getVersion() + /** * A Hive client used to interact with the metastore.
*/ @@ -634,7 +638,15 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat k.startsWith(DATASOURCE_PREFIX) || k.startsWith(STATISTICS_PREFIX) || k.startsWith(CREATED_SPARK_VERSION) } - val newTableProps = propsFromOldTable ++ tableDefinition.properties + partitionProviderProp + val newFormatIfExists = tableDefinition.provider.flatMap { p => + if (DDLUtils.isDatasourceTable(tableDefinition)) { + Some(DATASOURCE_PROVIDER -> p) + } else { + None + } + } + val newTableProps = + propsFromOldTable ++ tableDefinition.properties + partitionProviderProp ++ newFormatIfExists // // Add old table's owner if we need to restore val owner = Option(tableDefinition.owner).filter(_.nonEmpty).getOrElse(oldTableDef.owner) @@ -853,6 +865,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat client.listTables(db, pattern) } + override def listViews(db: String, pattern: String): Seq[String] = withClient { + requireDbExists(db) + client.listTablesByType(db, pattern, CatalogTableType.VIEW) + } + override def loadTable( db: String, table: String, @@ -934,9 +951,10 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // Hive metastore is not case preserving and the partition columns are always lower cased. We need // to lower case the column names in partition specification before calling partition related Hive // APIs, to match this behaviour. - private def lowerCasePartitionSpec(spec: TablePartitionSpec): TablePartitionSpec = { + private def toMetaStorePartitionSpec(spec: TablePartitionSpec): TablePartitionSpec = { // scalastyle:off caselocale - spec.map { case (k, v) => k.toLowerCase -> v } + val lowNames = spec.map { case (k, v) => k.toLowerCase -> v } + ExternalCatalogUtils.convertNullPartitionValues(lowNames) // scalastyle:on caselocale } @@ -985,8 +1003,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat } p.copy(storage = p.storage.copy(locationUri = Some(partitionPath.toUri))) } - val lowerCasedParts = partsWithLocation.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) - client.createPartitions(db, table, lowerCasedParts, ignoreIfExists) + val metaStoreParts = partsWithLocation + .map(p => p.copy(spec = toMetaStorePartitionSpec(p.spec))) + client.createPartitions(db, table, metaStoreParts, ignoreIfExists) } override def dropPartitions( @@ -998,7 +1017,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat retainData: Boolean): Unit = withClient { requireTableExists(db, table) client.dropPartitions( - db, table, parts.map(lowerCasePartitionSpec), ignoreIfNotExists, purge, retainData) + db, table, parts.map(toMetaStorePartitionSpec), ignoreIfNotExists, purge, retainData) } override def renamePartitions( @@ -1007,7 +1026,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat specs: Seq[TablePartitionSpec], newSpecs: Seq[TablePartitionSpec]): Unit = withClient { client.renamePartitions( - db, table, specs.map(lowerCasePartitionSpec), newSpecs.map(lowerCasePartitionSpec)) + db, table, specs.map(toMetaStorePartitionSpec), newSpecs.map(toMetaStorePartitionSpec)) val tableMeta = getTable(db, table) val partitionColumnNames = tableMeta.partitionColumnNames @@ -1023,7 +1042,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val fs = tablePath.getFileSystem(hadoopConf) val newParts = newSpecs.map { spec => val rightPath = renamePartitionDirectory(fs, tablePath, partitionColumnNames, spec) - 
val partition = client.getPartition(db, table, lowerCasePartitionSpec(spec)) + val partition = client.getPartition(db, table, toMetaStorePartitionSpec(spec)) partition.copy(storage = partition.storage.copy(locationUri = Some(rightPath.toUri))) } alterPartitions(db, table, newParts) @@ -1133,12 +1152,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat db: String, table: String, newParts: Seq[CatalogTablePartition]): Unit = withClient { - val lowerCasedParts = newParts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) - - val rawTable = getRawTable(db, table) - + val metaStoreParts = newParts.map(p => p.copy(spec = toMetaStorePartitionSpec(p.spec))) // convert partition statistics to properties so that we can persist them through hive api - val withStatsProps = lowerCasedParts.map { p => + val withStatsProps = metaStoreParts.map { p => if (p.stats.isDefined) { val statsProperties = statsToProperties(p.stats.get) p.copy(parameters = p.parameters ++ statsProperties) @@ -1154,7 +1170,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat db: String, table: String, spec: TablePartitionSpec): CatalogTablePartition = withClient { - val part = client.getPartition(db, table, lowerCasePartitionSpec(spec)) + val part = client.getPartition(db, table, toMetaStorePartitionSpec(spec)) restorePartitionMetadata(part, getTable(db, table)) } @@ -1192,7 +1208,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat db: String, table: String, spec: TablePartitionSpec): Option[CatalogTablePartition] = withClient { - client.getPartitionOption(db, table, lowerCasePartitionSpec(spec)).map { part => + client.getPartitionOption(db, table, toMetaStorePartitionSpec(spec)).map { part => restorePartitionMetadata(part, getTable(db, table)) } } @@ -1207,7 +1223,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val catalogTable = getTable(db, table) val partColNameMap = buildLowerCasePartColNameMap(catalogTable).mapValues(escapePathName) val clientPartitionNames = - client.getPartitionNames(catalogTable, partialSpec.map(lowerCasePartitionSpec)) + client.getPartitionNames(catalogTable, partialSpec.map(toMetaStorePartitionSpec)) clientPartitionNames.map { partitionPath => val partSpec = PartitioningUtils.parsePathFragmentAsSeq(partitionPath) partSpec.map { case (partName, partValue) => @@ -1226,11 +1242,12 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table: String, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient { val partColNameMap = buildLowerCasePartColNameMap(getTable(db, table)) - val res = client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => - part.copy(spec = restorePartitionSpec(part.spec, partColNameMap)) + val metaStoreSpec = partialSpec.map(toMetaStorePartitionSpec) + val res = client.getPartitions(db, table, metaStoreSpec) + .map { part => part.copy(spec = restorePartitionSpec(part.spec, partColNameMap)) } - partialSpec match { + metaStoreSpec match { // This might be a bug of Hive: When the partition value inside the partial partition spec // contains dot, and we ask Hive to list partitions w.r.t. 
the partial partition spec, Hive // treats dot as matching any single character and may return more partitions than we diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 0cd9b3641bd4a..16e9014340244 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.hive import java.lang.reflect.{ParameterizedType, Type, WildcardType} -import java.util.concurrent.TimeUnit._ import scala.collection.JavaConverters._ @@ -33,6 +32,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.datasources.DaysWritable import org.apache.spark.sql.types import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -466,7 +466,7 @@ private[hive] trait HiveInspectors { _ => constant case poi: WritableConstantTimestampObjectInspector => val t = poi.getWritableConstantValue - val constant = SECONDS.toMicros(t.getSeconds) + NANOSECONDS.toMicros(t.getNanos) + val constant = DateTimeUtils.fromJavaTimestamp(t.getTimestamp) _ => constant case poi: WritableConstantIntObjectInspector => val constant = poi.getWritableConstantValue.get() @@ -618,7 +618,7 @@ private[hive] trait HiveInspectors { case x: DateObjectInspector if x.preferWritable() => data: Any => { if (data != null) { - DateTimeUtils.fromJavaDate(x.getPrimitiveWritableObject(data).get()) + new DaysWritable(x.getPrimitiveWritableObject(data)).gregorianDays } else { null } @@ -634,8 +634,7 @@ private[hive] trait HiveInspectors { case x: TimestampObjectInspector if x.preferWritable() => data: Any => { if (data != null) { - val t = x.getPrimitiveWritableObject(data) - SECONDS.toMicros(t.getSeconds) + NANOSECONDS.toMicros(t.getNanos) + DateTimeUtils.fromJavaTimestamp(x.getPrimitiveWritableObject(data).getTimestamp) } else { null } @@ -1011,8 +1010,12 @@ private[hive] trait HiveInspectors { new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]]) } - private def getDateWritable(value: Any): hiveIo.DateWritable = - if (value == null) null else new hiveIo.DateWritable(value.asInstanceOf[Int]) + private def getDateWritable(value: Any): DaysWritable = + if (value == null) { + null + } else { + new DaysWritable(value.asInstanceOf[Int]) + } private def getTimestampWritable(value: Any): hiveIo.TimestampWritable = if (value == null) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index b117c582a3e6e..e25610757a69b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -25,7 +25,9 @@ import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{SparkOptimizer, SparkPlanner} +import org.apache.spark.sql.execution.aggregate.ResolveEncodersInScalaAgg import org.apache.spark.sql.execution.analysis.DetectAmbiguousSelfJoin +import org.apache.spark.sql.execution.command.CommandCheck import org.apache.spark.sql.execution.datasources._ 
import org.apache.spark.sql.execution.datasources.v2.TableCapabilityCheck import org.apache.spark.sql.hive.client.HiveClient @@ -75,7 +77,9 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: new FallBackFileSourceV2(session) +: - new ResolveSessionCatalog(catalogManager, conf, catalog.isView) +: + ResolveEncodersInScalaAgg +: + new ResolveSessionCatalog( + catalogManager, conf, catalog.isTempView, catalog.isTempFunction) +: customResolutionRules override val postHocResolutionRules: Seq[Rule[LogicalPlan]] = @@ -92,22 +96,12 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session PreWriteCheck +: PreReadCheck +: TableCapabilityCheck +: + CommandCheck(conf) +: customCheckRules } - /** - * Logical query plan optimizer that takes into account Hive. - */ - override protected def optimizer: Optimizer = { - new SparkOptimizer(catalogManager, catalog, experimentalMethods) { - override def postHocOptimizationBatches: Seq[Batch] = Seq( - Batch("Prune Hive Table Partitions", Once, new PruneHiveTablePartitions(session)) - ) - - override def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = - super.extendedOperatorOptimizationRules ++ customOperatorOptimizationRules - } - } + override def customEarlyScanPushDownRules: Seq[Rule[LogicalPlan]] = + Seq(new PruneHiveTablePartitions(session)) /** * Planner that takes into account Hive-specific strategies. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala index 3beef6b1df457..04a6a8f8aa9a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala @@ -118,9 +118,12 @@ private[hive] object HiveShim { * * @param functionClassName UDF class name * @param instance optional UDF instance which contains additional information (for macro) + * @param clazz optional class instance to create UDF instance */ - private[hive] case class HiveFunctionWrapper(var functionClassName: String, - private var instance: AnyRef = null) extends java.io.Externalizable { + private[hive] case class HiveFunctionWrapper( + var functionClassName: String, + private var instance: AnyRef = null, + private var clazz: Class[_ <: AnyRef] = null) extends java.io.Externalizable { // for Serialization def this() = this(null) @@ -232,8 +235,10 @@ private[hive] object HiveShim { in.readFully(functionInBytes) // deserialize the function object via Hive Utilities + clazz = Utils.getContextOrSparkClassLoader.loadClass(functionClassName) + .asInstanceOf[Class[_ <: AnyRef]] instance = deserializePlan[AnyRef](new java.io.ByteArrayInputStream(functionInBytes), - Utils.getContextOrSparkClassLoader.loadClass(functionClassName)) + clazz) } } @@ -241,8 +246,11 @@ private[hive] object HiveShim { if (instance != null) { instance.asInstanceOf[UDFType] } else { - val func = Utils.getContextOrSparkClassLoader - .loadClass(functionClassName).getConstructor().newInstance().asInstanceOf[UDFType] + if (clazz == null) { + clazz = Utils.getContextOrSparkClassLoader.loadClass(functionClassName) + .asInstanceOf[Class[_ <: AnyRef]] + } + val func = clazz.getConstructor().newInstance().asInstanceOf[UDFType] if (!func.isInstanceOf[UDF]) { // We cache the function if it's no the Simple UDF, // as we always have to create new instance for Simple UDF diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 9c4b8a5819a33..04caf57efdc74 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -61,10 +61,11 @@ private[spark] object HiveUtils extends Logging { /** The version of hive used internally by Spark SQL. */ val builtinHiveVersion: String = if (isHive23) hiveVersion else "1.2.1" - val HIVE_METASTORE_VERSION = buildConf("spark.sql.hive.metastore.version") + val HIVE_METASTORE_VERSION = buildStaticConf("spark.sql.hive.metastore.version") .doc("Version of the Hive metastore. Available options are " + - "0.12.0 through 2.3.6 and " + + "0.12.0 through 2.3.7 and " + "3.0.0 through 3.1.2.") + .version("1.4.0") .stringConf .createWithDefault(builtinHiveVersion) @@ -73,10 +74,10 @@ private[spark] object HiveUtils extends Logging { // already rely on this config. val FAKE_HIVE_VERSION = buildConf("spark.sql.hive.version") .doc(s"deprecated, please use ${HIVE_METASTORE_VERSION.key} to get the Hive version in Spark.") - .stringConf - .createWithDefault(builtinHiveVersion) + .version("1.1.1") + .fallbackConf(HIVE_METASTORE_VERSION) - val HIVE_METASTORE_JARS = buildConf("spark.sql.hive.metastore.jars") + val HIVE_METASTORE_JARS = buildStaticConf("spark.sql.hive.metastore.jars") .doc(s""" | Location of the jars that should be used to instantiate the HiveMetastoreClient. | This property can be one of three options: " @@ -89,12 +90,14 @@ private[spark] object HiveUtils extends Logging { | Use Hive jars of specified version downloaded from Maven repositories. | 3. A classpath in the standard format for both Hive and Hadoop. """.stripMargin) + .version("1.4.0") .stringConf .createWithDefault("builtin") val CONVERT_METASTORE_PARQUET = buildConf("spark.sql.hive.convertMetastoreParquet") .doc("When set to true, the built-in Parquet reader and writer are used to process " + "parquet tables created by using the HiveQL syntax, instead of Hive serde.") + .version("1.1.1") .booleanConf .createWithDefault(true) @@ -103,12 +106,14 @@ private[spark] object HiveUtils extends Logging { .doc("When true, also tries to merge possibly different but compatible Parquet schemas in " + "different Parquet data files. This configuration is only effective " + "when \"spark.sql.hive.convertMetastoreParquet\" is true.") + .version("1.3.1") .booleanConf .createWithDefault(false) val CONVERT_METASTORE_ORC = buildConf("spark.sql.hive.convertMetastoreOrc") .doc("When set to true, the built-in ORC reader and writer are used to process " + "ORC tables created by using the HiveQL syntax, instead of Hive serde.") + .version("2.0.0") .booleanConf .createWithDefault(true) @@ -118,6 +123,7 @@ private[spark] object HiveUtils extends Logging { "`spark.sql.hive.convertMetastoreOrc` is true, the built-in ORC/Parquet writer is used" + "to process inserting into partitioned ORC/Parquet tables created by using the HiveSQL " + "syntax.") + .version("3.0.0") .booleanConf .createWithDefault(true) @@ -126,15 +132,17 @@ private[spark] object HiveUtils extends Logging { "instead of Hive serde in CTAS. 
This flag is effective only if " + "`spark.sql.hive.convertMetastoreParquet` or `spark.sql.hive.convertMetastoreOrc` is " + "enabled respectively for Parquet and ORC formats") + .version("3.0.0") .booleanConf .createWithDefault(true) - val HIVE_METASTORE_SHARED_PREFIXES = buildConf("spark.sql.hive.metastore.sharedPrefixes") + val HIVE_METASTORE_SHARED_PREFIXES = buildStaticConf("spark.sql.hive.metastore.sharedPrefixes") .doc("A comma separated list of class prefixes that should be loaded using the classloader " + "that is shared between Spark SQL and a specific version of Hive. An example of classes " + "that should be shared is JDBC drivers that are needed to talk to the metastore. Other " + "classes that need to be shared are those that interact with classes that are already " + "shared. For example, custom appenders that are used by log4j.") + .version("1.4.0") .stringConf .toSequence .createWithDefault(jdbcPrefixes) @@ -142,16 +150,18 @@ private[spark] object HiveUtils extends Logging { private def jdbcPrefixes = Seq( "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc") - val HIVE_METASTORE_BARRIER_PREFIXES = buildConf("spark.sql.hive.metastore.barrierPrefixes") + val HIVE_METASTORE_BARRIER_PREFIXES = buildStaticConf("spark.sql.hive.metastore.barrierPrefixes") .doc("A comma separated list of class prefixes that should explicitly be reloaded for each " + "version of Hive that Spark SQL is communicating with. For example, Hive UDFs that are " + "declared in a prefix that typically would be shared (i.e. org.apache.spark.*).") + .version("1.4.0") .stringConf .toSequence .createWithDefault(Nil) val HIVE_THRIFT_SERVER_ASYNC = buildConf("spark.sql.hive.thriftServer.async") .doc("When set to true, Hive Thrift server executes SQL queries in an asynchronous way.") + .version("1.5.0") .booleanConf .createWithDefault(true) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index e31dffa4795c5..3ea80eaf6f714 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -61,6 +61,15 @@ private[hive] trait HiveClient { /** Returns the names of tables in the given database that matches the given pattern. */ def listTables(dbName: String, pattern: String): Seq[String] + /** + * Returns the names of tables with specific tableType in the given database that matches + * the given pattern. + */ + def listTablesByType( + dbName: String, + pattern: String, + tableType: CatalogTableType): Seq[String] + /** Sets the name of current database. 
*/ def setCurrentDatabase(databaseName: String): Unit diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index b5c5f0e9381bc..f3118368b9829 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.client import java.io.{File, PrintStream} import java.lang.{Iterable => JIterable} +import java.lang.reflect.InvocationTargetException import java.nio.charset.StandardCharsets.UTF_8 import java.util.{Locale, Map => JMap} import java.util.concurrent.TimeUnit._ @@ -48,7 +49,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPartitionException} +import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPartitionException, NoSuchPartitionsException, PartitionsAlreadyExistException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.Expression @@ -58,7 +59,6 @@ import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveExternalCatalog.{DATASOURCE_SCHEMA, DATASOURCE_SCHEMA_NUMPARTS, DATASOURCE_SCHEMA_PART_PREFIX} import org.apache.spark.sql.hive.HiveUtils -import org.apache.spark.sql.hive.client.HiveClientImpl._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.{CircularBuffer, Utils} @@ -99,6 +99,8 @@ private[hive] class HiveClientImpl( extends HiveClient with Logging { + import HiveClientImpl._ + // Circular buffer to hold what hive prints to STDOUT and ERR. Only printed when failures occur. private val outputBuffer = new CircularBuffer() @@ -159,36 +161,7 @@ private[hive] class HiveClientImpl( s"(version ${version.fullVersion}) is ${conf.getVar(ConfVars.METASTOREWAREHOUSE)}") private def newState(): SessionState = { - val hiveConf = new HiveConf(classOf[SessionState]) - // HiveConf is a Hadoop Configuration, which has a field of classLoader and - // the initial value will be the current thread's context class loader - // (i.e. initClassLoader at here). - // We call hiveConf.setClassLoader(initClassLoader) at here to make - // this action explicit. - hiveConf.setClassLoader(initClassLoader) - - // 1: Take all from the hadoopConf to this hiveConf. - // This hadoopConf contains user settings in Hadoop's core-site.xml file - // and Hive's hive-site.xml file. Note, we load hive-site.xml file manually in - // SharedState and put settings in this hadoopConf instead of relying on HiveConf - // to load user settings. Otherwise, HiveConf's initialize method will override - // settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars - // is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath - // has hive-site.xml. So, HiveConf will use that to override its default values. - // 2: we set all spark confs to this hiveConf. - // 3: we set all entries in config to this hiveConf. 
- val confMap = (hadoopConf.iterator().asScala.map(kv => kv.getKey -> kv.getValue) ++ - sparkConf.getAll.toMap ++ extraConfig).toMap - confMap.foreach { case (k, v) => hiveConf.set(k, v) } - SQLConf.get.redactOptions(confMap).foreach { case (k, v) => - logDebug( - s""" - |Applying Hadoop/Hive/Spark and extra properties to Hive Conf: - |$k=$v - """.stripMargin) - } - // Disable CBO because we removed the Calcite dependency. - hiveConf.setBoolean("hive.cbo.enable", false) + val hiveConf = newHiveConf(sparkConf, hadoopConf, extraConfig, Some(initClassLoader)) val state = new SessionState(hiveConf) if (clientLoader.cachedHive != null) { Hive.set(clientLoader.cachedHive.asInstanceOf[Hive]) @@ -318,7 +291,15 @@ private[hive] class HiveClientImpl( // with the HiveConf in `state` to override the context class loader of the current // thread. shim.setCurrentSessionState(state) - val ret = try f finally { + val ret = try { + f + } catch { + case e: NoClassDefFoundError + if HiveUtils.isHive23 && e.getMessage.contains("org/apache/hadoop/hive/serde2/SerDe") => + throw new ClassNotFoundException("The SerDe interface removed since Hive 2.3(HIVE-15167)." + + " Please migrate your custom SerDes to Hive 2.3 or build your own Spark with" + + " hive-1.2 profile. See HIVE-15167 for more details.", e) + } finally { state.getConf.setClassLoader(originalConfLoader) Thread.currentThread().setContextClassLoader(original) HiveCatalogMetrics.incrementHiveClientCalls(1) @@ -625,7 +606,17 @@ private[hive] class HiveClientImpl( table: String, parts: Seq[CatalogTablePartition], ignoreIfExists: Boolean): Unit = withHiveState { - shim.createPartitions(client, db, table, parts, ignoreIfExists) + def replaceExistException(e: Throwable): Unit = e match { + case _: HiveException if e.getCause.isInstanceOf[AlreadyExistsException] => + throw new PartitionsAlreadyExistException(db, table, parts.map(_.spec)) + case _ => throw e + } + try { + shim.createPartitions(client, db, table, parts, ignoreIfExists) + } catch { + case e: InvocationTargetException => replaceExistException(e.getCause) + case e: Throwable => replaceExistException(e) + } } override def dropPartitions( @@ -646,9 +637,7 @@ private[hive] class HiveClientImpl( // (b='1', c='1') and (b='1', c='2'), a partial spec of (b='1') will match both. val parts = client.getPartitions(hiveTable, s.asJava).asScala if (parts.isEmpty && !ignoreIfNotExists) { - throw new AnalysisException( - s"No partition is dropped. One partition spec '$s' does not exist in table '$table' " + - s"database '$db'") + throw new NoSuchPartitionsException(db, table, Seq(s)) } parts.map(_.getValues) }.distinct @@ -775,6 +764,24 @@ private[hive] class HiveClientImpl( client.getTablesByPattern(dbName, pattern).asScala } + override def listTablesByType( + dbName: String, + pattern: String, + tableType: CatalogTableType): Seq[String] = withHiveState { + val hiveTableType = toHiveTableType(tableType) + try { + // Try with Hive API getTablesByType first, it's supported from Hive 2.3+. + shim.getTablesByType(client, dbName, pattern, hiveTableType) + } catch { + case _: UnsupportedOperationException => + // Fallback to filter logic if getTablesByType not supported. + val tableNames = client.getTablesByPattern(dbName, pattern).asScala + getRawTablesByName(dbName, tableNames) + .filter(_.getTableType == hiveTableType) + .map(_.getTableName) + } + } + /** * Runs the specified SQL query using Hive. 
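The `createPartitions` and `dropPartitions` hunks above replace reflection-wrapped Hive errors and a hand-built `AnalysisException` with the dedicated `PartitionsAlreadyExistException` and `NoSuchPartitionsException`. A hedged sketch of how this is expected to surface through SQL; the table and partition values are illustrative:

```scala
// Assumes a Hive-enabled SparkSession named `spark`.
spark.sql("CREATE TABLE sales (amount INT) PARTITIONED BY (day STRING) STORED AS PARQUET")
spark.sql("ALTER TABLE sales ADD PARTITION (day = '2020-01-01')")

// Re-adding the same partition should now fail with PartitionsAlreadyExistException
// rather than an opaque InvocationTargetException bubbling out of the Hive shim:
// spark.sql("ALTER TABLE sales ADD PARTITION (day = '2020-01-01')")

// Dropping a partition that does not exist should now raise NoSuchPartitionsException
// instead of a generic AnalysisException with a hand-written message:
// spark.sql("ALTER TABLE sales DROP PARTITION (day = '1999-12-31')")
```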
*/ @@ -831,7 +838,12 @@ private[hive] class HiveClientImpl( state.out.println(tokens(0) + " " + cmd_1) // scalastyle:on println } - Seq(proc.run(cmd_1).getResponseCode.toString) + val response: CommandProcessorResponse = proc.run(cmd_1) + // Throw an exception if there is an error in query processing. + if (response.getResponseCode != 0) { + throw new QueryExecutionException(response.getErrorMessage) + } + Seq(response.getResponseCode.toString) } } catch { case e: Exception => @@ -977,7 +989,7 @@ private[hive] class HiveClientImpl( } } -private[hive] object HiveClientImpl { +private[hive] object HiveClientImpl extends Logging { /** Converts the native StructField to Hive's FieldSchema. */ def toHiveColumn(c: StructField): FieldSchema = { val typeString = if (c.metadata.contains(HIVE_TYPE_STRING)) { @@ -1026,25 +1038,29 @@ private[hive] object HiveClientImpl { private def toOutputFormat(name: String) = Utils.classForName[org.apache.hadoop.hive.ql.io.HiveOutputFormat[_, _]](name) + def toHiveTableType(catalogTableType: CatalogTableType): HiveTableType = { + catalogTableType match { + case CatalogTableType.EXTERNAL => HiveTableType.EXTERNAL_TABLE + case CatalogTableType.MANAGED => HiveTableType.MANAGED_TABLE + case CatalogTableType.VIEW => HiveTableType.VIRTUAL_VIEW + case t => + throw new IllegalArgumentException( + s"Unknown table type is found at toHiveTableType: $t") + } + } + /** * Converts the native table metadata representation format CatalogTable to Hive's Table. */ def toHiveTable(table: CatalogTable, userName: Option[String] = None): HiveTable = { val hiveTable = new HiveTable(table.database, table.identifier.table) + hiveTable.setTableType(toHiveTableType(table.tableType)) // For EXTERNAL_TABLE, we also need to set EXTERNAL field in the table properties. // Otherwise, Hive metastore will change the table to a MANAGED_TABLE. // (metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java#L1095-L1105) - hiveTable.setTableType(table.tableType match { - case CatalogTableType.EXTERNAL => - hiveTable.setProperty("EXTERNAL", "TRUE") - HiveTableType.EXTERNAL_TABLE - case CatalogTableType.MANAGED => - HiveTableType.MANAGED_TABLE - case CatalogTableType.VIEW => HiveTableType.VIRTUAL_VIEW - case t => - throw new IllegalArgumentException( - s"Unknown table type is found at toHiveTable: $t") - }) + if (table.tableType == CatalogTableType.EXTERNAL) { + hiveTable.setProperty("EXTERNAL", "TRUE") + } // Note: In Hive the schema and partition columns must be disjoint sets val (partCols, schema) = table.schema.map(toHiveColumn).partition { c => table.partitionColumnNames.contains(c.getName) @@ -1219,4 +1235,50 @@ private[hive] object HiveClientImpl { StatsSetupConst.RAW_DATA_SIZE, StatsSetupConst.TOTAL_SIZE ) + + def newHiveConf( + sparkConf: SparkConf, + hadoopConf: JIterable[JMap.Entry[String, String]], + extraConfig: Map[String, String], + classLoader: Option[ClassLoader] = None): HiveConf = { + val hiveConf = new HiveConf(classOf[SessionState]) + // HiveConf is a Hadoop Configuration, which has a field of classLoader and + // the initial value will be the current thread's context class loader. + // We call hiveConf.setClassLoader(initClassLoader) at here to ensure it use the classloader + // we want. + classLoader.foreach(hiveConf.setClassLoader) + // 1: Take all from the hadoopConf to this hiveConf. + // This hadoopConf contains user settings in Hadoop's core-site.xml file + // and Hive's hive-site.xml file. 
Note, we load hive-site.xml file manually in + // SharedState and put settings in this hadoopConf instead of relying on HiveConf + // to load user settings. Otherwise, HiveConf's initialize method will override + // settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars + // is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath + // has hive-site.xml. So, HiveConf will use that to override its default values. + // 2: we set all spark confs to this hiveConf. + // 3: we set all entries in config to this hiveConf. + val confMap = (hadoopConf.iterator().asScala.map(kv => kv.getKey -> kv.getValue) ++ + sparkConf.getAll.toMap ++ extraConfig).toMap + confMap.foreach { case (k, v) => hiveConf.set(k, v) } + SQLConf.get.redactOptions(confMap).foreach { case (k, v) => + logDebug(s"Applying Hadoop/Hive/Spark and extra properties to Hive Conf:$k=$v") + } + // Disable CBO because we removed the Calcite dependency. + hiveConf.setBoolean("hive.cbo.enable", false) + // If this is true, SessionState.start will create a file to log hive job which will not be + // deleted on exit and is useless for spark + if (hiveConf.getBoolean("hive.session.history.enabled", false)) { + logWarning("Detected HiveConf hive.session.history.enabled is true and will be reset to" + + " false to disable useless hive logic") + hiveConf.setBoolean("hive.session.history.enabled", false) + } + // If this is tez engine, SessionState.start might bring extra logic to initialize tez stuff, + // which is useless for spark. + if (hiveConf.get("hive.execution.engine") == "tez") { + logWarning("Detected HiveConf hive.execution.engine is 'tez' and will be reset to 'mr'" + + " to disable useless hive logic") + hiveConf.set("hive.execution.engine", "mr") + } + hiveConf + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 50ce536a160c8..d11bf94a1fd86 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -29,6 +29,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.IMetaStoreClient +import org.apache.hadoop.hive.metastore.TableType import org.apache.hadoop.hive.metastore.api.{Database, EnvironmentContext, Function => HiveFunction, FunctionType, MetaException, PrincipalType, ResourceType, ResourceUri} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.io.AcidUtils @@ -90,6 +91,12 @@ private[client] sealed abstract class Shim { def alterPartitions(hive: Hive, tableName: String, newParts: JList[Partition]): Unit + def getTablesByType( + hive: Hive, + dbName: String, + pattern: String, + tableType: TableType): Seq[String] + def createPartitions( hive: Hive, db: String, @@ -363,6 +370,15 @@ private[client] class Shim_v0_12 extends Shim with Logging { conf.getIntVar(HiveConf.ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY) * 1000L } + override def getTablesByType( + hive: Hive, + dbName: String, + pattern: String, + tableType: TableType): Seq[String] = { + throw new UnsupportedOperationException("Hive 2.2 and lower versions don't support " + + "getTablesByType. 
Please use Hive 2.3 or higher version.") + } + override def loadPartition( hive: Hive, loadPath: Path, @@ -692,7 +708,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { .map(col => col.getName).toSet def unapply(attr: Attribute): Option[String] = { - if (varcharKeys.contains(attr.name)) { + val resolver = SQLConf.get.resolver + if (varcharKeys.exists(c => resolver(c, attr.name))) { None } else if (attr.dataType.isInstanceOf[IntegralType] || attr.dataType == StringType) { Some(attr.name) @@ -712,7 +729,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { def unapply(expr: Expression): Option[Attribute] = { expr match { case attr: Attribute => Some(attr) - case Cast(child @ AtomicType(), dt: AtomicType, _) + case Cast(child @ IntegralType(), dt: IntegralType, _) if Cast.canUpCast(child.dataType.asInstanceOf[AtomicType], dt) => unapply(child) case _ => None } @@ -1220,7 +1237,24 @@ private[client] class Shim_v2_1 extends Shim_v2_0 { private[client] class Shim_v2_2 extends Shim_v2_1 -private[client] class Shim_v2_3 extends Shim_v2_1 +private[client] class Shim_v2_3 extends Shim_v2_1 { + private lazy val getTablesByTypeMethod = + findMethod( + classOf[Hive], + "getTablesByType", + classOf[String], + classOf[String], + classOf[TableType]) + + override def getTablesByType( + hive: Hive, + dbName: String, + pattern: String, + tableType: TableType): Seq[String] = { + getTablesByTypeMethod.invoke(hive, dbName, pattern, tableType) + .asInstanceOf[JList[String]].asScala + } +} private[client] class Shim_v3_0 extends Shim_v2_3 { // Spark supports only non-ACID operations @@ -1293,10 +1327,7 @@ private[client] class Shim_v3_0 extends Shim_v2_3 { inheritTableSpecs: Boolean, isSkewedStoreAsSubdir: Boolean, isSrcLocal: Boolean): Unit = { - val session = SparkSession.getActiveSession - assert(session.nonEmpty) - val database = session.get.sessionState.catalog.getCurrentDatabase - val table = hive.getTable(database, tableName) + val table = hive.getTable(tableName) val loadFileType = if (replace) { clazzLoadFileType.getEnumConstants.find(_.toString.equalsIgnoreCase("REPLACE_ALL")) } else { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 5da7b70cfc7aa..42a0ec0253b85 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -104,7 +104,8 @@ private[hive] object IsolatedClientLoader extends Logging { case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0 case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1 case "2.2" | "2.2.0" => hive.v2_2 - case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | "2.3.6" => hive.v2_3 + case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | "2.3.6" | "2.3.7" => + hive.v2_3 case "3.0" | "3.0.0" => hive.v3_0 case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1 case version => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala index 26319119a596a..8526d86454604 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/package.scala @@ -89,7 +89,7 @@ package object client { // Since HIVE-14496, Hive materialized view need calcite-core. 
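A few hunks up, `runSqlHive` now converts a non-zero `CommandProcessorResponse` into a `QueryExecutionException` instead of returning the error code as a string; the SPARK-30868 test added later in this patch relies on that. A condensed sketch of the caller-visible effect, written inside the `org.apache.spark.sql.hive` package because `HiveClient` is `private[hive]`; the object and table names are hypothetical:

```scala
package org.apache.spark.sql.hive

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.QueryExecutionException

object RunSqlHiveSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("run-sql-hive-sketch")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    // Reach the underlying HiveClient the same way the suites in this patch do.
    val client = spark.sharedState.externalCatalog.unwrapped
      .asInstanceOf[HiveExternalCatalog].client
    try {
      client.runSqlHive("DESC FORMATED some_table") // misspelled FORMATTED, Hive rejects it
    } catch {
      case e: QueryExecutionException =>
        // Before this change the failure was only visible as a non-zero response code.
        println(s"Hive command failed: ${e.getMessage}")
    }
  }
}
```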
// For spark, only VersionsSuite currently creates a hive materialized view for testing. - case object v2_3 extends HiveVersion("2.3.6", + case object v2_3 extends HiveVersion("2.3.7", exclusions = Seq("org.apache.calcite:calcite-druid", "org.apache.calcite.avatica:avatica", "org.apache.curator:*", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala index b66c302a7d7ea..7ef637ed553ad 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveDirCommand.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.hive.client.HiveClientImpl import org.apache.spark.sql.util.SchemaUtils @@ -63,12 +64,16 @@ case class InsertIntoHiveDirCommand( s"when inserting into ${storage.locationUri.get}", sparkSession.sessionState.conf.caseSensitiveAnalysis) - val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( + val table = CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), + provider = Some(DDLUtils.HIVE_PROVIDER), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = outputColumns.toStructType - )) + ) + DDLUtils.checkDataColNames(table) + + val hiveTable = HiveClientImpl.toHiveTable(table) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 801be64702519..39d5b711ab87e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, ExternalCatalog, ExternalCatalogUtils} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.CommandUtils import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} @@ -106,7 +107,7 @@ case class InsertIntoHiveTable( } // un-cache this table. 
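`InsertIntoHiveDirCommand` now routes its synthesized `CatalogTable` through `DDLUtils.checkDataColNames`, so column names the Hive serde layer cannot store are rejected during analysis rather than producing an unreadable output directory. An illustrative, hedged sketch; the directory path and column aliases are made up:

```scala
// Assumes a Hive-enabled SparkSession named `spark`.
spark.sql("""
  INSERT OVERWRITE DIRECTORY '/tmp/hive_dir_sketch'
  STORED AS PARQUET
  SELECT 1 AS id, 'ok' AS name
""")

// With the added checkDataColNames call, an alias containing a character that Hive
// column names cannot carry (for example a comma) is expected to fail analysis up front:
// spark.sql("INSERT OVERWRITE DIRECTORY '/tmp/hive_dir_sketch' STORED AS PARQUET " +
//   "SELECT 1 AS `bad,name`")
```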
- sparkSession.catalog.uncacheTable(table.identifier.quotedString) + CommandUtils.uncacheTableOrView(sparkSession, table.identifier.quotedString) sparkSession.sessionState.catalog.refreshTable(table.identifier) CommandUtils.updateTableStats(sparkSession, table) @@ -130,6 +131,7 @@ case class InsertIntoHiveTable( val numDynamicPartitions = partition.values.count(_.isEmpty) val numStaticPartitions = partition.values.count(_.nonEmpty) val partitionSpec = partition.map { + case (key, Some(null)) => key -> ExternalCatalogUtils.DEFAULT_PARTITION_NAME case (key, Some(value)) => key -> value case (key, None) => key -> "" } @@ -223,9 +225,13 @@ case class InsertIntoHiveTable( ExternalCatalogUtils.unescapePathName(splitPart(1)) }.toMap + val caseInsensitiveDpMap = CaseInsensitiveMap(dpMap) + val updatedPartitionSpec = partition.map { + case (key, Some(null)) => key -> ExternalCatalogUtils.DEFAULT_PARTITION_NAME case (key, Some(value)) => key -> value - case (key, None) if dpMap.contains(key) => key -> dpMap(key) + case (key, None) if caseInsensitiveDpMap.contains(key) => + key -> caseInsensitiveDpMap(key) case (key, _) => throw new SparkException(s"Dynamic partition key $key is not among " + "written partition paths.") @@ -239,7 +245,7 @@ case class InsertIntoHiveTable( if (fs.exists(partitionPath)) { if (!fs.delete(partitionPath, true)) { throw new RuntimeException( - "Cannot remove partition directory '" + partitionPath.toString) + s"Cannot remove partition directory '$partitionPath'") } } } @@ -291,7 +297,7 @@ case class InsertIntoHiveTable( if (fs.exists(path)) { if (!fs.delete(path, true)) { throw new RuntimeException( - "Cannot remove partition directory '" + path.toString) + s"Cannot remove partition directory '$path'") } // Don't let Hive do overwrite operation since it is slower. doHiveOverwrite = false diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala index a0349f627d107..da6e4c52cf3a7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala @@ -30,6 +30,14 @@ import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.internal.SQLConf /** + * Prune hive table partitions using partition filters on [[HiveTableRelation]]. The pruned + * partitions will be kept in [[HiveTableRelation.prunedPartitions]], and the statistics of + * the hive table relation will be updated based on pruned partitions. + * + * This rule is executed in optimization phase, so the statistics can be updated before physical + * planning, which is useful for some spark strategy, eg. + * [[org.apache.spark.sql.execution.SparkStrategies.JoinSelection]]. + * * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source. 
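The `InsertIntoHiveTable` hunk above adds two behaviors: a static partition value of `null` is rewritten to `ExternalCatalogUtils.DEFAULT_PARTITION_NAME` (Hive's `__HIVE_DEFAULT_PARTITION__`), and dynamically written partition keys are matched case-insensitively via `CaseInsensitiveMap`. A hypothetical sketch of the first behavior, assuming the accompanying parser support for a literal `null` partition value; table and column names are illustrative:

```scala
// Assumes a Hive-enabled SparkSession named `spark`.
spark.sql("CREATE TABLE pt (id INT) PARTITIONED BY (p STRING) STORED AS PARQUET")

// The null static partition value should land in Hive's default partition
// (__HIVE_DEFAULT_PARTITION__) instead of being passed through as a raw null spec.
spark.sql("INSERT INTO pt PARTITION (p = null) SELECT 0")
spark.sql("SHOW PARTITIONS pt").show(truncate = false)
```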
*/ private[sql] class PruneHiveTablePartitions(session: SparkSession) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala index e12f663304e7a..c7183fd7385a6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.execution import java.io._ import java.nio.charset.StandardCharsets import java.util.Properties +import java.util.concurrent.TimeUnit import javax.annotation.Nullable import scala.collection.JavaConverters._ @@ -42,6 +43,7 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution._ import org.apache.spark.sql.hive.HiveInspectors import org.apache.spark.sql.hive.HiveShim._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataType import org.apache.spark.util.{CircularBuffer, RedirectThread, SerializableConfiguration, Utils} @@ -94,9 +96,8 @@ case class ScriptTransformationExec( // This new thread will consume the ScriptTransformation's input rows and write them to the // external process. That process's output will be read by this current thread. val writerThread = new ScriptTransformationWriterThread( - inputIterator, + inputIterator.map(outputProjection), input.map(_.dataType), - outputProjection, inputSerde, inputSoi, ioschema, @@ -137,6 +138,15 @@ case class ScriptTransformationExec( throw writerThread.exception.get } + // There can be a lag between reader read EOF and the process termination. + // If the script fails to startup, this kind of error may be missed. + // So explicitly waiting for the process termination. + val timeout = conf.getConf(SQLConf.SCRIPT_TRANSFORMATION_EXIT_TIMEOUT) + val exitRes = proc.waitFor(timeout, TimeUnit.SECONDS) + if (!exitRes) { + log.warn(s"Transformation script process exits timeout in $timeout seconds") + } + if (!proc.isAlive) { val exitCode = proc.exitValue() if (exitCode != 0) { @@ -174,7 +184,6 @@ case class ScriptTransformationExec( // Ideally the proc should *not* be alive at this point but // there can be a lag between EOF being written out and the process // being terminated. So explicitly waiting for the process to be done. 
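`ScriptTransformationExec` now waits up to `SQLConf.SCRIPT_TRANSFORMATION_EXIT_TIMEOUT` seconds for the child process to terminate, so a script that fails at startup is reported rather than silently missed. A hedged sketch of tuning and exercising that path; the timeout value and query are illustrative:

```scala
import org.apache.spark.sql.internal.SQLConf

// Assumes a Hive-enabled SparkSession named `spark`.
// The timeout is a runtime conf expressed in seconds; reference it through the constant
// added in this patch rather than guessing the key string.
spark.conf.set(SQLConf.SCRIPT_TRANSFORMATION_EXIT_TIMEOUT.key, "30")

// A trivial TRANSFORM query. If 'cat' were replaced by a script that cannot start,
// the failure would now surface after the bounded wait instead of going unnoticed.
spark.sql("SELECT TRANSFORM (id) USING 'cat' AS (id) FROM range(3)").show()
```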
- proc.waitFor() checkFailureAndPropagate() return false } @@ -249,16 +258,15 @@ case class ScriptTransformationExec( private class ScriptTransformationWriterThread( iter: Iterator[InternalRow], inputSchema: Seq[DataType], - outputProjection: Projection, @Nullable inputSerde: AbstractSerDe, - @Nullable inputSoi: ObjectInspector, + @Nullable inputSoi: StructObjectInspector, ioschema: HiveScriptIOSchema, outputStream: OutputStream, proc: Process, stderrBuffer: CircularBuffer, taskContext: TaskContext, conf: Configuration - ) extends Thread("Thread-ScriptTransformation-Feed") with Logging { + ) extends Thread("Thread-ScriptTransformation-Feed") with HiveInspectors with Logging { setDaemon(true) @@ -278,8 +286,8 @@ private class ScriptTransformationWriterThread( var threwException: Boolean = true val len = inputSchema.length try { - iter.map(outputProjection).foreach { row => - if (inputSerde == null) { + if (inputSerde == null) { + iter.foreach { row => val data = if (len == 0) { ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES") } else { @@ -295,10 +303,21 @@ private class ScriptTransformationWriterThread( sb.toString() } outputStream.write(data.getBytes(StandardCharsets.UTF_8)) - } else { - val writable = inputSerde.serialize( - row.asInstanceOf[GenericInternalRow].values, inputSoi) + } + } else { + // Convert Spark InternalRows to hive data via `HiveInspectors.wrapperFor`. + val hiveData = new Array[Any](inputSchema.length) + val fieldOIs = inputSoi.getAllStructFieldRefs.asScala.map(_.getFieldObjectInspector).toArray + val wrappers = fieldOIs.zip(inputSchema).map { case (f, dt) => wrapperFor(f, dt) } + + iter.foreach { row => + var i = 0 + while (i < fieldOIs.length) { + hiveData(i) = if (row.isNullAt(i)) null else wrappers(i)(row.get(i, inputSchema(i))) + i += 1 + } + val writable = inputSerde.serialize(hiveData, inputSoi) if (scriptInputWriter != null) { scriptInputWriter.write(writable) } else { @@ -374,14 +393,13 @@ case class HiveScriptIOSchema ( val outputRowFormatMap = outputRowFormat.toMap.withDefault((k) => defaultFormat(k)) - def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, ObjectInspector)] = { + def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, StructObjectInspector)] = { inputSerdeClass.map { serdeClass => val (columns, columnTypes) = parseAttrs(input) val serde = initSerDe(serdeClass, columns, columnTypes, inputSerdeProps) val fieldObjectInspectors = columnTypes.map(toInspector) val objectInspector = ObjectInspectorFactory .getStandardStructObjectInspector(columns.asJava, fieldObjectInspectors.asJava) - .asInstanceOf[ObjectInspector] (serde, objectInspector) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index 05d608a2016a5..3117781a84bbf 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -41,6 +41,11 @@ import org.apache.spark.sql.hive.HiveShim._ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils +/** + * Here we cannot extends `ImplicitTypeCasts` to compatible with UDF input data type, the reason is: + * we use children data type to reflect UDF method first and will get exception if it fails so that + * we can never go into `ImplicitTypeCasts`. 
+ */ private[hive] case class HiveSimpleUDF( name: String, funcWrapper: HiveFunctionWrapper, children: Seq[Expression]) extends Expression diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 7f2eb14956dc1..d1ee1baadcbce 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -70,14 +70,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable val orcOptions = new OrcOptions(options, sparkSession.sessionState.conf) if (orcOptions.mergeSchema) { SchemaMergeUtils.mergeSchemasInParallel( - sparkSession, - files, - OrcFileOperator.readOrcSchemasInParallel) + sparkSession, options, files, OrcFileOperator.readOrcSchemasInParallel) } else { val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles OrcFileOperator.readSchema( files.map(_.getPath.toString), - Some(sparkSession.sessionState.newHadoopConf()), + Some(sparkSession.sessionState.newHadoopConfWithOptions(options)), ignoreCorruptFiles ) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index cd1bffb6b7ab7..f9c514567c639 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder import org.apache.spark.SparkException import org.apache.spark.internal.Logging +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.quoteIfNeeded import org.apache.spark.sql.execution.datasources.orc.{OrcFilters => DatasourceOrcFilters} import org.apache.spark.sql.execution.datasources.orc.OrcFilters.buildTree import org.apache.spark.sql.hive.HiveUtils @@ -73,9 +74,11 @@ private[orc] object OrcFilters extends Logging { if (HiveUtils.isHive23) { DatasourceOrcFilters.createFilter(schema, filters).asInstanceOf[Option[SearchArgument]] } else { - val dataTypeMap = schema.map(f => f.name -> f.dataType).toMap + val dataTypeMap = schema.map(f => quoteIfNeeded(f.name) -> f.dataType).toMap + // TODO (SPARK-25557): ORC doesn't support nested predicate pushdown, so they are removed. + val newFilters = filters.filter(!_.containsNestedColumn) // Combines all convertible filters using `And` to produce a single conjunction - val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, filters)) + val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, newFilters)) conjunctionOptional.map { conjunction => // Then tries to build a single ORC `SearchArgument` for the conjunction predicate. // The input predicate is fully convertible. 
There should not be any empty result in the diff --git a/sql/hive/src/test/noclasspath/TestUDTF-spark-26560.jar b/sql/hive/src/test/noclasspath/TestUDTF-spark-26560.jar deleted file mode 100644 index b73b17d5c7880..0000000000000 Binary files a/sql/hive/src/test/noclasspath/TestUDTF-spark-26560.jar and /dev/null differ diff --git a/sql/hive/src/test/noclasspath/hive-test-udfs.jar b/sql/hive/src/test/noclasspath/hive-test-udfs.jar new file mode 100644 index 0000000000000..a5bfa456f6686 Binary files /dev/null and b/sql/hive/src/test/noclasspath/hive-test-udfs.jar differ diff --git a/sql/hive/src/test/resources/data/scripts/test_transform.py b/sql/hive/src/test/resources/data/scripts/test_transform.py index ac6d11d8b919c..dedb370f6c90e 100755 --- a/sql/hive/src/test/resources/data/scripts/test_transform.py +++ b/sql/hive/src/test/resources/data/scripts/test_transform.py @@ -1,3 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# import sys delim = sys.argv[1] diff --git a/sql/hive/src/test/resources/golden/timestamp cast #2-0-732ed232ac592c5e7f7c913a88874fd2 b/sql/hive/src/test/resources/golden/timestamp cast #3-0-732ed232ac592c5e7f7c913a88874fd2 similarity index 100% rename from sql/hive/src/test/resources/golden/timestamp cast #2-0-732ed232ac592c5e7f7c913a88874fd2 rename to sql/hive/src/test/resources/golden/timestamp cast #3-0-732ed232ac592c5e7f7c913a88874fd2 diff --git a/sql/hive/src/test/resources/golden/timestamp cast #6-0-6d2da5cfada03605834e38bc4075bc79 b/sql/hive/src/test/resources/golden/timestamp cast #4-0-6d2da5cfada03605834e38bc4075bc79 similarity index 100% rename from sql/hive/src/test/resources/golden/timestamp cast #6-0-6d2da5cfada03605834e38bc4075bc79 rename to sql/hive/src/test/resources/golden/timestamp cast #4-0-6d2da5cfada03605834e38bc4075bc79 diff --git a/sql/hive/src/test/resources/golden/timestamp cast #4-0-732ed232ac592c5e7f7c913a88874fd2 b/sql/hive/src/test/resources/golden/timestamp cast #4-0-732ed232ac592c5e7f7c913a88874fd2 deleted file mode 100644 index 5625e59da8873..0000000000000 --- a/sql/hive/src/test/resources/golden/timestamp cast #4-0-732ed232ac592c5e7f7c913a88874fd2 +++ /dev/null @@ -1 +0,0 @@ -1.2 diff --git a/sql/hive/src/test/resources/golden/timestamp cast #8-0-6d2da5cfada03605834e38bc4075bc79 b/sql/hive/src/test/resources/golden/timestamp cast #8-0-6d2da5cfada03605834e38bc4075bc79 deleted file mode 100644 index 1d94c8a014fb4..0000000000000 --- a/sql/hive/src/test/resources/golden/timestamp cast #8-0-6d2da5cfada03605834e38bc4075bc79 +++ /dev/null @@ -1 +0,0 @@ --1.2 diff --git a/sql/hive/src/test/resources/test_script.py b/sql/hive/src/test/resources/test_script.py new file mode 100644 index 0000000000000..82ef7b38f0c1b --- /dev/null +++ b/sql/hive/src/test/resources/test_script.py @@ -0,0 +1,21 @@ +# 
Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +for line in sys.stdin: + (a, b, c, d, e) = line.split('\t') + sys.stdout.write('\t'.join([a, b, c, d, e])) + sys.stdout.flush() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 7b3fb68174234..e3141f504ea90 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -19,12 +19,15 @@ package org.apache.spark.sql.hive import java.io.File -import org.apache.spark.sql.{AnalysisException, Dataset, QueryTest, SaveMode} +import org.apache.commons.io.FileUtils + +import org.apache.spark.sql.{AnalysisException, Dataset, QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType import org.apache.spark.storage.RDDBlockId @@ -102,10 +105,10 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("uncache of nonexistent tables") { - val expectedErrorMsg = "Table or view not found: nonexistentTable" + val expectedErrorMsg = "Table or view not found:" // make sure table doesn't exist var e = intercept[AnalysisException](spark.table("nonexistentTable")).getMessage - assert(e.contains(expectedErrorMsg)) + assert(e.contains(s"$expectedErrorMsg nonexistentTable")) e = intercept[AnalysisException] { uncacheTable("nonexistentTable") }.getMessage @@ -113,7 +116,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto e = intercept[AnalysisException] { sql("UNCACHE TABLE nonexistentTable") }.getMessage - assert(e.contains(expectedErrorMsg)) + assert(e.contains(s"$expectedErrorMsg default.nonexistentTable")) sql("UNCACHE TABLE IF EXISTS nonexistentTable") } @@ -403,13 +406,13 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // Cache the table 'cachedTable' in default db without qualified table name , and then // check whether the table is cached with expected name. 
sql("CACHE TABLE cachedTable OPTIONS('storageLevel' 'DISK_ONLY')") - assertCached(sql("SELECT * FROM cachedTable"), "`cachedTable`", DISK_ONLY) + assertCached(sql("SELECT * FROM cachedTable"), "`default`.`cachedTable`", DISK_ONLY) assert(spark.catalog.isCached("cachedTable"), "Table 'cachedTable' should be cached.") // Refresh the table 'cachedTable' in default db with unqualified table name, and then // check whether the table is still cached with the same name. sql("REFRESH TABLE cachedTable") - assertCached(sql("SELECT * FROM cachedTable"), "`cachedTable`", DISK_ONLY) + assertCached(sql("SELECT * FROM cachedTable"), "`default`.`cachedTable`", DISK_ONLY) assert(spark.catalog.isCached("cachedTable"), "Table 'cachedTable' should be cached after refreshing with its unqualified name.") @@ -420,7 +423,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // 'default.cachedTable', instead of 'cachedTable' activateDatabase(db) { sql("REFRESH TABLE default.cachedTable") - assertCached(sql("SELECT * FROM default.cachedTable"), "`cachedTable`", DISK_ONLY) + assertCached( + sql("SELECT * FROM default.cachedTable"), "`default`.`cachedTable`", DISK_ONLY) assert(spark.catalog.isCached("default.cachedTable"), "Table 'cachedTable' should be cached after refreshing with its qualified name.") } @@ -428,4 +432,99 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } } + + test("SPARK-33963: do not use table stats while looking in table cache") { + val t = "table_on_test" + withTable(t) { + sql(s"CREATE TABLE $t (col int)") + assert(!spark.catalog.isCached(t)) + sql(s"CACHE TABLE $t") + assert(spark.catalog.isCached(t)) + } + } + + test("SPARK-33950: refresh cache after partition dropping") { + withTable("t") { + sql(s"CREATE TABLE t (id int, part int) USING hive PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + sql("INSERT INTO t PARTITION (part=1) SELECT 1") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(1, 1))) + sql("ALTER TABLE t DROP PARTITION (part=0)") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(1, 1))) + } + } + + test("SPARK-34011: refresh cache after partition renaming") { + withTable("t") { + sql("CREATE TABLE t (id int, part int) USING hive PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + sql("INSERT INTO t PARTITION (part=1) SELECT 1") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + QueryTest.checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(1, 1))) + sql("ALTER TABLE t PARTITION (part=0) RENAME TO PARTITION (part=2)") + assert(spark.catalog.isCached("t")) + QueryTest.checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 2), Row(1, 1))) + } + } + + test("SPARK-34055: refresh cache in partition adding") { + withTable("t") { + sql("CREATE TABLE t (id int, part int) USING hive PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0))) + + // Create new partition (part = 1) in the filesystem + val information = sql("SHOW TABLE EXTENDED LIKE 't' PARTITION (part = 0)") + .select("information") + .first().getString(0) + val part0Loc = information + .split("\\r?\\n") + .filter(_.startsWith("Location:")) + .head + 
.replace("Location: file:", "") + val part1Loc = part0Loc.replace("part=0", "part=1") + FileUtils.copyDirectory(new File(part0Loc), new File(part1Loc)) + + sql(s"ALTER TABLE t ADD PARTITION (part=1) LOCATION '$part1Loc'") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(0, 1))) + } + } + + test("SPARK-34060: update stats of cached table") { + withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "true") { + def checkTableSize(expected: String): Unit = { + val stats = + sql("DESCRIBE TABLE EXTENDED t") + .select("data_type") + .where("col_name = 'Statistics'") + .first() + .getString(0) + assert(stats.contains(expected)) + } + + sql("CREATE TABLE t (id int, part int) USING hive PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + sql("INSERT INTO t PARTITION (part=1) SELECT 1") + assert(!spark.catalog.isCached("t")) + sql("CACHE TABLE t") + assert(spark.catalog.isCached("t")) + checkAnswer(sql("SELECT * FROM t"), Seq(Row(0, 0), Row(1, 1))) + checkTableSize("4 bytes") + + sql("ALTER TABLE t DROP PARTITION (part=0)") + assert(spark.catalog.isCached("t")) + checkTableSize("2 bytes") + checkAnswer(sql("SELECT * FROM t"), Seq(Row(1, 1))) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveContextCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveContextCompatibilitySuite.scala new file mode 100644 index 0000000000000..a80db765846e9 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveContextCompatibilitySuite.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import org.scalatest.BeforeAndAfterEach + +import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} + + +class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach { + + override protected val enableAutoThreadAudit = false + private var sc: SparkContext = null + private var hc: HiveContext = null + + override def beforeAll(): Unit = { + super.beforeAll() + sc = SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("test")) + HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true).foreach { case (k, v) => + sc.hadoopConfiguration.set(k, v) + } + hc = new HiveContext(sc) + } + + override def afterEach(): Unit = { + try { + hc.sharedState.cacheManager.clearCache() + hc.sessionState.catalog.reset() + } finally { + super.afterEach() + } + } + + override def afterAll(): Unit = { + try { + sc = null + hc = null + } finally { + super.afterAll() + } + } + + test("basic operations") { + val _hc = hc + import _hc.implicits._ + val df1 = (1 to 20).map { i => (i, i) }.toDF("a", "x") + val df2 = (1 to 100).map { i => (i, i % 10, i % 2 == 0) }.toDF("a", "b", "c") + .select($"a", $"b") + .filter($"a" > 10 && $"b" > 6 && $"c") + val df3 = df1.join(df2, "a") + val res = df3.collect() + val expected = Seq((18, 18, 8)).toDF("a", "x", "b").collect() + assert(res.toSeq == expected.toSeq) + df3.createOrReplaceTempView("mai_table") + val df4 = hc.table("mai_table") + val res2 = df4.collect() + assert(res2.toSeq == expected.toSeq) + } + + test("basic DDLs") { + val _hc = hc + import _hc.implicits._ + val databases = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) + assert(databases.toSeq == Seq("default")) + hc.sql("CREATE DATABASE mee_db") + hc.sql("USE mee_db") + val databases2 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) + assert(databases2.toSet == Set("default", "mee_db")) + val df = (1 to 10).map { i => ("bob" + i.toString, i) }.toDF("name", "age") + df.createOrReplaceTempView("mee_table") + hc.sql("CREATE TABLE moo_table (name string, age int)") + hc.sql("INSERT INTO moo_table SELECT * FROM mee_table") + assert( + hc.sql("SELECT * FROM moo_table order by name").collect().toSeq == + df.collect().toSeq.sortBy(_.getString(0))) + val tables = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) + assert(tables.toSet == Set("moo_table", "mee_table")) + hc.sql("DROP TABLE moo_table") + hc.sql("DROP TABLE mee_table") + val tables2 = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) + assert(tables2.isEmpty) + hc.sql("USE default") + hc.sql("DROP DATABASE mee_db CASCADE") + val databases3 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) + assert(databases3.toSeq == Seq("default")) + } + +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 79e569b51ca1d..473a93bf129df 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.hive +import java.net.URI + import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.execution.QueryExecutionException import 
org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.StructType @@ -154,4 +157,65 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { catalog.createTable(hiveTable, ignoreIfExists = false) assert(catalog.getTable("db1", "spark_29498").owner === owner) } + + test("SPARK-30868 throw an exception if HiveClient#runSqlHive fails") { + val client = externalCatalog.client + // test add jars which doesn't exists + val jarPath = "file:///tmp/not_exists.jar" + assertThrows[QueryExecutionException](client.runSqlHive(s"ADD JAR $jarPath")) + + // test change to the database which doesn't exists + assertThrows[QueryExecutionException](client.runSqlHive( + s"use db_not_exists")) + + // test create hive table failed with unsupported into type + assertThrows[QueryExecutionException](client.runSqlHive( + s"CREATE TABLE t(n into)")) + + // test desc table failed with wrong `FORMATED` keyword + assertThrows[QueryExecutionException](client.runSqlHive( + s"DESC FORMATED t")) + + // test wrong insert query + assertThrows[QueryExecutionException](client.runSqlHive( + "INSERT overwrite directory \"fs://localhost/tmp\" select 1 as a")) + } + + test("SPARK-31061: alterTable should be able to change table provider") { + val catalog = newBasicCatalog() + val parquetTable = CatalogTable( + identifier = TableIdentifier("parq_tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = storageFormat.copy(locationUri = Some(new URI("file:/some/path"))), + schema = new StructType().add("col1", "int").add("col2", "string"), + provider = Some("parquet")) + catalog.createTable(parquetTable, ignoreIfExists = false) + + val rawTable = externalCatalog.getTable("db1", "parq_tbl") + assert(rawTable.provider === Some("parquet")) + + val fooTable = parquetTable.copy(provider = Some("foo")) + catalog.alterTable(fooTable) + val alteredTable = externalCatalog.getTable("db1", "parq_tbl") + assert(alteredTable.provider === Some("foo")) + } + + test("SPARK-31061: alterTable should be able to change table provider from hive") { + val catalog = newBasicCatalog() + val hiveTable = CatalogTable( + identifier = TableIdentifier("parq_tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = storageFormat, + schema = new StructType().add("col1", "int").add("col2", "string"), + provider = Some("hive")) + catalog.createTable(hiveTable, ignoreIfExists = false) + + val rawTable = externalCatalog.getTable("db1", "parq_tbl") + assert(rawTable.provider === Some("hive")) + + val fooTable = rawTable.copy(provider = Some("foo")) + catalog.alterTable(fooTable) + val alteredTable = externalCatalog.getTable("db1", "parq_tbl") + assert(alteredTable.provider === Some("foo")) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 3b5a1247bc09c..b81b7e8ec0c0f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.Utils /** @@ -46,6 
+46,7 @@ import org.apache.spark.util.Utils * expected version under this local directory, e.g. `/tmp/spark-test/spark-2.0.3`, we will skip the * downloading for this spark version. */ +@SlowHiveTest @ExtendedHiveTest class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private val isTestAtLeastJava9 = SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_9) @@ -233,7 +234,7 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils { // Tests the latest version of every release line. val testingVersions: Seq[String] = { import scala.io.Source - try { + val versions: Seq[String] = try { Source.fromURL(s"${releaseMirror}/spark").mkString .split("\n") .filter(_.contains("""
    • Nil + case NonFatal(_) => Seq("3.0.1", "2.4.7") // A temporary fallback to use a specific version } + versions.filter(v => v.startsWith("3") || !TestUtils.isPythonVersionAtLeast38()) } protected var spark: SparkSession = _ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index 5912992694e84..13c48f38e7f78 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.hive import java.util -import java.util.{Locale, TimeZone} import org.apache.hadoop.hive.ql.udf.UDAFPercentile import org.apache.hadoop.hive.serde2.io.DoubleWritable @@ -74,11 +73,6 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { .get()) } - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) - val data = Literal(true) :: Literal(null) :: diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index 94a55b911f092..db8ebcd45f3eb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -114,7 +114,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi val e2 = intercept[SparkException] { sql("select * from test").count() } - assert(e2.getMessage.contains("FileNotFoundException")) + assert(e.getMessage.contains("FileNotFoundException")) spark.catalog.refreshByPath(dir.getAbsolutePath) assert(sql("select * from test").count() == 3) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 20bafd832d0da..95e99c653d6f6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -62,7 +62,7 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils { spark.sql("create view vw1 as select 1 as id") val plan = spark.sql("select id from vw1").queryExecution.analyzed val aliases = plan.collect { - case x @ SubqueryAlias(AliasIdentifier("vw1", Some("default")), _) => x + case x @ SubqueryAlias(AliasIdentifier("vw1", Seq("spark_catalog", "default")), _) => x } assert(aliases.size == 1) } @@ -97,7 +97,7 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils { |c22 map, |c23 struct, |c24 struct - |) + |) USING hive """.stripMargin) val schema = hiveClient.getTable("default", "t").schema diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala index 590ef949ffbd7..6496d3154feea 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala @@ -118,11 +118,6 @@ class HiveSchemaInferenceSuite properties = Map.empty), true) - // Add partition records (if specified) - if (!partitionCols.isEmpty) { - 
spark.catalog.recoverPartitions(TEST_TABLE_NAME) - } - // Check that the table returned by HiveExternalCatalog has schemaPreservesCase set to false // and that the raw table returned by the Hive client doesn't have any Spark SQL properties // set (table needs to be obtained from client since HiveExternalCatalog filters these @@ -130,6 +125,12 @@ class HiveSchemaInferenceSuite assert(!externalCatalog.getTable(DATABASE, TEST_TABLE_NAME).schemaPreservesCase) val rawTable = client.getTable(DATABASE, TEST_TABLE_NAME) assert(rawTable.properties.filterKeys(_.startsWith(DATASOURCE_SCHEMA_PREFIX)) == Map.empty) + + // Add partition records (if specified) + if (!partitionCols.isEmpty) { + spark.catalog.recoverPartitions(TEST_TABLE_NAME) + } + schema } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala index 6e2dcfc04d498..78535b094b83d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSharedStateSuite.scala @@ -45,7 +45,6 @@ class HiveSharedStateSuite extends SparkFunSuite { GLOBAL_TEMP_DATABASE.key -> tmpDb) val state = new SharedState(sc, initialConfigs) - assert(state.warehousePath !== invalidPath, "warehouse path can't determine by session options") assert(sc.conf.get(WAREHOUSE_PATH.key) !== invalidPath, "warehouse conf in session options can't affect application wide spark conf") assert(sc.hadoopConfiguration.get(ConfVars.METASTOREWAREHOUSE.varname) !== invalidPath, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala index e5d572c90af38..cfcf70c0e79f0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala @@ -19,38 +19,50 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.{AnalysisException, ShowCreateTableSuite} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} +import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSingleton { - private var origCreateHiveTableConfig = false - - protected override def beforeAll(): Unit = { - super.beforeAll() - origCreateHiveTableConfig = - SQLConf.get.getConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED) - SQLConf.get.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, true) - } - - protected override def afterAll(): Unit = { - SQLConf.get.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, - origCreateHiveTableConfig) - super.afterAll() + test("view") { + Seq(true, false).foreach { serde => + withView("v1") { + sql("CREATE VIEW v1 AS SELECT 1 AS a") + checkCreateView("v1", serde) + } + } } - test("view") { - withView("v1") { - sql("CREATE VIEW v1 AS SELECT 1 AS a") - checkCreateHiveTableOrView("v1", "VIEW") + test("view with output columns") { + Seq(true, false).foreach { serde => + withView("v1") { + sql("CREATE VIEW v1 (a, b COMMENT 'b column') AS SELECT 1 AS a, 2 AS b") + checkCreateView("v1", serde) + } } } - test("view with output columns") { - withView("v1") { - sql("CREATE VIEW v1 
(b) AS SELECT 1 AS a") - checkCreateHiveTableOrView("v1", "VIEW") + test("view with table comment and properties") { + Seq(true, false).foreach { serde => + withView("v1") { + sql( + s""" + |CREATE VIEW v1 ( + | c1 COMMENT 'bla', + | c2 + |) + |COMMENT 'table comment' + |TBLPROPERTIES ( + | 'prop1' = 'value1', + | 'prop2' = 'value2' + |) + |AS SELECT 1 AS c1, '2' AS c2 + """.stripMargin + ) + + checkCreateView("v1", serde) + } } } @@ -68,7 +80,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } @@ -88,7 +100,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } } @@ -108,7 +120,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } @@ -126,7 +138,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } @@ -141,7 +153,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } @@ -163,7 +175,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } @@ -176,7 +188,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet |INTO 2 BUCKETS """.stripMargin ) - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } } @@ -219,29 +231,9 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet val createTable = "CREATE TABLE `t1` (`a` STRUCT<`b`: STRING>) USING hive" sql(createTable) val shownDDL = getShowDDL("SHOW CREATE TABLE t1") - assert(shownDDL == createTable.dropRight(" USING hive".length)) + assert(shownDDL == "CREATE TABLE `default`.`t1` (`a` STRUCT<`b`: STRING>)") - checkCreateHiveTableOrView("t1") - } - } - - /** - * This method compares the given table with the table created by the DDL generated by - * `SHOW CREATE TABLE AS SERDE`. 
- */ - private def checkCreateHiveTableOrView(tableName: String, checkType: String = "TABLE"): Unit = { - val table = TableIdentifier(tableName, Some("default")) - val db = table.database.getOrElse("default") - val expected = spark.sharedState.externalCatalog.getTable(db, table.table) - val shownDDL = sql(s"SHOW CREATE TABLE ${table.quotedString} AS SERDE").head().getString(0) - sql(s"DROP $checkType ${table.quotedString}") - - try { - sql(shownDDL) - val actual = spark.sharedState.externalCatalog.getTable(db, table.table) - checkCatalogTables(expected, actual) - } finally { - sql(s"DROP $checkType IF EXISTS ${table.table}") + checkCreateTable("t1", serde = true) } } @@ -343,7 +335,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet ) val cause = intercept[AnalysisException] { - checkCreateHiveTableOrView("t1") + checkCreateTable("t1", serde = true) } assert(cause.getMessage.contains("Use `SHOW CREATE TABLE` without `AS SERDE` instead")) @@ -445,27 +437,6 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet } } - test("hive view is not supported by show create table without as serde") { - withTable("t1") { - withView("v1") { - sql("CREATE TABLE t1 (c1 STRING, c2 STRING)") - - createRawHiveTable( - s""" - |CREATE VIEW v1 - |AS SELECT * from t1 - """.stripMargin - ) - - val cause = intercept[AnalysisException] { - sql("SHOW CREATE TABLE v1") - } - - assert(cause.getMessage.contains("view isn't supported")) - } - } - } - test("partitioned, bucketed hive table in Spark DDL") { withTable("t1") { sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index 31ff62ed0a530..3a7e92ee1c00b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -38,12 +38,13 @@ import org.apache.spark.sql.hive.test.{HiveTestJars, TestHiveContext} import org.apache.spark.sql.internal.SQLConf.SHUFFLE_PARTITIONS import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH import org.apache.spark.sql.types.{DecimalType, StructType} -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.{ResetSystemProperties, Utils} /** * This suite tests spark-submit with applications using HiveContext. */ +@SlowHiveTest @ExtendedHiveTest class HiveSparkSubmitSuite extends SparkSubmitTestUtils @@ -787,7 +788,7 @@ object SPARK_18360 { .enableHiveSupport().getOrCreate() val defaultDbLocation = spark.catalog.getDatabase("default").locationUri - assert(new Path(defaultDbLocation) == new Path(spark.sharedState.warehousePath)) + assert(new Path(defaultDbLocation) == new Path(spark.conf.get(WAREHOUSE_PATH))) val hiveClient = spark.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala new file mode 100644 index 0000000000000..ee8e6f4f78be5 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUDFDynamicLoadSuite.scala @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} +import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.{IntegerType, StringType} +import org.apache.spark.util.Utils + +class HiveUDFDynamicLoadSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + + case class UDFTestInformation( + identifier: String, + funcName: String, + className: String, + fnVerifyQuery: () => Unit, + fnCreateHiveUDFExpression: () => Expression) + + private val udfTestInfos: Seq[UDFTestInformation] = Array( + // UDF + // UDFExampleAdd2 is slightly modified version of UDFExampleAdd in hive/contrib, + // which adds two integers or doubles. + UDFTestInformation( + "UDF", + "udf_add2", + "org.apache.hadoop.hive.contrib.udf.example.UDFExampleAdd2", + () => { + checkAnswer(sql("SELECT udf_add2(1, 2)"), Row(3) :: Nil) + }, + () => { + HiveSimpleUDF( + "default.udf_add2", + HiveFunctionWrapper("org.apache.hadoop.hive.contrib.udf.example.UDFExampleAdd2"), + Array( + AttributeReference("a", IntegerType, nullable = false)(), + AttributeReference("b", IntegerType, nullable = false)())) + }), + + // GenericUDF + // GenericUDFTrim2 is cloned version of GenericUDFTrim in hive/contrib. + UDFTestInformation( + "GENERIC_UDF", + "generic_udf_trim2", + "org.apache.hadoop.hive.contrib.udf.example.GenericUDFTrim2", + () => { + checkAnswer(sql("SELECT generic_udf_trim2(' hello ')"), Row("hello") :: Nil) + }, + () => { + HiveGenericUDF( + "default.generic_udf_trim2", + HiveFunctionWrapper("org.apache.hadoop.hive.contrib.udf.example.GenericUDFTrim2"), + Array(AttributeReference("a", StringType, nullable = false)()) + ) + } + ), + + // AbstractGenericUDAFResolver + // GenericUDAFSum2 is cloned version of GenericUDAFSum in hive/exec. + UDFTestInformation( + "GENERIC_UDAF", + "generic_udaf_sum2", + "org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum2", + () => { + import spark.implicits._ + val df = Seq((0: Integer) -> 0, (1: Integer) -> 1, (2: Integer) -> 2, (3: Integer) -> 3) + .toDF("key", "value").createOrReplaceTempView("t") + checkAnswer(sql("SELECT generic_udaf_sum2(value) FROM t GROUP BY key % 2"), + Row(2) :: Row(4) :: Nil) + }, + () => { + HiveUDAFFunction( + "default.generic_udaf_sum2", + HiveFunctionWrapper("org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum2"), + Array(AttributeReference("a", IntegerType, nullable = false)()) + ) + } + ), + + // UDAF + // UDAFExampleMax2 is cloned version of UDAFExampleMax in hive/contrib. 
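// (Editor's note, not part of this patch: unlike the AbstractGenericUDAFResolver entry above,
// this entry exercises the old-style UDAF interface, which is why the expression below is built
// with isUDAFBridgeRequired = true so the function gets wrapped by Hive's GenericUDAFBridge.)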
+ UDFTestInformation( + "UDAF", + "udaf_max2", + "org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax2", + () => { + import spark.implicits._ + val df = Seq((0: Integer) -> 0, (1: Integer) -> 1, (2: Integer) -> 2, (3: Integer) -> 3) + .toDF("key", "value").createOrReplaceTempView("t") + checkAnswer(sql("SELECT udaf_max2(value) FROM t GROUP BY key % 2"), + Row(2) :: Row(3) :: Nil) + }, + () => { + HiveUDAFFunction( + "default.udaf_max2", + HiveFunctionWrapper("org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax2"), + Array(AttributeReference("a", IntegerType, nullable = false)()), + isUDAFBridgeRequired = true + ) + } + ), + + // GenericUDTF + // GenericUDTFCount3 is slightly modified version of GenericUDTFCount2 in hive/contrib, + // which emits the count for three times. + UDFTestInformation( + "GENERIC_UDTF", + "udtf_count3", + "org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount3", + () => { + checkAnswer( + sql("SELECT udtf_count3(a) FROM (SELECT 1 AS a FROM src LIMIT 3) t"), + Row(3) :: Row(3) :: Row(3) :: Nil) + }, + () => { + HiveGenericUDTF( + "default.udtf_count3", + HiveFunctionWrapper("org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount3"), + Array.empty[Expression] + ) + } + ) + ) + + udfTestInfos.foreach { udfInfo => + // The test jars are built from below commit: + // https://github.com/HeartSaVioR/hive/commit/12f3f036b6efd0299cd1d457c0c0a65e0fd7e5f2 + // which contain new UDF classes to be dynamically loaded and tested via Spark. + + // This jar file should not be placed to the classpath. + val jarPath = "src/test/noclasspath/hive-test-udfs.jar" + val jarUrl = s"file://${System.getProperty("user.dir")}/$jarPath" + + test("Spark should be able to run Hive UDF using jar regardless of " + + s"current thread context classloader (${udfInfo.identifier}") { + Utils.withContextClassLoader(Utils.getSparkClassLoader) { + withUserDefinedFunction(udfInfo.funcName -> false) { + val sparkClassLoader = Thread.currentThread().getContextClassLoader + + sql(s"CREATE FUNCTION ${udfInfo.funcName} AS '${udfInfo.className}' USING JAR '$jarUrl'") + + assert(Thread.currentThread().getContextClassLoader eq sparkClassLoader) + + // JAR will be loaded at first usage, and it will change the current thread's + // context classloader to jar classloader in sharedState. + // See SessionState.addJar for details. + udfInfo.fnVerifyQuery() + + assert(Thread.currentThread().getContextClassLoader ne sparkClassLoader) + assert(Thread.currentThread().getContextClassLoader eq + spark.sqlContext.sharedState.jarClassLoader) + + val udfExpr = udfInfo.fnCreateHiveUDFExpression() + // force initializing - this is what we do in HiveSessionCatalog + udfExpr.dataType + + // Roll back to the original classloader and run query again. Without this line, the test + // would pass, as thread's context classloader is changed to jar classloader. But thread + // context classloader can be changed from others as well which would fail the query; one + // example is spark-shell, which thread context classloader rolls back automatically. This + // mimics the behavior of spark-shell. 
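// Editor's illustrative sketch (not part of this patch): the save/swap/restore pattern the
// comments above describe looks roughly like this, assuming a SparkSession `spark` whose shared
// state exposes a `jarClassLoader`:
//   val saved = Thread.currentThread().getContextClassLoader
//   try {
//     Thread.currentThread().setContextClassLoader(spark.sharedState.jarClassLoader)
//     // run queries that resolve classes from jars registered via CREATE FUNCTION ... USING JAR
//   } finally {
//     Thread.currentThread().setContextClassLoader(saved)
//   }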
+ Thread.currentThread().setContextClassLoader(sparkClassLoader) + + udfInfo.fnVerifyQuery() + + val newExpr = udfExpr.makeCopy(udfExpr.productIterator.map(_.asInstanceOf[AnyRef]) + .toArray) + newExpr.dataType + } + } + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala index 80afc9d8f44bc..e1b0637963b75 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -53,7 +53,8 @@ class PartitionProviderCompatibilitySuite s"ALTER TABLE $tableName PARTITION (partCol=1) SET LOCATION '/foo'", s"ALTER TABLE $tableName DROP PARTITION (partCol=1)", s"DESCRIBE $tableName PARTITION (partCol=1)", - s"SHOW PARTITIONS $tableName") + s"SHOW PARTITIONS $tableName", + s"SHOW TABLE EXTENDED LIKE '$tableName' PARTITION (partCol=1)") withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { for (cmd <- unsupportedCommands) { @@ -124,10 +125,15 @@ class PartitionProviderCompatibilitySuite } // disabled withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { - val e = intercept[AnalysisException] { - spark.sql(s"show partitions test") + Seq( + "SHOW PARTITIONS test", + "SHOW TABLE EXTENDED LIKE 'test' PARTITION (partCol=1)" + ).foreach { showPartitions => + val e = intercept[AnalysisException] { + spark.sql(showPartitions) + } + assert(e.getMessage.contains("filesource partition management is disabled")) } - assert(e.getMessage.contains("filesource partition management is disabled")) spark.sql("refresh table test") assert(spark.sql("select * from test").count() == 5) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 3af163af0968c..f9ae1e00cfa36 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -405,8 +405,8 @@ class PartitionedTablePerfStatsSuite }) executorPool.shutdown() executorPool.awaitTermination(30, TimeUnit.SECONDS) - assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 50) - assert(HiveCatalogMetrics.METRIC_PARALLEL_LISTING_JOB_COUNT.getCount() == 1) + assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 100) + assert(HiveCatalogMetrics.METRIC_PARALLEL_LISTING_JOB_COUNT.getCount() == 2) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 488175a22bad7..046ab01640444 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -993,12 +993,16 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto assert(fetched1.get.colStats.size == 2) withTempPaths(numPaths = 2) { case Seq(dir1, dir2) => - val file1 = new File(dir1 + "/data") + val partDir1 = new File(new File(dir1, "ds=2008-04-09"), "hr=11") + val file1 = new File(partDir1, "data") + file1.getParentFile.mkdirs() Utils.tryWithResource(new PrintWriter(file1)) { writer => writer.write("1,a") } - val file2 = new File(dir2 + "/data") + val partDir2 = new File(new 
File(dir2, "ds=2008-04-09"), "hr=12") + val file2 = new File(partDir2, "data") + file2.getParentFile.mkdirs() Utils.tryWithResource(new PrintWriter(file2)) { writer => writer.write("1,a") } @@ -1007,8 +1011,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql( s""" |ALTER TABLE $table ADD - |PARTITION (ds='2008-04-09', hr='11') LOCATION '${dir1.toURI.toString}' - |PARTITION (ds='2008-04-09', hr='12') LOCATION '${dir2.toURI.toString}' + |PARTITION (ds='2008-04-09', hr='11') LOCATION '${partDir1.toURI.toString}' + |PARTITION (ds='2008-04-09', hr='12') LOCATION '${partDir2.toURI.toString}' """.stripMargin) if (autoUpdate) { val fetched2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) @@ -1520,10 +1524,12 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto val ext_tbl = "SPARK_30269_external" withTempDir { dir => withTable(tbl, ext_tbl) { - sql(s"CREATE TABLE $tbl (key INT, value STRING, ds STRING) PARTITIONED BY (ds)") + sql(s"CREATE TABLE $tbl (key INT, value STRING, ds STRING)" + + "USING parquet PARTITIONED BY (ds)") sql( s""" | CREATE TABLE $ext_tbl (key INT, value STRING, ds STRING) + | USING PARQUET | PARTITIONED BY (ds) | LOCATION '${dir.toURI}' """.stripMargin) @@ -1556,4 +1562,20 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } } + + test("SPARK-34084: auto update table stats") { + Seq("parquet", "hive").foreach { format => + withTable("t") { + withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "false") { + sql(s"CREATE TABLE t (col0 int, part int) USING $format PARTITIONED BY (part)") + sql("INSERT INTO t PARTITION (part=0) SELECT 0") + assert(getCatalogTable("t").stats.isEmpty) + } + withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> "true") { + sql("ALTER TABLE t ADD PARTITION (part=1)") + assert(getTableStats("t").sizeInBytes > 0) + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala new file mode 100644 index 0000000000000..65492abf38cc0 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.client + +import java.io.File +import java.net.URLClassLoader + +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} +import org.apache.spark.util.Utils + +/** + * This test suite requires a clean JVM because it's testing the initialization of static codes in + * `org.apache.hadoop.util.VersionInfo`.
+ */ +class HadoopVersionInfoSuite extends SparkFunSuite { + override protected val enableAutoThreadAudit = false + + test("SPARK-32256: Hadoop VersionInfo should be preloaded") { + val ivyPath = + Utils.createTempDir(namePrefix = s"${classOf[HadoopVersionInfoSuite].getSimpleName}-ivy") + try { + val hadoopConf = new Configuration() + hadoopConf.set("test", "success") + hadoopConf.set("datanucleus.schema.autoCreateAll", "true") + hadoopConf.set("hive.metastore.schema.verification", "false") + + // Download jars for Hive 2.0 + val client = IsolatedClientLoader.forVersion( + hiveMetastoreVersion = "2.0", + hadoopVersion = "2.7.4", + sparkConf = new SparkConf(), + hadoopConf = hadoopConf, + config = HiveClientBuilder.buildConf(Map.empty), + ivyPath = Some(ivyPath.getCanonicalPath), + sharesHadoopClasses = true) + val jars = client.classLoader.getParent.asInstanceOf[URLClassLoader].getURLs + .map(u => new File(u.toURI)) + // Drop all Hadoop jars to use the existing Hadoop jars on the classpath + .filter(!_.getName.startsWith("org.apache.hadoop_hadoop-")) + + val sparkConf = new SparkConf() + sparkConf.set(HiveUtils.HIVE_METASTORE_VERSION, "2.0") + sparkConf.set( + HiveUtils.HIVE_METASTORE_JARS, + jars.map(_.getCanonicalPath).mkString(File.pathSeparator)) + HiveClientBuilder.buildConf(Map.empty).foreach { case (k, v) => + hadoopConf.set(k, v) + } + new HiveExternalCatalog(sparkConf, hadoopConf).client.getState + } finally { + Utils.deleteRecursively(ivyPath) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala index ab73f668c6ca6..2ad3afcb214b3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala @@ -33,7 +33,7 @@ private[client] object HiveClientBuilder { Some(new File(sys.props("java.io.tmpdir"), "hive-ivy-cache").getAbsolutePath)) } - private def buildConf(extraConf: Map[String, String]) = { + private[client] def buildConf(extraConf: Map[String, String]): Map[String, String] = { lazy val warehousePath = Utils.createTempDir() lazy val metastorePath = Utils.createTempDir() metastorePath.delete() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index 2d615f6fdc261..edb1a27df806f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StructType} +import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StringType, StructType} import org.apache.spark.util.Utils class HivePartitionFilteringSuite(version: String) @@ -276,6 +276,13 @@ class HivePartitionFilteringSuite(version: String) buildClient(new Configuration(), sharesHadoopClasses = false) } + test("getPartitionsByFilter: chunk in ('ab', 'ba') and ((cast(ds as string)>'20170102')") { + val day = (20170101 to 20170103, 0 to 4, Seq("ab", "ba")) + testMetastorePartitionFiltering( + 
attr("chunk").in("ab", "ba") && (attr("ds").cast(StringType) > "20170102"), + day :: Nil) + } + private def testMetastorePartitionFiltering( filterExpr: Expression, expectedDs: Seq[Int], diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 4760af7aa46ff..41b90db3b2e7e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPermanentFunctionException} +import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchPermanentFunctionException, PartitionsAlreadyExistException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal} import org.apache.spark.sql.catalyst.util.quietly @@ -41,7 +41,7 @@ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveVersion import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.types.StructType -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, GitHubActionsUnstableTest} import org.apache.spark.util.{MutableURLClassLoader, Utils} /** @@ -52,6 +52,7 @@ import org.apache.spark.util.{MutableURLClassLoader, Utils} */ // TODO: Refactor this to `HiveClientSuite` and make it a subclass of `HiveVersionSuite` @ExtendedHiveTest +@GitHubActionsUnstableTest class VersionsSuite extends SparkFunSuite with Logging { override protected val enableAutoThreadAudit = false @@ -82,6 +83,18 @@ class VersionsSuite extends SparkFunSuite with Logging { assert("success" === client.getConf("test", null)) } + test("override useless and side-effect hive configurations ") { + val hadoopConf = new Configuration() + // These hive flags should be reset by spark + hadoopConf.setBoolean("hive.cbo.enable", true) + hadoopConf.setBoolean("hive.session.history.enabled", true) + hadoopConf.set("hive.execution.engine", "tez") + val client = buildClient(HiveUtils.builtinHiveVersion, hadoopConf) + assert(!client.getConf("hive.cbo.enable", "true").toBoolean) + assert(!client.getConf("hive.session.history.enabled", "true").toBoolean) + assert(client.getConf("hive.execution.engine", "tez") === "mr") + } + private def getNestedMessages(e: Throwable): String = { var causes = "" var lastException = e @@ -142,10 +155,11 @@ class VersionsSuite extends SparkFunSuite with Logging { .client.version.fullVersion.startsWith(version)) } - def table(database: String, tableName: String): CatalogTable = { + def table(database: String, tableName: String, + tableType: CatalogTableType = CatalogTableType.MANAGED): CatalogTable = { CatalogTable( identifier = TableIdentifier(tableName, Some(database)), - tableType = CatalogTableType.MANAGED, + tableType = tableType, schema = new StructType().add("key", "int"), storage = CatalogStorageFormat( locationUri = None, @@ -261,7 +275,9 @@ class VersionsSuite extends SparkFunSuite with Logging { test(s"$version: createTable") { client.createTable(table("default", tableName = "src"), ignoreIfExists = false) - 
client.createTable(table("default", "temporary"), ignoreIfExists = false) + client.createTable(table("default", tableName = "temporary"), ignoreIfExists = false) + client.createTable(table("default", tableName = "view1", tableType = CatalogTableType.VIEW), + ignoreIfExists = false) } test(s"$version: loadTable") { @@ -377,7 +393,7 @@ class VersionsSuite extends SparkFunSuite with Logging { } test(s"$version: listTables(database)") { - assert(client.listTables("default") === Seq("src", "temporary")) + assert(client.listTables("default") === Seq("src", "temporary", "view1")) } test(s"$version: listTables(database, pattern)") { @@ -385,6 +401,13 @@ class VersionsSuite extends SparkFunSuite with Logging { assert(client.listTables("default", pattern = "nonexist").isEmpty) } + test(s"$version: listTablesByType(database, pattern, tableType)") { + assert(client.listTablesByType("default", pattern = "view1", + CatalogTableType.VIEW) === Seq("view1")) + assert(client.listTablesByType("default", pattern = "nonexist", + CatalogTableType.VIEW).isEmpty) + } + test(s"$version: dropTable") { val versionsWithoutPurge = if (versions.contains("0.14")) versions.takeWhile(_ != "0.14") else Nil @@ -400,6 +423,16 @@ class VersionsSuite extends SparkFunSuite with Logging { client.dropTable("default", tableName = "temporary", ignoreIfNotExists = false, purge = false) } + // Drop table with type CatalogTableType.VIEW. + try { + client.dropTable("default", tableName = "view1", ignoreIfNotExists = false, + purge = true) + assert(!versionsWithoutPurge.contains(version)) + } catch { + case _: UnsupportedOperationException => + client.dropTable("default", tableName = "view1", ignoreIfNotExists = false, + purge = false) + } assert(client.listTables("default") === Seq("src")) } @@ -560,6 +593,27 @@ class VersionsSuite extends SparkFunSuite with Logging { assert(client.getPartitionOption("default", "src_part", spec).isEmpty) } + test(s"$version: createPartitions if already exists") { + val partitions = Seq(CatalogTablePartition( + Map("key1" -> "101", "key2" -> "102"), + storageFormat)) + try { + client.createPartitions("default", "src_part", partitions, ignoreIfExists = false) + val errMsg = intercept[PartitionsAlreadyExistException] { + client.createPartitions("default", "src_part", partitions, ignoreIfExists = false) + }.getMessage + assert(errMsg.contains("partitions already exists")) + } finally { + client.dropPartitions( + "default", + "src_part", + partitions.map(_.spec), + ignoreIfNotExists = true, + purge = false, + retainData = false) + } + } + /////////////////////////////////////////////////////////////////////////// // Function related API /////////////////////////////////////////////////////////////////////////// @@ -950,7 +1004,7 @@ class VersionsSuite extends SparkFunSuite with Logging { """.stripMargin ) - val errorMsg = "Cannot safely cast 'f0': DecimalType(2,1) to BinaryType" + val errorMsg = "Cannot safely cast 'f0': decimal(2,1) to binary" if (isPartitioned) { val insertStmt = s"INSERT OVERWRITE TABLE $tableName partition (ds='a') SELECT 1.3" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index f84b854048e8a..1dd2ad3837cc3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -31,6 +31,8 @@ import 
org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.SlowHiveTest +import org.apache.spark.unsafe.UnsafeAlignedOffset class ScalaAggregateFunction(schema: StructType) extends UserDefinedAggregateFunction { @@ -203,11 +205,13 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te } test("group by function") { - Seq((1, 2)).toDF("a", "b").createOrReplaceTempView("data") + withTempView("data") { + Seq((1, 2)).toDF("a", "b").createOrReplaceTempView("data") - checkAnswer( - sql("SELECT floor(a) AS a, collect_set(b) FROM data GROUP BY floor(a) ORDER BY a"), - Row(1, Array(2)) :: Nil) + checkAnswer( + sql("SELECT floor(a) AS a, collect_set(b) FROM data GROUP BY floor(a) ORDER BY a"), + Row(1, Array(2)) :: Nil) + } } test("empty table") { @@ -799,43 +803,45 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te (5, 8, 17), (6, 2, 11)).toDF("a", "b", "c") - covar_tab.createOrReplaceTempView("covar_tab") + withTempView("covar_tab") { + covar_tab.createOrReplaceTempView("covar_tab") - checkAnswer( - spark.sql( - """ - |SELECT corr(b, c) FROM covar_tab WHERE a < 1 - """.stripMargin), - Row(null) :: Nil) + checkAnswer( + spark.sql( + """ + |SELECT corr(b, c) FROM covar_tab WHERE a < 1 + """.stripMargin), + Row(null) :: Nil) - checkAnswer( - spark.sql( - """ - |SELECT corr(b, c) FROM covar_tab WHERE a < 3 - """.stripMargin), - Row(null) :: Nil) + checkAnswer( + spark.sql( + """ + |SELECT corr(b, c) FROM covar_tab WHERE a < 3 + """.stripMargin), + Row(null) :: Nil) - checkAnswer( - spark.sql( - """ - |SELECT corr(b, c) FROM covar_tab WHERE a = 3 - """.stripMargin), - Row(Double.NaN) :: Nil) + checkAnswer( + spark.sql( + """ + |SELECT corr(b, c) FROM covar_tab WHERE a = 3 + """.stripMargin), + Row(Double.NaN) :: Nil) - checkAnswer( - spark.sql( - """ - |SELECT a, corr(b, c) FROM covar_tab GROUP BY a ORDER BY a - """.stripMargin), - Row(1, null) :: - Row(2, null) :: - Row(3, Double.NaN) :: - Row(4, Double.NaN) :: - Row(5, Double.NaN) :: - Row(6, Double.NaN) :: Nil) - - val corr7 = spark.sql("SELECT corr(b, c) FROM covar_tab").collect()(0).getDouble(0) - assert(math.abs(corr7 - 0.6633880657639323) < 1e-12) + checkAnswer( + spark.sql( + """ + |SELECT a, corr(b, c) FROM covar_tab GROUP BY a ORDER BY a + """.stripMargin), + Row(1, null) :: + Row(2, null) :: + Row(3, Double.NaN) :: + Row(4, Double.NaN) :: + Row(5, Double.NaN) :: + Row(6, Double.NaN) :: Nil) + + val corr7 = spark.sql("SELECT corr(b, c) FROM covar_tab").collect()(0).getDouble(0) + assert(math.abs(corr7 - 0.6633880657639323) < 1e-12) + } } test("covariance: covar_pop and covar_samp") { @@ -1049,36 +1055,42 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te class HashAggregationQuerySuite extends AggregationQuerySuite +@SlowHiveTest class HashAggregationQueryWithControlledFallbackSuite extends AggregationQuerySuite { override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { Seq("true", "false").foreach { enableTwoLevelMaps => withSQLConf(SQLConf.ENABLE_TWOLEVEL_AGG_MAP.key -> enableTwoLevelMaps) { - (1 to 3).foreach { fallbackStartsAt => - withSQLConf("spark.sql.TungstenAggregate.testFallbackStartsAt" -> - s"${(fallbackStartsAt - 1).toString}, ${fallbackStartsAt.toString}") { - // Create a new df to make sure its physical operator picks up - // 
spark.sql.TungstenAggregate.testFallbackStartsAt. - // todo: remove it? - val newActual = Dataset.ofRows(spark, actual.logicalPlan) - - QueryTest.getErrorMessageInCheckAnswer(newActual, expectedAnswer) match { - case Some(errorMessage) => - val newErrorMessage = - s""" - |The following aggregation query failed when using HashAggregate with - |controlled fallback (it falls back to bytes to bytes map once it has processed - |${fallbackStartsAt - 1} input rows and to sort-based aggregation once it has - |processed $fallbackStartsAt input rows). The query is ${actual.queryExecution} - | - |$errorMessage - """.stripMargin - - fail(newErrorMessage) - case None => // Success + Seq(4, 8).foreach { uaoSize => + UnsafeAlignedOffset.setUaoSize(uaoSize) + (1 to 3).foreach { fallbackStartsAt => + withSQLConf("spark.sql.TungstenAggregate.testFallbackStartsAt" -> + s"${(fallbackStartsAt - 1).toString}, ${fallbackStartsAt.toString}") { + // Create a new df to make sure its physical operator picks up + // spark.sql.TungstenAggregate.testFallbackStartsAt. + // todo: remove it? + val newActual = Dataset.ofRows(spark, actual.logicalPlan) + + QueryTest.getErrorMessageInCheckAnswer(newActual, expectedAnswer) match { + case Some(errorMessage) => + val newErrorMessage = + s""" + |The following aggregation query failed when using HashAggregate with + |controlled fallback (it falls back to bytes to bytes map once it has + |processed ${fallbackStartsAt - 1} input rows and to sort-based aggregation + |once it has processed $fallbackStartsAt input rows). + |The query is ${actual.queryExecution} + |$errorMessage + """.stripMargin + + fail(newErrorMessage) + case None => // Success + } } } + // reset static uaoSize to avoid affect other tests + UnsafeAlignedOffset.setUaoSize(0) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index dbbf2b29fe8b7..2b7cfe57fdae3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -117,6 +117,52 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } + test("show views") { + withView("show1a", "show2b", "global_temp.temp1", "temp2") { + sql("CREATE VIEW show1a AS SELECT 1 AS id") + sql("CREATE VIEW show2b AS SELECT 1 AS id") + sql("CREATE GLOBAL TEMP VIEW temp1 AS SELECT 1 AS id") + sql("CREATE TEMP VIEW temp2 AS SELECT 1 AS id") + checkAnswer( + sql("SHOW VIEWS"), + Row("default", "show1a", false) :: + Row("default", "show2b", false) :: + Row("default", "parquet_view1", false) :: + Row("", "temp2", true) :: Nil) + checkAnswer( + sql("SHOW VIEWS IN default"), + Row("default", "show1a", false) :: + Row("default", "show2b", false) :: + Row("default", "parquet_view1", false) :: + Row("", "temp2", true) :: Nil) + checkAnswer( + sql("SHOW VIEWS FROM default"), + Row("default", "show1a", false) :: + Row("default", "show2b", false) :: + Row("default", "parquet_view1", false) :: + Row("", "temp2", true) :: Nil) + checkAnswer( + sql("SHOW VIEWS FROM global_temp"), + Row("global_temp", "temp1", true) :: + Row("", "temp2", true) :: Nil) + checkAnswer( + sql("SHOW VIEWS 'show1*|show2*'"), + Row("default", "show1a", false) :: + Row("default", "show2b", false) :: Nil) + checkAnswer( + sql("SHOW VIEWS LIKE 'show1*|show2*'"), + Row("default", "show1a", false) :: + Row("default", "show2b", false) :: Nil) + 
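// Editor's note (inferred from the expected rows above, not part of this patch): each SHOW VIEWS
// row is (namespace, viewName, isTemporary); local temporary views report an empty namespace and
// global temporary views report the global_temp database.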
checkAnswer( + sql("SHOW VIEWS IN default 'show1*'"), + Row("default", "show1a", false) :: Nil) + checkAnswer( + sql("SHOW VIEWS IN default LIKE 'show1*|show2*'"), + Row("default", "show1a", false) :: + Row("default", "show2b", false) :: Nil) + } + } + test("show tblproperties of data source tables - basic") { checkAnswer( sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = 'my_key1'"), @@ -133,7 +179,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val message = intercept[AnalysisException] { sql("SHOW TBLPROPERTIES badtable") }.getMessage - assert(message.contains("Table not found: badtable")) + assert(message.contains("Table or view not found: badtable")) // When key is not found, a row containing the error is returned. checkAnswer( @@ -147,21 +193,6 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto checkAnswer(sql("SHOW TBLPROPERTIES parquet_tab2('`prop2Key`')"), Row("prop2Val")) } - test("show tblproperties for spark temporary table - AnalysisException is thrown") { - withTempView("parquet_temp") { - sql( - """ - |CREATE TEMPORARY VIEW parquet_temp (c1 INT, c2 STRING) - |USING org.apache.spark.sql.parquet.DefaultSource - """.stripMargin) - - val message = intercept[AnalysisException] { - sql("SHOW TBLPROPERTIES parquet_temp") - }.getMessage - assert(message.contains("parquet_temp is a temp view not table")) - } - } - Seq(true, false).foreach { local => val loadQuery = if (local) "LOAD DATA LOCAL" else "LOAD DATA" test(loadQuery) { @@ -444,7 +475,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val message2 = intercept[AnalysisException] { sql("SHOW PARTITIONS parquet_tab4 PARTITION(abcd=2015, xyz=1)") }.getMessage - assert(message2.contains("Non-partitioning column(s) [abcd, xyz] are specified")) + assert(message2.contains("abcd is not a valid partition column")) val message3 = intercept[AnalysisException] { sql("SHOW PARTITIONS parquet_view1") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 28e1db961f611..8b1f4c92755b9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -346,8 +346,7 @@ abstract class HiveComparisonTest val catalystResults = queryList.zip(hiveResults).map { case (queryString, hive) => val query = new TestHiveQueryExecution(queryString.replace("../../data", testDataPath)) def getResult(): Seq[String] = { - SQLExecution.withNewExecutionId( - query.sparkSession, query)(hiveResultString(query.executedPlan)) + SQLExecution.withNewExecutionId(query)(hiveResultString(query.executedPlan)) } try { (query, prepareAnswer(query, getResult())) } catch { case e: Throwable => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index ba48cfd4142f6..c8726c7ae4bc9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -28,7 +28,7 @@ import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.TableIdentifier -import 
org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, NoSuchPartitionsException, PartitionsAlreadyExistException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.connector.catalog.CatalogManager @@ -44,9 +44,11 @@ import org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.SlowHiveTest import org.apache.spark.util.Utils // TODO(gatorsmile): combine HiveCatalogedDDLSuite and HiveDDLSuite +@SlowHiveTest class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeAndAfterEach { override def afterEach(): Unit = { try { @@ -404,6 +406,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA } } +@SlowHiveTest class HiveDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach { import testImplicits._ @@ -602,8 +605,8 @@ class HiveDDLSuite val tab = "tab_with_partitions" withTempDir { tmpDir => val basePath = new File(tmpDir.getCanonicalPath) - val part1Path = new File(basePath + "/part1") - val part2Path = new File(basePath + "/part2") + val part1Path = new File(new File(basePath, "part10"), "part11") + val part2Path = new File(new File(basePath, "part20"), "part21") val dirSet = part1Path :: part2Path :: Nil // Before data insertion, all the directory are empty @@ -981,8 +984,8 @@ class HiveDDLSuite val expectedSerdePropsString = expectedSerdeProps.map { case (k, v) => s"'$k'='$v'" }.mkString(", ") val oldPart = catalog.getPartition(TableIdentifier("boxes"), Map("width" -> "4")) - assume(oldPart.storage.serde != Some(expectedSerde), "bad test: serde was already set") - assume(oldPart.storage.properties.filterKeys(expectedSerdeProps.contains) != + assert(oldPart.storage.serde != Some(expectedSerde), "bad test: serde was already set") + assert(oldPart.storage.properties.filterKeys(expectedSerdeProps.contains) != expectedSerdeProps, "bad test: serde properties were already set") sql(s"""ALTER TABLE boxes PARTITION (width=4) | SET SERDE '$expectedSerde' @@ -1188,7 +1191,7 @@ class HiveDDLSuite expectedDBUri, Map.empty)) // the database directory was created - assert(fs.exists(dbPath) && fs.isDirectory(dbPath)) + assert(fs.exists(dbPath) && fs.getFileStatus(dbPath).isDirectory) sql(s"USE $dbName") val tabName = "tab1" @@ -1580,6 +1583,12 @@ class HiveDDLSuite "source table/view path should be different from target table path") } + if (DDLUtils.isHiveTable(targetTable)) { + assert(targetTable.tracksPartitionsInCatalog) + } else { + assert(targetTable.tracksPartitionsInCatalog == sourceTable.tracksPartitionsInCatalog) + } + // The source table contents should not been seen in the target table. 
assert(spark.table(sourceTable.identifier).count() != 0, "the source table should be nonempty") assert(spark.table(targetTable.identifier).count() == 0, "the target table should be empty") @@ -1729,7 +1738,7 @@ class HiveDDLSuite Seq("json", "parquet").foreach { format => withTable("rectangles") { data.write.format(format).saveAsTable("rectangles") - assume(spark.table("rectangles").collect().nonEmpty, + assert(spark.table("rectangles").collect().nonEmpty, "bad test; table was empty to begin with") sql("TRUNCATE TABLE rectangles") @@ -2309,37 +2318,39 @@ class HiveDDLSuite implicit val _sqlContext = spark.sqlContext - Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word").createOrReplaceTempView("t1") - // Make a table and ensure it will be broadcast. - sql("""CREATE TABLE smallTable(word string, number int) - |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' - |STORED AS TEXTFILE - """.stripMargin) + withTempView("t1") { + Seq((1, "one"), (2, "two"), (4, "four")).toDF("number", "word").createOrReplaceTempView("t1") + // Make a table and ensure it will be broadcast. + sql("""CREATE TABLE smallTable(word string, number int) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |STORED AS TEXTFILE + """.stripMargin) - sql( - """INSERT INTO smallTable - |SELECT word, number from t1 - """.stripMargin) + sql( + """INSERT INTO smallTable + |SELECT word, number from t1 + """.stripMargin) - val inputData = MemoryStream[Int] - val joined = inputData.toDS().toDF() - .join(spark.table("smallTable"), $"value" === $"number") + val inputData = MemoryStream[Int] + val joined = inputData.toDS().toDF() + .join(spark.table("smallTable"), $"value" === $"number") - val sq = joined.writeStream - .format("memory") - .queryName("t2") - .start() - try { - inputData.addData(1, 2) + val sq = joined.writeStream + .format("memory") + .queryName("t2") + .start() + try { + inputData.addData(1, 2) - sq.processAllAvailable() + sq.processAllAvailable() - checkAnswer( - spark.table("t2"), - Seq(Row(1, "one", 1), Row(2, "two", 2)) - ) - } finally { - sq.stop() + checkAnswer( + spark.table("t2"), + Seq(Row(1, "one", 1), Row(2, "two", 2)) + ) + } finally { + sq.stop() + } } } @@ -2698,29 +2709,74 @@ class HiveDDLSuite } } - test("SPARK-30098: create table without provider should " + - "use default data source under non-legacy mode") { + test("SPARK-30785: create table like a partitioned table") { val catalog = spark.sessionState.catalog - withSQLConf( - SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "false") { - withTable("s") { - val defaultProvider = conf.defaultDataSourceName - sql("CREATE TABLE s(a INT, b INT)") - val table = catalog.getTableMetadata(TableIdentifier("s")) - assert(table.provider === Some(defaultProvider)) - } + withTable("sc_part", "ta_part") { + sql("CREATE TABLE sc_part (key string, ts int) USING parquet PARTITIONED BY (ts)") + sql("CREATE TABLE ta_part like sc_part") + val sourceTable = catalog.getTableMetadata(TableIdentifier("sc_part", Some("default"))) + val targetTable = catalog.getTableMetadata(TableIdentifier("ta_part", Some("default"))) + assert(sourceTable.tracksPartitionsInCatalog) + assert(targetTable.tracksPartitionsInCatalog) + assert(targetTable.partitionColumnNames == Seq("ts")) + sql("ALTER TABLE ta_part ADD PARTITION (ts=10)") // no exception + checkAnswer(sql("SHOW PARTITIONS ta_part"), Row("ts=10") :: Nil) } } - test("SPARK-30098: create table without provider should " + - "use hive under legacy mode") { - val catalog = 
spark.sessionState.catalog - withSQLConf( - SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "true") { - withTable("s") { - sql("CREATE TABLE s(a INT, b INT)") - val table = catalog.getTableMetadata(TableIdentifier("s")) - assert(table.provider === Some("hive")) + test("SPARK-31904: Fix case sensitive problem of char and varchar partition columns") { + withTable("t1", "t2") { + sql("CREATE TABLE t1(a STRING, B VARCHAR(10), C CHAR(10)) STORED AS parquet") + sql("CREATE TABLE t2 USING parquet PARTITIONED BY (b, c) AS SELECT * FROM t1") + // make sure there is no exception + assert(sql("SELECT * FROM t2 WHERE b = 'A'").collect().isEmpty) + assert(sql("SELECT * FROM t2 WHERE c = 'A'").collect().isEmpty) + } + } + + test("SPARK-33742: partition already exists") { + withTable("t") { + sql(s"CREATE TABLE t (data string) PARTITIONED BY (id bigint)") + sql(s"ALTER TABLE t ADD PARTITION (id=2) LOCATION 'loc1'") + + val errMsg = intercept[PartitionsAlreadyExistException] { + sql(s"ALTER TABLE t ADD PARTITION (id=1) LOCATION 'loc'" + + " PARTITION (id=2) LOCATION 'loc1'") + }.getMessage + assert(errMsg.contains("The following partitions already exists")) + + sql(s"ALTER TABLE t ADD IF NOT EXISTS PARTITION (id=1) LOCATION 'loc'" + + " PARTITION (id=2) LOCATION 'loc1'") + checkAnswer(sql("SHOW PARTITIONS t"), Seq(Row("id=1"), Row("id=2"))) + } + } + + test("SPARK-33788: partition not exists") { + withTable("t") { + sql(s"CREATE TABLE t (data string) PARTITIONED BY (id bigint)") + sql(s"ALTER TABLE t ADD PARTITION (id=1)") + + val errMsg = intercept[NoSuchPartitionsException] { + sql(s"ALTER TABLE t DROP PARTITION (id=1), PARTITION (id=2)") + }.getMessage + assert(errMsg.contains("partitions not found in table")) + + checkAnswer(sql("SHOW PARTITIONS t"), Seq(Row("id=1"))) + sql(s"ALTER TABLE t DROP IF EXISTS PARTITION (id=1), PARTITION (id=2)") + checkAnswer(sql("SHOW PARTITIONS t"), Seq.empty) + } + } + + test("SPARK-33844: Insert overwrite directory should check schema too") { + withView("v") { + spark.range(1).createTempView("v") + withTempPath { path => + val e = intercept[AnalysisException] { + spark.sql(s"INSERT OVERWRITE LOCAL DIRECTORY '${path.getCanonicalPath}' " + + s"STORED AS PARQUET SELECT ID, if(1=1, 1, 0), abs(id), '^-' FROM v") + }.getMessage + assert(e.contains("Attribute name \"(IF((1 = 1), 1, 0))\" contains" + + " invalid character(s) among \" ,;{}()\\n\\t=\". 
Please use alias to rename it.")) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index f9a4e2cd210e3..7a913e99fdbe6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.metrics.source.HiveCatalogMetrics import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecution import org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -133,22 +134,21 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto "src") } - test("explain output of physical plan should contain proper codegen stage ID") { - withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { - checkKeywordsExist(sql( - """ - |EXPLAIN SELECT t1.id AS a, t2.id AS b FROM - |(SELECT * FROM range(3)) t1 JOIN - |(SELECT * FROM range(10)) t2 ON t1.id == t2.id % 3 - """.stripMargin), - "== Physical Plan ==", - "*(2) Project ", - "+- *(2) BroadcastHashJoin ", - " :- BroadcastExchange ", - " : +- *(1) Range ", - " +- *(2) Range " - ) - } + test("explain output of physical plan should contain proper codegen stage ID", + DisableAdaptiveExecution("Adaptive explain is different")) { + checkKeywordsExist(sql( + """ + |EXPLAIN SELECT t1.id AS a, t2.id AS b FROM + |(SELECT * FROM range(3)) t1 JOIN + |(SELECT * FROM range(10)) t2 ON t1.id == t2.id % 3 + """.stripMargin), + "== Physical Plan ==", + "*(2) Project ", + "+- *(2) BroadcastHashJoin ", + " :- BroadcastExchange ", + " : +- *(1) Range ", + " +- *(2) Range " + ) } test("EXPLAIN CODEGEN command") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 5a8365017a5ba..e798a352f72bf 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.hive.execution import java.io.File import java.net.URI import java.sql.Timestamp -import java.util.{Locale, TimeZone} +import java.util.Locale import scala.util.Try @@ -39,6 +39,7 @@ import org.apache.spark.sql.hive.test.{HiveTestJars, TestHive} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.tags.SlowHiveTest case class TestData(a: Int, b: String) @@ -46,10 +47,8 @@ case class TestData(a: Int, b: String) * A set of test cases expressed in Hive QL that are not covered by the tests * included in the hive distribution. 
*/ +@SlowHiveTest class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAndAfter { - private val originalTimeZone = TimeZone.getDefault - private val originalLocale = Locale.getDefault - import org.apache.spark.sql.hive.test.TestHive.implicits._ private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled @@ -59,10 +58,6 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) - // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) - TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) - // Add Locale setting - Locale.setDefault(Locale.US) // Ensures that cross joins are enabled so that we can test them TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true) } @@ -70,8 +65,6 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd override def afterAll(): Unit = { try { TestHive.setCacheTables(false) - TimeZone.setDefault(originalTimeZone) - Locale.setDefault(originalLocale) sql("DROP TEMPORARY FUNCTION IF EXISTS udtf_count2") TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) } finally { @@ -565,33 +558,27 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd assert(1 == res.getDouble(0)) } - createQueryTest("timestamp cast #2", - "SELECT CAST(CAST(1.2 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1") - - test("timestamp cast #3") { - val res = sql("SELECT CAST(CAST(1200 AS TIMESTAMP) AS INT) FROM src LIMIT 1").collect().head - assert(1200 == res.getInt(0)) + test("timestamp cast #2") { + val res = sql("SELECT CAST(CAST(-1 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1").collect().head + assert(-1 == res.get(0)) } - createQueryTest("timestamp cast #4", + createQueryTest("timestamp cast #3", "SELECT CAST(CAST(1.2 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1") + createQueryTest("timestamp cast #4", + "SELECT CAST(CAST(-1.2 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1") + test("timestamp cast #5") { - val res = sql("SELECT CAST(CAST(-1 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1").collect().head - assert(-1 == res.get(0)) + val res = sql("SELECT CAST(CAST(1200 AS TIMESTAMP) AS INT) FROM src LIMIT 1").collect().head + assert(1200 == res.getInt(0)) } - createQueryTest("timestamp cast #6", - "SELECT CAST(CAST(-1.2 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1") - - test("timestamp cast #7") { + test("timestamp cast #6") { val res = sql("SELECT CAST(CAST(-1200 AS TIMESTAMP) AS INT) FROM src LIMIT 1").collect().head assert(-1200 == res.getInt(0)) } - createQueryTest("timestamp cast #8", - "SELECT CAST(CAST(-1.2 AS TIMESTAMP) AS DOUBLE) FROM src LIMIT 1") - createQueryTest("select null from table", "SELECT null FROM src LIMIT 1") @@ -698,15 +685,17 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd "SELECT srcalias.KEY, SRCALIAS.value FROM sRc SrCAlias WHERE SrCAlias.kEy < 15") test("case sensitivity: created temporary view") { - val testData = - TestHive.sparkContext.parallelize( - TestData(1, "str1") :: - TestData(2, "str2") :: Nil) - testData.toDF().createOrReplaceTempView("REGisteredTABle") - - assertResult(Array(Row(2, "str2"))) { - sql("SELECT tablealias.A, TABLEALIAS.b FROM reGisteredTABle TableAlias " + - "WHERE TableAliaS.a > 1").collect() + withTempView("REGisteredTABle") { + val testData = + TestHive.sparkContext.parallelize( + TestData(1, "str1") :: + TestData(2, "str2") :: Nil) + 
testData.toDF().createOrReplaceTempView("REGisteredTABle") + + assertResult(Array(Row(2, "str2"))) { + sql("SELECT tablealias.A, TABLEALIAS.b FROM reGisteredTABle TableAlias " + + "WHERE TableAliaS.a > 1").collect() + } } } @@ -725,16 +714,18 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd } test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") { - val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) - .zipWithIndex.map {case ((value, attr), key) => HavingRow(key, value, attr)} - TestHive.sparkContext.parallelize(fixture).toDF().createOrReplaceTempView("having_test") - val results = - sql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") - .collect() - .map(x => (x.getString(0), x.getInt(1))) + withTempView("having_test") { + val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)) + .zipWithIndex.map {case ((value, attr), key) => HavingRow(key, value, attr)} + TestHive.sparkContext.parallelize(fixture).toDF().createOrReplaceTempView("having_test") + val results = + sql("SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3") + .collect() + .map(x => (x.getString(0), x.getInt(1))) - assert(results === Array(("foo", 4))) - TestHive.reset() + assert(results === Array(("foo", 4))) + TestHive.reset() + } } test("SPARK-2180: HAVING with non-boolean clause raises no exceptions") { @@ -966,22 +957,24 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd } test("SPARK-3414 regression: should store analyzed logical plan when creating a temporary view") { - sparkContext.makeRDD(Seq.empty[LogEntry]).toDF().createOrReplaceTempView("rawLogs") - sparkContext.makeRDD(Seq.empty[LogFile]).toDF().createOrReplaceTempView("logFiles") + withTempView("rawLogs", "logFiles", "boom") { + sparkContext.makeRDD(Seq.empty[LogEntry]).toDF().createOrReplaceTempView("rawLogs") + sparkContext.makeRDD(Seq.empty[LogFile]).toDF().createOrReplaceTempView("logFiles") - sql( - """ - SELECT name, message - FROM rawLogs - JOIN ( - SELECT name - FROM logFiles - ) files - ON rawLogs.filename = files.name - """).createOrReplaceTempView("boom") - - // This should be successfully analyzed - sql("SELECT * FROM boom").queryExecution.analyzed + sql( + """ + SELECT name, message + FROM rawLogs + JOIN ( + SELECT name + FROM logFiles + ) files + ON rawLogs.filename = files.name + """).createOrReplaceTempView("boom") + + // This should be successfully analyzed + sql("SELECT * FROM boom").queryExecution.analyzed + } } test("SPARK-3810: PreprocessTableInsertion static partitioning support") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala index 5e6e114fc3fdc..fa43ff14fd796 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala @@ -80,8 +80,8 @@ class HiveSQLViewSuite extends SQLViewSuite with TestHiveSingleton { val e = intercept[AnalysisException] { sql(s"CREATE VIEW view1 AS SELECT $tempFunctionName(id) from tab1") }.getMessage - assert(e.contains("Not allowed to create a permanent view `view1` by referencing " + - s"a temporary function `$tempFunctionName`")) + assert(e.contains("Not allowed to create a permanent view `default`.`view1` by " + + s"referencing a temporary function `$tempFunctionName`")) } } } diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala index f8ba7bf2c1a62..ac9ae8c9229db 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala @@ -154,7 +154,7 @@ class HiveSerDeReadWriteSuite extends QueryTest with SQLTestUtils with TestHiveS } } - Seq("PARQUET", "ORC").foreach { fileFormat => + Seq("PARQUET", "ORC", "TEXTFILE").foreach { fileFormat => test(s"Read/Write Hive $fileFormat serde table") { // Numeric Types checkNumericTypes(fileFormat, "TINYINT", 2) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala index 9a1190af02fac..24b1e3405379c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala @@ -82,9 +82,12 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte }.head } + // Make sure we set the config values to TestHive.conf. + override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = + SQLConf.withExistingConf(TestHive.conf)(super.withSQLConf(pairs: _*)(f)) + test("Test the default fileformat for Hive-serde tables") { - withSQLConf("hive.default.fileformat" -> "orc", - SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "true") { + withSQLConf("hive.default.fileformat" -> "orc") { val (desc, exists) = extractTableDesc( "CREATE TABLE IF NOT EXISTS fileformat_test (id int)") assert(exists) @@ -93,8 +96,7 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte assert(desc.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) } - withSQLConf("hive.default.fileformat" -> "parquet", - SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "true") { + withSQLConf("hive.default.fileformat" -> "parquet") { val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") assert(exists) val input = desc.storage.inputFormat diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 7bca2af379934..d3c8428b71806 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.command.FunctionsCommand import org.apache.spark.sql.functions.max +import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils @@ -658,6 +659,25 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { } } + test("SPARK-32877: add test for Hive UDF complex decimal type") { + assume(HiveUtils.isHive23) + withUserDefinedFunction("testArraySum" -> false) { + sql(s"CREATE FUNCTION testArraySum AS '${classOf[ArraySumUDF].getName}'") + checkAnswer( + sql("SELECT testArraySum(array(1, 1.1, 1.2))"), + Seq(Row(3.3))) + + val msg = 
intercept[AnalysisException] { + sql("SELECT testArraySum(1)") + }.getMessage + assert(msg.contains(s"No handler for UDF/UDAF/UDTF '${classOf[ArraySumUDF].getName}'")) + + val msg2 = intercept[AnalysisException] { + sql("SELECT testArraySum(1, 2)") + }.getMessage + assert(msg2.contains(s"No handler for UDF/UDAF/UDTF '${classOf[ArraySumUDF].getName}'")) + } + } } class TestPair(x: Int, y: Int) extends Writable with Serializable { @@ -741,3 +761,14 @@ class StatelessUDF extends UDF { result } } + +class ArraySumUDF extends UDF { + import scala.collection.JavaConverters._ + def evaluate(values: java.util.List[java.lang.Double]): java.lang.Double = { + var r = 0d + for (v <- values.asScala) { + r += v + } + r + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala index b20ef035594da..6f37e39a532d6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/Hive_2_1_DDLSuite.scala @@ -27,13 +27,14 @@ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.types._ -import org.apache.spark.tags.ExtendedHiveTest +import org.apache.spark.tags.{ExtendedHiveTest, SlowHiveTest} import org.apache.spark.util.Utils /** * A separate set of DDL tests that uses Hive 2.1 libraries, which behave a little differently * from the built-in ones. */ +@SlowHiveTest @ExtendedHiveTest class Hive_2_1_DDLSuite extends SparkFunSuite with TestHiveSingleton with BeforeAndAfterEach with BeforeAndAfterAll { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala index 16668f93bd4e7..4d6dafd598a2e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala @@ -17,24 +17,13 @@ package org.apache.spark.sql.hive.execution +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.execution.metric.SQLMetricsTestUtils import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.SQLConf -class SQLMetricsSuite extends SQLMetricsTestUtils with TestHiveSingleton { - - var originalValue: String = _ - // With AQE on/off, the metric info is different. 
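Note: the ArraySumUDF class added to HiveUDFSuite above is a plain Hive UDF, so its contract can also be checked directly, without going through CREATE FUNCTION and the Hive registry. A minimal sanity check (illustrative only, not part of the patch) would be:

import scala.collection.JavaConverters._

// Call the UDF's evaluate method directly with a java.util.List of boxed
// doubles; the values are chosen so the expected sum is exact in floating point.
val result = new ArraySumUDF().evaluate(Seq[java.lang.Double](1.0, 2.0, 3.0).asJava)
assert(result == 6.0)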
- override def beforeAll(): Unit = { - super.beforeAll() - originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") - } - - override def afterAll(): Unit = { - spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) - super.afterAll() - } +// Disable AQE because metric info is different with AQE on/off +class SQLMetricsSuite extends SQLMetricsTestUtils with TestHiveSingleton + with DisableAdaptiveExecutionSuite { test("writing data out metrics: hive") { testMetricsNonDynamicPartition("hive", "t1") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 539b464743461..a46db32b9adee 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, Functio import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, CatalogUtils, HiveTableRelation} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} +import org.apache.spark.sql.execution.adaptive.{DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite} import org.apache.spark.sql.execution.command.{FunctionsCommand, LoadDataCommand} import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.functions._ @@ -42,6 +43,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.tags.SlowHiveTest import org.apache.spark.util.Utils case class Nested1(f1: Nested2) @@ -67,7 +69,7 @@ case class Order( * Hive to generate them (in contrast to HiveQuerySuite). Often this is because the query is * valid, but Hive currently cannot execute it. 
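Note: SQLMetricsSuite above now gets its AQE-off behaviour from the shared DisableAdaptiveExecutionSuite mixin instead of hand-rolled beforeAll/afterAll, and SQLQuerySuite is split into an abstract base so the same queries run once with AQE off and once with AQE on. As a rough idea of what such a mixin provides (an illustrative sketch only; the real trait in Spark's adaptive test utilities may be implemented differently):

import org.scalatest.{BeforeAndAfterAll, Suite}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

// Illustrative sketch of a suite-level mixin that pins AQE off for the whole
// suite and restores the previous setting afterwards.
trait DisableAqeForSuiteSketch extends BeforeAndAfterAll { self: Suite =>
  def spark: SparkSession
  private var original: Option[String] = None

  override protected def beforeAll(): Unit = {
    super.beforeAll()
    original = spark.conf.getOption(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key)
    spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false")
  }

  override protected def afterAll(): Unit = {
    try {
      original.fold(spark.conf.unset(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key)) { v =>
        spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, v)
      }
    } finally {
      super.afterAll()
    }
  }
}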
*/ -class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { +abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHiveSingleton { import hiveContext._ import spark.implicits._ @@ -88,24 +90,28 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("script") { - assume(TestUtils.testCommandAvailable("/bin/bash")) - assume(TestUtils.testCommandAvailable("echo | sed")) - val scriptFilePath = getTestResourcePath("test_script.sh") - val df = Seq(("x1", "y1", "z1"), ("x2", "y2", "z2")).toDF("c1", "c2", "c3") - df.createOrReplaceTempView("script_table") - val query1 = sql( - s""" - |SELECT col1 FROM (from(SELECT c1, c2, c3 FROM script_table) tempt_table - |REDUCE c1, c2, c3 USING 'bash $scriptFilePath' AS - |(col1 STRING, col2 STRING)) script_test_table""".stripMargin) - checkAnswer(query1, Row("x1_y1") :: Row("x2_y2") :: Nil) + withTempView("script_table") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + assume(TestUtils.testCommandAvailable("echo | sed")) + val scriptFilePath = getTestResourcePath("test_script.sh") + val df = Seq(("x1", "y1", "z1"), ("x2", "y2", "z2")).toDF("c1", "c2", "c3") + df.createOrReplaceTempView("script_table") + val query1 = sql( + s""" + |SELECT col1 FROM (from(SELECT c1, c2, c3 FROM script_table) tempt_table + |REDUCE c1, c2, c3 USING 'bash $scriptFilePath' AS + |(col1 STRING, col2 STRING)) script_test_table""".stripMargin) + checkAnswer(query1, Row("x1_y1") :: Row("x2_y2") :: Nil) + } } test("SPARK-6835: udtf in lateral view") { - val df = Seq((1, 1)).toDF("c1", "c2") - df.createOrReplaceTempView("table1") - val query = sql("SELECT c1, v FROM table1 LATERAL VIEW stack(3, 1, c1 + 1, c1 + 2) d AS v") - checkAnswer(query, Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: Nil) + withTempView("table1") { + val df = Seq((1, 1)).toDF("c1", "c2") + df.createOrReplaceTempView("table1") + val query = sql("SELECT c1, v FROM table1 LATERAL VIEW stack(3, 1, c1 + 1, c1 + 2) d AS v") + checkAnswer(query, Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: Nil) + } } test("SPARK-13651: generator outputs shouldn't be resolved from its child's output") { @@ -119,70 +125,72 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-6851: Self-joined converted parquet tables") { - val orders = Seq( - Order(1, "Atlas", "MTB", 234, "2015-01-07", "John D", "Pacifica", "CA", 20151), - Order(3, "Swift", "MTB", 285, "2015-01-17", "John S", "Redwood City", "CA", 20151), - Order(4, "Atlas", "Hybrid", 303, "2015-01-23", "Jones S", "San Mateo", "CA", 20151), - Order(7, "Next", "MTB", 356, "2015-01-04", "Jane D", "Daly City", "CA", 20151), - Order(10, "Next", "YFlikr", 187, "2015-01-09", "John D", "Fremont", "CA", 20151), - Order(11, "Swift", "YFlikr", 187, "2015-01-23", "John D", "Hayward", "CA", 20151), - Order(2, "Next", "Hybrid", 324, "2015-02-03", "Jane D", "Daly City", "CA", 20152), - Order(5, "Next", "Street", 187, "2015-02-08", "John D", "Fremont", "CA", 20152), - Order(6, "Atlas", "Street", 154, "2015-02-09", "John D", "Pacifica", "CA", 20152), - Order(8, "Swift", "Hybrid", 485, "2015-02-19", "John S", "Redwood City", "CA", 20152), - Order(9, "Atlas", "Split", 303, "2015-02-28", "Jones S", "San Mateo", "CA", 20152)) - - val orderUpdates = Seq( - Order(1, "Atlas", "MTB", 434, "2015-01-07", "John D", "Pacifica", "CA", 20151), - Order(11, "Swift", "YFlikr", 137, "2015-01-23", "John D", "Hayward", "CA", 20151)) - - orders.toDF.createOrReplaceTempView("orders1") - 
orderUpdates.toDF.createOrReplaceTempView("orderupdates1") - - withTable("orders", "orderupdates") { - sql( - """CREATE TABLE orders( - | id INT, - | make String, - | type String, - | price INT, - | pdate String, - | customer String, - | city String) - |PARTITIONED BY (state STRING, month INT) - |STORED AS PARQUET - """.stripMargin) + withTempView("orders1", "orderupdates1") { + val orders = Seq( + Order(1, "Atlas", "MTB", 234, "2015-01-07", "John D", "Pacifica", "CA", 20151), + Order(3, "Swift", "MTB", 285, "2015-01-17", "John S", "Redwood City", "CA", 20151), + Order(4, "Atlas", "Hybrid", 303, "2015-01-23", "Jones S", "San Mateo", "CA", 20151), + Order(7, "Next", "MTB", 356, "2015-01-04", "Jane D", "Daly City", "CA", 20151), + Order(10, "Next", "YFlikr", 187, "2015-01-09", "John D", "Fremont", "CA", 20151), + Order(11, "Swift", "YFlikr", 187, "2015-01-23", "John D", "Hayward", "CA", 20151), + Order(2, "Next", "Hybrid", 324, "2015-02-03", "Jane D", "Daly City", "CA", 20152), + Order(5, "Next", "Street", 187, "2015-02-08", "John D", "Fremont", "CA", 20152), + Order(6, "Atlas", "Street", 154, "2015-02-09", "John D", "Pacifica", "CA", 20152), + Order(8, "Swift", "Hybrid", 485, "2015-02-19", "John S", "Redwood City", "CA", 20152), + Order(9, "Atlas", "Split", 303, "2015-02-28", "Jones S", "San Mateo", "CA", 20152)) + + val orderUpdates = Seq( + Order(1, "Atlas", "MTB", 434, "2015-01-07", "John D", "Pacifica", "CA", 20151), + Order(11, "Swift", "YFlikr", 137, "2015-01-23", "John D", "Hayward", "CA", 20151)) + + orders.toDF.createOrReplaceTempView("orders1") + orderUpdates.toDF.createOrReplaceTempView("orderupdates1") + + withTable("orders", "orderupdates") { + sql( + """CREATE TABLE orders( + | id INT, + | make String, + | type String, + | price INT, + | pdate String, + | customer String, + | city String) + |PARTITIONED BY (state STRING, month INT) + |STORED AS PARQUET + """.stripMargin) - sql( - """CREATE TABLE orderupdates( - | id INT, - | make String, - | type String, - | price INT, - | pdate String, - | customer String, - | city String) - |PARTITIONED BY (state STRING, month INT) - |STORED AS PARQUET - """.stripMargin) + sql( + """CREATE TABLE orderupdates( + | id INT, + | make String, + | type String, + | price INT, + | pdate String, + | customer String, + | city String) + |PARTITIONED BY (state STRING, month INT) + |STORED AS PARQUET + """.stripMargin) - sql("set hive.exec.dynamic.partition.mode=nonstrict") - sql("INSERT INTO TABLE orders PARTITION(state, month) SELECT * FROM orders1") - sql("INSERT INTO TABLE orderupdates PARTITION(state, month) SELECT * FROM orderupdates1") + sql("set hive.exec.dynamic.partition.mode=nonstrict") + sql("INSERT INTO TABLE orders PARTITION(state, month) SELECT * FROM orders1") + sql("INSERT INTO TABLE orderupdates PARTITION(state, month) SELECT * FROM orderupdates1") - checkAnswer( - sql( - """ - |select orders.state, orders.month - |from orders - |join ( - | select distinct orders.state,orders.month - | from orders - | join orderupdates - | on orderupdates.id = orders.id) ao - | on ao.state = orders.state and ao.month = orders.month - """.stripMargin), - (1 to 6).map(_ => Row("CA", 20151))) + checkAnswer( + sql( + """ + |select orders.state, orders.month + |from orders + |join ( + | select distinct orders.state,orders.month + | from orders + | join orderupdates + | on orderupdates.id = orders.id) ao + | on ao.state = orders.state and ao.month = orders.month + """.stripMargin), + (1 to 6).map(_ => Row("CA", 20151))) + } } } @@ -335,71 +343,76 @@ class 
SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-5371: union with null and sum") { - val df = Seq((1, 1)).toDF("c1", "c2") - df.createOrReplaceTempView("table1") - - val query = sql( - """ - |SELECT - | MIN(c1), - | MIN(c2) - |FROM ( - | SELECT - | SUM(c1) c1, - | NULL c2 - | FROM table1 - | UNION ALL - | SELECT - | NULL c1, - | SUM(c2) c2 - | FROM table1 - |) a - """.stripMargin) - checkAnswer(query, Row(1, 1) :: Nil) - } - - test("CTAS with WITH clause") { + withTempView("table1") { + val df = Seq((1, 1)).toDF("c1", "c2") + df.createOrReplaceTempView("table1") - val df = Seq((1, 1)).toDF("c1", "c2") - df.createOrReplaceTempView("table1") - withTable("with_table1") { - sql( + val query = sql( """ - |CREATE TABLE with_table1 AS - |WITH T AS ( - | SELECT * + |SELECT + | MIN(c1), + | MIN(c2) + |FROM ( + | SELECT + | SUM(c1) c1, + | NULL c2 + | FROM table1 + | UNION ALL + | SELECT + | NULL c1, + | SUM(c2) c2 | FROM table1 - |) - |SELECT * - |FROM T + |) a """.stripMargin) - val query = sql("SELECT * FROM with_table1") checkAnswer(query, Row(1, 1) :: Nil) } } + test("CTAS with WITH clause") { + withTempView("table1") { + val df = Seq((1, 1)).toDF("c1", "c2") + df.createOrReplaceTempView("table1") + withTable("with_table1") { + sql( + """ + |CREATE TABLE with_table1 AS + |WITH T AS ( + | SELECT * + | FROM table1 + |) + |SELECT * + |FROM T + """.stripMargin) + val query = sql("SELECT * FROM with_table1") + checkAnswer(query, Row(1, 1) :: Nil) + } + } + } + test("explode nested Field") { - Seq(NestedArray1(NestedArray2(Seq(1, 2, 3)))).toDF.createOrReplaceTempView("nestedArray") - checkAnswer( - sql("SELECT ints FROM nestedArray LATERAL VIEW explode(a.b) a AS ints"), - Row(1) :: Row(2) :: Row(3) :: Nil) + withTempView("nestedArray") { + Seq(NestedArray1(NestedArray2(Seq(1, 2, 3)))).toDF.createOrReplaceTempView("nestedArray") + checkAnswer( + sql("SELECT ints FROM nestedArray LATERAL VIEW explode(a.b) a AS ints"), + Row(1) :: Row(2) :: Row(3) :: Nil) - checkAnswer( - sql("SELECT `ints` FROM nestedArray LATERAL VIEW explode(a.b) `a` AS `ints`"), - Row(1) :: Row(2) :: Row(3) :: Nil) + checkAnswer( + sql("SELECT `ints` FROM nestedArray LATERAL VIEW explode(a.b) `a` AS `ints`"), + Row(1) :: Row(2) :: Row(3) :: Nil) - checkAnswer( - sql("SELECT `a`.`ints` FROM nestedArray LATERAL VIEW explode(a.b) `a` AS `ints`"), - Row(1) :: Row(2) :: Row(3) :: Nil) + checkAnswer( + sql("SELECT `a`.`ints` FROM nestedArray LATERAL VIEW explode(a.b) `a` AS `ints`"), + Row(1) :: Row(2) :: Row(3) :: Nil) - checkAnswer( - sql( - """ - |SELECT `weird``tab`.`weird``col` - |FROM nestedArray - |LATERAL VIEW explode(a.b) `weird``tab` AS `weird``col` - """.stripMargin), - Row(1) :: Row(2) :: Row(3) :: Nil) + checkAnswer( + sql( + """ + |SELECT `weird``tab`.`weird``col` + |FROM nestedArray + |LATERAL VIEW explode(a.b) `weird``tab` AS `weird``col` + """.stripMargin), + Row(1) :: Row(2) :: Row(3) :: Nil) + } } test("SPARK-4512 Fix attribute reference resolution error when using SORT BY") { @@ -741,20 +754,22 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("double nested data") { - withTable("test_ctas_1234") { - sparkContext.parallelize(Nested1(Nested2(Nested3(1))) :: Nil) - .toDF().createOrReplaceTempView("nested") - checkAnswer( - sql("SELECT f1.f2.f3 FROM nested"), - Row(1)) + withTempView("nested") { + withTable("test_ctas_1234") { + sparkContext.parallelize(Nested1(Nested2(Nested3(1))) :: Nil) + .toDF().createOrReplaceTempView("nested") + 
checkAnswer( + sql("SELECT f1.f2.f3 FROM nested"), + Row(1)) - sql("CREATE TABLE test_ctas_1234 AS SELECT * from nested") - checkAnswer( - sql("SELECT * FROM test_ctas_1234"), - sql("SELECT * FROM nested").collect().toSeq) + sql("CREATE TABLE test_ctas_1234 AS SELECT * from nested") + checkAnswer( + sql("SELECT * FROM test_ctas_1234"), + sql("SELECT * FROM nested").collect().toSeq) - intercept[AnalysisException] { - sql("CREATE TABLE test_ctas_1234 AS SELECT * from notexists").collect() + intercept[AnalysisException] { + sql("CREATE TABLE test_ctas_1234 AS SELECT * from notexists").collect() + } } } } @@ -831,13 +846,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-4963 DataFrame sample on mutable row return wrong result") { - sql("SELECT * FROM src WHERE key % 2 = 0") - .sample(withReplacement = false, fraction = 0.3) - .createOrReplaceTempView("sampled") - (1 to 10).foreach { i => - checkAnswer( - sql("SELECT * FROM sampled WHERE key % 2 = 1"), - Seq.empty[Row]) + withTempView("sampled") { + sql("SELECT * FROM src WHERE key % 2 = 0") + .sample(withReplacement = false, fraction = 0.3) + .createOrReplaceTempView("sampled") + (1 to 10).foreach { i => + checkAnswer( + sql("SELECT * FROM sampled WHERE key % 2 = 1"), + Seq.empty[Row]) + } } } @@ -898,33 +915,38 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("resolve udtf in projection #1") { - val ds = (1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}""").toDS() - read.json(ds).createOrReplaceTempView("data") - val df = sql("SELECT explode(a) AS val FROM data") - val col = df("val") + withTempView("data") { + val ds = (1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}""").toDS() + read.json(ds).createOrReplaceTempView("data") + sql("SELECT explode(a) AS val FROM data") + } } test("resolve udtf in projection #2") { - val ds = (1 to 2).map(i => s"""{"a":[$i, ${i + 1}]}""").toDS() - read.json(ds).createOrReplaceTempView("data") - checkAnswer(sql("SELECT explode(map(1, 1)) FROM data LIMIT 1"), Row(1, 1) :: Nil) - checkAnswer(sql("SELECT explode(map(1, 1)) as (k1, k2) FROM data LIMIT 1"), Row(1, 1) :: Nil) - intercept[AnalysisException] { - sql("SELECT explode(map(1, 1)) as k1 FROM data LIMIT 1") - } + withTempView("data") { + val ds = (1 to 2).map(i => s"""{"a":[$i, ${i + 1}]}""").toDS() + read.json(ds).createOrReplaceTempView("data") + checkAnswer(sql("SELECT explode(map(1, 1)) FROM data LIMIT 1"), Row(1, 1) :: Nil) + checkAnswer(sql("SELECT explode(map(1, 1)) as (k1, k2) FROM data LIMIT 1"), Row(1, 1) :: Nil) + intercept[AnalysisException] { + sql("SELECT explode(map(1, 1)) as k1 FROM data LIMIT 1") + } - intercept[AnalysisException] { - sql("SELECT explode(map(1, 1)) as (k1, k2, k3) FROM data LIMIT 1") + intercept[AnalysisException] { + sql("SELECT explode(map(1, 1)) as (k1, k2, k3) FROM data LIMIT 1") + } } } // TGF with non-TGF in project is allowed in Spark SQL, but not in Hive test("TGF with non-TGF in projection") { - val ds = Seq("""{"a": "1", "b":"1"}""").toDS() - read.json(ds).createOrReplaceTempView("data") - checkAnswer( - sql("SELECT explode(map(a, b)) as (k1, k2), a, b FROM data"), - Row("1", "1", "1", "1") :: Nil) + withTempView("data") { + val ds = Seq("""{"a": "1", "b":"1"}""").toDS() + read.json(ds).createOrReplaceTempView("data") + checkAnswer( + sql("SELECT explode(map(a, b)) as (k1, k2), a, b FROM data"), + Row("1", "1", "1", "1") :: Nil) + } } test("logical.Project should not be resolved if it contains aggregates or generators") { @@ -974,47 
+996,58 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-5203 union with different decimal precision") { - Seq.empty[(java.math.BigDecimal, java.math.BigDecimal)] - .toDF("d1", "d2") - .select($"d1".cast(DecimalType(10, 5)).as("d")) - .createOrReplaceTempView("dn") + withTempView("dn") { + Seq.empty[(java.math.BigDecimal, java.math.BigDecimal)] + .toDF("d1", "d2") + .select($"d1".cast(DecimalType(10, 5)).as("d")) + .createOrReplaceTempView("dn") - sql("select d from dn union all select d * 2 from dn") - .queryExecution.analyzed + sql("select d from dn union all select d * 2 from dn") + .queryExecution.analyzed + } } test("Star Expansion - script transform") { - assume(TestUtils.testCommandAvailable("/bin/bash")) - val data = (1 to 100000).map { i => (i, i, i) } - data.toDF("d1", "d2", "d3").createOrReplaceTempView("script_trans") - assert(100000 === sql("SELECT TRANSFORM (*) USING 'cat' FROM script_trans").count()) + withTempView("script_trans") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + val data = (1 to 100000).map { i => (i, i, i) } + data.toDF("d1", "d2", "d3").createOrReplaceTempView("script_trans") + assert(100000 === sql("SELECT TRANSFORM (*) USING 'cat' FROM script_trans").count()) + } } test("test script transform for stdout") { - assume(TestUtils.testCommandAvailable("/bin/bash")) - val data = (1 to 100000).map { i => (i, i, i) } - data.toDF("d1", "d2", "d3").createOrReplaceTempView("script_trans") - assert(100000 === - sql("SELECT TRANSFORM (d1, d2, d3) USING 'cat' AS (a,b,c) FROM script_trans").count()) + withTempView("script_trans") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + val data = (1 to 100000).map { i => (i, i, i) } + data.toDF("d1", "d2", "d3").createOrReplaceTempView("script_trans") + assert(100000 === + sql("SELECT TRANSFORM (d1, d2, d3) USING 'cat' AS (a,b,c) FROM script_trans").count()) + } } test("test script transform for stderr") { - assume(TestUtils.testCommandAvailable("/bin/bash")) - val data = (1 to 100000).map { i => (i, i, i) } - data.toDF("d1", "d2", "d3").createOrReplaceTempView("script_trans") - assert(0 === - sql("SELECT TRANSFORM (d1, d2, d3) USING 'cat 1>&2' AS (a,b,c) FROM script_trans").count()) + withTempView("script_trans") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + val data = (1 to 100000).map { i => (i, i, i) } + data.toDF("d1", "d2", "d3").createOrReplaceTempView("script_trans") + assert(0 === + sql("SELECT TRANSFORM (d1, d2, d3) USING 'cat 1>&2' AS (a,b,c) FROM script_trans").count()) + } } test("test script transform data type") { - assume(TestUtils.testCommandAvailable("/bin/bash")) - val data = (1 to 5).map { i => (i, i) } - data.toDF("key", "value").createOrReplaceTempView("test") - checkAnswer( - sql("""FROM - |(FROM test SELECT TRANSFORM(key, value) USING 'cat' AS (`thing1` int, thing2 string)) t - |SELECT thing1 + 1 - """.stripMargin), (2 to 6).map(i => Row(i))) + withTempView("test") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + val data = (1 to 5).map { i => (i, i) } + data.toDF("key", "value").createOrReplaceTempView("test") + checkAnswer( + sql( + """FROM + |(FROM test SELECT TRANSFORM(key, value) USING 'cat' AS (`thing1` int, thing2 string)) t + |SELECT thing1 + 1 + """.stripMargin), (2 to 6).map(i => Row(i))) + } } test("Sorting columns are not in Generate") { @@ -1045,27 +1078,31 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("test case key when") { - (1 to 5).map(i => (i, 
i.toString)).toDF("k", "v").createOrReplaceTempView("t") - checkAnswer( - sql("SELECT CASE k WHEN 2 THEN 22 WHEN 4 THEN 44 ELSE 0 END, v FROM t"), - Row(0, "1") :: Row(22, "2") :: Row(0, "3") :: Row(44, "4") :: Row(0, "5") :: Nil) + withTempView("t") { + (1 to 5).map(i => (i, i.toString)).toDF("k", "v").createOrReplaceTempView("t") + checkAnswer( + sql("SELECT CASE k WHEN 2 THEN 22 WHEN 4 THEN 44 ELSE 0 END, v FROM t"), + Row(0, "1") :: Row(22, "2") :: Row(0, "3") :: Row(44, "4") :: Row(0, "5") :: Nil) + } } test("SPARK-7269 Check analysis failed in case in-sensitive") { - Seq(1, 2, 3).map { i => - (i.toString, i.toString) - }.toDF("key", "value").createOrReplaceTempView("df_analysis") - sql("SELECT kEy from df_analysis group by key").collect() - sql("SELECT kEy+3 from df_analysis group by key+3").collect() - sql("SELECT kEy+3, a.kEy, A.kEy from df_analysis A group by key").collect() - sql("SELECT cast(kEy+1 as Int) from df_analysis A group by cast(key+1 as int)").collect() - sql("SELECT cast(kEy+1 as Int) from df_analysis A group by key+1").collect() - sql("SELECT 2 from df_analysis A group by key+1").collect() - intercept[AnalysisException] { - sql("SELECT kEy+1 from df_analysis group by key+3") - } - intercept[AnalysisException] { - sql("SELECT cast(key+2 as Int) from df_analysis A group by cast(key+1 as int)") + withTempView("df_analysis") { + Seq(1, 2, 3).map { i => + (i.toString, i.toString) + }.toDF("key", "value").createOrReplaceTempView("df_analysis") + sql("SELECT kEy from df_analysis group by key").collect() + sql("SELECT kEy+3 from df_analysis group by key+3").collect() + sql("SELECT kEy+3, a.kEy, A.kEy from df_analysis A group by key").collect() + sql("SELECT cast(kEy+1 as Int) from df_analysis A group by cast(key+1 as int)").collect() + sql("SELECT cast(kEy+1 as Int) from df_analysis A group by key+1").collect() + sql("SELECT 2 from df_analysis A group by key+1").collect() + intercept[AnalysisException] { + sql("SELECT kEy+1 from df_analysis group by key+3") + } + intercept[AnalysisException] { + sql("SELECT cast(key+2 as Int) from df_analysis A group by cast(key+1 as int)") + } } } @@ -1178,10 +1215,12 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("SPARK-9371: fix the support for special chars in column names for hive context") { - val ds = Seq("""{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""").toDS() - read.json(ds).createOrReplaceTempView("t") + withTempView("t") { + val ds = Seq("""{"a": {"c.b": 1}, "b.$q": [{"a@!.q": 1}], "q.w": {"w.i&": [1]}}""").toDS() + read.json(ds).createOrReplaceTempView("t") - checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1)) + checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1)) + } } test("specifying database name for a temporary view is not allowed") { @@ -1236,43 +1275,47 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ignore("SPARK-10310: " + "script transformation using default input/output SerDe and record reader/writer") { - spark - .range(5) - .selectExpr("id AS a", "id AS b") - .createOrReplaceTempView("test") + withTempView("test") { + spark + .range(5) + .selectExpr("id AS a", "id AS b") + .createOrReplaceTempView("test") - val scriptFilePath = getTestResourcePath("data") - checkAnswer( - sql( - s"""FROM( - | FROM test SELECT TRANSFORM(a, b) - | USING 'python $scriptFilePath/scripts/test_transform.py "\t"' - | AS (c STRING, d STRING) - |) t - |SELECT c - 
""".stripMargin), - (0 until 5).map(i => Row(i + "#"))) + val scriptFilePath = getTestResourcePath("data") + checkAnswer( + sql( + s"""FROM( + | FROM test SELECT TRANSFORM(a, b) + | USING 'python $scriptFilePath/scripts/test_transform.py "\t"' + | AS (c STRING, d STRING) + |) t + |SELECT c + """.stripMargin), + (0 until 5).map(i => Row(i + "#"))) + } } ignore("SPARK-10310: script transformation using LazySimpleSerDe") { - spark - .range(5) - .selectExpr("id AS a", "id AS b") - .createOrReplaceTempView("test") - - val scriptFilePath = getTestResourcePath("data") - val df = sql( - s"""FROM test - |SELECT TRANSFORM(a, b) - |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' - |WITH SERDEPROPERTIES('field.delim' = '|') - |USING 'python $scriptFilePath/scripts/test_transform.py "|"' - |AS (c STRING, d STRING) - |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' - |WITH SERDEPROPERTIES('field.delim' = '|') - """.stripMargin) + withTempView("test") { + spark + .range(5) + .selectExpr("id AS a", "id AS b") + .createOrReplaceTempView("test") + + val scriptFilePath = getTestResourcePath("data") + val df = sql( + s"""FROM test + |SELECT TRANSFORM(a, b) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES('field.delim' = '|') + |USING 'python $scriptFilePath/scripts/test_transform.py "|"' + |AS (c STRING, d STRING) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES('field.delim' = '|') + """.stripMargin) - checkAnswer(df, (0 until 5).map(i => Row(i + "#", i + "#"))) + checkAnswer(df, (0 until 5).map(i => Row(i + "#", i + "#"))) + } } test("SPARK-10741: Sort on Aggregate using parquet") { @@ -2493,50 +2536,32 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } - test("SPARK-26560 Spark should be able to run Hive UDF using jar regardless of " + - "current thread context classloader") { - // force to use Spark classloader as other test (even in other test suites) may change the - // current thread's context classloader to jar classloader - Utils.withContextClassLoader(Utils.getSparkClassLoader) { - withUserDefinedFunction("udtf_count3" -> false) { - val sparkClassLoader = Thread.currentThread().getContextClassLoader - - // This jar file should not be placed to the classpath; GenericUDTFCount3 is slightly - // modified version of GenericUDTFCount2 in hive/contrib, which emits the count for - // three times. - val jarPath = "src/test/noclasspath/TestUDTF-spark-26560.jar" - val jarURL = s"file://${System.getProperty("user.dir")}/$jarPath" - - sql( - s""" - |CREATE FUNCTION udtf_count3 - |AS 'org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount3' - |USING JAR '$jarURL' - """.stripMargin) - - assert(Thread.currentThread().getContextClassLoader eq sparkClassLoader) + test("SPARK-31522: hive metastore related configurations should be static") { + Seq("spark.sql.hive.metastore.version", + "spark.sql.hive.metastore.jars", + "spark.sql.hive.metastore.sharedPrefixes", + "spark.sql.hive.metastore.barrierPrefixes").foreach { key => + val e = intercept[AnalysisException](sql(s"set $key=abc")) + assert(e.getMessage.contains("Cannot modify the value of a static config")) + } + } - // JAR will be loaded at first usage, and it will change the current thread's - // context classloader to jar classloader in sharedState. - // See SessionState.addJar for details. 
- checkAnswer( - sql("SELECT udtf_count3(a) FROM (SELECT 1 AS a FROM src LIMIT 3) t"), - Row(3) :: Row(3) :: Row(3) :: Nil) - - assert(Thread.currentThread().getContextClassLoader ne sparkClassLoader) - assert(Thread.currentThread().getContextClassLoader eq - spark.sqlContext.sharedState.jarClassLoader) - - // Roll back to the original classloader and run query again. Without this line, the test - // would pass, as thread's context classloader is changed to jar classloader. But thread - // context classloader can be changed from others as well which would fail the query; one - // example is spark-shell, which thread context classloader rolls back automatically. This - // mimics the behavior of spark-shell. - Thread.currentThread().setContextClassLoader(sparkClassLoader) - checkAnswer( - sql("SELECT udtf_count3(a) FROM (SELECT 1 AS a FROM src LIMIT 3) t"), - Row(3) :: Row(3) :: Row(3) :: Nil) + test("SPARK-29295: dynamic partition map parsed from partition path should be case insensitive") { + withTable("t") { + withSQLConf("hive.exec.dynamic.partition" -> "true", + "hive.exec.dynamic.partition.mode" -> "nonstrict") { + withTempDir { loc => + sql(s"CREATE TABLE t(c1 INT) PARTITIONED BY(P1 STRING) LOCATION '${loc.getAbsolutePath}'") + sql("INSERT OVERWRITE TABLE t PARTITION(P1) VALUES(1, 'caseSensitive')") + checkAnswer(sql("select * from t"), Row(1, "caseSensitive")) + } } } } } + +@SlowHiveTest +class SQLQuerySuite extends SQLQuerySuiteBase with DisableAdaptiveExecutionSuite +@SlowHiveTest +class SQLQuerySuiteAE extends SQLQuerySuiteBase with EnableAdaptiveExecutionSuite + diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index 80a50c18bcb93..0d1fe20130c0e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive.execution +import java.sql.Timestamp + import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.scalatest.Assertions._ import org.scalatest.BeforeAndAfterEach @@ -24,15 +26,19 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.{SparkException, TaskContext, TestUtils} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest, UnaryExecNode} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StringType -class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton with - BeforeAndAfterEach { +class ScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with TestHiveSingleton + with BeforeAndAfterEach { import spark.implicits._ private val noSerdeIOSchema = HiveScriptIOSchema( @@ -56,6 +62,14 @@ class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton wit private val uncaughtExceptionHandler = new TestUncaughtExceptionHandler + // In Hive 1.2, the string representation of a decimal omits trailing zeroes. 
+ // But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary. + val decimalToString: Column => Column = if (HiveUtils.isHive23) { + c => c.cast("string") + } else { + c => c.cast("decimal(1, 0)").cast("string") + } + protected override def beforeAll(): Unit = { super.beforeAll() defaultUncaughtExceptionHandler = Thread.getDefaultUncaughtExceptionHandler @@ -186,6 +200,132 @@ class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton wit rowsDf.select("name").collect()) assert(uncaughtExceptionHandler.exception.isEmpty) } + + test("SPARK-25990: TRANSFORM should handle different data types correctly") { + assume(TestUtils.testCommandAvailable("python")) + val scriptFilePath = getTestResourcePath("test_script.py") + + withTempView("v") { + val df = Seq( + (1, "1", 1.0, BigDecimal(1.0), new Timestamp(1)), + (2, "2", 2.0, BigDecimal(2.0), new Timestamp(2)), + (3, "3", 3.0, BigDecimal(3.0), new Timestamp(3)) + ).toDF("a", "b", "c", "d", "e") // Note column d's data type is Decimal(38, 18) + df.createTempView("v") + + val query = sql( + s""" + |SELECT + |TRANSFORM(a, b, c, d, e) + |USING 'python $scriptFilePath' AS (a, b, c, d, e) + |FROM v + """.stripMargin) + + checkAnswer(query, identity, df.select( + 'a.cast("string"), + 'b.cast("string"), + 'c.cast("string"), + decimalToString('d), + 'e.cast("string")).collect()) + } + } + + test("SPARK-30973: TRANSFORM should wait for the termination of the script (no serde)") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + + val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a") + val e = intercept[SparkException] { + val plan = + new ScriptTransformationExec( + input = Seq(rowsDf.col("a").expr), + script = "some_non_existent_command", + output = Seq(AttributeReference("a", StringType)()), + child = rowsDf.queryExecution.sparkPlan, + ioschema = noSerdeIOSchema) + SparkPlanTest.executePlan(plan, hiveContext) + } + assert(e.getMessage.contains("Subprocess exited with status")) + assert(uncaughtExceptionHandler.exception.isEmpty) + } + + test("SPARK-30973: TRANSFORM should wait for the termination of the script (with serde)") { + assume(TestUtils.testCommandAvailable("/bin/bash")) + + val rowsDf = Seq("a", "b", "c").map(Tuple1.apply).toDF("a") + val e = intercept[SparkException] { + val plan = + new ScriptTransformationExec( + input = Seq(rowsDf.col("a").expr), + script = "some_non_existent_command", + output = Seq(AttributeReference("a", StringType)()), + child = rowsDf.queryExecution.sparkPlan, + ioschema = serdeIOSchema) + SparkPlanTest.executePlan(plan, hiveContext) + } + assert(e.getMessage.contains("Subprocess exited with status")) + assert(uncaughtExceptionHandler.exception.isEmpty) + } + + + test("SPARK-32608: Script Transform ROW FORMAT DELIMIT value should format value") { + withTempView("v") { + val df = Seq( + (1, "1", 1.0, BigDecimal(1.0), new Timestamp(1)), + (2, "2", 2.0, BigDecimal(2.0), new Timestamp(2)), + (3, "3", 3.0, BigDecimal(3.0), new Timestamp(3)) + ).toDF("a", "b", "c", "d", "e") // Note column d's data type is Decimal(38, 18) + df.createTempView("v") + + // input/output with same delimit + checkAnswer( + sql( + s""" + |SELECT TRANSFORM(a, b, c, d, cast(e as string)) + | ROW FORMAT DELIMITED + | FIELDS TERMINATED BY ',' + | COLLECTION ITEMS TERMINATED BY '#' + | MAP KEYS TERMINATED BY '@' + | LINES TERMINATED BY '\n' + | NULL DEFINED AS 'null' + | USING 'cat' AS (a, b, c, d, e) + | ROW FORMAT DELIMITED + | FIELDS TERMINATED BY ',' + | COLLECTION ITEMS TERMINATED BY '#' + | 
MAP KEYS TERMINATED BY '@' + | LINES TERMINATED BY '\n' + | NULL DEFINED AS 'NULL' + |FROM v + """.stripMargin), identity, df.select( + 'a.cast("string"), + 'b.cast("string"), + 'c.cast("string"), + 'd.cast("string"), + 'e.cast("string")).collect()) + + // input/output with different delimit and show result + checkAnswer( + sql( + s""" + |SELECT TRANSFORM(a, b, c, d, cast(e as string)) + | ROW FORMAT DELIMITED + | FIELDS TERMINATED BY ',' + | LINES TERMINATED BY '\n' + | NULL DEFINED AS 'null' + | USING 'cat' AS (value) + | ROW FORMAT DELIMITED + | FIELDS TERMINATED BY '&' + | LINES TERMINATED BY '\n' + | NULL DEFINED AS 'NULL' + |FROM v + """.stripMargin), identity, df.select( + concat_ws(",", + 'a.cast("string"), + 'b.cast("string"), + 'c.cast("string"), + 'd.cast("string"), + 'e.cast("string"))).collect()) + } + } } private case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExecNode { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala index e6856a58b0ea9..1f1a5568b0201 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala @@ -119,6 +119,27 @@ object CountSerDeAgg extends Aggregator[Int, CountSerDeSQL, CountSerDeSQL] { def outputEncoder: Encoder[CountSerDeSQL] = ExpressionEncoder[CountSerDeSQL]() } +object ArrayDataAgg extends Aggregator[Array[Double], Array[Double], Array[Double]] { + def zero: Array[Double] = Array(0.0, 0.0, 0.0) + def reduce(s: Array[Double], array: Array[Double]): Array[Double] = { + require(s.length == array.length) + for ( j <- 0 until s.length ) { + s(j) += array(j) + } + s + } + def merge(s1: Array[Double], s2: Array[Double]): Array[Double] = { + require(s1.length == s2.length) + for ( j <- 0 until s1.length ) { + s1(j) += s2(j) + } + s1 + } + def finish(s: Array[Double]): Array[Double] = s + def bufferEncoder: Encoder[Array[Double]] = ExpressionEncoder[Array[Double]] + def outputEncoder: Encoder[Array[Double]] = ExpressionEncoder[Array[Double]] +} + abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import testImplicits._ @@ -156,20 +177,11 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi (3, null, null)).toDF("key", "value1", "value2") data2.write.saveAsTable("agg2") - val data3 = Seq[(Seq[Integer], Integer, Integer)]( - (Seq[Integer](1, 1), 10, -10), - (Seq[Integer](null), -60, 60), - (Seq[Integer](1, 1), 30, -30), - (Seq[Integer](1), 30, 30), - (Seq[Integer](2), 1, 1), - (null, -10, 10), - (Seq[Integer](2, 3), -1, null), - (Seq[Integer](2, 3), 1, 1), - (Seq[Integer](2, 3, 4), null, 1), - (Seq[Integer](null), 100, -10), - (Seq[Integer](3), null, 3), - (null, null, null), - (Seq[Integer](3), null, null)).toDF("key", "value1", "value2") + val data3 = Seq[(Seq[Double], Int)]( + (Seq(1.0, 2.0, 3.0), 0), + (Seq(4.0, 5.0, 6.0), 0), + (Seq(7.0, 8.0, 9.0), 0) + ).toDF("data", "dummy") data3.write.saveAsTable("agg3") val data4 = Seq[Boolean](true, false, true).toDF("boolvalues") @@ -184,6 +196,7 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi spark.udf.register("mydoublesum", udaf(MyDoubleSumAgg)) spark.udf.register("mydoubleavg", udaf(MyDoubleAvgAgg)) spark.udf.register("longProductSum", udaf(LongProductSumAgg)) + spark.udf.register("arraysum", udaf(ArrayDataAgg)) } override def afterAll(): Unit = 
{ @@ -354,6 +367,12 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi Row(3, 0, null, 1, 3, 0, 0, 0, null, 1, 3, 0, 2, 2) :: Nil) } + test("SPARK-32159: array encoders should be resolved in analyzer") { + checkAnswer( + spark.sql("SELECT arraysum(data) FROM agg3"), + Row(Seq(12.0, 15.0, 18.0)) :: Nil) + } + test("verify aggregator ser/de behavior") { val data = sparkContext.parallelize((1 to 100).toSeq, 3).toDF("value1") val agg = udaf(CountSerDeAgg) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala index 990d9425fb7fc..12ee5bea7c2f9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala @@ -288,4 +288,32 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + test("SPARK-32234 read ORC table with column names all starting with '_col'") { + Seq("native", "hive").foreach { orcImpl => + Seq("false", "true").foreach { vectorized => + withSQLConf( + SQLConf.ORC_IMPLEMENTATION.key -> orcImpl, + SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized) { + withTable("test_hive_orc_impl") { + spark.sql( + s""" + | CREATE TABLE test_hive_orc_impl + | (_col1 INT, _col2 STRING, _col3 INT) + | STORED AS ORC + """.stripMargin) + spark.sql( + s""" + | INSERT INTO + | test_hive_orc_impl + | VALUES(9, '12', 2020) + """.stripMargin) + + val df = spark.sql("SELECT _col2 FROM test_hive_orc_impl") + checkAnswer(df, Row("12")) + } + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index f3e712d6c0a4a..91fd8a47339fc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -320,4 +320,11 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { } } } + + test("SPARK-31580: Read a file written before ORC-569") { + assume(HiveUtils.isHive23) // Hive 1.2 doesn't use Apache ORC + // Test ORC file came from ORC-621 + val df = readResourceOrcFile("test-data/TestStringDictionary.testRowIndex.orc") + assert(df.where("str < 'row 001000'").count() === 1000) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala index cc4592a5caf68..a3e2444cae887 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation} +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.CacheTableCommand import org.apache.spark.sql.hive._ @@ -212,7 +213,7 @@ private[hive] class TestHiveSparkSession( } } - assume(sc.conf.get(CATALOG_IMPLEMENTATION) == "hive") + assert(sc.conf.get(CATALOG_IMPLEMENTATION) == "hive") @transient override lazy val sharedState: 
TestHiveSharedState = { @@ -233,16 +234,16 @@ private[hive] class TestHiveSparkSession( * Dataset.ofRows that creates a TestHiveQueryExecution (rather than a normal QueryExecution * which wouldn't load all the test tables). */ - override def sql(sqlText: String): DataFrame = { + override def sql(sqlText: String): DataFrame = withActive { val plan = sessionState.sqlParser.parsePlan(sqlText) Dataset.ofRows(self, plan) } - override def newSession(): TestHiveSparkSession = { + override def newSession(): TestHiveSparkSession = withActive { new TestHiveSparkSession(sc, Some(sharedState), None, loadTestTables) } - override def cloneSession(): SparkSession = { + override def cloneSession(): SparkSession = withActive { val result = new TestHiveSparkSession( sparkContext, Some(sharedState), @@ -263,7 +264,10 @@ private[hive] class TestHiveSparkSession( System.clearProperty("spark.hostPort") // For some hive test case which contain ${system:test.tmp.dir} - System.setProperty("test.tmp.dir", Utils.createTempDir().toURI.getPath) + // Make sure it is not called again when cloning sessions. + if (parentSessionState.isEmpty) { + System.setProperty("test.tmp.dir", Utils.createTempDir().toURI.getPath) + } /** The location of the compiled hive distribution */ lazy val hiveHome = envVarToFile("HIVE_HOME") @@ -501,7 +505,7 @@ private[hive] class TestHiveSparkSession( // has already set the execution id. if (sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) == null) { // We don't actually have a `QueryExecution` here, use a fake one instead. - SQLExecution.withNewExecutionId(this, new QueryExecution(this, OneRowRelation())) { + SQLExecution.withNewExecutionId(new QueryExecution(this, OneRowRelation())) { createCmds.foreach(_()) } } else { @@ -586,22 +590,31 @@ private[hive] class TestHiveQueryExecution( this(TestHive.sparkSession, sql) } - override lazy val analyzed: LogicalPlan = { + override lazy val analyzed: LogicalPlan = sparkSession.withActive { val describedTables = logical match { - case CacheTableCommand(tbl, _, _, _) => tbl.table :: Nil + case CacheTableCommand(tbl, _, _, _) => tbl :: Nil case _ => Nil } // Make sure any test tables referenced are loaded. val referencedTables = describedTables ++ - logical.collect { case UnresolvedRelation(ident) => ident.last } + logical.collect { case UnresolvedRelation(ident) => ident.asTableIdentifier } val resolver = sparkSession.sessionState.conf.resolver - val referencedTestTables = sparkSession.testTables.keys.filter { testTable => - referencedTables.exists(resolver(_, testTable)) + val referencedTestTables = referencedTables.flatMap { tbl => + val testTableOpt = sparkSession.testTables.keys.find(resolver(_, tbl.table)) + testTableOpt.map(testTable => tbl.copy(table = testTable)) + } + logDebug(s"Query references test tables: ${referencedTestTables.map(_.table).mkString(", ")}") + referencedTestTables.foreach { tbl => + val curDB = sparkSession.catalog.currentDatabase + try { + tbl.database.foreach(db => sparkSession.catalog.setCurrentDatabase(db)) + sparkSession.loadTestTable(tbl.table) + } finally { + tbl.database.foreach(_ => sparkSession.catalog.setCurrentDatabase(curDB)) + } } - logDebug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") - referencedTestTables.foreach(sparkSession.loadTestTable) // Proceed with analysis. 
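Note: the new test-table lookup above also switches the current database while loading a referenced test table and always switches back. The pattern it relies on is the usual set/try/finally/restore dance; a generic sketch (the helper name below is invented for illustration):

import org.apache.spark.sql.SparkSession

// Hypothetical helper illustrating the pattern used above: temporarily switch
// the session's current database, run the body, and restore the previous
// database even if the body throws.
def withCurrentDatabase[T](spark: SparkSession, db: String)(body: => T): T = {
  val previous = spark.catalog.currentDatabase
  spark.catalog.setCurrentDatabase(db)
  try body finally spark.catalog.setCurrentDatabase(previous)
}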
sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala index f277f99805a4a..35dab79ff6dff 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala @@ -23,6 +23,6 @@ import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION class BucketedReadWithHiveSupportSuite extends BucketedReadSuite with TestHiveSingleton { protected override def beforeAll(): Unit = { super.beforeAll() - assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive") + assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteWithHiveSupportSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteWithHiveSupportSuite.scala index 454e2f65d5d88..bdbdcc2951072 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteWithHiveSupportSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedWriteWithHiveSupportSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION class BucketedWriteWithHiveSupportSuite extends BucketedWriteSuite with TestHiveSingleton { protected override def beforeAll(): Unit = { super.beforeAll() - assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive") + assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive") } override protected def fileFormatsToTest: Seq[String] = Seq("parquet", "orc") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala index 4ada5077aec7f..cbea74103343e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.DataSourceScanExec import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._ import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ @@ -145,40 +146,52 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes val seed = System.nanoTime() withClue(s"Random data generated with the seed: ${seed}") { - val dataGenerator = RandomDataGenerator.forType( - dataType = dataType, - nullable = true, - new Random(seed) - ).getOrElse { - fail(s"Failed to create data generator for schema $dataType") + val java8ApiConfValues = if (dataType == DateType || dataType == TimestampType) { + Seq(false, true) + } else { + Seq(false) + } + java8ApiConfValues.foreach { java8Api => + withSQLConf( + SQLConf.DATETIME_JAVA8API_ENABLED.key -> java8Api.toString, + SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString, + SQLConf.LEGACY_AVRO_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString) { + val dataGenerator = RandomDataGenerator.forType( + dataType = dataType, + nullable = true, + new Random(seed) + ).getOrElse { + fail(s"Failed to create data generator for schema $dataType") + 
} + + // Create a DF for the schema with random data. The index field is used to sort the + // DataFrame. This is a workaround for SPARK-10591. + val schema = new StructType() + .add("index", IntegerType, nullable = false) + .add("col", dataType, nullable = true) + val rdd = + spark.sparkContext.parallelize((1 to 20).map(i => Row(i, dataGenerator()))) + val df = spark.createDataFrame(rdd, schema).orderBy("index").coalesce(1) + + df.write + .mode("overwrite") + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .options(extraOptions) + .save(path) + + val loadedDF = spark + .read + .format(dataSourceName) + .option("dataSchema", df.schema.json) + .schema(df.schema) + .options(extraOptions) + .load(path) + .orderBy("index") + + checkAnswer(loadedDF, df) + } } - - // Create a DF for the schema with random data. The index field is used to sort the - // DataFrame. This is a workaround for SPARK-10591. - val schema = new StructType() - .add("index", IntegerType, nullable = false) - .add("col", dataType, nullable = true) - val rdd = - spark.sparkContext.parallelize((1 to 10).map(i => Row(i, dataGenerator()))) - val df = spark.createDataFrame(rdd, schema).orderBy("index").coalesce(1) - - df.write - .mode("overwrite") - .format(dataSourceName) - .option("dataSchema", df.schema.json) - .options(extraOptions) - .save(path) - - val loadedDF = spark - .read - .format(dataSourceName) - .option("dataSchema", df.schema.json) - .schema(df.schema) - .options(extraOptions) - .load(path) - .orderBy("index") - - checkAnswer(loadedDF, df) } } } diff --git a/sql/mkdocs.yml b/sql/mkdocs.yml index c34c891bb9e42..4463e11f17d1f 100644 --- a/sql/mkdocs.yml +++ b/sql/mkdocs.yml @@ -15,5 +15,8 @@ site_name: Spark SQL, Built-in Functions theme: readthedocs -pages: +nav: - 'Functions': 'index.md' +markdown_extensions: + - toc: + anchorlink: True diff --git a/streaming/pom.xml b/streaming/pom.xml index 87af6388e1118..251570496d805 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../pom.xml diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingConf.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingConf.scala new file mode 100644 index 0000000000000..bb80bd7072e8e --- /dev/null +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingConf.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming + +import java.util.concurrent.TimeUnit + +import org.apache.spark.internal.config.ConfigBuilder +import org.apache.spark.streaming.util.OpenHashMapBasedStateMap.DELTA_CHAIN_LENGTH_THRESHOLD + +object StreamingConf { + + private[streaming] val BACKPRESSURE_ENABLED = + ConfigBuilder("spark.streaming.backpressure.enabled") + .version("1.5.0") + .booleanConf + .createWithDefault(false) + + private[streaming] val RECEIVER_MAX_RATE = + ConfigBuilder("spark.streaming.receiver.maxRate") + .version("1.0.2") + .longConf + .createWithDefault(Long.MaxValue) + + private[streaming] val BACKPRESSURE_INITIAL_RATE = + ConfigBuilder("spark.streaming.backpressure.initialRate") + .version("2.0.0") + .fallbackConf(RECEIVER_MAX_RATE) + + private[streaming] val BLOCK_INTERVAL = + ConfigBuilder("spark.streaming.blockInterval") + .version("0.8.0") + .timeConf(TimeUnit.MILLISECONDS) + .createWithDefaultString("200ms") + + private[streaming] val RECEIVER_WAL_ENABLE_CONF_KEY = + ConfigBuilder("spark.streaming.receiver.writeAheadLog.enable") + .version("1.2.1") + .booleanConf + .createWithDefault(false) + + private[streaming] val RECEIVER_WAL_CLASS_CONF_KEY = + ConfigBuilder("spark.streaming.receiver.writeAheadLog.class") + .version("1.4.0") + .stringConf + .createOptional + + private[streaming] val RECEIVER_WAL_ROLLING_INTERVAL_CONF_KEY = + ConfigBuilder("spark.streaming.receiver.writeAheadLog.rollingIntervalSecs") + .version("1.4.0") + .intConf + .createWithDefault(60) + + private[streaming] val RECEIVER_WAL_MAX_FAILURES_CONF_KEY = + ConfigBuilder("spark.streaming.receiver.writeAheadLog.maxFailures") + .version("1.2.0") + .intConf + .createWithDefault(3) + + private[streaming] val RECEIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY = + ConfigBuilder("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite") + .version("1.6.0") + .booleanConf + .createWithDefault(false) + + private[streaming] val DRIVER_WAL_CLASS_CONF_KEY = + ConfigBuilder("spark.streaming.driver.writeAheadLog.class") + .version("1.4.0") + .stringConf + .createOptional + + private[streaming] val DRIVER_WAL_ROLLING_INTERVAL_CONF_KEY = + ConfigBuilder("spark.streaming.driver.writeAheadLog.rollingIntervalSecs") + .version("1.4.0") + .intConf + .createWithDefault(60) + + private[streaming] val DRIVER_WAL_MAX_FAILURES_CONF_KEY = + ConfigBuilder("spark.streaming.driver.writeAheadLog.maxFailures") + .version("1.4.0") + .intConf + .createWithDefault(3) + + private[streaming] val DRIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY = + ConfigBuilder("spark.streaming.driver.writeAheadLog.closeFileAfterWrite") + .version("1.6.0") + .booleanConf + .createWithDefault(false) + + private[streaming] val DRIVER_WAL_BATCHING_CONF_KEY = + ConfigBuilder("spark.streaming.driver.writeAheadLog.allowBatching") + .version("1.6.0") + .booleanConf + .createWithDefault(true) + + private[streaming] val DRIVER_WAL_BATCHING_TIMEOUT_CONF_KEY = + ConfigBuilder("spark.streaming.driver.writeAheadLog.batchingTimeout") + .version("1.6.0") + .longConf + .createWithDefault(5000) + + private[streaming] val STREAMING_UNPERSIST = + ConfigBuilder("spark.streaming.unpersist") + .version("0.9.0") + .booleanConf + .createWithDefault(true) + + private[streaming] val STOP_GRACEFULLY_ON_SHUTDOWN = + ConfigBuilder("spark.streaming.stopGracefullyOnShutdown") + .version("1.4.0") + .booleanConf + .createWithDefault(false) + + private[streaming] val UI_RETAINED_BATCHES = + ConfigBuilder("spark.streaming.ui.retainedBatches") + .version("1.0.0") + .intConf + .createWithDefault(1000) + 
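  // Illustrative usage (a sketch, not part of this patch): each entry above bundles its key,
  // version, type and default, so a call site reads a typed value instead of passing a raw
  // string key plus an inline default. Assuming a SparkConf named `conf` in code under the
  // org.apache.spark.streaming package, where these private[streaming] entries and
  // SparkConf.get(entry) are visible:
  //
  //   val retained: Int = conf.get(UI_RETAINED_BATCHES)
  //   // previously: conf.getInt("spark.streaming.ui.retainedBatches", 1000)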
+ private[streaming] val SESSION_BY_KEY_DELTA_CHAIN_THRESHOLD = + ConfigBuilder("spark.streaming.sessionByKey.deltaChainThreshold") + .version("1.6.0") + .intConf + .createWithDefault(DELTA_CHAIN_LENGTH_THRESHOLD) + + private[streaming] val BACKPRESSURE_RATE_ESTIMATOR = + ConfigBuilder("spark.streaming.backpressure.rateEstimator") + .version("1.5.0") + .stringConf + .createWithDefault("pid") + + private[streaming] val BACKPRESSURE_PID_PROPORTIONAL = + ConfigBuilder("spark.streaming.backpressure.pid.proportional") + .version("1.5.0") + .doubleConf + .createWithDefault(1.0) + + private[streaming] val BACKPRESSURE_PID_INTEGRAL = + ConfigBuilder("spark.streaming.backpressure.pid.integral") + .version("1.5.0") + .doubleConf + .createWithDefault(0.2) + + private[streaming] val BACKPRESSURE_PID_DERIVED = + ConfigBuilder("spark.streaming.backpressure.pid.derived") + .version("1.5.0") + .doubleConf + .createWithDefault(0.0) + + private[streaming] val BACKPRESSURE_PID_MIN_RATE = + ConfigBuilder("spark.streaming.backpressure.pid.minRate") + .version("1.5.0") + .doubleConf + .createWithDefault(100) + + private[streaming] val CONCURRENT_JOBS = + ConfigBuilder("spark.streaming.concurrentJobs") + .version("0.7.0") + .intConf + .createWithDefault(1) + + private[streaming] val GRACEFUL_STOP_TIMEOUT = + ConfigBuilder("spark.streaming.gracefulStopTimeout") + .version("1.0.0") + .timeConf(TimeUnit.MILLISECONDS) + .createOptional + + private[streaming] val MANUAL_CLOCK_JUMP = + ConfigBuilder("spark.streaming.manualClock.jump") + .version("0.7.0") + .longConf + .createWithDefault(0) + +} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 440b653e45de1..e3459c96335b3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -41,6 +41,7 @@ import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.SerializationDebugger import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.StreamingConf.STOP_GRACEFULLY_ON_SHUTDOWN import org.apache.spark.streaming.StreamingContextState._ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.Receiver @@ -717,7 +718,7 @@ class StreamingContext private[streaming] ( } private def stopOnShutdown(): Unit = { - val stopGracefully = conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", false) + val stopGracefully = conf.get(STOP_GRACEFULLY_ON_SHUTDOWN) logInfo(s"Invoking stop(stopGracefully=$stopGracefully) from shutdown hook") // Do not stop SparkContext, let its own shutdown hook stop it stop(stopSparkContext = false, stopGracefully = stopGracefully) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 6c981b293ac76..e037f26088347 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -31,6 +31,7 @@ import org.apache.spark.internal.io.SparkHadoopWriterUtils import org.apache.spark.rdd.{BlockRDD, RDD, RDDOperationScope} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ +import org.apache.spark.streaming.StreamingConf.STREAMING_UNPERSIST import 
org.apache.spark.streaming.StreamingContext.rddToFileName import org.apache.spark.streaming.scheduler.Job import org.apache.spark.ui.{UIUtils => SparkUIUtils} @@ -447,7 +448,7 @@ abstract class DStream[T: ClassTag] ( * this to clear their own metadata along with the generated RDDs. */ private[streaming] def clearMetadata(time: Time): Unit = { - val unpersistData = ssc.conf.getBoolean("spark.streaming.unpersist", true) + val unpersistData = ssc.conf.get(STREAMING_UNPERSIST) val oldRDDs = generatedRDDs.filter(_._1 <= (time - rememberDuration)) logDebug("Clearing references to old RDDs: [" + oldRDDs.map(x => s"${x._1} -> ${x._2.id}").mkString(", ") + "]") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 2533c53883cac..d641f55fa7f6f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -24,6 +24,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.storage.StreamBlockId +import org.apache.spark.streaming.StreamingConf.BLOCK_INTERVAL import org.apache.spark.streaming.util.RecurringTimer import org.apache.spark.util.{Clock, SystemClock} @@ -100,8 +101,8 @@ private[streaming] class BlockGenerator( } import GeneratorState._ - private val blockIntervalMs = conf.getTimeAsMs("spark.streaming.blockInterval", "200ms") - require(blockIntervalMs > 0, s"'spark.streaming.blockInterval' should be a positive value") + private val blockIntervalMs = conf.get(BLOCK_INTERVAL) + require(blockIntervalMs > 0, s"'${BLOCK_INTERVAL.key}' should be a positive value") private val blockIntervalTimer = new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala index c620074b4e44d..f77ca3e8fdb45 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala @@ -21,6 +21,7 @@ import com.google.common.util.concurrent.{RateLimiter => GuavaRateLimiter} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging +import org.apache.spark.streaming.StreamingConf.{BACKPRESSURE_INITIAL_RATE, RECEIVER_MAX_RATE} /** * Provides waitToPush() method to limit the rate at which receivers consume data. 
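// A minimal sketch (illustrative only, not part of this patch) of the fallbackConf semantics
// the RateLimiter changes below rely on: BACKPRESSURE_INITIAL_RATE is declared above with
// .fallbackConf(RECEIVER_MAX_RATE), so reading it on a conf where the initial rate is unset
// yields the configured receiver max rate instead of a hard-coded default. The object and
// method names are hypothetical, and the snippet assumes it compiles inside the
// org.apache.spark.streaming package so the private[streaming] entries and
// SparkConf.get(entry) are accessible.
package org.apache.spark.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingConf.{BACKPRESSURE_INITIAL_RATE, RECEIVER_MAX_RATE}

private[streaming] object InitialRateFallbackSketch {
  def resolvedRates(): (Long, Long) = {
    val conf = new SparkConf().set(RECEIVER_MAX_RATE.key, "1000")
    // The initial rate is unset, so the fallback entry resolves to the receiver max rate.
    val fallback = conf.get(BACKPRESSURE_INITIAL_RATE) // 1000L
    // An explicit setting takes precedence over the fallback.
    conf.set(BACKPRESSURE_INITIAL_RATE.key, "250")
    val explicit = conf.get(BACKPRESSURE_INITIAL_RATE) // 250L
    (fallback, explicit)
  }
}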
@@ -37,7 +38,7 @@ import org.apache.spark.internal.Logging private[receiver] abstract class RateLimiter(conf: SparkConf) extends Logging { // treated as an upper limit - private val maxRateLimit = conf.getLong("spark.streaming.receiver.maxRate", Long.MaxValue) + private val maxRateLimit = conf.get(RECEIVER_MAX_RATE) private lazy val rateLimiter = GuavaRateLimiter.create(getInitialRateLimit().toDouble) def waitToPush(): Unit = { @@ -68,6 +69,6 @@ private[receiver] abstract class RateLimiter(conf: SparkConf) extends Logging { * Get the initial rateLimit to initial rateLimiter */ private def getInitialRateLimit(): Long = { - math.min(conf.getLong("spark.streaming.backpressure.initialRate", maxRateLimit), maxRateLimit) + math.min(conf.get(BACKPRESSURE_INITIAL_RATE), maxRateLimit) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 7e8449ee5aa7e..8008a5c495e9d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -23,7 +23,7 @@ import scala.util.{Failure, Success, Try} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.streaming.{Checkpoint, CheckpointWriter, Time} +import org.apache.spark.streaming.{Checkpoint, CheckpointWriter, StreamingConf, Time} import org.apache.spark.streaming.api.python.PythonDStream import org.apache.spark.streaming.util.RecurringTimer import org.apache.spark.util.{Clock, EventLoop, ManualClock, Utils} @@ -115,7 +115,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { logInfo("Stopping JobGenerator gracefully") val timeWhenStopStarted = System.nanoTime() val stopTimeoutMs = conf.getTimeAsMs( - "spark.streaming.gracefulStopTimeout", s"${10 * ssc.graph.batchDuration.milliseconds}ms") + StreamingConf.GRACEFUL_STOP_TIMEOUT.key, s"${10 * ssc.graph.batchDuration.milliseconds}ms") val pollTime = 100 // To prevent graceful stop to get stuck permanently @@ -206,7 +206,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { // or if the property is defined set it to that time if (clock.isInstanceOf[ManualClock]) { val lastTime = ssc.initialCheckpoint.checkpointTime.milliseconds - val jumpTime = ssc.sc.conf.getLong("spark.streaming.manualClock.jump", 0) + val jumpTime = ssc.sc.conf.get(StreamingConf.MANUAL_CLOCK_JUMP) clock.asInstanceOf[ManualClock].setTime(lastTime + jumpTime) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 7eea57cc083ed..a6d8dccd7e722 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -47,7 +47,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { // Use of ConcurrentHashMap.keySet later causes an odd runtime problem due to Java 7/8 diff // https://gist.github.com/AlainODea/1375759b8720a3f9f094 private val jobSets: java.util.Map[Time, JobSet] = new ConcurrentHashMap[Time, JobSet] - private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1) + private val numConcurrentJobs = ssc.conf.get(StreamingConf.CONCURRENT_JOBS) private val jobExecutor = ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, 
"streaming-job-executor") private[streaming] val jobGenerator = new JobGenerator(this) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala index 7774e85f778a6..88f191fa022c6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala @@ -23,6 +23,7 @@ import java.util.concurrent.atomic.AtomicLong import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.SparkConf +import org.apache.spark.streaming.StreamingConf.BACKPRESSURE_ENABLED import org.apache.spark.streaming.scheduler.rate.RateEstimator import org.apache.spark.util.{ThreadUtils, Utils} @@ -86,5 +87,5 @@ private[streaming] abstract class RateController(val streamUID: Int, rateEstimat object RateController { def isBackPressureEnabled(conf: SparkConf): Boolean = - conf.getBoolean("spark.streaming.backpressure.enabled", false) + conf.get(BACKPRESSURE_ENABLED) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala index e4b9dffee04f4..7f4d0f298e8f2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala @@ -19,6 +19,7 @@ package org.apache.spark.streaming.scheduler.rate import org.apache.spark.SparkConf import org.apache.spark.streaming.Duration +import org.apache.spark.streaming.StreamingConf._ /** * A component that estimates the rate at which an `InputDStream` should ingest @@ -57,12 +58,12 @@ object RateEstimator { * @throws IllegalArgumentException if the configured RateEstimator is not `pid`. 
*/ def create(conf: SparkConf, batchInterval: Duration): RateEstimator = - conf.get("spark.streaming.backpressure.rateEstimator", "pid") match { + conf.get(BACKPRESSURE_RATE_ESTIMATOR) match { case "pid" => - val proportional = conf.getDouble("spark.streaming.backpressure.pid.proportional", 1.0) - val integral = conf.getDouble("spark.streaming.backpressure.pid.integral", 0.2) - val derived = conf.getDouble("spark.streaming.backpressure.pid.derived", 0.0) - val minRate = conf.getDouble("spark.streaming.backpressure.pid.minRate", 100) + val proportional = conf.get(BACKPRESSURE_PID_PROPORTIONAL) + val integral = conf.get(BACKPRESSURE_PID_INTEGRAL) + val derived = conf.get(BACKPRESSURE_PID_DERIVED) + val minRate = conf.get(BACKPRESSURE_PID_MIN_RATE) new PIDRateEstimator(batchInterval.milliseconds, proportional, integral, derived, minRate) case estimator => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index de73762beb860..da351ecf1889c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, Queue} import org.apache.spark.scheduler._ -import org.apache.spark.streaming.{StreamingContext, Time} +import org.apache.spark.streaming.{StreamingConf, StreamingContext, Time} import org.apache.spark.streaming.scheduler._ private[spark] class StreamingJobProgressListener(ssc: StreamingContext) @@ -33,7 +33,7 @@ private[spark] class StreamingJobProgressListener(ssc: StreamingContext) private val waitingBatchUIData = new HashMap[Time, BatchUIData] private val runningBatchUIData = new HashMap[Time, BatchUIData] private val completedBatchUIData = new Queue[BatchUIData] - private val batchUIDataLimit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 1000) + private val batchUIDataLimit = ssc.conf.get(StreamingConf.UI_RETAINED_BATCHES) private var totalCompletedBatches = 0L private var totalReceivedRecords = 0L private var totalProcessedRecords = 0L diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index d47287b6077f8..bbfb013c8dfdd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -80,9 +80,10 @@ private[ui] class StreamingPage(parent: StreamingTab) /** Render the page */ def render(request: HttpServletRequest): Seq[Node] = { val resources = generateLoadResources(request) + val onClickTimelineFunc = generateOnClickTimelineFunction() val basicInfo = generateBasicInfo() val content = resources ++ - basicInfo ++ + onClickTimelineFunc ++ basicInfo ++ listener.synchronized { generateStatTable() ++ generateBatchListTables() @@ -101,6 +102,12 @@ private[ui] class StreamingPage(parent: StreamingTab) // scalastyle:on } + /** Generate html that will set onClickTimeline declared in streaming-page.js */ + private def generateOnClickTimelineFunction(): Seq[Node] = { + val js = "onClickTimeline = getOnClickTimelineFunction();" + + <script>{Unparsed(js)}</script> + } + /** Generate basic information of the streaming program */ private def generateBasicInfo(): Seq[Node] = { val timeSinceStart = System.currentTimeMillis() - startTime @@ 
-140,6 +147,16 @@ private[ui] class StreamingPage(parent: StreamingTab) } + private def generateTimeTipStrings(times: Seq[Long]): Seq[Node] = { + // We leverage timeFormat as the value would be same as timeFormat. This means it is + // sensitive to the order - generateTimeMap should be called earlier than this. + val js = "var timeTipStrings = {};\n" + times.map { time => + s"timeTipStrings[$time] = timeFormat[$time];" + }.mkString("\n") + + + <script>{Unparsed(js)}</script> + } + private def generateStatTable(): Seq[Node] = { val batches = listener.retainedBatches @@ -313,7 +330,8 @@ private[ui] class StreamingPage(parent: StreamingTab) // scalastyle:on - generateTimeMap(batchTimes) ++ table ++ jsCollector.toHtml + generateTimeMap(batchTimes) ++ generateTimeTipStrings(batchTimes) ++ table ++ + jsCollector.toHtml } private def generateInputDStreamsTable( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala index 146577214de17..006bcad5d68c2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala @@ -58,7 +58,7 @@ private[streaming] object HdfsUtils { // If we are really unlucky, the file may be deleted as we're opening the stream. // This can happen as clean up is performed by daemon threads that may be left over from // previous runs. - if (!dfs.isFile(dfsPath)) null else throw e + if (!dfs.getFileStatus(dfsPath).isFile) null else throw e } } @@ -92,6 +92,10 @@ private[streaming] object HdfsUtils { def checkFileExists(path: String, conf: Configuration): Boolean = { val hdpPath = new Path(path) val fs = getFileSystemForPath(hdpPath, conf) - fs.isFile(hdpPath) + try { + fs.getFileStatus(hdpPath).isFile + } catch { + case _: FileNotFoundException => false + } } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala index 618c036377aee..4224cef1cbae1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala @@ -26,6 +26,7 @@ import com.esotericsoftware.kryo.io.{Input, Output} import org.apache.spark.SparkConf import org.apache.spark.serializer.{KryoInputObjectInputBridge, KryoOutputObjectOutputBridge} +import org.apache.spark.streaming.StreamingConf.SESSION_BY_KEY_DELTA_CHAIN_THRESHOLD import org.apache.spark.streaming.util.OpenHashMapBasedStateMap._ import org.apache.spark.util.collection.OpenHashMap @@ -61,8 +62,7 @@ private[streaming] object StateMap { def empty[K, S]: StateMap[K, S] = new EmptyStateMap[K, S] def create[K: ClassTag, S: ClassTag](conf: SparkConf): StateMap[K, S] = { - val deltaChainThreshold = conf.getInt("spark.streaming.sessionByKey.deltaChainThreshold", - DELTA_CHAIN_LENGTH_THRESHOLD) + val deltaChainThreshold = conf.get(SESSION_BY_KEY_DELTA_CHAIN_THRESHOLD) new OpenHashMapBasedStateMap[K, S](deltaChainThreshold) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala index b0a4c98fc9e57..224e782066f60 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala @@ -23,52 +23,34 @@ import org.apache.hadoop.conf.Configuration import 
org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging +import org.apache.spark.streaming.StreamingConf._ import org.apache.spark.util.Utils /** A helper class with utility functions related to the WriteAheadLog interface */ private[streaming] object WriteAheadLogUtils extends Logging { - val RECEIVER_WAL_ENABLE_CONF_KEY = "spark.streaming.receiver.writeAheadLog.enable" - val RECEIVER_WAL_CLASS_CONF_KEY = "spark.streaming.receiver.writeAheadLog.class" - val RECEIVER_WAL_ROLLING_INTERVAL_CONF_KEY = - "spark.streaming.receiver.writeAheadLog.rollingIntervalSecs" - val RECEIVER_WAL_MAX_FAILURES_CONF_KEY = "spark.streaming.receiver.writeAheadLog.maxFailures" - val RECEIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY = - "spark.streaming.receiver.writeAheadLog.closeFileAfterWrite" - - val DRIVER_WAL_CLASS_CONF_KEY = "spark.streaming.driver.writeAheadLog.class" - val DRIVER_WAL_ROLLING_INTERVAL_CONF_KEY = - "spark.streaming.driver.writeAheadLog.rollingIntervalSecs" - val DRIVER_WAL_MAX_FAILURES_CONF_KEY = "spark.streaming.driver.writeAheadLog.maxFailures" - val DRIVER_WAL_BATCHING_CONF_KEY = "spark.streaming.driver.writeAheadLog.allowBatching" - val DRIVER_WAL_BATCHING_TIMEOUT_CONF_KEY = "spark.streaming.driver.writeAheadLog.batchingTimeout" - val DRIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY = - "spark.streaming.driver.writeAheadLog.closeFileAfterWrite" - - val DEFAULT_ROLLING_INTERVAL_SECS = 60 - val DEFAULT_MAX_FAILURES = 3 def enableReceiverLog(conf: SparkConf): Boolean = { - conf.getBoolean(RECEIVER_WAL_ENABLE_CONF_KEY, false) + conf.get(RECEIVER_WAL_ENABLE_CONF_KEY) } def getRollingIntervalSecs(conf: SparkConf, isDriver: Boolean): Int = { if (isDriver) { - conf.getInt(DRIVER_WAL_ROLLING_INTERVAL_CONF_KEY, DEFAULT_ROLLING_INTERVAL_SECS) + conf.get(DRIVER_WAL_ROLLING_INTERVAL_CONF_KEY) } else { - conf.getInt(RECEIVER_WAL_ROLLING_INTERVAL_CONF_KEY, DEFAULT_ROLLING_INTERVAL_SECS) + conf.get(RECEIVER_WAL_ROLLING_INTERVAL_CONF_KEY) } } def getMaxFailures(conf: SparkConf, isDriver: Boolean): Int = { if (isDriver) { - conf.getInt(DRIVER_WAL_MAX_FAILURES_CONF_KEY, DEFAULT_MAX_FAILURES) + conf.get(DRIVER_WAL_MAX_FAILURES_CONF_KEY) } else { - conf.getInt(RECEIVER_WAL_MAX_FAILURES_CONF_KEY, DEFAULT_MAX_FAILURES) + conf.get(RECEIVER_WAL_MAX_FAILURES_CONF_KEY) } } def isBatchingEnabled(conf: SparkConf, isDriver: Boolean): Boolean = { - isDriver && conf.getBoolean(DRIVER_WAL_BATCHING_CONF_KEY, defaultValue = true) + isDriver && conf.get(DRIVER_WAL_BATCHING_CONF_KEY) } /** @@ -76,14 +58,14 @@ private[streaming] object WriteAheadLogUtils extends Logging { * before we fail the write attempt to unblock receivers. 
*/ def getBatchingTimeout(conf: SparkConf): Long = { - conf.getLong(DRIVER_WAL_BATCHING_TIMEOUT_CONF_KEY, defaultValue = 5000) + conf.get(DRIVER_WAL_BATCHING_TIMEOUT_CONF_KEY) } def shouldCloseFileAfterWrite(conf: SparkConf, isDriver: Boolean): Boolean = { if (isDriver) { - conf.getBoolean(DRIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY, defaultValue = false) + conf.get(DRIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY) } else { - conf.getBoolean(RECEIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY, defaultValue = false) + conf.get(RECEIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY) } } @@ -126,9 +108,9 @@ private[streaming] object WriteAheadLogUtils extends Logging { ): WriteAheadLog = { val classNameOption = if (isDriver) { - sparkConf.getOption(DRIVER_WAL_CLASS_CONF_KEY) + sparkConf.get(DRIVER_WAL_CLASS_CONF_KEY) } else { - sparkConf.getOption(RECEIVER_WAL_CLASS_CONF_KEY) + sparkConf.get(RECEIVER_WAL_CLASS_CONF_KEY) } val wal = classNameOption.map { className => try { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala index 5e2ce25c7c441..6b332206e8f6d 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala @@ -22,6 +22,7 @@ import scala.util.Random import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.rdd.BlockRDD import org.apache.spark.storage.{StorageLevel, StreamBlockId} +import org.apache.spark.streaming.StreamingConf.RECEIVER_WAL_ENABLE_CONF_KEY import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD import org.apache.spark.streaming.receiver.{BlockManagerBasedStoreResult, Receiver, WriteAheadLogBasedStoreResult} @@ -117,7 +118,7 @@ class ReceiverInputDStreamSuite private def runTest(enableWAL: Boolean, body: ReceiverInputDStream[_] => Unit): Unit = { val conf = new SparkConf() conf.setMaster("local[4]").setAppName("ReceiverInputDStreamSuite") - conf.set(WriteAheadLogUtils.RECEIVER_WAL_ENABLE_CONF_KEY, enableWAL.toString) + conf.set(StreamingConf.RECEIVER_WAL_ENABLE_CONF_KEY.key, enableWAL.toString) require(WriteAheadLogUtils.enableReceiverLog(conf) === enableWAL) ssc = new StreamingContext(conf, Seconds(1)) val receiverStream = new ReceiverInputDStream[Int](ssc) { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index 1d6637861511f..4eff464dcdafb 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -293,7 +293,8 @@ class StreamingContextSuite } } - test("stop gracefully") { + // TODO (SPARK-31728): re-enable it + ignore("stop gracefully") { val conf = new SparkConf().setMaster(master).setAppName(appName) conf.set("spark.dummyTimeConfig", "3600s") val sc = new SparkContext(conf) diff --git a/tools/pom.xml b/tools/pom.xml index e380e869f55c7..41662b1957703 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.0.0-SNAPSHOT + 3.0.2-SNAPSHOT ../pom.xml
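The HdfsUtils change above swaps FileSystem.isFile(Path), deprecated in newer Hadoop releases, for getFileStatus, which signals a missing path by throwing FileNotFoundException rather than returning false, so non-existence must be handled explicitly. Below is a minimal standalone sketch of that pattern; the object and method names, and the use of Path.getFileSystem in place of the patch's getFileSystemForPath helper, are illustrative assumptions rather than part of the patch.

import java.io.FileNotFoundException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object FileCheckSketch {
  /** Returns true only when the path exists and refers to a regular file, not a directory. */
  def isExistingFile(path: String, conf: Configuration): Boolean = {
    val hdpPath = new Path(path)
    val fs = hdpPath.getFileSystem(conf)
    try {
      fs.getFileStatus(hdpPath).isFile
    } catch {
      // getFileStatus throws for a missing path instead of returning a status.
      case _: FileNotFoundException => false
    }
  }
}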