Merge remote-tracking branch 'origin/master' into return-map-output-metadata
mccheah committed May 22, 2020
2 parents 90084ea + 2115c55 commit 6ecd3ad
Showing 1,150 changed files with 54,015 additions and 21,984 deletions.
29 changes: 29 additions & 0 deletions .asf.yaml
@@ -0,0 +1,29 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# https://cwiki.apache.org/confluence/display/INFRA/.asf.yaml+features+for+git+repositories
---
github:
  description: "Apache Spark - A unified analytics engine for large-scale data processing"
  homepage: https://spark.apache.org/
  labels:
    - python
    - scala
    - r
    - java
    - big-data
    - jdbc
    - sql
    - spark
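Since .asf.yaml is consumed by ASF infrastructure on push, it is worth confirming the file parses before committing. A minimal scratch check, assuming Python with PyYAML is available locally:

# hypothetical pre-push sanity check; any parse error will raise here
python3 -c "import yaml; print(yaml.safe_load(open('.asf.yaml')))"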
4 changes: 3 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE
@@ -29,9 +29,11 @@ Please clarify why the changes are needed. For instance,
-->


### Does this PR introduce any user-facing change?
### Does this PR introduce _any_ user-facing change?
<!--
Note that it means *any* user-facing change including all aspects such as the documentation fix.
If yes, please clarify the previous behavior and the change this PR proposes - provide the console output, description and/or an example to show the behavior difference if possible.
If possible, please also clarify if this is a user-facing change compared to the released Spark versions or within the unreleased branches such as master.
If no, write 'No'.
-->

133 changes: 133 additions & 0 deletions .github/autolabeler.yml
@@ -0,0 +1,133 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Bot page: https://github.com/apps/probot-autolabeler
# The matching patterns follow the .gitignore spec.
# See: https://git-scm.com/docs/gitignore#_pattern_format
# Also, note that the plugin uses 'ignore' package. See also
# https://github.com/kaelzhang/node-ignore
INFRA:
  - ".github/"
  - "appveyor.yml"
  - "/tools/"
  - "/dev/create-release/"
  - ".asf.yaml"
  - ".gitattributes"
  - ".gitignore"
  - "/dev/github_jira_sync.py"
  - "/dev/merge_spark_pr.py"
  - "/dev/run-tests-jenkins*"
BUILD:
  - "/dev/"
  - "!/dev/github_jira_sync.py"
  - "!/dev/merge_spark_pr.py"
  - "!/dev/run-tests-jenkins*"
  - "!/dev/.rat-excludes"
  - "/build/"
  - "/project/"
  - "/assembly/"
  - "*pom.xml"
  - "/bin/docker-image-tool.sh"
  - "/bin/find-spark-home*"
  - "scalastyle-config.xml"
DOCS:
  - "docs/"
  - "/README.md"
  - "/CONTRIBUTING.md"
EXAMPLES:
  - "examples/"
  - "/bin/run-example*"
CORE:
  - "/core/"
  - "!UI.scala"
  - "!ui/"
  - "/common/kvstore/"
  - "/common/network-common/"
  - "/common/network-shuffle/"
  - "/python/pyspark/*.py"
  - "/python/pyspark/tests/*.py"
SPARK SUBMIT:
  - "/bin/spark-submit*"
SPARK SHELL:
  - "/repl/"
  - "/bin/spark-shell*"
SQL:
  - "sql/"
  - "/common/unsafe/"
  - "!/python/pyspark/sql/avro/"
  - "!/python/pyspark/sql/streaming.py"
  - "!/python/pyspark/sql/tests/test_streaming.py"
  - "/bin/spark-sql*"
  - "/bin/beeline*"
  - "/sbin/*thriftserver*.sh"
  - "*SQL*.R"
  - "DataFrame.R"
  - "WindowSpec.R"
  - "catalog.R"
  - "column.R"
  - "functions.R"
  - "group.R"
  - "schema.R"
  - "types.R"
AVRO:
  - "/external/avro/"
  - "/python/pyspark/sql/avro/"
DSTREAM:
  - "/streaming/"
  - "/data/streaming/"
  - "/external/flume*"
  - "/external/kinesis*"
  - "/external/kafka*"
  - "/python/pyspark/streaming/"
GRAPHX:
  - "/graphx/"
  - "/data/graphx/"
ML:
  - "ml/"
  - "*mllib_*.R"
MLLIB:
  - "spark/mllib/"
  - "/mllib-local/"
  - "/python/pyspark/mllib/"
STRUCTURED STREAMING:
  - "sql/**/streaming/"
  - "/external/kafka-0-10-sql/"
  - "/python/pyspark/sql/streaming.py"
  - "/python/pyspark/sql/tests/test_streaming.py"
  - "*streaming.R"
PYTHON:
  - "/bin/pyspark*"
  - "python/"
R:
  - "r/"
  - "R/"
  - "/bin/sparkR*"
YARN:
  - "/resource-managers/yarn/"
MESOS:
  - "/resource-managers/mesos/"
  - "/sbin/*mesos*.sh"
KUBERNETES:
  - "/resource-managers/kubernetes/"
WINDOWS:
  - "*.cmd"
  - "/R/pkg/tests/fulltests/test_Windows.R"
WEB UI:
  - "ui/"
  - "UI.scala"
DEPLOY:
  - "/sbin/"
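The header comment notes that these patterns follow the .gitignore spec, so git itself is a handy way to experiment with how a rule matches a changed path. A rough scratch test with hypothetical paths (the bot's 'ignore' package may differ from git on edge cases such as re-including files under an excluded directory):

# hypothetical scratch repo just to exercise the CORE patterns
mkdir /tmp/label-scratch && cd /tmp/label-scratch && git init -q
printf '%s\n' '/core/' '!UI.scala' '!ui/' > .gitignore
git check-ignore -v core/src/main/scala/org/apache/spark/SparkContext.scala   # expect a match on /core/
git check-ignore -v -n core/src/main/scala/org/apache/spark/ui/SparkUI.scala  # see whether !ui/ wins here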
20 changes: 10 additions & 10 deletions .github/workflows/master.yml
@@ -103,12 +103,12 @@ jobs:
      - uses: actions/setup-java@v1
        with:
          java-version: '11'
      - name: install R
      - uses: r-lib/actions/setup-r@v1
        with:
          r-version: '3.6.2'
      - name: Install lib
        run: |
          echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' | sudo tee -a /etc/apt/sources.list
          curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
          sudo apt-get update
          sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
          sudo apt-get install -y libcurl4-openssl-dev
      - name: install R packages
        run: |
          sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')"
@@ -139,12 +139,12 @@ jobs:
      - uses: actions/setup-ruby@v1
        with:
          ruby-version: '2.7'
      - name: Install R
      - uses: r-lib/actions/setup-r@v1
        with:
          r-version: '3.6.2'
      - name: Install lib and pandoc
        run: |
          echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' | sudo tee -a /etc/apt/sources.list
          curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
          sudo apt-get update
          sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev pandoc
          sudo apt-get install -y libcurl4-openssl-dev pandoc
      - name: Install packages
        run: |
          pip install sphinx mkdocs numpy
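The net effect of both hunks: R itself now comes from the version-pinned r-lib/actions/setup-r action instead of the CRAN apt repository, so the apt step shrinks to native build dependencies only. A rough local equivalent of the trimmed steps, assuming an Ubuntu host with R 3.6.2 already on PATH:

# setup-r handles R in CI; locally you would install R yourself first
sudo apt-get update
sudo apt-get install -y libcurl4-openssl-dev pandoc   # pandoc is only needed for the docs job
Rscript -e 'R.version.string'                          # CI pins 3.6.2 via setup-r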
2 changes: 0 additions & 2 deletions .gitignore
@@ -18,8 +18,6 @@
.idea_modules/
.project
.pydevproject
.python-version
.ruby-version
.scala_dependencies
.settings
/lib/
2 changes: 1 addition & 1 deletion R/create-docs.sh
@@ -49,7 +49,7 @@ pushd "$FWDIR" > /dev/null
mkdir -p pkg/html
pushd pkg/html

"$R_SCRIPT_PATH/Rscript" -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))'
"$R_SCRIPT_PATH/Rscript" -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); knitr::knit_rd("SparkR", links = tools::findHTMLlinks(file.path(libDir, "SparkR")))'

popd

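The rewritten one-liner calls knitr::knit_rd by namespace (so knitr no longer has to be attached with library()) and builds the path with file.path, the idiomatic R path constructor. A scratch comparison of the two path idioms — both should print the same string here:

# quick check that file.path() and the old paste() idiom agree for this case
Rscript -e 'libDir <- "../../lib"; cat(file.path(libDir, "SparkR"), paste(libDir, "SparkR", sep="/"), sep="\n")'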
2 changes: 1 addition & 1 deletion R/create-rd.sh
@@ -34,4 +34,4 @@ pushd "$FWDIR" > /dev/null
. "$FWDIR/find-r.sh"

# Generate Rd files if devtools is installed
"$R_SCRIPT_PATH/Rscript" -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
"$R_SCRIPT_PATH/Rscript" -e ' if(requireNamespace("devtools", quietly=TRUE)) { setwd("'$FWDIR'"); devtools::document(pkg="./pkg", roclets="rd") }'
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -23,7 +23,7 @@ Suggests:
    testthat,
    e1071,
    survival,
    arrow
    arrow (>= 0.15.1)
Collate:
    'schema.R'
    'generics.R'
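The (>= 0.15.1) floor documents the minimum arrow version SparkR expects; Suggests entries are not enforced at install time, so a quick way to see what a given machine actually has:

# hedged runtime check mirroring the DESCRIPTION constraint
Rscript -e 'if (requireNamespace("arrow", quietly = TRUE)) print(packageVersion("arrow")) else message("arrow not installed")'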
12 changes: 11 additions & 1 deletion R/pkg/NAMESPACE
@@ -28,6 +28,7 @@ importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "u

# S3 methods exported
export("sparkR.session")
export("sparkR.init")
export("sparkR.session.stop")
export("sparkR.stop")
export("sparkR.conf")
@@ -41,6 +41,9 @@ export("sparkR.callJStatic")

export("install.spark")

export("sparkRSQL.init",
"sparkRHive.init")

# MLlib integration
exportMethods("glm",
"spark.glm",
Expand Down Expand Up @@ -68,7 +72,10 @@ exportMethods("glm",
"spark.freqItemsets",
"spark.associationRules",
"spark.findFrequentSequentialPatterns",
"spark.assignClusters")
"spark.assignClusters",
"spark.fmClassifier",
"spark.lm",
"spark.fmRegressor")

# Job group lifecycle management methods
export("setJobGroup",
@@ -148,6 +155,7 @@ exportMethods("arrange",
              "printSchema",
              "randomSplit",
              "rbind",
              "registerTempTable",
              "rename",
              "repartition",
              "repartitionByRange",
@@ -431,8 +439,10 @@ export("as.DataFrame",
       "cacheTable",
       "clearCache",
       "createDataFrame",
       "createExternalTable",
       "createTable",
       "currentDatabase",
       "dropTempTable",
       "dropTempView",
       "listColumns",
       "listDatabases",
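The restored export()s (sparkR.init, sparkRSQL.init, sparkRHive.init, registerTempTable, createExternalTable, dropTempTable) are long-deprecated SparkR 1.x-era entry points. A minimal sketch of the classic flow they re-enable — hypothetical, assuming SparkR is installed and SPARK_HOME is set:

# 1.x-style bootstrap; each call emits a deprecation warning on modern Spark
Rscript -e 'library(SparkR)
sc <- sparkR.init(master = "local[1]")   # deprecated: prefer sparkR.session()
sqlContext <- sparkRSQL.init(sc)         # likewise deprecated
df <- createDataFrame(faithful)
registerTempTable(df, "faithful")        # old name for createOrReplaceTempView()
sparkR.session.stop()'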
