Merge commit '2c5d489679ba3814973680d65853877664bcd931' into SPARK-24497-recursive-sql
peter-toth committed Apr 14, 2020
2 parents 656995b + 2c5d489 commit 8205e61
Showing 734 changed files with 28,655 additions and 8,917 deletions.
29 changes: 29 additions & 0 deletions .asf.yaml
@@ -0,0 +1,29 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# https://cwiki.apache.org/confluence/display/INFRA/.asf.yaml+features+for+git+repositories
---
github:
  description: "Apache Spark - A unified analytics engine for large-scale data processing"
  homepage: https://spark.apache.org/
  labels:
    - python
    - scala
    - r
    - java
    - big-data
    - jdbc
    - sql
    - spark
129 changes: 129 additions & 0 deletions .github/autolabeler.yml
@@ -0,0 +1,129 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Bot page: https://github.com/apps/probot-autolabeler
# The matching patterns follow the .gitignore spec.
# See: https://git-scm.com/docs/gitignore#_pattern_format
# Also, note that the plugin uses the 'ignore' package. See also
# https://github.com/kaelzhang/node-ignore
INFRA:
  - ".github/"
  - "appveyor.yml"
  - "/tools/"
  - "/dev/create-release/"
  - ".asf.yaml"
  - ".gitattributes"
  - ".gitignore"
  - "/dev/github_jira_sync.py"
  - "/dev/merge_spark_pr.py"
  - "/dev/run-tests-jenkins*"
BUILD:
  - "/dev/"
  - "!/dev/github_jira_sync.py"
  - "!/dev/merge_spark_pr.py"
  - "!/dev/run-tests-jenkins*"
  - "/build/"
  - "/project/"
  - "/assembly/"
  - "*pom.xml"
  - "/bin/docker-image-tool.sh"
  - "/bin/find-spark-home*"
  - "scalastyle-config.xml"
DOCS:
  - "docs/"
  - "/README.md"
  - "/CONTRIBUTING.md"
EXAMPLES:
  - "examples/"
  - "/bin/run-example*"
CORE:
  - "/core/"
  - "/common/kvstore/"
  - "/common/network-common/"
  - "/common/network-shuffle/"
  - "/python/pyspark/*.py"
  - "/python/pyspark/tests/*.py"
SPARK SUBMIT:
  - "/bin/spark-submit*"
SPARK SHELL:
  - "/repl/"
  - "/bin/spark-shell*"
SQL:
  - "sql/"
  - "/common/unsafe/"
  - "!/python/pyspark/sql/avro/"
  - "!/python/pyspark/sql/streaming.py"
  - "!/python/pyspark/sql/tests/test_streaming.py"
  - "/bin/spark-sql*"
  - "/bin/beeline*"
  - "/sbin/*thriftserver*.sh"
  - "*SQL*.R"
  - "DataFrame.R"
  - "WindowSpec.R"
  - "catalog.R"
  - "column.R"
  - "functions.R"
  - "group.R"
  - "schema.R"
  - "types.R"
AVRO:
  - "/external/avro/"
  - "/python/pyspark/sql/avro/"
DSTREAM:
  - "/streaming/"
  - "/data/streaming/"
  - "/external/flume*"
  - "/external/kinesis*"
  - "/external/kafka*"
  - "/python/pyspark/streaming/"
GRAPHX:
  - "/graphx/"
  - "/data/graphx/"
ML:
  - "ml/"
  - "*mllib_*.R"
MLLIB:
  - "spark/mllib/"
  - "/mllib-local/"
  - "/python/pyspark/mllib/"
STRUCTURED STREAMING:
  - "sql/**/streaming/"
  - "/external/kafka-0-10-sql/"
  - "/python/pyspark/sql/streaming.py"
  - "/python/pyspark/sql/tests/test_streaming.py"
  - "*streaming.R"
PYTHON:
  - "/bin/pyspark*"
  - "python/"
R:
  - "r/"
  - "R/"
  - "/bin/sparkR*"
YARN:
  - "/resource-managers/yarn/"
MESOS:
  - "/resource-managers/mesos/"
  - "/sbin/*mesos*.sh"
KUBERNETES:
  - "/resource-managers/kubernetes/"
WINDOWS:
  - "*.cmd"
  - "/R/pkg/tests/fulltests/test_Windows.R"
WEB UI:
  - "ui/"
DEPLOY:
  - "/sbin/"
2 changes: 0 additions & 2 deletions .gitignore
@@ -18,8 +18,6 @@
.idea_modules/
.project
.pydevproject
.python-version
.ruby-version
.scala_dependencies
.settings
/lib/
13 changes: 12 additions & 1 deletion R/pkg/NAMESPACE
@@ -28,6 +28,7 @@ importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "u

# S3 methods exported
export("sparkR.session")
export("sparkR.init")
export("sparkR.session.stop")
export("sparkR.stop")
export("sparkR.conf")
@@ -41,6 +42,9 @@ export("sparkR.callJStatic")

export("install.spark")

export("sparkRSQL.init",
"sparkRHive.init")

# MLlib integration
exportMethods("glm",
"spark.glm",
@@ -68,7 +72,10 @@ exportMethods("glm",
"spark.freqItemsets",
"spark.associationRules",
"spark.findFrequentSequentialPatterns",
"spark.assignClusters")
"spark.assignClusters",
"spark.fmClassifier",
"spark.lm",
"spark.fmRegressor")

# Job group lifecycle management methods
export("setJobGroup",
@@ -148,6 +155,7 @@ exportMethods("arrange",
"printSchema",
"randomSplit",
"rbind",
"registerTempTable",
"rename",
"repartition",
"repartitionByRange",
@@ -345,6 +353,7 @@ exportMethods("%<=>%",
"over",
"overlay",
"percent_rank",
"percentile_approx",
"pmod",
"posexplode",
"posexplode_outer",
@@ -430,8 +439,10 @@ export("as.DataFrame",
"cacheTable",
"clearCache",
"createDataFrame",
"createExternalTable",
"createTable",
"currentDatabase",
"dropTempTable",
"dropTempView",
"listColumns",
"listDatabases",
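
The NAMESPACE additions above expose several new user-facing symbols. A minimal sketch of calling one of them, percentile_approx, with hypothetical data; the exact signature is assumed from the export and may differ in the released API:

library(SparkR)
sparkR.session()
df <- createDataFrame(data.frame(v = c(1, 2, 3, 4, 5)))
# Approximate median of column v via the newly exported percentile_approx.
head(select(df, percentile_approx(df$v, 0.5)))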
26 changes: 26 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -521,6 +521,32 @@ setMethod("createOrReplaceTempView",
            invisible(callJMethod(x@sdf, "createOrReplaceTempView", viewName))
          })

#' (Deprecated) Register Temporary Table
#'
#' Registers a SparkDataFrame as a Temporary Table in the SparkSession
#' @param x A SparkDataFrame
#' @param tableName A character vector containing the name of the table
#'
#' @seealso \link{createOrReplaceTempView}
#' @rdname registerTempTable-deprecated
#' @name registerTempTable
#' @aliases registerTempTable,SparkDataFrame,character-method
#' @examples
#'\dontrun{
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' registerTempTable(df, "json_df")
#' new_df <- sql("SELECT * FROM json_df")
#'}
#' @note registerTempTable since 1.4.0
setMethod("registerTempTable",
signature(x = "SparkDataFrame", tableName = "character"),
function(x, tableName) {
.Deprecated("createOrReplaceTempView")
invisible(callJMethod(x@sdf, "createOrReplaceTempView", tableName))
})

#' insertInto
#'
#' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession.
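
Since registerTempTable now merely warns and delegates to createOrReplaceTempView, migrating is a one-line change. A minimal sketch (hypothetical JSON path):

sparkR.session()
df <- read.json("path/to/file.json")
registerTempTable(df, "json_df")        # deprecated: emits a warning, then delegates
createOrReplaceTempView(df, "json_df")  # preferred, equivalent call
new_df <- sql("SELECT * FROM json_df")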
54 changes: 54 additions & 0 deletions R/pkg/R/catalog.R
@@ -17,6 +17,35 @@

# catalog.R: SparkSession catalog functions

#' (Deprecated) Create an external table
#'
#' Creates an external table based on the dataset in a data source and
#' returns a SparkDataFrame associated with the external table.
#'
#' The data source is specified by \code{source} and a set of options (...).
#' If \code{source} is not specified, the default data source configured by
#' "spark.sql.sources.default" will be used.
#'
#' @param tableName the name of the table.
#' @param path the path of files to load.
#' @param source the name of the external data source.
#' @param schema the schema of the data required for some data sources.
#' @param ... additional argument(s) passed to the method.
#' @return A SparkDataFrame.
#' @rdname createExternalTable-deprecated
#' @seealso \link{createTable}
#' @examples
#'\dontrun{
#' sparkR.session()
#' df <- createExternalTable("myjson", path="path/to/json", source="json", schema)
#' }
#' @name createExternalTable
#' @note createExternalTable since 1.4.0
createExternalTable <- function(tableName, path = NULL, source = NULL, schema = NULL, ...) {
  .Deprecated("createTable", old = "createExternalTable")
  createTable(tableName, path, source, schema, ...)
}

#' Creates a table based on the dataset in a data source
#'
#' Creates a table based on the dataset in a data source. Returns a SparkDataFrame associated with
@@ -130,6 +159,31 @@ clearCache <- function() {
  invisible(callJMethod(catalog, "clearCache"))
}

#' (Deprecated) Drop Temporary Table
#'
#' Drops the temporary table with the given table name in the catalog.
#' If the table has been cached/persisted before, it's also unpersisted.
#'
#' @param tableName The name of the SparkSQL table to be dropped.
#' @seealso \link{dropTempView}
#' @rdname dropTempTable-deprecated
#' @examples
#' \dontrun{
#' sparkR.session()
#' df <- read.df(path, "parquet")
#' createOrReplaceTempView(df, "table")
#' dropTempTable("table")
#' }
#' @name dropTempTable
#' @note dropTempTable since 1.4.0
dropTempTable <- function(tableName) {
  .Deprecated("dropTempView", old = "dropTempTable")
  if (class(tableName) != "character") {
    stop("tableName must be a string.")
  }
  dropTempView(tableName)
}

#' Drops the temporary view with the given view name in the catalog.
#'
#' Drops the temporary view with the given view name in the catalog.
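
As with registerTempTable, the deprecated catalog helpers forward to their replacements. A minimal sketch (hypothetical parquet path) mirroring the roxygen example above:

sparkR.session()
df <- read.df("path/to/data", "parquet")
createOrReplaceTempView(df, "tbl")
dropTempTable("tbl")    # deprecated: warns, validates the name, then calls dropTempView
# dropTempView("tbl")   # preferred, equivalent call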