Merge branch 'master' into SPARK-25696
httfighter committed Dec 5, 2018
2 parents 4c4674e + 7e3eb3c commit 8cc05a5
Showing 1,362 changed files with 52,893 additions and 43,057 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -77,7 +77,6 @@ target/
unit-tests.log
work/
docs/.jekyll-metadata
*.crc

# For Hive
TempStatsStore/
2 changes: 1 addition & 1 deletion R/WINDOWS.md
@@ -3,7 +3,7 @@
To build SparkR on Windows, the following steps are required

1. Install R (>= 3.1) and [Rtools](http://cran.r-project.org/bin/windows/Rtools/). Make sure to
include Rtools and R in `PATH`.
include Rtools and R in `PATH`. Note that support for R prior to version 3.4 is deprecated as of Spark 3.0.0.

2. Install
[JDK8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) and set
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -15,7 +15,7 @@ URL: http://www.apache.org/ http://spark.apache.org/
BugReports: http://spark.apache.org/contributing.html
SystemRequirements: Java (== 8)
Depends:
R (>= 3.0),
R (>= 3.1),
methods
Suggests:
knitr,
22 changes: 10 additions & 12 deletions R/pkg/NAMESPACE
@@ -28,9 +28,8 @@ importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "u

# S3 methods exported
export("sparkR.session")
export("sparkR.init")
export("sparkR.stop")
export("sparkR.session.stop")
export("sparkR.stop")
export("sparkR.conf")
export("sparkR.version")
export("sparkR.uiWebUrl")
@@ -42,9 +41,6 @@ export("sparkR.callJStatic")

export("install.spark")

export("sparkRSQL.init",
"sparkRHive.init")

# MLlib integration
exportMethods("glm",
"spark.glm",
@@ -70,7 +66,8 @@ exportMethods("glm",
"spark.svmLinear",
"spark.fpGrowth",
"spark.freqItemsets",
"spark.associationRules")
"spark.associationRules",
"spark.findFrequentSequentialPatterns")

# Job group lifecycle management methods
export("setJobGroup",
@@ -150,15 +147,13 @@ exportMethods("arrange",
"printSchema",
"randomSplit",
"rbind",
"registerTempTable",
"rename",
"repartition",
"repartitionByRange",
"rollup",
"sample",
"sample_frac",
"sampleBy",
"saveAsParquetFile",
"saveAsTable",
"saveDF",
"schema",
@@ -200,6 +195,7 @@ exportMethods("%<=>%",
"acos",
"add_months",
"alias",
"approx_count_distinct",
"approxCountDistinct",
"approxQuantile",
"array_contains",
@@ -258,6 +254,7 @@ exportMethods("%<=>%",
"dayofweek",
"dayofyear",
"decode",
"degrees",
"dense_rank",
"desc",
"element_at",
@@ -274,6 +271,7 @@ exportMethods("%<=>%",
"floor",
"format_number",
"format_string",
"from_csv",
"from_json",
"from_unixtime",
"from_utc_timestamp",
@@ -339,6 +337,7 @@ exportMethods("%<=>%",
"posexplode",
"posexplode_outer",
"quarter",
"radians",
"rand",
"randn",
"rank",
@@ -352,6 +351,8 @@ exportMethods("%<=>%",
"row_number",
"rpad",
"rtrim",
"schema_of_csv",
"schema_of_json",
"second",
"sha1",
"sha2",
@@ -385,6 +386,7 @@ exportMethods("%<=>%",
"tanh",
"toDegrees",
"toRadians",
"to_csv",
"to_date",
"to_json",
"to_timestamp",
@@ -413,18 +415,14 @@ export("as.DataFrame",
"cacheTable",
"clearCache",
"createDataFrame",
"createExternalTable",
"createTable",
"currentDatabase",
"dropTempTable",
"dropTempView",
"jsonFile",
"listColumns",
"listDatabases",
"listFunctions",
"listTables",
"loadDF",
"parquetFile",
"read.df",
"read.jdbc",
"read.json",
104 changes: 55 additions & 49 deletions R/pkg/R/DataFrame.R
@@ -226,7 +226,9 @@ setMethod("showDF",

#' show
#'
#' Print class and type information of a Spark object.
#' If eager evaluation is enabled and the Spark object is a SparkDataFrame, evaluate the
#' SparkDataFrame and print top rows of the SparkDataFrame, otherwise, print the class
#' and type information of the Spark object.
#'
#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
#'
@@ -244,11 +246,33 @@ setMethod("showDF",
#' @note show(SparkDataFrame) since 1.4.0
setMethod("show", "SparkDataFrame",
function(object) {
cols <- lapply(dtypes(object), function(l) {
paste(l, collapse = ":")
})
s <- paste(cols, collapse = ", ")
cat(paste(class(object), "[", s, "]\n", sep = ""))
allConf <- sparkR.conf()
prop <- allConf[["spark.sql.repl.eagerEval.enabled"]]
if (!is.null(prop) && identical(prop, "true")) {
argsList <- list()
argsList$x <- object
prop <- allConf[["spark.sql.repl.eagerEval.maxNumRows"]]
if (!is.null(prop)) {
numRows <- as.integer(prop)
if (numRows > 0) {
argsList$numRows <- numRows
}
}
prop <- allConf[["spark.sql.repl.eagerEval.truncate"]]
if (!is.null(prop)) {
truncate <- as.integer(prop)
if (truncate > 0) {
argsList$truncate <- truncate
}
}
do.call(showDF, argsList)
} else {
cols <- lapply(dtypes(object), function(l) {
paste(l, collapse = ":")
})
s <- paste(cols, collapse = ", ")
cat(paste(class(object), "[", s, "]\n", sep = ""))
}
})
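Side note, not part of the diff: a minimal usage sketch of the eager-evaluation branch added to show() above, as seen from a SparkR REPL. The spark.sql.repl.eagerEval.* config names are the ones read by the new code; the sparkConfig argument of sparkR.session and the built-in faithful dataset are assumed here purely for illustration.

library(SparkR)

# Enable eager evaluation so that printing a SparkDataFrame shows its top rows
# instead of only its class and schema.
sparkR.session(sparkConfig = list(
  spark.sql.repl.eagerEval.enabled = "true",
  spark.sql.repl.eagerEval.maxNumRows = "10",  # passed to showDF as numRows
  spark.sql.repl.eagerEval.truncate = "20"     # passed to showDF as truncate
))

df <- createDataFrame(faithful)
df   # eager evaluation on: prints up to 10 rows, values truncated to 20 characters
     # eager evaluation off: prints "SparkDataFrame[eruptions:double, waiting:double]"

sparkR.session.stop()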

#' DataTypes
@@ -497,32 +521,6 @@ setMethod("createOrReplaceTempView",
invisible(callJMethod(x@sdf, "createOrReplaceTempView", viewName))
})

#' (Deprecated) Register Temporary Table
#'
#' Registers a SparkDataFrame as a Temporary Table in the SparkSession
#' @param x A SparkDataFrame
#' @param tableName A character vector containing the name of the table
#'
#' @seealso \link{createOrReplaceTempView}
#' @rdname registerTempTable-deprecated
#' @name registerTempTable
#' @aliases registerTempTable,SparkDataFrame,character-method
#' @examples
#'\dontrun{
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' registerTempTable(df, "json_df")
#' new_df <- sql("SELECT * FROM json_df")
#'}
#' @note registerTempTable since 1.4.0
setMethod("registerTempTable",
signature(x = "SparkDataFrame", tableName = "character"),
function(x, tableName) {
.Deprecated("createOrReplaceTempView")
invisible(callJMethod(x@sdf, "createOrReplaceTempView", tableName))
})

#' insertInto
#'
#' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession.
@@ -768,6 +766,13 @@ setMethod("repartition",
#' \item{2.} {Return a new SparkDataFrame range partitioned by the given column(s),
#' using \code{spark.sql.shuffle.partitions} as number of partitions.}
#'}
#' At least one partition-by expression must be specified.
#' When no explicit sort order is specified, "ascending nulls first" is assumed.
#'
#' Note that due to performance reasons this method uses sampling to estimate the ranges.
#' Hence, the output may not be consistent, since sampling can return different values.
#' The sample size can be controlled by the config
#' \code{spark.sql.execution.rangeExchange.sampleSizePerPartition}.
#'
#' @param x a SparkDataFrame.
#' @param numPartitions the number of partitions to use.
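
Side note, not part of the diff: a usage sketch for repartitionByRange as documented above. The column-based call form (col = df$...) and getNumPartitions are assumed from the surrounding SparkR API; exact row placement can differ between runs because the range boundaries are estimated by sampling.

library(SparkR)
sparkR.session()

df <- createDataFrame(mtcars)

# Range partition by one column: rows with nearby mpg values land in the
# same partition, ordered "ascending nulls first" by default.
byRange <- repartitionByRange(df, col = df$mpg)

# Or request an explicit number of partitions.
byRange4 <- repartitionByRange(df, 4L, col = df$mpg)
getNumPartitions(byRange4)   # 4

sparkR.session.stop()
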
@@ -822,7 +827,6 @@ setMethod("repartitionByRange",
#' toJSON
#'
#' Converts a SparkDataFrame into a SparkDataFrame of JSON string.
#'
#' Each row is turned into a JSON document with columns as different fields.
#' The returned SparkDataFrame has a single character column with the name \code{value}
#'
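
Side note, not part of the diff: a short sketch of the toJSON behaviour described above, assuming toJSON is exported for SparkDataFrame in this version.

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(name = c("Andy", "Justin"), age = c(30L, 19L)))

# Each row becomes one JSON document; the result is a SparkDataFrame with a
# single string column named "value", e.g. {"name":"Andy","age":30}.
jsonDF <- toJSON(df)
showDF(jsonDF, truncate = FALSE)

sparkR.session.stop()
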
@@ -932,7 +936,6 @@ setMethod("write.orc",
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' write.parquet(df, "/tmp/sparkr-tmp1/")
#' saveAsParquetFile(df, "/tmp/sparkr-tmp2/")
#'}
#' @note write.parquet since 1.6.0
setMethod("write.parquet",
@@ -943,17 +946,6 @@ setMethod("write.parquet",
invisible(handledCallJMethod(write, "parquet", path))
})

#' @rdname write.parquet
#' @name saveAsParquetFile
#' @aliases saveAsParquetFile,SparkDataFrame,character-method
#' @note saveAsParquetFile since 1.4.0
setMethod("saveAsParquetFile",
signature(x = "SparkDataFrame", path = "character"),
function(x, path) {
.Deprecated("write.parquet")
write.parquet(x, path)
})

#' Save the content of SparkDataFrame in a text file at the specified path.
#'
#' Save the content of the SparkDataFrame in a text file at the specified path.
@@ -2738,15 +2730,29 @@ setMethod("union",
dataFrame(unioned)
})

#' unionAll is deprecated - use union instead
#' @rdname union
#' @name unionAll
#' Return a new SparkDataFrame containing the union of rows.
#'
#' This is an alias for \code{union}.
#'
#' @param x a SparkDataFrame.
#' @param y a SparkDataFrame.
#' @return A SparkDataFrame containing the result of the unionAll operation.
#' @family SparkDataFrame functions
#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method
#' @rdname unionAll
#' @name unionAll
#' @seealso \link{union}
#' @examples
#'\dontrun{
#' sparkR.session()
#' df1 <- read.json(path)
#' df2 <- read.json(path2)
#' unionAllDF <- unionAll(df1, df2)
#' }
#' @note unionAll since 1.4.0
setMethod("unionAll",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
.Deprecated("union")
union(x, y)
})

